In [2]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report, f1_score
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

In [19]:
df = pd.read_csv("attrition_data.csv")
df

Unnamed: 0,Employee ID,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,...,Number of Dependents,Job Level,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition
0,8410,31,Male,19,Education,5390,Excellent,Medium,Average,2,...,0,Mid,Medium,89,No,No,No,Excellent,Medium,Stayed
1,64756,59,Female,4,Media,5534,Poor,High,Low,3,...,3,Mid,Medium,21,No,No,No,Fair,Low,Stayed
2,30257,24,Female,10,Healthcare,8159,Good,High,Low,0,...,3,Mid,Medium,74,No,No,No,Poor,Low,Stayed
3,65791,36,Female,7,Education,3989,Good,High,High,1,...,2,Mid,Small,50,Yes,No,No,Good,Medium,Stayed
4,65026,56,Male,41,Education,4821,Fair,Very High,Average,0,...,0,Senior,Medium,68,No,No,No,Fair,Medium,Stayed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74493,16243,56,Female,42,Healthcare,7830,Poor,Medium,Average,0,...,0,Senior,Medium,60,No,No,No,Poor,Medium,Stayed
74494,47175,30,Female,15,Education,3856,Good,Medium,Average,2,...,0,Entry,Medium,20,No,No,No,Good,Medium,Left
74495,12409,52,Male,5,Education,5654,Good,Very High,Below Average,0,...,4,Mid,Small,7,No,No,No,Good,High,Left
74496,9554,18,Male,4,Education,5276,Fair,High,Average,0,...,3,Mid,Large,5,No,No,No,Poor,High,Stayed


In [5]:
cols_to_drop = ["Employee ID", "Monthly Income",  'Years at Company']
df = df.drop(cols_to_drop, axis=1)
df

Unnamed: 0,Age,Gender,Job Role,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,Overtime,Distance from Home,Education Level,...,Number of Dependents,Job Level,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition
0,31,Male,Education,Excellent,Medium,Average,2,No,22,Associate Degree,...,0,Mid,Medium,89,No,No,No,Excellent,Medium,Stayed
1,59,Female,Media,Poor,High,Low,3,No,21,Master’s Degree,...,3,Mid,Medium,21,No,No,No,Fair,Low,Stayed
2,24,Female,Healthcare,Good,High,Low,0,No,11,Bachelor’s Degree,...,3,Mid,Medium,74,No,No,No,Poor,Low,Stayed
3,36,Female,Education,Good,High,High,1,No,27,High School,...,2,Mid,Small,50,Yes,No,No,Good,Medium,Stayed
4,56,Male,Education,Fair,Very High,Average,0,Yes,71,High School,...,0,Senior,Medium,68,No,No,No,Fair,Medium,Stayed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74493,56,Female,Healthcare,Poor,Medium,Average,0,Yes,40,Associate Degree,...,0,Senior,Medium,60,No,No,No,Poor,Medium,Stayed
74494,30,Female,Education,Good,Medium,Average,2,Yes,45,Master’s Degree,...,0,Entry,Medium,20,No,No,No,Good,Medium,Left
74495,52,Male,Education,Good,Very High,Below Average,0,No,4,Associate Degree,...,4,Mid,Small,7,No,No,No,Good,High,Left
74496,18,Male,Education,Fair,High,Average,0,No,13,Bachelor’s Degree,...,3,Mid,Large,5,No,No,No,Poor,High,Stayed


In [20]:
df.iloc[200]

Employee ID                       63053
Age                                  48
Gender                           Female
Years at Company                     31
Job Role                     Technology
Monthly Income                    11164
Work-Life Balance                  Good
Job Satisfaction                 Medium
Performance Rating              Average
Number of Promotions                  3
Overtime                             No
Distance from Home                   72
Education Level             High School
Marital Status                   Single
Number of Dependents                  1
Job Level                           Mid
Company Size                     Medium
Company Tenure                       74
Remote Work                          No
Leadership Opportunities             No
Innovation Opportunities             No
Company Reputation                 Poor
Employee Recognition                Low
Attrition                          Left
Name: 200, dtype: object

In [16]:
label_encoders = {}

for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le


In [17]:
df

Unnamed: 0,Employee ID,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,...,Number of Dependents,Job Level,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition
0,8410,31,1,19,0,5390,0,2,0,2,...,0,1,1,89,0,0,0,0,2,1
1,64756,59,0,4,3,5534,3,0,3,3,...,3,1,1,21,0,0,0,1,1,1
2,30257,24,0,10,2,8159,2,0,3,0,...,3,1,1,74,0,0,0,3,1,1
3,65791,36,0,7,0,3989,2,0,2,1,...,2,1,2,50,1,0,0,2,2,1
4,65026,56,1,41,0,4821,1,3,0,0,...,0,2,1,68,0,0,0,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74493,16243,56,0,42,2,7830,3,2,0,0,...,0,2,1,60,0,0,0,3,2,1
74494,47175,30,0,15,0,3856,2,2,0,2,...,0,0,1,20,0,0,0,2,2,0
74495,12409,52,1,5,0,5654,2,3,1,0,...,4,1,2,7,0,0,0,2,0,0
74496,9554,18,1,4,0,5276,1,0,0,0,...,3,1,0,5,0,0,0,3,0,1


In [7]:
X = df.drop('Attrition', axis=1)
y = df['Attrition']

In [8]:
# Feature scaling
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [10]:
# Train the model
model = GradientBoostingClassifier(random_state=42)
model.fit(X, y)

In [11]:
import joblib
# Save the model, scaler, label encoders, and feature names
joblib.dump(model, 'attrition_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')
joblib.dump(df.drop('Attrition', axis=1).columns.tolist(), 'feature_names.pkl')  # Save feature names

print("Model, scaler, label encoders, and feature names saved!")

Model, scaler, label encoders, and feature names saved!


STREAM

In [10]:
%%writefile app.py

import streamlit as st
import numpy as np
import joblib

# Load the saved model, scaler, label encoders, and feature names
model = joblib.load('attrition_model.pkl')
scaler = joblib.load('scaler.pkl')
label_encoders = joblib.load('label_encoders.pkl')
feature_names = joblib.load('feature_names.pkl')

st.title("Employee Attrition Prediction")

# Create inputs for each feature
user_input = {}
for feature in feature_names:
    if feature in label_encoders:
        options = list(label_encoders[feature].classes_)
        user_input[feature] = st.selectbox(f"{feature}", options)
    else:
        user_input[feature] = st.text_input(f"{feature}")

if st.button("Predict"):
    # Preprocess the input data
    input_data = []
    for feature in feature_names:
        if feature in label_encoders:
            input_data.append(label_encoders[feature].transform([user_input[feature]])[0])
        else:
            input_data.append(float(user_input[feature]))

    input_data = np.array(input_data).reshape(1, -1)
    input_data = scaler.transform(input_data)

    # Make the prediction
    prediction = model.predict(input_data)

    st.write(f"Predicted Attrition: {'Yes' if prediction[0] == 1 else 'No'}")


Writing app.py


In [11]:
!streamlit run app.py &  # The ampersand (&) allows it to run in the background


/bin/bash: line 1: streamlit: command not found


In [None]:

from pyngrok import ngrok

# Now that the Streamlit app is running, open the tunnel
public_url = ngrok.connect(8501)  # ngrok will automatically detect the port 8501
print('Public URL:', public_url)




Public URL: NgrokTunnel: "https://ec0c-34-106-167-209.ngrok-free.app" -> "http://localhost:8501"


TEST

In [None]:
!ngrok config add-authtoken 2khUUiLX9ZZDQFlzPMovQWbvMib_ANq4DjAG4W1G4bLjoKHH


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Make predictions on the test data
y_pred = model.predict(X_test)

# Convert the numerical predictions back to the original categorical names
y_test_orig = label_encoders['Attrition'].inverse_transform(y_test)
y_pred_orig = label_encoders['Attrition'].inverse_transform(y_pred)

In [None]:
# Calculate the accuracy and f1 score
accuracy = accuracy_score(y_test_orig, y_pred_orig)
f1 = f1_score(y_test_orig, y_pred_orig, average='weighted')


# Print the accuracy score
print("\033[1m**Accuracy**:\033[0m\n", accuracy)

# Print the confusion matrix
print("\n\033[1m**Confusion Matrix**:\033[0m\n", confusion_matrix(y_test_orig, y_pred_orig))

# Print the classification report
print("\n\033[1m**Classification Report**:\033[0m\n", classification_report(y_test_orig, y_pred_orig))

[1m**Accuracy**:[0m
 0.7604697986577181

[1m**Confusion Matrix**:[0m
 [[5274 1822]
 [1747 6057]]

[1m**Classification Report**:[0m
               precision    recall  f1-score   support

        Left       0.75      0.74      0.75      7096
      Stayed       0.77      0.78      0.77      7804

    accuracy                           0.76     14900
   macro avg       0.76      0.76      0.76     14900
weighted avg       0.76      0.76      0.76     14900



END