In [1]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report, f1_score
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBRFClassifier

In [2]:
! kaggle datasets download -d stealthtechnologies/employee-attrition-dataset
! unzip /content/employee-attrition-dataset.zip

Dataset URL: https://www.kaggle.com/datasets/stealthtechnologies/employee-attrition-dataset
License(s): apache-2.0
Downloading employee-attrition-dataset.zip to /content
  0% 0.00/1.72M [00:00<?, ?B/s]
100% 1.72M/1.72M [00:00<00:00, 25.4MB/s]
Archive:  /content/employee-attrition-dataset.zip
  inflating: test.csv                
  inflating: train.csv               


In [3]:
# Read the two CSVs extracted from the archive and stack them into a single
# frame with a fresh 0..n-1 index.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/test.csv')
)
df = pd.concat([df_train, df_test], ignore_index=True)

In [4]:
# Remove the columns that are not used as model inputs.
cols_to_drop = [
    "Employee ID",
    "Monthly Income",
    'Years at Company',
]
df = df.drop(columns=cols_to_drop)

In [5]:
# Integer-encode every object-typed column in place, keeping each fitted
# encoder keyed by column name so the mapping can be reused or inverted later.
label_encoders = {}
object_columns = df.select_dtypes(include=['object']).columns

for name in object_columns:
    encoder = LabelEncoder()
    df[name] = encoder.fit_transform(df[name])
    label_encoders[name] = encoder


In [6]:
# Separate the encoded target column from the predictor columns.
target_column = 'Attrition'
y = df[target_column]
X = df.drop(columns=[target_column])

In [7]:
# Feature scaling
# Fit the scaler on the feature columns taken straight from `df` instead of on
# `X`: this cell rebinds `X` to the scaled array, so fitting on `X` would
# re-standardise already-scaled data if the cell were run a second time, and the
# scaler pickled later for app.py would then no longer match raw inputs.
scaler = StandardScaler()
X = scaler.fit_transform(df.drop('Attrition', axis=1))

In [8]:
# Fit a gradient-boosting classifier on all scaled rows (fixed seed for
# reproducibility) and display the fitted estimator.
model = GradientBoostingClassifier(random_state=42).fit(X, y)
model

In [9]:
import joblib

# Persist everything app.py loads at start-up, one pickle per artifact:
# the model, the scaler, the label encoders, and the ordered feature names.
artifacts = {
    'attrition_model.pkl': model,
    'scaler.pkl': scaler,
    'label_encoders.pkl': label_encoders,
    'feature_names.pkl': df.drop('Attrition', axis=1).columns.tolist(),
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)

print("Model, scaler, label encoders, and feature names saved!")

Model, scaler, label encoders, and feature names saved!


## Streamlit app

In [10]:
%%writefile app.py

import streamlit as st
import numpy as np
import joblib

# Load the saved model, scaler, label encoders, and feature names
model = joblib.load('attrition_model.pkl')
scaler = joblib.load('scaler.pkl')
label_encoders = joblib.load('label_encoders.pkl')
feature_names = joblib.load('feature_names.pkl')

st.title("Employee Attrition Prediction")

# Create inputs for each feature
user_input = {}
for feature in feature_names:
    if feature in label_encoders:
        options = list(label_encoders[feature].classes_)
        user_input[feature] = st.selectbox(f"{feature}", options)
    else:
        user_input[feature] = st.text_input(f"{feature}")

if st.button("Predict"):
    # Preprocess the input data
    input_data = []
    for feature in feature_names:
        if feature in label_encoders:
            input_data.append(label_encoders[feature].transform([user_input[feature]])[0])
        else:
            input_data.append(float(user_input[feature]))

    input_data = np.array(input_data).reshape(1, -1)
    input_data = scaler.transform(input_data)

    # Make the prediction
    prediction = model.predict(input_data)

    st.write(f"Predicted Attrition: {'Yes' if prediction[0] == 1 else 'No'}")


Writing app.py


In [11]:
!streamlit run app.py &  # The ampersand (&) allows it to run in the background


/bin/bash: line 1: streamlit: command not found


In [None]:

from pyngrok import ngrok

# With the Streamlit app running, open a tunnel to its port. The port is passed
# explicitly; nothing here detects it automatically.
STREAMLIT_PORT = 8501
public_url = ngrok.connect(STREAMLIT_PORT)
print('Public URL:', public_url)




Public URL: NgrokTunnel: "https://ec0c-34-106-167-209.ngrok-free.app" -> "http://localhost:8501"


## Test: hold-out evaluation

In [None]:
!ngrok config add-authtoken 2khUUiLX9ZZDQFlzPMovQWbvMib_ANq4DjAG4W1G4bLjoKHH


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Fit a fresh classifier on the training split only and display it.
# NOTE: this rebinds `model`, replacing the estimator that was fitted on all
# rows earlier in the notebook.
model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
model

In [None]:
# Score the held-out rows with the model fitted on the training split
y_pred = model.predict(X_test)

# Map the integer class codes back to the original Attrition labels
attrition_encoder = label_encoders['Attrition']
y_test_orig, y_pred_orig = (
    attrition_encoder.inverse_transform(codes) for codes in (y_test, y_pred)
)

In [None]:
# Calculate the accuracy and the support-weighted F1 score on the held-out labels
accuracy = accuracy_score(y_test_orig, y_pred_orig)
f1 = f1_score(y_test_orig, y_pred_orig, average='weighted')


# Print the accuracy score
print("\033[1m**Accuracy**:\033[0m\n", accuracy)

# Print the weighted F1 score (it was computed above but never reported)
print("\n\033[1m**F1 Score (weighted)**:\033[0m\n", f1)

# Print the confusion matrix
print("\n\033[1m**Confusion Matrix**:\033[0m\n", confusion_matrix(y_test_orig, y_pred_orig))

# Print the classification report
print("\n\033[1m**Classification Report**:\033[0m\n", classification_report(y_test_orig, y_pred_orig))

[1m**Accuracy**:[0m
 0.7604697986577181

[1m**Confusion Matrix**:[0m
 [[5274 1822]
 [1747 6057]]

[1m**Classification Report**:[0m
               precision    recall  f1-score   support

        Left       0.75      0.74      0.75      7096
      Stayed       0.77      0.78      0.77      7804

    accuracy                           0.76     14900
   macro avg       0.76      0.76      0.76     14900
weighted avg       0.76      0.76      0.76     14900



END