<a href="https://colab.research.google.com/github/Mahendran180923/employee_attrition/blob/main/Employee_Attrition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Employee Attrition Analysis

In [None]:
!pip install streamlit





In [None]:
!pip install pyngrok



In [None]:
%%writefile app.py
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, r2_score, root_mean_squared_error
import streamlit as st
import pickle
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from pyngrok import ngrok


# Load dataset
employee_data = pd.read_csv('/content/Employee_Attrition.csv')


# Data Cleaning and Preprocessing

# Work on an independent copy: the Streamlit form further down reads the raw,
# un-encoded values from `employee_data`, so `df` must not share storage with
# it while columns are dropped and label-encoded.
df = employee_data.copy()
# print(df.info())
# print(df.describe())

duplicates = df.duplicated().sum()
# print(f"Number of duplicate rows: {duplicates}")

# Remove columns that are not used as model features.
# NOTE(review): EmployeeCount and StandardHours look constant and
# EmployeeNumber looks like a row identifier; Over18 is presumably constant
# as well -- confirm against the dataset before relying on this.
df.drop(columns=['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'], inplace=True)


# Exploratory data analysis

# Label-encode every object-dtype column, keeping one fitted encoder per
# column so the same mapping can be re-applied (or inverted) later.
categorical_col = df.select_dtypes(include='object').columns
encoder = {col: LabelEncoder() for col in categorical_col}
for col, label_encoder in encoder.items():
    df[col] = label_encoder.fit_transform(df[col])


# Persist the fitted encoders in pickle format
with open("encoder.pkl", 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)


# Outlier detection
# Flag rows whose absolute z-score exceeds 4 in any column. The subtraction
# must be parenthesised: without it, `mean / std` is evaluated first and the
# result is not a z-score at all.
outlier_counts = {}  # column name -> number of rows flagged as outliers
for col in df.columns:
    z_scores = np.abs((df[col] - df[col].mean()) / df[col].std())
    outliers = df[z_scores > 4]
    outlier_counts[col] = outliers.shape[0]
    # print(f"Outliers in column '{col}': {outliers.shape[0]}")
    # print(outliers)


# Find correlation
# corr_matrix = df.corr()
# sns.heatmap(corr_matrix, cmap='coolwarm', square=True)
# plt.title('Correlation Heatmap')
# plt.show()


# Machine Learning Model development

# Feature Selection: every remaining column except the target is a feature
x = df.drop(['Attrition'], axis=1)
y = df['Attrition']


# Train/test split
# train_test_split returns each input as a (train, test) pair, in that order.
# Unpacking them as (test, train) would fit the models on the 20% hold-out and
# evaluate them on the 80% portion, so the order here matters.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)


# Fit the three candidate classifiers on the training split.
# `fit` returns the estimator itself, so construction and fitting are chained.

# Random Forest Classifier Model
rfc = RandomForestClassifier(n_estimators=10).fit(x_train, y_train)

# Gradient Boosting Classifier Model
gbc = GradientBoostingClassifier(learning_rate=0.1).fit(x_train, y_train)

# Decision Tree Classifier Model
dtc = DecisionTreeClassifier().fit(x_train, y_train)

# Model Evaluation Metrics
# `score` on a classifier reports mean accuracy on the given split.
rfc_score, gbc_score, dtc_score = (
    model.score(x_test, y_test) for model in (rfc, gbc, dtc)
)

for model_label, accuracy in (
    ('Random Forest', rfc_score),
    ('Gradient Boosing', gbc_score),
    ('Decision Tree', dtc_score),
):
    print(f'{model_label} Accuracy Score: {accuracy}')


# Save the random forest model as a pickle file
with open("model.pkl", 'wb') as model_file:
    pickle.dump(rfc, model_file)

# option = st.selectbox('Select employee number to fetch employee details or select details manually', ['Select Manually', 'Use Employee Number'])

# # if option == 'Select Manually':
if __name__ == "__main__":
    # Streamlit form: collect one employee's attributes, encode them with the
    # encoders fitted during training, and predict attrition on demand.
    # Categorical options come from the raw `employee_data` so the user sees
    # the original labels; numeric options are sorted for easier selection.
    st.title("Employee Attrition Prediction")
    Age = st.number_input('Age', 18, 60)
    BusinessTravel = st.selectbox('BusinessTravel', employee_data['BusinessTravel'].unique())
    DailyRate = st.selectbox('DailyRate', sorted(employee_data['DailyRate'].unique()))
    Department = st.selectbox('Department', employee_data['Department'].unique())
    DistanceFromHome = st.selectbox('DistanceFromHome', sorted(employee_data['DistanceFromHome'].unique()))
    # NOTE(review): the label below mentions only "1" and "2" -- confirm it
    # covers every Education level present in the data.
    Education = st.selectbox('Education: Select "1" for below college or select "2" for above college ', sorted(employee_data['Education'].unique()))
    EducationField = st.selectbox('EducationField', employee_data['EducationField'].unique())
    EnvironmentSatisfaction = st.selectbox('Satisfaction with the work environment: "1" = Low, "2" = Medium, "3" = High, "4" = Very High', sorted(employee_data['EnvironmentSatisfaction'].unique()))
    Gender = st.selectbox('Gender', employee_data['Gender'].unique())
    HourlyRate = st.selectbox('The employee hourly rate of pay', sorted(employee_data['HourlyRate'].unique()))
    JobInvolvement = st.selectbox('level of involvement the employee has in their job: "1" = Low, "2" = Medium, "3" = High, "4" = Very High', sorted(employee_data['JobInvolvement'].unique()))
    JobLevel = st.selectbox('Job level of the Employee : e.g., "1" = Entry Level, "2" = Mid-Level, etc', sorted(employee_data['JobLevel'].unique()))
    JobRole = st.selectbox('JobRole', employee_data['JobRole'].unique())
    JobSatisfaction = st.selectbox('Job Satisfaction with the job: "1"=Low, "2"=Medium, "3"=High, "4"=Very High', sorted(employee_data['JobSatisfaction'].unique()))
    MaritalStatus = st.selectbox('MaritalStatus', employee_data['MaritalStatus'].unique())
    MonthlyIncome = st.selectbox('MonthlyIncome', sorted(employee_data['MonthlyIncome'].unique()))
    MonthlyRate = st.selectbox('MonthlyRate', sorted(employee_data['MonthlyRate'].unique()))
    NumCompaniesWorked = st.selectbox('NumCompaniesWorked', sorted(employee_data['NumCompaniesWorked'].unique()))
    OverTime = st.selectbox('OverTime', employee_data['OverTime'].unique())
    PercentSalaryHike = st.selectbox('PercentSalaryHike', sorted(employee_data['PercentSalaryHike'].unique()))
    PerformanceRating = st.selectbox('Peformance Rating: "1"=Low, "2"=Medium, "3"=High, "4"=Very High', sorted(employee_data['PerformanceRating'].unique()))
    RelationshipSatisfaction = st.selectbox('Relationship Satisfaction: "1"=Low, "2"=Medium, "3"=High, "4"=Very High', sorted(employee_data['RelationshipSatisfaction'].unique()))
    StockOptionLevel = st.number_input('Stock Option Level', min(employee_data['StockOptionLevel'].unique()), max(employee_data['StockOptionLevel'].unique()))
    TotalWorkingYears = st.selectbox('TotalWorkingYears', sorted(employee_data['TotalWorkingYears'].unique()))
    TrainingTimesLastYear = st.selectbox('TrainingTimesLastYear', sorted(employee_data['TrainingTimesLastYear'].unique()))
    WorkLifeBalance = st.selectbox('Work Life Balance: "1"=Low, "2"=Medium, "3"=High, "4"=Very High', sorted(employee_data['WorkLifeBalance'].unique()))
    YearsAtCompany = st.selectbox('YearsAtCompany', sorted(employee_data['YearsAtCompany'].unique()))
    YearsInCurrentRole = st.selectbox('YearsInCurrentRole', sorted(employee_data['YearsInCurrentRole'].unique()))
    YearsSinceLastPromotion = st.selectbox('YearsSinceLastPromotion', sorted(employee_data['YearsSinceLastPromotion'].unique()))
    YearsWithCurrManager = st.selectbox('YearsWithCurrManager', sorted(employee_data['YearsWithCurrManager'].unique()))

    # Single-row frame; the keys are listed in the same order as the feature
    # columns the model was trained on.
    user_data = pd.DataFrame(
        {
            'Age': [Age],
            'BusinessTravel': [BusinessTravel],
            'DailyRate': [DailyRate],
            'Department': [Department],
            'DistanceFromHome': [DistanceFromHome],
            'Education': [Education],
            'EducationField': [EducationField],
            'EnvironmentSatisfaction': [EnvironmentSatisfaction],
            'Gender': [Gender],
            'HourlyRate': [HourlyRate],
            'JobInvolvement': [JobInvolvement],
            'JobLevel': [JobLevel],
            'JobRole': [JobRole],
            'JobSatisfaction': [JobSatisfaction],
            'MaritalStatus': [MaritalStatus],
            'MonthlyIncome': [MonthlyIncome],
            'MonthlyRate': [MonthlyRate],
            'NumCompaniesWorked': [NumCompaniesWorked],
            'OverTime': [OverTime],
            'PercentSalaryHike': [PercentSalaryHike],
            'PerformanceRating': [PerformanceRating],
            'RelationshipSatisfaction': [RelationshipSatisfaction],
            'StockOptionLevel': [StockOptionLevel],
            'TotalWorkingYears': [TotalWorkingYears],
            'TrainingTimesLastYear': [TrainingTimesLastYear],
            'WorkLifeBalance': [WorkLifeBalance],
            'YearsAtCompany': [YearsAtCompany],
            'YearsInCurrentRole': [YearsInCurrentRole],
            'YearsSinceLastPromotion': [YearsSinceLastPromotion],
            'YearsWithCurrManager': [YearsWithCurrManager],
        }
    )

    # Reload the artefacts from the same relative paths they were written to
    # earlier in this script, so the read and write locations cannot diverge
    # when the working directory is not /content.
    with open("model.pkl", 'rb') as f:
        reloaded_rfc_attrition_prediction = pickle.load(f)

    with open("encoder.pkl", 'rb') as f:
        reloaded_attrition_encoder = pickle.load(f)

    # Apply the training-time label encoding to the categorical inputs.
    for col in user_data.columns:
        if col in reloaded_attrition_encoder:
            user_data[col] = reloaded_attrition_encoder[col].transform(user_data[col])

    if st.button('Predict Employee Attrition'):
        prediction = reloaded_rfc_attrition_prediction.predict(user_data)
        # Map the numeric class back to the original Attrition label.
        predicted_label = reloaded_attrition_encoder['Attrition'].inverse_transform(prediction)
        st.write(f"Employee Prediction: {predicted_label[0]}")

    # Create a public URL for the Streamlit app.
    # Streamlit re-executes this script on every widget interaction, so reuse
    # an already-open tunnel instead of opening a new one on each rerun.
    active_tunnels = ngrok.get_tunnels()
    public_url = active_tunnels[0] if active_tunnels else ngrok.connect(8501)
    print(public_url)



Writing app.py


In [None]:
!pip install streamlit pyngrok ngrok

Collecting streamlit
  Downloading streamlit-1.43.2-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Collecting ngrok
  Downloading ngrok-1.4.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.43.2-py2.py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Downloading ngrok-1.4.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━

In [None]:
!streamlit run /content/app.py --server.runOnSave=true

/bin/bash: line 1: streamlit: command not found


## 10-07-2025 Data access

In [1]:
import pandas as pd

# Load the attrition CSV and print its schema (column names, non-null counts
# and dtypes). The path assumes Google Drive is mounted at /content/drive.
df = pd.read_csv("/content/drive/MyDrive/Employee_Attrition.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [2]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,9.192517,2.912925,1.0,1024.865306,2.721769,65.891156,2.729932,2.063946,...,2.712245,80.0,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,403.5091,8.106864,1.024165,0.0,602.024335,1.093082,20.329428,0.711561,1.10694,...,1.081209,0.0,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,1.0,491.25,2.0,48.0,2.0,1.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,3.0,1.0,1020.5,3.0,66.0,3.0,2.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,4.0,1.0,1555.75,4.0,83.75,3.0,3.0,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,5.0,...,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0
