In [2]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [3]:
from google.colab import files
uploaded = files.upload()

Saving df_after_eda.csv to df_after_eda.csv


In [4]:
df = pd.read_csv('df_after_eda.csv')
df.head()


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [5]:
#Convert Attrition (Yes/No) → 1/0
df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0})
df['Attrition'].value_counts()

Unnamed: 0_level_0,count
Attrition,Unnamed: 1_level_1
0,1233
1,237


In [6]:
#Drop Useless Columns
'''
These columns add no predictive value:

EmployeeCount (always 1)

StandardHours (always 80)

EmployeeNumber (ID, meaningless)

Over18 (mostly constant)
'''


df = df.drop(['EmployeeCount', 'StandardHours', 'EmployeeNumber', 'Over18'], axis=1)

In [7]:
#Identify Categorical & Numerical Columns
categorical_cols = df.select_dtypes(include=['object']).columns
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

categorical_cols, numeric_cols

(Index(['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole',
        'MaritalStatus', 'OverTime'],
       dtype='object'),
 Index(['Age', 'Attrition', 'DailyRate', 'DistanceFromHome', 'Education',
        'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel',
        'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
        'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
        'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
        'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
        'YearsSinceLastPromotion', 'YearsWithCurrManager'],
       dtype='object'))

In [8]:
#One-Hot Encode Categorical Variables
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
df.head()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,OverTime_Yes
0,41,1,1102,1,2,2,94,3,2,4,...,False,False,False,False,False,True,False,False,True,True
1,49,0,279,8,1,3,61,2,2,2,...,False,False,False,False,True,False,False,True,False,False
2,37,1,1373,2,2,4,92,2,1,3,...,True,False,False,False,False,False,False,False,True,True
3,33,0,1392,3,4,4,56,3,1,3,...,False,False,False,False,True,False,False,True,False,True
4,27,0,591,2,1,1,40,3,1,2,...,True,False,False,False,False,False,False,True,False,False


In [9]:
#for better ML performance, we convert to integers
df = df.astype(int)

In [10]:
#Train-Test Split

"""
Training data (used to train the models)

Testing data (used to evaluate the models on unseen data)
"""
#define X and y:
X = df.drop('Attrition', axis=1)
y = df['Attrition']

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,      # 80% train, 20% test
    random_state=42,    # reproducible results
    stratify=y          # keeps the same Yes/No ratio
)

In [12]:
#Check the split
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

print("\nTraining target distribution:")
print(y_train.value_counts())

print("\nTest target distribution:")
print(y_test.value_counts())


Training set shape: (1176, 44)
Test set shape: (294, 44)

Training target distribution:
Attrition
0    986
1    190
Name: count, dtype: int64

Test target distribution:
Attrition
0    247
1     47
Name: count, dtype: int64


In [13]:
#Scaling Numerical Features
"""
This step ensures all numerical variables are on the same scale.

Some machine learning models behave badly when features have different scales:

Logistic Regression

SVM

KNN

Neural Networks

Tree-based models (Random Forest, XGBoost) don't require scaling
"""

#Identify numeric columns again (after encoding)
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
numeric_cols


Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education',
       'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel',
       'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager',
       'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely',
       'Department_Research & Development', 'Department_Sales',
       'EducationField_Life Sciences', 'EducationField_Marketing',
       'EducationField_Medical', 'EducationField_Other',
       'EducationField_Technical Degree', 'Gender_Male',
       'JobRole_Human Resources', 'JobRole_Laboratory Technician',
       'JobRole_Manager', 'JobRole_Manufacturing Director',
       'JobRole_Research Director', 'JobRole_Research Scienti

In [14]:
#Apply StandardScaler
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit only on training data
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])

# Transform test data using the same scaler
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [15]:
#Check scaled data
X_train[numeric_cols].head()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,OverTime_Yes
1194,1.090194,1.049455,-0.899915,1.064209,-0.65871,-0.908436,1.795282,1.762189,-0.647997,2.026752,...,-0.472996,3.628867,-0.326041,-0.237915,-0.479714,-0.549841,-0.231869,-0.921443,-0.681548,-0.637729
128,-1.634828,-0.523449,-0.899915,-1.855332,0.260202,1.694111,0.373564,-0.986265,1.153526,-0.864408,...,2.114182,-0.275568,-0.326041,-0.237915,-0.479714,-0.549841,-0.231869,1.085255,-0.681548,-0.637729
810,0.981193,-0.99208,-0.77761,-1.855332,-1.577622,-0.662913,0.373564,1.762189,0.252765,2.347706,...,-0.472996,3.628867,-0.326041,-0.237915,-0.479714,-0.549841,-0.231869,1.085255,-0.681548,-0.637729
478,-1.307825,-0.453653,0.445433,-1.855332,-0.65871,-1.252169,0.373564,-0.986265,0.252765,-0.956202,...,-0.472996,-0.275568,-0.326041,-0.237915,-0.479714,-0.549841,4.312772,1.085255,-0.681548,-0.637729
491,0.654191,0.491086,-0.043784,2.03739,1.179114,0.31918,0.373564,-0.070114,0.252765,-0.185956,...,2.114182,-0.275568,-0.326041,-0.237915,-0.479714,-0.549841,-0.231869,-0.921443,-0.681548,1.568064


In [16]:
#Apply SMOTE (Fix Class Imbalance)
"""
Your dataset is imbalanced:

No = 1233

Yes = 237

This imbalance causes:

Models to predict “No attrition” for everything

High accuracy but terrible recall/F1 for “Yes” class

Biased predictions

Poor model performance

To fix this, we use SMOTE on the training data ONLY.

SMOTE = Synthetic Minority Oversampling Technique
It creates NEW synthetic samples for the minority class (“Yes”).
"""
#Import SMOTE
from imblearn.over_sampling import SMOTE

In [17]:
#Apply SMOTE to training data only
sm = SMOTE(random_state=42)

X_train_res, y_train_res = sm.fit_resample(X_train, y_train)


In [18]:
#checking new class distribution
y_train_res.value_counts()

Unnamed: 0_level_0,count
Attrition,Unnamed: 1_level_1
0,986
1,986


In [19]:
#Download these files
import pickle

# Save processed datasets
pickle.dump(X_train_res, open('X_train_res.pkl', 'wb'))
pickle.dump(y_train_res, open('y_train_res.pkl', 'wb'))
pickle.dump(X_test, open('X_test.pkl', 'wb'))
pickle.dump(y_test, open('y_test.pkl', 'wb'))

print("All preprocessing files saved successfully!")

All preprocessing files saved successfully!
