In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelBinarizer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


In [2]:
df_train = pd.read_csv("../data/Faker_Data/train.csv")

In [3]:
df_train = df_train.drop(columns=["employee_id"])

In [82]:
df_train

Unnamed: 0,age,gender,years_at_company,job_role,monthly_income,work_life_balance,job_satisfaction,performance_rating,number_of_promotions,overtime,...,job_level,company_size,remote_work,leadership_opportunities,innovation_opportunities,company_reputation,employee_recognition,attrition,age_groups,age_before_working
0,20,Male,0,Healthcare,3399,Poor,Low,Average,0,Yes,...,Entry,Small,No,No,Yes,Poor,Low,Left,18-25,20
1,34,Female,13,Finance,14568,Good,Medium,Average,3,No,...,Senior,Large,No,Yes,Yes,Good,High,Stayed,26-35,21
2,44,Male,16,Technology,13291,Good,Very High,Excellent,4,No,...,Senior,Small,No,Yes,Yes,Excellent,High,Stayed,36-45,28
3,37,Female,6,Technology,7699,Good,High,High,1,No,...,Mid,Medium,Yes,Yes,Yes,Good,High,Stayed,36-45,31
4,38,Male,3,Education,7698,Good,Very High,Excellent,1,No,...,Mid,Large,No,No,No,Good,Very High,Stayed,36-45,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71995,21,Male,0,Finance,4025,Excellent,High,High,1,No,...,Entry,Medium,No,No,No,Good,Medium,Stayed,18-25,21
71996,44,Female,1,Technology,9459,Good,Very High,Average,0,Yes,...,Mid,Large,Yes,No,Yes,Good,Very High,Stayed,36-45,43
71997,24,Female,4,Healthcare,3745,Good,High,Average,0,No,...,Entry,Medium,No,No,No,Good,Medium,Stayed,18-25,20
71998,48,Female,27,Education,6001,Excellent,High,High,10,No,...,Mid,Medium,No,No,No,Good,High,Stayed,46-55,21


In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72000 entries, 0 to 71999
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   age                       72000 non-null  int64 
 1   gender                    72000 non-null  object
 2   years_at_company          72000 non-null  int64 
 3   job_role                  72000 non-null  object
 4   monthly_income            72000 non-null  int64 
 5   work_life_balance         72000 non-null  object
 6   job_satisfaction          72000 non-null  object
 7   performance_rating        72000 non-null  object
 8   number_of_promotions      72000 non-null  int64 
 9   overtime                  72000 non-null  object
 10  distance_from_home        72000 non-null  int64 
 11  education_level           72000 non-null  object
 12  marital_status            72000 non-null  object
 13  number_of_dependents      72000 non-null  int64 
 14  job_level             

In [5]:
# split data to features and target
X = df_train.drop(columns=["attrition"])
y = df_train.attrition

In [6]:
X.work_life_balance.unique()

array(['Poor', 'Good', 'Fair', 'Excellent'], dtype=object)

In [7]:
X.job_satisfaction.unique()

array(['Low', 'Medium', 'Very High', 'High'], dtype=object)

In [8]:
X.performance_rating.unique()

array(['Average', 'Excellent', 'High', 'Low'], dtype=object)

In [9]:
# split the numerical columns from the category columns
numeric_cols = X.select_dtypes(include=['number']).columns
category_cols = X.select_dtypes(exclude=['number']).columns
print(numeric_cols)
print(category_cols)

Index(['age', 'years_at_company', 'monthly_income', 'number_of_promotions',
       'distance_from_home', 'number_of_dependents', 'age_before_working'],
      dtype='object')
Index(['gender', 'job_role', 'work_life_balance', 'job_satisfaction',
       'performance_rating', 'overtime', 'education_level', 'marital_status',
       'job_level', 'company_size', 'remote_work', 'leadership_opportunities',
       'innovation_opportunities', 'company_reputation',
       'employee_recognition', 'age_groups'],
      dtype='object')


In [10]:
y = y.map({'Stayed': 0, 'Left': 1})

In [70]:
X.employee_recognition.unique()

array(['Low', 'High', 'Very High', 'Medium'], dtype=object)

In [69]:
X.company_reputation.unique()

array(['Poor', 'Good', 'Excellent', 'Fair'], dtype=object)

In [72]:
X.age_groups.unique()

array(['18-25', '26-35', '36-45', '46-55', '55+'], dtype=object)

Index(['gender', 'job_role', 'work_life_balance', 'job_satisfaction',
       'performance_rating', 'overtime', 'education_level', 'marital_status',
       'job_level', 'company_size', 'remote_work', 'leadership_opportunities',
       'innovation_opportunities', 'company_reputation',
       'employee_recognition', 'age_groups'],
      dtype='object')

In [None]:
nominal_cols = ['job_role', 'marital_status']
binary_cols = ['gender', 'overtime', 'remote_work', 'leadership_opportunities', 'innovation_opportunities']
ordinal_cols = ['work_life_balance', 'job_satisfaction', 'performance_rating',
                'education_level','job_level','company_size','company_reputation'
                ,'employee_recognition', 'age_groups']

In [74]:
numerical_pipeline = Pipeline([('scaler', StandardScaler())])

one_hot_encode = OneHotEncoder()

binary_pipeline = Pipeline([("binary", OrdinalEncoder(categories=[
    ['Female', 'Male'],
    ['No', 'Yes'],
    ['No', 'Yes'],
    ['No', 'Yes'],
    ['No', 'Yes'],
]))])


ordinal_pipeline = Pipeline([
    ("ordinal", OrdinalEncoder(categories=[
        ['Poor', 'Fair', 'Good', 'Excellent'],
        ['Low', 'Medium','High', 'Very High',],
        ['Low', 'Average', 'High','Excellent'],
        ['Poor', 'Fair','Good', 'Excellent'],
        ['Low', 'Medium','High', 'Very High'],
        ['18-25', '26-35', '36-45', '46-55', '55+']
    ]))
])

In [75]:
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numeric_cols),
    ('ord', ordinal_pipeline, ordinal_cols),
    ('bin', binary_pipeline, binary_cols),
    ('one', one_hot_encode, nominal_cols)
   
])

In [76]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ("classifier", RandomForestClassifier())  
])

In [77]:
model = pipeline.fit(X, y)

In [78]:
X_transformed = pipeline.named_steps['preprocessor'].transform(X)

In [79]:
X_encoded = model.named_steps["preprocessor"].transform(X)

# get the column names from the ColumnTransformer
feature_names = model.named_steps["preprocessor"].get_feature_names_out()

df_encoded = pd.DataFrame(X_encoded, columns=feature_names)
print(df_encoded)

       num__age  num__years_at_company  num__monthly_income  \
0     -1.803826              -1.198257            -1.191488   
1     -0.364632               0.395226             1.620046   
2      0.663364               0.762953             1.298591   
3     -0.056233              -0.462803            -0.109064   
4      0.046567              -0.830530            -0.109316   
...         ...                    ...                  ...   
71995 -1.701026              -1.198257            -1.033907   
71996  0.663364              -1.075681             0.333975   
71997 -1.392627              -0.707954            -1.104391   
71998  1.074563               2.111285            -0.536496   
71999  0.560565              -1.198257            -1.070911   

       num__number_of_promotions  num__distance_from_home  \
0                      -0.803058                -0.029016   
1                       0.698596                -0.313598   
2                       1.199147                -0.740470   

In [80]:
df = pd.DataFrame(df_encoded)
df

Unnamed: 0,num__age,num__years_at_company,num__monthly_income,num__number_of_promotions,num__distance_from_home,num__number_of_dependents,num__age_before_working,ord__work_life_balance,ord__job_satisfaction,ord__performance_rating,...,bin__leadership_opportunities,bin__innovation_opportunities,one__job_role_Education,one__job_role_Finance,one__job_role_Healthcare,one__job_role_Media,one__job_role_Technology,one__marital_status_Divorced,one__marital_status_Married,one__marital_status_Single
0,-1.803826,-1.198257,-1.191488,-0.803058,-0.029016,-1.039805,-0.954211,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,-0.364632,0.395226,1.620046,0.698596,-0.313598,-1.039805,-0.831426,2.0,1.0,1.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.663364,0.762953,1.298591,1.199147,-0.740470,2.587668,0.028074,2.0,3.0,3.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,-0.056233,-0.462803,-0.109064,-0.302507,-1.167342,1.136679,0.396430,2.0,2.0,2.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.046567,-0.830530,-0.109316,-0.302507,-0.455888,0.411184,0.887573,2.0,3.0,3.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71995,-1.701026,-1.198257,-1.033907,-0.302507,-0.740470,-1.039805,-0.831426,3.0,2.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
71996,0.663364,-1.075681,0.333975,-0.803058,-1.167342,-1.039805,1.869858,2.0,3.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
71997,-1.392627,-0.707954,-1.104391,-0.803058,0.255565,-1.039805,-0.954211,2.0,2.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
71998,1.074563,2.111285,-0.536496,4.202454,0.824728,1.136679,-0.831426,3.0,2.0,2.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [81]:
df.columns

Index(['num__age', 'num__years_at_company', 'num__monthly_income',
       'num__number_of_promotions', 'num__distance_from_home',
       'num__number_of_dependents', 'num__age_before_working',
       'ord__work_life_balance', 'ord__job_satisfaction',
       'ord__performance_rating', 'ord__company_reputation',
       'ord__employee_recognition', 'ord__age_groups', 'bin__gender',
       'bin__overtime', 'bin__remote_work', 'bin__leadership_opportunities',
       'bin__innovation_opportunities', 'one__job_role_Education',
       'one__job_role_Finance', 'one__job_role_Healthcare',
       'one__job_role_Media', 'one__job_role_Technology',
       'one__marital_status_Divorced', 'one__marital_status_Married',
       'one__marital_status_Single'],
      dtype='object')

In [69]:
models = {
    "RandomForest": (RandomForestClassifier(random_state=42, max_features='sqrt'), {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [10, 15, 20],
        'classifier__min_samples_split': [10, 20],
        'classifier__min_samples_leaf': [4, 8]
    }),
    "KNN": (KNeighborsClassifier(), {
        'classifier__n_neighbors': [3, 5, 7],
        'classifier__weights': ['uniform', 'distance']
    }),
    "XGBoost": (XGBClassifier(eval_metric='logloss', random_state=42), {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [3, 5, 7],
        'classifier__learning_rate': [0.05, 0.1, 0.2]
    })
}

In [76]:

results = {}

for model_name, (model, params) in models.items():
    print(f"\nRunning GridSearch for {model_name}...")
    
    # update classifier in pipeline
    pipeline.set_params(classifier=model)
    
    # run grid search
    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=params,
        cv=5,
        scoring='accuracy',
        n_jobs=-1
    )
    
    grid.fit(X, y)
    
    # store results
    results[model_name] = {
        "best_score": grid.best_score_,
        "best_params": grid.best_params_,
        "best_estimator": grid.best_estimator_
    }



Running GridSearch for RandomForest...

Running GridSearch for KNN...

Running GridSearch for XGBoost...


In [77]:
results

{'RandomForest': {'best_score': np.float64(0.8820277777777779),
  'best_params': {'classifier__max_depth': 15,
   'classifier__min_samples_leaf': 4,
   'classifier__min_samples_split': 20,
   'classifier__n_estimators': 100},
  'best_estimator': Pipeline(steps=[('preprocessor',
                   ColumnTransformer(transformers=[('num',
                                                    Pipeline(steps=[('scaler',
                                                                     StandardScaler())]),
                                                    Index(['age', 'years_at_company', 'monthly_income', 'number_of_promotions',
         'distance_from_home', 'number_of_dependents', 'age_before_working'],
        dtype='object')),
                                                   ('ord',
                                                    Pipeline(steps=[('ordinal',
                                                                     OrdinalEncoder(categories=[['Poor',
                 

In [78]:
best_model = grid.best_estimator_

In [39]:
best_model

NameError: name 'best_model' is not defined

In [82]:
best_model.feature_names_in_

array(['age', 'gender', 'years_at_company', 'job_role', 'monthly_income',
       'work_life_balance', 'job_satisfaction', 'performance_rating',
       'number_of_promotions', 'overtime', 'distance_from_home',
       'education_level', 'marital_status', 'number_of_dependents',
       'job_level', 'company_size', 'remote_work',
       'leadership_opportunities', 'innovation_opportunities',
       'company_reputation', 'employee_recognition', 'age_groups',
       'age_before_working'], dtype=object)

In [80]:
df_test = pd.read_csv("../data/Faker_Data/test.csv")

In [81]:
X_test = df_test.drop(columns= ["attrition"])
y_test = df_test.attrition

In [84]:
y_test = y_test.map({'Stayed': 0, 'Left': 1})

In [83]:
y_pred = best_model.predict(X_test)

In [85]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8851111111111111


In [86]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
print(cm)

[[11220   764]
 [ 1304  4712]]


In [87]:
import pickle

# Save
with open("best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)