In [66]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelBinarizer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier



In [4]:
df_train = pd.read_csv("../data/Faker_Data/train.csv")

In [5]:
df_train = df_train.drop(columns=["employee_id"])

In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72000 entries, 0 to 71999
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   age                       72000 non-null  int64 
 1   gender                    72000 non-null  object
 2   years_at_company          72000 non-null  int64 
 3   job_role                  72000 non-null  object
 4   monthly_income            72000 non-null  int64 
 5   work_life_balance         72000 non-null  object
 6   job_satisfaction          72000 non-null  object
 7   performance_rating        72000 non-null  object
 8   number_of_promotions      72000 non-null  int64 
 9   overtime                  72000 non-null  object
 10  distance_from_home        72000 non-null  int64 
 11  education_level           72000 non-null  object
 12  marital_status            72000 non-null  object
 13  number_of_dependents      72000 non-null  int64 
 14  job_level             

In [7]:
# split data to features and target
X = df_train.drop(columns=["attrition"])
y = df_train.attrition

In [11]:
X.work_life_balance.unique()

array(['Poor', 'Good', 'Fair', 'Excellent'], dtype=object)

In [12]:
X.job_satisfaction.unique()

array(['Low', 'Medium', 'Very High', 'High'], dtype=object)

In [14]:
X.performance_rating.unique()

array(['Average', 'Excellent', 'High', 'Low'], dtype=object)

In [17]:
# split the numerical columns from the category columns
numeric_cols = X.select_dtypes(include=['number']).columns
category_cols = X.select_dtypes(exclude=['number']).columns
print(numeric_cols)
print(category_cols)

Index(['age', 'years_at_company', 'monthly_income', 'number_of_promotions',
       'distance_from_home', 'number_of_dependents', 'age_before_working'],
      dtype='object')
Index(['gender', 'job_role', 'work_life_balance', 'job_satisfaction',
       'performance_rating', 'overtime', 'education_level', 'marital_status',
       'job_level', 'company_size', 'remote_work', 'leadership_opportunities',
       'innovation_opportunities', 'company_reputation',
       'employee_recognition', 'age_groups'],
      dtype='object')


In [75]:
y = y.map({'Stayed': 0, 'Left': 1})

In [18]:
nominal_cols = ['job_role', 'marital_status']
binary_cols = ['gender', 'overtime', 'remote_work', 'leadership_opportunities', 'innovation_opportunities']
ordinal_cols = ['work_life_balance', 'job_satisfaction', 'performance_rating']

In [53]:
numerical_pipeline = Pipeline([('scaler', StandardScaler())])

binary_pipeline = Pipeline([("binary", OrdinalEncoder(categories=[
    ['Female', 'Male'],
    ['No', 'Yes'],
    ['No', 'Yes'],
    ['No', 'Yes'],
    ['No', 'Yes'],
]))])


ordinal_pipeline = Pipeline([
    ("ordinal", OrdinalEncoder(categories=[
        ['Poor', 'Fair', 'Good', 'Excellent'],
        ['Low', 'Medium','High', 'Very High',],
        ['Low', 'Average', 'High','Excellent']  
    ]))
])

In [59]:
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numeric_cols),
    ('ord', ordinal_pipeline, ordinal_cols),
    ('bin', binary_pipeline, binary_cols),
   
])

In [71]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ("classifier", RandomForestClassifier())  
])

In [69]:
models = {
    "RandomForest": (RandomForestClassifier(random_state=42, max_features='sqrt'), {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [10, 15, 20],
        'classifier__min_samples_split': [10, 20],
        'classifier__min_samples_leaf': [4, 8]
    }),
    "KNN": (KNeighborsClassifier(), {
        'classifier__n_neighbors': [3, 5, 7],
        'classifier__weights': ['uniform', 'distance']
    }),
    "XGBoost": (XGBClassifier(eval_metric='logloss', random_state=42), {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [3, 5, 7],
        'classifier__learning_rate': [0.05, 0.1, 0.2]
    })
}

In [76]:

results = {}

for model_name, (model, params) in models.items():
    print(f"\nRunning GridSearch for {model_name}...")
    
    # update classifier in pipeline
    pipeline.set_params(classifier=model)
    
    # run grid search
    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=params,
        cv=5,
        scoring='accuracy',
        n_jobs=-1
    )
    
    grid.fit(X, y)
    
    # store results
    results[model_name] = {
        "best_score": grid.best_score_,
        "best_params": grid.best_params_,
        "best_estimator": grid.best_estimator_
    }



Running GridSearch for RandomForest...

Running GridSearch for KNN...

Running GridSearch for XGBoost...


In [77]:
results

{'RandomForest': {'best_score': np.float64(0.8820277777777779),
  'best_params': {'classifier__max_depth': 15,
   'classifier__min_samples_leaf': 4,
   'classifier__min_samples_split': 20,
   'classifier__n_estimators': 100},
  'best_estimator': Pipeline(steps=[('preprocessor',
                   ColumnTransformer(transformers=[('num',
                                                    Pipeline(steps=[('scaler',
                                                                     StandardScaler())]),
                                                    Index(['age', 'years_at_company', 'monthly_income', 'number_of_promotions',
         'distance_from_home', 'number_of_dependents', 'age_before_working'],
        dtype='object')),
                                                   ('ord',
                                                    Pipeline(steps=[('ordinal',
                                                                     OrdinalEncoder(categories=[['Poor',
                 

In [78]:
best_model = grid.best_estimator_

In [79]:
best_model

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('ord', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,"[['Poor', 'Fair', ...], ['Low', 'Medium', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,"[['Female', 'Male'], ['No', 'Yes'], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [82]:
best_model.feature_names_in_

array(['age', 'gender', 'years_at_company', 'job_role', 'monthly_income',
       'work_life_balance', 'job_satisfaction', 'performance_rating',
       'number_of_promotions', 'overtime', 'distance_from_home',
       'education_level', 'marital_status', 'number_of_dependents',
       'job_level', 'company_size', 'remote_work',
       'leadership_opportunities', 'innovation_opportunities',
       'company_reputation', 'employee_recognition', 'age_groups',
       'age_before_working'], dtype=object)

In [80]:
df_test = pd.read_csv("../data/Faker_Data/test.csv")

In [81]:
X_test = df_test.drop(columns= ["attrition"])
y_test = df_test.attrition

In [84]:
y_test = y_test.map({'Stayed': 0, 'Left': 1})

In [83]:
y_pred = best_model.predict(X_test)

In [85]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8851111111111111


In [86]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
print(cm)

[[11220   764]
 [ 1304  4712]]


In [87]:
import pickle

# Save
with open("best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)