In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelBinarizer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


In [3]:
df_train = pd.read_csv("../data/Faker_Data/train.csv")

In [3]:
df_train = df_train.drop(columns=["employee_id"])

In [4]:
df_train

Unnamed: 0,employee_id,age,gender,years_at_company,job_role,monthly_income,work_life_balance,job_satisfaction,performance_rating,number_of_promotions,...,job_level,company_size,remote_work,leadership_opportunities,innovation_opportunities,company_reputation,employee_recognition,attrition,age_groups,age_before_working
0,51005,20,Male,0,Healthcare,3399,Poor,Low,Average,0,...,Entry,Small,No,No,Yes,Poor,Low,Left,18-25,20
1,11454,34,Female,13,Finance,14568,Good,Medium,Average,3,...,Senior,Large,No,Yes,Yes,Good,High,Stayed,26-35,21
2,9692,44,Male,16,Technology,13291,Good,Very High,Excellent,4,...,Senior,Small,No,Yes,Yes,Excellent,High,Stayed,36-45,28
3,51993,37,Female,6,Technology,7699,Good,High,High,1,...,Mid,Medium,Yes,Yes,Yes,Good,High,Stayed,36-45,31
4,23532,38,Male,3,Education,7698,Good,Very High,Excellent,1,...,Mid,Large,No,No,No,Good,Very High,Stayed,36-45,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71995,6266,21,Male,0,Finance,4025,Excellent,High,High,1,...,Entry,Medium,No,No,No,Good,Medium,Stayed,18-25,21
71996,54887,44,Female,1,Technology,9459,Good,Very High,Average,0,...,Mid,Large,Yes,No,Yes,Good,Very High,Stayed,36-45,43
71997,76821,24,Female,4,Healthcare,3745,Good,High,Average,0,...,Entry,Medium,No,No,No,Good,Medium,Stayed,18-25,20
71998,861,48,Female,27,Education,6001,Excellent,High,High,10,...,Mid,Medium,No,No,No,Good,High,Stayed,46-55,21


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72000 entries, 0 to 71999
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   employee_id               72000 non-null  int64 
 1   age                       72000 non-null  int64 
 2   gender                    72000 non-null  object
 3   years_at_company          72000 non-null  int64 
 4   job_role                  72000 non-null  object
 5   monthly_income            72000 non-null  int64 
 6   work_life_balance         72000 non-null  object
 7   job_satisfaction          72000 non-null  object
 8   performance_rating        72000 non-null  object
 9   number_of_promotions      72000 non-null  int64 
 10  overtime                  72000 non-null  object
 11  distance_from_home        72000 non-null  int64 
 12  education_level           72000 non-null  object
 13  marital_status            72000 non-null  object
 14  number_of_dependents  

In [6]:
# split data to features and target
X = df_train.drop(columns=["attrition"])
y = df_train.attrition

In [7]:
X.work_life_balance.unique()

array(['Poor', 'Good', 'Fair', 'Excellent'], dtype=object)

In [8]:
X.job_satisfaction.unique()

array(['Low', 'Medium', 'Very High', 'High'], dtype=object)

In [9]:
X.performance_rating.unique()

array(['Average', 'Excellent', 'High', 'Low'], dtype=object)

In [10]:
# split the numerical columns from the category columns
numeric_cols = X.select_dtypes(include=['number']).columns
category_cols = X.select_dtypes(exclude=['number']).columns
print(numeric_cols)
print(category_cols)

Index(['employee_id', 'age', 'years_at_company', 'monthly_income',
       'number_of_promotions', 'distance_from_home', 'number_of_dependents',
       'age_before_working'],
      dtype='object')
Index(['gender', 'job_role', 'work_life_balance', 'job_satisfaction',
       'performance_rating', 'overtime', 'education_level', 'marital_status',
       'job_level', 'company_size', 'remote_work', 'leadership_opportunities',
       'innovation_opportunities', 'company_reputation',
       'employee_recognition', 'age_groups'],
      dtype='object')


In [11]:
y = y.map({'Stayed': 0, 'Left': 1})

In [12]:
X.employee_recognition.unique()

array(['Low', 'High', 'Very High', 'Medium'], dtype=object)

In [13]:
X.company_reputation.unique()

array(['Poor', 'Good', 'Excellent', 'Fair'], dtype=object)

In [14]:
X.age_groups.unique()

array(['18-25', '26-35', '36-45', '46-55', '55+'], dtype=object)

Index(['gender', 'job_role', 'work_life_balance', 'job_satisfaction',
       'performance_rating', 'overtime', 'education_level', 'marital_status',
       'job_level', 'company_size', 'remote_work', 'leadership_opportunities',
       'innovation_opportunities', 'company_reputation',
       'employee_recognition', 'age_groups'],
      dtype='object')

In [15]:
nominal_cols = ['job_role', 'marital_status']
binary_cols = ['gender', 'overtime', 'remote_work', 'leadership_opportunities', 'innovation_opportunities']
ordinal_cols = ['work_life_balance', 'job_satisfaction', 'performance_rating',
                'education_level','job_level','company_size','company_reputation'
                ,'employee_recognition', 'age_groups']

In [17]:
for col in ordinal_cols:
    print(col,X[col].unique())

work_life_balance ['Poor' 'Good' 'Fair' 'Excellent']
job_satisfaction ['Low' 'Medium' 'Very High' 'High']
performance_rating ['Average' 'Excellent' 'High' 'Low']
education_level ['Associate Degree' 'Master’s Degree' 'PhD' 'Bachelor’s Degree'
 'High School']
job_level ['Entry' 'Senior' 'Mid']
company_size ['Small' 'Large' 'Medium']
company_reputation ['Poor' 'Good' 'Excellent' 'Fair']
employee_recognition ['Low' 'High' 'Very High' 'Medium']
age_groups ['18-25' '26-35' '36-45' '46-55' '55+']


In [18]:
numerical_pipeline = Pipeline([('scaler', StandardScaler())])

one_hot_encode = OneHotEncoder()

binary_pipeline = Pipeline([("binary", OrdinalEncoder(categories=[
    ['Female', 'Male'],
    ['No', 'Yes'],
    ['No', 'Yes'],
    ['No', 'Yes'],
    ['No', 'Yes'],
]))])


ordinal_pipeline = Pipeline([
    ("ordinal", OrdinalEncoder(categories=[
        ['Poor', 'Fair', 'Good', 'Excellent'], #work life balance
        ['Low', 'Medium','High', 'Very High',], # job satisfaction
        ['Low', 'Average', 'High','Excellent'], # Performance Rating
        ['High School', 'Associate Degree', 'Bachelor’s Degree', 'Master’s Degree', 'PhD'], #Educational Level
        ['Entry', 'Mid', 'Senior'], # Job Level
        ['Small', 'Medium', 'Large'], #Company Size
        ['Poor', 'Fair','Good', 'Excellent'], # Company Reputation
        ['Low', 'Medium','High', 'Very High'], #employee recognition
        ['18-25', '26-35', '36-45', '46-55', '55+'] # age groups
    ]))
])

In [19]:
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numeric_cols),
    ('ord', ordinal_pipeline, ordinal_cols),
    ('bin', binary_pipeline, binary_cols),
    ('one', one_hot_encode, nominal_cols)
   
])

In [21]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ("classifier", RandomForestClassifier())  
])

In [22]:
model = pipeline.fit(X, y)

In [23]:
X_transformed = pipeline.named_steps['preprocessor'].transform(X)

In [24]:
X_encoded = model.named_steps["preprocessor"].transform(X)

# get the column names from the ColumnTransformer
feature_names = model.named_steps["preprocessor"].get_feature_names_out()

df_encoded = pd.DataFrame(X_encoded, columns=feature_names)
print(df_encoded)

       num__employee_id  num__age  num__years_at_company  num__monthly_income  \
0              0.230538 -1.803826              -1.198257            -1.191488   
1             -1.289845 -0.364632               0.395226             1.620046   
2             -1.357578  0.663364               0.762953             1.298591   
3              0.268518 -0.056233              -0.462803            -0.109064   
4             -0.825553  0.046567              -0.830530            -0.109316   
...                 ...       ...                    ...                  ...   
71995         -1.489277 -1.701026              -1.198257            -1.033907   
71996          0.379766  0.663364              -1.075681             0.333975   
71997          1.222933 -1.392627              -0.707954            -1.104391   
71998         -1.697051  1.074563               2.111285            -0.536496   
71999         -1.122933  0.560565              -1.198257            -1.070911   

       num__number_of_promo

In [25]:
df = pd.DataFrame(df_encoded)
df

Unnamed: 0,num__employee_id,num__age,num__years_at_company,num__monthly_income,num__number_of_promotions,num__distance_from_home,num__number_of_dependents,num__age_before_working,ord__work_life_balance,ord__job_satisfaction,...,bin__leadership_opportunities,bin__innovation_opportunities,one__job_role_Education,one__job_role_Finance,one__job_role_Healthcare,one__job_role_Media,one__job_role_Technology,one__marital_status_Divorced,one__marital_status_Married,one__marital_status_Single
0,0.230538,-1.803826,-1.198257,-1.191488,-0.803058,-0.029016,-1.039805,-0.954211,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,-1.289845,-0.364632,0.395226,1.620046,0.698596,-0.313598,-1.039805,-0.831426,2.0,1.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-1.357578,0.663364,0.762953,1.298591,1.199147,-0.740470,2.587668,0.028074,2.0,3.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.268518,-0.056233,-0.462803,-0.109064,-0.302507,-1.167342,1.136679,0.396430,2.0,2.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,-0.825553,0.046567,-0.830530,-0.109316,-0.302507,-0.455888,0.411184,0.887573,2.0,3.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71995,-1.489277,-1.701026,-1.198257,-1.033907,-0.302507,-0.740470,-1.039805,-0.831426,3.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
71996,0.379766,0.663364,-1.075681,0.333975,-0.803058,-1.167342,-1.039805,1.869858,2.0,3.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
71997,1.222933,-1.392627,-0.707954,-1.104391,-0.803058,0.255565,-1.039805,-0.954211,2.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
71998,-1.697051,1.074563,2.111285,-0.536496,4.202454,0.824728,1.136679,-0.831426,3.0,2.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [26]:
df.columns

Index(['num__employee_id', 'num__age', 'num__years_at_company',
       'num__monthly_income', 'num__number_of_promotions',
       'num__distance_from_home', 'num__number_of_dependents',
       'num__age_before_working', 'ord__work_life_balance',
       'ord__job_satisfaction', 'ord__performance_rating',
       'ord__education_level', 'ord__job_level', 'ord__company_size',
       'ord__company_reputation', 'ord__employee_recognition',
       'ord__age_groups', 'bin__gender', 'bin__overtime', 'bin__remote_work',
       'bin__leadership_opportunities', 'bin__innovation_opportunities',
       'one__job_role_Education', 'one__job_role_Finance',
       'one__job_role_Healthcare', 'one__job_role_Media',
       'one__job_role_Technology', 'one__marital_status_Divorced',
       'one__marital_status_Married', 'one__marital_status_Single'],
      dtype='object')

In [27]:
models = {
    "RandomForest": (RandomForestClassifier(random_state=42, max_features='sqrt'), {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [10, 15, 20],
        'classifier__min_samples_split': [10, 20],
        'classifier__min_samples_leaf': [4, 8]
    }),
    "KNN": (KNeighborsClassifier(), {
        'classifier__n_neighbors': [3, 5, 7],
        'classifier__weights': ['uniform', 'distance']
    }),
    "XGBoost": (XGBClassifier(eval_metric='logloss', random_state=42), {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [3, 5, 7],
        'classifier__learning_rate': [0.05, 0.1, 0.2]
    })
}

In [28]:

results = {}

for model_name, (model, params) in models.items():
    print(f"\nRunning GridSearch for {model_name}...")
    
    # update classifier in pipeline
    pipeline.set_params(classifier=model)
    
    # run grid search
    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=params,
        cv=5,
        scoring='accuracy',
        n_jobs=-1
    )
    
    grid.fit(X, y)
    
    # store results
    results[model_name] = {
        "best_score": grid.best_score_,
        "best_params": grid.best_params_,
        "best_estimator": grid.best_estimator_
    }



Running GridSearch for RandomForest...

Running GridSearch for KNN...

Running GridSearch for XGBoost...


In [29]:
results

{'RandomForest': {'best_score': np.float64(0.8898472222222222),
  'best_params': {'classifier__max_depth': 15,
   'classifier__min_samples_leaf': 4,
   'classifier__min_samples_split': 20,
   'classifier__n_estimators': 100},
  'best_estimator': Pipeline(steps=[('preprocessor',
                   ColumnTransformer(transformers=[('num',
                                                    Pipeline(steps=[('scaler',
                                                                     StandardScaler())]),
                                                    Index(['employee_id', 'age', 'years_at_company', 'monthly_income',
         'number_of_promotions', 'distance_from_home', 'number_of_dependents',
         'age_before_working'],
        dtype='object')),
                                                   ('ord',
                                                    Pipeline(steps=[('ordinal',
                                                                     OrdinalEncoder(categories=[['

In [30]:
best_model = grid.best_estimator_

In [31]:
best_model

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('ord', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,"[['Poor', 'Fair', ...], ['Low', 'Medium', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,"[['Female', 'Male'], ['No', 'Yes'], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [32]:
best_model.feature_names_in_

array(['employee_id', 'age', 'gender', 'years_at_company', 'job_role',
       'monthly_income', 'work_life_balance', 'job_satisfaction',
       'performance_rating', 'number_of_promotions', 'overtime',
       'distance_from_home', 'education_level', 'marital_status',
       'number_of_dependents', 'job_level', 'company_size', 'remote_work',
       'leadership_opportunities', 'innovation_opportunities',
       'company_reputation', 'employee_recognition', 'age_groups',
       'age_before_working'], dtype=object)

In [33]:
df_test = pd.read_csv("../data/Faker_Data/test.csv")

In [34]:
X_test = df_test.drop(columns= ["attrition"])
y_test = df_test.attrition

In [35]:
y_test = y_test.map({'Stayed': 0, 'Left': 1})

In [36]:
y_pred = best_model.predict(X_test)

In [37]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8965555555555556


In [86]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
print(cm)

[[11220   764]
 [ 1304  4712]]


In [87]:
import pickle

# Save
with open("best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)