In [93]:
import pandas as pd
from sklearn.model_selection import train_test_split

#load data from csv
df = pd.read_csv(r"D:\Jeeva\ds_course\employee_attrition\employee_attrition_cleaned.csv")

#select top 15 features
top_15_features = ['overtime', 'maritalstatus_single', 'yearsincurrentrole', 'monthlyincome',
                   'jobrole_sales representative', 'yearswithcurrmanager', 'stockoptionlevel',
                   'jobinvolvement', 'businesstravel_travel_frequently', 'jobsatisfaction',
                   'environmentsatisfaction', 'jobrole_laboratory technician', 
                   'jobrole_research director', 'department_research & development']

#separate labels
X = df[top_15_features]
y = df['attrition']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=42)

#apply oversampling as there are more minority cases
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X_test_smp,y_test_smp = smote.fit_resample(X_train,y_train)

X.head(10)

Unnamed: 0,overtime,maritalstatus_single,yearsincurrentrole,monthlyincome,jobrole_sales representative,yearswithcurrmanager,stockoptionlevel,jobinvolvement,businesstravel_travel_frequently,jobsatisfaction,environmentsatisfaction,jobrole_laboratory technician,jobrole_research director,department_research & development
0,1,1,4,5993,0,5,0,3,0,4,2,0,0,0
1,0,0,7,5130,0,7,1,2,1,2,3,0,0,1
2,1,1,0,2090,0,0,0,2,0,3,4,1,0,1
3,1,0,7,2909,0,0,0,3,1,3,4,0,0,1
4,0,0,2,3468,0,2,1,3,0,2,1,1,0,1
5,0,1,7,3068,0,6,0,3,1,4,4,1,0,1
6,1,0,0,2670,0,0,3,4,0,1,3,1,0,1
7,0,0,0,2693,0,0,1,3,0,3,4,1,0,1
8,0,1,7,9526,0,8,0,2,1,3,4,0,0,1
9,0,0,7,5237,0,7,2,3,0,3,3,0,0,1


In [65]:
#using decision tree classifier
from sklearn.linear_model import LogisticRegression

lg = LogisticRegression(random_state=42,class_weight='balanced')
lg.fit(X_test_smp,y_test_smp)

y_pred = lg.predict(X_test)
y_prob = lg.predict_proba(X_test)

from sklearn.metrics import accuracy_score

ac = accuracy_score(y_test,y_pred)
print(ac * 100)

66.66666666666666


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [67]:
#using decision tree classifier
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42,class_weight = "balanced")
dt.fit(X_test_smp,y_test_smp)

y_pred = dt.predict(X_test)
y_prob = dt.predict_proba(X_test)

from sklearn.metrics import accuracy_score

ac = accuracy_score(y_test,y_pred)
print(ac * 100)

74.14965986394559


In [69]:
#using random_forest_classifier

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42,class_weight = "balanced")
rf.fit(X_test_smp,y_test_smp)

y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)

from sklearn.metrics import accuracy_score

ac = accuracy_score(y_test,y_pred)
print(ac * 100)

80.61224489795919


In [91]:
#using XGBClassifier
import joblib
from xgboost import XGBClassifier

xg = XGBClassifier(random_state=42,eval_metric='logloss',n_estimators=200,max_depth=5,learning_rate= 0.1)
xg.fit(X_test_smp,y_test_smp)

y_pred = xg.predict(X_test)
y_prob = xg.predict_proba(X_test)

from sklearn.metrics import accuracy_score

ac = accuracy_score(y_test,y_pred)
print(ac * 100)

# Save model
joblib.dump(xg, 'attrition_xgboost_model.pkl')

89.1156462585034


['attrition_xgboost_model.pkl']

In [83]:
#hyper parameter tuning for XGBClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100,200],
    'max_depth':[3,5],
    'learning_rate':[0.01,0.1],
    'subsample':[0.8,1.0],
    'colsample_bytree':[0.8,1.0]    
}

grid_search  = GridSearchCV(
    estimator = XGBClassifier(random_state=42,eval_metric='logloss'),
    param_grid = param_grid,
    scoring = "accuracy",
    cv=5,
    n_jobs=-1
)
grid_search.fit(X_test_smp,y_test_smp)
print("Best Params:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))


Best Params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
Best CV Score: 0.8844490136863072
Test Accuracy: 0.8775510204081632


In [81]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline  # NOTE: from imblearn, not sklearn

#load data from csv
df = pd.read_csv(r"D:\Jeeva\ds_course\employee_attrition\employee_attrition_cleaned.csv")

#select top 15 features
top_15_features = ['overtime', 'maritalstatus_single', 'yearsincurrentrole', 'monthlyincome',
                   'jobrole_sales representative', 'yearswithcurrmanager', 'stockoptionlevel',
                   'jobinvolvement', 'businesstravel_travel_frequently', 'jobsatisfaction',
                   'environmentsatisfaction', 'jobrole_laboratory technician', 
                   'jobrole_research director', 'department_research & development']

#separate labels
X = df[top_15_features]
y = df['attrition']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,stratify=y,random_state=42)


pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('xgb', XGBClassifier(
        eval_metric='logloss',
        random_state=42
    ))
])

param_grid = {
    'xgb__n_estimators': [50, 100],
    'xgb__max_depth': [3, 5],
    'xgb__learning_rate': [0.01, 0.1]
}


grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',      # better metric for imbalance
    cv=5,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)


y_pred = grid_search.predict(X_test)

print("Best Params:", grid_search.best_params_)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))



Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Params: {'xgb__learning_rate': 0.1, 'xgb__max_depth': 3, 'xgb__n_estimators': 100}

Confusion Matrix:
 [[327  43]
 [ 38  33]]

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.88      0.89       370
           1       0.43      0.46      0.45        71

    accuracy                           0.82       441
   macro avg       0.67      0.67      0.67       441
weighted avg       0.82      0.82      0.82       441

