In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import joblib

# Default size applied to every matplotlib figure in this notebook,
# given as (width, height) in inches.
plt.rcParams['figure.figsize'] = (20, 15)

In [2]:
# Load the employee data set with EmployeeID as the row index, then show it
# as the cell's output.
df = pd.read_csv(
    "general_data.csv",
    index_col="EmployeeID",
)
df

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,Gender,JobLevel,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
EmployeeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,Female,1,...,1.0,Y,11,8,0,1.0,6,1,0,0
2,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,Female,1,...,0.0,Y,23,8,1,6.0,3,5,1,4
3,32,No,Travel_Frequently,Research & Development,17,4,Other,1,Male,4,...,1.0,Y,15,8,3,5.0,2,5,0,3
4,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,Male,3,...,3.0,Y,11,8,3,13.0,5,8,7,5
5,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,Male,1,...,4.0,Y,12,8,2,9.0,2,6,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4406,42,No,Travel_Rarely,Research & Development,5,4,Medical,1,Female,1,...,3.0,Y,17,8,1,10.0,5,3,0,2
4407,29,No,Travel_Rarely,Research & Development,2,4,Medical,1,Male,1,...,2.0,Y,15,8,0,10.0,2,3,0,2
4408,25,No,Travel_Rarely,Research & Development,25,2,Life Sciences,1,Male,2,...,0.0,Y,20,8,0,5.0,4,4,1,2
4409,42,No,Travel_Rarely,Sales,18,2,Medical,1,Male,1,...,0.0,Y,14,8,1,10.0,2,9,7,8


In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV


from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, OneHotEncoder, StandardScaler

In [4]:
# Reload the CSV so the cell does not depend on earlier edits to `df`, and
# drop the two columns considered unnecessary.
# NOTE(review): both columns look constant in the preview (1 and 8) - confirm.
df = (
    pd.read_csv("general_data.csv", index_col="EmployeeID")
    .drop(columns=['EmployeeCount', 'StandardHours'])
)

# Partition the column names by dtype: object columns are handled as
# categorical features, every other column as a numerical feature.
is_object = df.dtypes == 'object'
cat_cols = [name for name, flag in is_object.items() if flag]
cat_cols.remove('Attrition')  # the target column is not a feature
num_cols = [name for name, flag in is_object.items() if not flag]

# Convert all categorical feature columns to the pandas category dtype.
df = df.astype({name: 'category' for name in cat_cols})

# Features are every column except the target (Index.difference returns the
# names sorted); the target is the Attrition label itself.
X = df[df.columns.difference(['Attrition'])]
y = df['Attrition']

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [5]:
# Numerical features: fill gaps with the column median, standardise, then
# expand with the degree-2 polynomial terms.
numerical = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2))])

# Categorical features: replace gaps with the literal 'missing', then one-hot
# encode. handle_unknown='ignore' matters here: a category that was absent
# from the rows used for fitting (including 'missing' itself when the fitting
# rows had no gaps) is encoded as all zeros instead of making transform() and
# predict() raise, which is what the default handle_unknown='error' does.
categorical = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each group of columns (selected by name) to its own sub-pipeline.
column_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical, num_cols),
        ('cat', categorical, cat_cols)])

# Full model: preprocessing followed by a k-nearest-neighbours classifier.
# The hyper-parameters of the 'classifier' step are tuned later via grid search.
clf = Pipeline(steps=[('preprocessor', column_preprocessor),
                      ('classifier', KNeighborsClassifier())])

In [9]:
# Fit the preprocessing step on the training split only and persist it to a
# .pkl file. fit() returns the fitted transformer itself, so it can be passed
# straight to joblib.dump. Note that `column_preprocessor` is the very same
# object that is embedded in `clf`.
joblib.dump(column_preprocessor.fit(X_train, y_train), 'preprocessor.pkl')

['preprocessor.pkl']

In [6]:
# Ask scikit-learn to render estimators as diagrams, then display the full
# pipeline as this cell's output.
from sklearn import set_config
set_config(display='diagram')
clf

In [7]:
# Hyper-parameter grid for the 'classifier' step of `clf`:
# 4 neighbour counts x 3 Minkowski powers = 12 candidates.
param_dict = dict(
    classifier__n_neighbors=list(range(1, 20, 5)),  # 1, 6, 11, 16
    classifier__p=[1, 2, 3],
)

# Exhaustive search with 3-fold cross-validation, using all CPU cores.
# NOTE(review): the output recorded for this cell shows nan scores for some
# candidates - worth investigating, e.g. by rerunning with error_score='raise'.
grid = GridSearchCV(
    estimator=clf,
    param_grid=param_dict,
    cv=3,
    verbose=1,
    n_jobs=-1,
)

# fit() returns the fitted search object itself, so `best_model` and `grid`
# refer to the same GridSearchCV instance.
best_model = grid.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


        nan 0.84212018        nan        nan 0.84212018        nan]


In [8]:
# Take the refitted winner of the grid search and score it on the held-out rows.
best_knn = best_model.best_estimator_
y_pred = best_knn.predict(X_test)

# Print F1 (micro-averaged, which for single-label predictions equals plain
# accuracy), then macro-averaged precision and recall, one value per line.
for metric, averaging in (
    (f1_score, 'micro'),
    (precision_score, "macro"),
    (recall_score, "macro"),
):
    print(metric(y_test, y_pred, average=averaging))

# Per-class breakdown of precision, recall, F1 and support.
print(classification_report(y_test, y_pred))


0.9897959183673469
0.9850981767180926
0.9766991127573434
              precision    recall  f1-score   support

          No       0.99      1.00      0.99       741
         Yes       0.98      0.96      0.97       141

    accuracy                           0.99       882
   macro avg       0.99      0.98      0.98       882
weighted avg       0.99      0.99      0.99       882



In [17]:
# Persist the trained model to a pickle file. `best_model` is the object
# returned by grid.fit(...), i.e. the whole fitted grid search rather than
# only its best estimator.
joblib.dump(value=best_model, filename='best_knn_model.pkl')

['best_knn_model.pkl']

['Yes']
