# Notebook for the project in *TDT4259 - Applied Data Science*

Necessary imports

In [3]:
import sys
import os
from datetime import datetime, timedelta

import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
sys.path.append(".")

from util.util import get_dataset, clean_attrition_dataset, create_additional_columns, remove_duplicates_and_fill_na

We start by preprocessing the columns to be on a concise format.

In [90]:
raw_dataset = get_dataset("train_data.csv")
cleaned_dataset = clean_attrition_dataset(raw_dataset)
dataset_with_more_cols = create_additional_columns(cleaned_dataset)
final_dataset = remove_duplicates_and_fill_na(dataset_with_more_cols)
final_dataset["Work_Duration"] = (final_dataset.Last_Working_Date - final_dataset.Date_Of_Joining).dt.days

In [91]:
final_dataset.head()

Unnamed: 0,Date,Attrition,Emp_ID,Age,Gender,City,Education_Level,Salary,Date_Of_Joining,Last_Working_Date,Joining_Designation,Designation,Total_Business_Value,Quarterly_Rating,Work_Duration
2,2016-03-01,1,1,28,Male,C23,Master,57387,2015-12-24,2016-03-11,1,1,0,2,78
4,2017-12-01,0,2,31,Male,C7,Master,67016,2017-11-06,2017-12-31,2,2,0,1,55
9,2017-04-01,1,4,43,Male,C13,Master,65603,2016-12-07,2017-04-27,2,2,0,1,141
12,2016-03-01,1,5,29,Male,C9,College,46368,2016-01-09,2016-03-07,1,1,0,1,58
17,2017-12-01,0,6,31,Female,C11,Bachelor,78728,2017-07-31,2017-12-31,3,3,0,2,153


Model prototyping

In [105]:
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [93]:
final_dataset = pd.get_dummies(final_dataset, columns=["Gender", "City", "Education_Level"])

In [94]:
final_dataset.head()

Unnamed: 0,Date,Attrition,Emp_ID,Age,Salary,Date_Of_Joining,Last_Working_Date,Joining_Designation,Designation,Total_Business_Value,...,City_C3,City_C4,City_C5,City_C6,City_C7,City_C8,City_C9,Education_Level_Bachelor,Education_Level_College,Education_Level_Master
2,2016-03-01,1,1,28,57387,2015-12-24,2016-03-11,1,1,0,...,0,0,0,0,0,0,0,0,0,1
4,2017-12-01,0,2,31,67016,2017-11-06,2017-12-31,2,2,0,...,0,0,0,0,1,0,0,0,0,1
9,2017-04-01,1,4,43,65603,2016-12-07,2017-04-27,2,2,0,...,0,0,0,0,0,0,0,0,0,1
12,2016-03-01,1,5,29,46368,2016-01-09,2016-03-07,1,1,0,...,0,0,0,0,0,0,1,0,1,0
17,2017-12-01,0,6,31,78728,2017-07-31,2017-12-31,3,3,0,...,0,0,0,0,0,0,0,1,0,0


In [97]:
X_data = final_dataset.drop(columns=["Attrition", "Date", "Date_Of_Joining", "Last_Working_Date"], axis = 1)
y_data = final_dataset["Attrition"]

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=4259)

In [101]:
log = LogisticRegression()
log.fit(X_train, y_train)
log_pred = log.predict(X_test)

print(classification_report(y_test, log_pred))

              precision    recall  f1-score   support

           0       0.84      0.54      0.65       153
           1       0.81      0.95      0.88       324

    accuracy                           0.82       477
   macro avg       0.82      0.74      0.76       477
weighted avg       0.82      0.82      0.80       477



In [104]:
print(log.coef_, log.intercept_)

[[ 5.02468091e-04  4.76240927e-05  9.45376818e-06 -2.48673609e-07
   2.53564985e-07 -5.04864034e-06  4.41862772e-07  4.94076614e-04
   5.97694038e-07  8.07086128e-07  2.59924406e-08  1.64123126e-07
   4.18489684e-08  3.03707977e-08  1.87036468e-07  8.36397859e-08
  -9.72226705e-09 -4.42254835e-08  1.56469406e-07  8.09021359e-08
  -8.61177476e-08  1.62854284e-07  6.25284729e-08  5.38766245e-10
   3.14024645e-08  1.13058427e-07  9.31502200e-08  2.58079410e-09
   8.39354275e-08 -7.88263744e-09  1.66076083e-07 -3.23816929e-08
  -1.30884093e-08  3.42854525e-08 -2.63359271e-08  2.43857090e-08
   1.12863465e-07 -5.21551287e-08  1.86467659e-08  4.15073117e-07
   6.07842913e-07  3.81864136e-07]] [1.40478017e-06]


In [102]:
svc = SVC()
svc.fit(X_train, y_train)
svc_pred = svc.predict(X_test)

print(classification_report(y_test, svc_pred))

              precision    recall  f1-score   support

           0       0.79      0.60      0.68       153
           1       0.83      0.92      0.87       324

    accuracy                           0.82       477
   macro avg       0.81      0.76      0.78       477
weighted avg       0.82      0.82      0.81       477



In [103]:
forest = RandomForestClassifier()
forest.fit(X_train, y_train)
forest_pred = forest.predict(X_test)

print(classification_report(y_test, forest_pred))

              precision    recall  f1-score   support

           0       0.76      0.67      0.71       153
           1       0.85      0.90      0.88       324

    accuracy                           0.83       477
   macro avg       0.81      0.79      0.79       477
weighted avg       0.82      0.83      0.82       477

