In [85]:
#General
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import figure
import matplotlib.cm as cm

#Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

#Metrics
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold, StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer

pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows", None)


In [2]:
df = pd.read_csv("Attrition Data.csv", sep = ',')

In [3]:
df1 = df.copy()

# DATA PREPARATION

## DATA TRANSFORMATION

In [4]:
attrition = {'Yes' : 1,
            'No' : 0}

df1['Attrition'] = df1['Attrition'].map(attrition)

In [5]:
Department = {'Sales':0, 
              'Research & Development':1, 
              'Human Resources':2}

df1['Department'] = df1['Department'].map(Department)

In [6]:
Education_field = {'Life Sciences':1, 
                   'Medical':2, 
                   'Marketing':3,
                   'Technical Degree':4, 
                   'Human Resources':5, 
                   'Other':0}

df1['EducationField'] = df1['EducationField'].map(Education_field)

In [7]:
Marital_status = {'Single':0,
                  'Married':1, 
                  'Divorced':2}

df1['MaritalStatus'] = df1['MaritalStatus'].map(Marital_status)

In [8]:
df1 = df1[['Age', 'Department', 'DistanceFromHome', 'Education',
       'EducationField', 'EnvironmentSatisfaction', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked',
       'WorkLifeBalance', 'YearsAtCompany', 'Attrition']]

In [9]:
df1.head()

Unnamed: 0,Age,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,JobSatisfaction,MaritalStatus,MonthlyIncome,NumCompaniesWorked,WorkLifeBalance,YearsAtCompany,Attrition
0,41,0,1,2,1,2,4,0,5993,8,1,6,1
1,49,1,8,1,1,3,2,1,5130,1,3,10,0
2,37,1,2,2,0,4,3,0,2090,6,3,0,1
3,33,1,3,4,1,4,3,1,2909,1,3,8,0
4,27,1,2,1,2,1,2,1,3468,9,3,2,0


# MODEL DEVELOPMENT 

In [37]:
# Printing the relevant scores

def print_scores(test_data, test_pred, test_prob):
    # ROC- AUC score
    print("ROC-AUC score  test dataset:  \t", roc_auc_score(test_data,test_prob))
    
    #Precision score
    print("precision score  test dataset:  \t", precision_score(test_data,test_pred))

    #Recall Score
    print("Recall score  test dataset:  \t", recall_score(test_data,test_pred))

    #f1 score
    print("f1 score  test dataset :  \t", f1_score(test_data,test_pred)) 
    
    #Accuracy score
    print('Model accuracy score: {0:0.4f}\t'. format(accuracy_score(test_data, test_pred)))

In [53]:
# Tuning using grid 
# Grid Search CV

def grid_tuning(model_element, param, fold, refit_score):
    
    #scoring elements
    scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score)
    }
    #specifying all hyperparameters with possible values
    
    #tuning weight for minority class then weight for majority class will be 1-weight of minority class
    
    grid = GridSearchCV(estimator = model_element, param_grid = param, scoring = scorers, cv = fold, 
                        refit=refit_score,n_jobs = -1, return_train_score = True)
    
    #train model to learn relationships between x and y
    grid.fit(X_train,y_train)
    
    #Printing the best hyperparameters
    print("Best refit score:", refit_score)
    print("Best F1 score:", grid.best_score_)
    print("Best hyperparameter:", grid.best_params_)
    

## 1. VOTING CLASSIFIER

In [11]:
X = df1.drop('Attrition', axis= 1)

In [12]:
y = df1['Attrition']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [15]:
#Initialize the models for voting classifier
lr = LogisticRegression()
rf = RandomForestClassifier()
svm = SVC()

In [19]:
#Voting classifier
voting = VotingClassifier(
    estimators=[('logistics_regression', lr), ('random_forest', rf), ('support_vector_machine', svm)], 
    voting='hard',
    weights = [1,1,1] # this param is used for assigning the weights the 
)

In [21]:
#Printing the best scores
for clf in (lr, rf, svm, voting):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression 0.8288043478260869
RandomForestClassifier 0.8288043478260869
SVC 0.8260869565217391
VotingClassifier 0.8260869565217391


## 2. AVERAGING 

For a data point that we are trying to predict, multiple predictions are made by various models. The average of the model predictions is the final prediction that we consider.

## 3. WEIGHTED AVERAGE

A weighted average ensemble model allows multiple models to contribute to the prediction based on how good the model is. If a model does better on the dataset in general, we will give it a higher weight. This generalization will help reduce bias and improve overall performance.

## 4. BAGGING CLASSIFIER

In [34]:
bag = BaggingClassifier(DecisionTreeClassifier(), n_estimators = 250, max_samples = 100,
                       bootstrap = True, random_state = 42)

In [35]:
bag.fit(X_train, y_train)

In [36]:
y_pred_bag = bag.predict(X_test)

In [38]:
#predited probabilities
y_pred_prob_bag = bag.predict_proba(X_test)[:,1]

In [41]:
print_scores(y_test, y_pred_bag, y_pred_prob_bag)

ROC-AUC score  test dataset:  	 0.7558593750000002
precision score  test dataset:  	 0.625
Recall score  test dataset:  	 0.078125
f1 score  test dataset :  	 0.1388888888888889
Model accuracy score: 0.8315	


In [54]:
bag_model = BaggingClassifier()

In [55]:
# Create 5 fold cv
fold = StratifiedShuffleSplit(n_splits = 5, test_size = 0.2, random_state = 42)

In [56]:
#Setting the params for RF
param = {
    'n_estimators' : [100, 200, 300, 400, 500],
    }

In [59]:
grid = grid_tuning(bag_model, param, fold, 'precision_score')

Best refit score: precision_score
Best F1 score: 0.5221553884711779
Best hyperparameter: {'n_estimators': 400}


In [61]:
#Using the best params from grid
bag2 = BaggingClassifier(DecisionTreeClassifier(), n_estimators = 400,
                       bootstrap = True, random_state = 42)

In [62]:
bag2.fit(X_train, y_train)

In [63]:
y_pred_bag2 = bag2.predict(X_test)

In [64]:
#predited probabilities
y_pred_prob_bag2 = bag2.predict_proba(X_test)[:,1]

In [122]:
print_scores(y_test, y_pred_bag2, y_pred_prob_bag2)

ROC-AUC score  test dataset:  	 0.7440635279605263
precision score  test dataset:  	 0.6428571428571429
Recall score  test dataset:  	 0.140625
f1 score  test dataset :  	 0.23076923076923078
Model accuracy score: 0.8370	


##### Evaluate OOB scores for the same bagging classifier

In [47]:
bag2 = BaggingClassifier(DecisionTreeClassifier(), n_estimators = 250,
                       bootstrap = True, oob_score = True, random_state = 42)

In [48]:
bag2.fit(X_train, y_train)

In [49]:
y_pred_bag2 = bag2.predict(X_test)

In [50]:
#predited probabilities
y_pred_prob_bag2 = bag2.predict_proba(X_test)[:,1]

In [51]:
print_scores(y_test, y_pred_bag2, y_pred_prob_bag2)

ROC-AUC score  test dataset:  	 0.7427785773026317
precision score  test dataset:  	 0.5833333333333334
Recall score  test dataset:  	 0.109375
f1 score  test dataset :  	 0.1842105263157895
Model accuracy score: 0.8315	


In [52]:
bag2.oob_score_

0.8448275862068966

## 5. BOOSTING

#### Adaptive Boosting

Generally, the technique used by AdaBoost is it pays attention to the training data its predecessors under fitted. This results in new predictors concentrating more on these hard cases

Drawback:One of the major drawbacks of this sequential approach is that they cannot be parallelized meaning they cannot be trained on different systems at the same time as each of the predictors requires the training and evaluation of its previous predictors. Hence, they don’t scale like bagging and pasting.

In [79]:
adaboost = AdaBoostClassifier(DecisionTreeClassifier(max_depth = 1),
                             n_estimators = 500,
                             algorithm="SAMME.R",  
                             learning_rate=0.5, 
                             random_state=42)

# SAMME.R - Stagewise Addictive Modelling using a Multiclass Exponential Loss function, .R - real probabilities

In [80]:
adaboost.fit(X_train, y_train)

In [81]:
y_pred_adaboost = adaboost.predict(X_test)

In [82]:
y_pred_prob_adaboost = adaboost.predict_proba(X_test)[:,1]

In [83]:
print_scores(y_test, y_pred_adaboost, y_pred_prob_adaboost)

ROC-AUC score  test dataset:  	 0.7022512335526315
precision score  test dataset:  	 0.5185185185185185
Recall score  test dataset:  	 0.21875
f1 score  test dataset :  	 0.30769230769230765
Model accuracy score: 0.8288	


#### Gradient Boosting Algorithm

In [86]:
grad_boost = GradientBoostingClassifier(max_depth = 1,
                             n_estimators = 500,
                             learning_rate=0.5, 
                             random_state=42)

In [87]:
grad_boost.fit(X_train, y_train)

In [88]:
y_pred_grad_boost = grad_boost.predict(X_test)

In [89]:
y_pred_prob_grad_boost = grad_boost.predict_proba(X_test)[:,1]

In [90]:
print_scores(y_test, y_pred_grad_boost, y_pred_prob_grad_boost)

ROC-AUC score  test dataset:  	 0.7266138980263158
precision score  test dataset:  	 0.5
Recall score  test dataset:  	 0.21875
f1 score  test dataset :  	 0.30434782608695654
Model accuracy score: 0.8261	


#### XG Boosting

In [92]:
xgboost = xgb.XGBClassifier(objective = 'binary:logistic', random_state= 42)

In [93]:
xgboost.fit(X_train, y_train)

In [96]:
y_pred_xgboost = xgboost.predict(X_test)

In [97]:
y_pred_prob_xgboost = xgboost.predict_proba(X_test)[:,1]

In [98]:
print_scores(y_test, y_pred_xgboost, y_pred_prob_xgboost)

ROC-AUC score  test dataset:  	 0.6774259868421052
precision score  test dataset:  	 0.48148148148148145
Recall score  test dataset:  	 0.203125
f1 score  test dataset :  	 0.2857142857142857
Model accuracy score: 0.8234	


In [103]:
#For tuning the model
xg_boost1 = xgb.XGBClassifier(objective = 'binary:logistic', random_state= 42)

In [104]:
params_xg = {
    'n_estimators':[50, 100, 150, 200],
    'min_child_weight': [5,6,7,8],
    'gamma': [0,0.1,0.2],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'max_depth': range(3,10),
    'score_func' : [precision_score]
}

In [105]:
fold_xg = StratifiedShuffleSplit(n_splits = 5, test_size = 0.2, random_state = 42)

In [None]:
grid_xg = grid_tuning(xg_boost1, params_xg, fold_xg, 'precision_score')

In [117]:
#Using the best params from grid
xg_boost2 = xgb.XGBClassifier(objective = 'binary:logistic', gamma = 0, learning_rate = 0.01, 
                              max_depth = 8, min_child_weight = 7, n_estimators = 150,
                              random_state= 42)

In [118]:
xg_boost2.fit(X_train, y_train)

In [119]:
y_pred_xg_boost2 = xg_boost2.predict(X_test)

In [120]:
#predited probabilities
y_pred_prob_xg_boost2 = xg_boost2.predict_proba(X_test)[:,1]

In [121]:
print_scores(y_test, y_pred_xg_boost2, y_pred_prob_xg_boost2)

ROC-AUC score  test dataset:  	 0.7477384868421053
precision score  test dataset:  	 0.6363636363636364
Recall score  test dataset:  	 0.109375
f1 score  test dataset :  	 0.18666666666666665
Model accuracy score: 0.8342	
