<b>Data mining project - 2020/21</b><br>
<b>Author</b>: [Alexandra Bradan](https://github.com/alexandrabradan)<br>
<b>Python version</b>: 3.x<br>
<b>Last update: 07/01/2021<b>

In [346]:
%matplotlib inline

# general libraries
import sys
import math
import operator
import itertools
import pydotplus
import collections
import missingno as msno
from pylab import MaxNLocator
from collections import Counter
from collections import defaultdict
from IPython.display import Image

# pandas libraries
import pandas as pd
from pandas import DataFrame
from pandas.testing import assert_frame_equal

# visualisation libraries
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import qqplot

# numpy libraries
import numpy as np
from numpy import std
from numpy import mean
from numpy import arange
from numpy import unique
from numpy import percentile

# scipy libraries
import scipy.stats as stats
from scipy.stats import kstest
from scipy.stats import normaltest

# sklearn libraries
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.experimental import enable_iterative_imputer  # explicitly require this experimental feature
from sklearn.impute import IterativeImputer

from sklearn import tree
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.pipeline import make_pipeline as imbmake_pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import RepeatedStratifiedKFold 
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score, fbeta_score, recall_score, precision_score, classification_report, roc_auc_score 

In [347]:
data_directory = "../../../data/"
plot_directory = "../../../plots/DataUnderstanding/"
TR_file = data_directory + "Train_HR_Employee_Attrition.csv"
TR_cleaned_file = data_directory + "Numerical_Encoding_Train_HR_Employee_Attrition.csv"
TS_file = data_directory + "Numerical_Encoding_Test_HR_Employee_Attrition.csv"

In [348]:
df_cleaned = pd.read_csv(TR_cleaned_file, sep=",") 
df_ts = pd.read_csv(TS_file, sep=",") 

In [349]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 883 entries, 0 to 882
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       883 non-null    int64  
 1   Attrition                 883 non-null    int64  
 2   BusinessTravel            883 non-null    int64  
 3   DistanceFromHome          883 non-null    int64  
 4   Education                 883 non-null    int64  
 5   EnvironmentSatisfaction   883 non-null    int64  
 6   Gender                    883 non-null    int64  
 7   JobInvolvement            883 non-null    int64  
 8   JobLevel                  883 non-null    int64  
 9   JobRole                   883 non-null    int64  
 10  JobSatisfaction           883 non-null    int64  
 11  MonthlyIncome             883 non-null    int64  
 12  NumCompaniesWorked        883 non-null    int64  
 13  OverTime                  883 non-null    int64  
 14  PercentSal

In [350]:
df_ts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219 entries, 0 to 218
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       219 non-null    int64  
 1   Attrition                 219 non-null    int64  
 2   BusinessTravel            219 non-null    int64  
 3   DistanceFromHome          219 non-null    int64  
 4   Education                 219 non-null    int64  
 5   EnvironmentSatisfaction   219 non-null    int64  
 6   Gender                    219 non-null    int64  
 7   JobInvolvement            219 non-null    int64  
 8   JobLevel                  219 non-null    int64  
 9   JobRole                   219 non-null    int64  
 10  JobSatisfaction           219 non-null    int64  
 11  MonthlyIncome             219 non-null    int64  
 12  NumCompaniesWorked        219 non-null    int64  
 13  OverTime                  219 non-null    int64  
 14  PercentSal

In [351]:
print(df_cleaned.shape)
print(df_ts.shape)

(883, 24)
(219, 24)


<h2> Discretisation approach </h2> 
Approaches to transform continuous variables into discrete ones. This process is also known as <b>binning</b>, with each bin being each interval. Discretization methods fall into 2 categories: 

- supervised: do not use any information, other than the variable distribution, to create the contiguous bins in which the values will be placed;
- unsupervised: typically use target information in order to create bins or intervals.

Since we are dealying with DT it is natural to use a **supervised discretisation method** with them:

<u>Step 1</u>: First it trains a decision tree of limited depth (2, 3 or 4) using the variable we want to discretize to predict the target;

<u>Step 2</u>: The original variable values are then replaced by the probability returned by the tree. The probability is the same for all the observations within a single bin, thus replacing by the probability is equivalent to grouping the observations within the cut-off decided by the decision tree.

**Advantages** :
- The probabilistic predictions returned decision tree are monotonically related to the target.
- The new bins show decreased entropy, this is the observations within each bucket/bin are more similar to themselves than to those of other buckets/bins.
- The tree finds the bins automatically.

**Disadvantages**:
- It may cause over-fitting
- More importantly, some tuning of tree parameters might need to be done to obtain the optimal splits (e.g., depth, the minimum number of samples in one partition, the maximum number of partitions, and a minimum information gain). This it can be time-consuming.

<u>Features to discretize</u>:
- Age
- DistanceFromHome
- YearsAtCompany
- YearsInCurrentRole
- NumCompaniesWorked
- MonthlyIncome
- MonthlyHours

- PercentSalaryHike
- TaxRate

<h2> Training discretisation </h2>

In [352]:
X_train = df_cleaned.copy()
y_train = df_cleaned['Attrition']

In [353]:
def discretize_based_on_histogram_distribution(curr_column, bins, labels):
    print("%s max_train" %curr_column, X_train[curr_column].max(), "%s min_train" % curr_column, X_train[curr_column].min())
    print("%s max_test" %curr_column, df_ts[curr_column].max(), "%s min_test" % curr_column, df_ts[curr_column].min())
    print(pd.cut(X_train[curr_column], bins, labels=labels, include_lowest=True, right=False).unique())
    X_train[curr_column] = pd.cut(X_train[curr_column], bins, include_lowest=True, right=False)
    print(pd.cut(df_ts[curr_column], bins, labels=labels, include_lowest=True, right=False).unique())
    df_ts[curr_column] = pd.cut(df_ts[curr_column], bins, labels=labels, include_lowest=True, right=False)
    print("%s train_unique" % curr_column, sorted(X_train[curr_column].unique()))
    print("%s test_unique" % curr_column, sorted(df_ts[curr_column].unique()))

SyntaxError: positional argument follows keyword argument (<ipython-input-353-f193d6d4a983>, line 5)

<h6>Age </h6>
Build a classification tree using the Age to predict Attrition in order to discretise the age variable

In [None]:
bins = list(range(10, 71, 10))
labels= list(range(1, len(bins)))
discretize_based_on_histogram_distribution("Age", bins, labels)

<h6>DistanceFromHome </h6>
Build a classification tree using the variable to predict Attrition in order to discretise it

In [None]:
bins = list(range(0, 31, 5))
labels= list(range(1, len(bins)))
discretize_based_on_histogram_distribution("DistanceFromHome", bins, labels)

<h6> YearsAtCompany </h6>

In [None]:
to_drop_indexes = df_ts.index[df_ts["YearsAtCompany"] > 20]
df_ts.drop(list(to_drop_indexes), axis=0, inplace=True)
df_ts.reset_index(drop=True, inplace=True)
print("dropped rows = ", len(to_drop_indexes), sep="\t")

df_ts.shape

In [None]:
bins = list(range(0, 26, 5), )
labels= list(range(1, len(bins)))
discretize_based_on_histogram_distribution("YearsAtCompany", bins, labels)

<h6> YearsInCurrentRole </h6>

In [None]:
to_drop_indexes = df_ts.index[df_ts["YearsInCurrentRole"] > 16]
df_ts.drop(list(to_drop_indexes), axis=0, inplace=True)
df_ts.reset_index(drop=True, inplace=True)
print("dropped rows = ", len(to_drop_indexes), sep="\t")

df_ts.shape

In [None]:
bins = list(range(0, 21, 5))
labels= list(range(1, len(bins)))
discretize_based_on_histogram_distribution("YearsInCurrentRole", bins, labels)

<h6> NumCompaniesWorked </h6>

In [None]:
bins = list(range(0, 11, 5))
labels= list(range(1, len(bins)))
discretize_based_on_histogram_distribution("NumCompaniesWorked", bins, labels)

NumCompaniesWorked is a discretisation candidate

<h6> MonthlyIncome </h6>

In [None]:
bins = list(range(0, 30000, 2500))
labels= list(range(1, len(bins)))
discretize_based_on_histogram_distribution("MonthlyIncome", bins, labels)

<h6> MonthlyHours </h6>

In [None]:
to_drop_indexes = df_ts.index[df_ts["MonthlyHours"] > 590.9767441860465]
df_ts.drop(list(to_drop_indexes), axis=0, inplace=True)
df_ts.reset_index(drop=True, inplace=True)
print("dropped rows = ", len(to_drop_indexes), sep="\t")

df_ts.shape

In [None]:
bins = list(range(0, 601, 200))
labels= list(range(1, len(bins)))
discretize_based_on_histogram_distribution("MonthlyHours", bins, labels)

<h6> PercentSalaryHike </h6>

In [None]:
bins = list(range(0, 31, 5))
labels= list(range(1, len(bins)))
discretize_based_on_histogram_distribution("PercentSalaryHike", bins, labels)

<h6> TaxRate </h6>

In [None]:
bins = list(np.linspace(0, 1, 11))
print("bins", bins)
labels= list(range(1, len(bins)))
discretize_based_on_histogram_distribution("TaxRate", bins, labels)

<h6> OverallSatisfaction </h6>

In [None]:
bins = list(np.linspace(0, 5, 11))
print("bins", bins)
labels= list(range(1, len(bins)))
discretize_based_on_histogram_distribution("OverallSatisfaction", bins, labels)

<h2> Discretize variables and save them on new file </h2>

In [None]:
print(df_cleaned.shape)
print(df_ts.shape)

In [None]:
df1 = df_cleaned.copy()
df2 = df_ts.copy()

In [None]:
print(df1.shape)
print(df2.shape)

In [None]:
df1.to_csv(data_directory + "Discretized_HISTOGRAM_Numerical_Encoding_Train_HR_Employee_Attrition.csv", index=False, header=True)
df2.to_csv(data_directory + "Discretized_HISTOGRAM_Numerical_Encoding_Test_HR_Employee_Attrition.csv", index=False, header=True)

In [None]:
df_discretized = pd.read_csv(data_directory + "Discretized_HISTOGRAM_Numerical_Encoding_Train_HR_Employee_Attrition.csv", sep=",") 
df_discretized.shape

In [None]:
df_discretized = pd.read_csv(data_directory + "Discretized_HISTOGRAM_Numerical_Encoding_Test_HR_Employee_Attrition.csv", sep=",") 
df_discretized.shape