<b>Data mining project - 2020/21</b><br>
<b>Author</b>: [Alexandra Bradan](https://github.com/alexandrabradan)<br>
<b>Python version</b>: 3.x<br>
<b>Last update</b>: 07/01/2021

In [21]:
%matplotlib inline

# general libraries
import sys
import math
import operator
import itertools
import pydotplus
import collections
import missingno as msno
from pylab import MaxNLocator
from collections import Counter
from collections import OrderedDict

from collections import defaultdict
from IPython.display import Image

# pandas libraries
import pandas as pd
from pandas import DataFrame
from pandas.testing import assert_frame_equal

# visualisation libraries
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import qqplot

# numpy libraries
import numpy as np
from numpy import std
from numpy import mean
from numpy import arange
from numpy import unique
from numpy import argmax
from numpy import percentile

# scipy libraries
import scipy.stats as stats
from scipy.stats import kstest
from scipy.stats import normaltest

# sklearn libraries
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.experimental import enable_iterative_imputer  # explicitly require this experimental feature
from sklearn.impute import IterativeImputer

from sklearn import tree
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.pipeline import make_pipeline as imbmake_pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import RepeatedStratifiedKFold 
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, fbeta_score, recall_score, precision_score, classification_report, roc_auc_score 

In [22]:
from fim import apriori

In [23]:
# Folder holding the pre-processed CSV files, relative to this notebook.
data_directory = "../../../data/"

# Train (TR) and test (TS) files, in the versions with and without imputed values.
TR_impunted_file, TS_impunted_file, TR_not_impunted, TS_not_impunted = (
    data_directory + file_name
    for file_name in (
        "Impunted_Train_HR_Employee_Attrition.csv",
        "Impunted_Test_HR_Employee_Attrition.csv",
        "Not_Impunted_Train_HR_Employee_Attrition.csv",
        "Cleaned_Test_HR_Employee_Attrition.csv",
    )
)

<h6>Imputed TR DataFrame</h6>

In [24]:
# Load the training split stored in the "Impunted_Train" CSV file.
df_impunted = pd.read_csv(TR_impunted_file, sep=",") 
# Displayed as the cell output: (rows, columns) of the loaded frame.
df_impunted.shape

(883, 30)

In [25]:
# Features excluded from the analysis.
to_drop = ["MaritalStatus", "EducationField", "Department", "YearsSinceLastPromotion", "HourlyRate", "MonthlyRate"]

# Drop all of them in one call and rebind the name instead of deleting in place.
# errors="ignore" keeps the cell re-runnable: a second execution finds the columns
# already gone and is a no-op instead of raising KeyError.
df_impunted = df_impunted.drop(columns=to_drop, errors="ignore")

# check dropping output
print(df_impunted.shape)

(883, 24)


In [26]:
# Column dtypes and non-null counts of the training frame after the drop.
df_impunted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 883 entries, 0 to 882
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       883 non-null    int64  
 1   Attrition                 883 non-null    object 
 2   BusinessTravel            883 non-null    object 
 3   DistanceFromHome          883 non-null    int64  
 4   Education                 883 non-null    int64  
 5   EnvironmentSatisfaction   883 non-null    int64  
 6   Gender                    883 non-null    object 
 7   JobInvolvement            883 non-null    int64  
 8   JobLevel                  883 non-null    int64  
 9   JobRole                   883 non-null    object 
 10  JobSatisfaction           883 non-null    int64  
 11  MonthlyIncome             883 non-null    int64  
 12  NumCompaniesWorked        883 non-null    int64  
 13  OverTime                  883 non-null    object 
 14  PercentSal

<h6>Not-imputed TR DataFrame</h6>

In [27]:
# Load the training split stored in the "Not_Impunted_Train" CSV file.
df_not_impunted = pd.read_csv(TR_not_impunted, sep=",") 
# Displayed as the cell output: (rows, columns) of the loaded frame.
df_not_impunted.shape

(883, 30)

In [28]:
# Features excluded from the analysis (same list used for the imputed training frame).
to_drop = ["MaritalStatus", "EducationField", "Department", "YearsSinceLastPromotion", "HourlyRate", "MonthlyRate"]

# Drop all of them in one call and rebind the name instead of deleting in place.
# errors="ignore" keeps the cell re-runnable: a second execution finds the columns
# already gone and is a no-op instead of raising KeyError.
df_not_impunted = df_not_impunted.drop(columns=to_drop, errors="ignore")

# check dropping output
print(df_not_impunted.shape)

(883, 24)


<h6> Imputed TS DataFrame </h6>

In [29]:
# Load the test split stored in the "Impunted_Test" CSV file.
df_ts = pd.read_csv(TS_impunted_file, sep=",") 
# Displayed as the cell output: (rows, columns) of the loaded frame.
df_ts.shape

(219, 24)

In [30]:
# Column dtypes and non-null counts of the test frame.
df_ts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219 entries, 0 to 218
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       219 non-null    int64  
 1   Attrition                 219 non-null    object 
 2   BusinessTravel            219 non-null    object 
 3   DistanceFromHome          219 non-null    int64  
 4   Education                 219 non-null    int64  
 5   EnvironmentSatisfaction   219 non-null    int64  
 6   Gender                    219 non-null    object 
 7   JobInvolvement            219 non-null    int64  
 8   JobLevel                  219 non-null    int64  
 9   JobRole                   219 non-null    object 
 10  JobSatisfaction           219 non-null    int64  
 11  MonthlyIncome             219 non-null    int64  
 12  NumCompaniesWorked        219 non-null    int64  
 13  OverTime                  219 non-null    object 
 14  PercentSal

<h6> Not-imputed TS DataFrame </h6>

In [31]:
# Load the "Cleaned_Test" split and remove the two columns that are not analysed.
df_ts_not_impunted = pd.read_csv(TS_not_impunted, sep=",").drop(columns=["Department", "MonthlyRate"])

# Upper limits above which a test row is discarded.
# NOTE(review): presumably the outlier thresholds derived on the training set - confirm.
outlier_limits = (
    ("YearsAtCompany", 20),
    ("YearsInCurrentRole", 16),
    ("MonthlyHours", 590.9767441860465),
)

for feature_name, upper_limit in outlier_limits:
    # Rows whose value is missing do not satisfy "> limit" and are therefore kept.
    to_drop_indexes = df_ts_not_impunted.index[df_ts_not_impunted[feature_name] > upper_limit]
    df_ts_not_impunted = df_ts_not_impunted.drop(index=to_drop_indexes).reset_index(drop=True)
    print("dropped rows = ", len(to_drop_indexes), sep="\t")

df_ts_not_impunted.shape

dropped rows = 	8
dropped rows = 	2
dropped rows = 	7


(219, 24)

In [32]:
# Column dtypes and non-null counts: shows which test columns still contain missing values.
df_ts_not_impunted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219 entries, 0 to 218
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       191 non-null    float64
 1   Attrition                 219 non-null    object 
 2   BusinessTravel            202 non-null    object 
 3   DistanceFromHome          219 non-null    int64  
 4   Education                 219 non-null    int64  
 5   EnvironmentSatisfaction   219 non-null    int64  
 6   Gender                    208 non-null    object 
 7   JobInvolvement            219 non-null    int64  
 8   JobLevel                  219 non-null    int64  
 9   JobRole                   219 non-null    object 
 10  JobSatisfaction           219 non-null    int64  
 11  MonthlyIncome             168 non-null    float64
 12  NumCompaniesWorked        219 non-null    int64  
 13  OverTime                  219 non-null    object 
 14  PercentSal

<h2> Continuous variables discretisation with K-Means</h2>
K-Means => similar bins' size => IBM's Age and opportunity equity 

In [33]:
# Work on copies: df_impunted must keep its original values because
# print_performed_encoding reads them to report the range covered by each bin.
df_converted = df_impunted.copy()
df_ts_conv = df_ts.copy()

In [34]:
def print_performed_encoding(column_name, train_encoded):
    """Print the range of original values that was mapped to each bin code.

    The original values are read from the global ``df_impunted`` frame, row by
    row, in the same order as ``train_encoded``.

    Parameters
    ----------
    column_name : str
        Column of ``df_impunted`` that was discretised.
    train_encoded : sequence
        One bin code per row of ``df_impunted``; ``str(code)`` is used as the
        printed bin label.
    """
    column_position = df_impunted.columns.get_loc(column_name)

    # Group the original values by the textual form of their bin code,
    # keeping bins in order of first appearance.
    values_per_bin = {}
    for row_number, bin_code in enumerate(train_encoded):
        original_value = df_impunted.iloc[row_number, column_position]
        values_per_bin.setdefault(str(bin_code), []).append(original_value)

    # One tab-separated line per bin: column, bin label, [min-max] of its values.
    for bin_label, original_values in values_per_bin.items():
        print(column_name, bin_label, "[%s-%s]" % (min(original_values), max(original_values)), sep="\t")

In [35]:
cont_variables = ["Age", "DistanceFromHome", "MonthlyIncome", "NumCompaniesWorked", "PercentSalaryHike",
                 "YearsAtCompany", "YearsInCurrentRole", "TaxRate", "MonthlyHours", "OverallSatisfaction"]

for column_name in cont_variables:
    # Learn the 4 bins on the training column only...
    train_column = df_converted[[column_name]]
    discretizer = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='kmeans')
    discretizer.fit(train_column)

    # ...then apply the very same bins to both the training and the test column.
    train_encoded = discretizer.transform(train_column).astype(int)
    df_converted[column_name] = train_encoded
    df_ts_conv[column_name] = discretizer.transform(df_ts_conv[[column_name]]).astype(int)

    # Report which original value range ended up in each bin.
    print_performed_encoding(column_name, train_encoded)

Age	[3]	[48-60]
Age	[1]	[31-38]
Age	[0]	[18-30]
Age	[2]	[39-47]
DistanceFromHome	[1]	[6-13]
DistanceFromHome	[0]	[1-5]
DistanceFromHome	[2]	[14-21]
DistanceFromHome	[3]	[22-29]
MonthlyIncome	[1]	[7094-13888]
MonthlyIncome	[0]	[1009-6992]
MonthlyIncome	[2]	[14004-20520]
MonthlyIncome	[3]	[20933-26997]
NumCompaniesWorked	[3]	[7-9]
NumCompaniesWorked	[0]	[0-2]
NumCompaniesWorked	[1]	[3-4]
NumCompaniesWorked	[2]	[5-6]
PercentSalaryHike	[1]	[15-18]
PercentSalaryHike	[0]	[11-14]
PercentSalaryHike	[3]	[22-25]
PercentSalaryHike	[2]	[19-21]
YearsAtCompany	[1]	[6-10]
YearsAtCompany	[0]	[0-5]
YearsAtCompany	[2]	[11-14]
YearsAtCompany	[3]	[15-20]
YearsInCurrentRole	[1]	[5-8]
YearsInCurrentRole	[0]	[0-4]
YearsInCurrentRole	[2]	[9-12]
YearsInCurrentRole	[3]	[13-16]
TaxRate	[1]	[0.2033107599699022-0.4878233954330433]
TaxRate	[2]	[0.4902649218001915-0.7143783124261257]
TaxRate	[3]	[0.7168701095461659-0.9513959334891722]
TaxRate	[0]	[0.0-0.20014044943820225]
MonthlyHours	[1]	[151.64893617021278-264.325

In [36]:
# Turn every cell into a "<value>_<column>" string so that equal values coming from
# different columns become distinct items.
# NOTE: not idempotent - re-running this cell appends the suffix a second time.
column_names = list(df_converted.columns)
for frame in (df_converted, df_ts_conv):
    for column_name in column_names:
        frame[column_name] = frame[column_name].astype(str) + "_" + column_name

In [37]:
# transaction creation: every row becomes one transaction, i.e. the list of its
# "<value>_<column>" items
df_db = df_converted.values.tolist()
df_ts_db = df_ts_conv.values.tolist()

<h6> Missing values TR and TS discretisation </h6>

In [38]:
# Work on copies so the not-imputed frames keep their raw values; those are read
# again later to put the missing entries back after discretisation.
df_not_impunted_converted = df_not_impunted.copy()
df_ts_not_imp_conv = df_ts_not_impunted.copy()

In [39]:
cont_variables = ["Age", "DistanceFromHome", "MonthlyIncome", "NumCompaniesWorked", "PercentSalaryHike",
                 "YearsAtCompany", "YearsInCurrentRole", "TaxRate", "MonthlyHours", "OverallSatisfaction"]

for column_name in cont_variables:
    # Bins are learnt on the imputed training column, so the codes written into the
    # not-imputed frames come from the imputed values.
    imputed_train_column = df_impunted[[column_name]]
    discretizer = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='kmeans')
    discretizer.fit(imputed_train_column)

    train_encoded = discretizer.transform(imputed_train_column).astype(int)
    df_not_impunted_converted[column_name] = train_encoded
    # NOTE(review): assumes df_ts and df_ts_not_impunted hold the same rows in the
    # same order (the codes are assigned positionally) - confirm.
    df_ts_not_imp_conv[column_name] = discretizer.transform(df_ts[[column_name]]).astype(int)

In [40]:
# Turn every cell into a "<value>_<column>" string. The column list is taken from
# the frame being modified, so this cell no longer depends on `df_converted`
# (an unrelated frame built in an earlier section).
# NOTE: not idempotent - re-running this cell appends the suffix a second time.
for column_name in list(df_not_impunted_converted.columns):
    df_not_impunted_converted[column_name] = df_not_impunted_converted[column_name].astype(str) + "_" + column_name
    df_ts_not_imp_conv[column_name] = df_ts_not_imp_conv[column_name].astype(str) + "_" + column_name

In [41]:
missing_numeric_values_columns = ["Age", "YearsAtCompany", "MonthlyIncome", "TaxRate", "TrainingTimesLastYear"]

# Put the raw value back wherever the not-imputed frame has no valid number.
# Same criterion as the former row-by-row loop: every entry that is not ">= 0"
# (NaN in particular) is copied back unchanged. Here it is evaluated once per
# column with a boolean mask instead of once per cell.
for raw_frame, converted_frame in ((df_not_impunted, df_not_impunted_converted),
                                   (df_ts_not_impunted, df_ts_not_imp_conv)):
    for column_name in missing_numeric_values_columns:
        raw_values = raw_frame[column_name]
        not_valid = ~(raw_values >= 0)
        converted_frame.loc[not_valid, column_name] = raw_values[not_valid]

In [42]:
missing_categorical_values_columns = ["BusinessTravel", "Gender"]

# Labels accepted as valid for the two columns above.
known_labels = ["Male", "Female", "Travel_Rarely", "Travel_Frequently", "Non-Travel"]

# Put the raw value back wherever the not-imputed frame holds none of the known
# labels (NaN in particular). Same criterion as the former chain of "!=" tests,
# evaluated once per column with a boolean mask instead of once per cell.
for raw_frame, converted_frame in ((df_not_impunted, df_not_impunted_converted),
                                   (df_ts_not_impunted, df_ts_not_imp_conv)):
    for column_name in missing_categorical_values_columns:
        raw_values = raw_frame[column_name]
        not_valid = ~raw_values.isin(known_labels)
        converted_frame.loc[not_valid, column_name] = raw_values[not_valid]

<h2>Generate "Yes_Attrition" frequent itemsets</h2>

Since employees who leave make up 153/883 * 100 = 17.33 % of our dataset, we will search for association rules (AR) having:
- support (%) $\in [1, 18]$
- confidence (%) $\in \{10, 20, 30, 40, 50, 60, 70, 80, 90\}$

and computing at the same time the lift.

Remember that given an association rule X -> Y, where X is a k-itemset, with k=2,...,n_features and Y is a 1-itemset ({"Yes_Attrition"} or {"No_Attrition"}):
- **support(X -> Y) = support_count(XuY) / 883**;
- **confidence(X -> Y) = support_count(XuY) / support_count(X)**;
- **lift(X -> Y) = confidence(X -> Y) / support(Y)**, where support(Y) = support_count(Y) / 883

In [43]:
# help(apriori)

In [44]:
def check_presence_in_most_specific_frequent_itemset(most_specific_frequent_itemset, itemset1):
    """Add ``itemset1`` to the collection unless a stored itemset already covers it.

    The collection keeps only the most specific itemsets:

    * if a stored itemset is equal to, or a superset of, the candidate, the
      collection is returned unchanged;
    * otherwise the stored itemsets that are subsets of the candidate are
      superseded: the first one is overwritten with the candidate and the others
      are removed, so the candidate is stored exactly once;
    * if nothing is superseded, the candidate is stored under a new key.

    Parameters
    ----------
    most_specific_frequent_itemset : dict
        Maps a string key to ``{"itemset", "support_count", "support"}``.
        It is modified in place.
    itemset1 : sequence
        ``itemset1[0]`` holds the items, ``itemset1[1]`` is stored as
        ``"support_count"`` and ``itemset1[2]`` as ``"support"``.

    Returns
    -------
    dict
        The same ``most_specific_frequent_itemset`` object.
    """
    candidate_items = set(itemset1[0])

    superseded_keys = []
    for key, value in most_specific_frequent_itemset.items():
        stored_items = set(value["itemset"])
        if candidate_items.issubset(stored_items):
            # An equal or more specific itemset is already stored: nothing to add.
            return most_specific_frequent_itemset
        if stored_items.issubset(candidate_items):
            superseded_keys.append(key)

    candidate_entry = {
        "itemset": itemset1[0],
        "support_count": itemset1[1],
        "support": itemset1[2],
    }

    if superseded_keys:
        # Reuse the slot of the first superseded entry and remove the others, so the
        # candidate is neither duplicated nor appended a second time.
        kept_key = superseded_keys[0]
        most_specific_frequent_itemset[kept_key].update(candidate_entry)
        for redundant_key in superseded_keys[1:]:
            del most_specific_frequent_itemset[redundant_key]
        return most_specific_frequent_itemset

    # New itemset: use the next free numeric key. Entries may have been removed
    # before, so len() alone could collide with an existing key.
    next_index = len(most_specific_frequent_itemset)
    while str(next_index) in most_specific_frequent_itemset:
        next_index += 1
    most_specific_frequent_itemset[str(next_index)] = candidate_entry
    return most_specific_frequent_itemset

In [45]:
# Minimum itemset sizes to try. The end of a range is exclusive, so the largest
# zmin is len(df_converted.columns) - 1.
zmin_range = range(2, len(df_converted.columns))  # k = 2,...,n_features 
support_range = range(1, 19)
most_specific_frequent_itemset = {}

for zmin in zmin_range:
    for supp in support_range:
        itemsets = apriori(df_db, supp=supp, zmin=zmin, target='m', report='as')
        if not itemsets:
            continue
        print('Number of itemsets for zmin=%s, supp=%s:' % (zmin, supp), len(itemsets))
        for itemset in itemsets:
            # Only itemsets that mention the leaving employees are collected.
            if "Yes_Attrition" not in itemset[0]:
                continue
            most_specific_frequent_itemset = check_presence_in_most_specific_frequent_itemset(
                most_specific_frequent_itemset, itemset
            )
    # Blank line between the reports of two consecutive zmin values.
    print()

Number of itemsets for zmin=2, supp=1: 634394
Number of itemsets for zmin=2, supp=2: 153725
Number of itemsets for zmin=2, supp=3: 60391
Number of itemsets for zmin=2, supp=4: 29982
Number of itemsets for zmin=2, supp=5: 17035
Number of itemsets for zmin=2, supp=6: 11130
Number of itemsets for zmin=2, supp=7: 7326
Number of itemsets for zmin=2, supp=8: 5049
Number of itemsets for zmin=2, supp=9: 3670
Number of itemsets for zmin=2, supp=10: 2664
Number of itemsets for zmin=2, supp=11: 2064
Number of itemsets for zmin=2, supp=12: 1613
Number of itemsets for zmin=2, supp=13: 1274
Number of itemsets for zmin=2, supp=14: 990
Number of itemsets for zmin=2, supp=15: 827
Number of itemsets for zmin=2, supp=16: 667
Number of itemsets for zmin=2, supp=17: 552
Number of itemsets for zmin=2, supp=18: 451

Number of itemsets for zmin=3, supp=1: 634324
Number of itemsets for zmin=3, supp=2: 153572
Number of itemsets for zmin=3, supp=3: 60224
Number of itemsets for zmin=3, supp=4: 29782
Number of ite

In [46]:
# Number of "Yes_Attrition" itemsets collected by the mining loop.
print(len(most_specific_frequent_itemset))

10295


<h6> Checking if dict has duplicates </h6>

In [47]:
# Rebuild the collection keeping every itemset exactly once and only if no other
# stored itemset strictly contains it. The previous check kept an entry only when
# it was a subset of exactly one entry (itself), so two identical itemsets were
# both discarded instead of one of them being kept.
stored_entries = list(most_specific_frequent_itemset.values())
# Build each item set once instead of once per pair of entries.
stored_item_sets = [frozenset(entry["itemset"]) for entry in stored_entries]

new_most_specific_frequent_itemset = {}
already_kept = set()
index = 0
for entry, entry_items in zip(stored_entries, stored_item_sets):
    if entry_items in already_kept:
        # Exact duplicate of an itemset that has already been kept.
        continue
    if any(entry_items < other_items for other_items in stored_item_sets):
        # Strictly contained in a more specific itemset.
        continue
    already_kept.add(entry_items)
    new_most_specific_frequent_itemset[str(index)] = {
        "itemset": entry["itemset"],
        "support_count": entry["support_count"],
        "support": entry["support"],
    }
    index += 1

In [48]:
# Should equal the size printed before the check if no itemset was removed.
len(new_most_specific_frequent_itemset)

10295

In [49]:
# From here on, work with the checked collection (keys renumbered from "0").
most_specific_frequent_itemset = new_most_specific_frequent_itemset

In [50]:
# Size of the collection that is saved and analysed below.
len(most_specific_frequent_itemset)

10295

<h6> Save dict on file </h6>

In [51]:
import pickle

# Persist the collected itemsets so later sessions can reuse them without
# repeating the mining loop.
with open('kmeans_maximal_yes_attrition_super_itemsets.pickle', 'wb') as pickle_file:
    pickle.dump(most_specific_frequent_itemset, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

<h6> Extract info about dict </h6>

In [52]:
# support_count: distinct support counts found among the stored itemsets
support_count_set = {
    entry["support_count"] for entry in most_specific_frequent_itemset.values()
}
print(support_count_set)

{9, 10, 11, 12, 13}


In [53]:
# support_count's # of itemsets: how many stored itemsets share each support count
support_info = {str(count): 0 for count in support_count_set}
for entry in most_specific_frequent_itemset.values():
    support_info[str(entry["support_count"])] += 1

print(support_info)

{'9': 7431, '10': 2281, '11': 505, '12': 72, '13': 6}


In [54]:
# support_count's zmin: for each support count, how many stored itemsets have each size
support_zmin_info = {str(count): {} for count in support_count_set}
for entry in most_specific_frequent_itemset.values():
    size_counts = support_zmin_info[str(entry["support_count"])]
    size_label = str(len(entry["itemset"]))
    size_counts[size_label] = size_counts.get(size_label, 0) + 1

print(support_zmin_info)

{'9': {'4': 816, '5': 2239, '3': 96, '6': 2593, '7': 1350, '2': 4, '8': 298, '9': 35}, '10': {'5': 644, '3': 31, '7': 426, '6': 812, '4': 263, '8': 97, '9': 8}, '11': {'4': 55, '5': 130, '6': 189, '3': 5, '7': 102, '8': 20, '9': 4}, '12': {'5': 15, '7': 19, '6': 32, '4': 3, '9': 1, '8': 2}, '13': {'7': 1, '6': 5}}
