<b>Data mining project - 2020/21</b><br>
<b>Author</b>: [Alexandra Bradan](https://github.com/alexandrabradan)<br>
<b>Python version</b>: 3.x<br>
<b>Last update</b>: 07/01/2021

In [198]:
%matplotlib inline

# general libraries
import sys
import math
import operator
import itertools
import pydotplus
import collections
import missingno as msno
from pylab import MaxNLocator
from collections import Counter
from collections import OrderedDict

from collections import defaultdict
from IPython.display import Image

# pandas libraries
import pandas as pd
from pandas import DataFrame
from pandas.testing import assert_frame_equal

# visualisation libraries
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import qqplot

# numpy libraries
import numpy as np
from numpy import std
from numpy import mean
from numpy import arange
from numpy import unique
from numpy import argmax
from numpy import percentile

# scipy libraries
import scipy.stats as stats
from scipy.stats import kstest
from scipy.stats import normaltest

# sklearn libraries
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.experimental import enable_iterative_imputer  # explicitly require this experimental feature
from sklearn.impute import IterativeImputer

from sklearn import tree
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.pipeline import make_pipeline as imbmake_pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import RepeatedStratifiedKFold 
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, fbeta_score, recall_score, precision_score, classification_report, roc_auc_score 

In [199]:
from fim import apriori

In [200]:
# Folder holding the HR attrition CSV files, relative to this notebook
data_directory = "../../../data/"

# Training (TR) and test (TS) files, with and without imputed values
TR_impunted_file = f"{data_directory}Impunted_Train_HR_Employee_Attrition.csv"
TS_impunted_file = f"{data_directory}Impunted_Test_HR_Employee_Attrition.csv"
TR_not_impunted = f"{data_directory}Not_Impunted_Train_HR_Employee_Attrition.csv"
TS_not_impunted = f"{data_directory}Cleaned_Test_HR_Employee_Attrition.csv"

<h6>Imputed TR DataFrame</h6>

In [201]:
# Load the imputed training split (comma is read_csv's default separator)
df_impunted = pd.read_csv(TR_impunted_file)
# display (rows, columns)
df_impunted.shape

(883, 30)

In [202]:
# features removed from the imputed training frame
to_drop = ["MaritalStatus", "EducationField", "Department", "YearsSinceLastPromotion", "HourlyRate", "MonthlyRate"]

# drop all of them with a single in-place call
df_impunted.drop(columns=to_drop, inplace=True)

# check dropping output
print(df_impunted.shape)

(883, 24)


In [203]:
# Show column names, non-null counts and dtypes of the reduced imputed training frame
df_impunted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 883 entries, 0 to 882
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       883 non-null    int64  
 1   Attrition                 883 non-null    object 
 2   BusinessTravel            883 non-null    object 
 3   DistanceFromHome          883 non-null    int64  
 4   Education                 883 non-null    int64  
 5   EnvironmentSatisfaction   883 non-null    int64  
 6   Gender                    883 non-null    object 
 7   JobInvolvement            883 non-null    int64  
 8   JobLevel                  883 non-null    int64  
 9   JobRole                   883 non-null    object 
 10  JobSatisfaction           883 non-null    int64  
 11  MonthlyIncome             883 non-null    int64  
 12  NumCompaniesWorked        883 non-null    int64  
 13  OverTime                  883 non-null    object 
 14  PercentSal

<h6>Not-imputed TR DataFrame</h6>

In [204]:
# Load the training split that still contains missing values
# (comma is read_csv's default separator)
df_not_impunted = pd.read_csv(TR_not_impunted)
# display (rows, columns)
df_not_impunted.shape

(883, 30)

In [205]:
# features removed from the not-imputed training frame
to_drop = ["MaritalStatus", "EducationField", "Department", "YearsSinceLastPromotion", "HourlyRate", "MonthlyRate"]

# drop all of them with a single in-place call
df_not_impunted.drop(columns=to_drop, inplace=True)

# check dropping output
print(df_not_impunted.shape)

(883, 24)


<h6> Imputed TS DataFrame </h6>

In [206]:
# Load the imputed test split (comma is read_csv's default separator)
df_ts = pd.read_csv(TS_impunted_file)
# display (rows, columns)
df_ts.shape

(219, 24)

In [207]:
# Show column names, non-null counts and dtypes of the imputed test frame
df_ts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219 entries, 0 to 218
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       219 non-null    int64  
 1   Attrition                 219 non-null    object 
 2   BusinessTravel            219 non-null    object 
 3   DistanceFromHome          219 non-null    int64  
 4   Education                 219 non-null    int64  
 5   EnvironmentSatisfaction   219 non-null    int64  
 6   Gender                    219 non-null    object 
 7   JobInvolvement            219 non-null    int64  
 8   JobLevel                  219 non-null    int64  
 9   JobRole                   219 non-null    object 
 10  JobSatisfaction           219 non-null    int64  
 11  MonthlyIncome             219 non-null    int64  
 12  NumCompaniesWorked        219 non-null    int64  
 13  OverTime                  219 non-null    object 
 14  PercentSal

<h6> Not-imputed TS DataFrame </h6>

In [208]:
# Load the test split that still contains missing values
df_ts_not_impunted = pd.read_csv(TS_not_impunted, sep=",")

# columns not used in the analysis
for unused_column in ("Department", "MonthlyRate"):
    del df_ts_not_impunted[unused_column]

# (column, upper bound) pairs: rows strictly above the bound are removed,
# one column after the other, re-numbering the rows after every removal
outlier_upper_bounds = (
    ("YearsAtCompany", 20),
    ("YearsInCurrentRole", 16),
    ("MonthlyHours", 590.9767441860465),
)
for bounded_column, upper_bound in outlier_upper_bounds:
    to_drop_indexes = df_ts_not_impunted.index[df_ts_not_impunted[bounded_column] > upper_bound]
    df_ts_not_impunted.drop(list(to_drop_indexes), axis=0, inplace=True)
    df_ts_not_impunted.reset_index(drop=True, inplace=True)
    print("dropped rows = ", len(to_drop_indexes), sep="\t")

# display (rows, columns) after the cleaning
df_ts_not_impunted.shape

dropped rows = 	8
dropped rows = 	2
dropped rows = 	7


(219, 24)

In [209]:
# Show column names, non-null counts and dtypes of the not-imputed test frame
# (non-null counts below the row count reveal the missing values)
df_ts_not_impunted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219 entries, 0 to 218
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       191 non-null    float64
 1   Attrition                 219 non-null    object 
 2   BusinessTravel            202 non-null    object 
 3   DistanceFromHome          219 non-null    int64  
 4   Education                 219 non-null    int64  
 5   EnvironmentSatisfaction   219 non-null    int64  
 6   Gender                    208 non-null    object 
 7   JobInvolvement            219 non-null    int64  
 8   JobLevel                  219 non-null    int64  
 9   JobRole                   219 non-null    object 
 10  JobSatisfaction           219 non-null    int64  
 11  MonthlyIncome             168 non-null    float64
 12  NumCompaniesWorked        219 non-null    int64  
 13  OverTime                  219 non-null    object 
 14  PercentSal

<h2> Continuous variables discretisation with K-Means</h2>
K-Means => similar bins' size => IBM's Age and opportunity equity 

In [210]:
# Independent (deep) working copies: the discretisation below overwrites
# columns and must leave the loaded frames untouched
df_converted = df_impunted.copy(deep=True)
df_ts_conv = df_ts.copy(deep=True)

In [211]:
def print_performed_encoding(column_name, train_encoded, df=None):
    """Print the range of original values that fell into each bin code.

    For every distinct code found in ``train_encoded`` one tab-separated line
    ``<column_name>  <code>  [<min>-<max>]`` is printed, where min/max are
    taken over the original values of the rows carrying that code. Codes are
    reported in order of first appearance.

    Parameters
    ----------
    column_name : str
        Column of ``df`` holding the original (not encoded) values.
    train_encoded : iterable
        Code assigned to each row, positionally aligned with ``df``. Every
        element is converted to its label with ``str``.
    df : pandas.DataFrame, optional
        Frame holding the original values. When omitted, the notebook-level
        ``df_impunted`` frame is used.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``df``.
    IndexError
        If ``train_encoded`` has more elements than ``df`` has rows.
    """
    if df is None:
        # backward-compatible default: the imputed training frame of the notebook
        df = df_impunted

    # extract the column once instead of one iloc lookup per row
    original_values = df.iloc[:, df.columns.get_loc(column_name)].to_numpy()

    # code label -> original values of the rows encoded with that code
    encoding_info = {}
    for row, enc in enumerate(train_encoded):
        encoding_info.setdefault(str(enc), []).append(original_values[row])

    for key, values in encoding_info.items():
        print(column_name, key, "[%s-%s]" % (min(values), max(values)), sep="\t")

In [212]:
# columns to be replaced by 4 ordinal K-Means bins
cont_variables = [
    "Age", "DistanceFromHome", "MonthlyIncome", "NumCompaniesWorked", "PercentSalaryHike",
    "YearsAtCompany", "YearsInCurrentRole", "TaxRate", "MonthlyHours", "OverallSatisfaction",
]

for column_name in cont_variables:
    # learn the bins on the training column only
    train_column = df_converted[[column_name]]
    discretizer = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='kmeans')
    discretizer.fit(train_column)

    # apply the same bins to the training and to the test column
    train_encoded = discretizer.transform(train_column).astype(int)
    test_encoded = discretizer.transform(df_ts_conv[[column_name]]).astype(int)
    df_converted[column_name] = train_encoded
    df_ts_conv[column_name] = test_encoded

    # report which original value range ended up in each bin
    print_performed_encoding(column_name, train_encoded)

Age	[3]	[48-60]
Age	[1]	[31-38]
Age	[0]	[18-30]
Age	[2]	[39-47]
DistanceFromHome	[1]	[6-13]
DistanceFromHome	[0]	[1-5]
DistanceFromHome	[2]	[14-21]
DistanceFromHome	[3]	[22-29]
MonthlyIncome	[1]	[7094-13888]
MonthlyIncome	[0]	[1009-6992]
MonthlyIncome	[2]	[14004-20520]
MonthlyIncome	[3]	[20933-26997]
NumCompaniesWorked	[3]	[7-9]
NumCompaniesWorked	[0]	[0-2]
NumCompaniesWorked	[1]	[3-4]
NumCompaniesWorked	[2]	[5-6]
PercentSalaryHike	[1]	[15-18]
PercentSalaryHike	[0]	[11-14]
PercentSalaryHike	[3]	[22-25]
PercentSalaryHike	[2]	[19-21]
YearsAtCompany	[1]	[6-10]
YearsAtCompany	[0]	[0-5]
YearsAtCompany	[2]	[11-14]
YearsAtCompany	[3]	[15-20]
YearsInCurrentRole	[1]	[5-8]
YearsInCurrentRole	[0]	[0-4]
YearsInCurrentRole	[2]	[9-12]
YearsInCurrentRole	[3]	[13-16]
TaxRate	[1]	[0.2033107599699022-0.4878233954330433]
TaxRate	[2]	[0.4902649218001915-0.7143783124261257]
TaxRate	[3]	[0.7168701095461659-0.9513959334891722]
TaxRate	[0]	[0.0-0.20014044943820225]
MonthlyHours	[1]	[151.64893617021278-264.325

In [213]:
# Turn every value into an item "<value>_<column name>" (e.g. "Yes_Attrition"),
# in both the training and the test frame
for column_name in df_converted.columns:
    item_suffix = "_" + column_name
    df_converted[column_name] = df_converted[column_name].astype(str) + item_suffix
    df_ts_conv[column_name] = df_ts_conv[column_name].astype(str) + item_suffix

In [214]:
# transaction creation 
df_db = df_converted.values.tolist()
df_ts_db = df_ts_conv.values.tolist()

<h6> Missing values TR and TS discretisation </h6>

In [215]:
# Independent (deep) working copies of the frames that still contain missing values
df_not_impunted_converted = df_not_impunted.copy(deep=True)
df_ts_not_imp_conv = df_ts_not_impunted.copy(deep=True)

In [216]:
# columns to be replaced by 4 ordinal K-Means bins
cont_variables = [
    "Age", "DistanceFromHome", "MonthlyIncome", "NumCompaniesWorked", "PercentSalaryHike",
    "YearsAtCompany", "YearsInCurrentRole", "TaxRate", "MonthlyHours", "OverallSatisfaction",
]

for column_name in cont_variables:
    # bins are learned and computed on the imputed frames ...
    discretizer = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='kmeans')
    discretizer.fit(df_impunted[[column_name]])
    train_encoded = discretizer.transform(df_impunted[[column_name]]).astype(int)
    test_encoded = discretizer.transform(df_ts[[column_name]]).astype(int)

    # ... and the resulting codes are written into the not-imputed copies
    df_not_impunted_converted[column_name] = train_encoded
    df_ts_not_imp_conv[column_name] = test_encoded
    # print_performed_encoding(column_name, train_encoded)

In [217]:
# Turn every value of the not-imputed copies into an item "<value>_<column name>";
# the column names are taken from df_converted
for column_name in df_converted.columns:
    item_suffix = "_" + column_name
    df_not_impunted_converted[column_name] = df_not_impunted_converted[column_name].astype(str) + item_suffix
    df_ts_not_imp_conv[column_name] = df_ts_not_imp_conv[column_name].astype(str) + item_suffix

In [218]:
# Numeric columns whose missing entries must show up as missing in the
# converted frames as well
missing_numeric_values_columns = ["Age", "YearsAtCompany", "MonthlyIncome", "TaxRate", "TrainingTimesLastYear"]

# (raw frame, converted frame) pairs: training split first, then test split.
# Each pair shares the same row index, since the converted frame started as a
# copy of the raw one.
for raw_frame, converted_frame in ((df_not_impunted, df_not_impunted_converted),
                                   (df_ts_not_impunted, df_ts_not_imp_conv)):
    for column_name in missing_numeric_values_columns:
        # NaN fails every comparison, so "not (value >= 0)" selects the missing
        # entries (same rule as the former row-by-row check)
        is_missing = ~(raw_frame[column_name] >= 0)
        # copy the raw (missing) value over the item built for that cell
        converted_frame.loc[is_missing, column_name] = raw_frame.loc[is_missing, column_name]

In [219]:
# Categorical columns whose missing entries must show up as missing in the
# converted frames as well
missing_categorical_values_columns = ["BusinessTravel", "Gender"]

# every legitimate label of the two columns above; anything else (NaN
# included) is treated as missing
known_categorical_labels = ["Male", "Female", "Travel_Rarely", "Travel_Frequently", "Non-Travel"]

# (raw frame, converted frame) pairs: training split first, then test split.
# Each pair shares the same row index, since the converted frame started as a
# copy of the raw one.
for raw_frame, converted_frame in ((df_not_impunted, df_not_impunted_converted),
                                   (df_ts_not_impunted, df_ts_not_imp_conv)):
    for column_name in missing_categorical_values_columns:
        is_missing = ~raw_frame[column_name].isin(known_categorical_labels)
        # copy the raw (missing) value over the item built for that cell
        converted_frame.loc[is_missing, column_name] = raw_frame.loc[is_missing, column_name]

<h1> Comparison between frequent, maximal and closed itemsets </h1>

- an itemset is **frequent** if its support is greater than or equal to the minimum value defined for
this measure;
- a frequent itemset is **maximal** if none of its immediate supersets is frequent;
- a frequent itemset is **closed** if none of its immediate supersets has the same support as the itemset.


Remember also that given an association rule X -> Y, where X is a k-itemset, with k=2,...,n_features and Y is a 1-itemset ({"Yes_Attrition"} or {"No_Attrition"}):
- **support(XuY) = support_count(XuY) / 883**;
- **confidence(XuY) = support_count(XuY) / support_count(X)**;
- **lift(XuY) = confidence(XuY) / support(Y)**, with support(Y) = support_count(Y) / 883

Additionally, since in our dataset employees leaving are 153/883 * 100 = 17.33 % we will search Yes_Attrition ARs having:

- support \in [1, 18]
- confidence \in [10, 20, 30, 40, 50, 60, 70, 80, 90]
and computing at the same time the lift.

In [220]:
# help(apriori)

In [221]:
# minimum itemset sizes to explore: k = 2,...,n_features
zmin_range = range(2, df_converted.shape[1])
# minimum support thresholds to explore: 1,...,18
support_range = range(1, 19)

Saving frequent, closed and maximal info in a global dict having the following format:
global_itemsets_info = {'z_min': {'min_supp': {'itemset_type': len(itemsets)}}}

In [222]:
# zmin -> min support -> itemset type ('a', 'm', 'c') -> number of itemsets,
# every counter starting at 0 (all keys are strings)
global_itemsets_info = {
    str(zmin): {
        str(supp): {"a": 0, "m": 0, "c": 0}
        for supp in support_range
    }
    for zmin in zmin_range
}

In [223]:
# itemset types mined below ('a', 'm', 'c')
itemset_types = ['a', 'm', 'c']

for itemset_type in itemset_types:
    for zmin in zmin_range:
        for supp in support_range:
            itemsets = apriori(df_db, supp=supp, zmin=zmin, target=itemset_type, report='as')
            # keep only the itemsets containing the "Yes_Attrition" item
            # (the first element of every reported entry is tested for it)
            yes_itemsets = [itemset for itemset in itemsets if "Yes_Attrition" in itemset[0]]
            print('Number of %s itemsets for zmin=%s, supp=%s:' % (itemset_type, zmin, supp), len(yes_itemsets))
            # update global dict
            global_itemsets_info[str(zmin)][str(supp)][str(itemset_type)] = len(yes_itemsets)
        # blank line between two zmin values
        print()

Number of a itemsets for zmin=2, supp=1: 39220
Number of a itemsets for zmin=2, supp=2: 4268
Number of a itemsets for zmin=2, supp=3: 1103
Number of a itemsets for zmin=2, supp=4: 392
Number of a itemsets for zmin=2, supp=5: 171
Number of a itemsets for zmin=2, supp=6: 88
Number of a itemsets for zmin=2, supp=7: 46
Number of a itemsets for zmin=2, supp=8: 27
Number of a itemsets for zmin=2, supp=9: 16
Number of a itemsets for zmin=2, supp=10: 8
Number of a itemsets for zmin=2, supp=11: 7
Number of a itemsets for zmin=2, supp=12: 2
Number of a itemsets for zmin=2, supp=13: 1
Number of a itemsets for zmin=2, supp=14: 1
Number of a itemsets for zmin=2, supp=15: 0
Number of a itemsets for zmin=2, supp=16: 0
Number of a itemsets for zmin=2, supp=17: 0
Number of a itemsets for zmin=2, supp=18: 0

Number of a itemsets for zmin=3, supp=1: 39138
Number of a itemsets for zmin=3, supp=2: 4200
Number of a itemsets for zmin=3, supp=3: 1049
Number of a itemsets for zmin=3, supp=4: 352
Number of a it

Number of a itemsets for zmin=12, supp=17: 0
Number of a itemsets for zmin=12, supp=18: 0

Number of a itemsets for zmin=13, supp=1: 0
Number of a itemsets for zmin=13, supp=2: 0
Number of a itemsets for zmin=13, supp=3: 0
Number of a itemsets for zmin=13, supp=4: 0
Number of a itemsets for zmin=13, supp=5: 0
Number of a itemsets for zmin=13, supp=6: 0
Number of a itemsets for zmin=13, supp=7: 0
Number of a itemsets for zmin=13, supp=8: 0
Number of a itemsets for zmin=13, supp=9: 0
Number of a itemsets for zmin=13, supp=10: 0
Number of a itemsets for zmin=13, supp=11: 0
Number of a itemsets for zmin=13, supp=12: 0
Number of a itemsets for zmin=13, supp=13: 0
Number of a itemsets for zmin=13, supp=14: 0
Number of a itemsets for zmin=13, supp=15: 0
Number of a itemsets for zmin=13, supp=16: 0
Number of a itemsets for zmin=13, supp=17: 0
Number of a itemsets for zmin=13, supp=18: 0

Number of a itemsets for zmin=14, supp=1: 0
Number of a itemsets for zmin=14, supp=2: 0
Number of a itemset

Number of m itemsets for zmin=2, supp=1: 10295
Number of m itemsets for zmin=2, supp=2: 1568
Number of m itemsets for zmin=2, supp=3: 466
Number of m itemsets for zmin=2, supp=4: 182
Number of m itemsets for zmin=2, supp=5: 82
Number of m itemsets for zmin=2, supp=6: 46
Number of m itemsets for zmin=2, supp=7: 27
Number of m itemsets for zmin=2, supp=8: 15
Number of m itemsets for zmin=2, supp=9: 10
Number of m itemsets for zmin=2, supp=10: 5
Number of m itemsets for zmin=2, supp=11: 5
Number of m itemsets for zmin=2, supp=12: 2
Number of m itemsets for zmin=2, supp=13: 1
Number of m itemsets for zmin=2, supp=14: 1
Number of m itemsets for zmin=2, supp=15: 0
Number of m itemsets for zmin=2, supp=16: 0
Number of m itemsets for zmin=2, supp=17: 0
Number of m itemsets for zmin=2, supp=18: 0

Number of m itemsets for zmin=3, supp=1: 10291
Number of m itemsets for zmin=3, supp=2: 1562
Number of m itemsets for zmin=3, supp=3: 458
Number of m itemsets for zmin=3, supp=4: 175
Number of m items

Number of m itemsets for zmin=13, supp=1: 0
Number of m itemsets for zmin=13, supp=2: 0
Number of m itemsets for zmin=13, supp=3: 0
Number of m itemsets for zmin=13, supp=4: 0
Number of m itemsets for zmin=13, supp=5: 0
Number of m itemsets for zmin=13, supp=6: 0
Number of m itemsets for zmin=13, supp=7: 0
Number of m itemsets for zmin=13, supp=8: 0
Number of m itemsets for zmin=13, supp=9: 0
Number of m itemsets for zmin=13, supp=10: 0
Number of m itemsets for zmin=13, supp=11: 0
Number of m itemsets for zmin=13, supp=12: 0
Number of m itemsets for zmin=13, supp=13: 0
Number of m itemsets for zmin=13, supp=14: 0
Number of m itemsets for zmin=13, supp=15: 0
Number of m itemsets for zmin=13, supp=16: 0
Number of m itemsets for zmin=13, supp=17: 0
Number of m itemsets for zmin=13, supp=18: 0

Number of m itemsets for zmin=14, supp=1: 0
Number of m itemsets for zmin=14, supp=2: 0
Number of m itemsets for zmin=14, supp=3: 0
Number of m itemsets for zmin=14, supp=4: 0
Number of m itemsets f

Number of c itemsets for zmin=2, supp=1: 30084
Number of c itemsets for zmin=2, supp=2: 4000
Number of c itemsets for zmin=2, supp=3: 1078
Number of c itemsets for zmin=2, supp=4: 389
Number of c itemsets for zmin=2, supp=5: 170
Number of c itemsets for zmin=2, supp=6: 88
Number of c itemsets for zmin=2, supp=7: 46
Number of c itemsets for zmin=2, supp=8: 27
Number of c itemsets for zmin=2, supp=9: 16
Number of c itemsets for zmin=2, supp=10: 8
Number of c itemsets for zmin=2, supp=11: 7
Number of c itemsets for zmin=2, supp=12: 2
Number of c itemsets for zmin=2, supp=13: 1
Number of c itemsets for zmin=2, supp=14: 1
Number of c itemsets for zmin=2, supp=15: 0
Number of c itemsets for zmin=2, supp=16: 0
Number of c itemsets for zmin=2, supp=17: 0
Number of c itemsets for zmin=2, supp=18: 0

Number of c itemsets for zmin=3, supp=1: 30009
Number of c itemsets for zmin=3, supp=2: 3935
Number of c itemsets for zmin=3, supp=3: 1025
Number of c itemsets for zmin=3, supp=4: 350
Number of c it

Number of c itemsets for zmin=13, supp=1: 0
Number of c itemsets for zmin=13, supp=2: 0
Number of c itemsets for zmin=13, supp=3: 0
Number of c itemsets for zmin=13, supp=4: 0
Number of c itemsets for zmin=13, supp=5: 0
Number of c itemsets for zmin=13, supp=6: 0
Number of c itemsets for zmin=13, supp=7: 0
Number of c itemsets for zmin=13, supp=8: 0
Number of c itemsets for zmin=13, supp=9: 0
Number of c itemsets for zmin=13, supp=10: 0
Number of c itemsets for zmin=13, supp=11: 0
Number of c itemsets for zmin=13, supp=12: 0
Number of c itemsets for zmin=13, supp=13: 0
Number of c itemsets for zmin=13, supp=14: 0
Number of c itemsets for zmin=13, supp=15: 0
Number of c itemsets for zmin=13, supp=16: 0
Number of c itemsets for zmin=13, supp=17: 0
Number of c itemsets for zmin=13, supp=18: 0

Number of c itemsets for zmin=14, supp=1: 0
Number of c itemsets for zmin=14, supp=2: 0
Number of c itemsets for zmin=14, supp=3: 0
Number of c itemsets for zmin=14, supp=4: 0
Number of c itemsets f

<h2> Print itemsets with highest support</h2>

In [226]:
import pickle

# pickle file read for each itemset type
super_itemsets_files = {
    'a': "kmeans_frequent_yes_attrition_super_itemsets.pickle",
    'm': "kmeans_maximal_yes_attrition_super_itemsets.pickle",
    'c': "kmeans_closed_yes_attrition_super_itemsets.pickle",
}

for itemset_type in itemset_types:
    # read supersets
    super_itemsets_file = super_itemsets_files.get(itemset_type, "")
    if not super_itemsets_file:
        print("Error itemset_type %s" % itemset_type)
        sys.exit(-1)

    # NOTE(review): pickle.load can execute arbitrary code - only load files
    # produced by this project
    with open(super_itemsets_file, 'rb') as handle:
        super_itemsets = pickle.load(handle)

    associated_max_support_keys = []
    # superset key (as string) -> support
    tmp_support_dict = {str(key): value["support"] for key, value in super_itemsets.items()}

    # ordering dict in descending values (support)
    support_dict = OrderedDict(sorted(tmp_support_dict.items(), key=lambda pair: pair[1], reverse=True))
    print_max = 0
    print("%s SUPERSETS WITH GREATEST SUPPORT:" % itemset_type)
    # print at most the 700 supersets with the highest support
    for key, value in itertools.islice(support_dict.items(), 700):
        print_max += 1
        print("i=%s" % print_max, "support=%s" % value, super_itemsets[str(key)]["itemset"])
        print()

    print()

a SUPERSETS WITH GREATEST SUPPORT:
i=1 support=0.014722536806342015 ('Yes_Attrition', 'Laboratory Technician_JobRole', '1_JobLevel', '0_YearsAtCompany', '0_YearsInCurrentRole', 'Travel_Rarely_BusinessTravel')

i=2 support=0.014722536806342015 ('Yes_Attrition', '2_RelationshipSatisfaction', '1_JobLevel', '0_StockOptionLevel', 'Travel_Rarely_BusinessTravel', '0_YearsInCurrentRole')

i=3 support=0.014722536806342015 ('Yes_Attrition', '2_Age', 'Yes_OverTime', '3_JobInvolvement', '0_YearsInCurrentRole', 'Travel_Rarely_BusinessTravel')

i=4 support=0.014722536806342015 ('Yes_Attrition', 'Yes_OverTime', '4_RelationshipSatisfaction', '1_JobLevel', '0_YearsInCurrentRole', 'Travel_Rarely_BusinessTravel')

i=5 support=0.014722536806342015 ('Yes_Attrition', '1_JobLevel', '0_DistanceFromHome', '0_YearsAtCompany', '0_YearsInCurrentRole', 'Travel_Rarely_BusinessTravel')

i=6 support=0.013590033975084938 ('Yes_Attrition', '1_RelationshipSatisfaction', '1_OverallSatisfaction', 'Male_Gender', '0_YearsIn


i=522 support=0.01245753114382786 ('Yes_Attrition', '0_YearsAtCompany', '0_NumCompaniesWorked', '3_WorkLifeBalance', 'No_OverTime')

i=523 support=0.01245753114382786 ('Yes_Attrition', '0_YearsAtCompany', '0_NumCompaniesWorked', 'Male_Gender', '0_MonthlyIncome', 'No_OverTime')

i=524 support=0.01245753114382786 ('Yes_Attrition', '0_YearsAtCompany', '3_JobInvolvement', '0_YearsInCurrentRole', 'No_OverTime', 'Travel_Rarely_BusinessTravel')

i=525 support=0.01245753114382786 ('Yes_Attrition', '0_YearsAtCompany', '3_WorkLifeBalance', 'Male_Gender', '0_YearsInCurrentRole', '0_MonthlyIncome')

i=526 support=0.01245753114382786 ('Yes_Attrition', '0_YearsAtCompany', '3_WorkLifeBalance', '0_MonthlyIncome', 'No_OverTime')

i=527 support=0.01245753114382786 ('Yes_Attrition', '0_YearsAtCompany', '3_WorkLifeBalance', 'No_OverTime', 'Travel_Rarely_BusinessTravel')

i=528 support=0.01245753114382786 ('Yes_Attrition', '0_PercentSalaryHike', '3_WorkLifeBalance', 'Male_Gender', 'No_OverTime', 'Travel_R

i=468 support=0.01245753114382786 ('Yes_Attrition', '1_MonthlyHours', '0_YearsAtCompany', 'Male_Gender', 'No_OverTime', 'Travel_Rarely_BusinessTravel')

i=469 support=0.01245753114382786 ('Yes_Attrition', '1_MonthlyHours', '0_PercentSalaryHike', '0_NumCompaniesWorked', '3_WorkLifeBalance')

i=470 support=0.01245753114382786 ('Yes_Attrition', '1_MonthlyHours', '0_NumCompaniesWorked', '3_WorkLifeBalance', 'Male_Gender', 'No_OverTime')

i=471 support=0.01245753114382786 ('Yes_Attrition', '1_MonthlyHours', '3_WorkLifeBalance', 'Male_Gender', 'No_OverTime', 'Travel_Rarely_BusinessTravel')

i=472 support=0.01245753114382786 ('Yes_Attrition', 'Female_Gender', '3_Education', '1_JobLevel', '2_OverallSatisfaction', '0_YearsInCurrentRole', 'Travel_Rarely_BusinessTravel')

i=473 support=0.01245753114382786 ('Yes_Attrition', 'Female_Gender', '3_Education', '1_JobLevel', '3_WorkLifeBalance', '0_YearsInCurrentRole', 'Travel_Rarely_BusinessTravel')

i=474 support=0.01245753114382786 ('Yes_Attrition', 


i=625 support=0.011325028312570781 ('Yes_Attrition', '1_RelationshipSatisfaction', '2_TrainingTimesLastYear', '0_PercentSalaryHike', 'Male_Gender')

i=626 support=0.011325028312570781 ('Yes_Attrition', '1_RelationshipSatisfaction', '2_OverallSatisfaction', 'Travel_Rarely_BusinessTravel')

i=627 support=0.011325028312570781 ('Yes_Attrition', '1_RelationshipSatisfaction', '0_YearsAtCompany', '0_PercentSalaryHike', 'Male_Gender')

i=628 support=0.011325028312570781 ('Yes_Attrition', '1_RelationshipSatisfaction', '0_YearsAtCompany', '0_PercentSalaryHike', '0_MonthlyIncome')

i=629 support=0.011325028312570781 ('Yes_Attrition', '1_RelationshipSatisfaction', '0_YearsAtCompany', '0_PercentSalaryHike', 'Travel_Rarely_BusinessTravel')

i=630 support=0.011325028312570781 ('Yes_Attrition', '1_RelationshipSatisfaction', '0_YearsAtCompany', 'Male_Gender', '0_MonthlyIncome')

i=631 support=0.011325028312570781 ('Yes_Attrition', '1_RelationshipSatisfaction', '0_YearsAtCompany', '0_YearsInCurrentRole