In [104]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from fim import apriori
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, classification_report
from sklearn.metrics import plot_confusion_matrix
import imblearn
import seaborn as sns

In [138]:
def result_pred(y_true, y_pred):
    """Print the accuracy of the rule-based predictions against the real target.

    Two figures are printed: the fraction of samples whose prediction matches
    the real value, and the absolute number of such samples.

    Parameters
    ----------
    y_true : array-like
        Real target values.
    y_pred : array-like
        Target values predicted by the rules.

    Returns
    -------
    None
        The function only prints.
    """
    print("***** Calcolo dell'accuracy *****")
    # Fraction of samples where prediction and real value agree.
    accuracy = accuracy_score(y_true, y_pred)
    print('Frazione di campioni correttamente correlati: %0.4f' % accuracy)
    # `normalize` is keyword-only in current scikit-learn; passing it
    # positionally raises a TypeError.
    n_correct = accuracy_score(y_true, y_pred, normalize=False)
    print("Numero di campioni correttamente correlati: ", n_correct)
    print("____________________________________________")

In [129]:
# Folder holding the CSV files, relative to the notebook.
path = "../DataSet/"
# Working frame: binned and transformed by the cells below.
df = pd.read_csv(f"{path}Dataset_Final.csv")
# Dataset.csv: its NaN values in the missing_attr columns are later
# replaced with the explicit value "Missing".
old_df = pd.read_csv(f"{path}Dataset.csv")
# Separate read of Dataset_Final.csv, used later as the reference
# for the real values when scoring the predictions.
original_df = pd.read_csv(f"{path}Dataset_Final.csv")

## Preprocessing dei dati

Attributi che presentavano Missing Values:
    - Age,
    - TrainingTimesLastYear,
    - YearsAtCompany 
    - PerformanceRating,
    - Gender
    - BusinessTravel

## ---- Preprocessing dei dati
Applicazione del binning alle variabili continue

Dividere in 5 range MonthlyIncome: Very Low; Low; Medium; High; Very High

In [3]:
def toRange_MonthlyIncome(df):
    """Label every MonthlyIncome value with one of five income bands.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'MonthlyIncome' column.

    Returns
    -------
    list of str
        One label per row, in row order: "Very Low" (< 5000), "Low" (< 9000),
        "Medium" (< 13000), "High" (< 17000), otherwise "Very High".
    """
    # Exclusive upper bounds, checked in ascending order.
    upper_bounds = (
        (5000, "Very Low"),
        (9000, "Low"),
        (13000, "Medium"),
        (17000, "High"),
    )

    def label_for(income):
        for bound, label in upper_bounds:
            if income < bound:
                return label
        # Anything not below the last bound falls in the top band.
        return "Very High"

    return [label_for(income) for income in df['MonthlyIncome'].values]

In [4]:
# Store the income band of every row in a new column.
df["MonthlyIncome_Range"] = toRange_MonthlyIncome(df)

Divido in 4 bin le variabili YearsInCurrentRole, YearsSinceLastPromotion, YearsWithCurrManager

In [5]:
def toRange_Years(df, years_toBin):
    """Bin several "years" columns into four fixed ranges.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``years_toBin``.
    years_toBin : iterable of str
        Names of the columns to bin.

    Returns
    -------
    dict
        Maps ``"<column>_Range"`` to the list of labels of that column, in row
        order. Labels are "<=4", ">4 <=8", ">8 <=12" and ">12".
    """
    def bucket(years):
        if years <= 4:
            return "<=4"
        if years <= 8:
            return ">4 <=8"
        if years <= 12:
            return ">8 <=12"
        return ">12"

    return {
        column + "_Range": [bucket(value) for value in df[column].values]
        for column in years_toBin
    }

In [6]:
years_toBin = ["YearsInCurrentRole", "YearsSinceLastPromotion", "YearsWithCurrManager"]
ll = toRange_Years(df, years_toBin)
# toRange_Years already keys its result by "<column>_Range", so each entry
# can be attached to the frame directly.
for range_column, labels in ll.items():
    df[range_column] = labels

Divido in bin le features:
- DistanceFromHome
- Age
- PercentSalaryHike
- TotalWorkingYears
- TotalSatisfaction

In [7]:
# Column -> labels of its quantile bins; the number of labels is the number
# of quantiles requested from qcut.
# NOTE(review): the label text presumably mirrors the bin edges observed on
# this dataset; qcut derives the edges from the data, so confirm the labels
# still match if the data changes.
quantile_bins = {
    'DistanceFromHome': ['<=2', '>2 <=7', '>7 <=14', '>14'],
    'Age': ['<=31', '>31 <=37', '>37 <=42', '>42 <=60'],
    'PercentSalaryHike': ['<=12', '>12 <=14', '>14 <=18', '>18'],
    'TotalWorkingYears': ['<=6', '>6 <=10', '>10 <=15', '>15'],
    'TotalSatisfaction': ['Low', 'Medium', 'High'],
}
for column, bin_labels in quantile_bins.items():
    df[column + '_Range'] = pd.qcut(df[column], q=len(bin_labels), labels=bin_labels)

Trasformazione delle stringhe binarie in parole diverse tra loro (per evitare confusioni)

In [8]:
# Column -> replacement labels, so that each binary value becomes a word
# that cannot be confused with the values of another column.
# Values absent from a mapping become NaN and then the string 'nan', so this
# cell must not be run twice on the same frame.
binary_labels = {
    'Attrition': {"No": 'Not_Attrition', "Yes": 'Attrition'},
    'OverTime': {"No": 'Not_OverTime', "Yes": 'OverTime'},
    'NotValid': {False: 'Not_NotValid', True: 'NotValid'},
}
for column, mapping in binary_labels.items():
    df[column] = df[column].map(mapping).astype(str)

Drop delle colonne non utilizzate

In [9]:
# Continuous columns that now have a "<name>_Range" counterpart.
binned_source_columns = [
    "Age", "MonthlyIncome", "DistanceFromHome", "PercentSalaryHike",
    "TotalWorkingYears", "TotalSatisfaction",
    "YearsInCurrentRole", "YearsSinceLastPromotion", "YearsWithCurrManager",
]
df = df.drop(columns=binned_source_columns)

In [10]:
# Remaining columns that are not used in the rule mining.
unused_columns = [
    "DailyRate", "EnvironmentSatisfaction", "HourlyRate",
    "JobSatisfaction", "MonthlyRate", "WorkLifeBalance",
    "ProbablyLeave", "NotValid",
]
df = df.drop(columns=unused_columns)

Trasformazione dei valori degli attributi nel formato "valore/NomeAttributo"

In [11]:
# Display the columns left after the two drops.
df.columns

Index(['Attrition', 'BusinessTravel', 'Department', 'Education',
       'EducationField', 'Gender', 'JobInvolvement', 'JobLevel', 'JobRole',
       'MaritalStatus', 'NumCompaniesWorked', 'OverTime', 'PerformanceRating',
       'RelationshipSatisfaction', 'StockOptionLevel', 'TrainingTimesLastYear',
       'YearsAtCompany', 'MonthlyIncome_Range', 'YearsInCurrentRole_Range',
       'YearsSinceLastPromotion_Range', 'YearsWithCurrManager_Range',
       'DistanceFromHome_Range', 'Age_Range', 'PercentSalaryHike_Range',
       'TotalWorkingYears_Range', 'TotalSatisfaction_Range'],
      dtype='object')

In [12]:
# Attributes whose missing values are handled (and later predicted) below.
missing_attr = ["Gender", "BusinessTravel", "PerformanceRating"]

Rimpiazzare i missing values di Gender, BusinessTravel, PerformanceRating con il valore "Missing" (che diventerà "Missing/< Nome_attributo >" dopo l'aggiunta del suffisso)

In [13]:
# For each attribute, the column from old_df with its NaN values replaced
# by the explicit value "Missing".
missing_dict = {attr: old_df[attr].fillna("Missing") for attr in missing_attr}

In [14]:
# Check that "Missing" now appears among the distinct values of each attribute.
for attr in missing_attr:
    distinct_values = missing_dict[attr].unique()
    print(distinct_values)

['Male' 'Missing' 'Female']
['Travel_Rarely' 'Travel_Frequently' 'Missing' 'Non-Travel']
[3.0 'Missing' 4.0]


In [15]:
# Two independent copies of the preprocessed frame.
df1, df2 = df.copy(), df.copy()

Sostituisco le features che avevano originariamente missing values con le nuove colonne

In [16]:
# Overwrite the columns in df1 with the versions that keep "Missing" as a value.
for attr in missing_attr:
    column_with_missing = missing_dict[attr]
    df1[attr] = column_with_missing

In [17]:
# Turn every value into "value/Column" so that identical values of different
# attributes remain distinct items.
for column in df1.columns:
    df1[column] = df1[column].astype(str) + f"/{column}"

In [130]:
# Apply the same "value/Column" format to the reference columns, so they can
# be compared with the predicted items.
for attr in missing_attr:
    suffix = f"/{attr}"
    original_df[attr] = original_df[attr].astype(str) + suffix

Per calcolare l'accuracy delle predizioni in modo sintetico aggiungiamo ad ogni valore un "/NomeAttributo"

In [19]:
# Show the distinct items of each attribute after the suffix was added.
for attr in missing_attr:
    distinct_items = df1[attr].unique()
    print(distinct_items)

['Male/Gender' 'Missing/Gender' 'Female/Gender']
['Travel_Rarely/BusinessTravel' 'Travel_Frequently/BusinessTravel'
 'Missing/BusinessTravel' 'Non-Travel/BusinessTravel']
['3.0/PerformanceRating' 'Missing/PerformanceRating'
 '4.0/PerformanceRating']


In [21]:
# Display the columns of df1.
df1.columns

Index(['Attrition', 'BusinessTravel', 'Department', 'Education',
       'EducationField', 'Gender', 'JobInvolvement', 'JobLevel', 'JobRole',
       'MaritalStatus', 'NumCompaniesWorked', 'OverTime', 'PerformanceRating',
       'RelationshipSatisfaction', 'StockOptionLevel', 'TrainingTimesLastYear',
       'YearsAtCompany', 'MonthlyIncome_Range', 'YearsInCurrentRole_Range',
       'YearsSinceLastPromotion_Range', 'YearsWithCurrManager_Range',
       'DistanceFromHome_Range', 'Age_Range', 'PercentSalaryHike_Range',
       'TotalWorkingYears_Range', 'TotalSatisfaction_Range'],
      dtype='object')

Trasformazione del dataset in un insieme di transazioni

In [22]:
# Columns left out of the transactions.
excluded_from_baskets = [
    "YearsSinceLastPromotion_Range",
    "YearsWithCurrManager_Range", "YearsInCurrentRole_Range",
    "OverTime", "RelationshipSatisfaction", "Age_Range",
]
df_new = df1.drop(columns=excluded_from_baskets)

In [156]:
def extractRules(baskets, minz, support, confidence, valcheck):
    """Mine association rules and print the ones related to a given value.

    Rules are mined with ``apriori`` and sorted by decreasing lift. Two
    sections are printed, both limited to rules whose lift is above 1:

    1. rules whose consequent is exactly ``valcheck[0]``; their number is
       printed afterwards as ``SS``;
    2. rules whose antecedent contains the item ``valcheck[1]``.

    Parameters
    ----------
    baskets : list
        Transactions handed to ``apriori``.
    minz : int
        Forwarded to ``apriori`` as ``zmin``.
    support : number
        Forwarded to ``apriori`` as ``supp``.
    confidence : number
        Forwarded to ``apriori`` as ``conf``.
    valcheck : sequence
        ``valcheck[0]`` is the consequent item to look for, ``valcheck[1]``
        the item searched in the antecedents.

    Returns
    -------
    pandas.DataFrame
        Every mined rule sorted by decreasing lift, with columns "conseq",
        "antecedent", "support_abs", "support_perc", "confidence", "lift".
    """
    rules = apriori(baskets, supp=support, zmin=minz, target='r', conf=confidence, report='aSCl', mode='o')
    df_rules_conf = pd.DataFrame(rules, columns=["conseq","antecedent","support_abs","support_perc","confidence","lift"])
    df_rules_conf = df_rules_conf.sort_values(by='lift', ascending=False)

    print("<<<<<<<<< Check del valore "+ valcheck[0] + " >>>>>>>>>>")
    cnt = 0
    for rule in df_rules_conf.values:
        # Compare for equality: a substring test would also accept
        # "Female/Gender" when looking for "Male/Gender".
        if rule[0] == valcheck[0] and rule[5] > 1:
            _print_rule(rule)
            cnt = cnt + 1
    print("______________________________")
    print("______________________________")
    print("SS: " + str(cnt))
    print("______________________________")
    print("______________________________")

    print("<<<<<<<<< Check del valore Not/Attrition + MissingVal >>>>>>>>>>")
    for rule in df_rules_conf.values:
        # rule[1] is the tuple of antecedent items, so `in` is item membership.
        if valcheck[1] in rule[1] and rule[5] > 1:
            _print_rule(rule)
    return df_rules_conf


def _print_rule(rule):
    """Print one rule row (conseq, antecedent, support_abs, support_perc, confidence, lift)."""
    print("****** Regola:")
    print(str(rule[1]) + " => "+str(rule[0]))
    print("**** Support: " + str(rule[3]) + " Confidence: "+ str(rule[4]) + " Lift: " + str(rule[5]))
    print("______________________________")

In [50]:
# Distinct items of each attribute as they appear in the transactions.
for attr in missing_attr:
    distinct_items = df_new[attr].unique()
    print(distinct_items)

['Male/Gender' 'Missing/Gender' 'Female/Gender']
['Travel_Rarely/BusinessTravel' 'Travel_Frequently/BusinessTravel'
 'Missing/BusinessTravel' 'Non-Travel/BusinessTravel']
['3.0/PerformanceRating' 'Missing/PerformanceRating'
 '4.0/PerformanceRating']


In [23]:
# One transaction per row of df_new: the list of its "value/Column" items.
baskets = df_new.values.tolist()

In [157]:
# Inspect the rules of every real Gender value; the "Missing" placeholder is
# not a value to predict, it is only searched in the antecedents.
for gender_item in df_new["Gender"].unique():
    if gender_item == "Missing/Gender":
        continue
    x = extractRules(baskets, 2, 2, 60, [gender_item, "Missing/Gender"])

<<<<<<<<< Check del valore Male/Gender >>>>>>>>>>
****** Regola:
('Missing/PerformanceRating', '<=6/TotalWorkingYears_Range', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 2.312925170068027 Confidence: 85.0 Lift: 1.485731272294887
______________________________
****** Regola:
('<=6/TotalWorkingYears_Range', '1/JobLevel', 'Low/TotalSatisfaction_Range', 'Married/MaritalStatus', 'Travel_Rarely/BusinessTravel') => Male/Gender
**** Support: 2.0408163265306123 Confidence: 83.33333333333334 Lift: 1.4565992865636148
______________________________
****** Regola:
('2/JobLevel', '3/Education', '1/StockOptionLevel', 'Research & Development/Department', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 2.0408163265306123 Confidence: 83.33333333333334 Lift: 1.4565992865636148
______________________________
****** Regola:
('2/Education', '<=6/TotalWorkingYears_Range', '3/JobInvolvement', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 2.0408163265306123 Confidence: 83.33333333

('>15/TotalWorkingYears_Range', '3.0/TrainingTimesLastYear', '3/JobInvolvement', 'Travel_Rarely/BusinessTravel', 'Research & Development/Department', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 2.0408163265306123 Confidence: 73.17073170731707 Lift: 1.2789652272265886
______________________________
****** Regola:
('>15/TotalWorkingYears_Range', '1/StockOptionLevel', 'Life Sciences/EducationField', 'Travel_Rarely/BusinessTravel') => Male/Gender
**** Support: 2.0408163265306123 Confidence: 73.17073170731707 Lift: 1.2789652272265886
______________________________
****** Regola:
('<=6/TotalWorkingYears_Range', 'Sales/Department', '0/StockOptionLevel', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 2.0408163265306123 Confidence: 73.17073170731707 Lift: 1.2789652272265886
______________________________
****** Regola:
('2/NumCompaniesWorked', '3.0/TrainingTimesLastYear', '3/JobInvolvement') => Male/Gender
**** Support: 2.0408163265306123 Confidence: 73.17073170731707 Lift:

**** Support: 2.1088435374149657 Confidence: 70.45454545454545 Lift: 1.2314884877310561
______________________________
****** Regola:
('>15/TotalWorkingYears_Range', 'Life Sciences/EducationField', 'Low/TotalSatisfaction_Range', 'Travel_Rarely/BusinessTravel', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 2.1088435374149657 Confidence: 70.45454545454545 Lift: 1.2314884877310561
______________________________
****** Regola:
('Medium/TotalSatisfaction_Range', '0/StockOptionLevel', '3/JobInvolvement', 'Travel_Rarely/BusinessTravel', 'Research & Development/Department', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 2.1088435374149657 Confidence: 70.45454545454545 Lift: 1.2314884877310561
______________________________
****** Regola:
('10.0/YearsAtCompany', 'Low/TotalSatisfaction_Range') => Male/Gender
**** Support: 2.1088435374149657 Confidence: 70.45454545454545 Lift: 1.2314884877310561
______________________________
****** Regola:
('2.0/TrainingTimesLastYear', '2/JobL

**** Support: 2.2448979591836733 Confidence: 68.75 Lift: 1.2016944114149821
______________________________
****** Regola:
('3/Education', 'Low/TotalSatisfaction_Range', '3/JobInvolvement', 'Travel_Rarely/BusinessTravel', '3.0/PerformanceRating') => Male/Gender
**** Support: 2.9931972789115644 Confidence: 68.75 Lift: 1.2016944114149821
______________________________
****** Regola:
('1/StockOptionLevel', 'Life Sciences/EducationField', 'Very Low/MonthlyIncome_Range', 'Travel_Rarely/BusinessTravel', 'Research & Development/Department') => Male/Gender
**** Support: 2.2448979591836733 Confidence: 68.75 Lift: 1.2016944114149821
______________________________
****** Regola:
('2.0/TrainingTimesLastYear', '3/Education', '1/StockOptionLevel', 'Research & Development/Department') => Male/Gender
**** Support: 2.2448979591836733 Confidence: 68.75 Lift: 1.2016944114149821
______________________________
****** Regola:
('4.0/YearsAtCompany', '1/StockOptionLevel') => Male/Gender
**** Support: 2.2448979

______________________________
****** Regola:
('Divorced/MaritalStatus', '1/StockOptionLevel', 'Life Sciences/EducationField', 'Research & Development/Department') => Male/Gender
**** Support: 3.2653061224489797 Confidence: 67.6056338028169 Lift: 1.181691815578369
______________________________
****** Regola:
('Sales/Department', 'Single/MaritalStatus', 'Low/MonthlyIncome_Range') => Male/Gender
**** Support: 3.2653061224489797 Confidence: 67.6056338028169 Lift: 1.181691815578369
______________________________
****** Regola:
('Divorced/MaritalStatus', '1/StockOptionLevel', 'Very Low/MonthlyIncome_Range', '3.0/PerformanceRating', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 3.4013605442176873 Confidence: 67.56756756756756 Lift: 1.181026448565093
______________________________
****** Regola:
('Low/MonthlyIncome_Range', '3.0/TrainingTimesLastYear', 'Travel_Rarely/BusinessTravel', 'Research & Development/Department', '3.0/PerformanceRating', 'Not_Attrition/Attrition') => Male/Gen

('<=12/PercentSalaryHike_Range', '3/Education', '0/StockOptionLevel') => Male/Gender
**** Support: 3.2653061224489797 Confidence: 66.66666666666666 Lift: 1.165279429250892
______________________________
****** Regola:
('<=12/PercentSalaryHike_Range', '3/Education', '0/StockOptionLevel', '3.0/PerformanceRating') => Male/Gender
**** Support: 2.312925170068027 Confidence: 66.66666666666666 Lift: 1.165279429250892
______________________________
****** Regola:
('Laboratory Technician/JobRole', '4/Education', '1/JobLevel') => Male/Gender
**** Support: 2.0408163265306123 Confidence: 66.66666666666666 Lift: 1.165279429250892
______________________________
****** Regola:
('>15/TotalWorkingYears_Range', '3.0/TrainingTimesLastYear', '3/JobInvolvement', 'Travel_Rarely/BusinessTravel', '3.0/PerformanceRating') => Male/Gender
**** Support: 2.0408163265306123 Confidence: 66.66666666666666 Lift: 1.165279429250892
______________________________
****** Regola:
('Sales Executive/JobRole', '>2 <=7/Distanc

****** Regola:
('2.0/TrainingTimesLastYear', '1/StockOptionLevel', 'Married/MaritalStatus', 'Research & Development/Department') => Male/Gender
**** Support: 3.6734693877551026 Confidence: 65.85365853658537 Lift: 1.1510687045039296
______________________________
****** Regola:
('Divorced/MaritalStatus', '2/JobLevel', 'Travel_Rarely/BusinessTravel') => Male/Gender
**** Support: 3.6734693877551026 Confidence: 65.85365853658537 Lift: 1.1510687045039296
______________________________
****** Regola:
('2.0/TrainingTimesLastYear', 'Life Sciences/EducationField', 'Married/MaritalStatus') => Male/Gender
**** Support: 3.6734693877551026 Confidence: 65.85365853658537 Lift: 1.1510687045039296
______________________________
****** Regola:
('>7 <=14/DistanceFromHome_Range', '1/NumCompaniesWorked', 'Travel_Rarely/BusinessTravel') => Male/Gender
**** Support: 3.6734693877551026 Confidence: 65.85365853658537 Lift: 1.1510687045039296
______________________________
****** Regola:
('>6 <=10/TotalWorkingYe

**** Support: 2.0408163265306123 Confidence: 65.21739130434783 Lift: 1.1399472677454376
______________________________
****** Regola:
('3/Education', '1/StockOptionLevel', 'Low/TotalSatisfaction_Range', 'Research & Development/Department', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 3.061224489795918 Confidence: 65.21739130434783 Lift: 1.1399472677454376
______________________________
****** Regola:
('2/Education', 'Life Sciences/EducationField', '3.0/TrainingTimesLastYear', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 2.0408163265306123 Confidence: 65.21739130434783 Lift: 1.1399472677454376
______________________________
****** Regola:
('Sales/Department', 'Single/MaritalStatus', '2/JobLevel', '3.0/TrainingTimesLastYear') => Male/Gender
**** Support: 2.0408163265306123 Confidence: 65.21739130434783 Lift: 1.1399472677454376
______________________________
****** Regola:
('Divorced/MaritalStatus', 'Medium/TotalSatisfaction_Range', 'Very Low/MonthlyIncome_Range', '3

______________________________
****** Regola:
('>7 <=14/DistanceFromHome_Range', '>12 <=14/PercentSalaryHike_Range', 'Research & Development/Department', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 2.1088435374149657 Confidence: 64.58333333333334 Lift: 1.1288644470868014
______________________________
****** Regola:
('1/NumCompaniesWorked', 'Medium/TotalSatisfaction_Range', '1/StockOptionLevel', 'Married/MaritalStatus', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 2.1088435374149657 Confidence: 64.58333333333334 Lift: 1.1288644470868014
______________________________
****** Regola:
('>14/DistanceFromHome_Range', 'Medical/EducationField', 'Married/MaritalStatus', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 2.1088435374149657 Confidence: 64.58333333333334 Lift: 1.1288644470868014
______________________________
****** Regola:
('Attrition/Attrition', 'Medium/TotalSatisfaction_Range', '3.0/PerformanceRating') => Male/Gender
**** Support: 2.1088435374149657

**** Support: 2.1768707482993195 Confidence: 64.0 Lift: 1.1186682520808562
______________________________
****** Regola:
('2.0/TrainingTimesLastYear', '3/Education', 'Low/MonthlyIncome_Range', '3.0/PerformanceRating') => Male/Gender
**** Support: 2.1768707482993195 Confidence: 64.0 Lift: 1.1186682520808562
______________________________
****** Regola:
('>14/DistanceFromHome_Range', '0/StockOptionLevel', '3.0/TrainingTimesLastYear', '3.0/PerformanceRating') => Male/Gender
**** Support: 2.1768707482993195 Confidence: 64.0 Lift: 1.1186682520808562
______________________________
****** Regola:
('>18/PercentSalaryHike_Range', '1/NumCompaniesWorked', '3.0/TrainingTimesLastYear', 'Research & Development/Department') => Male/Gender
**** Support: 2.1768707482993195 Confidence: 64.0 Lift: 1.1186682520808562
______________________________
****** Regola:
('2/JobInvolvement', 'Sales/Department', 'Travel_Rarely/BusinessTravel', '3.0/PerformanceRating') => Male/Gender
**** Support: 2.1768707482993195

______________________________
****** Regola:
('Sales/Department', '3.0/TrainingTimesLastYear', '3/JobInvolvement', '3.0/PerformanceRating') => Male/Gender
**** Support: 3.6734693877551026 Confidence: 63.52941176470588 Lift: 1.1104427502273204
______________________________
****** Regola:
('Low/MonthlyIncome_Range', 'Married/MaritalStatus', 'Travel_Rarely/BusinessTravel', 'Research & Development/Department', '3.0/PerformanceRating', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 3.6734693877551026 Confidence: 63.52941176470588 Lift: 1.1104427502273204
______________________________
****** Regola:
('0/StockOptionLevel', 'Married/MaritalStatus', 'Travel_Rarely/BusinessTravel', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 3.6734693877551026 Confidence: 63.52941176470588 Lift: 1.1104427502273204
______________________________
****** Regola:
('<=6/TotalWorkingYears_Range', 'Married/MaritalStatus', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 6.870748299319727 

______________________________
****** Regola:
('2/NumCompaniesWorked',) => Male/Gender
**** Support: 6.258503401360545 Confidence: 63.013698630136986 Lift: 1.101428501620706
______________________________
****** Regola:
('3/Education', '3.0/TrainingTimesLastYear', '3/JobInvolvement', 'Research & Development/Department', '3.0/PerformanceRating', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 3.1292517006802725 Confidence: 63.013698630136986 Lift: 1.101428501620706
______________________________
****** Regola:
('>2 <=7/DistanceFromHome_Range', '3.0/TrainingTimesLastYear', '3/JobInvolvement', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 3.1292517006802725 Confidence: 63.013698630136986 Lift: 1.101428501620706
______________________________
****** Regola:
('<=6/TotalWorkingYears_Range', 'Low/MonthlyIncome_Range', 'Married/MaritalStatus', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 3.1292517006802725 Confidence: 63.013698630136986 Lift: 1.101428501620706
____

('Manufacturing Director/JobRole', '>6 <=10/TotalWorkingYears_Range', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 2.380952380952381 Confidence: 62.5 Lift: 1.092449464922711
______________________________
****** Regola:
('Research Director/JobRole', '>15/TotalWorkingYears_Range', '3.0/PerformanceRating') => Male/Gender
**** Support: 2.0408163265306123 Confidence: 62.5 Lift: 1.092449464922711
______________________________
****** Regola:
('>2 <=7/DistanceFromHome_Range', '1/StockOptionLevel', 'Married/MaritalStatus', '3/JobInvolvement') => Male/Gender
**** Support: 2.0408163265306123 Confidence: 62.5 Lift: 1.092449464922711
______________________________
****** Regola:
('>2 <=7/DistanceFromHome_Range', 'Medium/TotalSatisfaction_Range', '3.0/TrainingTimesLastYear', '3.0/PerformanceRating') => Male/Gender
**** Support: 2.0408163265306123 Confidence: 62.5 Lift: 1.092449464922711
______________________________
****** Regola:
('High/TotalSatisfaction_Range', 'Sales Executive/JobRo

______________________________
****** Regola:
('Sales Executive/JobRole', 'Low/MonthlyIncome_Range', '3/JobInvolvement') => Male/Gender
**** Support: 3.1292517006802725 Confidence: 62.16216216216216 Lift: 1.0865443326798856
______________________________
****** Regola:
('Low/MonthlyIncome_Range', '1/StockOptionLevel', 'Married/MaritalStatus', 'Research & Development/Department', '3.0/PerformanceRating', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 3.1292517006802725 Confidence: 62.16216216216216 Lift: 1.0865443326798856
______________________________
****** Regola:
('Medical/EducationField', 'Medium/TotalSatisfaction_Range', '3/JobInvolvement', '3.0/PerformanceRating', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 3.1292517006802725 Confidence: 62.16216216216216 Lift: 1.0865443326798856
______________________________
****** Regola:
('>12 <=14/PercentSalaryHike_Range', '1/StockOptionLevel', 'Travel_Rarely/BusinessTravel', '3.0/PerformanceRating', 'Not_Attrition/Attr

****** Regola:
('1/NumCompaniesWorked', '1/JobLevel', 'Very Low/MonthlyIncome_Range', '3.0/PerformanceRating', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 2.312925170068027 Confidence: 61.81818181818181 Lift: 1.0805318343962815
______________________________
****** Regola:
('3/Education', 'Low/MonthlyIncome_Range', '0/StockOptionLevel', 'Research & Development/Department', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 2.312925170068027 Confidence: 61.81818181818181 Lift: 1.0805318343962815
______________________________
****** Regola:
('Divorced/MaritalStatus', '1/StockOptionLevel', '3.0/TrainingTimesLastYear', 'Travel_Rarely/BusinessTravel', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 2.312925170068027 Confidence: 61.81818181818181 Lift: 1.0805318343962815
______________________________
****** Regola:
('>15/TotalWorkingYears_Range', 'Married/MaritalStatus', '3.0/TrainingTimesLastYear', '3.0/PerformanceRating') => Male/Gender
**** Support: 2.3129251700

______________________________
****** Regola:
('>6 <=10/TotalWorkingYears_Range', '1/NumCompaniesWorked', 'Life Sciences/EducationField', 'Travel_Rarely/BusinessTravel') => Male/Gender
**** Support: 2.380952380952381 Confidence: 61.40350877192983 Lift: 1.0732836848363476
______________________________
****** Regola:
('>14/DistanceFromHome_Range', 'Sales/Department', '2/JobLevel') => Male/Gender
**** Support: 2.380952380952381 Confidence: 61.40350877192983 Lift: 1.0732836848363476
______________________________
****** Regola:
('Single/MaritalStatus', 'Medium/TotalSatisfaction_Range', '0/StockOptionLevel', '3.0/PerformanceRating') => Male/Gender
**** Support: 4.761904761904762 Confidence: 61.40350877192983 Lift: 1.0732836848363476
______________________________
****** Regola:
('High/TotalSatisfaction_Range', 'Single/MaritalStatus', 'Life Sciences/EducationField', '0/StockOptionLevel') => Male/Gender
**** Support: 2.380952380952381 Confidence: 61.40350877192983 Lift: 1.0732836848363476
__

**** Support: 6.938775510204081 Confidence: 61.07784431137725 Lift: 1.0675913333855476
______________________________
****** Regola:
('1/StockOptionLevel', '3/JobInvolvement', 'Travel_Rarely/BusinessTravel', 'Research & Development/Department', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 5.442176870748299 Confidence: 61.06870229007634 Lift: 1.0674315382450916
______________________________
****** Regola:
('1/NumCompaniesWorked', '1/StockOptionLevel', 'Research & Development/Department') => Male/Gender
**** Support: 5.442176870748299 Confidence: 61.06870229007634 Lift: 1.0674315382450916
______________________________
****** Regola:
('3/Education', 'Low/MonthlyIncome_Range', 'Research & Development/Department', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 5.442176870748299 Confidence: 61.06870229007634 Lift: 1.0674315382450916
______________________________
****** Regola:
('Very Low/MonthlyIncome_Range', '0/StockOptionLevel') => Male/Gender
**** Support: 10.136054

______________________________
****** Regola:
('4/Education', 'Medium/TotalSatisfaction_Range', '1/StockOptionLevel', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 2.312925170068027 Confidence: 60.71428571428571 Lift: 1.0612366230677766
______________________________
****** Regola:
('2.0/TrainingTimesLastYear', '1/NumCompaniesWorked', '1/JobLevel', 'Research & Development/Department') => Male/Gender
**** Support: 2.312925170068027 Confidence: 60.71428571428571 Lift: 1.0612366230677766
______________________________
****** Regola:
('Technical Degree/EducationField', 'Very Low/MonthlyIncome_Range') => Male/Gender
**** Support: 2.312925170068027 Confidence: 60.71428571428571 Lift: 1.0612366230677766
______________________________
****** Regola:
('>2 <=7/DistanceFromHome_Range', '1/StockOptionLevel', '3.0/TrainingTimesLastYear', 'Not_Attrition/Attrition') => Male/Gender
**** Support: 2.312925170068027 Confidence: 60.71428571428571 Lift: 1.0612366230677766
________________________

**** Support: 2.585034013605442 Confidence: 60.317460317460316 Lift: 1.054300435988902
______________________________
****** Regola:
('>10 <=15/TotalWorkingYears_Range', '1/StockOptionLevel', '3.0/PerformanceRating') => Male/Gender
**** Support: 2.585034013605442 Confidence: 60.317460317460316 Lift: 1.054300435988902
______________________________
****** Regola:
('Very Low/MonthlyIncome_Range', '3.0/TrainingTimesLastYear', '3/JobInvolvement', 'Travel_Rarely/BusinessTravel', '3.0/PerformanceRating') => Male/Gender
**** Support: 2.585034013605442 Confidence: 60.317460317460316 Lift: 1.054300435988902
______________________________
****** Regola:
('1/NumCompaniesWorked', 'Life Sciences/EducationField', 'Very Low/MonthlyIncome_Range', '3/JobInvolvement') => Male/Gender
**** Support: 2.585034013605442 Confidence: 60.317460317460316 Lift: 1.054300435988902
______________________________
****** Regola:
('>14/DistanceFromHome_Range', 'Single/MaritalStatus', 'Travel_Rarely/BusinessTravel') => M

**** Support: 2.0408163265306123 Confidence: 60.0 Lift: 1.0487514863258025
______________________________
****** Regola:
('>6 <=10/TotalWorkingYears_Range', '1/StockOptionLevel', 'Married/MaritalStatus', 'Travel_Rarely/BusinessTravel', 'Research & Development/Department', '3.0/PerformanceRating') => Male/Gender
**** Support: 2.0408163265306123 Confidence: 60.0 Lift: 1.0487514863258025
______________________________
****** Regola:
('Low/MonthlyIncome_Range', '3.0/TrainingTimesLastYear', 'Travel_Rarely/BusinessTravel') => Male/Gender
**** Support: 7.142857142857142 Confidence: 60.0 Lift: 1.0487514863258025
______________________________
****** Regola:
('<=12/PercentSalaryHike_Range', 'Very Low/MonthlyIncome_Range', '3/JobInvolvement', 'Research & Development/Department') => Male/Gender
**** Support: 2.4489795918367347 Confidence: 60.0 Lift: 1.0487514863258025
______________________________
****** Regola:
('>7 <=14/DistanceFromHome_Range', '1/JobLevel', '0/StockOptionLevel') => Male/Gende

****** Regola:
('Missing/Gender', '3.0/PerformanceRating') => Research & Development/Department
**** Support: 2.517006802721088 Confidence: 74.0 Lift: 1.131945889698231
______________________________
****** Regola:
('Missing/Gender',) => Research & Development/Department
**** Support: 3.6054421768707483 Confidence: 70.66666666666667 Lift: 1.0809573361082205
______________________________
****** Regola:
('Missing/Gender', 'Not_Attrition/Attrition') => Research & Development/Department
**** Support: 2.925170068027211 Confidence: 70.49180327868852 Lift: 1.0782825267395644
______________________________
****** Regola:
('Missing/Gender', 'Travel_Rarely/BusinessTravel') => Research & Development/Department
**** Support: 2.0408163265306123 Confidence: 69.76744186046511 Lift: 1.0672022844420783
______________________________
****** Regola:
('Missing/Gender', 'Research & Development/Department') => 3/JobInvolvement
**** Support: 2.2448979591836733 Confidence: 62.264150943396224 Lift: 1.05447352

In [140]:
def predict_FromRules(df, baskets, minz, support, confidence, valcheck, max_rules=100):
    """Predict the value of one attribute for every row using association rules.

    Rules are mined with ``apriori`` and sorted by decreasing lift. For every
    distinct value of ``df[valcheck]`` that does not contain "Missing", the
    first ``max_rules`` rules having that value as consequent are applied:
    each row matching all the items of a rule's antecedent receives the value
    as prediction.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells are "value/Column" items; antecedent items are
        matched against the column named after their last "/".
    baskets : list
        Transactions handed to ``apriori``.
    minz : int
        Forwarded to ``apriori`` as ``zmin``.
    support : number
        Forwarded to ``apriori`` as ``supp``.
    confidence : number
        Forwarded to ``apriori`` as ``conf``.
    valcheck : str
        Name of the column to predict.
    max_rules : int, optional
        Number of highest-lift rules kept per target value (default 100).

    Returns
    -------
    tuple
        ``(pred, dt)``: ``pred`` is a list with one entry per row of ``df``,
        in row order, holding the predicted item or "NotPredicted"; ``dt``
        maps each target value to the DataFrame of the rules used for it.
    """
    rules = apriori(baskets, supp=support, zmin=minz, target='r', conf=confidence, report='aSCl', mode='o')
    df_rules_conf = pd.DataFrame(rules, columns=["conseq","antecedent","support_abs","support_perc","confidence","lift"])
    df_rules_conf = df_rules_conf.sort_values(by='lift', ascending=False)

    pred = ["NotPredicted"] * len(df)
    dt = {}
    for target_value in df[valcheck].unique():
        if "Missing" in target_value:
            continue
        dt[target_value] = df_rules_conf[df_rules_conf["conseq"] == target_value][:max_rules]
        for antecedent in dt[target_value]["antecedent"]:
            # Boolean mask of the rows satisfying every item of the antecedent;
            # built without copying the frame for each rule.
            matches = np.ones(len(df), dtype=bool)
            for item in antecedent:
                # Split on the last "/" only, so values that themselves
                # contain "/" still resolve to the right column.
                attribute = item.rsplit("/", 1)[1]
                matches &= (df[attribute] == item).values
            # Row positions (not index labels) are used, so the result is
            # aligned with the rows of df whatever its index is.
            # NOTE(review): rules are applied from highest to lowest lift and
            # later matches overwrite earlier ones, as in the original code
            # (the "only if NotPredicted" guard was commented out) - confirm
            # this is intended.
            for position in np.flatnonzero(matches):
                pred[position] = target_value
    return pred, dt

In [81]:
# Rebuild the transactions from df_new (same statement as in an earlier cell).
baskets = df_new.values.tolist()

### Sostituzione MV per Gender

In [141]:
# Predict Gender for every row (zmin=2, supp=2, conf=60 forwarded to apriori).
pred, dt = predict_FromRules(df_new, baskets, 2, 2, 60, "Gender")

In [133]:
# Store the predictions next to the real values.
# Assumes pred follows the same row order as original_df - TODO confirm.
original_df["GenderPredicted"] = pred

In [120]:
# Percentage of rows left without a prediction. Counter returns 0 for an
# absent key, so this also works when every row was predicted.
Counter(pred)["NotPredicted"] / len(pred) * 100

25.37414965986395

In [134]:
# Keep only the rows that actually received a prediction.
predicted_mask = original_df["GenderPredicted"] != "NotPredicted"
y_true = original_df.loc[predicted_mask, "Gender"]
y_pred = original_df.loc[predicted_mask, "GenderPredicted"]

In [139]:
# Accuracy of the Gender predictions on the rows that were predicted.
result_pred(y_true, y_pred)

***** Calcolo dell'accuracy *****
Frazione di campioni correttamente correlati: 0.7247
Numero di campioni correttamente correlati:  795
____________________________________________


### Sostituzione MV per PerformanceRating

In [142]:
# Predict PerformanceRating for every row (zmin=2, supp=2, conf=60 forwarded to apriori).
pred, dt = predict_FromRules(df_new, baskets, 2, 2, 60, "PerformanceRating")

In [143]:
# Store the predictions next to the real values.
# Assumes pred follows the same row order as original_df - TODO confirm.
original_df["PerformanceRatingPredicted"] = pred

In [144]:
# Percentage of rows left without a prediction. Counter returns 0 for an
# absent key, so this also works when every row was predicted.
Counter(pred)["NotPredicted"] / len(pred) * 100

43.06122448979592

In [145]:
# Keep only the rows that actually received a prediction.
predicted_mask = original_df["PerformanceRatingPredicted"] != "NotPredicted"
y_true = original_df.loc[predicted_mask, "PerformanceRating"]
y_pred = original_df.loc[predicted_mask, "PerformanceRatingPredicted"]

In [146]:
# Accuracy of the PerformanceRating predictions on the rows that were predicted.
result_pred(y_true, y_pred)

***** Calcolo dell'accuracy *****
Frazione di campioni correttamente correlati: 0.9415
Numero di campioni correttamente correlati:  788
____________________________________________


### Sostituzione MV per BusinessTravel

In [147]:
# Predict BusinessTravel for every row (zmin=2, supp=2, conf=60 forwarded to apriori).
pred, dt = predict_FromRules(df_new, baskets, 2, 2, 60, "BusinessTravel")

In [148]:
# Store the predictions next to the real values.
# Assumes pred follows the same row order as original_df - TODO confirm.
original_df["BusinessTravelPredicted"] = pred

In [149]:
# Percentage of rows left without a prediction. Counter returns 0 for an
# absent key, so this also works when every row was predicted.
Counter(pred)["NotPredicted"] / len(pred) * 100

36.12244897959184

In [150]:
# Keep only the rows that actually received a prediction.
predicted_mask = original_df["BusinessTravelPredicted"] != "NotPredicted"
y_true = original_df.loc[predicted_mask, "BusinessTravel"]
y_pred = original_df.loc[predicted_mask, "BusinessTravelPredicted"]

In [151]:
# Accuracy of the BusinessTravel predictions on the rows that were predicted.
result_pred(y_true, y_pred)

***** Calcolo dell'accuracy *****
Frazione di campioni correttamente correlati: 0.8158
Numero di campioni correttamente correlati:  766
____________________________________________
