# Business case

Predicting the probability of an individual being excessively absent from work

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
pd.options.display.max_columns = None
pd.options.display.max_rows = None
from datetime import datetime

In [2]:
df = pd.read_csv("Absenteeism-data.csv")
df.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [3]:
df_temp = df.drop('ID', axis = 1)
df_temp.head()

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [4]:
for col in df_temp.columns:
    print("Number of null values for {}: {}".format(col, df[col].isna().sum()))

Number of null values for Reason for Absence: 0
Number of null values for Date: 0
Number of null values for Transportation Expense: 0
Number of null values for Distance to Work: 0
Number of null values for Age: 0
Number of null values for Daily Work Load Average: 0
Number of null values for Body Mass Index: 0
Number of null values for Education: 0
Number of null values for Children: 0
Number of null values for Pets: 0
Number of null values for Absenteeism Time in Hours: 0


In [5]:
df_temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 11 columns):
Reason for Absence           700 non-null int64
Date                         700 non-null object
Transportation Expense       700 non-null int64
Distance to Work             700 non-null int64
Age                          700 non-null int64
Daily Work Load Average      700 non-null float64
Body Mass Index              700 non-null int64
Education                    700 non-null int64
Children                     700 non-null int64
Pets                         700 non-null int64
Absenteeism Time in Hours    700 non-null int64
dtypes: float64(1), int64(9), object(1)
memory usage: 60.2+ KB


In [6]:
df_temp.describe(include='all')

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
count,700.0,700,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
unique,,432,,,,,,,,,
top,,17/08/2015,,,,,,,,,
freq,,5,,,,,,,,,
mean,19.411429,,222.347143,29.892857,36.417143,271.801774,26.737143,1.282857,1.021429,0.687143,6.761429
std,8.356292,,66.31296,14.804446,6.379083,40.021804,4.254701,0.66809,1.112215,1.166095,12.670082
min,0.0,,118.0,5.0,27.0,205.917,19.0,1.0,0.0,0.0,0.0
25%,13.0,,179.0,16.0,31.0,241.476,24.0,1.0,0.0,0.0,2.0
50%,23.0,,225.0,26.0,37.0,264.249,25.0,1.0,1.0,0.0,3.0
75%,27.0,,260.0,50.0,40.0,294.217,31.0,1.0,2.0,1.0,8.0


In [7]:
df_temp['Reason for Absence'].value_counts().sort_index()

0      38
1      16
2       1
3       1
4       2
5       3
6       6
7      13
8       5
9       4
10     22
11     24
12      8
13     52
14     18
15      2
16      3
17      1
18     21
19     36
21      6
22     32
23    147
24      3
25     29
26     31
27     66
28    110
Name: Reason for Absence, dtype: int64

In [8]:
df_with_dummies = pd.get_dummies(data = df_temp, columns=['Reason for Absence'])

In [9]:
df_with_dummies.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason for Absence_0,Reason for Absence_1,Reason for Absence_2,Reason for Absence_3,Reason for Absence_4,Reason for Absence_5,Reason for Absence_6,Reason for Absence_7,Reason for Absence_8,Reason for Absence_9,Reason for Absence_10,Reason for Absence_11,Reason for Absence_12,Reason for Absence_13,Reason for Absence_14,Reason for Absence_15,Reason for Absence_16,Reason for Absence_17,Reason for Absence_18,Reason for Absence_19,Reason for Absence_21,Reason for Absence_22,Reason for Absence_23,Reason for Absence_24,Reason for Absence_25,Reason for Absence_26,Reason for Absence_27,Reason for Absence_28
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [10]:
df_with_dummies.columns

Index(['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education', 'Children',
       'Pets', 'Absenteeism Time in Hours', 'Reason for Absence_0',
       'Reason for Absence_1', 'Reason for Absence_2', 'Reason for Absence_3',
       'Reason for Absence_4', 'Reason for Absence_5', 'Reason for Absence_6',
       'Reason for Absence_7', 'Reason for Absence_8', 'Reason for Absence_9',
       'Reason for Absence_10', 'Reason for Absence_11',
       'Reason for Absence_12', 'Reason for Absence_13',
       'Reason for Absence_14', 'Reason for Absence_15',
       'Reason for Absence_16', 'Reason for Absence_17',
       'Reason for Absence_18', 'Reason for Absence_19',
       'Reason for Absence_21', 'Reason for Absence_22',
       'Reason for Absence_23', 'Reason for Absence_24',
       'Reason for Absence_25', 'Reason for Absence_26',
       'Reason for Absence_27', 'Reason for Absence_28'],
      dtype='object')

In [11]:
df_with_dummies = df_with_dummies.drop('Reason for Absence_0', axis = 1)

In [12]:
df_with_dummies.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason for Absence_1,Reason for Absence_2,Reason for Absence_3,Reason for Absence_4,Reason for Absence_5,Reason for Absence_6,Reason for Absence_7,Reason for Absence_8,Reason for Absence_9,Reason for Absence_10,Reason for Absence_11,Reason for Absence_12,Reason for Absence_13,Reason for Absence_14,Reason for Absence_15,Reason for Absence_16,Reason for Absence_17,Reason for Absence_18,Reason for Absence_19,Reason for Absence_21,Reason for Absence_22,Reason for Absence_23,Reason for Absence_24,Reason for Absence_25,Reason for Absence_26,Reason for Absence_27,Reason for Absence_28
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [13]:
df_with_dummies['Reason 1'] = df_with_dummies[['Reason for Absence_1','Reason for Absence_2','Reason for Absence_3','Reason for Absence_4','Reason for Absence_5','Reason for Absence_6','Reason for Absence_7','Reason for Absence_8','Reason for Absence_9','Reason for Absence_10','Reason for Absence_11','Reason for Absence_12','Reason for Absence_13','Reason for Absence_14' ]].max(axis=1)
df_with_dummies['Reason 2'] = df_with_dummies[['Reason for Absence_15','Reason for Absence_16','Reason for Absence_17' ]].max(axis=1)
df_with_dummies['Reason 3'] = df_with_dummies[['Reason for Absence_18','Reason for Absence_19', 'Reason for Absence_21'  ]].max(axis=1)
df_with_dummies['Reason 4'] = df_with_dummies[['Reason for Absence_22','Reason for Absence_23','Reason for Absence_24', 'Reason for Absence_25', 'Reason for Absence_26', 'Reason for Absence_27', 'Reason for Absence_28'  ]].max(axis=1)

In [14]:
df_with_dummies.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason for Absence_1,Reason for Absence_2,Reason for Absence_3,Reason for Absence_4,Reason for Absence_5,Reason for Absence_6,Reason for Absence_7,Reason for Absence_8,Reason for Absence_9,Reason for Absence_10,Reason for Absence_11,Reason for Absence_12,Reason for Absence_13,Reason for Absence_14,Reason for Absence_15,Reason for Absence_16,Reason for Absence_17,Reason for Absence_18,Reason for Absence_19,Reason for Absence_21,Reason for Absence_22,Reason for Absence_23,Reason for Absence_24,Reason for Absence_25,Reason for Absence_26,Reason for Absence_27,Reason for Absence_28,Reason 1,Reason 2,Reason 3,Reason 4
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1


In [15]:
df_dummies_binned = df_with_dummies.drop(columns=['Reason for Absence_1','Reason for Absence_2','Reason for Absence_3','Reason for Absence_4','Reason for Absence_5','Reason for Absence_6','Reason for Absence_7','Reason for Absence_8','Reason for Absence_9','Reason for Absence_10','Reason for Absence_11','Reason for Absence_12','Reason for Absence_13','Reason for Absence_14','Reason for Absence_15','Reason for Absence_16','Reason for Absence_17' , 'Reason for Absence_18','Reason for Absence_19', 'Reason for Absence_21' ,'Reason for Absence_22','Reason for Absence_23','Reason for Absence_24', 'Reason for Absence_25', 'Reason for Absence_26', 'Reason for Absence_27', 'Reason for Absence_28' ], axis=1)

In [16]:
df_dummies_binned.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason 1,Reason 2,Reason 3,Reason 4
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1


In [17]:
df_dummies_binned.columns.values

array(['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Reason 1',
       'Reason 2', 'Reason 3', 'Reason 4'], dtype=object)

In [18]:
columns_reordered = [ 'Reason 1',
       'Reason 2', 'Reason 3', 'Reason 4','Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours']

df_dummies_reordered_cols = df_dummies_binned[columns_reordered]
df_dummies_reordered_cols.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [19]:
df_modified = df_dummies_reordered_cols.copy()
df_modified.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [20]:
print(type(df_modified.loc[0,['Date']][0]))
print(df_modified.loc[0,['Date']][0])
date_from_df = df_modified.loc[0,['Date']][0]

date_to_convert = datetime.strptime(date_from_df, '%d/%m/%Y')
date_to_convert.weekday()

<class 'str'>
07/07/2015


1

In [21]:
dict_months = {'01':'January', '02':'February','03':'March','04':'April','05':'May', '06':'June','07':'July','08':'August', '09':'September','10':'October','11':'November', '12':'December'}
df_modified['Month'] = df_modified['Date'].apply(lambda x: x[4:5])

In [22]:
df_modified[['Date','Month']].head(10)

Unnamed: 0,Date,Month
0,07/07/2015,7
1,14/07/2015,7
2,15/07/2015,7
3,16/07/2015,7
4,23/07/2015,7
5,10/07/2015,7
6,17/07/2015,7
7,24/07/2015,7
8,06/07/2015,7
9,13/07/2015,7


In [23]:
dict_day_of_week = {0:'Monday',1:"Tuesday",2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}
df_modified['Day of the week'] = df_modified['Date'].apply(lambda x:datetime.strptime(x, '%d/%m/%Y').weekday())

In [24]:
df_modified[['Date','Day of the week']].head(10)

Unnamed: 0,Date,Day of the week
0,07/07/2015,1
1,14/07/2015,1
2,15/07/2015,2
3,16/07/2015,3
4,23/07/2015,3
5,10/07/2015,4
6,17/07/2015,4
7,24/07/2015,4
8,06/07/2015,0
9,13/07/2015,0


In [25]:
df_modified = df_modified.drop(columns='Date',axis=1)
df_modified.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month,Day of the week
0,0,0,0,1,289,36,33,239.554,30,1,2,1,4,7,1
1,0,0,0,0,118,13,50,239.554,31,1,1,0,0,7,1
2,0,0,0,1,179,51,38,239.554,31,1,0,0,2,7,2
3,1,0,0,0,279,5,39,239.554,24,1,2,0,4,7,3
4,0,0,0,1,289,36,33,239.554,30,1,2,1,2,7,3


In [26]:
df_mod_date = df_modified.copy()
df_mod_date.head(10)

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month,Day of the week
0,0,0,0,1,289,36,33,239.554,30,1,2,1,4,7,1
1,0,0,0,0,118,13,50,239.554,31,1,1,0,0,7,1
2,0,0,0,1,179,51,38,239.554,31,1,0,0,2,7,2
3,1,0,0,0,279,5,39,239.554,24,1,2,0,4,7,3
4,0,0,0,1,289,36,33,239.554,30,1,2,1,2,7,3
5,0,0,0,1,179,51,38,239.554,31,1,0,0,2,7,4
6,0,0,0,1,361,52,28,239.554,27,1,1,4,8,7,4
7,0,0,0,1,260,50,36,239.554,23,1,4,0,4,7,4
8,0,0,1,0,155,12,34,239.554,25,1,2,0,40,7,0
9,0,0,0,1,235,11,37,239.554,29,3,1,1,8,7,0


In [27]:
df_mod_date['Education'] = df_mod_date['Education'].apply(lambda x: 0 if x==1 else 1)
df_mod_date['Education'].value_counts()

0    583
1    117
Name: Education, dtype: int64

In [28]:
df_preprocessed = df_mod_date.copy()
df_preprocessed.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month,Day of the week
0,0,0,0,1,289,36,33,239.554,30,0,2,1,4,7,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,0,7,1
2,0,0,0,1,179,51,38,239.554,31,0,0,0,2,7,2
3,1,0,0,0,279,5,39,239.554,24,0,2,0,4,7,3
4,0,0,0,1,289,36,33,239.554,30,0,2,1,2,7,3


In [29]:
df_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [30]:
df_preprocessed['targets'] = np.where(df_preprocessed['Absenteeism Time in Hours'] > df_preprocessed['Absenteeism Time in Hours'].median() ,1,0)
df_preprocessed.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month,Day of the week,targets
0,0,0,0,1,289,36,33,239.554,30,0,2,1,4,7,1,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,0,7,1,0
2,0,0,0,1,179,51,38,239.554,31,0,0,0,2,7,2,0
3,1,0,0,0,279,5,39,239.554,24,0,2,0,4,7,3,1
4,0,0,0,1,289,36,33,239.554,30,0,2,1,2,7,3,0


In [31]:
data_with_targets = df_preprocessed.drop(["Absenteeism Time in Hours", "Day of the week","Daily Work Load Average","Distance to Work"], axis=1)
data_with_targets is df_preprocessed

False

In [32]:
data_with_targets.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month,targets
0,0,0,0,1,289,33,30,0,2,1,7,1
1,0,0,0,0,118,50,31,0,1,0,7,0
2,0,0,0,1,179,38,31,0,0,0,7,0
3,1,0,0,0,279,39,24,0,2,0,7,1
4,0,0,0,1,289,33,30,0,2,1,7,0


In [33]:
inputs = data_with_targets.iloc[:,:-1]
targets = data_with_targets['targets']
inputs.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month
0,0,0,0,1,289,33,30,0,2,1,7
1,0,0,0,0,118,50,31,0,1,0,7
2,0,0,0,1,179,38,31,0,0,0,7
3,1,0,0,0,279,39,24,0,2,0,7
4,0,0,0,1,289,33,30,0,2,1,7


In [34]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.scaler = StandardScaler(copy, with_mean, with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    def fit(self, X, y= None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_=  np.var(X[self.columns])
        return self
    
    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns = self.columns)
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled,X_scaled], axis=1)[init_col_order]

In [35]:
inputs.columns.values

array(['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month'], dtype=object)

In [36]:
columns_to_scale = [
       'Transportation Expense', 'Age',
        'Body Mass Index',
       'Children', 'Pets', 'Month']

In [37]:
scaler = CustomScaler(columns_to_scale)
scaler.fit(inputs)

  return self.partial_fit(X, y)


CustomScaler(columns=['Transportation Expense', 'Age', 'Body Mass Index', 'Children', 'Pets', 'Month'],
       copy=None, with_mean=None, with_std=None)

In [38]:
scaled_inputs = scaler.transform(inputs)



In [39]:
scaled_inputs.shape

(700, 11)

In [40]:
scaled_inputs.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month
0,0,0,0,1,1.005844,-0.536062,0.767431,0,0.880469,0.268487,1.154619
1,0,0,0,0,-1.574681,2.130803,1.002633,0,-0.01928,-0.58969,1.154619
2,0,0,0,1,-0.654143,0.24831,1.002633,0,-0.91903,-0.58969,1.154619
3,1,0,0,0,0.854936,0.405184,-0.643782,0,0.880469,-0.58969,1.154619
4,0,0,0,1,1.005844,-0.536062,0.767431,0,0.880469,0.268487,1.154619


In [41]:
final_columns = ['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Month',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets']
scaled_inputs = scaled_inputs[final_columns]
scaled_inputs.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,1.154619,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,1.154619,-1.574681,2.130803,1.002633,0,-0.01928,-0.58969
2,0,0,0,1,1.154619,-0.654143,0.24831,1.002633,0,-0.91903,-0.58969
3,1,0,0,0,1.154619,0.854936,0.405184,-0.643782,0,0.880469,-0.58969
4,0,0,0,1,1.154619,1.005844,-0.536062,0.767431,0,0.880469,0.268487


In [42]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state = 20)



In [43]:
print(x_train.shape, y_train.shape)

(560, 11) (560,)


In [44]:
print(x_test.shape, y_test.shape)

(140, 11) (140,)


In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [46]:
reg = LogisticRegression()

In [47]:
reg.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [48]:
reg.score(x_train, y_train)

0.7678571428571429

### Manually check the accouracy

In [49]:
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [50]:
# model_outputs==y_train

In [51]:
np.sum(model_outputs == y_train)

430

In [52]:
model_outputs.shape[0]

560

In [53]:
np.sum(model_outputs == y_train)/model_outputs.shape[0]

0.7678571428571429

In [54]:
reg.intercept_

array([-1.45975714])

In [55]:
reg.coef_

array([[ 2.64508374,  0.83843074,  2.95807729,  0.66179946,  0.30737634,
         0.58434706, -0.20885503,  0.2756095 , -0.25574104,  0.36714894,
        -0.25908782]])

In [56]:
feature_name = inputs.columns.values

In [57]:
summary_table = pd.DataFrame(columns=['Feature name'], data = feature_name)

summary_table['Coefficient'] = np.transpose(reg.coef_)

summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason 1,2.645084
1,Reason 2,0.838431
2,Reason 3,2.958077
3,Reason 4,0.661799
4,Transportation Expense,0.307376
5,Age,0.584347
6,Body Mass Index,-0.208855
7,Education,0.27561
8,Children,-0.255741
9,Pets,0.367149


In [58]:
summary_table.index = summary_table.index+1

In [59]:
summary_table

Unnamed: 0,Feature name,Coefficient
1,Reason 1,2.645084
2,Reason 2,0.838431
3,Reason 3,2.958077
4,Reason 4,0.661799
5,Transportation Expense,0.307376
6,Age,0.584347
7,Body Mass Index,-0.208855
8,Education,0.27561
9,Children,-0.255741
10,Pets,0.367149


In [60]:
summary_table.loc[0] = ['Interecept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Interecept,-1.459757
1,Reason 1,2.645084
2,Reason 2,0.838431
3,Reason 3,2.958077
4,Reason 4,0.661799
5,Transportation Expense,0.307376
6,Age,0.584347
7,Body Mass Index,-0.208855
8,Education,0.27561
9,Children,-0.255741


In [61]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Interecept,-1.459757,0.232293
1,Reason 1,2.645084,14.084624
2,Reason 2,0.838431,2.312735
3,Reason 3,2.958077,19.260903
4,Reason 4,0.661799,1.938277
5,Transportation Expense,0.307376,1.359853
6,Age,0.584347,1.793819
7,Body Mass Index,-0.208855,0.811513
8,Education,0.27561,1.317333
9,Children,-0.255741,0.774342


In [62]:
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Reason 3,2.958077,19.260903
1,Reason 1,2.645084,14.084624
2,Reason 2,0.838431,2.312735
4,Reason 4,0.661799,1.938277
6,Age,0.584347,1.793819
10,Pets,0.367149,1.443613
5,Transportation Expense,0.307376,1.359853
8,Education,0.27561,1.317333
7,Body Mass Index,-0.208855,0.811513
9,Children,-0.255741,0.774342


### Testing

In [63]:
reg.score(x_test, y_test)

0.6928571428571428

In [64]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.80842494, 0.19157506],
       [0.48155346, 0.51844654],
       [0.58071174, 0.41928826],
       [0.77236157, 0.22763843],
       [0.05839247, 0.94160753],
       [0.31830116, 0.68169884],
       [0.21808267, 0.78191733],
       [0.11916204, 0.88083796],
       [0.78153595, 0.21846405],
       [0.68685708, 0.31314292],
       [0.36658745, 0.63341255],
       [0.19872811, 0.80127189],
       [0.05917687, 0.94082313],
       [0.72561704, 0.27438296],
       [0.28405462, 0.71594538],
       [0.52451153, 0.47548847],
       [0.49959995, 0.50040005],
       [0.47236391, 0.52763609],
       [0.40291595, 0.59708405],
       [0.07778193, 0.92221807],
       [0.78153595, 0.21846405],
       [0.77236157, 0.22763843],
       [0.53895727, 0.46104273],
       [0.53895727, 0.46104273],
       [0.20606517, 0.79393483],
       [0.67466373, 0.32533627],
       [0.43278143, 0.56721857],
       [0.79230391, 0.20769609],
       [0.29807745, 0.70192255],
       [0.77236157, 0.22763843],
       [0.

In [65]:
predicted_proba[:,1]

array([0.19157506, 0.51844654, 0.41928826, 0.22763843, 0.94160753,
       0.68169884, 0.78191733, 0.88083796, 0.21846405, 0.31314292,
       0.63341255, 0.80127189, 0.94082313, 0.27438296, 0.71594538,
       0.47548847, 0.50040005, 0.52763609, 0.59708405, 0.92221807,
       0.21846405, 0.22763843, 0.46104273, 0.46104273, 0.79393483,
       0.32533627, 0.56721857, 0.20769609, 0.70192255, 0.22763843,
       0.41037677, 0.71641718, 0.72704945, 0.41931876, 0.22763843,
       0.54403429, 0.23765435, 0.74031915, 0.4265629 , 0.6893575 ,
       0.20903514, 0.44323571, 0.27939413, 0.53377793, 0.71574604,
       0.59265739, 0.72166755, 0.19157506, 0.22308907, 0.19157506,
       0.4702892 , 0.2521657 , 0.68169884, 0.35585213, 0.88666347,
       0.31031742, 0.89924114, 0.24239511, 0.47127171, 0.25038646,
       0.73602811, 0.65758106, 0.40704509, 0.86922157, 0.22240701,
       0.3618479 , 0.06337546, 0.23765435, 0.74049122, 0.36905573,
       0.23765435, 0.3386537 , 0.85404   , 0.56202789, 0.69655

### Save the model

In [66]:
import pickle

In [67]:
with open('model','wb') as file:
    pickle.dump(reg, file)

In [68]:
with open('scaler', 'wb') as file:
    pickle.dump(scaler, file)