### Cretaing a logistic regression to predict absenteeism

In [85]:
#importing the library
import numpy as np
import pandas as pd

In [20]:
#loading the dataset
data_preprocessed=pd.read_csv('python.csv')

In [21]:
#to show the top 5 rows of dataset
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


# Create the targets

In [22]:
#find the median of 'Absenteeism Time in Hours'
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [23]:
# create targets for our logistic regression
# they have to be categories and we must find a way to say if someone is 'being absent too much' or not
# what we've decided to do is to take the median of the dataset as a cut-off line
# in this way the dataset will be balanced (there will be roughly equal number of 0s and 1s for the logistic regression)
# as balancing is a great problem for ML, this will work great for us
# alternatively, if we had more data, we could have found other ways to deal with the issue 
# for instance, we could have assigned some arbitrary value as a cut-off line, instead of the median

# note that what line does is to assign 1 to anyone who has been absent 4 hours or more (more than 3 hours)
# that is the equivalent of taking half a day off
targets=np.where(data_preprocessed['Absenteeism Time in Hours']>3,1,0)

In [24]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [26]:
# create a Series in the original data frame that will contain the targets for the regression
data_preprocessed['Excessive Absenteeism']=targets

In [27]:
#to show the targets
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


# A Commnet on the targets

In [28]:
#arround 46 % targets is 1 and 64% of targets is 0
targets.sum()/ targets.shape[0]

0.45571428571428574

In [29]:
# create a checkpoint by dropping the unnecessary variables
data_with_targets=data_preprocessed.drop(['Absenteeism Time in Hours','Day of the week','Daily Work Load Average','Distance to Work'],axis=1)

In [30]:
# if data_with_targets is data_preprocessed = True, then the two are pointing to the same object
# if it is False, then the two variables are completely different and this is in fact a checkpoint
data_with_targets is data_preprocessed

False

# Select the inputs for the regression

In [31]:
data_with_targets.shape

(700, 12)

In [32]:
data_with_targets.iloc[:,0:14]
#or
#data_with_targets.iloc[:,:-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0
5,0,0,0,1,10,179,38,31,0,0,0,0
6,0,0,0,1,7,361,28,27,0,1,4,1
7,0,0,0,1,7,260,36,23,0,4,0,1
8,0,0,1,0,6,155,34,25,0,2,0,1
9,0,0,0,1,7,235,37,29,1,1,1,1


In [33]:
unscaled_inputs=data_with_targets.iloc[:,:-1]

# standardize the data

In [15]:
#from sklearn.preprocessing import StandardScaler

#absenteeism_scaler= StandardScaler()

In [34]:
# import the libraries needed to create the Custom Scaler
# note that all of them are a part of the sklearn package
# moreover, one of them is actually the StandardScaler module, 
# so you can imagine that the Custom Scaler is build on it

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create the Custom Scaler class

class CustomScaler(BaseEstimator, TransformerMixin):
    
    # init or what information we need to declare a CustomScaler object
    # and what is calculated/declared as we do
    
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        
          # scaler is nothing but a Standard Scaler object
        
        self.scaler=StandardScaler(copy,with_mean,with_std)
        self.columns=columns
        self.mean_=None
        self.var_=None
        
         # the fit method, which, again based on StandardScale
            
    def fit(self,X, y=None):
        self.scaler.fit(X[self.columns],y)
        self.mean_=np.mean(X[self.columns])
        self.var_=np.var(X[self.columns])
        return self
    
    # the transform method which does the actual scaling
    
    def transform(self,X,y=None, copy=None):
         
            # record the initial order of the columns
        
        init_col_order=X.columns
        
         # scale all features that you chose when creating the instance of the class
            
        X_scaled =pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
       
        # declare a variable containing all information that was not scaled
        
        X_not_scaled=X.loc[:,~X.columns.isin(self.columns)]
        
        # return a data frame which contains all scaled features and all 'not scaled' features
        # use the original order (that you recorded in the beginning)
        
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]
            

In [35]:
#check the columns names
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [40]:
#columns_to_scale=[ 'Month Value',
       #'Day of the week', 'Transportation Expense', 'Distance to Work',
      # 'Age', 'Daily Work Load Average', 'Body Mass Index',
      # 'Children', 'Pets']
columns_to_omit=['Reason_1','Reason_2','Reason_3','Reason_4','Education']

In [42]:
#list Comprehension: is a syntactic construct which allows us to create a list from existing lists based on loops, 
#conditionals, etc

columns_to_scale=[x for x in unscaled_inputs.columns.values if x not in columns_to_omit]


In [44]:
# fit the data (calculate mean and standard deviation); they are automatically stored inside the object

absenteeism_scaler=CustomScaler(columns_to_scale)

In [45]:
absenteeism_scaler.fit(unscaled_inputs)

  return self.partial_fit(X, y)


CustomScaler(columns=['Month Value', 'Transportation Expense', 'Age', 'Body Mass Index', 'Children', 'Pets'],
       copy=None, with_mean=None, with_std=None)

In [46]:
scaled_inputs=absenteeism_scaler.transform(unscaled_inputs)



In [47]:
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.030796,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.030796,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.030796,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.030796,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.030796,1.005844,-0.536062,0.767431,0,0.880469,0.268487
5,0,0,0,1,0.929019,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
6,0,0,0,1,0.030796,2.092381,-1.320435,0.061825,0,-0.019280,2.843016
7,0,0,0,1,0.030796,0.568211,-0.065439,-0.878984,0,2.679969,-0.589690
8,0,0,1,0,-0.268611,-1.016322,-0.379188,-0.408580,0,0.880469,-0.589690
9,0,0,0,1,0.030796,0.190942,0.091435,0.532229,1,-0.019280,0.268487


In [48]:
scaled_inputs.shape

(700, 11)

# split the data in train and test and shuffle

In [49]:
from sklearn.model_selection import train_test_split

# Split

In [50]:
x_train,x_test,y_train,y_test=train_test_split(scaled_inputs, targets, train_size=0.8)



In [51]:
print(x_train.shape, y_train.shape)

(560, 11) (560,)


In [52]:
print(x_test.shape,y_test.shape)

(140, 11) (140,)


# Logistic Regression with sklearn

In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [54]:
reg=LogisticRegression()

In [55]:
reg.fit(x_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [56]:
reg.score(x_train,y_train)

0.7714285714285715

# Manually check the accuracy

In [57]:
model_outputs=reg.predict(x_train)
model_outputs

array([0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,

In [58]:
model_outputs==y_train

array([False,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True, False,  True,  True,
        True, False,  True,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True, False, False, False,  True,
        True, False, False,  True,  True,  True, False,  True, False,
       False, False,  True,  True,  True,  True, False,  True,  True,
       False,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True, False, False,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True, False,  True,
        True,  True,  True, False, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False, False,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [59]:
np.sum(model_outputs==y_train)

432

In [60]:
model_outputs.shape[0]

560

In [61]:
np.sum(model_outputs==y_train)/model_outputs.shape[0]

0.7714285714285715

# Extracting the Intercept and Coefficients

In [62]:
#intercept of our model
reg.intercept_

array([-1.45264912])

In [63]:
#coefficients of our model
reg.coef_

array([[ 2.58656786,  0.9544401 ,  2.90853979,  0.5165485 ,  0.01582426,
         0.52402255, -0.26095383,  0.2578841 , -0.31322415,  0.44925481,
        -0.24837571]])

In [64]:
#check the name of our columns
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [65]:

Feature_name=unscaled_inputs.columns.values

In [66]:
#Summary table can calculate many methode we have calculate two way append and concatinate
summary_table=pd.DataFrame(columns=['Feature_name'], data=Feature_name)
summary_table["Coefficient"]=np.transpose(reg.coef_)
#summary_table['Coefficient'] = np.transpose(reg.coef_)
summary_table

Unnamed: 0,Feature_name,Coefficient
0,Reason_1,2.586568
1,Reason_2,0.95444
2,Reason_3,2.90854
3,Reason_4,0.516548
4,Month Value,0.015824
5,Transportation Expense,0.524023
6,Age,-0.260954
7,Body Mass Index,0.257884
8,Education,-0.313224
9,Children,0.449255


In [67]:

summary_table.index=summary_table.index+1 #now our index start with 1 not 0
summary_table.loc[0]=['Intercept', reg.intercept_[0]]
summay_table=summary_table.sort_index()
summary_table

Unnamed: 0,Feature_name,Coefficient
1,Reason_1,2.586568
2,Reason_2,0.95444
3,Reason_3,2.90854
4,Reason_4,0.516548
5,Month Value,0.015824
6,Transportation Expense,0.524023
7,Age,-0.260954
8,Body Mass Index,0.257884
9,Education,-0.313224
10,Children,0.449255


# Interpreting the Coefficients

In [68]:
#creating the new series  "odds ratio" which will show the odds ratio of each coefficient
summary_table['Odds_ratio']=np.exp(summary_table.Coefficient)
summary_table

Unnamed: 0,Feature_name,Coefficient,Odds_ratio
1,Reason_1,2.586568,13.2841
2,Reason_2,0.95444,2.597216
3,Reason_3,2.90854,18.330013
4,Reason_4,0.516548,1.676232
5,Month Value,0.015824,1.01595
6,Transportation Expense,0.524023,1.688807
7,Age,-0.260954,0.770316
8,Body Mass Index,0.257884,1.294189
9,Education,-0.313224,0.731086
10,Children,0.449255,1.567144


In [69]:
#Sorts the values in a data frame with respect to a given column
summary_table.sort_values('Odds_ratio',ascending=False)

Unnamed: 0,Feature_name,Coefficient,Odds_ratio
3,Reason_3,2.90854,18.330013
1,Reason_1,2.586568,13.2841
2,Reason_2,0.95444,2.597216
6,Transportation Expense,0.524023,1.688807
4,Reason_4,0.516548,1.676232
10,Children,0.449255,1.567144
8,Body Mass Index,0.257884,1.294189
5,Month Value,0.015824,1.01595
11,Pets,-0.248376,0.780067
7,Age,-0.260954,0.770316


#Backward elimination
the idea is that we can simplify our model by removing all features which have close to no contribution to the model

when we have the p-values, we get rid of all coefficients with p-values>0.05

if the weight is small enough, it won't make a difference anyway...

I say: if we remove these variables, the rest of our model should not really change in terms of coefficient values

### Testing the Model

In [72]:
reg.score(x_test,y_test)

0.7

In [74]:
#predict_proba is  retrun the probability estimates for all possible outputs(classes)
predicted_proba=reg.predict_proba(x_test)
predicted_proba

array([[0.84274463, 0.15725537],
       [0.8780891 , 0.1219109 ],
       [0.877581  , 0.122419  ],
       [0.36182263, 0.63817737],
       [0.81795557, 0.18204443],
       [0.25737594, 0.74262406],
       [0.88109965, 0.11890035],
       [0.3397425 , 0.6602575 ],
       [0.69926723, 0.30073277],
       [0.8780891 , 0.1219109 ],
       [0.75226594, 0.24773406],
       [0.2600551 , 0.7399449 ],
       [0.26221998, 0.73778002],
       [0.33988211, 0.66011789],
       [0.75138192, 0.24861808],
       [0.0696572 , 0.9303428 ],
       [0.71855854, 0.28144146],
       [0.81511657, 0.18488343],
       [0.67158431, 0.32841569],
       [0.68706262, 0.31293738],
       [0.7919687 , 0.2080313 ],
       [0.21480312, 0.78519688],
       [0.73123381, 0.26876619],
       [0.20280188, 0.79719812],
       [0.67367087, 0.32632913],
       [0.75578089, 0.24421911],
       [0.47494608, 0.52505392],
       [0.21241552, 0.78758448],
       [0.34293811, 0.65706189],
       [0.49273184, 0.50726816],
       [0.

In [76]:
predicted_proba.shape

(140, 2)

In [78]:
predicted_proba[:,1]

array([0.15725537, 0.1219109 , 0.122419  , 0.63817737, 0.18204443,
       0.74262406, 0.11890035, 0.6602575 , 0.30073277, 0.1219109 ,
       0.24773406, 0.7399449 , 0.73778002, 0.66011789, 0.24861808,
       0.9303428 , 0.28144146, 0.18488343, 0.32841569, 0.31293738,
       0.2080313 , 0.78519688, 0.26876619, 0.79719812, 0.32632913,
       0.24421911, 0.52505392, 0.78758448, 0.65706189, 0.50726816,
       0.26505798, 0.32217663, 0.12090016, 0.6790121 , 0.29669371,
       0.12781063, 0.7174961 , 0.65385198, 0.86776235, 0.1272834 ,
       0.26413605, 0.26505798, 0.84770495, 0.81162997, 0.650628  ,
       0.26046897, 0.26690799, 0.86528322, 0.67797857, 0.67133614,
       0.38973528, 0.59948507, 0.20185567, 0.37002512, 0.83696372,
       0.122419  , 0.56766995, 0.6591939 , 0.67238068, 0.96678485,
       0.17775187, 0.28722595, 0.51796034, 0.12467507, 0.50253075,
       0.26690799, 0.7174961 , 0.6702899 , 0.20109342, 0.73438998,
       0.72402623, 0.12344066, 0.569994  , 0.82247   , 0.20725

## save the model

saving the model is many way like joblib and json we will save the model using the (picle module)

pickle : is a Python module used to convert a python object into a character stream

In [82]:
#import the libraray
import pickle

In [83]:
#pickle the model file
with open('model','wb') as file:
    pickle.dump(reg, file)

In [84]:
#pickle the scaler file
with open('scaler','wb') as file:
    pickle.dump(absenteeism_scaler, file)
