Losgistic regression with sklearn

In [1]:
import numpy as np
import pandas as pd
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [2]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [3]:
#  > 3, 1, 0.  If mmedian is gretaer than 3, that is 1. if not, that is 0
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 3, 1, 0)

In [4]:
#add new column '-targets- called Excessive Absenteeism' into data_preprocessed
data_preprocessed['Excessive Absenteeism'] = targets

In [5]:

#data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours'], axis =1)
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours', 'Day of the Week', 
                                            'Daily Work Load Average', 'Distance to Work'], axis =1)

In [6]:
unscaled_inputs = data_with_targets.iloc[:,:-1]

In [7]:
from sklearn.preprocessing import StandardScaler
absenteeism_scaler = StandardScaler()
#absenteeism_scaler.fit(unscaled_inputs)

In [8]:
# import the libraries needed to create the Custom Scaler
# note that all of them are a part of the sklearn package
# moreover, one of them is actually the StandardScaler module, 
# so you can imagine that the Custom Scaler is build on it

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create the Custom Scaler class

class CustomScaler(BaseEstimator,TransformerMixin): 
    
    # init or what information we need to declare a CustomScaler object
    # and what is calculated/declared as we do
    
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        
        # scaler is nothing but a Standard Scaler object
        self.scaler = StandardScaler(copy,with_mean,with_std)
        # with some columns 'twist'
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    
    # the fit method, which, again based on StandardScale
    
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    # the transform method which does the actual scaling

    def transform(self, X, y=None, copy=None):
        
        # record the initial order of the columns
        init_col_order = X.columns
        
        # scale all features that you chose when creating the instance of the class
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        
        # declare a variable containing all information that was not scaled
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        
        # return a data frame which contains all scaled features and all 'not scaled' features
        # use the original order (that you recorded in the beginning)
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [9]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [10]:
#columns_to_scale = ['Month Value', 'Day of the Week', 'Transportation Expense', 'Distance to Work',
#'Age', 'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pet']

columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education']

In [11]:
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [12]:
absenteeism_scaler = CustomScaler(columns_to_scale)
#absenteeism_scaler = StandardScaler()



In [13]:
absenteeism_scaler.fit(unscaled_inputs)



CustomScaler(columns=['Month Value', 'Transportation Expense', 'Age',
                      'Body Mass Index', 'Children', 'Pet'],
             copy=None, with_mean=None, with_std=None)

In [14]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [15]:
from sklearn import preprocessing
import numpy as np
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])
scaler = preprocessing.StandardScaler().fit(X_train)


In [16]:
from sklearn.model_selection import train_test_split
train_test_split(scaled_inputs, targets)

[     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  \
 632         0         0         0         1    -1.166834   
 167         1         0         0         0    -1.166834   
 243         1         0         0         0     0.030796   
 337         0         0         0         0     1.228426   
 299         1         0         0         0     0.929019   
 ..        ...       ...       ...       ...          ...   
 207         0         0         1         0    -1.166834   
 72          0         0         0         1     0.929019   
 196         0         0         0         1    -0.867426   
 684         0         0         0         1     1.228426   
 183         0         0         0         1    -1.166834   
 
      Transportation Expense       Age  Body Mass Index  Education  Children  \
 632               -0.654143  0.248310         1.002633          0 -0.919030   
 167               -1.016322 -0.379188        -0.408580          0  0.880469   
 243                1.6245

In [17]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs,targets, train_size = 0.8, random_state = 20)
#train_size = 80%, random_state = SPLITS DATA INTO RANDOM TRAIN AND TEST SUBSETS
#random_state  20 array or subsets

In [18]:
print(x_train.shape, y_train.shape)

(560, 11) (560,)


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

Trainning the model

In [20]:
reg = LogisticRegression()

In [21]:
#LogisticRegression().fit
reg.fit(x_train, y_train)

LogisticRegression()

In [22]:
#mean accuracy
reg.score(x_train, y_train)

0.7767857142857143

Manually check the accuracy

In [23]:
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [24]:
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [25]:
np.sum(model_outputs==y_train)
#total number of correct predictions

435

In [26]:
model_outputs.shape[0]

560

In [27]:
##Accuracy = correct predictions/ # observations
np.sum((model_outputs==y_train)/model_outputs.shape[0])

0.7767857142857144

Finding the intercept and coefficients

In [28]:
reg.intercept_

array([-1.60957471])

In [29]:
reg.coef_

array([[ 2.77151176,  0.93168817,  3.09210221,  0.8090592 ,  0.00781237,
         0.62505482, -0.17390339,  0.28829409, -0.24081615,  0.35753531,
        -0.27337422]])

In [30]:
## scaled_inputs.columns.values  'numpy.ndarray' object has no attribute 'columns'
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [31]:
feature_name = unscaled_inputs.columns.values

In [32]:
summary_table = pd.DataFrame(columns=["Feature name"], data = feature_name)
summary_table["coefficient"] = np.transpose(reg.coef_)
summary_table

Unnamed: 0,Feature name,coefficient
0,Reason_1,2.771512
1,Reason_2,0.931688
2,Reason_3,3.092102
3,Reason_4,0.809059
4,Month Value,0.007812
5,Transportation Expense,0.625055
6,Age,-0.173903
7,Body Mass Index,0.288294
8,Education,-0.240816
9,Children,0.357535


In [33]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ["Intercept", reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,coefficient
0,Intercept,-1.609575
1,Reason_1,2.771512
2,Reason_2,0.931688
3,Reason_3,3.092102
4,Reason_4,0.809059
5,Month Value,0.007812
6,Transportation Expense,0.625055
7,Age,-0.173903
8,Body Mass Index,0.288294
9,Education,-0.240816


In [34]:
#log(odds) = b0 +b1x1+ b2x2+ b3x3+ ..

In [35]:
summary_table["odds_ratio"] = np.exp(summary_table.coefficient)

In [36]:
summary_table

Unnamed: 0,Feature name,coefficient,odds_ratio
0,Intercept,-1.609575,0.199973
1,Reason_1,2.771512,15.982778
2,Reason_2,0.931688,2.538791
3,Reason_3,3.092102,22.023327
4,Reason_4,0.809059,2.245794
5,Month Value,0.007812,1.007843
6,Transportation Expense,0.625055,1.868348
7,Age,-0.173903,0.840378
8,Body Mass Index,0.288294,1.33415
9,Education,-0.240816,0.785986


In [37]:

#odds ratio is around 1 that could be nothing and useless
# odds* odds ratio = new odds
#5:1 * 2 = 10:1


# sort the table according to odds ratio
# note that by default, the sort_values method sorts values by 'ascending'
summary_table.sort_values("odds_ratio", ascending=False)

Unnamed: 0,Feature name,coefficient,odds_ratio
3,Reason_3,3.092102,22.023327
1,Reason_1,2.771512,15.982778
2,Reason_2,0.931688,2.538791
4,Reason_4,0.809059,2.245794
6,Transportation Expense,0.625055,1.868348
10,Children,0.357535,1.429801
8,Body Mass Index,0.288294,1.33415
5,Month Value,0.007812,1.007843
7,Age,-0.173903,0.840378
9,Education,-0.240816,0.785986


In [38]:
#machine learning engineers focuses on accuracy like standard deviation
#econometricians and statisticians focuses on interable models like out phenomena

In [39]:
reg.score(x_test, y_test)

0.7357142857142858

In [41]:
#reg is regression
#left is probability for target 0 
#right is probability for target 1 
predicted_proba = reg.predict_proba(x_test)

predicted_proba

array([[0.75466347, 0.24533653],
       [0.60917591, 0.39082409],
       [0.48329093, 0.51670907],
       [0.75768232, 0.24231768],
       [0.08357741, 0.91642259],
       [0.3052464 , 0.6947536 ],
       [0.303675  , 0.696325  ],
       [0.11636888, 0.88363112],
       [0.7400284 , 0.2599716 ],
       [0.75596036, 0.24403964],
       [0.50609784, 0.49390216],
       [0.19501503, 0.80498497],
       [0.06248668, 0.93751332],
       [0.7055465 , 0.2944535 ],
       [0.29675526, 0.70324474],
       [0.52028649, 0.47971351],
       [0.50551315, 0.49448685],
       [0.50843643, 0.49156357],
       [0.36713074, 0.63286926],
       [0.06422143, 0.93577857],
       [0.73822433, 0.26177567],
       [0.75768232, 0.24231768],
       [0.47994423, 0.52005577],
       [0.47760936, 0.52239064],
       [0.22619725, 0.77380275],
       [0.74047815, 0.25952185],
       [0.51148533, 0.48851467],
       [0.87702735, 0.12297265],
       [0.24005377, 0.75994623],
       [0.75768232, 0.24231768],
       [0.

In [45]:
x_test.shape

(140, 11)

In [43]:
predicted_proba.shape

(140, 2)

In [44]:
predicted_proba[:,1]

array([0.24533653, 0.39082409, 0.51670907, 0.24231768, 0.91642259,
       0.6947536 , 0.696325  , 0.88363112, 0.2599716 , 0.24403964,
       0.49390216, 0.80498497, 0.93751332, 0.2944535 , 0.70324474,
       0.47971351, 0.49448685, 0.49156357, 0.63286926, 0.93577857,
       0.26177567, 0.24231768, 0.52005577, 0.52239064, 0.77380275,
       0.25952185, 0.48851467, 0.12297265, 0.75994623, 0.24231768,
       0.38859882, 0.71238179, 0.69821485, 0.49507156, 0.24231768,
       0.59772596, 0.26042186, 0.78022686, 0.4398591 , 0.60641126,
       0.24188848, 0.49713003, 0.25862385, 0.40689715, 0.80759194,
       0.59889345, 0.71944702, 0.24317762, 0.24676126, 0.2414598 ,
       0.50180816, 0.29299268, 0.6947536 , 0.24459387, 0.82033268,
       0.39193844, 0.90599346, 0.26442957, 0.32150234, 0.3220128 ,
       0.70502976, 0.69623972, 0.26579672, 0.77584327, 0.24541121,
       0.24490372, 0.07551158, 0.26087263, 0.76176924, 0.29640101,
       0.25772789, 0.31539593, 0.88408057, 0.4387068 , 0.59547

save the model

In [46]:
import pickle

In [None]:
with open('model', 'wb') as file:
    pickle.dumb(reg, file)

In [47]:
with open('scaler','wb') as file:
    pickle.dump(absenteeism_scaler, file)

In [66]:
# load the model from disk
loaded_model = pickle.load(open('scaler', 'rb'))
result = loaded_model.transform(x_test,y_test)
print(result)

     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  \
0         NaN       NaN       NaN       NaN    -1.786901   
1         0.0       0.0       0.0       0.0    -1.876546   
2         NaN       NaN       NaN       NaN    -1.697256   
3         NaN       NaN       NaN       NaN    -2.414415   
4         NaN       NaN       NaN       NaN    -2.055835   
..        ...       ...       ...       ...          ...   
671       0.0       0.0       1.0       0.0          NaN   
672       1.0       0.0       0.0       0.0          NaN   
688       0.0       0.0       0.0       0.0          NaN   
693       1.0       0.0       0.0       0.0          NaN   
694       0.0       0.0       0.0       1.0          NaN   

     Transportation Expense       Age  Body Mass Index  Education  Children  \
0                 -3.365266 -5.673966        -6.052813        NaN -1.745927   
1                 -3.339760 -5.624746        -6.384734        0.0 -0.936377   
2                 -3.352513 -5.550918     