# Import the needed libraries

In [None]:
# import the relevant libraries
import pandas as pd
import numpy as np

# Load the data

In [None]:
from google.colab import files
uploaded = files.upload()

Saving Absenteeism_preprocessed_1.csv to Absenteeism_preprocessed_1 (1).csv


In [None]:
import io
data_preprocessed = pd.read_csv(io.BytesIO(uploaded['Absenteeism_preprocessed_1.csv']))
# Dataset is now stored in a Pandas Dataframe

In [None]:
# eyeball the data
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


# Create the targets

In [None]:
# find the median of 'Absenteeism Time in Hours'
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [None]:
# create targets for our logistic regression
# they have to be categories and we must find a way to say if someone is 'being absent too much' or not
# what we've decided to do is to take the median of the dataset as a cut-off line
# in this way the dataset will be balanced (there will be roughly equal number of 0s and 1s for the logistic regression)
# as balancing is a great problem for ML, this will work great for us
# alternatively, if we had more data, we could have found other ways to deal with the issue 
# for instance, we could have assigned some arbitrary value as a cut-off line, instead of the median

# note that what line does is to assign 1 to anyone who has been absent 4 hours or more (more than 3 hours)
# that is the equivalent of taking half a day off

# initial code from the lecture
# targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 3, 1, 0)

# parameterized code
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 
                   data_preprocessed['Absenteeism Time in Hours'].median(), 1, 0)

In [None]:
# look the targets
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [None]:
# create a Series in the original data frame that will contain the targets for the regression
data_preprocessed['Excessive Absenteeism'] = targets

In [None]:
# check what happened
# maybe manually see how the targets were created
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


In [None]:
# check if dataset is balanced (what % of targets are 1s)
# targets.sum() will give us the number of 1s that there are
# the shape[0] will give us the length of the targets array
targets.sum() / targets.shape[0]

0.45571428571428574

In [None]:
# create a checkpoint by dropping the unnecessary variables
# also drop the variables we 'eliminated' after exploring the weights
data_with_targets = data_preprocessed.drop(['Age', 'Body Mass Index', 'Children', 'Pet', 'Education'],axis=1)

In [None]:
# check if the line above is a checkpoint :)

# if data_with_targets is data_preprocessed = True, then the two are pointing to the same object
# if it is False, then the two variables are completely different and this is in fact a checkpoint
data_with_targets is data_preprocessed

False

In [None]:
# check what's inside
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Daily Work Load Average,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,239.554,4,1
1,0,0,0,0,7,1,118,13,239.554,0,0
2,0,0,0,1,7,2,179,51,239.554,2,0
3,1,0,0,0,7,3,279,5,239.554,4,1
4,0,0,0,1,7,3,289,36,239.554,2,0


# Select the inputs for the regression

In [None]:
data_with_targets.shape

(700, 11)

In [None]:
# Selects all rows and all columns until 14 (excluding)
data_with_targets.iloc[:,:10]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Daily Work Load Average,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,239.554,4
1,0,0,0,0,7,1,118,13,239.554,0
2,0,0,0,1,7,2,179,51,239.554,2
3,1,0,0,0,7,3,279,5,239.554,4
4,0,0,0,1,7,3,289,36,239.554,2
...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,237.656,8
696,1,0,0,0,5,2,225,26,237.656,3
697,1,0,0,0,5,3,330,16,237.656,8
698,0,0,0,1,5,3,235,16,237.656,2


In [None]:
# Selects all rows and all columns but the last one (basically the same operation)
data_with_targets.iloc[:,:-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Daily Work Load Average,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,239.554,4
1,0,0,0,0,7,1,118,13,239.554,0
2,0,0,0,1,7,2,179,51,239.554,2
3,1,0,0,0,7,3,279,5,239.554,4
4,0,0,0,1,7,3,289,36,239.554,2
...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,237.656,8
696,1,0,0,0,5,2,225,26,237.656,3
697,1,0,0,0,5,3,330,16,237.656,8
698,0,0,0,1,5,3,235,16,237.656,2


In [None]:
# Create a variable that will contain the inputs (everything without the targets)
unscaled_inputs = data_with_targets.iloc[:,:-1]

# Standardize the data

In [None]:
# standardize the inputs

from sklearn.preprocessing import StandardScaler

# define scaler as an object
absenteeism_scaler = StandardScaler()

In [None]:
# import the libraries needed to create the Custom Scaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create the Custom Scaler class

class CustomScaler(BaseEstimator,TransformerMixin): 
    
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        
        # scaler is nothing but a Standard Scaler object
        self.scaler = StandardScaler(copy,with_mean,with_std)
        # with some columns 'twist'
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    
    # the fit method based on StandardScale
    
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    # the transform method for actual scaling

    def transform(self, X, y=None, copy=None):
        
        # record the initial order of the columns
        init_col_order = X.columns
        
        # scale all features that you chose when creating the instance of the class
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        
        # declare a variable containing all information that was not scaled
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        
        # return a data frame which contains all scaled features and all 'not scaled' features
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [None]:
# check what are all columns that we've got
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Daily Work Load Average', 'Absenteeism Time in Hours'],
      dtype=object)

In [None]:
# choose the columns to scale
# we later augmented this code and put it in comments
# columns_to_scale = ['Month Value','Day of the Week', 'Transportation Expense', 'Distance to Work',
       #'Age', 'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pet']
    
# select the columns to omit
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']

In [None]:
# create the columns to scale, based on the columns to omit
# use list comprehension to iterate over the list
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [None]:
# declare a scaler object, specifying the columns you want to scale
absenteeism_scaler = CustomScaler(columns_to_scale)

In [None]:
# fit the data (calculate mean and standard deviation); they are automatically stored inside the object 
absenteeism_scaler.fit(unscaled_inputs)



CustomScaler(columns=['Month Value', 'Day of the Week',
                      'Transportation Expense', 'Distance to Work',
                      'Daily Work Load Average', 'Absenteeism Time in Hours'],
             copy=None, with_mean=None, with_std=None)

In [None]:
# standardizes the data, using the transform method 
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [None]:
# the scaled_inputs are now an ndarray, because sklearn works with ndarrays
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Daily Work Load Average,Absenteeism Time in Hours
0,0,0,0,1,0.030796,-0.800950,1.005844,0.412816,-0.806331,-0.218105
1,0,0,0,0,0.030796,-0.800950,-1.574681,-1.141882,-0.806331,-0.534035
2,0,0,0,1,0.030796,-0.232900,-0.654143,1.426749,-0.806331,-0.376070
3,1,0,0,0,0.030796,0.335149,0.854936,-1.682647,-0.806331,-0.218105
4,0,0,0,1,0.030796,0.335149,1.005844,0.412816,-0.806331,-0.376070
...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.568019,-0.232900,-0.654143,-0.533522,-0.853789,0.097825
696,1,0,0,0,-0.568019,-0.232900,0.040034,-0.263140,-0.853789,-0.297087
697,1,0,0,0,-0.568019,0.335149,1.624567,-0.939096,-0.853789,0.097825
698,0,0,0,1,-0.568019,0.335149,0.190942,-0.939096,-0.853789,-0.376070


In [None]:
# check the shape of the inputs
scaled_inputs.shape

(700, 10)

# Split the data for Train and Test stage

In [None]:
# import train_test_split so we can split our data into train and test
from sklearn.model_selection import train_test_split

In [None]:
# check how this method works
train_test_split(scaled_inputs, targets)

[     Reason_1  Reason_2  ...  Daily Work Load Average  Absenteeism Time in Hours
 564         0         0  ...                 0.218718                   0.097825
 399         0         0  ...                -0.685486                   0.097825
 282         0         0  ...                 0.560476                  -0.218105
 585         1         0  ...                -0.188851                   0.097825
 699         0         0  ...                -0.853789                  -0.376070
 ..        ...       ...  ...                      ...                        ...
 409         0         0  ...                -0.685486                   0.097825
 56          0         0  ...                -0.758273                  -0.297087
 29          0         0  ...                -1.647399                  -0.376070
 266         0         0  ...                -0.154696                  -0.455052
 62          0         0  ...                -0.458497                  -0.297087
 
 [525 rows x 1

In [None]:
# declare 4 variables for the split
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, #train_size = 0.8, 
                                                                            test_size = 0.2, random_state = 20)

In [None]:
# check the shape of the train inputs and targets
print (x_train.shape, y_train.shape)

(560, 10) (560,)


In [None]:
# check the shape of the test inputs and targets
print (x_test.shape, y_test.shape)

(140, 10) (140,)


# Apply Logical regression and train the model

In [None]:
# import the LogReg model from sklearn
from sklearn.linear_model import LogisticRegression

# import the 'metrics' module, which includes important metrics we may want to use
from sklearn import metrics

In [None]:
# create a logistic regression object
reg = LogisticRegression()

In [None]:
# fit our train inputs
# that is basically the whole training part of the machine learning
reg.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# assess the train accuracy of the model
reg.score(x_train,y_train)

0.9321428571428572

# Evaluate the Accuracy

In [None]:
# find the model outputs according to our model
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,

In [None]:
# compare them with the targets
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [None]:
# ACTUALLY compare the two variables
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False, False,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [None]:
# find out in how many instances we predicted correctly
np.sum((model_outputs==y_train))

522

In [None]:
# get the total number of instances
model_outputs.shape[0]

560

In [None]:
# calculate the accuracy of the model
np.sum((model_outputs==y_train)) / model_outputs.shape[0]

0.9321428571428572

# Find the Intercepts and Coefficients

In [None]:
# get the intercept (bias) of our model
reg.intercept_

array([1.28286724])

In [None]:
# get the coefficients (weights) of our model
reg.coef_

array([[ 0.6075219 , -0.02738523,  0.49833285,  0.13631453,  0.10127289,
        -0.05091472,  0.43044728,  0.08755244, -0.03639032,  8.94336027]])

In [None]:
# check what were the names of our columns
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Daily Work Load Average', 'Absenteeism Time in Hours'],
      dtype=object)

In [None]:
# save the names of the columns in an ad-hoc variable
feature_name = unscaled_inputs.columns.values

In [None]:
# use the coefficients from this table (they will be exported later and will be used in Tableau)
# transpose the model coefficients (model.coef_) and throws them into a df (a vertical organization, so that they can be
# multiplied by certain matrices later) 
summary_table = pd.DataFrame (columns=['Feature name'], data = feature_name)

# add the coefficient values to the summary table
summary_table['Coefficient'] = np.transpose(reg.coef_)

# display the summary table
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,0.607522
1,Reason_2,-0.027385
2,Reason_3,0.498333
3,Reason_4,0.136315
4,Month Value,0.101273
5,Day of the Week,-0.050915
6,Transportation Expense,0.430447
7,Distance to Work,0.087552
8,Daily Work Load Average,-0.03639
9,Absenteeism Time in Hours,8.94336


In [None]:
# do a little Python trick to move the intercept to the top of the summary table
# move all indices by 1
summary_table.index = summary_table.index + 1

# add the intercept at index 0
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]

# sort the df by index
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,1.282867
1,Reason_1,0.607522
2,Reason_2,-0.027385
3,Reason_3,0.498333
4,Reason_4,0.136315
5,Month Value,0.101273
6,Day of the Week,-0.050915
7,Transportation Expense,0.430447
8,Distance to Work,0.087552
9,Daily Work Load Average,-0.03639


# Interpreting the coefficients

In [None]:
# create a new Series called: 'Odds ratio' which will show the.. odds ratio of each feature
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [None]:
# display the df
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,1.282867,3.606967
1,Reason_1,0.607522,1.835876
2,Reason_2,-0.027385,0.972986
3,Reason_3,0.498333,1.645975
4,Reason_4,0.136315,1.146042
5,Month Value,0.101273,1.106579
6,Day of the Week,-0.050915,0.95036
7,Transportation Expense,0.430447,1.537945
8,Distance to Work,0.087552,1.091499
9,Daily Work Load Average,-0.03639,0.964264


In [None]:
# sort the table according to odds ratio
# note that by default, the sort_values method sorts values by 'descending'
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
10,Absenteeism Time in Hours,8.94336,7656.883046
0,Intercept,1.282867,3.606967
1,Reason_1,0.607522,1.835876
3,Reason_3,0.498333,1.645975
7,Transportation Expense,0.430447,1.537945
4,Reason_4,0.136315,1.146042
5,Month Value,0.101273,1.106579
8,Distance to Work,0.087552,1.091499
2,Reason_2,-0.027385,0.972986
9,Daily Work Load Average,-0.03639,0.964264


# Test the model

In [None]:
# assess the test accuracy of the model
reg.score(x_test,y_test)

0.9285714285714286

In [None]:
# find the predicted probabilities of each class
# the first column shows the probability of a particular observation to be 0, while the second one - to be 1
predicted_proba = reg.predict_proba(x_test)

# let's check that out
predicted_proba

array([[9.36922870e-01, 6.30771304e-02],
       [5.20368435e-01, 4.79631565e-01],
       [7.58095302e-02, 9.24190470e-01],
       [8.19572182e-01, 1.80427818e-01],
       [3.80147903e-02, 9.61985210e-01],
       [5.54825382e-01, 4.45174618e-01],
       [7.18391729e-02, 9.28160827e-01],
       [6.05972822e-02, 9.39402718e-01],
       [8.83530328e-01, 1.16469672e-01],
       [6.55599577e-01, 3.44400423e-01],
       [3.10911514e-02, 9.68908849e-01],
       [3.82173003e-01, 6.17826997e-01],
       [0.00000000e+00, 1.00000000e+00],
       [9.61348332e-02, 9.03865167e-01],
       [7.06665507e-02, 9.29333449e-01],
       [4.60324185e-01, 5.39675815e-01],
       [6.92623971e-01, 3.07376029e-01],
       [4.21031568e-02, 9.57896843e-01],
       [0.00000000e+00, 1.00000000e+00],
       [3.28745564e-02, 9.67125444e-01],
       [7.57449784e-01, 2.42550216e-01],
       [8.15255775e-01, 1.84744225e-01],
       [1.15942771e-01, 8.84057229e-01],
       [8.07986148e-01, 1.92013852e-01],
       [7.118766

In [None]:
predicted_proba.shape

(140, 2)

In [None]:
# select ONLY the probabilities referring to 1s
predicted_proba[:,1]

array([0.06307713, 0.47963156, 0.92419047, 0.18042782, 0.96198521,
       0.44517462, 0.92816083, 0.93940272, 0.11646967, 0.34440042,
       0.96890885, 0.617827  , 1.        , 0.90386517, 0.92933345,
       0.53967582, 0.30737603, 0.95789684, 1.        , 0.96712544,
       0.24255022, 0.18474423, 0.88405723, 0.19201385, 0.92881233,
       0.11002955, 0.15451328, 0.88532953, 0.08330488, 0.18914006,
       0.93888755, 0.93195205, 0.0888743 , 0.97050905, 0.18474423,
       0.94500294, 0.22787795, 0.87190308, 0.43577913, 0.45478671,
       0.10222723, 0.99962265, 0.1058822 , 0.04121936, 0.15489353,
       0.87634979, 0.99999858, 0.05591275, 0.15980931, 0.085993  ,
       0.99975134, 0.06168365, 0.99999895, 0.07354757, 1.        ,
       0.11763431, 0.98124109, 0.39166506, 0.03097074, 0.03182124,
       0.94313315, 0.92507453, 0.07620141, 0.31467588, 0.8984778 ,
       0.21564647, 0.02080362, 0.12344711, 0.99979102, 0.91424573,
       0.1919877 , 0.06608821, 0.95331102, 0.07882679, 0.94570