## Machine Learning

### Creating a logistic regression to predict absenteeism

In [1]:
# import relevant libraries
import numpy as np
import pandas as pd

In [2]:
# Load the preprocessed absenteeism data (the path is relative to this notebook).
data_preprocessed = pd.read_csv('../data/preprocessed.csv')
# Display the frame to sanity-check the load.
data_preprocessed

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Year,Month,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,2015,July,Tuesday,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,2015,July,Tuesday,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,2015,July,Wednesday,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,2015,July,Thursday,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,2015,July,Thursday,289,36,33,239.554,30,0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,2018,May,Wednesday,179,22,40,237.656,22,1,2,0,8
696,1,0,0,0,2018,May,Wednesday,225,26,28,237.656,24,0,1,2,3
697,1,0,0,0,2018,May,Thursday,330,16,28,237.656,25,1,0,0,8
698,0,0,0,1,2018,May,Thursday,235,16,32,237.656,25,1,0,0,2


### Create the targets

In [3]:
# Logistic regression needs a binary target, so absences are split into two
# classes: moderately absent and excessively absent.
# The median value of 'Absenteeism Time in Hours' is used as the cut-off line.
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [4]:
# Encode the target: 0 = moderately absent (hours <= median),
#                    1 = excessively absent (hours > median).
absence_hours = data_preprocessed['Absenteeism Time in Hours']
median_hours = absence_hours.median()
targets = np.where(absence_hours > median_hours, 1, 0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [5]:
# Attach the binary targets to the dataframe as a new column
# (this adds the column to data_preprocessed itself).
data_preprocessed['Excessive Absenteeism'] = targets
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Year,Month,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,2015,July,Tuesday,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,2015,July,Tuesday,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,2015,July,Wednesday,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,2015,July,Thursday,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,2015,July,Thursday,289,36,33,239.554,30,0,2,1,2,0


In [6]:
# Share of observations labelled 1 (excessively absent); a value near 0.5
# means the two classes are roughly balanced.
targets.mean()

0.45571428571428574

In [7]:
# Drop the raw hours column (now encoded in 'Excessive Absenteeism') together
# with three feature columns that are not used as model inputs.
columns_to_drop = ['Absenteeism Time in Hours', 'Day', 'Daily Work Load Average', 'Distance to Work']
data_with_targets = data_preprocessed.drop(columns=columns_to_drop)
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Year,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,2015,July,289,33,30,0,2,1,1
1,0,0,0,0,2015,July,118,50,31,0,1,0,0
2,0,0,0,1,2015,July,179,38,31,0,0,0,0
3,1,0,0,0,2015,July,279,39,24,0,2,0,1
4,0,0,0,1,2015,July,289,33,30,0,2,1,0


## Selecting the inputs for the regression

In [8]:
# Confirm the dimensions (rows, columns) of the frame after dropping columns.
data_with_targets.shape

(700, 13)

In [9]:
# Select every column except the last one ('Excessive Absenteeism', the target).
# .copy() makes unscaled_inputs an independent frame, so later column
# assignments neither trigger SettingWithCopyWarning nor depend on whether
# pandas returned a view of data_with_targets.
unscaled_inputs = data_with_targets.iloc[:, :-1].copy()
unscaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Year,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,2015,July,289,33,30,0,2,1
1,0,0,0,0,2015,July,118,50,31,0,1,0
2,0,0,0,1,2015,July,179,38,31,0,0,0
3,1,0,0,0,2015,July,279,39,24,0,2,0
4,0,0,0,1,2015,July,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,2018,May,179,40,22,1,2,0
696,1,0,0,0,2018,May,225,28,24,0,1,2
697,1,0,0,0,2018,May,330,28,25,1,0,0
698,0,0,0,1,2018,May,235,32,25,1,0,0


## Standardize the Data

In [10]:
#from sklearn.preprocessing import StandardScaler

# create an object to subtract the mean and divide by the SD featurewise
#absenteeism_scalar = StandardScaler()

In [11]:
# StandardScaler needs numeric input, so convert the month names to the
# integers 1-12 ('Day' was dropped earlier, so only 'Month' needs converting).
month_names = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
month_to_number = {name: number for number, name in enumerate(month_names, start=1)}

# Values that are not month names (e.g. already-converted numbers when this
# cell is re-run) pass through unchanged, which keeps the cell idempotent;
# astype(int) then fails loudly if anything non-numeric is left over.
# assign() builds a new frame instead of writing into a possible slice.
unscaled_inputs = unscaled_inputs.assign(
    Month=unscaled_inputs['Month'].map(lambda month: month_to_number.get(month, month)).astype(int)
)

unscaled_inputs.head(9)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Year,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,2015,7,289,33,30,0,2,1
1,0,0,0,0,2015,7,118,50,31,0,1,0
2,0,0,0,1,2015,7,179,38,31,0,0,0
3,1,0,0,0,2015,7,279,39,24,0,2,0
4,0,0,0,1,2015,7,289,33,30,0,2,1
5,0,0,0,1,2015,7,179,38,31,0,0,0
6,0,0,0,1,2015,7,361,28,27,0,1,4
7,0,0,0,1,2015,7,260,36,23,0,4,0
8,0,0,1,0,2015,7,155,34,25,0,2,0


In [12]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# Create a custom scaler so that we can choose which inputs to standardize.
# This allows the dummy-variable columns to be excluded from standardization.
# Alternatively, one can standardize the dataset before creating the dummy variables.
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only selected columns of a DataFrame; pass the others through.

    Wraps a ``StandardScaler`` that is fitted on, and applied to, ``columns``
    only, so that e.g. dummy-variable columns are left unscaled.

    Parameters
    ----------
    columns : list
        Labels of the DataFrame columns to standardize.
    copy, with_mean, with_std : bool, default=True
        Forwarded to the wrapped ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler.
    mean_ : pandas.Series or None
        Per-column mean of the selected columns (``None`` before ``fit``).
    var_ : pandas.Series or None
        Per-column population variance (ddof=0) of the selected columns
        (``None`` before ``fit``).
    """

    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        # Store every constructor argument under its own name: BaseEstimator's
        # get_params()/repr/clone read them back via attribute lookup.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # Keyword arguments: StandardScaler's parameters are keyword-only in
        # current scikit-learn releases.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and record mean/variance.

        Returns ``self`` so calls can be chained.
        """
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Reduce explicitly per column (axis=0) on float data, so the result
        # does not depend on how np.mean/np.var dispatch to pandas or on
        # numeric values that happen to be stored as strings.
        numeric = selected.astype(float)
        self.mean_ = numeric.mean(axis=0)
        self.var_ = numeric.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with ``self.columns`` standardized.

        Column order and row index of ``X`` are preserved. ``y`` and ``copy``
        are accepted for interface compatibility and are not used.
        """
        init_col_order = X.columns
        # Reuse X's index: without it the scaled frame would get a fresh
        # 0..n-1 index and concat(axis=1) would misalign rows (producing NaNs)
        # whenever X has a non-default index, e.g. after a shuffle or filter.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]
In [13]:
# List the column names of the unscaled inputs (used below to decide which to scale).
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Year', 'Month',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [14]:
# Earlier approach, kept for reference: listing the columns to scale by hand.
#columns_to_scale = ['Year', 'Month', 'Day', 'Transportation Expense', 'Distance to Work', 'Age',
 #      'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets']

# Columns excluded from standardization (the Reason_* dummies and 'Education').
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education']

In [15]:
# Every input column that is not explicitly omitted gets standardized
# (original column order is kept).
all_columns = unscaled_inputs.columns
columns_to_scale = all_columns[~all_columns.isin(columns_to_omit)].tolist()

In [16]:
# Build the scaler that standardizes only the columns in columns_to_scale.
absenteeism_scaler = CustomScaler(columns_to_scale)



In [17]:
# Fit the scaler on the inputs (calculates and stores each column's mean and SD).
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split further below, so test-set statistics influence the scaling; consider
# fitting on the training portion only.
absenteeism_scaler.fit(unscaled_inputs)



CustomScaler(columns=['Year', 'Month', 'Transportation Expense', 'Age',
                      'Body Mass Index', 'Children', 'Pets'],
             copy=None, with_mean=None, with_std=None)

In [18]:
# Apply the fitted scaler: for each selected column, subtract the stored mean
# and divide by the stored SD; the omitted columns are passed through unchanged.
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Year,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,-1.556984,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,-1.556984,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,-1.556984,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,-1.556984,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,-1.556984,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,1.530520,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,1.530520,-0.388293,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,1.530520,-0.388293,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,1.530520,-0.388293,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


### Split the data into train & test and shuffle

In [19]:
#import relevant modules
from sklearn.model_selection import train_test_split


In [20]:
# Demonstration only: the result is displayed, not stored. No random_state is
# given, so the shuffle (and therefore this output) differs on every run.
train_test_split(scaled_inputs, targets)

[     Reason_1  Reason_2  Reason_3  Reason_4      Year     Month  \
 253         0         0         1         0 -0.527816  0.468236   
 451         0         0         0         1  0.501352 -0.102784   
 82          1         0         0         0 -1.556984  1.324766   
 161         1         0         0         0 -0.527816 -0.959313   
 381         0         0         0         1  0.501352 -1.244823   
 ..        ...       ...       ...       ...       ...       ...   
 191         1         0         0         0 -0.527816 -0.673803   
 22          1         0         0         0 -1.556984  0.468236   
 43          0         0         1         0 -1.556984  0.753746   
 688         0         0         0         0  1.530520 -0.388293   
 319         1         0         0         0 -0.527816  1.324766   
 
      Transportation Expense       Age  Body Mass Index  Education  Children  \
 253               -0.986140 -1.163560        -1.114186          0  -0.91903   
 451                0.

The output above is a list of four objects, returned in this order:
array 1: a training dataset with inputs
array 2: a test dataset with inputs
array 3: a training dataset with targets
array 4: a test dataset with targets

In [21]:
# Store the four split outputs: 80% of the rows for training, the rest for testing.
# random_state fixes the shuffle so that the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state = 20) 

In [22]:
# View the output shapes: inputs and targets must have matching row counts in each split.
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)


(560, 12) (560,)
(140, 12) (140,)


## Logistic regression with sklearn

In [23]:
from sklearn.linear_model import LogisticRegression
#for model evaluation
from sklearn import metrics

### Training the model

In [24]:
# Declare a logistic regression object with scikit-learn's default settings.
reg = LogisticRegression()

In [25]:
# Fit the regression on the training inputs and training targets.
reg.fit(x_train,y_train)

LogisticRegression()

In [26]:
# Model accuracy on the TRAINING data (an optimistic estimate; the test-set
# accuracy is computed in the "Testing the model" section).
reg.score(x_train,y_train)

0.7732142857142857

### Manually check the accuracy

In [27]:
# Predicted classes (0 or 1) for the training inputs.
model_outputs = reg.predict(x_train)
model_outputs  # display the model's predictions

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0,

In [28]:
# Compare element-wise with the true targets: True where the prediction is correct.
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True, False,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [29]:
# Count the correct predictions (each True counts as 1).
(model_outputs == y_train).sum()

433

In [30]:
# Total number of predictions, i.e. training observations.
len(model_outputs)

560

In [31]:
# Accuracy = correct predictions / number of observations,
# which is the mean of the boolean "prediction is correct" array.
(model_outputs == y_train).mean()

0.7732142857142857

To use this model outside of Python (in Tableau), we need to create a function that makes it easy to use within Tableau.

#### Finding the intercept and coefficients

In [32]:
# The fitted intercept (bias) of the model.
reg.intercept_

array([-1.67771985])

In [33]:
# The fitted coefficients (weights), one per input column.
reg.coef_

array([[ 2.82316666,  1.01016786,  3.05540398,  0.85399389, -0.22904467,
         0.0542186 ,  0.60025743, -0.15947038,  0.29255373, -0.09642773,
         0.33955714, -0.26450946]])

In [34]:
# The input column names, to see which variable each coefficient refers to.
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Year', 'Month',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [35]:
# Keep the column names in a variable so they can be reused below.
feature_name = unscaled_inputs.columns.values

In [36]:
# Pair each feature name with its fitted coefficient in one dataframe.
# reg.coef_ is two-dimensional with a single row; that row holds the coefficients.
summary_table = pd.DataFrame({
    'feature name': feature_name,
    'coefficient': reg.coef_[0],
})
summary_table

Unnamed: 0,feature name,coefficient
0,Reason_1,2.823167
1,Reason_2,1.010168
2,Reason_3,3.055404
3,Reason_4,0.853994
4,Year,-0.229045
5,Month,0.054219
6,Transportation Expense,0.600257
7,Age,-0.15947
8,Body Mass Index,0.292554
9,Education,-0.096428


In [37]:
# Put the intercept in the first row (index 0), followed by the coefficients
# at indices 1..n.
# The guard keeps this cell idempotent: re-running it does not add a second
# intercept row.
if 'intercept' not in summary_table['feature name'].values:
    intercept_row = pd.DataFrame({'feature name': ['intercept'],
                                  'coefficient': [reg.intercept_[0]]})
    summary_table = pd.concat([intercept_row, summary_table], ignore_index=True)
summary_table

Unnamed: 0,feature name,coefficient
0,intercept,-1.67772
1,Reason_1,2.823167
2,Reason_2,1.010168
3,Reason_3,3.055404
4,Reason_4,0.853994
5,Year,-0.229045
6,Month,0.054219
7,Transportation Expense,0.600257
8,Age,-0.15947
9,Body Mass Index,0.292554


## Interpreting the coefficients

"""
Remember: coefficient = weight and intercept = bias. A weight shows how much a given input counts: the closer a weight is to zero, the smaller that input's influence.
This comparison holds only for models where all variables are on the same scale, like this model.
Standardized coefficients are simply the coefficients of a regression in which all variables have been standardized.
It is advisable to standardize the variables, as was done in this model.
In logistic regression, the fitted coefficients are expressed in log(odds): the model is a linear function that predicts the log(odds).
These log(odds) are later transformed into zeros and ones.
"""

In [38]:
# Exponentiate the coefficients (log-odds) to get odds ratios, which are easier to interpret.
# Note: for the intercept row the exponential is the baseline odds rather than an odds ratio.
summary_table['odds_ratio'] = np.exp(summary_table.coefficient)
summary_table

Unnamed: 0,feature name,coefficient,odds_ratio
0,intercept,-1.67772,0.186799
1,Reason_1,2.823167,16.830062
2,Reason_2,1.010168,2.746062
3,Reason_3,3.055404,21.22976
4,Reason_4,0.853994,2.34901
5,Year,-0.229045,0.795293
6,Month,0.054219,1.055715
7,Transportation Expense,0.600257,1.822588
8,Age,-0.15947,0.852595
9,Body Mass Index,0.292554,1.339845


In [39]:
# Show the table sorted by the 'odds_ratio' column, largest first
# (the sorted result is displayed, not stored).
summary_table.sort_values('odds_ratio', ascending = False)


Unnamed: 0,feature name,coefficient,odds_ratio
3,Reason_3,3.055404,21.22976
1,Reason_1,2.823167,16.830062
2,Reason_2,1.010168,2.746062
4,Reason_4,0.853994,2.34901
7,Transportation Expense,0.600257,1.822588
11,Children,0.339557,1.404326
9,Body Mass Index,0.292554,1.339845
6,Month,0.054219,1.055715
10,Education,-0.096428,0.908076
8,Age,-0.15947,0.852595


If a coefficient is close to zero (equivalently, its odds ratio is close to 1), the corresponding feature is not particularly important.

#### Backward Elimination:
This entails simplifying a model by removing all features which have close to no contribution to the model.

#### Testing the model

In [40]:
# Model accuracy on the held-out test data.
reg.score(x_test,y_test)

0.7357142857142858

In [41]:
# Predicted class probabilities for each test observation:
# one row per observation, one column per class (0 first, then 1).
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.76164939, 0.23835061],
       [0.5809831 , 0.4190169 ],
       [0.43946164, 0.56053836],
       [0.8207311 , 0.1792689 ],
       [0.06462704, 0.93537296],
       [0.38986871, 0.61013129],
       [0.23562559, 0.76437441],
       [0.17533149, 0.82466851],
       [0.73964127, 0.26035873],
       [0.67966059, 0.32033941],
       [0.54279938, 0.45720062],
       [0.24419867, 0.75580133],
       [0.06771161, 0.93228839],
       [0.71275943, 0.28724057],
       [0.34184942, 0.65815058],
       [0.54026414, 0.45973586],
       [0.61886717, 0.38113283],
       [0.49945296, 0.50054704],
       [0.46104393, 0.53895607],
       [0.06136406, 0.93863594],
       [0.65781674, 0.34218326],
       [0.8207311 , 0.1792689 ],
       [0.423314  , 0.576686  ],
       [0.423314  , 0.576686  ],
       [0.3246915 , 0.6753085 ],
       [0.72446227, 0.27553773],
       [0.55316734, 0.44683266],
       [0.87154714, 0.12845286],
       [0.26121435, 0.73878565],
       [0.8207311 , 0.1792689 ],
       [0.

The first column shows the probability the model assigned to the observation being 0, and the second column shows the probability the model assigned to the observation being 1. Summing the two numbers in any row gives 1. Since we are interested in the probability of excessive absenteeism, we are concerned with the probability of 1 (the second column).

In [43]:
# Slice out the second column: the predicted probability of excessive absenteeism (class 1).
predicted_proba[:,1]

array([0.23835061, 0.4190169 , 0.56053836, 0.1792689 , 0.93537296,
       0.61013129, 0.76437441, 0.82466851, 0.26035873, 0.32033941,
       0.45720062, 0.75580133, 0.93228839, 0.28724057, 0.65815058,
       0.45973586, 0.38113283, 0.50054704, 0.53895607, 0.93863594,
       0.34218326, 0.1792689 , 0.576686  , 0.576686  , 0.6753085 ,
       0.27553773, 0.44683266, 0.12845286, 0.73878565, 0.1792689 ,
       0.2973015 , 0.7223258 , 0.72117047, 0.46489346, 0.1792689 ,
       0.59179573, 0.18241421, 0.76878019, 0.3971435 , 0.66888352,
       0.17700261, 0.42431116, 0.26940075, 0.40454096, 0.8026973 ,
       0.54533911, 0.62918171, 0.33396797, 0.21345938, 0.17475886,
       0.52520478, 0.33820249, 0.71490252, 0.22946791, 0.86739133,
       0.42657217, 0.84568976, 0.18908835, 0.34488347, 0.34838934,
       0.65178842, 0.60644285, 0.24752435, 0.82007715, 0.24871371,
       0.32711722, 0.08402244, 0.26335078, 0.71037629, 0.40722401,
       0.26335078, 0.31830736, 0.90273589, 0.4158139 , 0.55688

In [44]:
# One probability per test observation.
predicted_proba[:,1].shape


(140,)

#### Save Model

In [46]:
# Use the pickle module to convert a Python object into a byte stream.
# Here the reg variable is saved to a file, which can then be loaded into a new notebook for use.
import pickle

In [47]:
# Serialize the fitted model to a file named 'model' in the working directory.
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [48]:
# Serialize the fitted scaler to a file named 'scaler' in the working directory.
# NOTE(review): unpickling it elsewhere presumably requires the CustomScaler
# class to be defined or importable there; verify in the consuming code.
with open('scaler', 'wb') as file:
    pickle.dump(absenteeism_scaler,file)