# Creating a logistic regression to predict absenteeism

In [1]:
import pandas as pd
import numpy as np

In [3]:
data_preprocessed = pd.read_csv('Absentees_preprocessed.csv')

In [4]:
data_preprocessed.head()

Unnamed: 0.1,Unnamed: 0,Reason_1,Reason_2,Reason_,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of week
0,0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,4,7,1
1,1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,0,7,1
2,2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,2,7,2
3,3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,4,7,3
4,4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,2,7,3


## Create the targets

In [5]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [6]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 
                   data_preprocessed['Absenteeism Time in Hours'].median(), 1, 0)

In [7]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [8]:
data_preprocessed['Excessive Absenteeism'] = targets

In [9]:
data_preprocessed.head()

Unnamed: 0.1,Unnamed: 0,Reason_1,Reason_2,Reason_,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of week,Excessive Absenteeism
0,0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,4,7,1,1
1,1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,0,7,1,0
2,2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,2,7,2,0
3,3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,4,7,3,1
4,4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,2,7,3,0


## A comment on the targets

In [10]:
targets.sum() / targets.shape[0]

0.45571428571428574

In [11]:
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours'],axis=1)

In [12]:
data_with_targets is data_preprocessed

False

In [13]:
data_with_targets.head()

Unnamed: 0.1,Unnamed: 0,Reason_1,Reason_2,Reason_,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month Value,Day of week,Excessive Absenteeism
0,0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,7,1,1
1,1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,7,1,0
2,2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,7,2,0
3,3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,7,3,1
4,4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,7,3,0


## Select the inputs for the regression

In [14]:
data_with_targets.shape

(700, 17)

In [15]:
data_with_targets.iloc[:,:14]

Unnamed: 0.1,Unnamed: 0,Reason_1,Reason_2,Reason_,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1
1,1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0
2,2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0
3,3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0
4,4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,695,1,0,0,0,2018-05-23,179,22,40,237.656,22,1,2,0
696,696,1,0,0,0,2018-05-23,225,26,28,237.656,24,0,1,2
697,697,1,0,0,0,2018-05-24,330,16,28,237.656,25,1,0,0
698,698,0,0,0,1,2018-05-24,235,16,32,237.656,25,1,0,0


In [16]:
data_with_targets.iloc[:,:-1]

Unnamed: 0.1,Unnamed: 0,Reason_1,Reason_2,Reason_,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month Value,Day of week
0,0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,7,1
1,1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,7,1
2,2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,7,2
3,3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,7,3
4,4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,7,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,695,1,0,0,0,2018-05-23,179,22,40,237.656,22,1,2,0,5,2
696,696,1,0,0,0,2018-05-23,225,26,28,237.656,24,0,1,2,5,2
697,697,1,0,0,0,2018-05-24,330,16,28,237.656,25,1,0,0,5,3
698,698,0,0,0,1,2018-05-24,235,16,32,237.656,25,1,0,0,5,3


In [17]:
unscaled_inputs = data_with_targets.iloc[:,:-1]

## Standardize the data

In [18]:
from sklearn.preprocessing import StandardScaler

absenteeism_scaler = StandardScaler()

In [19]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator,TransformerMixin): 
    
 
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        self.scaler = StandardScaler(copy,with_mean,with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
        
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    

    def transform(self, X, y=None, copy=None):
        
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [20]:
unscaled_inputs.columns.values

array(['Unnamed: 0', 'Reason_1', 'Reason_2', 'Reason_', 'Reason_4',
       'Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month Value', 'Day of week'], dtype=object)

In [26]:
unscaled_inputs.drop('Unnamed: 0', axis=1, inplace=True)

In [27]:
unscaled_inputs.drop('Date', axis=1, inplace=True)

In [28]:
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education']

In [29]:
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [30]:
absenteeism_scaler = CustomScaler(columns_to_scale)



In [31]:
absenteeism_scaler.fit(unscaled_inputs)



CustomScaler(columns=['Reason_', 'Transportation Expense', 'Distance to Work',
                      'Age', 'Daily Work Load Average', 'Body Mass Index',
                      'Children', 'Pets', 'Month Value', 'Day of week'],
             copy=None, with_mean=None, with_std=None)

In [32]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [33]:
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month Value,Day of week
0,0,0,-0.314485,1,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487,0.182726,-0.683704
1,0,0,-0.314485,0,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,-0.019280,-0.589690,0.182726,-0.683704
2,0,0,-0.314485,1,-0.654143,1.426749,0.248310,-0.806331,1.002633,0,-0.919030,-0.589690,0.182726,-0.007725
3,1,0,-0.314485,0,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,0.880469,-0.589690,0.182726,0.668253
4,0,0,-0.314485,1,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487,0.182726,0.668253
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,-0.314485,0,-0.654143,-0.533522,0.562059,-0.853789,-1.114186,1,0.880469,-0.589690,-0.388293,-0.007725
696,1,0,-0.314485,0,0.040034,-0.263140,-1.320435,-0.853789,-0.643782,0,-0.019280,1.126663,-0.388293,-0.007725
697,1,0,-0.314485,0,1.624567,-0.939096,-1.320435,-0.853789,-0.408580,1,-0.919030,-0.589690,-0.388293,0.668253
698,0,0,-0.314485,1,0.190942,-0.939096,-0.692937,-0.853789,-0.408580,1,-0.919030,-0.589690,-0.388293,0.668253


In [34]:
scaled_inputs.shape

(700, 14)

## Split the data into train & test and shuffle

### Import the relevant module

In [35]:
from sklearn.model_selection import train_test_split

### Split

In [36]:
train_test_split(scaled_inputs, targets)

[     Reason_1  Reason_2   Reason_  Reason_4  Transportation Expense  \
 545         0         0 -0.314485         1               -1.574681   
 301         0         0  3.179797         0                1.005844   
 448         0         0 -0.314485         1               -0.654143   
 147         0         0 -0.314485         1                0.040034   
 242         0         0 -0.314485         1               -1.016322   
 ..        ...       ...       ...       ...                     ...   
 436         1         0 -0.314485         0                1.624567   
 336         0         0 -0.314485         0                2.348925   
 371         1         0 -0.314485         0                0.040034   
 11          1         0 -0.314485         0                0.568211   
 3           1         0 -0.314485         0                0.854936   
 
      Distance to Work       Age  Daily Work Load Average  Body Mass Index  \
 545         -1.344669  0.091435                -0.08208

In [37]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, #train_size = 0.8, 
                                                                            test_size = 0.2, random_state = 20)

In [38]:
print (x_train.shape, y_train.shape)

(560, 14) (560,)


In [39]:
print (x_test.shape, y_test.shape)

(140, 14) (140,)


## Logistic regression with sklearn

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Training the model

In [43]:
reg = LogisticRegression()

In [44]:
reg.fit(x_train,y_train)

LogisticRegression()

In [45]:
reg.score(x_train,y_train)

0.7785714285714286

### Manually check the accuracy

In [46]:
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [47]:
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [48]:
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [49]:
np.sum((model_outputs==y_train))

436

In [50]:
model_outputs.shape[0]

560

In [51]:
np.sum((model_outputs==y_train)) / model_outputs.shape[0]

0.7785714285714286

### Finding the intercept and coefficients

In [52]:
reg.intercept_

array([-1.70674422])

In [53]:
reg.coef_

array([[ 3.21419214,  1.16152841,  1.13422126,  1.25849406,  0.65028641,
        -0.01397274, -0.17397136, -0.00620182,  0.28498526, -0.25559485,
         0.36412974, -0.31538171,  0.17181177, -0.07527038]])

In [54]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_', 'Reason_4',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month Value', 'Day of week'], dtype=object)

In [55]:
feature_name = unscaled_inputs.columns.values

In [56]:
summary_table = pd.DataFrame (columns=['Feature name'], data = feature_name)

summary_table['Coefficient'] = np.transpose(reg.coef_)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,3.214192
1,Reason_2,1.161528
2,Reason_,1.134221
3,Reason_4,1.258494
4,Transportation Expense,0.650286
5,Distance to Work,-0.013973
6,Age,-0.173971
7,Daily Work Load Average,-0.006202
8,Body Mass Index,0.284985
9,Education,-0.255595


In [57]:
summary_table.index = summary_table.index + 1

summary_table.loc[0] = ['Intercept', reg.intercept_[0]]

summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.706744
1,Reason_1,3.214192
2,Reason_2,1.161528
3,Reason_,1.134221
4,Reason_4,1.258494
5,Transportation Expense,0.650286
6,Distance to Work,-0.013973
7,Age,-0.173971
8,Daily Work Load Average,-0.006202
9,Body Mass Index,0.284985


## Interpreting the coefficients

In [58]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [59]:
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-1.706744,0.181456
1,Reason_1,3.214192,24.883182
2,Reason_2,1.161528,3.194813
3,Reason_,1.134221,3.108752
4,Reason_4,1.258494,3.520116
5,Transportation Expense,0.650286,1.91609
6,Distance to Work,-0.013973,0.986124
7,Age,-0.173971,0.840321
8,Daily Work Load Average,-0.006202,0.993817
9,Body Mass Index,0.284985,1.329742


In [60]:
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
1,Reason_1,3.214192,24.883182
4,Reason_4,1.258494,3.520116
2,Reason_2,1.161528,3.194813
3,Reason_,1.134221,3.108752
5,Transportation Expense,0.650286,1.91609
11,Children,0.36413,1.439261
9,Body Mass Index,0.284985,1.329742
13,Month Value,0.171812,1.187454
8,Daily Work Load Average,-0.006202,0.993817
6,Distance to Work,-0.013973,0.986124


## Test Data

In [64]:
reg.score(x_test, y_test)

0.7357142857142858

In [65]:
predected_proba = reg.predict_proba(x_test)
predected_proba

array([[0.73681024, 0.26318976],
       [0.60189093, 0.39810907],
       [0.39870555, 0.60129445],
       [0.80450673, 0.19549327],
       [0.07157655, 0.92842345],
       [0.32201671, 0.67798329],
       [0.320268  , 0.679732  ],
       [0.08747583, 0.91252417],
       [0.80380857, 0.19619143],
       [0.75388056, 0.24611944],
       [0.4847485 , 0.5152515 ],
       [0.18188979, 0.81811021],
       [0.0498474 , 0.9501526 ],
       [0.71761918, 0.28238082],
       [0.22486064, 0.77513936],
       [0.55929485, 0.44070515],
       [0.5453367 , 0.4546633 ],
       [0.57590802, 0.42409198],
       [0.39056442, 0.60943558],
       [0.02987143, 0.97012857],
       [0.70443367, 0.29556633],
       [0.79638021, 0.20361979],
       [0.40283245, 0.59716755],
       [0.42753517, 0.57246483],
       [0.18979402, 0.81020598],
       [0.76417264, 0.23582736],
       [0.49633149, 0.50366851],
       [0.87484715, 0.12515285],
       [0.1384786 , 0.8615214 ],
       [0.7880049 , 0.2119951 ],
       [0.

In [66]:
predected_proba.shape

(140, 2)

In [69]:
predected_proba[:,1]

array([0.26318976, 0.39810907, 0.60129445, 0.19549327, 0.92842345,
       0.67798329, 0.679732  , 0.91252417, 0.19619143, 0.24611944,
       0.5152515 , 0.81811021, 0.9501526 , 0.28238082, 0.77513936,
       0.44070515, 0.4546633 , 0.42409198, 0.60943558, 0.97012857,
       0.29556633, 0.20361979, 0.59716755, 0.57246483, 0.81020598,
       0.23582736, 0.50366851, 0.12515285, 0.8615214 , 0.2119951 ,
       0.37776798, 0.67697906, 0.6829686 , 0.53943893, 0.20361979,
       0.51785196, 0.20516177, 0.81679586, 0.43607309, 0.59952777,
       0.2198134 , 0.41619943, 0.21135803, 0.30796293, 0.81779347,
       0.6694303 , 0.69132371, 0.27201174, 0.20518418, 0.17843671,
       0.5764155 , 0.26743152, 0.66259177, 0.28701443, 0.85246302,
       0.47249324, 0.92611665, 0.24914154, 0.24974676, 0.25025319,
       0.71683462, 0.65431544, 0.30782385, 0.85173147, 0.18874251,
       0.26471447, 0.0553864 , 0.22122836, 0.8103607 , 0.3239414 ,
       0.20419405, 0.22981785, 0.91467359, 0.44550857, 0.62868

## Save the model

In [70]:
import pickle

In [71]:
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [72]:
with open('scalar', 'wb') as file:
    pickle.dump(absenteeism_scaler, file)