## CREATING A LOGISTIC REGRESSSION TO PREDICT ABSENTEEISM

### IMPORT RELEVANT LIBRARIES

In [1]:
import pandas as pd
import numpy as np

### load the data

In [2]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


## we will use logistic Regression to predict absenteeism

### CREATING THE TARGET FOR THE LOGISTIC REGRESSION 
Logit Reg is a type of classification, so we will be classifying people into classes

THE CLASSES ARE MODERATELY ABSENT AND EXCESSIVELY ABSENT
WE WILL TAKE THE MEDIAN VALUE OF THE ABSENTEEISM TIME IN HOURS AND USE IT AS A CUT OFF LINE 

In [3]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [4]:
# If an observation has been absent for less than 3hours we will asign the value of zero otherwise one
# in supervised learning we can call the 0s and 1s targets
# we will predict if we will get a zero or a one

# WE WILL USE THE NUMPY 'np.where' function np.where(condition,value if True, value if False)- checks if a condition has been
# satisfied and assigns a value accordingly

# target variable will measure if a person has been absent for 3hrs

targets = np.where(data_preprocessed['Absenteeism Time in Hours']>3,1,0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

# parameterize the code above
parameterizing code minimizes chance of making mistakes

In [5]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours']>data_preprocessed['Absenteeism Time in Hours'].median(),1,0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [6]:
data_preprocessed['Excessive Absenteeism'] = targets

In [7]:
data_preprocessed

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,8,1
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,3,0
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,8,1
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0,2,0


In [8]:
# by using the median we have implicitly balanced the dataset. Roughly half of the targets are zeros and the other half 1s. This 
# will prevent our model from learning to output one of the two classes exclussively thinking it did very well

# LETS PROVE THAT

# targets.sum()-number of targets that are 1
# targets.shape[0]- total number of targets

(targets.sum()/targets.shape[0]).round(2)

# around 46% are 1s and 54% are 0s. RECALL that when balancing your dataset, the 2 classes neednt represent 50% of the sample exactly
# 60/40 or 45/55 split works equally well for logistic regression but not true for neural networks


0.46

### DROP "Absenteeism Time in Hours"

In [9]:
data_with_targets = data_preprocessed.drop(["Absenteeism Time in Hours", 'Day of the week', 'Daily Work Load Average', 'Distance to Work'], axis=1)
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


In [10]:
# lets check if the datasets are same

data_with_targets is data_preprocessed

False

### SELECTING THE INPUTS FOR THE LOGISTIC REGRESSION

In [11]:
data_with_targets.shape

(700, 12)

In [12]:
# We will use the pandas method iloc (DataFrame.iloc[row indices, column indices]- slices(selects) data by position when given
# rows and columns wanted)
# it is used in selection by position in the dataframe

data_with_targets.iloc[:,0:14] # this selects all rows and columns 1 to 14.

# or data_with_targets.iloc[:,:14]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0,1
696,1,0,0,0,5,225,28,24,0,1,2,0
697,1,0,0,0,5,330,28,25,1,0,0,1
698,0,0,0,1,5,235,32,25,1,0,0,0


In [13]:
# OR
data_with_targets.iloc[:,:-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0
696,1,0,0,0,5,225,28,24,0,1,2
697,1,0,0,0,5,330,28,25,1,0,0
698,0,0,0,1,5,235,32,25,1,0,0


In [14]:
unscaled_inputs = data_with_targets.iloc[:,:-1]

### STANDARDIZING THE DATA

In [15]:
# from sklearn.preprocessing import StandardScaler

In [16]:
# DECLARE A STANDARD SCALER OBJECT

# absenteeism_scaler = StandardScaler() 
# the object created will be used to scale our data ie subtract the mean and divide by the standard deviation

In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
class CustomScaler:
    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)  # Corrected line
        self.columns = columns
        self.mean_ = None  # Initialize mean_

    def fit(self, X):
        self.scaler.fit(X[self.columns])
        self.mean_ = self.scaler.mean_ # Capture the mean after fit
        return self

    def transform(self, X):
        X[self.columns] = self.scaler.transform(X[self.columns])
        return X

    def inverse_transform(self, X):
         X[self.columns] = self.scaler.inverse_transform(X[self.columns])
         return X

# when we declare the scaler object, there is an extra arguement (columns to scale). so custom scaler will not stdize all inputs but only the one we
# choose, in this way we can preserve the dummies untouched. in practice we would avoid this step by stdizing prior to creating the dummies

In [18]:
# check what are all columns that we've got
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [19]:
#columns_to_scale = ['month_values','Day of the week', 'Transportation Expense', 'Distance to Work',
       #'Age', 'Daily Work Load Average', 'Body Mass Index','Children', 'Pets']
# this are the columns we will like to scale

# Insted of removing the unnecessary columns like that we can Augment the code by declaring a new variable called "columns to omit"

columns_to_omit = ['Reason_1','Reason_2','Reason_3','Reason_4','Education'] 

In [20]:
# To find the column that needs scaling we use list comprehension (IT IS AN AMAZING SYNTACTIC CONSTRUCT WHICH ALLOWS US TO CREATE A LIST FROM EXISTING
# LISTS BASED ON LOOPS, CONDITIONALS etc

columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [21]:
# DECARE THE ABSENTEEISM SCALER
absenteeism_scaler = CustomScaler(columns_to_scale)

In [22]:
# fit our input data
absenteeism_scaler.fit(unscaled_inputs)

# absenteeism_scaler will contain infor about the mean and std deviation. whenever you get new data you will know that the stand
# ardization infor is contained in the absenteeism scaler, thus you will be able to stdize the new data in same way

<__main__.CustomScaler at 0x1f0a6a7fd40>

In [23]:
# lets apply the scaling mechnism using the method "transform"

scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

scaled_inputs

# whenever we get new data we will just apply absenteeism_scaler.transform(new data) to reach same transformation

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


In [24]:
scaled_inputs.shape

(700, 11)

### SPLITTING THE DATASET INTO TRAIN AND TEST and SHUFFLE

In [25]:
# import relevant module

from sklearn.model_selection import train_test_split

# the train_test_split has many arguements, the 2 most important are "inputs" and "targets"

# sklearn.mode_selection.train_test_split(inputs,targets)-splits arrays and matrices into random train and test subsets

### SPLIT

In [26]:
train_test_split(scaled_inputs, targets)

# i will use train_size= 0.8

# in addition the train_test_split method has a shuffle parameter(by default it is set to True)
# sklearn.mode_selection.train_test_split(inputs,targets, train_size,shuffle=True, random_state= an int)-splits arrays and
# matrices into random train and test subsets

[     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  \
 278         1         0         0         0     0.753746   
 600         1         0         0         0    -1.244823   
 308         0         0         0         1     1.039256   
 382         0         0         0         1    -1.244823   
 300         0         0         0         0     1.039256   
 ..        ...       ...       ...       ...          ...   
 466         0         0         0         1     0.182726   
 456         0         0         1         0    -0.102784   
 168         1         0         0         0    -0.959313   
 345         1         0         0         0     1.610276   
 269         1         0         0         0     0.468236   
 
      Transportation Expense       Age  Body Mass Index  Education  Children  \
 278               -0.654143  0.248310         1.002633          0 -0.919030   
 600               -0.654143 -1.006686        -1.819793          1 -0.919030   
 308               -0.6541

In [27]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets)

In [28]:
print(x_train.shape, y_train.shape)

(525, 11) (525,)


In [29]:
print(x_test.shape, y_test.shape)

(175, 11) (175,)


### the default split above is 75:25 (ie 3 ratio 1) but we can still use 90:10 or 80:20. we ensure not to assign too much data for testing. we can specify the amount of data we want to use for our training in the function
we use (train_size = a certain value). the value range from 0 to 1. so value of 0.9 means the train data size is 90 and test data is 10


In [30]:
# i will use train_size= 0.8

# in addition the train_test_split method has a shuffle parameter(by default it is set to True)
# sklearn.mode_selection.train_test_split(inputs,targets, train_size,shuffle=True, random_state= an int)-splits arrays and
# matrices into random train and test subsets

# there may be a small issue though. when we rerun our code, we get a different shuffle and this means a diff split. 
# THIS CAUSES THE FINAL MODEL TO DIFFER EVERYTIME DUE TO THE SHUFFLING. WE MAY GET A HIGHER ACCURACY OR A LESS ACCURACY THAN SHOULD BE

# THERE IS A SOLUTION IN THAT ALL SKLEARN FUNCTIONS THAT involve SOME RANDOMNESS CONTAIN A RANDOM STATE PARAMETER(it takes int. values)
# if we set it to a number it will make the shuffle pseudorandom(it will always shuffle observation in same random way)

x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size= 0.8, random_state = 20)

In [31]:
print(x_train.shape, y_train.shape)

(560, 11) (560,)


In [32]:
print(x_test.shape, y_test.shape)

(140, 11) (140,)


## FITTING THE MODEL AND ASSESSING ITS ACCURACY
WHEN EVER WE ARE TRAINING ML MODEL THERE ARE MANY MATHEMATICAL ISSUES ARISING IN THE BACKGROUND. IMPERFECT LIB SUCH AS STATSMODELS ARE NOT ALWAYS NUMERICALLY STABLE FOR MORE COMPLICATED MODELS THATS WHY WE CHOSE SKLEARN

### Logistic Regression with sklearn

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics # this will help while evaluating the model

### Training the model

In [34]:
# lets declare a log Reg object
reg = LogisticRegression()

# fit the regression

# sklearn.linear_model.LogisticRegression.fit(x,y) fits the model according to the given training data

reg.fit(x_train,y_train)

In [35]:
# lets evaluate the models accuracy. there are two required arguements inputs and targets. 
reg.score(x_train,y_train)
print("our model has an accuracy of "  +  str(reg.score(x_train,y_train)))

# Based on the data we used our model learned to classify 80% of the observation correctly

our model has an accuracy of 0.7928571428571428


### lets find the accuracy manually

In [36]:
# find the models output
model_output = reg.predict(x_train)
model_output

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [37]:
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [38]:
# there are few diff. lets see them
model_output == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True, False,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [39]:
# since True =1 and false=0 lets sum this array using np.sum
np.sum(model_output == y_train)

# the result will be the total number of true entries

444

In [40]:
# Accuracy = correct prediction/ no of observations
model_output.shape[0]

560

In [41]:
Accuracy = np.sum(model_output == y_train)/model_output.shape[0]
Accuracy

0.7928571428571428

## Creating a summary table with the coefficients and intercepts

In [42]:
# intercept
reg.intercept_

array([-1.69725618])

In [43]:
# coefficient
reg.coef_

array([[ 2.84464182,  1.00484973,  3.35999407,  0.70515655,  0.21912133,
         0.54156568, -0.15345709,  0.24861004, -0.36689013,  0.30734006,
        -0.4295252 ]])

In [44]:
# what variable does the coefficients refer to

# scaled_inputs.columns.values 
# will give an error AttributeError: 'numpy.ndarray' object has no attribute 'columns'.
# EXPLANATION: We standardised the pandas data frame unscaled_inputs using the std scaler and the result was stored in scaled inputs. THIS SHOWS THAT 
# SKLEARN METHODS ARE COMPATIBLE WITH WITH PANDAS DATAFRAMES BUT WHENEVER WE EMPLOY SOME SKLEARN FN EVERYTHING IS TRANSFORMED TO ND ARRAYS

# THIS IS THE DIFF BW SKLEARN AND STATSMODELS. WITH STATSMODELS ALMOST ALL OF THE MANIPULATIONS WERE GOING THROUGH PANDAS DATAFRAMES

unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [45]:
feature_name = unscaled_inputs.columns.values

In [46]:
summary_table = pd.DataFrame (columns=["feature_name"], data = feature_name)
summary_table["coefficient"] = np.transpose(reg.coef_)
summary_table

# np.transpose(reg.coef_)-NOTE THAT WE MUST TRANSPOSE THIS ARRAY (reg.coef_) BC BY DEFAULT ND ARRAYS ARE ROWS and NOT COLUMNS 

Unnamed: 0,feature_name,coefficient
0,Reason_1,2.844642
1,Reason_2,1.00485
2,Reason_3,3.359994
3,Reason_4,0.705157
4,Month Value,0.219121
5,Transportation Expense,0.541566
6,Age,-0.153457
7,Body Mass Index,0.24861
8,Education,-0.36689
9,Children,0.30734


In [47]:
# lets add the intercept using the "append" or "concatenate" method. however most method will put the newly appended data at the end of the datafram
# there is no prepend method
# ONE SOLUTION IS: add 1 to all indices of the dataframe summary table

summary_table.index = summary_table.index + 1
# in this way we will shift up all indices by one. 
# now the zero index is empty, lets fill it with intercept value

summary_table.loc[0] = ['Intercept', reg.intercept_[0]] # we will specify the zeroeth element so we can extract the float rather than the whole array

# we sort the summary table by index

summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,feature_name,coefficient
0,Intercept,-1.697256
1,Reason_1,2.844642
2,Reason_2,1.00485
3,Reason_3,3.359994
4,Reason_4,0.705157
5,Month Value,0.219121
6,Transportation Expense,0.541566
7,Age,-0.153457
8,Body Mass Index,0.24861
9,Education,-0.36689


### interpreting the coefficients

### the coef are also called the wts(shows how we weigh a certain input: the closer they are to zero the smaller the wts). alternatively the 
further away from zero (no matter if +ve or -ve) the bigger the wts of this feature. this is true for our model but not universally true
it holds true for models where all variables are of same scale such as this one no matter
intercept are the biases

stdized coef are basically the coef of reg where all variables have been standardized. other packages and software include the stdized coef bc they 
allow for a simple and easy to understand comparison bw the variables. since they are stdized thwy all have a variance of one or the same scale

Also whenever we are dealing with logistic reg the coef we are predicting are the so called "log odds".this is a consequence of the choice of model
log reg are linear functions predicting log odds which are later transformed into 0s and 1s

In [48]:
# LETS FIND THE EXPONENTIALS OF THIS COEFS TO MAKE THEM MORE INTERPRETABLE
# odds ratio is what we will get after we finf the exponentials of the coef

summary_table['odds_ratio'] = np.exp(summary_table.coefficient)
summary_table

Unnamed: 0,feature_name,coefficient,odds_ratio
0,Intercept,-1.697256,0.183185
1,Reason_1,2.844642,17.195398
2,Reason_2,1.00485,2.731497
3,Reason_3,3.359994,28.78902
4,Reason_4,0.705157,2.024164
5,Month Value,0.219121,1.244982
6,Transportation Expense,0.541566,1.718696
7,Age,-0.153457,0.857738
8,Body Mass Index,0.24861,1.282242
9,Education,-0.36689,0.692886


In [49]:
# DataFrame.sort_values(Series)- sort the values in a data frame with respect to a given column(series)

summary_table.sort_values('odds_ratio') # by default DataFrame.sort_values is done in ascending order

# lets use this
summary_table.sort_values('odds_ratio', ascending= False)

Unnamed: 0,feature_name,coefficient,odds_ratio
3,Reason_3,3.359994,28.78902
1,Reason_1,2.844642,17.195398
2,Reason_2,1.00485,2.731497
4,Reason_4,0.705157,2.024164
6,Transportation Expense,0.541566,1.718696
10,Children,0.30734,1.359803
8,Body Mass Index,0.24861,1.282242
5,Month Value,0.219121,1.244982
7,Age,-0.153457,0.857738
9,Education,-0.36689,0.692886


### interpretation
A feature is not particularly important if: its coef is around zero and if its odds ratio is around 1
* a coef(wt) of 0 implies that no matter the feature value, we will multiply it by 0 (in the model)
* For the odds ratio - it implies that for a unit change in stdized feature, the odds increase by a multiple equal to the odds ratio (1= no change)

BASED ON THE ABOVE EXPLANATION 'Daily Work Load', 'Average Distance to Work', AND 'Day of the week'	MIGHT NOT HAVE IMPACT ON THE MODEL. LETS keep them for now

* when we where creating the dummies we we dropped the reason zero( represented a situation where a person was was absent but no reason was given
* therfore the base model is when there is no reason given

### stdizing only the Numerical variables (creating a custom scaler)
when we stdized the inputs we also stdized the dummies. this is bad practice bc when we stdize we lose the whole interpretability of a dummy
lets correct the code (using a custom scaler) from the cell we stdized the inputs

## INTERPRETING THE COEF OF LOGISTIC REGRESSION
Reason 0 = No reason = baseline model (when no reason is given)
Reason 3(poisoning)- odds of someone being absent after being poisoned are 29 times higher than when no reason was reported
Reason 1(various diseases)- odds of someone being absent after being various diseases are 18 times higher than when no reason was reported
Reason 2 (pregnancy and giving birth)- odds of someone being absent due to pregnancy are 3 times higher than when no reason was reported etc
Trasportation expense- this is the most important non dummy variable in the model but THE PROBLEM IS THAT ITS ONE OF OUR STDIZED VARIABLES (WE DONT HAVE DIRECT INTERPRETABILITY OF IT) its odd ratio implies that for 1 std unit (or 1 std dev increase in transportation expense it is close to twice as likely to be excessively absent( this is the main drawback of stdization. stdized models almost always yield higher accuracy bc the optimization algo work better in this way

* ML Engr. prefer models with higher accuracy, so they normally go for stdization
* ECONOMETRICIANS and STATISTICIANS prefer less accurate but more interpretable models, bc they care abt the underlying reason behind diff phenomena
* DATA SCIENTISTS may be in either position. sometimes they need higher accuracy, other times. they must find the main drivers of the problem
SO IT MAKES SENSE TO CREATE MODELS WITH AND WITHOUT STDIZATION AND THEM DRAW INSIGHT FROM BOTH HOWEVER IF WE ARE OPTING FOR PREDICTING VALUES WE GO FOR HIGHER ACCURACY

PET is a continous variable ( and a -ve coef) its odd ratio is 0.64 so for each additional stdized unit of pet the odds are (1 minus its odd ratio)= 36% lower than the base model (no pet) 



### BACKWARD ELIMINATION OR SIMPLIFYING OUR MODEL

In [50]:
# The idea is that we can simplify our model by removing all features which have close to no contribution to the model (Recall that when we have 
# the p-value, we get rid of all coefficients with p-value > 0.05
# if we remove these variables, the rest of our model should not really change in terms of coef values

#### TESTING THE MODEL WE CREATED

In [51]:
# MODEL ACCURACY IS THE TRAIN ACCURACY
# Recall that testing is done once and at the end of ml pprocess on a dataset that has not been see before

reg.score(x_test,y_test)

0.7642857142857142

### based on the data that the model has not seen before in 76% of the cases, the model will predict correctly if a person is going to be excessively absent. the test accuracy is always less(often 10% or 20%) than the train accuracy. this is bc our model overfitted( if we get a higher number: we either got lucky or made a mistake

In [52]:
# lets get the probability of an output being 0 or 1 using sklearn mthod (.predict_proba(x))
# SYNTAX - sklearn.linear_model.LogisticRegression.predict_proba(x)- returns the probability estimates for all possible outputs(classes)

predicted_proba = reg.predict_proba(x_test)
predicted_proba

# on the left is the probability of being 0 and on the right is probability of being 1 

# thats why summing both numbers horizontally gives a probability = 1

array([[0.70559492, 0.29440508],
       [0.63990181, 0.36009819],
       [0.46280359, 0.53719641],
       [0.79811833, 0.20188167],
       [0.08598967, 0.91401033],
       [0.31758118, 0.68241882],
       [0.33575952, 0.66424048],
       [0.12641903, 0.87358097],
       [0.85233378, 0.14766622],
       [0.75479099, 0.24520901],
       [0.06554993, 0.93445007],
       [0.01409077, 0.98590923],
       [0.06348969, 0.93651031],
       [0.13110265, 0.86889735],
       [0.27238366, 0.72761634],
       [0.63528094, 0.36471906],
       [0.73412531, 0.26587469],
       [0.08264799, 0.91735201],
       [0.4153025 , 0.5846975 ],
       [0.04794128, 0.95205872],
       [0.75536072, 0.24463928],
       [0.79811833, 0.20188167],
       [0.35080075, 0.64919925],
       [0.35080075, 0.64919925],
       [0.25171288, 0.74828712],
       [0.80848828, 0.19151172],
       [0.54671217, 0.45328783],
       [0.86705801, 0.13294199],
       [0.1442102 , 0.8557898 ],
       [0.79811833, 0.20188167],
       [0.

In [53]:
predicted_proba.shape

(140, 2)

In [54]:
# we are interested in probability of excess absenteeism(prob of getting one)
predicted_proba[:,1]

# IN REALITY LOG REG. MODELS CALCULATE THESE PROBABILITY IN THE BACKGROUND
# IF PROBABILITY IS:
# below 0.5, it places a 0
# above 0.5, it places a 1

array([0.29440508, 0.36009819, 0.53719641, 0.20188167, 0.91401033,
       0.68241882, 0.66424048, 0.87358097, 0.14766622, 0.24520901,
       0.93445007, 0.98590923, 0.93651031, 0.86889735, 0.72761634,
       0.36471906, 0.26587469, 0.91735201, 0.5846975 , 0.95205872,
       0.24463928, 0.20188167, 0.64919925, 0.64919925, 0.74828712,
       0.19151172, 0.45328783, 0.13294199, 0.8557898 , 0.20188167,
       0.92654644, 0.62517776, 0.70889693, 0.9417101 , 0.20188167,
       0.95556152, 0.15571503, 0.77859655, 0.32698822, 0.55579898,
       0.19198917, 0.44122344, 0.17288175, 0.38200963, 0.80093913,
       0.65257631, 0.70417874, 0.29440508, 0.20310225, 0.1824706 ,
       0.61110899, 0.3053038 , 0.68241882, 0.27058651, 0.82700489,
       0.38940467, 0.82872552, 0.20837148, 0.33598948, 0.350087  ,
       0.68447419, 0.66870832, 0.28970019, 0.80254929, 0.14553701,
       0.26909731, 0.10610092, 0.15571503, 0.78377809, 0.89487084,
       0.15571503, 0.27994759, 0.91302223, 0.39914423, 0.52087

### Saving the model and preparing it for Deployment

In [55]:
# saving the model is the process of creating a file that will contain all the infor regarding the ML

# THE reg. WHICH IS AN INSTANCE OF THE CLASS LogisticRegression contain all the infor ( ie intercept, coef and accuracy) so we will use the "reg"
# to save the model

# we will use the python module pickle((pickle[module])it is the process of converting a python object into a character stream).
# the main idea is that this character stream will contain sufficient infor. then later when we would like to convert the character stream into a python
# object in another notebook we will unpickle it( we will save the reg variable into a file. this file will then be loaded in a new notebook and thus 
# we will be able to use the machine learning algo

# the file size will be less than 1kb so it can be sent easily by email

# WHEN WE PICKLE WE ARE BASICALLY SEPERATING THE MODEL FROM THE TRAINING DATA


In [56]:
import pickle

In [57]:
with open('model','wb')as file:
    pickle.dump(reg, file)

# the file name is set to "model" as it basically contains the model
# wb( write bite)- when we unpickle, we will use rb or read bites
# the dump method- when we pickle we dump the infor in a file, when we unpickle we load it. in the dump method we specify the object to be dumped
# and the rest is just python syntax

# WE MUST SAVE THE ABSENTEEISM SCALER TOO. IT WAS USED TO STDIZE ALL NUMERICAL VARIABLES (IT STORED THE COLUMN TO SCALE AS WELL AS THE MEAN AND STD DEV
# OF EACH FEATURE)
# THE INFOR IN THE ABSENTEEISM SCALER IS NEEDED TO PROCESS ANY NEW DATA USING THE SAME RULES AS THE ONES APPLIED TO TRAINING DATA. THUS WE MUST PICKLE 
# THE TRAINING SCALER TOO

In [58]:
with open('scaler','wb')as file:
    pickle.dump(absenteeism_scaler, file)

## preparing the deployment of the model through a module

In [59]:
# Deploying a model consist in making it available and ready to use. it consist of two steps; saving the model and then applying it to new data
# second step involves creating a mechanism to load the saved model and make predictions
# THERE ARE TWO APPROACHES TO THIS; CLUMSY AND CLEVER.

# CLEVER APPROACH: In practice we prefer creating a module bc storing code in the module will allow us to reuse it without trouble. in essence we will 
# treat the methods in this module in same way we treat the numpy, sklearn and pandas methods