# We will use a logistic regression considering all variables as features to predict our target
## Target = absent hours
## The model will also tell us which variables are important for the analysis

In [1]:
# Define moderately absent vs. excessively absent (our target)
# we will use the median value of 'absenteeism in hours' and use it as the cut off line because 
# median is numerically stable and rigid
# below the median = normal ; above the median = excessive 

In [2]:
import pandas as pd
preprocessed_data = pd.read_csv('Absenteeism_preprocessed.csv')
preprocessed_data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,False,False,False,True,7,1,289,36,33,239.554,30,0,2,1,4
1,False,False,False,False,7,1,118,13,50,239.554,31,0,1,0,0
2,False,False,False,True,7,2,179,51,38,239.554,31,0,0,0,2
3,True,False,False,False,7,3,279,5,39,239.554,24,0,2,0,4
4,False,False,False,True,7,3,289,36,33,239.554,30,0,2,1,2


In [3]:
preprocessed_data['Absenteeism Time in Hours'].median()

3.0

In [4]:
# Now we have our cutoff: Absenteeism time over 3 hours will be considered excessive
# We can create a new column that indicates whether excessive absenteeism has been met in that sample

import numpy as np
targets = np.where(preprocessed_data['Absenteeism Time in Hours'] > 3, 1, 0)
# parameters (condition, if true, if false)
preprocessed_data['Excessive Absenteeism'] = targets

# By using the median we implicitly balance the dataset (half targets = 1, half targets = 0)
# This will prevent our modell from learning to output only 0s or only 1s

preprocessed_data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,False,False,False,True,7,1,289,36,33,239.554,30,0,2,1,4,1
1,False,False,False,False,7,1,118,13,50,239.554,31,0,1,0,0,0
2,False,False,False,True,7,2,179,51,38,239.554,31,0,0,0,2,0
3,True,False,False,False,7,3,279,5,39,239.554,24,0,2,0,4,1
4,False,False,False,True,7,3,289,36,33,239.554,30,0,2,1,2,0


In [7]:
# check if dataset is balanced
targets.sum(), targets.shape[0]

(319, 700)

### 46% 1 ; 54% 0

In [9]:
data_with_targets = preprocessed_data.drop(['Absenteeism Time in Hours'], axis=1)
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,False,False,False,True,7,1,289,36,33,239.554,30,0,2,1,1
1,False,False,False,False,7,1,118,13,50,239.554,31,0,1,0,0
2,False,False,False,True,7,2,179,51,38,239.554,31,0,0,0,0
3,True,False,False,False,7,3,279,5,39,239.554,24,0,2,0,1
4,False,False,False,True,7,3,289,36,33,239.554,30,0,2,1,0


# Select inputs for Logistic Regression

In [11]:
unscaled_inputs = data_with_targets.iloc[:, :-1]
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

# Standarize

In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# subtracts men and divides by standard deviation featurewise

scaler.fit(unscaled_inputs)
# calculates and sttores mean and standard deviation stored in scaler object


# Whenever there is new data, you know what standardization info is contained in scaler
# so you can standardize new data in the same way

In [13]:
# apply the scalaing mechanism
scaled_inputs = scaler.transform(unscaled_inputs)

# when you get new data:
# new_data_scaled = scaler.transform(new_data_raw)

In [14]:
scaled_inputs.shape

(700, 14)

# Train Test Split

In [16]:
# Address overfitting by splitting data into train and test
# shuffle data to remove dependencies that come from order of dataset
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test= train_test_split(scaled_inputs, targets, train_size= 0.8, random_state=42)

# train size of 80% of dataset is standard ; random state is 42 so that the observations are shuffled in the same random way for replicability
# look aat shapes to see what split has achieved
x_train.shape, y_train.shape

((560, 14), (560,))

# Fitting the Model and Assessing Accuracy

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

logreg= LogisticRegression()
logreg.fit(x_train, y_train)

In [18]:
# evaluate model accuracy
logreg.score(x_train, y_train)

0.7732142857142857

## Manually check accuracy

In [19]:
# 77.3% of model outputs match the targets
# we now manually find the outputs and compare to targets

model_outputs = logreg.predict(x_train)
# this shows us the predicted class labels of the regression
# we are choosing to predict outputs associated w/ training inputs and contained in x_train
model_outputs

array([1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,

In [20]:
# directly compare using booleans
model_outputs == y_train

array([ True,  True,  True,  True, False, False, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True, False, False,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True, False,  True, False,  True,
        True, False, False,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True, False,
        True,  True, False, False,  True,  True, False, False,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True, False,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [21]:
# T= 1 ; F = 0
# we can use sum to measure correct predictions

np.sum([model_outputs == y_train])

433

### 433 correct predictions

In [22]:
model_outputs.shape[0]

560

In [23]:
433/560

0.7732142857142857

### We get the same accuracy!

### And we have a better idea of what this result means

# Summary Table with coefficients and intercept

In [24]:
# to apply the model outside of python we need the coefficients and intercept
logreg.intercept_, logreg.coef_

(array([-0.1807155]),
 array([[ 2.1388561 ,  0.34702326,  1.53810153,  1.41264252,  0.09468983,
         -0.15167965,  0.79489979, -0.09787614, -0.28710602, -0.01114994,
          0.28400839, -0.14757032,  0.4228082 , -0.36512155]]))

In [27]:
# we want to match up the coefficients with the features 
# whenever we use sklearn, results are usually arrays not dataframes
# we can use unscaled_inputs as the data frame to extract feature names

feature_name = unscaled_inputs.columns.values

summary_table = pd.DataFrame(columns=['Feature Name'], data=feature_name)
summary_table['Coefficients']=np.transpose(logreg.coef_)

# we use transpose because by default np arrays are rows and not columns

# then we can add the intercept in the 0th index of the table
# would first have to move all indices by 1; we use loc[0] to set the position and sort 

summary_table.index = summary_table.index + 1 
summary_table.loc[0]=['Intercept', logreg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature Name,Coefficients
0,Intercept,-0.180715
1,Reason_1,2.138856
2,Reason_2,0.347023
3,Reason_3,1.538102
4,Reason_4,1.412643
5,Month Value,0.09469
6,Day of the Week,-0.15168
7,Transportation Expense,0.7949
8,Distance to Work,-0.097876
9,Age,-0.287106


## The closer the weights are to 0; The less impact (smaller the weight on the model) 

In [30]:
# This notion holds only for models where all the variables are on the same scale
# Standardized coefficients are basically the coefficients of a regression where all the variables have been standardized
# allows for simple comparison between variables

# features standardized: VARIANCE OF 1 (same scale)

# whenever the scale is standard we can simply say:
# whichever weights is bigger, the corresponding feature is more important

# whenever we deal with logistic regression, coefficients we are predicting are called LOG ODDS
# logistic regression by default are nothing but a linear function predicting log odds ->0 then -> 1 

# log(odds) = intercept _ b1x1 + b2x2 ...
# to understand log odds we need to take exponentials of the coefficients

summary_table['odds_ratio'] = np.exp(summary_table['Coefficients'])
# and sort
summary_table.sort_values('odds_ratio', ascending=False)
summary_table

Unnamed: 0,Feature Name,Coefficients,odds_ratio
0,Intercept,-0.180715,0.834673
1,Reason_1,2.138856,8.489721
2,Reason_2,0.347023,1.41485
3,Reason_3,1.538102,4.655743
4,Reason_4,1.412643,4.106793
5,Month Value,0.09469,1.099318
6,Day of the Week,-0.15168,0.859264
7,Transportation Expense,0.7949,2.214219
8,Distance to Work,-0.097876,0.906761
9,Age,-0.287106,0.750432


In [31]:
# A feature is not important if: 
# coef is around 0
# odds ratio around 1

# a weight of 0 implies that no matter the feature value, we multiply by 0 
# for a unit change in the standardized feature, the odds increase by a multiple equal to the odds ratio (1= no change)
# odds x odds ratio = new odds (for a unit change) 

# odds ratio = 1 when weight = 0

### We can identify "Daily Work Load Average" as almost useless for the model
### Along with Day of the Week, Distance to Work

In [1]:
# we drop these columns before standardizing
# Furthermore, we have to exclude dummy variables from standardization to retain interpretability
# This will be done in a new ipynb 