# Creating a logistic regression to predict absenteeism

## Import the relevant libraries

In [2]:
import pandas as pd
import numpy as np

## Load the data

In [3]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')
data_preprocessed.head(20)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,1,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,1,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2
5,0,0,0,1,7,4,179,51,38,239.554,31,0,0,0,2
6,0,0,0,1,7,4,361,52,28,239.554,27,0,1,4,8
7,0,0,0,1,7,4,260,50,36,239.554,23,0,4,0,4
8,0,0,1,1,7,0,155,12,34,239.554,25,0,2,0,40
9,0,0,0,1,7,0,235,11,37,239.554,29,1,1,1,8


## Create the targets (1s and 0s instead of number of hours)

In [4]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

We will create the targets as follows:

0 stands for moderately absent, where the absenteeism time is less than or equal to 3 hours (the median)

1 stands for excessively absent, where the absenteeism time is greater than 3 hours (i.e. 4 hours or more)


In [5]:
# Label a row 1 when its absenteeism time is strictly above the column median, otherwise 0.
targets = np.where(
    data_preprocessed['Absenteeism Time in Hours'].gt(
        data_preprocessed['Absenteeism Time in Hours'].median()
    ),
    1,
    0,
)

# Sanity check: only the two class labels should be present.
np.unique(targets)

array([0, 1])

In [6]:
targets.shape

(700,)

In [7]:
data_preprocessed['Excessive Absenteeism'] = targets
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,1,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,1,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


### Checking for balance

In [8]:
# Fraction of targets equal to 1; a value near 0.5 means the two classes are roughly balanced.
total_number_of_elements = len(targets)
number_of_ones = np.sum(targets)

Balance_check = number_of_ones / total_number_of_elements
Balance_check

0.45571428571428574

About 46% of the targets are 1s, which is close enough to an even split to treat the data as balanced.

### Drop 'Absenteeism Time in Hours' (and the features found to be unimportant) from the dataset

In [9]:
# 'Absenteeism Time in Hours' is replaced by the binary target, so it is dropped.
# The other columns were added to this list after inspecting the coefficient
# summary table further down: they were judged non-important features.
data_with_targets = data_preprocessed.drop(
    columns=[
        'Absenteeism Time in Hours',
        'Day of the Week',
        'Daily Work Load Average',
        'Distance to Work',
        'Reason_4',
    ]
)

data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,7,289,33,30,0,2,1,1
1,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,7,179,38,31,0,0,0,0
3,1,0,0,7,279,39,24,0,2,0,1
4,0,0,0,7,289,33,30,0,2,1,0


# Select the inputs for the regression

In [10]:
data_with_targets.shape

(700, 11)

In [11]:
unscaled_inputs = data_with_targets.iloc[:,:-1]
unscaled_inputs.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,7,289,33,30,0,2,1
1,0,0,0,7,118,50,31,0,1,0
2,0,0,0,7,179,38,31,0,0,0
3,1,0,0,7,279,39,24,0,2,0
4,0,0,0,7,289,33,30,0,2,1


# Standardize the data

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler


class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the rest through.

    The columns listed in ``columns`` are scaled with a ``StandardScaler``;
    all other columns are returned unchanged. The output keeps the column
    order and the row index of the input DataFrame.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    copy : bool, default True
        Forwarded to ``StandardScaler``.
    with_mean : bool, default True
        Forwarded to ``StandardScaler``.
    with_std : bool, default True
        Forwarded to ``StandardScaler``.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        # Populated by fit(); None until then.
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and record their mean and variance.

        Returns ``self`` so calls can be chained.
        """
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns], axis=0)
        self.var_ = np.var(X[self.columns], axis=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a DataFrame with ``self.columns`` standardized and all other columns untouched.

        ``y`` and ``copy`` are accepted for signature compatibility and are not used.
        """
        init_col_order = X.columns
        # Reuse X's index for the scaled frame. Without it the scaled frame
        # gets a fresh 0..n-1 index and the column-wise concat below aligns on
        # index labels, producing NaNs and extra rows whenever X is not
        # indexed 0..n-1 (for example a shuffled train/test subset).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [13]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [14]:
# Columns that must NOT be standardized. In the data preview these appear to hold
# only 0/1 indicator values, so scaling them would make their coefficients harder
# to interpret. Every other input column is scaled.

columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Education']

In [15]:
# Scale every input column that is not explicitly excluded, keeping the original column order.
columns_to_scale = [
    column_name
    for column_name in unscaled_inputs.columns.values
    if column_name not in columns_to_omit
]

In [16]:
absenteeism_scaler = CustomScaler(columns = columns_to_scale)

In [17]:
# Learn the scaling statistics for the columns in columns_to_scale.
# NOTE(review): this fits on every row of unscaled_inputs, while the train/test
# split only happens later in the notebook, so the rows that end up in the test
# set contribute to the scaling statistics. Consider splitting first and fitting
# the scaler on the training rows only.
absenteeism_scaler.fit(unscaled_inputs)

In [18]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,-0.388293,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,-0.388293,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,-0.388293,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


In [19]:
scaled_inputs.shape

(700, 10)

## Split the data into train & test and shuffle

### Import the relevant module

In [20]:
from sklearn.model_selection import train_test_split

### Split

In [21]:
# Hold out 20% of the rows for testing; the fixed random_state makes the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    test_size=0.2,
    random_state=20,
)

In [22]:
print (x_train.shape, y_train.shape)

(560, 10) (560,)


In [23]:
print (x_test.shape, y_test.shape)

(140, 10) (140,)


# Logistic regression with sklearn

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Training the model

In [25]:
reg = LogisticRegression()

In [26]:
reg.fit(x_train,y_train)

In [27]:
reg.score(x_train,y_train)

0.7589285714285714

We got about 76% accuracy on the training data.

### Finding the intercept and coefficients

In [28]:
reg.intercept_

array([-0.89430347])

In [29]:
reg.coef_

array([[ 2.07829536,  0.56483534,  2.46124051,  0.14594384,  0.58473439,
        -0.19183695,  0.2830851 , -0.24881378,  0.33608699, -0.27816207]])

### Creating a summary table

In [30]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [31]:
feature_name = unscaled_inputs.columns.values

In [32]:
# Pair each input column name with its fitted coefficient.
# reg.coef_ holds a single row here, so its first row is the full coefficient vector.
summary_table = pd.DataFrame({
    'Feature name': feature_name,
    'Coefficient': reg.coef_[0],
})

summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.078295
1,Reason_2,0.564835
2,Reason_3,2.461241
3,Month Value,0.145944
4,Transportation Expense,0.584734
5,Age,-0.191837
6,Body Mass Index,0.283085
7,Education,-0.248814
8,Children,0.336087
9,Pets,-0.278162


A feature is not particularly important if its coefficient is around 0.

## Testing the model

In [33]:
reg.score(x_test,y_test)

0.7142857142857143

In [34]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.72950466, 0.27049534],
       [0.61849096, 0.38150904],
       [0.47002668, 0.52997332],
       [0.79008837, 0.20991163],
       [0.08538205, 0.91461795],
       [0.32020341, 0.67979659],
       [0.29191653, 0.70808347],
       [0.12475811, 0.87524189],
       [0.79299087, 0.20700913],
       [0.76111179, 0.23888821],
       [0.52181134, 0.47818866],
       [0.24765854, 0.75234146],
       [0.0672757 , 0.9327243 ],
       [0.75389786, 0.24610214],
       [0.30651775, 0.69348225],
       [0.57412812, 0.42587188],
       [0.57337703, 0.42662297],
       [0.5631545 , 0.4368455 ],
       [0.41334168, 0.58665832],
       [0.05062227, 0.94937773],
       [0.71633717, 0.28366283],
       [0.79008837, 0.20991163],
       [0.40654623, 0.59345377],
       [0.40654623, 0.59345377],
       [0.22388   , 0.77612   ],
       [0.75670646, 0.24329354],
       [0.54184086, 0.45815914],
       [0.86789731, 0.13210269],
       [0.18707494, 0.81292506],
       [0.79008837, 0.20991163],
       [0.

In [35]:
predicted_proba.shape

(140, 2)

predicted_proba[:, 0] (the first column) is the probability of 0 / not excessively absent

predicted_proba[:, 1] (the second column) is the probability of 1 / excessively absent


In [36]:
x_test_probability_of_excessively_absent = predicted_proba[:,1]
x_test_probability_of_excessively_absent

array([0.27049534, 0.38150904, 0.52997332, 0.20991163, 0.91461795,
       0.67979659, 0.70808347, 0.87524189, 0.20700913, 0.23888821,
       0.47818866, 0.75234146, 0.9327243 , 0.24610214, 0.69348225,
       0.42587188, 0.42662297, 0.4368455 , 0.58665832, 0.94937773,
       0.28366283, 0.20991163, 0.59345377, 0.59345377, 0.77612   ,
       0.24329354, 0.45815914, 0.13210269, 0.81292506, 0.20991163,
       0.34302973, 0.69393555, 0.6976588 , 0.49900902, 0.20991163,
       0.51336942, 0.21393279, 0.76248265, 0.37446312, 0.57274884,
       0.20308448, 0.46566767, 0.22828093, 0.57274884, 0.82340905,
       0.60536243, 0.70406498, 0.27049534, 0.20469835, 0.19642418,
       0.57952086, 0.49824123, 0.67979659, 0.2483982 , 0.83133907,
       0.40135771, 0.89320731, 0.22214099, 0.50917407, 0.51958218,
       0.70319994, 0.67065936, 0.27656714, 0.81023518, 0.1977299 ,
       0.25436846, 0.17417808, 0.21393279, 0.7510194 , 0.27831472,
       0.21393279, 0.44396366, 0.90378454, 0.42439063, 0.57529

In [37]:
x_test_probability_of_excessively_absent.shape

(140,)

### Preparing a *.csv file that contains the data with predictions

In [40]:
data_to_load_in_tableau = unscaled_inputs.copy()
data_to_load_in_tableau

Unnamed: 0,Reason_1,Reason_2,Reason_3,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,7,289,33,30,0,2,1
1,0,0,0,7,118,50,31,0,1,0
2,0,0,0,7,179,38,31,0,0,0
3,1,0,0,7,279,39,24,0,2,0
4,0,0,0,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,5,179,40,22,1,2,0
696,1,0,0,5,225,28,24,0,1,2
697,1,0,0,5,330,28,25,1,0,0
698,0,0,0,5,235,32,25,1,0,0


In [42]:
# Score every row of scaled_inputs (not only x_test), so the rows the model was
# trained on are included in these predictions.
predicitons = reg.predict_proba(scaled_inputs)
# Keep only the second column: the predicted probability of class 1 (excessively absent).
Probability_of_excessivly_absent = predicitons[:, 1]
Probability_of_excessivly_absent

array([0.56508671, 0.14731807, 0.23888821, 0.87107976, 0.56508671,
       0.23888821, 0.45744381, 0.57274884, 0.80476551, 0.27831472,
       0.91461795, 0.91461795, 0.91461795, 0.71494242, 0.23888821,
       0.70843286, 0.23888821, 0.78624803, 0.70808347, 0.41424517,
       0.87503771, 0.72335785, 0.87537315, 0.58291313, 0.94074051,
       0.46780232, 0.57529812, 0.16251436, 0.94074051, 0.2465466 ,
       0.79316738, 0.68553551, 0.28676085, 0.68553551, 0.2465466 ,
       0.46780232, 0.57529812, 0.83165286, 0.15684254, 0.58291313,
       0.20324872, 0.14388744, 0.25436846, 0.92399734, 0.27656714,
       0.14388744, 0.25436846, 0.38150904, 0.59300745, 0.83133907,
       0.59300745, 0.26703827, 0.26703827, 0.14388744, 0.58544566,
       0.15809704, 0.81023518, 0.25436846, 0.78909541, 0.42439063,
       0.25436846, 0.59300745, 0.26235206, 0.14909687, 0.16372253,
       0.1369537 , 0.26235206, 0.27527272, 0.14909687, 0.27527272,
       0.43460021, 0.39138847, 0.26235206, 0.27527272, 0.94681

In [43]:
data_to_load_in_tableau['Probability_of_excessivly_absent'] = Probability_of_excessivly_absent

In [44]:
data_to_load_in_tableau

Unnamed: 0,Reason_1,Reason_2,Reason_3,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Probability_of_excessivly_absent
0,0,0,0,7,289,33,30,0,2,1,0.565087
1,0,0,0,7,118,50,31,0,1,0,0.147318
2,0,0,0,7,179,38,31,0,0,0,0.238888
3,1,0,0,7,279,39,24,0,2,0,0.871080
4,0,0,0,7,289,33,30,0,2,1,0.565087
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,5,179,40,22,1,2,0,0.630117
696,1,0,0,5,225,28,24,0,1,2,0.711344
697,1,0,0,5,330,28,25,1,0,0,0.860716
698,0,0,0,5,235,32,25,1,0,0,0.228692


In [45]:
data_to_load_in_tableau.to_csv('data_to_load_in_tableau.csv', index = False)