### Backward Elimination
* After we checked our model, we noticed that some of the input columns have no impact on the model.
* Since we are using the model to predict values, we don't need variables that don't make any significant change in the equation.
    * Thus, we will omit those inputs from the model.


In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the preprocessed absenteeism data; the path is relative to the
# directory the notebook is run from.
preprocessed_csv_path = '../S58_L411/Absenteeism_preprocessed.csv'
df_preprocessed = pd.read_csv(preprocessed_csv_path)

# Summary statistics of every numeric column as a quick sanity check.
df_preprocessed.describe()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,0.25,0.008571,0.09,0.945714,6.36,2.011429,222.347143,29.892857,36.417143,271.801774,26.737143,0.167143,1.021429,0.687143,6.761429
std,0.433322,0.09225,0.286386,0.226743,3.50501,1.480396,66.31296,14.804446,6.379083,40.021804,4.254701,0.37337,1.112215,1.166095,12.670082
min,0.0,0.0,0.0,0.0,1.0,0.0,118.0,5.0,27.0,205.917,19.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,3.0,1.0,179.0,16.0,31.0,241.476,24.0,0.0,0.0,0.0,2.0
50%,0.0,0.0,0.0,1.0,6.0,2.0,225.0,26.0,37.0,264.249,25.0,0.0,1.0,0.0,3.0
75%,0.25,0.0,0.0,1.0,10.0,3.0,260.0,50.0,40.0,294.217,31.0,0.0,2.0,1.0,8.0
max,1.0,1.0,1.0,1.0,12.0,6.0,388.0,52.0,58.0,378.884,38.0,1.0,4.0,8.0,120.0


#### Target data for classification
* Use the median as the threshold that splits the samples into excessive absence and regular absence.
* Splitting at the median keeps the two classes roughly balanced.
* Keep in mind, however, that such a decision should be driven by the requirements and questions you are trying to answer.

In [3]:
# Median absence duration; used below as the classification threshold.
absence_hours = df_preprocessed['Absenteeism Time in Hours']
med = absence_hours.median(axis=0)
med

3.0

#### Drop unnecessary inputs based on the previous results

In [4]:
# Binary target: 1 when the absence is strictly longer than the median, else 0.
exceeds_median = df_preprocessed['Absenteeism Time in Hours'] > med
targets = exceeds_median.astype('int64')
df_preprocessed['Excessively Absenteeism'] = targets

# Remove the raw hours column (now encoded in the target) together with the
# inputs listed for removal by the backward-elimination step.
cols_to_drop = [
    'Absenteeism Time in Hours',
    'Daily Work Load Average',
    'Distance to Work',
    'Day of the Week',
    'Education',
]
df_with_targets = df_preprocessed.drop(columns=cols_to_drop)

In [5]:
# Summary statistics after the column drop; the last column is the new binary target.
df_with_targets.describe()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Children,Pets,Excessively Absenteeism
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,0.25,0.008571,0.09,0.945714,6.36,222.347143,36.417143,26.737143,1.021429,0.687143,0.455714
std,0.433322,0.09225,0.286386,0.226743,3.50501,66.31296,6.379083,4.254701,1.112215,1.166095,0.498391
min,0.0,0.0,0.0,0.0,1.0,118.0,27.0,19.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,3.0,179.0,31.0,24.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,1.0,6.0,225.0,37.0,25.0,1.0,0.0,0.0
75%,0.25,0.0,0.0,1.0,10.0,260.0,40.0,31.0,2.0,1.0,1.0
max,1.0,1.0,1.0,1.0,12.0,388.0,58.0,38.0,4.0,8.0,1.0


#### select the inputs

In [6]:
# Every column except the last one (the target) is a model input.
n_input_cols = df_with_targets.shape[1] - 1
unscaled_inputs = df_with_targets.iloc[:, :n_input_cols]
unscaled_inputs.columns.to_numpy()

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Children',
       'Pets'], dtype=object)

In [7]:
# The reason dummies are already 0/1 indicators and are excluded from scaling.
dummy_cols = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']

# Everything that is not a dummy column gets standardized. Deriving the list
# from the frame keeps it in sync with whatever columns were dropped earlier.
is_dummy_col = unscaled_inputs.columns.isin(dummy_cols)
tobe_scaled_cols = unscaled_inputs.columns[~is_dummy_col].values

In [8]:
from sklearn.preprocessing import StandardScaler

# Standardize only the non-dummy columns; the dummies stay as 0/1 indicators.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split further down, so test rows contribute to the fitted mean/std.
# Consider fitting on the training rows only.
absenteeism_scaler = StandardScaler()
scaled_inputs = absenteeism_scaler.fit_transform(unscaled_inputs[tobe_scaled_cols])

# Carry over the original row index: pd.concat(axis=1) aligns rows by label,
# so a default RangeIndex here would misalign the rows (and introduce NaNs)
# whenever `unscaled_inputs` is not indexed 0..n-1.
scaled_numeric_df = pd.DataFrame(
    data=scaled_inputs,
    columns=tobe_scaled_cols,
    index=unscaled_inputs.index,
)
scaled_inputs_df = pd.concat([unscaled_inputs[dummy_cols], scaled_numeric_df], axis=1)
scaled_inputs_df.describe()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Children,Pets
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,0.25,0.008571,0.09,0.945714,-8.120488e-17,-2.131628e-16,1.319579e-16,1.446462e-16,9.135549e-17,-1.2688260000000002e-17
std,0.433322,0.09225,0.286386,0.226743,1.000715,1.000715,1.000715,1.000715,1.000715,1.000715
min,0.0,0.0,0.0,0.0,-1.530333,-1.574681,-1.477309,-1.819793,-0.91903,-0.5896898
25%,0.0,0.0,0.0,1.0,-0.9593133,-0.6541427,-0.8498113,-0.643782,-0.91903,-0.5896898
50%,0.0,0.0,0.0,1.0,-0.1027836,0.04003371,0.09143539,-0.4085798,-0.01928035,-0.5896898
75%,0.25,0.0,0.0,1.0,1.039256,0.5682114,0.5620587,1.002633,0.8804693,0.2684866
max,1.0,1.0,1.0,1.0,1.610276,2.499833,3.385799,2.649049,2.679969,6.275721


In [9]:
# (rows, columns) of the final input matrix: the dummy columns plus the scaled columns.
scaled_inputs_df.shape

(700, 10)

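#### Divide the data into train and test sets
* Instead of shuffling and splitting manually, this time we use the ready-made sklearn function `train_test_split`.
* No separate validation set is created here; k-fold cross-validation on the training set is done further below.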

In [10]:
from sklearn.model_selection import train_test_split

# Split the scaled inputs, the unscaled inputs and the targets in ONE call so
# that all three are guaranteed to share the same shuffled row selection.
# The fraction and seed are named once instead of being repeated per call.
TRAIN_FRACTION = 0.8
SPLIT_SEED = 20
(x_train, test_input,
 unscaled_x_train, unscaled_test_input,
 y_train, test_target) = train_test_split(
    scaled_inputs_df, unscaled_inputs, targets,
    train_size=TRAIN_FRACTION, random_state=SPLIT_SEED,
)

# The unscaled model is trained on the same targets; the original names are
# kept as independent copies so later cells keep working unchanged.
unscaled_y_train = y_train.copy()
unscaled_test_target = test_target.copy()

In [11]:
# Print the shape of each split, in the order:
# train inputs, train targets, test inputs, test targets.
for split_part in (x_train, y_train, test_input, test_target):
    print(split_part.shape)

(560, 10)
(560,)
(140, 10)
(140,)


### Creating the Model with sklearn

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

#### Create the model

In [13]:
# Logistic regression classifier with scikit-learn's default settings.
reg = LogisticRegression()

#### Fit the model

In [14]:
# Fit on the training split only; the test split is held back for the final step.
reg.fit(x_train, y_train)

#### Show the accuracy of the model

In [15]:
# Training accuracy; the cells below reproduce this number manually.
reg.score(x_train, y_train)

0.7875

In [16]:
# Deliberately left commented out: the test set is scored only in the final testing step below.
# reg.score(test_input, test_target)

#### Manually check the accuracy

In [17]:
# Predicted class (0 or 1) for every training row.
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [18]:
# Element-wise check: True where the prediction matches the actual target.
y_train == model_outputs

346     True
91      True
299     True
129     True
695     True
       ...  
218     True
223    False
271     True
474     True
355     True
Name: Absenteeism Time in Hours, Length: 560, dtype: bool

In [19]:
# Number of correctly classified training rows.
(model_outputs == y_train).sum()

441

In [20]:
# Total number of training predictions.
len(model_outputs)

560

In [21]:
# Accuracy = correct predictions / all predictions; should equal reg.score on the training data.
(model_outputs == y_train).sum() / len(model_outputs)

0.7875

### We can now extract the logistic regression equation from the model and use it to make predictions outside this notebook

#### Finding the intercept and coefficients

In [22]:
# Fitted bias term, returned as a one-element array.
reg.intercept_

array([-3.25788969])

In [23]:
# Fitted weights as a 2-D array with one row; columns follow the order of the input columns.
reg.coef_

array([[ 1.96597815,  0.41796175,  2.33752272,  2.46025606,  0.18566673,
         0.65722913, -0.17543728,  0.34024856,  0.38920252, -0.29346415]])

In [24]:
# Input column names, to be matched against the coefficients above.
scaled_inputs_df.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Children',
       'Pets'], dtype=object)

In [25]:
# Names of the model inputs, in the order the model was fitted on.
feature_name = scaled_inputs_df.columns.to_numpy()

In [26]:
# One row per input feature, paired with its fitted coefficient.
# reg.coef_ has a single row, so its first row holds all the weights.
summary_table = pd.DataFrame({
    'Feature name': feature_name,
    'Coefficient': reg.coef_[0],
})
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,1.965978
1,Reason_2,0.417962
2,Reason_3,2.337523
3,Reason_4,2.460256
4,Month Value,0.185667
5,Transportation Expense,0.657229
6,Age,-0.175437
7,Body Mass Index,0.340249
8,Children,0.389203
9,Pets,-0.293464


#### intercept
* We would like to put the intercept at the beginning of the dataframe.
* We can shift the index by one and then add the new row at location 0.
* After that, sort the dataframe by index again.

In [27]:
# Put the intercept in row 0, ahead of the feature coefficients.
# A new frame is built instead of shifting the index in place, so the cell is
# idempotent: re-running it never adds a second 'Intercept' row.
intercept_row = pd.DataFrame({
    'Feature name': ['Intercept'],
    'Coefficient': [reg.intercept_[0]],
})
is_feature_row = summary_table['Feature name'] != 'Intercept'
feature_rows = summary_table.loc[is_feature_row, ['Feature name', 'Coefficient']]

# ignore_index renumbers the rows 0..n, matching the shift-and-sort layout.
summary_table = pd.concat([intercept_row, feature_rows], ignore_index=True)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-3.25789
1,Reason_1,1.965978
2,Reason_2,0.417962
3,Reason_3,2.337523
4,Reason_4,2.460256
5,Month Value,0.185667
6,Transportation Expense,0.657229
7,Age,-0.175437
8,Body Mass Index,0.340249
9,Children,0.389203


#### Calculate the odds_ratio for each feature
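* The logistic regression equation is linear in the log-odds: $\log(\text{odds}) = \text{intercept} + b_1 x_1 + b_2 x_2 + \dots$
* Exponentiating both sides gives the odds as a product: $\text{odds} = e^{\text{intercept}} \cdot e^{b_1 x_1} \cdot e^{b_2 x_2} \cdots$
* The odds ratio of feature $i$ is therefore $e^{b_i}$: the factor by which the odds are multiplied when $x_i$ increases by one unit (for the standardized inputs, one unit is one standard deviation).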

In [28]:
# Odds ratio of a feature = e^(coefficient).
coefficients = summary_table['Coefficient']
summary_table['odds_ratio'] = np.exp(coefficients)

# Show the largest odds ratios first. Features with a ratio near 1 barely
# change the odds; the reason dummies with large ratios raise them the most.
summary_table.sort_values('odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,odds_ratio
4,Reason_4,2.460256,11.707809
3,Reason_3,2.337523,10.355551
1,Reason_1,1.965978,7.141895
6,Transportation Expense,0.657229,1.929439
2,Reason_2,0.417962,1.518863
9,Children,0.389203,1.475803
8,Body Mass Index,0.340249,1.405297
5,Month Value,0.185667,1.204021
7,Age,-0.175437,0.83909
10,Pets,-0.293464,0.745676


#### a second model with unscaled inputs
* This model is easier to interpret, but it may lose accuracy (here the training accuracy happens to be the same: 0.7875).

In [29]:
# Same model type, trained on the raw (unscaled) inputs.
# NOTE(review): max_iter is raised far above the default, presumably because
# the solver converges slowly on unscaled features; confirm.
reg_unscaled = LogisticRegression(max_iter=10000).fit(unscaled_x_train, unscaled_y_train)

# Training accuracy of the unscaled model.
reg_unscaled.score(unscaled_x_train, unscaled_y_train)


0.7875

In [30]:
def build_summary_table(feature_names, model):
    """Build the intercept / coefficient / odds-ratio table of a fitted model.

    Parameters
    ----------
    feature_names : sequence of str
        Input column names, in the order the model was fitted on.
    model : fitted estimator
        Must expose ``intercept_`` (one-element array) and ``coef_``
        (2-D array with a single row), as a fitted binary
        ``LogisticRegression`` does.

    Returns
    -------
    pandas.DataFrame
        Columns 'Feature name', 'Coefficient' and 'odds_ratio'. Row 0 holds
        the intercept and rows 1..n hold the features in their given order.
    """
    table = pd.DataFrame({
        'Feature name': ['Intercept', *feature_names],
        'Coefficient': [model.intercept_[0], *model.coef_[0]],
    })
    # Odds ratio of a term = e^(coefficient).
    table['odds_ratio'] = np.exp(table['Coefficient'])
    return table


# Same summary as for the scaled model, now for the model on unscaled inputs.
feature_name2 = unscaled_inputs.columns.values
summary_table2 = build_summary_table(feature_name2, reg_unscaled)
summary_table2.sort_values(by='odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,odds_ratio
4,Reason_4,2.470408,11.827272
3,Reason_3,2.373264,10.732367
1,Reason_1,1.968642,7.160946
2,Reason_2,0.51133,1.667508
9,Children,0.349377,1.418183
8,Body Mass Index,0.082077,1.085539
5,Month Value,0.053165,1.054604
6,Transportation Expense,0.010057,1.010108
7,Age,-0.02885,0.971562
10,Pets,-0.256107,0.774059


### Backward Elimination

In [31]:
# DONE in this file :)

### Final step - TESTing the Model

In [32]:
# Accuracy on the held-out test split, which was not used for fitting the model.
reg.score(test_input, test_target)

0.7428571428571429

#### get the probability of assigning an x to a class

In [33]:
# Per-row class probabilities for the test split; each row has one entry per class.
predict_proba = reg.predict_proba(test_input) 
# [probability of class 0, probability of class 1]
predict_proba

array([[0.70446357, 0.29553643],
       [0.574614  , 0.425386  ],
       [0.38709574, 0.61290426],
       [0.78460757, 0.21539243],
       [0.07556242, 0.92443758],
       [0.33776813, 0.66223187],
       [0.3018292 , 0.6981708 ],
       [0.11274358, 0.88725642],
       [0.79914929, 0.20085071],
       [0.74662327, 0.25337673],
       [0.46795698, 0.53204302],
       [0.19658591, 0.80341409],
       [0.06199265, 0.93800735],
       [0.68384528, 0.31615472],
       [0.30689931, 0.69310069],
       [0.48675937, 0.51324063],
       [0.534122  , 0.465878  ],
       [0.5209105 , 0.4790895 ],
       [0.41651613, 0.58348387],
       [0.04233085, 0.95766915],
       [0.70075358, 0.29924642],
       [0.78460757, 0.21539243],
       [0.4305104 , 0.5694896 ],
       [0.4305104 , 0.5694896 ],
       [0.24683468, 0.75316532],
       [0.75323437, 0.24676563],
       [0.49405402, 0.50594598],
       [0.84746008, 0.15253992],
       [0.18711318, 0.81288682],
       [0.78460757, 0.21539243],
       [0.

In [34]:
# Second column only: probability of class 1 (excessive absence) for each test row.
predict_proba[:,1]

array([0.29553643, 0.425386  , 0.61290426, 0.21539243, 0.92443758,
       0.66223187, 0.6981708 , 0.88725642, 0.20085071, 0.25337673,
       0.53204302, 0.80341409, 0.93800735, 0.31615472, 0.69310069,
       0.51324063, 0.465878  , 0.4790895 , 0.58348387, 0.95766915,
       0.29924642, 0.21539243, 0.5694896 , 0.5694896 , 0.75316532,
       0.24676563, 0.50594598, 0.15253992, 0.81288682, 0.21539243,
       0.37455136, 0.66619194, 0.68552394, 0.55832511, 0.21539243,
       0.56251286, 0.20949433, 0.73711739, 0.43180154, 0.63140484,
       0.20656906, 0.45687819, 0.22759228, 0.12763784, 0.84928779,
       0.55656583, 0.68254977, 0.29553643, 0.20570834, 0.1980159 ,
       0.60113339, 0.11545288, 0.66223187, 0.26251583, 0.84094506,
       0.45148261, 0.89546419, 0.22209945, 0.10844357, 0.11367612,
       0.73490559, 0.65027459, 0.29267953, 0.79908966, 0.19902104,
       0.27395187, 0.01674681, 0.20949433, 0.72840225, 0.36367213,
       0.20949433, 0.07838289, 0.92262253, 0.49763596, 0.63863

### Doing KFold Cross Validation on the scaled data

In [44]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of a fresh LogisticRegression on the training split.
# For comparison, the training accuracy without k-fold was 0.7875.
score_lr = cross_val_score(estimator=LogisticRegression(), X=x_train, y=y_train, cv=5)
print(score_lr)
print("Avg :", score_lr.mean())

[0.80357143 0.76785714 0.73214286 0.78571429 0.75      ]
Avg : 0.7678571428571429


## Deploy the trained model
### 1. Saving the Model (pickle)
### 2. Create a module that works with this model
* That is, a module that applies the same preprocessing to new data as was applied to the training data.

#### 1. Saving the model
* We need to make our model portable so that it can be used directly to predict new data without retraining it.
* That means we must store the regression object, which holds the fitted values of its equation,
    * i.e. the learned weights and bias of the equation.
* Python's `pickle` module converts a Python object into a byte stream that can be written to a file.

In [130]:
import pickle

In [131]:
# Serialize the fitted regression model into a binary file in the working directory.
with open('absenteeism_model', 'wb') as model_file:
    pickle.dump(reg, model_file)

In [132]:
# Serialize the fitted scaler too, so new data can be standardized exactly like
# the training data. It was fitted on the non-dummy columns only.
with open('absenteeism_scaler', 'wb') as scaler_file:
    pickle.dump(absenteeism_scaler, scaler_file)