<a href="https://colab.research.google.com/github/Haaz123/Absenteeism_Prediction/blob/master/Absenteeism_scaling_and_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [2]:
# Load the preprocessed absenteeism data from the working directory.
data_preprocessed = pd.read_csv(
    filepath_or_buffer='Absenteeism_Preprocessed_1.csv',
)

In [3]:
# The last column is the prediction target; every other column is a candidate input.
targets = data_preprocessed.iloc[:, -1]

# Features found (after analysing an earlier model fit) to have no effect on the
# model are dropped here, in the same expression that selects the inputs.
unscaled_inputs = (
    data_preprocessed
    .iloc[:, :-1]
    .drop(columns=['Distance to Work', 'Daily Work Load Average', 'Day of the Week'])
)

In [4]:
#unscaled_inputs.head() #checking if our dataset is in correct order.
#targets

In [5]:
# Now let's scale the data.
# Scaling the dummy variables makes the results harder to interpret, so let's build a custom scaler.



In [6]:
# Scale all including dummy variable code for higher ML accuracy

#from sklearn.preprocessing import StandardScaler

#scaler = StandardScaler()

#scaler.fit(unscaled_inputs)


In [7]:
#scaled_inputs = scaler.transform(unscaled_inputs)

In [8]:
# Custom scaler 
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class MyScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and leave the rest untouched.

    The selected columns are passed through a ``StandardScaler``; all other
    columns (e.g. dummy variables) are returned unchanged. The output keeps the
    input's column order and row index.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    with_mean : bool, default=True
        Forwarded to ``StandardScaler``.
    with_std : bool, default=True
        Forwarded to ``StandardScaler``.
    copy : bool, default=True
        Forwarded to ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler, fitted on ``columns`` only.
    mean_ : numpy.ndarray or None
        Per-column mean of the scaled columns (``None`` before ``fit``).
    var_ : numpy.ndarray or None
        Per-column population variance (ddof=0) of the scaled columns
        (``None`` before ``fit``).
    """

    def __init__(self, columns, with_mean=True, with_std=True, copy=True):
        # Keep every constructor argument under its own name: BaseEstimator's
        # get_params / __repr__ / clone read them back with getattr, and would
        # otherwise report None for each of them.
        self.columns = columns
        self.with_mean = with_mean
        self.with_std = with_std
        self.copy = copy
        # StandardScaler's options must be passed by keyword; positional
        # arguments are rejected by current scikit-learn releases.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        # Re-sync in case the parameters were changed through set_params().
        self.scaler.set_params(
            copy=self.copy, with_mean=self.with_mean, with_std=self.with_std
        )
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit per-column statistics; calling np.mean / np.var on a
        # DataFrame relies on numpy-to-pandas dispatch whose axis handling
        # differs between pandas versions.
        self.mean_ = selected.mean(axis=0).to_numpy()
        self.var_ = selected.var(axis=0, ddof=0).to_numpy()
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with ``self.columns`` standardized.

        ``y`` is accepted for API compatibility and ignored; ``copy`` is
        forwarded to ``StandardScaler.transform``. The result has the same
        columns (in the same order) and the same index as ``X``.
        """
        initial_col_order = X.columns
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns], copy=copy),
            columns=self.columns,
            # Reuse the caller's index: pd.concat aligns on index labels, so a
            # fresh RangeIndex here would misalign rows (and introduce NaNs)
            # for any X that is shuffled, filtered, or otherwise re-indexed.
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[initial_col_order]



In [9]:
#unscaled_inputs.columns.values 
# copy the names into new variable and remove the names that you do not want to scale

In [10]:
# Continuous features to standardize; columns not listed here are passed
# through the custom scaler unchanged.
columns_to_scale = [
    'Transportation Expense',
    'Age',
    'Body Mass Index',
    'Children',
    'Pets',
    'Month of Year',
]

In [11]:
# Custom scaler restricted to the columns selected above.
scaler = MyScaler(columns=columns_to_scale)

In [12]:
# Learn the scaling statistics from the input columns.
# NOTE(review): this fit uses the full dataset, before the train/test split
# performed later in the notebook, so test-row statistics influence the
# scaling; consider fitting on the training rows only.
scaler.fit(X=unscaled_inputs)



MyScaler(columns=['Transportation Expense', 'Age', 'Body Mass Index',
                  'Children', 'Pets', 'Month of Year'],
         copy=None, with_mean=None, with_std=None)

In [13]:
# Apply the fitted scaler; columns outside columns_to_scale come back unchanged.
scaled_inputs = scaler.transform(X=unscaled_inputs)

In [14]:
# Show the scaled feature table as this cell's output.
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month of Year
0,0,0,0,1,1.005844,-0.536062,0.767431,0,0.880469,0.268487,0.182726
1,0,0,0,0,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690,0.182726
2,0,0,0,1,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690,0.182726
3,1,0,0,0,0.854936,0.405184,-0.643782,0,0.880469,-0.589690,0.182726
4,0,0,0,1,1.005844,-0.536062,0.767431,0,0.880469,0.268487,0.182726
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690,-0.388293
696,1,0,0,0,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663,-0.388293
697,1,0,0,0,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690,-0.388293
698,0,0,0,1,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690,-0.388293


In [15]:
# Now let's split our data into training and testing sets

from sklearn.model_selection import  train_test_split


In [16]:
# Keep 80% of the rows for training; the fixed random_state makes the
# shuffle (and therefore the split) reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    random_state=20,
)

In [17]:
# Sanity check: feature and target shapes for the training split, then the test split.
print(f"{x_train.shape} {y_train.shape}")
print(f"{x_test.shape} {y_test.shape}")

(560, 11) (560,)
(140, 11) (140,)


In [18]:
# Let's apply logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [19]:
# Logistic regression with the library's default settings, trained on the
# training split; the fitted estimator is echoed as the cell output.
reg = LogisticRegression()
reg.fit(X=x_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
# Mean accuracy on the training split (this is not a held-out estimate).
reg.score(X=x_train, y=y_train)

0.7732142857142857

In [21]:
# lets create a summary table 

# Column labels of the model inputs, as a NumPy array, for the summary table.
feature_name = unscaled_inputs.columns.to_numpy()

In [22]:
# Start the summary table with one row per input feature.
summary_table = pd.DataFrame({'Features Name': feature_name})

In [23]:
# Build the table from scratch on every run so this cell is idempotent:
# appending the intercept row to an already-populated table makes a second
# run fail, because the coefficient vector no longer matches the row count.
summary_table = pd.DataFrame({
    'Features Name': feature_name,
    # coef_ has one row per fitted decision function; row 0 is used here,
    # matching the use of intercept_[0] below (assumes a binary target).
    'Coefficient': reg.coef_[0],
})
# Shift the feature rows to 1..n so that index 0 is free for the intercept.
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['intercept', reg.intercept_[0]]
# exp(coefficient) is the multiplicative change in the odds per unit of the
# feature; the cast guards against the column having been upcast to object
# when the intercept row was appended.
summary_table['Odds_Ratio'] = np.exp(summary_table['Coefficient'].astype(float))
summary_table.sort_values('Odds_Ratio', ascending=False)


Unnamed: 0,Features Name,Coefficient,Odds_Ratio
3,Reason_3,3.115553,22.545903
1,Reason_1,2.800197,16.447892
2,Reason_2,0.951884,2.590585
4,Reason_4,0.839001,2.314054
5,Transportation Expense,0.605284,1.831773
9,Children,0.348262,1.416604
7,Body Mass Index,0.279811,1.32288
11,Month of Year,0.15893,1.172256
6,Age,-0.169891,0.843757
8,Education,-0.210533,0.810152


In [25]:
# After analysing the summary table, let's perform backward elimination and remove the features with no or negligible effect

In [26]:
# lets save the model 

import pickle

In [27]:
# Serialize the fitted logistic-regression model to the file 'model'.
with open('model', mode='wb') as file:
    pickle.dump(reg, file)

In [28]:
# Serialize the fitted custom scaler to the file 'Scaler'.
# NOTE(review): pickle stores class instances by reference to their class, so
# whatever loads this file presumably needs the same MyScaler definition
# available — confirm in the consuming notebook.
with open('Scaler', mode='wb') as file:
    pickle.dump(scaler, file)

In [None]:
# Let's test our model in a new notebook