# Creating a logistic regression to predict absenteeism

In [None]:
import pandas as pd
import numpy as np 

In [None]:
data_preprocessed=pd.read_csv("Absenteeism_preprocessed.csv")

In [None]:
data_preprocessed.head()

# create the targets

In [None]:
data_preprocessed['Absenteeism Time in Hours'].median()

In [None]:
# np.where(condition,value if True, value if False) 
targets=np.where(data_preprocessed['Absenteeism Time in Hours'] > data_preprocessed['Absenteeism Time in Hours'].median() ,1,0)
targets

In [None]:
data_preprocessed['Excessive Absenteeism']=targets
data_preprocessed.head()

In [None]:
#divide the no of targets by 1 to the total no of targets
# no of targets by 1 =targets.sum()
# total no of targets = targets.shape[0]
targets.sum()/targets.shape[0]

In [None]:
data_with_targets=data_preprocessed.drop(['Absenteeism Time in Hours','Day of the week','Daily Work Load Average','Distance to Work'],axis=1)

In [None]:
# reserved word is --> returns either True or False
data_with_targets is data_preprocessed

In [None]:
data_with_targets.head()
#data_with_targets is our checkpoint

# select the inputs for the regression 

In [None]:
data_with_targets.shape

In [None]:
data_with_targets.iloc[:,:14]   # or data_with_targets.iloc[:,0:14] (same)

In [None]:
data_with_targets.iloc[:,:-1]

In [None]:
unscaled_inputs=data_with_targets.iloc[:,:-1]

# Standardize the data


In [None]:
# our custom scaler will not standardize all the inputs, only the columns we choose, so the dummy variables won't be touched
# custom scaler code is based on standard scaler
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.preprocessing import StandardScaler
class CustomScaler(BaseEstimator,TransformerMixin):
    """Standardize a chosen subset of DataFrame columns; pass the rest through.

    Wraps a ``StandardScaler`` that is fitted on, and applied to, only the
    columns named in ``columns``. Every other column is returned unchanged,
    and the original column order and row index are preserved.

    Parameters
    ----------
    columns : list
        Labels of the columns to standardize.

    Attributes
    ----------
    scaler : StandardScaler
        The underlying scaler, fitted by :meth:`fit`.
    columns : list
        The labels passed to the constructor.
    mean_ : pandas.Series
        Per-column mean of the fitted columns (set by :meth:`fit`).
    var_ : pandas.Series
        Per-column population variance (``ddof=0``) of the fitted columns
        (set by :meth:`fit`).
    """

    def __init__(self, columns):
        self.scaler = StandardScaler()
        self.columns = columns

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``.

        ``y`` is forwarded to ``StandardScaler.fit``.
        """
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Reduce explicitly along axis 0. np.mean/np.var on a DataFrame forward
        # axis=None to pandas, whose meaning depends on the pandas version
        # (per-column vs. whole-frame reduction).
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and are
        not used.
        """
        init_col_order = X.columns
        # Reuse X's index: pd.concat(axis=1) aligns rows by index label, so a
        # fresh RangeIndex here would misalign rows (and introduce NaNs)
        # whenever X does not carry the default 0..n-1 index.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        # Restore the caller's original column order.
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]
In [None]:
unscaled_inputs.columns.values

In [None]:
columns_to_omit=['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education']
columns_to_scale=[x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [None]:
absenteeism_scaler=CustomScaler(columns_to_scale)

In [None]:
absenteeism_scaler.fit(unscaled_inputs) 

In [None]:
scaled_inputs=absenteeism_scaler.transform(unscaled_inputs)
scaled_inputs

In [None]:
scaled_inputs.shape

# Split the data into train & test and shuffle

# Import the relevant module

In [None]:
from sklearn.model_selection import train_test_split

# split

In [None]:
# Hold out 20% of the observations for testing; random_state makes the
# split reproducible.
# NOTE(review): the scaler above was fitted on the full dataset before this
# split, so test-set statistics influence the training features. Consider
# splitting first and fitting the scaler on the training part only.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    random_state=20,
)

# logistic regression with sklearn

In [None]:
from sklearn.linear_model import LogisticRegression 
from sklearn import metrics

# training the model

In [None]:
reg=LogisticRegression()

In [None]:
#sklearn.linear_model.LogisticRegression.fit(x,y)
#fits the model according to the given training data
reg.fit(X_train,y_train) #does all the ML 

In [None]:
#sklearn.linear_model.LogisticRegression.score(input,targets)
#returns the mean accuracy on the given test data and labels
reg.score(X_train,y_train)

# manually check the accuracy

In [None]:
#sklearn.linear_model.LogisticRegression.predict(inputs)
#predicts class labels(logistic Regression outputs) for given input samples
model_outputs=reg.predict(X_train)
model_outputs

In [None]:
model_outputs==y_train   #compares predicted output to the actual output

In [None]:
np.sum((model_outputs==y_train)) #total number of correct predictions (True entries)

In [None]:
# acccuracy =correct predications/total no of observations
model_outputs.shape[0]   #gives total no of observations

In [None]:
#accuracy
np.sum((model_outputs==y_train))/model_outputs.shape[0]  #same result as sklearn.score

# Finding the intercepts and coefficients

In [None]:
reg.intercept_

In [None]:
reg.coef_

In [None]:
feature_name=unscaled_inputs.columns.values
feature_name

In [None]:
summary_table=pd.DataFrame(columns=['Feature Name'],data=feature_name)
summary_table['Coefficients']=np.transpose(reg.coef_) 
summary_table

In [None]:
#to add intercept to the summary table
summary_table.index=summary_table.index+1  #increment index to put intercept at 0th location
summary_table.loc[0]=['Intercept',reg.intercept_[0]]
summary_table=summary_table.sort_index()
summary_table

# interpreting the coefficients

In [None]:

summary_table['Odd_Ratio']=np.exp(summary_table['Coefficients'])
summary_table

In [None]:
#DataFrame.sort_values(Series,ascending) 
#sorts the values in a data frame with respect to a given column(series)
summary_table.sort_values('Odd_Ratio',ascending=False)

# Testing the Model

In [None]:
reg.score(X_test,y_test)

In [None]:
#sklearn.linear_model.LogisticRegression.predict_proba(x)
#returns the probability estimates for all possible outputs (classes)
predicated_proba=reg.predict_proba(X_test)
predicated_proba

In [None]:
predicated_proba.shape

In [None]:
# we want probability of execessive absenteeism that is probability of being 1 ie second column
predicated_proba[:,1]

# Save the model

In [None]:
#pickle [module] is a Python module used to serialize a Python object into a byte stream
import pickle

In [None]:
with open('model','wb') as file:   
    pickle.dump(reg,file)          

In [None]:
with open('scaler','wb') as file:
    pickle.dump(absenteeism_scaler,file)