                                                    CLASSIFICATION IN MACHINE LEARNING USING
                                                    i)logistic regression(sklearn)
                                                    ii)neural network
                                                    iii)random forest classifier
                                                    

IMPORTING RELEVANT LIBRARIES

In [269]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
import tensorflow as tf 
from tensorflow.keras import Sequential,optimizers
from tensorflow.keras.layers import Dense
from sklearn.ensemble import RandomForestClassifier
import pickle

In [270]:
df_preprocessed=pd.read_csv("Absenteeism_preprocessed_data.csv")
df_preprocessed=df_preprocessed.drop('Unnamed: 0',axis=1)

In [271]:
df_preprocessed['Targets']=np.where(df_preprocessed["Absenteeism Time in Hours"]>np.median(df_preprocessed['Absenteeism Time in Hours']),1,0) #define categorical target
df_preprocessed=df_preprocessed.drop("Absenteeism Time in Hours",axis=1) #drop original target as we are only concerned with the binary outcomes
inputs=df_preprocessed.iloc[:,0:14] 

In [272]:

targets=df_preprocessed['Targets']
targets=np.array(targets)
np.unique(targets)

array([0, 1])

In [273]:
print(inputs.shape)
print(targets.shape)

(700, 14)
(700,)


Split the datset into training and testing part

In [274]:
x_train,x_test,y_train,y_test=train_test_split(inputs,targets,train_size=0.85,random_state=10)
print(x_test.shape,x_train.shape ,"\n",y_test.shape,y_train.shape)
x_train,x_val,y_train,y_val=train_test_split(x_train,y_train,train_size=0.9,random_state=10)
print(x_val.shape,x_train.shape ,"\n",y_val.shape,y_train.shape)
x_train

(105, 14) (595, 14) 
 (105,) (595,)
(60, 14) (535, 14) 
 (60,) (535,)


Unnamed: 0,reason1,reason2,reason3,reason4,month,week_day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
75,1,0,0,0,10,2,1.036026,0.074838,0.562059,-0.458497,-0.408580,1,1,1
199,1,0,0,0,4,3,-1.016322,-1.209478,-0.379188,1.366488,-0.408580,1,2,0
696,1,0,0,0,5,3,0.040034,-0.263140,-1.320435,-0.853789,-0.643782,1,1,2
213,0,0,0,0,5,3,0.854936,-1.682647,0.405184,2.677510,-0.643782,1,2,0
515,0,0,0,1,10,2,0.040034,-0.263140,-1.320435,0.326336,-0.643782,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,0,0,0,1,7,5,-0.654143,1.426749,0.248310,-0.806331,1.002633,1,0,0
281,0,0,0,1,9,5,1.036026,0.074838,0.562059,0.560476,-0.408580,1,1,1
360,0,0,0,1,1,1,-1.016322,-1.209478,-0.379188,1.456728,-0.408580,1,2,0
507,1,0,0,0,10,4,1.005844,1.223963,1.973929,0.326336,2.178644,1,0,2


I. USING LOGISTIC REGRESSION 

In [275]:


reg_model=LogisticRegression()
reg_model.fit(x_train,y_train)
print("score using builtin function:",reg_model.score(x_test,y_test))
print("\nchecking manually")
predicted=reg_model.predict(x_test)
accuracy_matrix=np.array([int(val) for val in predicted==y_test])
print("\t\t\t\taccuracy_matrix\n",accuracy_matrix)
print("\nscore:",sum(accuracy_matrix)/accuracy_matrix.shape[0])

score using builtin function: 0.7428571428571429

checking manually
				accuracy_matrix
 [0 1 0 0 1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 0 0
 1 0 0 1 1 0 0 1 1 0 1 1 0 1 1 1 0 0 0 1 1 1 0 1 0 0 1 1 1 1 1]

score: 0.7428571428571429


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [276]:
features=inputs.columns.values
coefficents=reg_model.coef_[0]

df_reg=pd.DataFrame({"features": features,"coef":coefficents})
df_reg=pd.concat([df_reg,pd.DataFrame({"features":"intercept","coef":reg_model.intercept_})],axis=0)
df_reg['abs_coef']=df_reg['coef'].abs()
df_reg=df_reg.sort_values(by="abs_coef",ascending=False)
df_reg=df_reg.drop('abs_coef',axis=1)
df_reg


Unnamed: 0,features,coef
0,reason1,2.966127
2,reason3,2.959682
0,intercept,-1.684784
3,reason4,1.095586
1,reason2,0.826448
6,Transportation Expense,0.533247
12,Children,0.410194
13,Pets,-0.284627
8,Age,-0.234342
10,Body Mass Index,0.174425


Saving the regression model

In [277]:
file=open('absenteeism_logistic_model','wb')
pickle.dump(reg_model,file)
file.close()

II.USING NEURAL NETWORK

In [278]:

input_size=14
hidden_size=300
output_size=1


nn_model=Sequential([
    Dense(hidden_size,activation='tanh'),
    Dense(hidden_size,activation='tanh'),
    Dense(output_size,activation='sigmoid')])


optimizer=optimizers.Adam(learning_rate=0.0001)
nn_model.compile(loss='binary_crossentropy',optimizer=optimizer,metrics='accuracy')



In [279]:

batch_size = 30

max_epochs = 100

# setting  an early stopping mechanism
# let's set patience=2, to be a bit tolerant against random validation loss increases
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

# fit the model

nn_model.fit(x_train, # train inputs
          y_train, # train targets
          batch_size=batch_size, # batch size
          epochs=max_epochs, # epochs that we will train for (assuming early stopping doesn't kick in)
          callbacks=[early_stopping], # early stopping 
          validation_data=(x_val, y_val), # validation data
          #verbose = 1 # making sure we get enough information about the training process
          )  

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


<keras.src.callbacks.History at 0x1589ee2e410>

In [280]:
nn_loss,nn_accuracy=nn_model.evaluate(x_test,y_test)
print("accuracy =",nn_accuracy)

accuracy = 0.723809540271759


Saving the nn_model

In [281]:
file=open('absenteeism_nn_model','wb')
pickle.dump(nn_model,file)
file.close()

III.USING RANDOM FOREST CLASSIFIER

In [282]:

clf=RandomForestClassifier(n_estimators=40,random_state=42)
clf_model=clf.fit(x_train,y_train)
outputs=clf_model.predict(x_test)
#print(outputs)
rf_accuracy_matrix=outputs==y_test
print(np.array([int(value) for value in rf_accuracy_matrix]))
rf_accuracy=sum(rf_accuracy_matrix)/len(rf_accuracy_matrix)
print(rf_accuracy)

[0 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1
 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 0 0
 1 0 0 1 1 0 0 0 1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 0 1 1]
0.780952380952381


Saving the random forest model

In [283]:

file=open('absenteeism_rf_model','wb')
pickle.dump(clf_model,file)
file.close()