In [1]:
#importing libraries

import pandas as pd
import numpy as np
from scipy import stats

import warnings # Used to supressed the warnings
warnings.filterwarnings('ignore')



# Loading Preprocess data 

In [2]:
data = pd.read_csv('employee_performance_analysis_preprocessed_data.csv')
data.drop('Unnamed: 0',axis=1,inplace=True) # Drop unwanted feature
data.head()

Unnamed: 0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,pca10,...,pca17,pca18,pca19,pca20,pca21,pca22,pca23,pca24,pca25,PerformanceRating
0,-4.500581,-1.638596,1.231576,-0.856374,-0.004595,-1.470302,0.190307,0.628849,-1.438901,0.985492,...,0.764483,0.412585,-0.706154,0.13443,0.165226,-0.186988,-0.548228,0.119691,-0.319411,3
1,-4.380218,-0.0827,2.164411,-0.640767,0.094954,0.399433,1.791044,1.128307,0.083643,1.735789,...,0.678117,-0.254594,0.38331,-0.812818,-0.525499,-0.300267,-0.854725,-0.418567,-0.677438,3
2,-4.270004,2.533407,4.435022,-0.166208,-0.445744,-1.761277,1.369751,-0.465178,1.35212,1.743309,...,-0.528326,0.630269,0.59857,0.341855,-0.196459,0.289592,-0.379724,-0.393996,0.640393,4
3,2.80624,0.811354,3.138651,0.559976,-2.610698,0.403601,-1.322515,0.595991,-0.060525,-0.493627,...,-0.765197,0.678156,-1.357896,0.189924,-0.210269,0.019028,-0.650091,-0.112875,-0.411029,3
4,-4.264058,5.96105,0.091788,-1.359089,-0.437688,2.528713,0.428719,-0.988657,-0.108352,1.651643,...,0.866167,-0.433007,0.940494,-1.220986,0.734871,0.097861,-0.253388,0.396006,-0.113749,3


# DEFINE INDEPENDANT & DEPENDANT FEATURES

In [3]:
X = data.iloc[:,:-1]
y = data.PerformanceRating

In [4]:
X.head()

Unnamed: 0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,pca10,...,pca16,pca17,pca18,pca19,pca20,pca21,pca22,pca23,pca24,pca25
0,-4.500581,-1.638596,1.231576,-0.856374,-0.004595,-1.470302,0.190307,0.628849,-1.438901,0.985492,...,-1.089882,0.764483,0.412585,-0.706154,0.13443,0.165226,-0.186988,-0.548228,0.119691,-0.319411
1,-4.380218,-0.0827,2.164411,-0.640767,0.094954,0.399433,1.791044,1.128307,0.083643,1.735789,...,-0.538598,0.678117,-0.254594,0.38331,-0.812818,-0.525499,-0.300267,-0.854725,-0.418567,-0.677438
2,-4.270004,2.533407,4.435022,-0.166208,-0.445744,-1.761277,1.369751,-0.465178,1.35212,1.743309,...,-1.603531,-0.528326,0.630269,0.59857,0.341855,-0.196459,0.289592,-0.379724,-0.393996,0.640393
3,2.80624,0.811354,3.138651,0.559976,-2.610698,0.403601,-1.322515,0.595991,-0.060525,-0.493627,...,2.610565,-0.765197,0.678156,-1.357896,0.189924,-0.210269,0.019028,-0.650091,-0.112875,-0.411029
4,-4.264058,5.96105,0.091788,-1.359089,-0.437688,2.528713,0.428719,-0.988657,-0.108352,1.651643,...,0.681571,0.866167,-0.433007,0.940494,-1.220986,0.734871,0.097861,-0.253388,0.396006,-0.113749


In [5]:
y.head()

0    3
1    3
2    4
3    3
4    3
Name: PerformanceRating, dtype: int64

# BALANCING THE TARGET FEATURE

SMOTE: SMOTE (synthetic minority oversampling technique) is one of the most commonly used oversampling methods to solve the imbalance problem. It aims to balance class distribution by randomly increasing minority class examples by replicating them. SMOTE synthesises new minority instances between existing minority instances.

In [6]:
from collections import Counter
from imblearn.over_sampling import SMOTE #SMOTE(synthetic minority oversampling techinque)
sm = SMOTE() # obeject creation
print("unbalanced data   :  ",Counter(y))
X_sm,y_sm = sm.fit_resample(X,y)
print("balanced data:    :",Counter(y_sm))

unbalanced data   :   Counter({3: 874, 2: 194, 4: 132})
balanced data:    : Counter({3: 874, 4: 874, 2: 874})


Now target feature in balance

# SPLIT TRAINING AND TESTING DATA

In [7]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X_sm,y_sm,random_state=42,test_size=0.20) 

In [8]:
# Check shape of train and test
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2097, 25), (525, 25), (2097,), (525,))

# MODEL CREATION, PREDICTION AND EVALUATION

1.Support Vector Machine

In [9]:
# importing library
from sklearn.svm import SVC

# Object Creaation
svc = SVC()

# Fitting training and testing data
svc.fit(X_train,y_train)

# Prediction on train data
svc_train_predict = svc.predict(X_train)

# Prediction on test data
svc_test_predict = svc.predict(X_test)

TRAINING ACCURACY

In [10]:
# import metrics
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score,classification_report,confusion_matrix
svc_train_accuracy = accuracy_score(svc_train_predict,y_train)
print("Training accuracy of support vector classifier model",svc_train_accuracy*100)
print("support vector classifier Classification report: \n",classification_report(svc_train_predict,y_train))

Training accuracy of support vector classifier model 97.42489270386267
support vector classifier Classification report: 
               precision    recall  f1-score   support

           2       0.99      0.96      0.98       712
           3       0.94      0.98      0.96       669
           4       0.99      0.98      0.98       716

    accuracy                           0.97      2097
   macro avg       0.97      0.97      0.97      2097
weighted avg       0.98      0.97      0.97      2097



Support vector classifier performing well on training accuracy

TESTING ACCURACY

In [11]:
svc_test_accuracy = accuracy_score(svc_test_predict,y_test)
print("Testing accuracy of support vector classifier model",svc_test_accuracy*100)
print("support vector classifier Classification report: \n",classification_report(svc_test_predict,y_test))

Testing accuracy of support vector classifier model 93.9047619047619
support vector classifier Classification report: 
               precision    recall  f1-score   support

           2       0.99      0.93      0.96       196
           3       0.83      0.98      0.90       147
           4       0.99      0.91      0.95       182

    accuracy                           0.94       525
   macro avg       0.94      0.94      0.94       525
weighted avg       0.95      0.94      0.94       525



In testing score is still lagging so we are going to do hyperparameter tunning with the help of grid search cv

# Hyperparameter Tunning

In [16]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C':[0.1,0.5,10,50,60,70,80],
             'gamma':[1,0.1,0.001,0.0001,0.00001],
              'kernel':['rbf'],
             'random_state':(list(range(1,20)))}
model = SVC() # Object creation
grid = GridSearchCV(model,param_grid,refit=True,verbose=2,scoring='f1',cv=5)

# Step:10 fitting the model for grid search
grid.fit(X,y)
print('Best Parameters:',grid.best_params_)
print('Best cross-validation score:',grid.best_score_)

Fitting 5 folds for each of 665 candidates, totalling 3325 fits
[CV] END .........C=0.1, gamma=1, kernel=rbf, random_state=1; total time=   0.1s
[CV] END .........C=0.1, gamma=1, kernel=rbf, random_state=1; total time=   0.0s
[CV] END .........C=0.1, gamma=1, kernel=rbf, random_state=1; total time=   0.0s
[CV] END .........C=0.1, gamma=1, kernel=rbf, random_state=1; total time=   0.0s
[CV] END .........C=0.1, gamma=1, kernel=rbf, random_state=1; total time=   0.0s
[CV] END .........C=0.1, gamma=1, kernel=rbf, random_state=2; total time=   0.0s
[CV] END .........C=0.1, gamma=1, kernel=rbf, random_state=2; total time=   0.0s
[CV] END .........C=0.1, gamma=1, kernel=rbf, random_state=2; total time=   0.0s
[CV] END .........C=0.1, gamma=1, kernel=rbf, random_state=2; total time=   0.0s
[CV] END .........C=0.1, gamma=1, kernel=rbf, random_state=2; total time=   0.0s
[CV] END .........C=0.1, gamma=1, kernel=rbf, random_state=3; total time=   0.0s
[CV] END .........C=0.1, gamma=1, kernel=rbf,

In [17]:
# set the best parameter 
clf =SVC(C=80,gamma=1,random_state=1)

# fit the model
clf.fit(X_train,y_train)

# Predict the x test
y_hat_clf = clf.predict(X_test)

# TESTING ACCURACY AFTER HYPERPARAMETER TUNNING

In [18]:
test_accuracy = accuracy_score(y_hat_clf,y_test)
print("Testing accuracy of support vector classifier model",test_accuracy*100)
print("support vector classifier Classification report: \n",classification_report(y_hat_clf,y_test))

Testing accuracy of support vector classifier model 83.61904761904762
support vector classifier Classification report: 
               precision    recall  f1-score   support

           2       0.71      1.00      0.83       130
           3       1.00      0.67      0.80       259
           4       0.81      1.00      0.89       136

    accuracy                           0.84       525
   macro avg       0.84      0.89      0.84       525
weighted avg       0.88      0.84      0.83       525



After hyperparameter tunning score is decrease

2.Random Forest 

In [19]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100) # 100 decision tree

# fitting training data
rf.fit(X_train,y_train)

# Prediction on testing data
rf_test_predict = rf.predict(X_test)

# Prediction on training data
rf_train_predict = rf.predict(X_train)

In [20]:
rf_train_accuracy = accuracy_score(rf_train_predict,y_train)
print("Training accuracy of random forest",rf_train_accuracy)
print("Classification report of training: \n",classification_report(rf_train_predict,y_train))

Training accuracy of random forest 1.0
Classification report of training: 
               precision    recall  f1-score   support

           2       1.00      1.00      1.00       690
           3       1.00      1.00      1.00       701
           4       1.00      1.00      1.00       706

    accuracy                           1.00      2097
   macro avg       1.00      1.00      1.00      2097
weighted avg       1.00      1.00      1.00      2097



Random forest classifier very well work on training data.

TESTING ACCURACY

In [22]:
rf_test_accuracy = accuracy_score(rf_test_predict,y_test)
print("Testing accuracy of random forest",rf_test_accuracy*100)
print("Classification report of testing: \n",classification_report(rf_test_predict,y_test))

Testing accuracy of random forest 94.85714285714286
Classification report of testing: 
               precision    recall  f1-score   support

           2       0.97      0.94      0.95       191
           3       0.90      0.95      0.92       164
           4       0.98      0.96      0.97       170

    accuracy                           0.95       525
   macro avg       0.95      0.95      0.95       525
weighted avg       0.95      0.95      0.95       525



# HYPER PARAMETER TUNNING WITH RANDOMIZED SEARCH CV

In [24]:
# import library and imputation of parameter
from sklearn.model_selection import RandomizedSearchCV  
#In random forest we are not used grid search CV because of memory reason.

n_estimators = [int(x) for x in np.linspace(start=100 ,stop=2000, num=10)] #No of decision tree in forest
max_features = ['auto', 'sqrt'] #Max no of feature consider to create decision tree
max_depth    = [int(x) for x in np.linspace(10,100,num=11)] #Max no of level in each decision tree
max_depth.append(None)
min_samples_split = [2,3,5,8] #Min number of data points placed in a node before the node is split
min_samples_leaf  = [1,2,3,4]  #Min number of data point allowed in leaf node

# Creating dictionary of paramter
random_grid = {'n_estimators': n_estimators, 'max_features': max_features,
               'max_depth': max_depth, 'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

# Object creation
rf_clf = RandomForestClassifier(random_state=42) #Provide random state because select rows and columns randomly

# Create Random search CV with parameter
rf_cv = RandomizedSearchCV(estimator=rf_clf,scoring='f1',param_distributions=random_grid,
                           n_iter=10,cv=2,verbose=2,random_state=1,n_jobs=-1)

# Fitting the training data
rf_cv.fit(X_train,y_train)

# Get best parameter
rf_best_params = rf_cv.best_params_
print(f"Best parameter: {rf_best_params}")

Fitting 2 folds for each of 10 candidates, totalling 20 fits
Best parameter: {'n_estimators': 311, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'auto', 'max_depth': 37}


In [25]:
# Create object and place the best paramter
rf_clf1 = RandomForestClassifier(**rf_best_params)

# Fitting the training data
rf_clf1.fit(X_train,y_train)

# Prediction on test data
rf_clf1_predict = rf_clf1.predict(X_test)

TEST ACCURACY AFTER HYPER-PARAMETER TUNNING

In [26]:
rf_accuracy = accuracy_score(rf_clf1_predict,y_test)
print("Accuracy after hyperparameter tunning",rf_accuracy*100)
print("Classification report: \n",classification_report(rf_clf1_predict,y_test))

Accuracy after hyperparameter tunning 93.71428571428572
Classification report: 
               precision    recall  f1-score   support

           2       0.97      0.94      0.95       190
           3       0.87      0.94      0.90       160
           4       0.98      0.94      0.96       175

    accuracy                           0.94       525
   macro avg       0.94      0.94      0.94       525
weighted avg       0.94      0.94      0.94       525



After hyperparameter tunning score is not increases.

# 3.Artificial Neural Network [MLP Classifier]

In [27]:
# Importing library and object creation
from sklearn.neural_network import MLPClassifier
model = MLPClassifier(hidden_layer_sizes=(60,3),
                      learning_rate='constant',
                      max_iter=250,
                      random_state=42)

In [28]:
# Fitting the training data
model.fit(X_train,y_train)

MLPClassifier(hidden_layer_sizes=(60, 3), max_iter=250, random_state=42)

In [29]:
# Predicting the probability
mlp_prdict_probability = model.predict_proba(X_test)
mlp_prdict_probability

array([[1.12169275e-02, 9.88779889e-01, 3.18363096e-06],
       [2.48142103e-14, 9.99988492e-01, 1.15078730e-05],
       [9.79051828e-01, 2.03751926e-02, 5.72979421e-04],
       ...,
       [9.99682086e-01, 3.17903736e-04, 1.05076642e-08],
       [1.30056223e-06, 9.99996418e-01, 2.28183485e-06],
       [2.66365236e-18, 4.38276363e-08, 9.99999956e-01]])

In [30]:
# Prediction on test data
mlp_test_predict = model.predict(X_test)

# Prediction on training data
mlp_train_predict = model.predict(X_train)

TRAINING ACCURACY

In [31]:
mlp_train_accuracy = accuracy_score(mlp_train_predict,y_train)
print("Training accuracy of MLP model is:",mlp_train_accuracy*100)
print("Classification report of training:"'\n',classification_report(mlp_train_predict,y_train))

Training accuracy of MLP model is: 99.76156413924654
Classification report of training:
               precision    recall  f1-score   support

           2       1.00      0.99      1.00       695
           3       0.99      1.00      1.00       696
           4       1.00      1.00      1.00       706

    accuracy                           1.00      2097
   macro avg       1.00      1.00      1.00      2097
weighted avg       1.00      1.00      1.00      2097



Multilayer percepton Perform well on training data.

TESTING ACCURACY

In [32]:
mlp_test_accuracy = accuracy_score(mlp_test_predict,y_test)
print("Testing accuracy of MLP model is:",mlp_test_accuracy*100)
print("Classification report of testing:"'\n',classification_report(mlp_test_predict,y_test))

Testing accuracy of MLP model is: 95.80952380952381
Classification report of testing:
               precision    recall  f1-score   support

           2       0.99      0.96      0.98       189
           3       0.90      0.97      0.93       159
           4       0.99      0.94      0.96       177

    accuracy                           0.96       525
   macro avg       0.96      0.96      0.96       525
weighted avg       0.96      0.96      0.96       525



In [34]:
#CONFUSION MATRIX
pd.crosstab(mlp_test_predict,y_test)

PerformanceRating,2,3,4
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,182,7,0
3,2,155,2
4,0,11,166


In [None]:
# Importing the pickle library to save the model
import pickle

# Open a file named 'mlp_classifier_model.pkl' in write binary mode ('wb')
with open('Multi-layered_Perceptron_Model.pkl', 'wb') as file:
    # Dump (save) the trained model to the file
    pickle.dump(model, file)

Conclusion:

Support vector machine well perform on training data with accuracy 97% but the test score is 94% after applying Hyperparameter tunning score is 84. 

Random forest very well perform in training data with 100% accuracy but in testing 94% after doing hyperparameter tunning testing score is decreases.

Artifical neural network[Multilayer percepton] perform very well on training data with 100% accuracy and testing score is 96%.

So we are select Artifical neuranl network [Multilayer percepton] model.