##### Importing necessary libraries


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

##### Loading the data


In [None]:
# Read the employee-performance workbook into a DataFrame; the path is relative,
# so the .xls file must sit in the notebook's working directory.
data = pd.read_excel(io="INX_Future_Inc_Employee_Performance_CDS_Project2_Data_V1.8.xls")

###### Basic Checks

In [None]:
# First and last rows: quick sanity check that the sheet was parsed as expected.
data.head()

data.tail()
# (rows, columns)
data.shape

data.dtypes

data.info()

# Missing values per column.
data.isnull().sum()

data.describe()

# Class balance and distinct values of the target column.
data['PerformanceRating'].value_counts()

data['PerformanceRating'].unique()

# EmpNumber is removed before any analysis or modelling.
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
data=data.drop(columns=['EmpNumber'],errors='ignore')


## Exploratory Data Analysis (EDA)

In [None]:
### Univariate Analysis

!pip install sweetviz     

import sweetviz as sv            # library for univariant analysis
my_report = sv.analyze(data)     # pass the original dataframe
my_report.show_html() 


## Bivariate Analysis

## Create a new dataframe with numerical variables only (check the datatypes with data.info())
data2=data[['Age','DistanceFromHome','EmpEducationLevel','EmpEnvironmentSatisfaction','EmpHourlyRate','EmpJobInvolvement','EmpJobLevel','EmpJobSatisfaction','NumCompaniesWorked','EmpLastSalaryHikePercent','EmpRelationshipSatisfaction','TotalWorkExperienceInYears', 'TrainingTimesLastYear',
       'EmpWorkLifeBalance', 'ExperienceYearsAtThisCompany','ExperienceYearsInCurrentRole', 'YearsSinceLastPromotion','YearsWithCurrManager']]

# One count plot per numeric column, bars split by PerformanceRating.
plt.figure(figsize=(35,25), facecolor='white')  # canvas shared by all subplots

# enumerate replaces the hand-maintained counter; the 18 columns selected above
# always fit in the 7x4 grid, so no overflow guard is needed.
for plotnumber, column in enumerate(data2, start=1):
    ax = plt.subplot(7,4,plotnumber)
    sns.countplot(x=data2[column]
                    ,hue=data.PerformanceRating)
    plt.xlabel(column,fontsize=20)
    # The y-axis of a count plot is the number of rows, not the rating itself.
    plt.ylabel('Count',fontsize=20)
plt.tight_layout()


## Create a new dataframe with categorical variables only (check the datatypes with data.info())
data1=data[['Gender','EducationBackground','MaritalStatus','EmpDepartment','EmpJobRole','BusinessTravelFrequency','OverTime','Attrition']]

# One count plot per categorical column, bars split by PerformanceRating.
plt.figure(figsize=(50,50), facecolor='white')  # canvas shared by all subplots

# The 8 columns selected above occupy the first 8 slots of the 4x4 grid.
for plotnumber, column in enumerate(data1, start=1):
    ax = plt.subplot(4,4,plotnumber)
    # Passing column names with data= keeps x and hue row-aligned; dropping NaNs
    # from x alone (as before) could leave the two vectors with different lengths.
    sns.countplot(data=data, x=column, hue='PerformanceRating')
    plt.xlabel(column,fontsize=20)
    # The y-axis of a count plot is the number of rows, not the rating itself.
    plt.ylabel('Count',fontsize=20)
plt.tight_layout()


## Analysis of department-wise performance for each department separately

# Department vs. rating subset. Columns are selected by name: the previous
# positional lookup (iloc[:, [4, 26]]) silently picks other columns as soon as
# the column order of `data` changes.
dept = data[['EmpDepartment','PerformanceRating']].copy()
dept_per = dept.copy()

# One indicator column per department, with the rating appended alongside.
department = pd.get_dummies(dept_per['EmpDepartment'])
performance = pd.DataFrame(dept_per['PerformanceRating'])
dept_rating = pd.concat([department,performance],axis=1)

plt.figure(figsize=(20,10),facecolor='white')
ax = sns.countplot(x='EmpDepartment',hue='PerformanceRating',data=data)
for i in ax.patches:
    height = i.get_height()
    # Skip patches without a positive height (NaN or zero) so the chart is not
    # littered with 'nan' / '0' labels.
    if not height > 0:
        continue
    # Write the bar's count slightly right of its left edge and just above its top.
    ax.annotate('{:.0f}'.format(height), (i.get_x()+0.06, height+2))
plt.title('Employee Departments vs PerformanceRating',fontsize=20)
plt.xlabel('EmpDepartment',fontsize=20)
# The bars show how many employees fall in each group.
plt.ylabel('Count',fontsize=15)
plt.show()


## Data Preprocessing and Feature engineering

In [None]:
# Names of the object-dtype columns that have at most 50 distinct values.
categorical_col = [
    name for name in data.columns
    if data[name].dtype == object and data[name].nunique(dropna=False) <= 50
]

# Show the distinct labels of each such column, followed by a separator line.
for column in categorical_col:
    print(f"{column} : {data[column].unique()}")
    print("====================================")

#### Converting categorical variables into numerical

In [None]:
# NOTE: every step in this cell overwrites a column of `data` in place. Running
# the cell twice maps already-encoded values again (to NaN), so reload the data
# before re-running it.
data.Gender.value_counts()

# get_dummies returns a DataFrame (boolean on recent pandas). Take its single
# remaining indicator column and store it as 0/1 integers instead of assigning
# the whole frame to the column.
data.Gender=pd.get_dummies(data.Gender,drop_first=True).iloc[:,0].astype(int)
data.Gender


# Hand-assigned integer code per education background.
data.EducationBackground=data.EducationBackground.map({'Life Sciences':5,'Medical':4,'Marketing':3,'Technical Degree':2,'Other':1,'Human Resources':0 })


# Hand-assigned integer code per department (encoding via map).
data.EmpDepartment=data.EmpDepartment.map({'Research & Development':3,'Sales':5,'Human Resources':2,'Development':4,'Finance':1,'Data Science':0})


data.BusinessTravelFrequency=data.BusinessTravelFrequency.map({'Travel_Frequently':1,'Travel_Rarely':2,'Non-Travel':0})


data.Attrition=data.Attrition.map({'Yes':1,'No':0})

# Each job role needs its own code. The previous mapping gave both
# 'Senior Manager R&D' and 'Technical Architect' the code 1 and left 2 unused,
# merging two different roles into one value.
# NOTE(review): 'Senior Manager R&D' was moved to the free code 2 - confirm this
# is the intended assignment rather than 'Technical Architect'.
data.EmpJobRole=data.EmpJobRole.map({'Sales Executive':18, 'Manager':11, 'Developer':17, 'Sales Representative':14,
 'Human Resources':9, 'Senior Developer':12, 'Data Scientist':5,
 'Senior Manager R&D':2, 'Laboratory Technician':13, 'Manufacturing Director':7,
 'Research Scientist':15, 'Healthcare Representative':6, 'Research Director':4,
 'Manager R&D':16, 'Finance Manager':10, 'Technical Architect':1, 'Business Analyst':3,
 'Technical Lead':8, 'Delivery Manager':0})

## Encoding MaritalStatus

from sklearn.preprocessing import LabelEncoder

# A single encoder is re-fitted for each column in turn, so after this cell
# `label` holds the classes of the last column processed (OverTime).
label = LabelEncoder()
for encoded_column in ('MaritalStatus', 'OverTime'):
    data[encoded_column] = label.fit_transform(data[encoded_column])

data.head()

data.dtypes

### Distribution of data

In [None]:
# Distribution (histogram + KDE) of every column of the encoded dataframe.

plt.figure(figsize=(20,25), facecolor='white')
plotnumber = 1  # index of the next free subplot slot

for column in data:
    if plotnumber<=28:  # the 7x4 grid below offers 28 slots; further columns are skipped
        ax = plt.subplot(7,4,plotnumber)
        # sns.distplot is deprecated; histplot with kde=True and a density-scaled
        # histogram is its documented replacement. The float cast lets boolean
        # indicator columns be binned as 0/1.
        sns.histplot(data[column].astype(float), kde=True, stat='density')
        plt.xlabel(column,fontsize=20)

    plotnumber+=1
plt.tight_layout()  # keep the large x labels from overlapping the row below
plt.show()

### Outlier Analysis

In [None]:
# One box plot per column to spot outliers visually.
plt.figure(figsize=(20,35), facecolor='white')
plotnumber = 1  # index of the next free subplot slot

for column in data:
    if plotnumber<=28:  # the 7x4 grid below offers 28 slots; further columns are skipped
        ax = plt.subplot(7,4,plotnumber)
        # Pass the values explicitly as x: a bare positional Series is taken as
        # `data` by current seaborn releases, which changes the plot orientation.
        # The float cast lets boolean indicator columns be plotted as 0/1.
        sns.boxplot(x=data[column].astype(float))
        plt.xlabel(column,fontsize=20)

    plotnumber+=1
plt.show()


from scipy import stats

# Inter-quartile range of TrainingTimesLastYear. 'linear' matches the default
# interpolation of Series.quantile used for Q1/Q3 below, so IQR == Q3 - Q1 and
# the fences are internally consistent ('midpoint' could disagree with them).
IQR=stats.iqr(data.TrainingTimesLastYear,interpolation='linear')
IQR

# Tukey fences: anything beyond 1.5 * IQR from the quartiles counts as an outlier.
Q1=data.TrainingTimesLastYear.quantile(0.25)
Q3=data.TrainingTimesLastYear.quantile(0.75)
min_limit=Q1-1.5*IQR
max_limit=Q3+1.5*IQR
max_limit

# Replace values above the upper fence with the column median.
# NOTE(review): values below min_limit are left untouched - confirm that only
# the upper tail is meant to be treated.
data.loc[data['TrainingTimesLastYear'] > max_limit,'TrainingTimesLastYear']=data['TrainingTimesLastYear'].median()

# Re-plot to verify the upper outliers are gone.
plt.figure(figsize=(5,5))
sns.boxplot(data=data[['TrainingTimesLastYear']])
plt.show()


## Feature Selection

## Checking correlation

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heat map.
corr_matrix = data.corr()
fig, heat_ax = plt.subplots(figsize=(30, 30))
sns.heatmap(corr_matrix, annot=True, cmap="RdYlGn", annot_kws={"size":15}, ax=heat_ax)

## Model Creation

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is PerformanceRating.
X = data.drop(columns='PerformanceRating')
y = data['PerformanceRating']

# Hold-out split with the library's default test fraction and a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)

# Report the dimensions of the four resulting pieces.
for caption, part in (
    ('Shape of x_train: ', X_train),
    ('Shape of y_train: ', y_train),
    ('Shape of x_test: ', X_test),
    ('Shape of y_test: ', y_test),
):
    print(caption, part.shape)

## 1. Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,f1_score,recall_score,precision_score

# Baseline decision tree with default hyperparameters.
dt = DecisionTreeClassifier()
dt.fit(X=X_train, y=y_train)

y_hat = dt.predict(X_train)   # predictions on the training split
y_hat1 = dt.predict(X_test)   # predictions on the held-out split

# Precision / recall / F1 per class: training split first, then test split.
for actual, predicted in ((y_train, y_hat), (y_test, y_hat1)):
    print(classification_report(actual, predicted))


## Hyperparameter tuning for decision tree

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: each key is a DecisionTreeClassifier parameter, each value the
# candidates to try.
params = {
    "criterion":("gini", "entropy"),        # split-quality measure
    "splitter":("best", "random"),          # strategy used to choose the split at each node
    "max_depth":(list(range(1,20))),        # tree depth, 1 to 19
    "min_samples_split":[2, 3, 4],          # minimum samples required to split an internal node
    "min_samples_leaf":list(range(1, 20)),  # minimum samples required at a leaf, 1 to 19
}


tree_clf = DecisionTreeClassifier(random_state=3)
# The plain "f1" scorer only supports binary targets and fails for a target with
# more than two classes; "f1_weighted" averages the per-class F1 by support and
# works for binary and multiclass targets alike.
# NOTE(review): PerformanceRating is assumed to have more than two levels - confirm.
tree_cv = GridSearchCV(tree_clf, params, scoring="f1_weighted", n_jobs=-1, verbose=3, cv=3)

# The exhaustive search is left disabled; enable the lines below to re-run it.
#tree_cv.fit(X_train,y_train)
#best_params = tree_cv.best_params_
#print(f"Best parameters: {best_params}")

# Hard-coded hyperparameters - presumably the result of an earlier search run.
# random_state matches tree_clf so the reported scores are reproducible.
dt1 = DecisionTreeClassifier(criterion='gini',max_depth=6,min_samples_leaf= 1,min_samples_split=2,splitter='best',random_state=3)

dt1.fit(X_train, y_train)

y_pred_train1  = dt1.predict(X_train)  # predictions on the training split
y_hat1 = dt1.predict(X_test)           # predictions on the held-out split

print(classification_report(y_train, y_pred_train1))   #train data

print(classification_report(y_test,y_hat1))   #test data



## 2. RandomForest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest of 100 trees; all other hyperparameters are library defaults.
MR = RandomForestClassifier(n_estimators=100)
MR.fit(X=X_train, y=y_train)

rf_hat = MR.predict(X_train)   # predictions on the training split
rf_pred = MR.predict(X_test)   # predictions on the held-out split

# Training report first, then test report.
for actual, predicted in ((y_train, rf_hat), (y_test, rf_pred)):
    print(classification_report(actual, predicted))

In [None]:
# NOTE(review): this cell repeats the baseline decision-tree cell from the
# "1. Decision Tree" section verbatim; it re-binds dt, y_hat and y_hat1.
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,f1_score,recall_score,precision_score
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()   # default hyperparameters
dt.fit(X=X_train, y=y_train)

y_hat, y_hat1 = dt.predict(X_train), dt.predict(X_test)

train_report = classification_report(y_train, y_hat)
print(train_report)   # train data

test_report = classification_report(y_test, y_hat1)
print(test_report)    # test data


## Hyperparameter tuning for RandomForest

In [None]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]  # 200, 400, ..., 2000 trees
# 'auto' meant the same as 'sqrt' for classifiers and is rejected by recent
# scikit-learn releases; 'log2' is offered as the second alternative instead.
max_features = ['sqrt', 'log2']  # number of features considered at each split
max_depth = [int(x) for x in np.linspace(10, 100, num=10)]  # 10, 20, ..., 100
max_depth.append(None)           # None lets the trees grow without a depth limit
min_samples_split = [2, 5, 10]   # minimum samples required to split an internal node
min_samples_leaf = [1, 2, 4]     # minimum samples required at a leaf node
bootstrap = [True, False]        # whether each tree is built on a bootstrap sample

# Search space for the randomized search.
random_grid = {'n_estimators': n_estimators, 'max_features': max_features,
               'max_depth': max_depth, 'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf, 'bootstrap': bootstrap}

rf_clf1 = RandomForestClassifier(random_state=42)

rf_cv = RandomizedSearchCV(estimator=rf_clf1,param_distributions=random_grid, n_iter=100, cv=3,
                               verbose=2, random_state=42, n_jobs=-1)
# estimator           -> the model whose hyperparameters are searched
# param_distributions -> the search space (dictionary created above)
# n_iter              -> number of sampled parameter settings; trades runtime against solution quality
# cv                  -> number of cross-validation folds
# verbose             -> the higher, the more progress messages
# n_jobs              -> parallel jobs; -1 uses all processors

# The search is left disabled; enable the lines below to re-run it.
#rf_cv.fit(X_train, y_train)
#rf_best_params = rf_cv.best_params_
#print(f"Best parameters: {rf_best_params}")

# Hard-coded hyperparameters - presumably the result of an earlier search run.
# random_state matches rf_clf1 so the reported scores are reproducible.
rf_clf2 = RandomForestClassifier(n_estimators=400,min_samples_split=10,min_samples_leaf=1,max_features='sqrt',max_depth=30,bootstrap=True,random_state=42)
rf_clf2.fit(X_train, y_train)

y_pred_train  = rf_clf2.predict(X_train)  # predictions on the training split
y_predict=rf_clf2.predict(X_test)         # predictions on the held-out split

print(classification_report(y_train, y_pred_train))   #train data

print(classification_report(y_test, y_predict))   #test data

In [None]:
# Importance of each input feature according to the tuned random forest.
imp=rf_clf2.feature_importances_
imp

plt.figure(figsize=(10, 10))
# Label the bars with the columns of X (the frame the training split was taken
# from) instead of slicing data.columns by a hard-coded count, which mislabels
# the bars as soon as the target is not the last column or a column is added.
sns.barplot(y=X.columns,x=imp)
plt.xlabel('Feature importance')


## 3. Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Baseline gradient-boosting model with default hyperparameters.
# Note that `dt` is re-bound here: from this point on it no longer refers to the
# decision tree fitted earlier.
dt = GradientBoostingClassifier()
dt.fit(X=X_train, y=y_train)

y_pred2, y_pred3 = dt.predict(X_train), dt.predict(X_test)

train_report = classification_report(y_train, y_pred2)
print(train_report)   #train data

test_report = classification_report(y_test, y_pred3)
print(test_report)    #test data

## Hyperparameter tuning for Gradient Boosting

In [None]:
# Untuned estimator handed to the grid search below.
gb_classifier = GradientBoostingClassifier()

# Candidate values explored by the (currently disabled) search.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.1, 0.01, 0.001],
    max_depth=[3, 5, 7],
)

# Exhaustive search with 5-fold cross-validation; constructed but not fitted here.
grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=5)

# The search itself is left commented out:
#grid_search.fit(X_train, y_train)
#best_params = grid_search.best_params_
#print("Best Hyperparameters:", best_params)

# Hard-coded hyperparameters - presumably the winners of an earlier search run.
best_gb_classifier = GradientBoostingClassifier(n_estimators=300, max_depth=3, learning_rate=0.01)
best_gb_classifier.fit(X=X_train, y=y_train)

y_ht = best_gb_classifier.predict(X_train)       # predictions on the training split
y_hat3_pr = best_gb_classifier.predict(X_test)   # predictions on the held-out split

# Training report first, then test report.
for actual, predicted in ((y_train, y_ht), (y_test, y_hat3_pr)):
    print(classification_report(actual, predicted))

## 4. Artificial Neural Network


In [None]:
# Multi-layer perceptron with three hidden layers of 100 units each.

from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Neural networks are sensitive to feature scale, and the features at this point
# are still on their raw scales. The pipeline standardises them first; the
# scaler is fitted on the training split only, so nothing leaks from the test split.
# `mlp` keeps the same fit/predict interface as before.
mlp = make_pipeline(
    StandardScaler(),
    MLPClassifier(hidden_layer_sizes=(100,100,100),batch_size=10,learning_rate_init=0.01,max_iter=2000,random_state=10),
)
mlp.fit(X_train,y_train)

y_hat_mlp = mlp.predict(X_train)      # predictions on the training split
y_predict_mlp = mlp.predict(X_test)   # predictions on the held-out split

print(classification_report(y_train,y_hat_mlp))

print(classification_report(y_test,y_predict_mlp))

## 5. Logistic Regression 

In [None]:
### Scaling data

from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

# Split first, then fit the scaler on the training rows only. Fitting it on the
# full dataset lets the test rows influence the mean/variance used for training
# (data leakage). The same random_state as the earlier split selects the same rows.
# NOTE: from here on X_train / X_test are scaled arrays, no longer the raw frames.
X_train,X_test,y_train,y_test=train_test_split(X, y,random_state=3)
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

# Scaled copy of all rows, kept because the PCA section below reads `scaled_data`.
scaled_data=scaler.transform(X)

from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(X_train, y_train)

y_pred1 = LR.predict(X_train)   # predictions on the training split
y_pred = LR.predict(X_test)     # predictions on the held-out split

print(classification_report(y_train, y_pred1)) #train data

print(classification_report(y_test, y_pred)) #test data


## 6. Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with default parameters.
svclassifier = SVC()
svclassifier.fit(X=X_train, y=y_train)

sv_hat, y_hat = svclassifier.predict(X_train), svclassifier.predict(X_test)

train_report = classification_report(y_train, sv_hat)
print(train_report)   # training split

test_report = classification_report(y_test, y_hat)
print(test_report)    # held-out split

## Bagging for Support Vector Machine


In [None]:
from sklearn.ensemble import BaggingClassifier

# Bag 20 copies of the SVM configured above. The base learner is passed
# positionally: its keyword was renamed from `base_estimator` to `estimator`
# and the old name was later removed, so the positional form is the one that
# works on both older and current scikit-learn releases.
model_bagg1=BaggingClassifier(svclassifier,n_estimators=20)
model_bagg1.fit(X_train,y_train)

y_hat_bagg=model_bagg1.predict(X_test)     # predictions on the held-out split
y_hat_bagg1=model_bagg1.predict(X_train)   # predictions on the training split

print(classification_report(y_train,y_hat_bagg1))

print(classification_report(y_test,y_hat_bagg))

## 7. K Nearest Neighbour


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 5 closest training points.
KNN1 = KNeighborsClassifier(n_neighbors=5)
KNN1.fit(X=X_train, y=y_train)

y_hat_knn = KNN1.predict(X_test)   # predictions on the held-out split
knn_tr = KNN1.predict(X_train)     # predictions on the training split

# Training report first, then test report.
for actual, predicted in ((y_train, knn_tr), (y_test, y_hat_knn)):
    print(classification_report(actual, predicted))

## Principal Component Analysis


In [None]:
# Scaled feature matrix as a dataframe, with the original feature names.
df=pd.DataFrame(data=scaled_data,columns=X.columns)

## Getting the optimal number of principal components
from sklearn.decomposition import PCA
pca=PCA()
# Fit on the scaled features (`df`). Fitting on `data` would use unscaled values
# and include the target column, so the curve would not describe the matrix
# that is actually reduced below.
principalComponents=pca.fit_transform(df)
plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel(" Number of Components")
# The curve is the running total of the explained-variance ratios (0 to 1).
plt.ylabel('Cumulative explained variance (ratio)')
plt.title("Explained Variance")
plt.show()

# Keep the first 10 components.
pca=PCA(n_components=10)
new_data=pca.fit_transform(df)
# Name the component columns PC1..PC10. The previous code passed
# columns=print(i), i.e. None, and printed stray numbers along the way.
principal_df=pd.DataFrame(data=new_data,columns=[f'PC{i}' for i in range(1,pca.n_components_+1)])

# Cumulative share of variance captured by the retained components.
print(np.cumsum(pca.explained_variance_ratio_))


## Fitting Models after Principal Component Analysis

# Logistic Regression with Principal Component Analysis

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Split the PCA-reduced matrix first, then standardise with statistics from the
# training rows only (no information from the test rows leaks into training).
X_train_pca,X_test_pca,y_train,y_test=train_test_split(new_data, y,random_state=3)

# A dedicated scaler and dedicated variables are used so that `scaled_data` is
# not overwritten with the 10-column PCA matrix; overwriting it made the PCA
# cell above fail when re-run.
pca_scaler=StandardScaler()
X_train_pca=pca_scaler.fit_transform(X_train_pca)
X_test_pca=pca_scaler.transform(X_test_pca)

LR = LogisticRegression()
LR.fit(X_train_pca, y_train)

y_pred1 = LR.predict(X_train_pca)   # predictions on the training split
y_pred = LR.predict(X_test_pca)     # predictions on the held-out split

# Training report first, then test report, as in the other model sections.
print(classification_report(y_train, y_pred1)) #train data

print(classification_report(y_test, y_pred)) #test data

## RandomForest Classifier with Principal Component Analysis


In [None]:
from sklearn.ensemble import RandomForestClassifier

# 100-tree forest trained on the PCA-reduced features.
MR = RandomForestClassifier(n_estimators=100)
MR.fit(X=X_train_pca, y=y_train)

rf_hat, rf_pred = MR.predict(X_train_pca), MR.predict(X_test_pca)

train_report = classification_report(y_train, rf_hat)
print(train_report)   # training split

test_report = classification_report(y_test, rf_pred)
print(test_report)    # held-out split

In [None]:
## Hyperparameter tuning for RandomForest with Principal Component Analysis


In [None]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]  # 200, 400, ..., 2000 trees
# 'auto' meant the same as 'sqrt' for classifiers and is rejected by recent
# scikit-learn releases; 'log2' is offered as the second alternative instead.
max_features = ['sqrt', 'log2']  # number of features considered at each split
max_depth = [int(x) for x in np.linspace(10, 100, num=10)]  # 10, 20, ..., 100
max_depth.append(None)           # None lets the trees grow without a depth limit
min_samples_split = [2, 5, 10]   # minimum samples required to split an internal node
min_samples_leaf = [1, 2, 4]     # minimum samples required at a leaf node
bootstrap = [True, False]        # whether each tree is built on a bootstrap sample

# Search space for the randomized search.
random_grid = {'n_estimators': n_estimators, 'max_features': max_features,
               'max_depth': max_depth, 'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf, 'bootstrap': bootstrap}

rf_clf1 = RandomForestClassifier(random_state=42)

rf_cv = RandomizedSearchCV(estimator=rf_clf1,param_distributions=random_grid, n_iter=100, cv=3,
                               verbose=2, random_state=42, n_jobs=-1)
# estimator           -> the model whose hyperparameters are searched
# param_distributions -> the search space (dictionary created above)
# n_iter              -> number of sampled parameter settings; trades runtime against solution quality
# cv                  -> number of cross-validation folds
# verbose             -> the higher, the more progress messages
# n_jobs              -> parallel jobs; -1 uses all processors

# The search is left disabled. If re-enabled it must be fitted on the PCA
# features this section trains on, not on the full feature matrix.
#rf_cv.fit(X_train_pca, y_train)
#rf_best_params = rf_cv.best_params_
#print(f"Best parameters: {rf_best_params}")

# Hard-coded hyperparameters - presumably the result of an earlier search run.
# random_state matches rf_clf1 so the reported scores are reproducible.
rf_clf2 = RandomForestClassifier(n_estimators=400,min_samples_split=10,min_samples_leaf=1,max_features='sqrt',max_depth=30,bootstrap=True,random_state=42)
rf_clf2.fit(X_train_pca, y_train)

y_predict=rf_clf2.predict(X_test_pca)         # predictions on the held-out split
y_pred_train  = rf_clf2.predict(X_train_pca)  # predictions on the training split

print(classification_report(y_train, y_pred_train))   #train data

print(classification_report(y_test, y_predict))   #test data