In [1]:
# Importing the necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

ModuleNotFoundError: No module named 'seaborn'

In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides any deprecation/convergence warnings raised by the modelling cells below - confirm that is wanted.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Data

### Importing Raw data 

In [None]:
!pip install xlrd

In [None]:
# Load the raw employee dataset from the Excel (.xls) workbook into a DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
data = pd.read_excel('INX_Future_Inc_Employee_Performance_CDS_Project2_Data_V1.8.xls')

## Source Code

### Exploratory Data Analysis

In [None]:
# (rows, columns) of the raw dataset
data.shape

In [None]:
# Column labels in positional order; later cells select columns by these positions via iloc
data.columns

In [None]:
# Preview the first rows of the raw data
data.head()

In [None]:
# Check for missing data: info() lists each column's dtype and non-null count,
# so a non-null count below the row count reveals missing values.
data.info()

## Analysis of Department-wise Performance

In [None]:
# Build a small working frame for the department-wise performance analysis.
# The two columns are selected by label rather than by position so the cell
# keeps working if the column order of the workbook ever changes.
dept = data[['EmpDepartment', 'PerformanceRating']].copy()
# Independent copy used by the following cells, so `dept` itself stays untouched.
dept_per = dept.copy()

In [None]:
# Mean performance rating of every department (plotted in the next cell).
dept_per['PerformanceRating'].groupby(dept_per['EmpDepartment']).mean()

In [None]:
# Bar chart of the mean performance rating per department
# (seaborn's default estimator is the mean, drawn with its confidence interval).
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
plt.figure(figsize=(10,4.5))
sns.barplot(x='EmpDepartment', y='PerformanceRating', data=dept_per)

In [None]:
# Per department: how many employees received each performance rating.
dept_per.groupby('EmpDepartment')['PerformanceRating'].value_counts()

In [None]:
# One-hot encode the department so every department gets its own indicator column,
# then put the performance rating next to the indicators.
# dtype=int keeps the indicators as 0/1 numbers; pandas >= 2.0 would otherwise
# return booleans, which the bar plots below would have to average.
department = pd.get_dummies(dept_per['EmpDepartment'], dtype=int)
performance = pd.DataFrame(dept_per['PerformanceRating'])
dept_rating = pd.concat([department,performance],axis=1)

In [None]:
# One panel per department on a 2x3 grid.
# x is the performance rating and y is the department's 0/1 indicator, so each bar
# shows the share of employees with that rating who belong to the department.
# x/y are passed by keyword because seaborn >= 0.12 rejects positional data arguments.
departments = ['Sales', 'Development', 'Research & Development',
               'Human Resources', 'Finance', 'Data Science']
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
for ax, dept_name in zip(axes.ravel(), departments):
    sns.barplot(x='PerformanceRating', y=dept_name, data=dept_rating, ax=ax)
    ax.set_title(dept_name)
plt.show()

### Data Processing/ Data Munging

In [None]:
# Label-encode the categorical columns (given by position in the raw layout,
# i.e. before EmpNumber is dropped) so they become integer codes.
# Each column is replaced by label instead of written through `iloc`:
# `data.iloc[:, i] = ...` writes into the existing object-dtype column on recent
# pandas, which leaves the dtype as object and keeps the column out of numeric
# operations such as the correlation matrix.
enc = LabelEncoder()
for i in (2,3,4,5,6,7,16,26):
    col = data.columns[i]
    data[col] = enc.fit_transform(data[col])
data.head()

In [None]:
# Correlation coefficients between the numeric columns, used to judge which
# predictors are significant.
# Non-numeric columns are filtered out explicitly: pandas >= 2.0 raises on them
# instead of silently skipping them as older versions did.
data.select_dtypes(include='number').corr()

In [None]:
# Drop the EmpNumber column, which is not used in the analysis.
# errors='ignore' keeps the cell safe to re-run: a second run is a no-op instead
# of a KeyError. The positional column indices used below assume this drop has happened.
data = data.drop(columns=['EmpNumber'], errors='ignore')

In [None]:
# Preview the data after encoding and after dropping EmpNumber
data.head()

In [None]:
# Target: the performance rating.
y = data['PerformanceRating']
# Predictors: only the columns whose correlation coefficient is greater than 0.1,
# picked by position. Using every predictor (data.iloc[:,0:-1]) lowered the accuracy.
feature_positions = [4, 5, 9, 20, 21, 22, 23, 24]
X = data.iloc[:, feature_positions]
X.head()

In [None]:
# Frequency of each distinct YearsWithCurrManager value among the selected features
X['YearsWithCurrManager'].value_counts()

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

In [None]:
# Standardize the features: the scaler learns its statistics from the training
# set only and the same transformation is then applied to both sets.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# (rows, features) of the standardized training set
X_train.shape

In [None]:
# (rows, features) of the standardized test set
X_test.shape

# Performance Calculation

## 1. Logistic Regression

In [None]:
# Train a logistic-regression classifier (default hyper-parameters) on the
# standardized training features to predict the performance rating.
from sklearn.linear_model import LogisticRegression
model_logr = LogisticRegression()
model_logr.fit(X_train,y_train)

In [None]:
# Predict the performance rating for the held-out test set
y_predict_log = model_logr.predict(X_test)

In [None]:
# Overall accuracy followed by the per-class precision / recall / F1 report
# (the confusion matrix is shown in the next cell)
print(accuracy_score(y_test,y_predict_log))
print(classification_report(y_test,y_predict_log))

In [None]:
# Confusion matrix of the logistic-regression predictions (rows: true class, columns: predicted class)
confusion_matrix(y_test,y_predict_log)

## 2. Decision Tree with GridSearchCV

In [None]:
# Tune a decision tree with a 10-fold cross-validated grid search on accuracy.
from sklearn.tree import DecisionTreeClassifier

classifier_dtg=DecisionTreeClassifier(random_state=42,splitter='best')
# min_samples_split must be an integer >= 2 (or a fraction), so the invalid value 1
# has been removed from the grid; it could only produce failed fits.
# Both split criteria are searched over the same min_samples_split values.
parameters=[{'min_samples_split':[2,3,4,5],'criterion':['gini','entropy']}]

model_griddtree=GridSearchCV(estimator=classifier_dtg, param_grid=parameters, scoring='accuracy',cv=10)
model_griddtree.fit(X_train,y_train)

In [None]:
# Hyper-parameter combination with the best cross-validated accuracy
model_griddtree.best_params_

In [None]:
# Predict the performance rating for the test set with the tuned decision tree
y_predict_dtree = model_griddtree.predict(X_test)

In [None]:
# Overall accuracy followed by the per-class precision / recall / F1 report
# (the confusion matrix is shown in the next cell)
print(accuracy_score(y_test,y_predict_dtree))
print(classification_report(y_test,y_predict_dtree))

In [None]:
# Confusion matrix of the decision-tree predictions (rows: true class, columns: predicted class)
confusion_matrix(y_test,y_predict_dtree)

## 3. Random Forest with GridSearchCV

In [None]:
# Tune a 23-tree random forest with a 10-fold cross-validated grid search on accuracy.
from sklearn.ensemble import RandomForestClassifier

classifier_rfg = RandomForestClassifier(n_estimators=23, random_state=33)
parameters = [{
    'min_samples_split': [2, 3, 4, 5],
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [1, 2, 3],
}]

model_gridrf = GridSearchCV(
    estimator=classifier_rfg,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
)
model_gridrf.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination with the best cross-validated accuracy
model_gridrf.best_params_

In [None]:
# Predict the performance rating for the test set with the tuned random forest
y_predict_rf = model_gridrf.predict(X_test)

In [None]:
# Overall accuracy followed by the per-class precision / recall / F1 report
# (the confusion matrix is shown in the next cell)
print(accuracy_score(y_test,y_predict_rf))
print(classification_report(y_test,y_predict_rf))

In [None]:
# Confusion matrix of the random-forest predictions (rows: true class, columns: predicted class)
confusion_matrix(y_test,y_predict_rf)

In [None]:
# Persist the fitted performance-rating model (the whole GridSearchCV object) to disk.
# joblib is imported as a standalone package; the old `sklearn.externals.joblib` path is not used.
# NOTE(review): the StandardScaler `sc` that was applied to the training features is not
# saved with the model - confirm that consumers reproduce the same scaling before predicting.
import joblib

joblib.dump(model_gridrf,'INX_Future_Inc.ml')

# Attrition

In [None]:
# Target for this section: the (label-encoded) Attrition column.
at_y = data.Attrition
# Earlier experiment, kept for reference: using all predictors (data.iloc[:,0:-1]) lowered the accuracy.
# Predictors are picked by position from `data` (EmpNumber already dropped).
# NOTE(review): the original note says these are the columns with a correlation coefficient
# above 0.1 - presumably measured against Attrition; verify against the correlation matrix.
at_X = data.iloc[:,[9,10,13,15,20,23]]
at_X.head()

In [None]:
# 70/30 train/test split for the attrition task (fixed seed for reproducibility).
# This rebinds X_train / X_test / y_train / y_test, replacing the performance-rating split;
# unlike that section, the features are not standardized here.
X_train, X_test, y_train, y_test = train_test_split(at_X,at_y,test_size=0.3,random_state=10)

# 1. Logistic regression

In [None]:
# Train a logistic-regression classifier for attrition.
# The model is fitted on the training split only: fitting on the full at_X / at_y
# would let it see the test rows and inflate every metric computed on X_test.
from sklearn.linear_model import LogisticRegression
model_logr = LogisticRegression()
model_logr.fit(X_train,y_train)

In [None]:
# Predict attrition for the held-out test set
y_predict_log = model_logr.predict(X_test)

In [None]:
# Overall accuracy followed by the per-class precision / recall / F1 report
# (the confusion matrix is shown in the next cell)
print(accuracy_score(y_test,y_predict_log))
print(classification_report(y_test,y_predict_log))

In [None]:
# Confusion matrix of the attrition predictions (rows: true class, columns: predicted class)
confusion_matrix(y_test,y_predict_log)

# 2. Random Forest

In [None]:
# Tune a 23-tree random forest for attrition with a 10-fold cross-validated grid
# search on accuracy. This rebinds classifier_rfg, parameters and model_gridrf.
from sklearn.ensemble import RandomForestClassifier

classifier_rfg = RandomForestClassifier(n_estimators=23, random_state=33)
parameters = [{
    'min_samples_split': [2, 3, 4, 5],
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [1, 2, 3],
}]

model_gridrf = GridSearchCV(
    estimator=classifier_rfg,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
)
model_gridrf.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination with the best cross-validated accuracy
model_gridrf.best_params_

In [None]:
# Predict attrition for the test set with the tuned random forest
y_predict_rf = model_gridrf.predict(X_test)

In [None]:
# Overall accuracy followed by the per-class precision / recall / F1 report
print(accuracy_score(y_test,y_predict_rf))
print(classification_report(y_test,y_predict_rf))

In [None]:
# Persist the fitted attrition model (the whole GridSearchCV object) to disk.
import joblib

joblib.dump(model_gridrf,'INX_Future_Inc_Attrition.ml')

# Hike

In [None]:
# Reload the raw Excel (.xls) workbook into a fresh DataFrame for the salary-hike section.
# dt1 starts from the raw layout again, so it still contains every original column.
dt1 = pd.read_excel('INX_Future_Inc_Employee_Performance_CDS_Project2_Data_V1.8.xls')

In [None]:
# Label-encode the categorical columns of dt1 (given by position in the raw layout)
# so they become integer codes.
# Each column is replaced by label instead of written through `iloc`:
# `dt1.iloc[:, i] = ...` writes into the existing object-dtype column on recent
# pandas, which leaves the dtype as object instead of producing an integer column.
enc = LabelEncoder()
for i in (2,3,4,5,6,7,16,26):
    col = dt1.columns[i]
    dt1[col] = enc.fit_transform(dt1[col])
dt1.head()

In [None]:
# Target for this section: the last salary hike percentage.
h_y = dt1.EmpLastSalaryHikePercent
# Earlier experiment, kept for reference: using all predictors (data.iloc[:,0:-1]) lowered the accuracy.
# NOTE(review): hike_acc is only referenced by a commented-out line further down - confirm it is still needed.
hike_acc=80
# Predictors are picked by position from dt1.
# NOTE(review): dt1 still contains EmpNumber (it was dropped only from `data`), so the positions
# 4,5,9,20-24 refer to columns one place earlier than the same indices did in the
# performance-rating section - confirm these are the intended columns.
h_X = dt1.iloc[:,[4,5,9,20,21,22,23,24,27]]
h_X.head()


In [None]:
# Hold out 30% of the rows as a test set for the salary-hike task (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    h_X,
    h_y,
    test_size=0.3,
    random_state=10,
)

In [None]:
# Standardize the features: the scaler learns its statistics from the training
# set only and the same transformation is then applied to both sets.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Inspect the standardized training features (a NumPy array after scaling)
X_train

In [None]:
# Train a logistic-regression classifier on the standardized salary-hike features.
# NOTE(review): the target is EmpLastSalaryHikePercent, so every distinct percentage is
# treated as its own class - confirm classification (rather than regression) is intended.
from sklearn.linear_model import LogisticRegression
model_logr = LogisticRegression()
model_logr.fit(X_train,y_train)

In [None]:
# Predict the salary-hike class for the held-out test set
y_predict_log = model_logr.predict(X_test)

In [None]:
# Overall accuracy followed by the per-class precision / recall / F1 report.
# Disabled experiment kept from the original; classification_report returns text,
# so int(...) would not work as written:
#hike_acc+=int(classification_report(y_test,y_predict_log))
print(accuracy_score(y_test,y_predict_log))
print(classification_report(y_test,y_predict_log))

In [None]:
# Persist the fitted salary-hike logistic-regression model to disk.
# NOTE(review): the StandardScaler `sc` used on the training features is not saved with it -
# confirm that consumers reproduce the same scaling before predicting.
import joblib

joblib.dump(model_logr,'INX_Future_Inc_Hike.ml')