In [None]:
### Predicting why employees are leaving the company, and learning to predict who will leave

# Employee Analysis
# Data loading and understanding the features
# Exploratory data analysis and Data visualization
# Cluster Analysis
# Building prediction model using Gradient Boosting Tree
# Evaluating model performance
# Conclusion

In [None]:
## Exploratory Analysis

# Summarize characteristics of the data such as patterns, trends, and outliers,
# and test hypotheses using descriptive statistics and visualization.

# Import modules and load the data file.
import pandas  # for dataframes
import matplotlib.pyplot as plt # for plotting graphs
import seaborn as sns # for plotting graphs
# Read the HR dataset from the working directory and preview its first rows.
data=pandas.read_csv('HR_comma_sep.csv')
data.head()



In [None]:
# Preview the last rows of the raw data (tail() shows 5 rows by default).
data.tail()

In [None]:
# Attribute names, non-null counts, and dtypes via info().
data.info()


In [None]:
# Dataset overview: 14,999 samples and 10 attributes
# (6 integer, 2 float, and 2 object columns).
# No column has null/missing values.

# Print the column names, then preview the first rows.
col_names = list(data.columns)
print("Column names:", col_names, sep="\n")
print("\nSample data:")
data.head()


In [None]:
# Rename the "sales" column to "department".
# rename() returns a new frame, so `data` keeps its original column names.
hr = data.rename(columns={'sales': 'department'})


In [None]:
# Show the dtype of every column in the renamed frame.
hr.dtypes


In [None]:
# Check for missing values: one boolean per column,
# True if that column contains at least one null.
hr.isnull().any()


In [None]:
# Number of records and features.
# Only a cell's last expression is displayed automatically, so the shape is
# printed explicitly; otherwise its value would be silently discarded.
print(hr.shape)
# Distinct department names present in the data.
hr['department'].unique()


In [None]:
# Combine "technical", "support" and "IT" into a single "technical" department.
import numpy as np

# Relabel both source departments in one pass; every other value is kept as is.
is_merged_dept = hr['department'].isin(['support', 'IT'])
hr['department'] = np.where(is_merged_dept, 'technical', hr['department'])

In [None]:
# Data Insights

# Group the raw data by the `left` flag.
left = data.groupby('left')
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises TypeError on the object (string) columns.
# display() (provided by IPython in notebooks) is needed because only the
# last expression of a cell is rendered automatically.
display(left.mean(numeric_only=True))
# Number of rows for each value of `left`.
hr['left'].value_counts()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

In [None]:
# data visualization

import pandas as pd
%matplotlib inline                   
#plot a bar graph using Matplotlib 
import matplotlib.pyplot as plt
pd.crosstab(hr.department,hr.left).plot(kind='bar')
plt.title('Turnover Frequency for Department')
plt.xlabel('Department')
plt.ylabel('Frequency of Turnover')
plt.savefig('department_bar_chart')

In [None]:
# Counts of each `left` value per salary level.
table = pd.crosstab(hr.salary, hr.left)
# Divide every row by its total so the bars show proportions, not counts.
row_totals = table.sum(axis=1).astype(float)
salary_ax = table.div(row_totals, axis=0).plot(kind='bar', stacked=True)
salary_ax.set_title('Stacked Bar Chart of Salary Level vs Turnover')
salary_ax.set_xlabel('Salary Level')
salary_ax.set_ylabel('Proportion of Employees')
plt.savefig('salary_bar_chart')


In [None]:
# Non-null counts per column for each value of `left`.
left_count = data.groupby('left').count()
# Bar heights use the satisfaction_level count as the group size.
left_values = left_count.index.values
left_sizes = left_count['satisfaction_level']
plt.bar(left_values, left_sizes)
plt.xlabel('Employees Left Company')
plt.ylabel('Number of Employees')
plt.show()

# The same counts as a table.
data['left'].value_counts()



In [None]:
# Number of employees for each project count.
num_projects = data.groupby('number_project').count()
project_values = num_projects.index.values
project_sizes = num_projects['satisfaction_level']
plt.bar(project_values, project_sizes)
plt.xlabel('Number of Projects')
plt.ylabel('Number of Employees')
plt.show()


In [None]:
# Number of employees for each tenure (years spent in the company).
time_spent = data.groupby('time_spend_company').count()
tenure_values = time_spent.index.values
tenure_sizes = time_spent['satisfaction_level']
plt.bar(tenure_values, tenure_sizes)
plt.xlabel('Number of Years Spend in Company')
plt.ylabel('Number of Employees')
plt.show()


In [None]:
# Draw one histogram per column of `hr` on a single large figure,
# save it to disk, then show it.
num_bins = 10
hr.hist(figsize=(20, 15), bins=num_bins)
plt.savefig("hr_histogram_plots")
plt.show()


In [None]:
# Plots using seaborn: one count plot per feature.
features=['number_project','time_spend_company','Work_accident','left', 'promotion_last_5years', 'department','salary']

# Build the 4x2 grid up front. The previous `plt.subplots(figsize=...)` call
# created one full-figure Axes that current matplotlib no longer removes when
# `plt.subplot` draws over it, leaving a stray frame behind the grid.
fig, axes = plt.subplots(4, 2, figsize=(10, 15))
fig.subplots_adjust(hspace=1.0)  # vertical room for the rotated tick labels

for ax, feature in zip(axes.flat, features):
    sns.countplot(x=feature, data=hr, ax=ax)
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_title("No. of employee")

# Hide grid cells that have no feature to show.
for unused_ax in axes.flat[len(features):]:
    unused_ax.set_visible(False)


In [None]:
# Same count plots as before, split by whether the employee left (hue='left').
# The grid is built up front: a bare `plt.subplots(figsize=...)` would add a
# full-figure Axes that current matplotlib keeps behind the `plt.subplot` grid.
fig, axes = plt.subplots(4, 2, figsize=(10, 15))
fig.subplots_adjust(hspace=1.0)  # vertical room for the rotated tick labels

for ax, feature in zip(axes.flat, features):
    sns.countplot(x=feature, data=hr, hue='left', ax=ax)
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_title("No. of employee")

# Hide grid cells that have no feature to show.
for unused_ax in axes.flat[len(features):]:
    unused_ax.set_visible(False)


Data Analysis and Visualization Summary

Promotions: Employees are far more likely to quit their job if they haven't received a promotion in the last 5 years.

Time with Company: The three-year mark looks like a crucial point in an employee's career; most of those who quit do so around the three-year mark. Another important point is the six-year mark, at which an employee is very unlikely to leave.

Number of Projects: Employee engagement is another critical factor influencing whether an employee leaves the company. Employees with 3-5 projects are less likely to leave the company, while employees with fewer or more projects are more likely to leave.

Salary: Most of the employees who quit are in the medium or low salary groups.


In [None]:
# Cluster Analysis: based on satisfaction and performance

# Import module
from sklearn.cluster import KMeans

# Keep only the employees who left, restricted to the two clustering features.
# .loc with a single indexer plus .copy() yields an independent frame, so
# adding the "label" column below does not raise SettingWithCopyWarning.
left_emp = data.loc[data.left == 1, ['satisfaction_level', 'last_evaluation']].copy()

# Create groups using K-means clustering.
# n_init is set explicitly because scikit-learn changed its default
# (10 -> 'auto'); pinning it keeps the clustering stable across versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(left_emp)

# Add new column "label" and assign cluster labels.
left_emp['label'] = kmeans.labels_

# Draw scatter plot, colored by cluster label.
plt.scatter(left_emp['satisfaction_level'], left_emp['last_evaluation'], c=left_emp['label'], cmap='Accent')
plt.xlabel('Satisfaction Level')
plt.ylabel('Last Evaluation')
plt.title('3 Clusters of employees who left')
plt.show()



High Satisfaction and High Evaluation(Shaded by green color in the graph). Winners.

Low Satisfaction and High Evaluation(Shaded by blue color), Frustrated.

Moderate Satisfaction and moderate Evaluation (Shaded by grey color in the graph), 'Bad match'.


In [None]:
# Label encoding of the categorical columns using scikit-learn.
from sklearn import preprocessing

# A single encoder is refitted per column; after the loop it holds the
# classes of the last column processed ('department').
le = preprocessing.LabelEncoder()
for column in ('salary', 'department'):
    # Convert string labels into integer codes.
    hr[column] = le.fit_transform(hr[column])



In [None]:
# Creating dummy variables for the categorical variables

cat_vars = ['department', 'salary']

# One-hot encode the categorical columns. With `columns=`, get_dummies replaces
# each listed column by its indicator columns (prefixed with the column name),
# so the originals are removed by name instead of by fragile positional index.
hr = pd.get_dummies(hr, columns=cat_vars)

# Column-name lists: `y` names the target, `X` names every other column.
hr_vars = hr.columns.values.tolist()
y = ['left']
X = [i for i in hr_vars if i not in y]


In [None]:
# Display the target specification; at this point `y` is the list ['left'].
y

In [None]:
# Splitting data into features and target, then into train and test sets.
from sklearn.model_selection import train_test_split

# Target is the `left` column; features are every remaining column.
y = hr['left']
X = hr.drop(columns=['left'])

# 70% training and 30% test, with a fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)



In [None]:
# Report the (features, target) shapes of the training split, then the test split.
for features_part, target_part in ((X_train, y_train), (X_test, y_test)):
    print(features_part.shape, target_part.shape)


In [None]:
# Fit a model
from sklearn import datasets, linear_model

# NOTE(review): `left` looks like a 0/1 label, so ordinary least squares
# yields continuous scores rather than class predictions, and the notebook
# header announces a Gradient Boosting model. Confirm whether a classifier
# was intended here.
lm = linear_model.LinearRegression()
lm.fit(X_train, y_train)
model = lm  # fit() returns the estimator itself, so both names share one object
predictions = model.predict(X_test)

# Show the first 5 predictions
predictions[:5]


In [None]:
# Plot the model: predicted values against the true test labels.
from matplotlib import pyplot as plt

scatter_ax = plt.gca()  # current Axes (created on demand)
scatter_ax.scatter(y_test, predictions)
scatter_ax.set_xlabel("True Values")
scatter_ax.set_ylabel("Predictions")
plt.show()


In [None]:
# Score of the fitted model on the held-out test set. For a scikit-learn
# LinearRegression, score() is the coefficient of determination (R^2).
print("Score:", model.score(X_test, y_test))


In [None]:
# K-folds Cross Validation (toy example)

from sklearn.model_selection import KFold # import KFold

# NOTE(review): X and y are rebound to small toy arrays here, replacing the
# HR feature matrix and target defined earlier in the notebook.
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) # create an array
y = np.array([1, 2, 3, 4]) # create another array
kf = KFold(n_splits=2) # define the split - into 2 folds
print(kf)
# Number of splitting iterations in the cross-validator. Kept as the last
# expression so the value is displayed; the stray pasted
# `KFold(n_splits=2, ...)` output line that used to follow has been removed.
kf.get_n_splits(X)


In [None]:
# Walk through every fold, printing the row indices assigned to each side.
# NOTE(review): X_train/X_test/y_train/y_test are rebound on every iteration,
# replacing the train/test split created earlier in the notebook.
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

