In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file under the Kaggle input directory, then print them one per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the HR attrition CSV from the Kaggle input directory into a DataFrame.
csv_path = "../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv"
employee = pd.read_csv(csv_path)

In [None]:
# Display the loaded DataFrame for a first look at its rows and columns.
employee

In [None]:
# Preview the first rows of the dataset.
employee.head()

In [None]:
# Preview the last rows of the dataset.
employee.tail()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the dataset's columns.
employee.describe()

In [None]:
# Column names, non-null counts and dtypes.
employee.info()

In [None]:
# Encode the 'Attrition', 'OverTime' and 'Over18' columns as integers before any visualization.
# In this dataset 'Attrition' and 'OverTime' hold 'Yes'/'No', while 'Over18' holds 'Y'.
# The earlier code compared 'OverTime' to 'Y' and 'Over18' to 'Yes', which silently turned
# both columns into all zeros (and 'OverTime' is later used as a model feature).
employee['Attrition'] = (employee['Attrition'] == 'Yes').astype(int)
employee['OverTime'] = (employee['OverTime'] == 'Yes').astype(int)
employee['Over18'] = (employee['Over18'] == 'Y').astype(int)

In [None]:
# Re-check the first rows after encoding 'Attrition', 'OverTime' and 'Over18' as integers.
employee.head()

In [None]:
# Count missing values per column.
employee.isnull().sum()

In [None]:
# Draw one 30-bin red histogram per column on a 20x20 inch figure.
hist_options = dict(bins=30, figsize=(20, 20), color='r')
employee.hist(**hist_options)

In [None]:
# Several features, such as 'MonthlyIncome' and 'TotalWorkingYears', are tail-heavy
# It makes sense to drop 'EmployeeCount' and 'StandardHours' since they do not change from one employee to the other

In [None]:
# Drop 'EmployeeCount', 'StandardHours' and 'Over18' since they do not change from one
# employee to the other, and drop 'EmployeeNumber' as well.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell safe to
# re-run: a second execution no longer raises KeyError for the already-removed columns.
employee = employee.drop(
    columns=['EmployeeCount', 'StandardHours', 'Over18', 'EmployeeNumber'],
    errors='ignore',
)

In [None]:
# Preview the frame again after dropping the four columns above.
employee.head()

In [None]:
# Split the employees by the encoded target: 1 = left the company, 0 = stayed.
attrition_flag = employee['Attrition']
left = employee[attrition_flag == 1]
stay = employee[attrition_flag == 0]

In [None]:
# Report how many employees stayed and left, as counts and percentages.
# The two classes are far from equal in size, i.e. the dataset is imbalanced.
n_total = len(employee)
n_left = len(left)
n_stay = len(stay)

print('Total Employees = ', n_total)
print('Number of Employees Who Left = ', n_left)
print('% of Employees Who Left = ', n_left / n_total * 100, '%')

print('Number of Employees Who Stay = ', n_stay)
print('% of Employees Who Stay = ', n_stay / n_total * 100, '%')

In [None]:
# Summary statistics for the employees who left.
left.describe()

In [None]:
# Summary statistics for the employees who stayed.
stay.describe()

In [None]:
# Observations from comparing the mean and std of the employees who stayed and left:
# 'Age': mean age of the employees who stayed is higher compared to those who left
# 'DailyRate': rate of employees who stayed is higher
# 'DistanceFromHome': employees who stayed live closer to work
# 'EnvironmentSatisfaction' & 'JobSatisfaction': employees who stayed are generally more satisfied with their jobs
# 'StockOptionLevel': employees who stayed tend to have a higher stock option level

# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
# The frame still contains string columns ('BusinessTravel', 'Department', ...);
# DataFrame.corr() raises on those in pandas >= 2.0, so select numeric columns explicitly.
correlations = employee.select_dtypes(include = 'number').corr()
f, ax = plt.subplots(figsize = (20,20))
sns.heatmap(correlations, annot = True, ax = ax)

In [None]:
# Job level is strongly correlated with total working years
# Monthly income is strongly correlated with job level
# Monthly income is strongly correlated with total working years
# Age is strongly correlated with monthly income

In [None]:
# Number of employees at each age, split by attrition (0 = stayed, 1 = left).
fig, ax = plt.subplots(figsize=(25, 12))
sns.countplot(x='Age', hue='Attrition', data=employee, ax=ax)

In [None]:
# Attrition counts broken down by four categorical features, one panel per feature.
# The earlier version called plt.subplot(413) twice, so the 'JobLevel' plot collided with
# the 'JobInvolvement' panel and the fourth slot stayed empty; each feature now gets its own axes.
fig, axes = plt.subplots(4, 1, figsize = (20, 20))
panel_columns = ['JobRole', 'MaritalStatus', 'JobInvolvement', 'JobLevel']

for ax, column in zip(axes, panel_columns):
    sns.countplot(x = column, hue = 'Attrition', data = employee, ax = ax)

In [None]:
# Single employees tend to leave more often than married and divorced employees
# Sales Representatives tend to leave more often than any other job role
# Less involved employees tend to leave the company
# Less experienced employees (low job level) tend to leave the company

In [None]:
# KDE (Kernel Density Estimate) is used for visualizing the probability density of a continuous variable.
# KDE describes the probability density at different values in a continuous variable.

# Compare the 'DistanceFromHome' distribution of employees who left vs. stayed.
# `fill` replaces the deprecated `shade` argument of sns.kdeplot, and the explicit
# plt.legend() call makes the two labels actually appear on the figure.
plt.figure(figsize = [15,10])
sns.kdeplot(left['DistanceFromHome'], label = 'Employee who left', fill = True, color = 'r')
sns.kdeplot(stay['DistanceFromHome'], label = 'Employee who stay', fill = True, color = 'b')

plt.legend()
plt.xlabel('Distance from Home')

In [None]:
# Compare the 'TotalWorkingYears' distribution of employees who left vs. stayed.
# `fill` replaces the deprecated `shade` argument of sns.kdeplot, and the explicit
# plt.legend() call makes the two labels actually appear on the figure.
plt.figure(figsize = [15,10])
sns.kdeplot(left['TotalWorkingYears'], label = 'Employee who left', fill = True, color = 'r')
sns.kdeplot(stay['TotalWorkingYears'], label = 'Employee who stay', fill = True, color = 'b')

plt.legend()
plt.xlabel('Total Working Years')

In [None]:
# Compare the 'YearsWithCurrManager' distribution of employees who left vs. stayed.
# `fill` replaces the deprecated `shade` argument of sns.kdeplot, and the explicit
# plt.legend() call makes the two labels actually appear on the figure.
plt.figure(figsize = [15,10])
sns.kdeplot(left['YearsWithCurrManager'], label = 'Employee who left', fill = True, color = 'r')
sns.kdeplot(stay['YearsWithCurrManager'], label = 'Employee who stay', fill = True, color = 'b')

plt.legend()
plt.xlabel('Years With Current Manager')

In [None]:
# Monthly income distribution for each gender.
fig, ax = plt.subplots(figsize=(10, 15))
sns.boxplot(x='Gender', y='MonthlyIncome', data=employee, ax=ax)

In [None]:
# Monthly income for each job role, drawn as horizontal bars.
fig, ax = plt.subplots(figsize=(10, 15))
sns.barplot(x='MonthlyIncome', y='JobRole', data=employee, ax=ax)

# **Training and Testing Dataset**

In [None]:
# Show the first two rows before assembling the feature matrix.
employee.head(2)

In [None]:
# Select the categorical columns that will be one-hot encoded.
categorical_columns = [
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
]
X_cat = employee[categorical_columns]
X_cat

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns and convert the sparse result to a dense array.
onehotencoder = OneHotEncoder()
encoded_sparse = onehotencoder.fit_transform(X_cat)
X_cat = encoded_sparse.toarray()
X_cat

In [None]:
# (rows, one-hot columns) of the encoded categorical matrix.
X_cat.shape

In [None]:
# Wrap the dense one-hot array in a DataFrame (default integer column labels and
# default integer index) so it can be concatenated with the numerical features.
X_cat = pd.DataFrame(X_cat)

In [None]:
# Select the numerical feature columns; the target 'Attrition' is deliberately left out.
numerical_columns = [
    'Age', 'DailyRate', 'DistanceFromHome', 'Education',
    'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel',
    'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
    'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
    'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
    'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
]
X_numerical = employee[numerical_columns]

In [None]:
# Combine the one-hot encoded categorical features with the numerical features column-wise.
X_all = pd.concat([X_cat, X_numerical], axis = 1)
# The one-hot frame carries integer column labels while the numerical frame carries strings.
# Recent scikit-learn versions raise a TypeError on mixed label types, so make them all strings.
X_all.columns = X_all.columns.astype(str)
X_all.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every feature column to the [0, 1] range; the result is a NumPy array.
# NOTE(review): the scaler is fit on the full dataset before the train/test split that
# happens later in the notebook, so test-set min/max values influence the scaling.
# Consider fitting on the training split only.
scaler = MinMaxScaler()
X = scaler.fit_transform(X_all)
X

In [None]:
# Target vector: the integer-encoded 'Attrition' column (1 = left, 0 = stayed).
y = employee['Attrition']
y

# **Train and Evaluate Logistic Regression Classifier**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing.
# random_state makes the split (and every metric below) reproducible across runs;
# stratify=y keeps the same leave/stay ratio in both splits, which matters because
# the target is imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.25, random_state = 42, stratify = y
)

In [None]:
# (rows, features) of the training split.
X_train.shape

In [None]:
# (rows, features) of the test split.
X_test.shape

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Fit a logistic regression classifier with default hyperparameters on the training split.
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
# Predict attrition labels for the held-out test split.
y_pred = model.predict(X_test)

In [None]:
# Testing set performance: inspect the predicted labels for the test split.
y_pred

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Share of test rows whose predicted label matches the true label, as a percentage.
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy {100 * test_accuracy} %')

In [None]:
# Confusion matrix for the logistic regression model on the test split.
# confusion_matrix expects (y_true, y_pred); the arguments were swapped before, which
# transposed the matrix (rows must be true labels, columns predicted labels).
cm = confusion_matrix(y_test, y_pred)
# fmt='d' prints the integer counts in full instead of scientific notation.
sns.heatmap(cm, annot = True, fmt = 'd')

In [None]:
# Per-class precision, recall and F1 for the logistic regression predictions.
print(classification_report(y_test, y_pred))

# **Train and Evaluate A Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split (this rebinds `model`, replacing the
# logistic regression). A fixed random_state makes the forest, and therefore the
# reported metrics, reproducible from run to run.
model = RandomForestClassifier(random_state = 42)
model.fit(X_train, y_train)

In [None]:
# Predict attrition labels for the test split with the random forest and display them.
y_pred = model.predict(X_test)
y_pred

In [None]:
# Testing set performance: confusion matrix for the random forest.
# confusion_matrix expects (y_true, y_pred); the arguments were swapped before, which
# transposed the matrix (rows must be true labels, columns predicted labels).
cm = confusion_matrix(y_test, y_pred)
# fmt='d' prints the integer counts in full instead of scientific notation.
sns.heatmap(cm, annot = True, fmt = 'd')

In [None]:
# Per-class precision, recall and F1 for the random forest predictions.
print(classification_report(y_test, y_pred))

# **TRAIN AND EVALUATE A DEEP LEARNING MODEL**

In [None]:
import tensorflow as tf

# Size the input layer from the training matrix instead of hard-coding 50, so the
# network keeps working if the feature set (e.g. the one-hot categories) changes.
n_features = X_train.shape[1]

# Fully connected binary classifier: three hidden ReLU layers of 500 units each and
# a single sigmoid output unit (this rebinds `model`).
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(units = 500, activation = 'relu', input_shape = (n_features, )))
model.add(tf.keras.layers.Dense(units = 500, activation = 'relu'))
model.add(tf.keras.layers.Dense(units = 500, activation = 'relu'))
model.add(tf.keras.layers.Dense(units = 1, activation = 'sigmoid'))

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Compile with the Adam optimizer and binary cross-entropy loss, tracking accuracy,
# then train for 100 epochs in batches of 50; the returned history object is kept
# for the training-curve plots.
model.compile(optimizer = 'Adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
epochs_hist = model.fit(X_train, y_train, epochs = 100, batch_size = 50)

In [None]:
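# Optional experiment (disabled): oversample the minority class with SMOTE before training.
# Requires `from imblearn.over_sampling import SMOTE`; note that `fit_sample` was renamed
# to `fit_resample` in newer imbalanced-learn releases.
# oversampler = SMOTE(random_state=0)
# smote_train, smote_target = oversampler.fit_resample(X_train, y_train)
# epochs_hist = model.fit(smote_train, smote_target, epochs = 100, batch_size = 50)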

In [None]:
# Predict sigmoid outputs for the test split and threshold them at 0.5 to get boolean labels.
predicted_probabilities = model.predict(X_test)
y_pred = predicted_probabilities > 0.5

In [None]:
# Inspect the thresholded (boolean) predictions for the test split.
y_pred

In [None]:
# Training loss recorded at each epoch by model.fit (title typo "Traing" fixed).
plt.plot(epochs_hist.history['loss'])
plt.title("Model Loss Progress During Training")
plt.xlabel("Epoch")
plt.ylabel("Training Loss")
plt.legend(["Training Loss"])

In [None]:
# Training accuracy recorded at each epoch by model.fit (title typo "Traing" fixed).
plt.plot(epochs_hist.history['accuracy'])
plt.title("Model Accuracy Progress During Training")
plt.xlabel("Epoch")
plt.ylabel("Training Accuracy")
plt.legend(["Training Accuracy"])

In [None]:
# Confusion matrix for the neural network on the test split
# (rows are true labels, columns are predicted labels).
cm = confusion_matrix(y_test, y_pred)
# fmt='d' prints the integer counts in full; the default format switches to
# scientific notation for counts of three or more digits.
sns.heatmap(cm, annot = True, fmt = 'd')

In [None]:
# Per-class precision, recall and F1 for the neural network predictions.
print(classification_report(y_test, y_pred))