Import all the neccessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Load the dataset into a pandas DataFrame.

In [2]:
df1=pd.read_csv('hr_employee_churn_data.csv')

Check the dataframe and its data

In [3]:
df1.head()

Unnamed: 0,empid,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,left
0,1,0.38,0.53,2,157,3,0,0,low,1
1,2,0.8,0.86,5,262,6,0,0,medium,1
2,3,0.11,0.88,7,272,4,0,0,medium,1
3,4,0.72,0.87,5,223,5,0,0,low,1
4,5,0.37,0.52,2,159,3,0,0,low,1


Checking the dimensions of the DataFrame

In [4]:
df1.shape

(14999, 10)

View the DataFrame's structure, data types, and null value information.

In [5]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   empid                  14999 non-null  int64  
 1   satisfaction_level     14997 non-null  float64
 2   last_evaluation        14999 non-null  float64
 3   number_project         14999 non-null  int64  
 4   average_montly_hours   14999 non-null  int64  
 5   time_spend_company     14999 non-null  int64  
 6   Work_accident          14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   salary                 14999 non-null  object 
 9   left                   14999 non-null  int64  
dtypes: float64(2), int64(7), object(1)
memory usage: 1.1+ MB


##Feature Engineering

Create a copy of the DataFrame for manipulation without affecting the original.

In [6]:
df2=df1.copy()

Remove the 'empid' column from the DataFrame

In [7]:
df2.drop(['empid'],axis=1,inplace=True)

In [8]:
df2.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,left
0,0.38,0.53,2,157,3,0,0,low,1
1,0.8,0.86,5,262,6,0,0,medium,1
2,0.11,0.88,7,272,4,0,0,medium,1
3,0.72,0.87,5,223,5,0,0,low,1
4,0.37,0.52,2,159,3,0,0,low,1


 Check for missing values in the DataFrame

In [9]:
df2.isnull().sum()

Unnamed: 0,0
satisfaction_level,2
last_evaluation,0
number_project,0
average_montly_hours,0
time_spend_company,0
Work_accident,0
promotion_last_5years,0
salary,0
left,0


In [10]:
df2['satisfaction_level'].describe()

Unnamed: 0,satisfaction_level
count,14997.0
mean,0.612863
std,0.248634
min,0.09
25%,0.44
50%,0.64
75%,0.82
max,1.0


Filling missing values in 'satisfaction_level' with the column's mean.

In [11]:
df2['satisfaction_level'].fillna(df2['satisfaction_level'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df2['satisfaction_level'].fillna(df2['satisfaction_level'].mean(), inplace=True)


Checking for any remaining missing values in the DataFrame.

In [12]:
df2.isnull().sum()

Unnamed: 0,0
satisfaction_level,0
last_evaluation,0
number_project,0
average_montly_hours,0
time_spend_company,0
Work_accident,0
promotion_last_5years,0
salary,0
left,0


View the unique values in the 'salary' column.

In [13]:
df2['salary'].unique()

array(['low', 'medium', 'high'], dtype=object)

In [14]:
salary_dummies = pd.get_dummies(df2['salary'],drop_first=True)

In [15]:
salary_dummies

Unnamed: 0,low,medium
0,True,False
1,False,True
2,False,True
3,True,False
4,True,False
...,...,...
14994,True,False
14995,True,False
14996,True,False
14997,True,False


In [16]:
df2=pd.concat([df2,salary_dummies],axis=1)

In [17]:
df2.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,left,low,medium
0,0.38,0.53,2,157,3,0,0,low,1,True,False
1,0.8,0.86,5,262,6,0,0,medium,1,False,True
2,0.11,0.88,7,272,4,0,0,medium,1,False,True
3,0.72,0.87,5,223,5,0,0,low,1,True,False
4,0.37,0.52,2,159,3,0,0,low,1,True,False


Dropping the salary feature

In [18]:
df2.drop(['salary'],axis=1,inplace=True)

In [19]:
df2.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,left,low,medium
0,0.38,0.53,2,157,3,0,0,1,True,False
1,0.8,0.86,5,262,6,0,0,1,False,True
2,0.11,0.88,7,272,4,0,0,1,False,True
3,0.72,0.87,5,223,5,0,0,1,True,False
4,0.37,0.52,2,159,3,0,0,1,True,False


Splitting the dataset into features and labels

In [20]:
X= df2.drop(labels='left',axis=1)
y= df2['left']

Splitting dataset into train and test set

In [21]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=0)

In [22]:
len(X_train)

11999

In [23]:
len(X_test)

3000