# Employee turnover prediction

## Data preparation and data cleaning

In [37]:
import pandas as pd

**Importing data**

In [38]:
# Read the raw churn dataset from the working directory, then display it.
raw_csv_path = "employee_churn_data.csv"
df = pd.read_csv(raw_csv_path)
df

Unnamed: 0,department,promoted,review,projects,salary,tenure,satisfaction,bonus,avg_hrs_month,left
0,operations,0,0.577569,3,low,5.0,0.626759,0,180.866070,no
1,operations,0,0.751900,3,medium,6.0,0.443679,0,182.708149,no
2,support,0,0.722548,3,medium,6.0,0.446823,0,184.416084,no
3,logistics,0,0.675158,4,high,8.0,0.440139,0,188.707545,no
4,sales,0,0.676203,3,high,5.0,0.577607,1,179.821083,no
...,...,...,...,...,...,...,...,...,...,...
9535,operations,0,0.610988,4,medium,8.0,0.543641,0,188.155738,yes
9536,logistics,0,0.746887,3,medium,8.0,0.549048,0,188.176164,yes
9537,operations,0,0.557980,3,low,7.0,0.705425,0,186.531008,yes
9538,IT,0,0.584446,4,medium,8.0,0.607287,1,187.641370,yes


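**Normalizing column names and text values (lower case, spaces replaced with underscores). This dataset is already almost consistent — only values such as `IT` change — but normalizing up front avoids problems caused by inconsistent labels later.**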

In [39]:
# Normalize text so labels compare reliably: lower-case everything and replace
# spaces with underscores, first in the column names and then in text cells.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Detect text columns with is_string_dtype instead of `dtype == 'object'`:
# it matches classic object columns as well as pandas' dedicated string dtypes,
# so a text column is not silently skipped when strings are not stored as object.
categorical_columns = [
    name for name, dtype in df.dtypes.items()
    if pd.api.types.is_string_dtype(dtype)
]

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')
df

Unnamed: 0,department,promoted,review,projects,salary,tenure,satisfaction,bonus,avg_hrs_month,left
0,operations,0,0.577569,3,low,5.0,0.626759,0,180.866070,no
1,operations,0,0.751900,3,medium,6.0,0.443679,0,182.708149,no
2,support,0,0.722548,3,medium,6.0,0.446823,0,184.416084,no
3,logistics,0,0.675158,4,high,8.0,0.440139,0,188.707545,no
4,sales,0,0.676203,3,high,5.0,0.577607,1,179.821083,no
...,...,...,...,...,...,...,...,...,...,...
9535,operations,0,0.610988,4,medium,8.0,0.543641,0,188.155738,yes
9536,logistics,0,0.746887,3,medium,8.0,0.549048,0,188.176164,yes
9537,operations,0,0.557980,3,low,7.0,0.705425,0,186.531008,yes
9538,it,0,0.584446,4,medium,8.0,0.607287,1,187.641370,yes


**Converting target variable to a numeric value**

In [40]:
# Encode the target column as 0/1.
# - The mapping is explicit, so a label other than 'yes'/'no' (or a missing
#   value) raises instead of being silently counted as 0.
# - The numeric guard makes re-running this cell a no-op; without it a second
#   run would compare integers to 'yes' and overwrite every value with 0.
if not pd.api.types.is_numeric_dtype(df['left']):
    left_encoded = df['left'].map({'no': 0, 'yes': 1})
    unmapped = left_encoded.isna()
    if unmapped.any():
        bad_labels = df.loc[unmapped, 'left'].unique().tolist()
        raise ValueError(f"unexpected values in 'left': {bad_labels}")
    df['left'] = left_encoded.astype(int)
df['left'].value_counts()

left
0    6756
1    2784
Name: count, dtype: int64

In [41]:
# Show the encoded target as a one-column DataFrame.
df.loc[:, ['left']]

Unnamed: 0,left
0,0
1,0
2,0
3,0
4,0
...,...
9535,1
9536,1
9537,1
9538,1


**Checking for missing values (no missing values found)**

In [42]:
# Count missing values per column.
df.isna().sum()

department       0
promoted         0
review           0
projects         0
salary           0
tenure           0
satisfaction     0
bonus            0
avg_hrs_month    0
left             0
dtype: int64

**Saving the cleaned DataFrame and reloading it from disk**

In [43]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
df.to_csv(
    path_or_buf='cleaned_employee_churn_data.csv',
    index=False,
)

In [44]:
# Reload the cleaned data from disk, replacing the in-memory frame.
df = pd.read_csv(filepath_or_buffer="cleaned_employee_churn_data.csv")

In [45]:
# Display the frame as reloaded from the cleaned CSV to check the round trip.
df

Unnamed: 0,department,promoted,review,projects,salary,tenure,satisfaction,bonus,avg_hrs_month,left
0,operations,0,0.577569,3,low,5.0,0.626759,0,180.866070,0
1,operations,0,0.751900,3,medium,6.0,0.443679,0,182.708149,0
2,support,0,0.722548,3,medium,6.0,0.446823,0,184.416084,0
3,logistics,0,0.675158,4,high,8.0,0.440139,0,188.707545,0
4,sales,0,0.676203,3,high,5.0,0.577607,1,179.821083,0
...,...,...,...,...,...,...,...,...,...,...
9535,operations,0,0.610988,4,medium,8.0,0.543641,0,188.155738,1
9536,logistics,0,0.746887,3,medium,8.0,0.549048,0,188.176164,1
9537,operations,0,0.557980,3,low,7.0,0.705425,0,186.531008,1
9538,it,0,0.584446,4,medium,8.0,0.607287,1,187.641370,1


In [46]:
# Transposed view: one row per feature, one column per employee record.
df.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9530,9531,9532,9533,9534,9535,9536,9537,9538,9539
department,operations,operations,support,logistics,sales,it,admin,support,sales,sales,...,engineering,retail,engineering,operations,engineering,operations,logistics,operations,it,finance
promoted,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
review,0.577569,0.7519,0.722548,0.675158,0.676203,0.683206,0.620158,0.499567,0.652818,0.642031,...,0.651929,0.771905,0.701959,0.711253,0.639663,0.610988,0.746887,0.55798,0.584446,0.626373
projects,3,3,3,4,3,2,4,4,4,3,...,4,3,4,4,3,4,3,3,4,3
salary,low,medium,medium,high,high,medium,high,medium,low,medium,...,low,high,medium,medium,medium,medium,medium,low,medium,low
tenure,5.0,6.0,6.0,8.0,5.0,5.0,5.0,7.0,6.0,6.0,...,7.0,8.0,8.0,8.0,7.0,8.0,8.0,7.0,8.0,7.0
satisfaction,0.626759,0.443679,0.446823,0.440139,0.577607,0.565252,0.686951,0.720451,0.678696,0.623365,...,0.745873,0.619672,0.645299,0.534439,0.717815,0.543641,0.549048,0.705425,0.607287,0.706455
bonus,0,0,0,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1
avg_hrs_month,180.86607,182.708149,184.416084,188.707545,179.821083,178.841879,181.142956,184.977538,183.65579,181.851,...,187.114221,188.750091,188.448342,189.241601,187.426777,188.155738,188.176164,186.531008,187.64137,185.920934
left,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
