## Import Libraries & Modules

In [170]:
import sys
import os

# Add the path to the utils directory
sys.path.append(os.path.abspath('../utils'))

In [171]:
import pandas as pd
import numpy as np
from data_prep_utils import save_prep_df, drop_selected_cols, display_category_summary, turn_object_into_category

## Load Data Set

In [173]:
df1 = pd.read_csv(r"../data/train.csv")
df2 = pd.read_csv(r"../data/test.csv")

In [174]:
# Combine train & test data sets
df = pd.concat([df1, df2], ignore_index=True)

In [175]:
df.head()

Unnamed: 0,Employee ID,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,...,Number of Dependents,Job Level,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition
0,8410,31,Male,19,Education,5390,Excellent,Medium,Average,2,...,0,Mid,Medium,89,No,No,No,Excellent,Medium,Stayed
1,64756,59,Female,4,Media,5534,Poor,High,Low,3,...,3,Mid,Medium,21,No,No,No,Fair,Low,Stayed
2,30257,24,Female,10,Healthcare,8159,Good,High,Low,0,...,3,Mid,Medium,74,No,No,No,Poor,Low,Stayed
3,65791,36,Female,7,Education,3989,Good,High,High,1,...,2,Mid,Small,50,Yes,No,No,Good,Medium,Stayed
4,65026,56,Male,41,Education,4821,Fair,Very High,Average,0,...,0,Senior,Medium,68,No,No,No,Fair,Medium,Stayed


In [176]:
# Chack if there is duplicated rows
(len(df[df.duplicated()]))

0

In [177]:
# Chack if there is NaN values
sum(df.isna().sum())

0

In [219]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74498 entries, 0 to 74497
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Gender                    74498 non-null  category
 1   Years at Company          74498 non-null  int64   
 2   Job Role                  74498 non-null  category
 3   Monthly Income            74498 non-null  int64   
 4   Work-Life Balance         74498 non-null  category
 5   Job Satisfaction          74498 non-null  category
 6   Performance Rating        74498 non-null  category
 7   Number of Promotions      74498 non-null  int64   
 8   Overtime                  74498 non-null  category
 9   Distance from Home        74498 non-null  int64   
 10  Education Level           74498 non-null  category
 11  Marital Status            74498 non-null  category
 12  Number of Dependents      74498 non-null  int64   
 13  Job Level                 74498 non-null  cate

In [179]:
# Make all the Object type features into Category type
df = turn_object_into_category(df)

In [180]:
# Chack if there is a need to reduce categories
display_category_summary(df)

Unnamed: 0_level_0,Unique Values,Categories
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1
Gender,2,"[Male, Female]"
Job Role,5,"[Education, Media, Healthcare, Technology, Fin..."
Work-Life Balance,4,"[Excellent, Poor, Good, Fair]"
Job Satisfaction,4,"[Medium, High, Very High, Low]"
Performance Rating,4,"[Average, Low, High, Below Average]"
Overtime,2,"[No, Yes]"
Education Level,5,"[Associate Degree, Master’s Degree, Bachelor’s..."
Marital Status,3,"[Married, Divorced, Single]"
Job Level,3,"[Mid, Senior, Entry]"
Company Size,3,"[Medium, Small, Large]"


## Remove panctuation

In [182]:
df['Education Level'] = df['Education Level'].str.replace("’", "").astype("category")

In [183]:
df['Education Level'].value_counts()

Education Level
Bachelors Degree    22331
Associate Degree    18649
Masters Degree      15021
High School         14680
PhD                  3817
Name: count, dtype: int64

## Reduce Age Features

In [185]:
# Calculate the age when the employee started at the company
df['Start Age'] = df['Age'] - df['Years at Company']

df['Age Group'] = pd.cut(df.Age, bins=[18, 23, 30, 40, 50, 60], labels=['18-23', '23-30', '30-40', '40-50', '>50'], right=False)
print(df['Age Group'].value_counts(), end='\n\n')

df['Start Age Group'] = pd.cut(df.Age, bins=[8, 18, 23, 30, 40, 60], labels=['8-18', '18-23', '23-30', '30-40', '>40'], right=False)
print(df['Start Age Group'].value_counts())

Age Group
30-40    17980
40-50    17873
>50      17633
23-30    12180
18-23     8832
Name: count, dtype: int64

Start Age Group
>40      35506
30-40    17980
23-30    12180
18-23     8832
8-18         0
Name: count, dtype: int64


## Drop Unnecessary Features

In [187]:
# Strange values
df['Company Tenure'].value_counts()

Company Tenure
65     994
43     993
55     986
50     974
52     974
      ... 
123     13
125     12
126      7
127      3
128      1
Name: count, Length: 127, dtype: int64

In [188]:
# More than 50% of the employees have a number of years in the industry that pass their age (doesn't make sense)
len(df[df['Company Tenure'] > df['Age']])

52902

In [189]:
cols_to_drop = ['Company Tenure', 'Employee ID', 'Age', 'Start Age']
df = drop_selected_cols(df, cols_to_drop)

## Prepered data set

In [191]:
df.select_dtypes(['int64', 'int32', 'float64']).head()

Unnamed: 0,Years at Company,Monthly Income,Number of Promotions,Distance from Home,Number of Dependents
0,19,5390,2,22,0
1,4,5534,3,21,3
2,10,8159,0,11,3
3,7,3989,1,27,2
4,41,4821,0,71,0


In [192]:
df.select_dtypes(['category']).head()

Unnamed: 0,Gender,Job Role,Work-Life Balance,Job Satisfaction,Performance Rating,Overtime,Education Level,Marital Status,Job Level,Company Size,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition,Age Group,Start Age Group
0,Male,Education,Excellent,Medium,Average,No,Associate Degree,Married,Mid,Medium,No,No,No,Excellent,Medium,Stayed,30-40,30-40
1,Female,Media,Poor,High,Low,No,Masters Degree,Divorced,Mid,Medium,No,No,No,Fair,Low,Stayed,>50,>40
2,Female,Healthcare,Good,High,Low,No,Bachelors Degree,Married,Mid,Medium,No,No,No,Poor,Low,Stayed,23-30,23-30
3,Female,Education,Good,High,High,No,High School,Single,Mid,Small,Yes,No,No,Good,Medium,Stayed,30-40,30-40
4,Male,Education,Fair,Very High,Average,Yes,High School,Divorced,Senior,Medium,No,No,No,Fair,Medium,Stayed,>50,>40


In [193]:
df.describe().round(2)

Unnamed: 0,Years at Company,Monthly Income,Number of Promotions,Distance from Home,Number of Dependents
count,74498.0,74498.0,74498.0,74498.0,74498.0
mean,15.72,7299.38,0.83,49.99,1.65
std,11.22,2152.51,1.0,28.51,1.55
min,1.0,1226.0,0.0,1.0,0.0
25%,7.0,5652.0,0.0,25.0,0.0
50%,13.0,7348.0,1.0,50.0,1.0
75%,23.0,8876.0,2.0,75.0,3.0
max,51.0,16149.0,4.0,99.0,6.0


In [194]:
df.shape

(74498, 23)

## Save as Pickel

In [196]:
folder = "data"
file_name = "processed_employee_data"

save_prep_df(df, folder, file_name)

File saved as: ../data/processed_employee_data_20250305.pkl
