## Import Libraries & Modules

In [1]:
import sys
import os

# Add the path to the utils directory
sys.path.append(os.path.abspath('../utils'))

In [3]:
import pandas as pd
import numpy as np
from data_prep_utils import save_prep_df, drop_selected_cols, display_category_summary

## Load Data Set

In [6]:
df1 = pd.read_csv(r"../data/train.csv")
df2 = pd.read_csv(r"../data/test.csv")

In [7]:
# Combine train & test data sets
df = pd.concat([df1, df2], ignore_index=True)

In [10]:
df.head()

Unnamed: 0,Employee ID,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,...,Number of Dependents,Job Level,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition
0,8410,31,Male,19,Education,5390,Excellent,Medium,Average,2,...,0,Mid,Medium,89,No,No,No,Excellent,Medium,Stayed
1,64756,59,Female,4,Media,5534,Poor,High,Low,3,...,3,Mid,Medium,21,No,No,No,Fair,Low,Stayed
2,30257,24,Female,10,Healthcare,8159,Good,High,Low,0,...,3,Mid,Medium,74,No,No,No,Poor,Low,Stayed
3,65791,36,Female,7,Education,3989,Good,High,High,1,...,2,Mid,Small,50,Yes,No,No,Good,Medium,Stayed
4,65026,56,Male,41,Education,4821,Fair,Very High,Average,0,...,0,Senior,Medium,68,No,No,No,Fair,Medium,Stayed


In [12]:
# Chack if there is duplicated rows
(len(df[df.duplicated()]))

0

In [14]:
# Chack if there is NaN values
sum(df.isna().sum())

0

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74498 entries, 0 to 74497
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Employee ID               74498 non-null  int64 
 1   Age                       74498 non-null  int64 
 2   Gender                    74498 non-null  object
 3   Years at Company          74498 non-null  int64 
 4   Job Role                  74498 non-null  object
 5   Monthly Income            74498 non-null  int64 
 6   Work-Life Balance         74498 non-null  object
 7   Job Satisfaction          74498 non-null  object
 8   Performance Rating        74498 non-null  object
 9   Number of Promotions      74498 non-null  int64 
 10  Overtime                  74498 non-null  object
 11  Distance from Home        74498 non-null  int64 
 12  Education Level           74498 non-null  object
 13  Marital Status            74498 non-null  object
 14  Number of Dependents  

In [18]:
# Make all the Object type features to Category type
for col in df.select_dtypes(['object']):
    df[col] = df[col].astype('category')

In [20]:
# Chack if there is a need to reduce categories
display_category_summary(df)

Unnamed: 0_level_0,Unique Values,Categories
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1
Gender,2,"[Male, Female]"
Job Role,5,"[Education, Media, Healthcare, Technology, Fin..."
Work-Life Balance,4,"[Excellent, Poor, Good, Fair]"
Job Satisfaction,4,"[Medium, High, Very High, Low]"
Performance Rating,4,"[Average, Low, High, Below Average]"
Overtime,2,"[No, Yes]"
Education Level,5,"[Associate Degree, Master’s Degree, Bachelor’s..."
Marital Status,3,"[Married, Divorced, Single]"
Job Level,3,"[Mid, Senior, Entry]"
Company Size,3,"[Medium, Small, Large]"


## Remove panctuation

In [14]:
df['Education Level'] = df['Education Level'].str.replace("'", "")

## Feature Engineering

In [16]:
# Calculate the annual income from the monthly income
df['Annual Income'] = df['Monthly Income'] * 12

# Calculate the age when the employee started at the company
df['Start Age'] = df['Age'] - df['Years at Company']

# Chack if the employee has worked at the company for at least 10 years
df['At Least Decade'] = df['Years at Company'] >= 10

# Assuming each promotion results in a 10% increase in monthly income
promotion_factor = 0.10
df['Total Compensation'] = df['Monthly Income'] * (1 + df['Number of Promotions'] * promotion_factor)

# One-hot Encoding for some features
df = pd.get_dummies(data=df, columns=['Gender', 'Marital Status', 'Company Size'])

# The average time (years) for promotion (Assuming each promotion gets after an equal number of years)
df['avg time for promotion'] = np.where(df['Number of Promotions'] > 0,
                                        df['Years at Company'] // df['Number of Promotions'],
                                        0)

# Check if the employee has people to take care of
df['Has Dependents'] = (df['Number of Dependents'] > 0).astype(int)

# Convert miles to kilometers
df['Distance from Home'] = df['Distance from Home'] * 1.609344 

## Reduce Age Features

In [18]:
df['Age Group'] = pd.cut(df.Age, bins=[18, 23, 30, 40, 50, 60], labels=['18-23', '23-30', '30-40', '40-50', '>50'], right=False)
print(df['Age Group'].value_counts(), end='\n\n')

df['Start Age Group'] = pd.cut(df.Age, bins=[8, 18, 23, 30, 40, 60], labels=['8-18', '18-23', '23-30', '30-40', '>40'], right=False)
print(df['Start Age Group'].value_counts())

Age Group
30-40    17980
40-50    17873
>50      17633
23-30    12180
18-23     8832
Name: count, dtype: int64

Start Age Group
>40      35506
30-40    17980
23-30    12180
18-23     8832
8-18         0
Name: count, dtype: int64


## Drop Unnecessary Features

In [20]:
# Strange values
df['Company Tenure'].value_counts()

Company Tenure
65     994
43     993
55     986
50     974
52     974
      ... 
123     13
125     12
126      7
127      3
128      1
Name: count, Length: 127, dtype: int64

In [21]:
# ~10% of the employees have a number of years in the industry that pass their age (doesn't make sense)
len(df[df['Company Tenure'] > df['Age']])

52902

In [22]:
cols_to_drop = ['Company Tenure', 'Employee ID', 'Age', 'Start Age']
df = drop_selected_cols(df, cols_to_drop)

## Prepered data set

In [24]:
df.select_dtypes(['int64', 'float64']).head()

Unnamed: 0,Years at Company,Monthly Income,Number of Promotions,Distance from Home,Number of Dependents,Annual Income,Total Compensation,avg time for promotion
0,19,5390,2,35.405568,0,64680,6468.0,9.0
1,4,5534,3,33.796224,3,66408,7194.2,1.0
2,10,8159,0,17.702784,3,97908,8159.0,0.0
3,7,3989,1,43.452288,2,47868,4387.9,7.0
4,41,4821,0,114.263424,0,57852,4821.0,0.0


In [25]:
df.select_dtypes(['category']).head()

Unnamed: 0,Job Role,Work-Life Balance,Job Satisfaction,Performance Rating,Overtime,Job Level,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition,Age Group,Start Age Group
0,Education,Excellent,Medium,Average,No,Mid,No,No,No,Excellent,Medium,Stayed,30-40,30-40
1,Media,Poor,High,Low,No,Mid,No,No,No,Fair,Low,Stayed,>50,>40
2,Healthcare,Good,High,Low,No,Mid,No,No,No,Poor,Low,Stayed,23-30,23-30
3,Education,Good,High,High,No,Mid,Yes,No,No,Good,Medium,Stayed,30-40,30-40
4,Education,Fair,Very High,Average,Yes,Senior,No,No,No,Fair,Medium,Stayed,>50,>40


In [26]:
df.describe().round(2)

Unnamed: 0,Years at Company,Monthly Income,Number of Promotions,Distance from Home,Number of Dependents,Annual Income,Total Compensation,avg time for promotion,Has Dependents
count,74498.0,74498.0,74498.0,74498.0,74498.0,74498.0,74498.0,74498.0,74498.0
mean,15.72,7299.38,0.83,80.45,1.65,87592.55,7908.58,5.64,0.7
std,11.22,2152.51,1.0,45.89,1.55,25830.1,2456.71,8.95,0.46
min,1.0,1226.0,0.0,1.61,0.0,14712.0,1226.0,0.0,0.0
25%,7.0,5652.0,0.0,40.23,0.0,67824.0,6007.2,0.0,0.0
50%,13.0,7348.0,1.0,80.47,1.0,88176.0,7859.65,0.0,1.0
75%,23.0,8876.0,2.0,120.7,3.0,106512.0,9620.0,8.0,1.0
max,51.0,16149.0,4.0,159.33,6.0,193788.0,21088.2,51.0,1.0


In [27]:
df.shape

(74498, 33)

## Save as Pickel

In [29]:
folder = "data"
file_name = "processed_employee_data"

save_prep_df(df, folder, file_name)

File saved as: ../data/processed_employee_data_20250305.pkl
