## Import Libraries & Modules

In [363]:
import pandas as pd
import numpy as np

## Load Data Set

In [365]:
# Read the train and test splits from the local Data directory.
# df1 holds the train split and df2 the test split.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (r"./Data/train.csv", r"./Data/test.csv")
)

In [366]:
# Stack the train and test splits row-wise into a single frame.
# ignore_index=True discards the per-file row labels and renumbers from 0.
df = pd.concat(objs=[df1, df2], axis=0, ignore_index=True)

In [367]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,Employee ID,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,...,Number of Dependents,Job Level,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition
0,8410,31,Male,19,Education,5390,Excellent,Medium,Average,2,...,0,Mid,Medium,89,No,No,No,Excellent,Medium,Stayed
1,64756,59,Female,4,Media,5534,Poor,High,Low,3,...,3,Mid,Medium,21,No,No,No,Fair,Low,Stayed
2,30257,24,Female,10,Healthcare,8159,Good,High,Low,0,...,3,Mid,Medium,74,No,No,No,Poor,Low,Stayed
3,65791,36,Female,7,Education,3989,Good,High,High,1,...,2,Mid,Small,50,Yes,No,No,Good,Medium,Stayed
4,65026,56,Male,41,Education,4821,Fair,Very High,Average,0,...,0,Senior,Medium,68,No,No,No,Fair,Medium,Stayed


In [368]:
# Check whether there are duplicated rows: count the rows flagged as a repeat
# of an earlier row.
int(df.duplicated().sum())

0

In [369]:
# Check whether there are NaN values: per-column missing counts, summed over
# all columns to give the total number of missing cells.
df.isna().sum().sum()

0

In [370]:
# Summarise column dtypes and non-null counts for the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74498 entries, 0 to 74497
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Employee ID               74498 non-null  int64 
 1   Age                       74498 non-null  int64 
 2   Gender                    74498 non-null  object
 3   Years at Company          74498 non-null  int64 
 4   Job Role                  74498 non-null  object
 5   Monthly Income            74498 non-null  int64 
 6   Work-Life Balance         74498 non-null  object
 7   Job Satisfaction          74498 non-null  object
 8   Performance Rating        74498 non-null  object
 9   Number of Promotions      74498 non-null  int64 
 10  Overtime                  74498 non-null  object
 11  Distance from Home        74498 non-null  int64 
 12  Education Level           74498 non-null  object
 13  Marital Status            74498 non-null  object
 14  Number of Dependents  

In [371]:
# Convert every object-dtype column to the category dtype in a single cast.
object_columns = df.select_dtypes(['object']).columns
df = df.astype({name: 'category' for name in object_columns})

In [372]:
# Check whether any categorical feature needs its categories reduced:
# for each one, list how many distinct values it has and what they are.
categorical_columns = df.select_dtypes(['category']).columns

cat_summary_df = pd.DataFrame(
    {
        "Unique Values": [df[name].nunique() for name in categorical_columns],
        "Categories": [df[name].unique().tolist() for name in categorical_columns],
    },
    # One row per categorical feature, labelled by the feature name.
    index=pd.Index(categorical_columns, name='Feature'),
)

cat_summary_df

Unnamed: 0_level_0,Unique Values,Categories
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1
Gender,2,"[Male, Female]"
Job Role,5,"[Education, Media, Healthcare, Technology, Fin..."
Work-Life Balance,4,"[Excellent, Poor, Good, Fair]"
Job Satisfaction,4,"[Medium, High, Very High, Low]"
Performance Rating,4,"[Average, Low, High, Below Average]"
Overtime,2,"[No, Yes]"
Education Level,5,"[Associate Degree, Master’s Degree, Bachelor’s..."
Marital Status,3,"[Married, Divorced, Single]"
Job Level,3,"[Mid, Senior, Entry]"
Company Size,3,"[Medium, Small, Large]"


## Feature Engineering

In [374]:
# NOTE: this cell rewrites `df` in place of its inputs: the one-hot step removes
# 'Gender' and 'Marital Status', and 'Distance from Home' is overwritten with the
# converted values. Re-running it on the already-transformed frame will not give
# the same result, so re-run from the load cells instead.

# Assuming each promotion results in a 10% increase in monthly income
promotion_factor = 0.10

# Features derived directly from the raw columns (added in this order):
#   Annual Income      - monthly income scaled to twelve months
#   Start Age          - age when the employee started at the company
#   At Least Decade    - whether the employee has been at the company 10+ years
#   Total Compensation - monthly income raised by `promotion_factor` per promotion
df = df.assign(**{
    'Annual Income': df['Monthly Income'] * 12,
    'Start Age': df['Age'] - df['Years at Company'],
    'At Least Decade': df['Years at Company'] >= 10,
    'Total Compensation': df['Monthly Income'] * (1 + df['Number of Promotions'] * promotion_factor),
})

# One-hot encode the nominal features; the source columns are replaced by
# their indicator columns, which are appended at the end of the frame.
df = pd.get_dummies(df, columns=['Gender', 'Marital Status'])

# Average number of whole years per promotion (assuming promotions are evenly
# spaced). Employees with no promotions get 0 rather than a division result.
promotions = df['Number of Promotions']
years_per_promotion = df['Years at Company'] // promotions
df['avg time for promotion'] = np.where(promotions > 0, years_per_promotion, 0)

# 1 if the employee has at least one dependent to take care of, else 0
df['Has Dependents'] = df['Number of Dependents'].gt(0).astype(int)

# Convert miles to kilometers (overwrites the original column)
df['Distance from Home'] = df['Distance from Home'].mul(1.609344)

## Remove Punctuation

In [376]:
# Strip apostrophes from the education labels.
# The labels in this data set use the typographic apostrophe (U+2019, as in
# "Master’s Degree"), so replacing only the ASCII "'" leaves them unchanged.
# The character class removes both forms.
# NOTE: the .str accessor returns plain strings, so this column no longer has
# the category dtype after this step; re-cast it if later cells rely on that.
df['Education Level'] = df['Education Level'].str.replace(r"['’]", "", regex=True)

In [377]:
# Preview only the int64/float64 columns, including the newly engineered ones.
df.select_dtypes(['int64', 'float64']).head()

Unnamed: 0,Employee ID,Age,Years at Company,Monthly Income,Number of Promotions,Distance from Home,Number of Dependents,Company Tenure,Annual Income,Start Age,Total Compensation,avg time for promotion
0,8410,31,19,5390,2,35.2,0,89,64680,12,6468.0,9.0
1,64756,59,4,5534,3,33.6,3,21,66408,55,7194.2,1.0
2,30257,24,10,8159,0,17.6,3,74,97908,14,8159.0,0.0
3,65791,36,7,3989,1,43.2,2,50,47868,29,4387.9,7.0
4,65026,56,41,4821,0,113.6,0,68,57852,15,4821.0,0.0


In [378]:
# Inspect the distribution of 'Company Tenure'. The recorded output shows values
# up to 128, which look implausible if the column is meant to be in years.
df['Company Tenure'].value_counts()

Company Tenure
65     994
43     993
55     986
50     974
52     974
      ... 
123     13
125     12
126      7
127      3
128      1
Name: count, Length: 127, dtype: int64

In [379]:
# Count the employees whose 'Company Tenure' exceeds their 'Age'.
# In the recorded run this is 52,902 of 74,498 rows (~71%), i.e. the majority
# of the data, which does not make sense if the column is a number of years.
int((df['Company Tenure'] > df['Age']).sum())

52902

## Drop Unnecessary Features

In [381]:
# 'Company Tenure' holds values that exceed the employee's age for many rows;
# 'Employee ID' is presumably just a row identifier (confirm before modelling).
cols_to_drop = ['Company Tenure', 'Employee ID']

# Remove all listed columns in a single call.
df = df.drop(columns=cols_to_drop)

In [382]:
# Spot-check a single row (positional index 3) after dropping the columns.
df.iloc[3]

Age                                  36
Years at Company                      7
Job Role                      Education
Monthly Income                     3989
Work-Life Balance                  Good
Job Satisfaction                   High
Performance Rating                 High
Number of Promotions                  1
Overtime                             No
Distance from Home                 43.2
Education Level             High School
Number of Dependents                  2
Job Level                           Mid
Company Size                      Small
Remote Work                         Yes
Leadership Opportunities             No
Innovation Opportunities             No
Company Reputation                 Good
Employee Recognition             Medium
Attrition                        Stayed
Annual Income                     47868
Start Age                            29
At Least Decade                   False
Total Compensation               4387.9
Gender_Female                      True


In [383]:
# Preview the frame after feature engineering and column removal.
df.head()

Unnamed: 0,Age,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,Overtime,Distance from Home,...,Start Age,At Least Decade,Total Compensation,Gender_Female,Gender_Male,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,avg time for promotion,Has Dependents
0,31,19,Education,5390,Excellent,Medium,Average,2,No,35.2,...,12,True,6468.0,False,True,False,True,False,9.0,0
1,59,4,Media,5534,Poor,High,Low,3,No,33.6,...,55,False,7194.2,True,False,True,False,False,1.0,1
2,24,10,Healthcare,8159,Good,High,Low,0,No,17.6,...,14,True,8159.0,True,False,False,True,False,0.0,1
3,36,7,Education,3989,Good,High,High,1,No,43.2,...,29,False,4387.9,True,False,False,False,True,7.0,1
4,56,41,Education,4821,Fair,Very High,Average,0,Yes,113.6,...,15,True,4821.0,False,True,True,False,False,0.0,0


In [384]:
# Summary statistics; the recorded output lists only the numeric columns.
df.describe()

Unnamed: 0,Age,Years at Company,Monthly Income,Number of Promotions,Distance from Home,Number of Dependents,Annual Income,Start Age,Total Compensation,avg time for promotion,Has Dependents
count,74498.0,74498.0,74498.0,74498.0,74498.0,74498.0,74498.0,74498.0,74498.0,74498.0,74498.0
mean,38.529746,15.721603,7299.379514,0.832935,79.986534,1.650326,87592.554163,22.808143,7908.584883,5.644407,0.701227
std,12.083456,11.223744,2152.508566,0.995289,45.621777,1.553633,25830.102789,11.235584,2456.708038,8.950503,0.457723
min,18.0,1.0,1226.0,0.0,1.6,0.0,14712.0,8.0,1226.0,0.0,0.0
25%,28.0,7.0,5652.0,0.0,40.0,0.0,67824.0,14.0,6007.2,0.0,0.0
50%,39.0,13.0,7348.0,1.0,80.0,1.0,88176.0,20.0,7859.65,0.0,1.0
75%,49.0,23.0,8876.0,2.0,120.0,3.0,106512.0,30.0,9620.0,8.0,1.0
max,59.0,51.0,16149.0,4.0,158.4,6.0,193788.0,58.0,21088.2,51.0,1.0


In [385]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

(74498, 31)