## Import Libraries & Modules

In [232]:
import pandas as pd
import numpy as np

## Load Data Set

In [235]:
# Read the train and test splits from the Data directory next to the notebook.
train_path = r"./Data/train.csv"
test_path = r"./Data/test.csv"
df1, df2 = pd.read_csv(train_path), pd.read_csv(test_path)

In [237]:
# Stack the train and test frames row-wise into a single frame;
# ignore_index=True gives the result a fresh 0..n-1 index.
frames = [df1, df2]
df = pd.concat(frames, ignore_index=True)

In [239]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,Employee ID,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,...,Number of Dependents,Job Level,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition
0,8410,31,Male,19,Education,5390,Excellent,Medium,Average,2,...,0,Mid,Medium,89,No,No,No,Excellent,Medium,Stayed
1,64756,59,Female,4,Media,5534,Poor,High,Low,3,...,3,Mid,Medium,21,No,No,No,Fair,Low,Stayed
2,30257,24,Female,10,Healthcare,8159,Good,High,Low,0,...,3,Mid,Medium,74,No,No,No,Poor,Low,Stayed
3,65791,36,Female,7,Education,3989,Good,High,High,1,...,2,Mid,Small,50,Yes,No,No,Good,Medium,Stayed
4,65026,56,Male,41,Education,4821,Fair,Very High,Average,0,...,0,Senior,Medium,68,No,No,No,Fair,Medium,Stayed


In [241]:
# Check whether there are fully duplicated rows (count of rows flagged by
# DataFrame.duplicated, i.e. repeats of an earlier row).
duplicate_mask = df.duplicated()
int(duplicate_mask.sum())

0

In [243]:
# Column names, non-null counts and dtypes of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74498 entries, 0 to 74497
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Employee ID               74498 non-null  int64 
 1   Age                       74498 non-null  int64 
 2   Gender                    74498 non-null  object
 3   Years at Company          74498 non-null  int64 
 4   Job Role                  74498 non-null  object
 5   Monthly Income            74498 non-null  int64 
 6   Work-Life Balance         74498 non-null  object
 7   Job Satisfaction          74498 non-null  object
 8   Performance Rating        74498 non-null  object
 9   Number of Promotions      74498 non-null  int64 
 10  Overtime                  74498 non-null  object
 11  Distance from Home        74498 non-null  int64 
 12  Education Level           74498 non-null  object
 13  Marital Status            74498 non-null  object
 14  Number of Dependents  

In [209]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,Employee ID,Age,Years at Company,Monthly Income,Number of Promotions,Distance from Home,Number of Dependents,Company Tenure
count,74498.0,74498.0,74498.0,74498.0,74498.0,74498.0,74498.0,74498.0
mean,37249.5,38.529746,15.721603,7299.379514,0.832935,49.991584,1.650326,55.727456
std,21505.864514,12.083456,11.223744,2152.508566,0.995289,28.513611,1.553633,25.399349
min,1.0,18.0,1.0,1226.0,0.0,1.0,0.0,2.0
25%,18625.25,28.0,7.0,5652.0,0.0,25.0,0.0,36.0
50%,37249.5,39.0,13.0,7348.0,1.0,50.0,1.0,56.0
75%,55873.75,49.0,23.0,8876.0,2.0,75.0,3.0,76.0
max,74498.0,59.0,51.0,16149.0,4.0,99.0,6.0,128.0


In [210]:
# (number of rows, number of columns)
df.shape

(74498, 24)

In [211]:
# Convert every object-dtype column to the category dtype in one astype call.
object_cols = df.select_dtypes(include=['object']).columns
df = df.astype({name: 'category' for name in object_cols})

In [212]:
# Check whether any categorical feature has enough distinct values to need
# reducing: list each category column with its number of unique values and
# the values themselves, indexed by feature name.
cat_cols = df.select_dtypes(include=['category']).columns

cat_summary_df = pd.DataFrame(
    {
        "Unique Values": [df[name].nunique() for name in cat_cols],
        "Categories": [df[name].unique().tolist() for name in cat_cols],
    },
    index=pd.Index(cat_cols, name="Feature"),
)
cat_summary_df

Unnamed: 0_level_0,Unique Values,Categories
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1
Gender,2,"[Male, Female]"
Job Role,5,"[Education, Media, Healthcare, Technology, Fin..."
Work-Life Balance,4,"[Excellent, Poor, Good, Fair]"
Job Satisfaction,4,"[Medium, High, Very High, Low]"
Performance Rating,4,"[Average, Low, High, Below Average]"
Overtime,2,"[No, Yes]"
Education Level,5,"[Associate Degree, Master’s Degree, Bachelor’s..."
Marital Status,3,"[Married, Divorced, Single]"
Job Level,3,"[Mid, Senior, Entry]"
Company Size,3,"[Medium, Small, Large]"


## Feature Engineering

In [214]:
# Derived features. Every step either overwrites a column it creates itself or
# is guarded, so the cell can be re-run on an already-processed frame.

# Calculate the annual income from the monthly income
df['Annual Income'] = df['Monthly Income'] * 12

# Calculate the age when the employee started at the company
df['Start Age'] = df['Age'] - df['Years at Company']

# Check if the employee has worked at the company for at least 10 years
df['At Least Decade'] = df['Years at Company'] >= 10

# Assuming each promotion results in a 10% increase in monthly income
promotion_factor = 0.10
df['Total Compensation'] = df['Monthly Income'] * (1 + df['Number of Promotions'] * promotion_factor)

# One-hot encoding for some features. get_dummies removes the source columns,
# so only encode the ones still present; otherwise a second run of this cell
# raises KeyError on the already-encoded columns.
one_hot_cols = [col for col in ['Gender', 'Marital Status'] if col in df.columns]
if one_hot_cols:
    df = pd.get_dummies(data=df, columns=one_hot_cols)

# The average time (years) for promotion (assuming each promotion comes after an
# equal number of years). Floor division is used, and employees without any
# promotion get 0 rather than a missing value.
df['avg time for promotion'] = np.where(df['Number of Promotions'] > 0,
                                        df['Years at Company'] // df['Number of Promotions'],
                                        0)

# Convert miles to kilometers
# (assumes 'Distance from Home' is recorded in miles — TODO confirm against the data dictionary)
miles_to_km = 1.609344
df['Distance from Home (km)'] = df['Distance from Home'] * miles_to_km

## Remove punctuation

In [216]:
# Strip apostrophes from the Education Level labels. The labels use the
# typographic apostrophe (’, U+2019), e.g. "Master’s Degree", so replacing only
# the ASCII apostrophe leaves them unchanged; match both characters.
# .str.replace on a categorical column returns plain strings, so cast the
# result back to category to keep the dtype set earlier in the notebook.
# NOTE(review): any later code that matches the original labels must use the
# apostrophe-free spellings (e.g. "Masters Degree") — verify downstream cells.
df['Education Level'] = (
    df['Education Level']
    .str.replace("[’']", "", regex=True)
    .astype('category')
)

In [228]:
# Preview only the int64 columns.
df.select_dtypes(['int64']).head()

Unnamed: 0,Employee ID,Age,Years at Company,Monthly Income,Number of Promotions,Distance from Home,Number of Dependents,Annual Income,Start Age
0,8410,31,19,5390,2,22,0,64680,12
1,64756,59,4,5534,3,21,3,66408,55
2,30257,24,10,8159,0,11,3,97908,14
3,65791,36,7,3989,1,27,2,47868,29
4,65026,56,41,4821,0,71,0,57852,15


In [218]:
# Strange values: frequency of each Company Tenure value. The recorded output
# goes up to 128, which is implausible if the unit is years — TODO confirm the unit.
df['Company Tenure'].value_counts()

Company Tenure
65     994
43     993
55     986
50     974
52     974
      ... 
123     13
125     12
126      7
127      3
128      1
Name: count, Length: 127, dtype: int64

In [219]:
# Count employees whose Company Tenure exceeds their Age. In the recorded run
# this is 52,902 of 74,498 rows (~71%, not ~10%), which doesn't make sense if
# the tenure is measured in years — TODO confirm the unit of Company Tenure.
tenure_exceeds_age = df['Company Tenure'] > df['Age']
int(tenure_exceeds_age.sum())

52902

## Drop Company Tenure

In [221]:
# Drop Company Tenure. errors='ignore' makes the cell safe to re-run after the
# column is already gone (a plain drop raises KeyError on the second run).
df = df.drop(columns=['Company Tenure'], errors='ignore')

In [222]:
# Inspect one full record (the row at position 3) with all of its columns.
df.iloc[3]

Employee ID                       65791
Age                                  36
Years at Company                      7
Job Role                      Education
Monthly Income                     3989
Work-Life Balance                  Good
Job Satisfaction                   High
Performance Rating                 High
Number of Promotions                  1
Overtime                             No
Distance from Home                   27
Education Level             High School
Number of Dependents                  2
Job Level                           Mid
Company Size                      Small
Remote Work                         Yes
Leadership Opportunities             No
Innovation Opportunities             No
Company Reputation                 Good
Employee Recognition             Medium
Attrition                        Stayed
Annual Income                     47868
Start Age                            29
At Least Decade                   False
Total Compensation               4387.9


In [230]:
# Frequency of each Number of Dependents value.
df['Number of Dependents'].value_counts()

Number of Dependents
0    22258
1    19331
2    11117
3    10375
4     7620
5     3006
6      791
Name: count, dtype: int64

In [223]:
# Smallest implied starting age (Age - Years at Company). The recorded result
# is 8, which is not a plausible working age, so the two source columns
# disagree for at least some rows.
df['Start Age'].min()

8

In [224]:
# Show the rows with the lowest implied starting age. Compare against the
# computed minimum (8 in the recorded run) rather than a hard-coded value so
# the cell stays correct if the data changes.
min_start_age = df['Start Age'].min()
df[df['Start Age'] == min_start_age]

Unnamed: 0,Employee ID,Age,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,Overtime,...,Annual Income,Start Age,At Least Decade,Total Compensation,Gender_Female,Gender_Male,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,avg time for promotion
57,4098,30,22,Technology,9894,Poor,Low,Average,1,No,...,118728,8,True,10883.4,True,False,False,False,True,22.0
141,41073,28,20,Finance,10513,Good,High,Average,0,No,...,126156,8,True,10513.0,False,True,False,True,False,0.0
147,34213,21,13,Healthcare,9152,Poor,High,Average,0,No,...,109824,8,True,9152.0,False,True,False,True,False,0.0
151,12809,44,36,Finance,8048,Fair,Very High,Average,2,No,...,96576,8,True,9657.6,False,True,False,False,True,18.0
194,62811,19,11,Education,4410,Fair,High,High,2,Yes,...,52920,8,True,5292.0,False,True,False,False,True,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74420,28618,38,30,Education,4627,Poor,High,Average,0,No,...,55524,8,True,4627.0,True,False,True,False,False,0.0
74421,19309,34,26,Technology,7949,Good,High,High,0,Yes,...,95388,8,True,7949.0,True,False,False,False,True,0.0
74426,44929,19,11,Education,3567,Poor,Medium,Below Average,2,No,...,42804,8,True,4280.4,False,True,False,False,True,5.0
74432,62715,53,45,Technology,8186,Good,Very High,Average,1,No,...,98232,8,True,9004.6,True,False,True,False,False,45.0
