### Import the **HR Analytics Case Study** dataset

In [36]:
!pip install -q kagglehub pandas scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [37]:
import shutil
from pathlib import Path

import kagglehub

# Fetch general_data.csv from the Kaggle dataset; the returned location is
# wrapped in Path so it can be handed to the copy below.
path = Path(kagglehub.dataset_download("vjchoudhary7/hr-analytics-case-study", path="general_data.csv"))

# Copy the file byte-for-byte into the working directory. A binary copy avoids
# decoding and re-encoding the CSV with the platform's default text encoding,
# which could fail on or alter the data. The trailing ';' keeps the notebook
# from echoing the return value.
shutil.copyfile(path, 'hr_data_raw.csv');

### Clean data

In [38]:
import pandas as pd

# Load the raw CSV (comma-separated, "." as the decimal mark).
df = pd.read_csv("hr_data_raw.csv", sep=",", decimal=".")

# Remove the columns this notebook does not use.
unused_columns = ["EmployeeID", "EmployeeCount", "Over18", "StandardHours"]
df = df.drop(columns=unused_columns)

# Report the number of missing values, listing only the affected columns.
print("Null values:")
null_counts = df.isna().sum()
display(null_counts[null_counts > 0])

# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how="any")

Null values:


NumCompaniesWorked    19
TotalWorkingYears      9
dtype: int64

### Save silver dataset

In [39]:
# Write the cleaned frame to CSV without the index column.
df.to_csv(path_or_buf="hr_data_silver.csv", index=False)

### Encode


In [40]:
# Binary encoding: replace the two distinct values of each listed column with
# 0 and 1. The value that sorts first becomes 0 and the other becomes 1
# (for Attrition presumably "No" -> 0 and "Yes" -> 1; confirm against the data).
columns = ["Attrition"]

for c in columns:
    values = sorted(df[c].unique())
    # Fail loudly instead of silently mapping extra categories to NaN (more
    # than two values) or raising an opaque IndexError (a single value).
    if len(values) != 2:
        raise ValueError(
            f"Column {c!r} must have exactly two distinct values for binary "
            f"encoding, found {len(values)}: {values}"
        )
    df[c] = df[c].map({values[0]: 0, values[1]: 1})

In [41]:
# One-hot encode the nominal columns in a single call. The encoded source
# columns are removed and their dummy columns are appended in the order given
# here. drop_first=True omits the first level of each column, and dtype=int
# emits 0/1 integers directly, so no follow-up replace({True: 1, False: 0})
# pass over the whole frame (which triggers a pandas warning) is needed.
columns = ["BusinessTravel", "Department", "EducationField", "Gender", "JobRole", "MaritalStatus"]
df = pd.get_dummies(df, columns=columns, drop_first=True, dtype=int)

  df = df.replace({True: 1, False: 0})


In [42]:
# Show the column names after encoding as a plain Python list.
display(df.columns.tolist())

['Age',
 'Attrition',
 'DistanceFromHome',
 'Education',
 'JobLevel',
 'MonthlyIncome',
 'NumCompaniesWorked',
 'PercentSalaryHike',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'YearsAtCompany',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager',
 'BusinessTravel_Travel_Frequently',
 'BusinessTravel_Travel_Rarely',
 'Department_Research & Development',
 'Department_Sales',
 'EducationField_Life Sciences',
 'EducationField_Marketing',
 'EducationField_Medical',
 'EducationField_Other',
 'EducationField_Technical Degree',
 'Gender_Male',
 'JobRole_Human Resources',
 'JobRole_Laboratory Technician',
 'JobRole_Manager',
 'JobRole_Manufacturing Director',
 'JobRole_Research Director',
 'JobRole_Research Scientist',
 'JobRole_Sales Executive',
 'JobRole_Sales Representative',
 'MaritalStatus_Married',
 'MaritalStatus_Single']

### Normalize numeric values

In [43]:
from sklearn.preprocessing import StandardScaler

# Numeric columns to standardize; the encoded 0/1 columns are left untouched.
columns = [
    "Age",
    "DistanceFromHome",
    "Education",
    "JobLevel",
    "MonthlyIncome",
    "NumCompaniesWorked",
    "PercentSalaryHike",
    "StockOptionLevel",
    "TotalWorkingYears",
    "TrainingTimesLastYear",
    "YearsAtCompany",
    "YearsSinceLastPromotion",
    "YearsWithCurrManager",
]

# Fit the scaler on the selected columns, then transform those same columns.
scaler_standard = StandardScaler()
scaler_standard.fit(df[columns])
df_normalized = scaler_standard.transform(df[columns])

# Overwrite the original columns with their standardized values.
df[columns] = df_normalized

### Equalize the number of males and females

In [44]:
# Print the number of rows per gender, with readable labels for the 0/1 flag.
gender_labels = {1: "Male", 0: "Female"}
gender_counts = df["Gender_Male"].map(gender_labels).value_counts()
print(gender_counts)

Gender_Male
Male      2626
Female    1756
Name: count, dtype: int64


In [45]:
# Oversample SMOTE Females
!pip install -q imbalanced-learn

from imblearn.over_sampling import SMOTE

features = df.drop(columns=["Gender_Male"])
target = df["Gender_Male"]

smote = SMOTE(sampling_strategy='auto', random_state=42)

features_rs, target_rs = smote.fit_resample(features, target)

df = pd.DataFrame(features_rs, columns=features.columns)
df['Gender_Male'] = target_rs



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [46]:
# Show the per-class row counts of the gender flag after resampling.
df.loc[:, "Gender_Male"].value_counts()

Gender_Male
0    2626
1    2626
Name: count, dtype: int64

### Save gold dataset

In [47]:
# Write the encoded, scaled and resampled frame to CSV without the index column.
df.to_csv(path_or_buf="hr_data_gold.csv", index=False)