# AI Project: Employee Attrition Prediction - **source code**

## Table of Contents
- [Framework](#framework)
- [Initial dataset](#initial-dataset)
- [Ethic](#ethic)
- [Data wrangling](#data-wrangling)
- [Model](#model)
  - [Implementation](#implementation)
  - [Quality indicators](#quality-indicators)
  - [Graph and observation](#graph-and-observation)


## Framework

In [120]:
import pandas as pd
import numpy as np
import os
# ...

## Initial dataset

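A short description of the dataset: it consists of five CSV files stored in the `data/` directory (`general_data.csv`, `manager_survey_data.csv`, `employee_survey_data.csv`, `in_time.csv` and `out_time.csv`), which are joined on `EmployeeID`.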

## Ethic

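TODO: describe the ethical considerations of this project.
TODO: list what we delete from the dataset for ethical reasons, and why.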

## Data wrangling

### Data importation

In [121]:
# Load the five raw CSV files shipped in the local 'data' directory
def _read_data_csv(file_name):
    """Read one CSV file located in the 'data' directory into a DataFrame."""
    return pd.read_csv(os.path.join('data', file_name))


general_df = _read_data_csv('general_data.csv')
manager_survey_df = _read_data_csv('manager_survey_data.csv')
employee_survey_df = _read_data_csv('employee_survey_data.csv')
in_time = _read_data_csv('in_time.csv')
out_time = _read_data_csv('out_time.csv')

### Data observation

In [122]:
# Visual separator printed between the sections of this cell's output
section_break = "\n===========================================================\n"

# Report the shape of each raw table before merging
print(f"General Data: {general_df.shape} (rows, columns)")
print(f"Manager Survey: {manager_survey_df.shape}")
print(f"Employee Survey: {employee_survey_df.shape}")

print(section_break)

# The merges below are keyed on EmployeeID, so verify it has no duplicates
has_duplicate_ids = general_df['EmployeeID'].duplicated().any()
print("Warning" if has_duplicate_ids else "EmployeeID is unique")

print(section_break)

# Left-join both survey tables onto the general table, one after the other
merge_df = general_df.merge(manager_survey_df, how='left', on='EmployeeID')
final_df = merge_df.merge(employee_survey_df, how='left', on='EmployeeID')

print(f"Size of Final Table: {final_df.shape}")

print(section_break)

print("First 5 rows:")
display(final_df.head())

In [123]:
# Build the AvgWorkingHours feature from the in/out time tables and attach it
# to final_df. The cell is written so that running it twice gives the same
# result as running it once.
def _index_by_employee_id(frame):
    """Return `frame` indexed by 'EmployeeID'.

    The first column is renamed to 'EmployeeID' (for consistency with the
    other tables) and used as the index. A frame that is already indexed by
    'EmployeeID' is returned unchanged, so a re-run of the cell does not
    rename the first date column and lose the real identifiers.
    """
    if frame.index.name == 'EmployeeID':
        return frame
    renamed = frame.rename(columns={frame.columns[0]: 'EmployeeID'})
    return renamed.set_index('EmployeeID')


# Check initial data sizes
print(f"In time: {in_time.shape} (rows, columns)")
print(f"Out time: {out_time.shape}")

# Rename first column to 'EmployeeID' and use it as index (idempotent)
in_time = _index_by_employee_id(in_time)
out_time = _index_by_employee_id(out_time)

# Convert every column to datetime; errors='coerce' turns unparseable or
# missing entries into NaT instead of raising
in_time_df = in_time.apply(pd.to_datetime, errors='coerce')
out_time_df = out_time.apply(pd.to_datetime, errors='coerce')

# Calculate working time (element-wise, aligned on EmployeeID and column)
working_time = out_time_df - in_time_df
display(working_time.head())


print("\n===========================================================\n")


# Average working time per employee; NaT entries are skipped by mean()
average_working_time = working_time.mean(axis=1)
# Vectorised conversion from timedelta to hours (replaces a row-wise apply)
avg_working_hours_numeric = average_working_time.dt.total_seconds() / 3600
display(avg_working_hours_numeric.head())


print("\n===========================================================\n")


# Reset index to turn Series into DataFrame
time_features_df = avg_working_hours_numeric.reset_index()
# Rename columns
time_features_df.columns = ['EmployeeID', 'AvgWorkingHours']
# Display final time features DataFrame
display(time_features_df.head())


print("\n===========================================================\n")


# Merge time features back to final_df. Any AvgWorkingHours column left by a
# previous run is dropped first so the merge never creates _x/_y duplicates.
final_df = pd.merge(
    final_df.drop(columns=['AvgWorkingHours'], errors='ignore'),
    time_features_df,
    on='EmployeeID',
    how='left',
)
# Check final data sizes
print(f"Final DataFrame: {final_df.shape} (rows, columns)")


print("\n===========================================================\n")


print("First 5 rows of final DataFrame:")
display(final_df.head())

### Data cleaning

In [124]:
# Clean final_df and turn every remaining column into a numeric feature.

# Delete unneeded columns
columns_to_drop = ['EmployeeCount', 'Over18', 'StandardHours']
final_df.drop(columns=columns_to_drop, inplace=True)

# Handle missing values: numeric columns are imputed with their column mean
numeric_columns = final_df.select_dtypes(include=[np.number]).columns
final_df[numeric_columns] = final_df[numeric_columns].fillna(final_df[numeric_columns].mean())
# Remaining missing values per column (non-numeric columns are not imputed)
missing_values = final_df.isnull().sum()
print(missing_values)


print("\n===========================================================\n")


# Label encoding
# NOTE: Series.map yields NaN for any value that is not a key of the mapping.
attrition_map = {'Yes': 1, 'No': 0}
final_df['Attrition'] = final_df['Attrition'].map(attrition_map)
gender_map = {'Male': 1, 'Female': 0}
final_df['Gender'] = final_df['Gender'].map(gender_map)
# BusinessTravel is encoded as an ordered scale of travel frequency
travel_map = {
    'Non-Travel': 0,
    'Travel_Rarely': 1,
    'Travel_Frequently': 2
}
final_df['BusinessTravel'] = final_df['BusinessTravel'].map(travel_map)

# One-hot encoding of the remaining object columns; drop_first=True omits the
# first level of each column
categorical_columns = final_df.select_dtypes(include=['object']).columns
final_df = pd.get_dummies(final_df, columns=categorical_columns, drop_first=True)
# Convert boolean dummy columns to 0/1 integers. Only the boolean columns are
# cast, instead of running a True/False replace over the whole frame.
bool_columns = final_df.select_dtypes(include='bool').columns
final_df = final_df.astype({column: 'int64' for column in bool_columns})
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
final_df.info()
print("\n===========================================================\n")
print("First 5 rows after encoding:")
print("\n")
display(final_df.head())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlation between all columns of the encoded table
corr_matrix = final_df.corr()

# Rendering options for the heatmap: annotate each cell with two decimals and
# fix the colour scale to the full [-1, 1] correlation range
heatmap_options = {
    'annot': True,
    'fmt': ".2f",
    'cmap': "coolwarm",
    'linewidths': 0.5,
    'vmin': -1,
    'vmax': 1,
}

# Draw the heatmap on a large figure so the annotations stay readable
plt.figure(figsize=(24, 18))
sns.heatmap(corr_matrix, **heatmap_options)

plt.title("Correlation Matrix", fontsize=20)
plt.show()

In [125]:
# Write the processed table to the data directory as CSV, without the row index
processed_csv_path = os.path.join('data', 'final_data_processed.csv')
final_df.to_csv(processed_csv_path, index=False)

## Model

### Implementation

In [126]:
# Implement the model
# with a pipeline

### Quality indicators

In [127]:
# MSE, MAE, RMSE, R²
# Cross validation

### Graph and observation

In [128]:
# Plots, if we have any