In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Célula 2: Carregamento do Dataset e Transformações Iniciais (Binárias e Remoção de Colunas)

employee_df = pd.read_csv('../data/raw/Human_Resources.csv')

# 'Attrition': 'Yes' -> 1, 'No' -> 0
employee_df['Attrition'] = employee_df['Attrition'].str.strip().str.upper().apply(lambda x: 1 if x == 'YES' else 0)

# 'OverTime': 'Yes' -> 1, 'No' -> 0
employee_df['OverTime'] = employee_df['OverTime'].str.strip().apply(lambda x: 1 if x == 'Yes' else 0)

#  ATENCAO AQ ==> 'Over18': 'Y' -> 1, 'N' -> 0 (todos são 'Y' no dataset, então será uma coluna constante de 1s)
employee_df['Over18'] = employee_df['Over18'].str.strip().str.upper().apply(lambda x: 1 if x == 'Y' else 0)

''' 2. removendo colunas constantes e desnecessárias identificadas na EDA '''
# 'EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours' identificadas como constantes ou irrelevantes
employee_df.drop(['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'], axis=1, inplace=True)

print("Shape do DataFrame após transformações binárias e remoção de colunas:", employee_df.shape)
display(employee_df.head())

Shape do DataFrame após transformações binárias e remoção de colunas: (1470, 31)


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,...,3,1,0,8,0,1,6,4,0,5
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,...,4,4,1,10,3,3,10,7,1,7
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,...,3,2,0,7,3,3,0,0,0,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,...,3,3,0,8,3,3,8,7,3,0
4,27,0,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,...,3,4,1,6,3,3,2,2,2,2


In [5]:
'''3. Aplicar One-Hot Encoding nas colunas categóricas restantes'''

# colunas categóricas para One-Hot Encoding
X_cat_cols = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus']

employee_df_processed = employee_df.copy()
onehotencoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

X_cat_encoded = onehotencoder.fit_transform(employee_df_processed[X_cat_cols])
X_cat_df = pd.DataFrame(X_cat_encoded, columns=onehotencoder.get_feature_names_out(X_cat_cols))

employee_df_processed = employee_df_processed.drop(columns=X_cat_cols)
employee_df_processed = pd.concat([employee_df_processed.reset_index(drop=True), X_cat_df.reset_index(drop=True)], axis=1)

# separando as features (X) da variável alvo (y)
X = employee_df_processed.drop('Attrition', axis=1)
y = employee_df_processed['Attrition']

print("Shape de X após One-Hot Encoding:", X.shape)
print("Shape de y:", y.shape)
print("\nPrimeiras 5 linhas de X (features) após One-Hot Encoding:")
display(X.head())

Shape de X após One-Hot Encoding: (1470, 50)
Shape de y: (1470,)

Primeiras 5 linhas de X (features) após One-Hot Encoding:


Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
0,41,1102,1,2,2,94,3,2,4,5993,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,49,279,8,1,3,61,2,2,2,5130,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,37,1373,2,2,4,92,2,1,3,2090,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,33,1392,3,4,4,56,3,1,3,2909,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,27,591,2,1,1,40,3,1,2,3468,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [6]:
'''Normalizar os dados numéricos (MinMaxScaler)'''

numerical_cols = X.select_dtypes(include=np.number).columns

scaler = MinMaxScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

print("Shape de X após Normalização:", X.shape)
print("\nPrimeiras 5 linhas de X (features) após Normalização:")
display(X.head())

Shape de X após Normalização: (1470, 50)

Primeiras 5 linhas de X (features) após Normalização:


Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
0,0.547619,0.71582,0.0,0.25,0.333333,0.914286,0.666667,0.25,1.0,0.262454,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.738095,0.1267,0.25,0.0,0.666667,0.442857,0.333333,0.25,0.333333,0.217009,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.452381,0.909807,0.035714,0.25,1.0,0.885714,0.333333,0.0,0.666667,0.056925,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.357143,0.923407,0.071429,0.75,1.0,0.371429,0.666667,0.0,0.666667,0.100053,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.214286,0.350036,0.035714,0.0,0.0,0.142857,0.666667,0.0,0.333333,0.129489,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [7]:
''' balanceamento de classes com SMOTE '''

print("Contagem de valores de 'Attrition' antes do SMOTE:")
print(y.value_counts())

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print("\nBalanceamento de classes com SMOTE concluído.")
print("Shape de X após SMOTE:", X_resampled.shape)
print("Shape de y após SMOTE:", y_resampled.shape)
print("Contagem de valores de 'Attrition' após SMOTE:")
print(y_resampled.value_counts())

Contagem de valores de 'Attrition' antes do SMOTE:
Attrition
0    1233
1     237
Name: count, dtype: int64

Balanceamento de classes com SMOTE concluído.
Shape de X após SMOTE: (2466, 50)
Shape de y após SMOTE: (2466,)
Contagem de valores de 'Attrition' após SMOTE:
Attrition
1    1233
0    1233
Name: count, dtype: int64


In [8]:
# Separar em treino e teste (já com os dados balanceados)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.25, random_state=42, stratify=y_resampled)

print("\nDivisão em Treino e Teste concluída.")
print("Shape de X_train:", X_train.shape)
print("Shape de X_test:", X_test.shape)
print("Shape de y_train:", y_train.shape)
print("Shape de y_test:", y_test.shape)

print("\nContagem de Attrition em y_train:")
print(y_train.value_counts())
print("\nContagem de Attrition em y_test:")
print(y_test.value_counts())


Divisão em Treino e Teste concluída.
Shape de X_train: (1849, 50)
Shape de X_test: (617, 50)
Shape de y_train: (1849,)
Shape de y_test: (617,)

Contagem de Attrition em y_train:
Attrition
1    925
0    924
Name: count, dtype: int64

Contagem de Attrition em y_test:
Attrition
0    309
1    308
Name: count, dtype: int64


In [9]:
''' salvamento dos Dados Processados'''

import os

output_dir = '../data/processed'

# Criar a pasta 'processed' dentro da pasta 'data' se ainda não existir
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

X_train.to_csv(os.path.join(output_dir, 'X_train.csv'), index=False)
X_test.to_csv(os.path.join(output_dir, 'X_test.csv'), index=False)
y_train.to_csv(os.path.join(output_dir, 'y_train.csv'), index=False)
y_test.to_csv(os.path.join(output_dir, 'y_test.csv'), index=False) # Correção aplicada aqui novamente

print(f"\nDados de treino e teste salvos em: {output_dir}")


Dados de treino e teste salvos em: ../data/processed
