### Importações

In [1]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import inflection
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Remove the column-count limit and the fixed display width used when
# rendering DataFrames.
for opcao_display in ('display.max_columns', 'display.width'):
    pd.set_option(opcao_display, None)

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

### Leitura e tratamento de dados

In [2]:
# Load the raw training set (semicolon-separated CSV), report its shape and
# preview the first rows.
caminho_base = '../../datasets/raw/training_data.csv'
df = pd.read_csv(caminho_base, sep=';')
print(f'Shape da base: {df.shape}')
df.head()

Shape da base: (1176, 35)


Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,25,Travel_Rarely,685,Research & Development,1,3,Life Sciences,1,350,1,Female,62,3,2,Manufacturing Director,3,Married,4898,7505,0,Y,No,12,3,4,80,2,5,3,3,4,2,1,2,0
1,44,Travel_Rarely,1376,Human Resources,1,2,Medical,1,1098,2,Male,91,2,3,Human Resources,1,Married,10482,2326,9,Y,No,14,3,4,80,1,24,1,3,20,6,3,6,1
2,27,Travel_Rarely,135,Research & Development,17,4,Life Sciences,1,1405,4,Female,51,3,1,Research Scientist,3,Single,2394,25681,1,Y,Yes,13,3,4,80,0,8,2,3,8,2,7,7,1
3,40,Travel_Frequently,580,Sales,5,4,Life Sciences,1,729,4,Male,48,2,3,Sales Executive,1,Married,10475,23772,5,Y,Yes,21,4,3,80,1,20,2,3,18,13,1,12,0
4,24,Travel_Rarely,477,Research & Development,24,3,Medical,1,1173,4,Male,49,3,1,Laboratory Technician,2,Single,3597,6409,8,Y,No,22,4,4,80,0,6,2,3,4,3,1,2,0


In [3]:
# Standardise the column names to snake_case.
nomes_snake_case = df.columns.map(inflection.underscore)
df.columns = nomes_snake_case
print(f'Nomes das colunas após padronização: {df.columns}')

Nomes das colunas após padronização: Index(['age', 'business_travel', 'daily_rate', 'department',
       'distance_from_home', 'education', 'education_field', 'employee_count',
       'employee_number', 'environment_satisfaction', 'gender', 'hourly_rate',
       'job_involvement', 'job_level', 'job_role', 'job_satisfaction',
       'marital_status', 'monthly_income', 'monthly_rate',
       'num_companies_worked', 'over18', 'over_time', 'percent_salary_hike',
       'performance_rating', 'relationship_satisfaction', 'standard_hours',
       'stock_option_level', 'total_working_years', 'training_times_last_year',
       'work_life_balance', 'years_at_company', 'years_in_current_role',
       'years_since_last_promotion', 'years_with_curr_manager', 'attrition'],
      dtype='object')


In [4]:
# Collect the columns to discard: the employee identifier plus every column
# that holds a single distinct value.
# The identifier is listed only while it is still present in the frame, so
# re-running this cell after the columns were already removed does not raise
# a KeyError from `drop`.
colunas_para_eliminar = [
    coluna for coluna in ['employee_number'] if coluna in df.columns
]
colunas_para_eliminar += [
    coluna for coluna in df.columns if df[coluna].nunique() == 1
]

# Drop the collected columns and report which ones were removed.
df = df.drop(columns=colunas_para_eliminar)
print(f'Colunas eliminadas: {colunas_para_eliminar}')

Colunas eliminadas: ['employee_number', 'employee_count', 'over18', 'standard_hours']


In [5]:
# Overview of the DataFrame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1176 entries, 0 to 1175
Data columns (total 31 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   age                         1176 non-null   int64 
 1   business_travel             1176 non-null   object
 2   daily_rate                  1176 non-null   int64 
 3   department                  1176 non-null   object
 4   distance_from_home          1176 non-null   int64 
 5   education                   1176 non-null   int64 
 6   education_field             1176 non-null   object
 7   environment_satisfaction    1176 non-null   int64 
 8   gender                      1176 non-null   object
 9   hourly_rate                 1176 non-null   int64 
 10  job_involvement             1176 non-null   int64 
 11  job_level                   1176 non-null   int64 
 12  job_role                    1176 non-null   object
 13  job_satisfaction            1176 non-null   int6

In [6]:
# Cast every 'object' column to the 'category' dtype.
colunas_texto = df.select_dtypes(include='object').columns
df = df.astype(dict.fromkeys(colunas_texto, 'category'))

In [7]:
# Columns replaced by integer codes from a fresh LabelEncoder each.
# NOTE(review): once encoded, these columns are numeric and are picked up by
# the 'number' dtype selection further down — confirm that is intended for
# business_travel.
colunas_label = ['business_travel', 'gender', 'over_time']


def _codificar_se_listada(coluna):
    """Return the label-encoded values for listed columns, else the column unchanged."""
    if coluna.name in colunas_label:
        return LabelEncoder().fit_transform(coluna)
    return coluna


df = df.apply(_codificar_se_listada)

In [8]:
# Separate the 'attrition' column (target) from the remaining columns (features).
target = df['attrition']
features = df.drop('attrition', axis=1)

# Feature names grouped by dtype, used later to route columns to the
# matching preprocessing step.
categorical = features.select_dtypes(include='category').columns.tolist()
numerical = features.select_dtypes(include='number').columns.tolist()


In [9]:
# Hold out 30% of the rows for testing, stratified on the target, with a
# fixed seed so the split is reproducible.
parametros_split = dict(test_size=0.3, random_state=42, stratify=target)
X_train, X_test, y_train, y_test = train_test_split(
    features, target, **parametros_split
)

### Experimentação

#### LogisticRegression

In [10]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical)
    ]
)

# Pipeline chaining the preprocessor and the classifier, so the
# preprocessing is re-fitted together with the model on whatever data it is
# trained on.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42))
])

# X_train / X_test / y_train / y_test come from the stratified train/test
# split cell earlier in the notebook; the split is defined in one place only.

# Fit on the training partition.
pipeline.fit(X_train, y_train)

# Predict on both partitions.
pred_train = pipeline.predict(X_train)
pred_test = pipeline.predict(X_test)

# Confusion matrices for the training and test partitions.
confusion_matrix_train = confusion_matrix(y_train, pred_train)
confusion_matrix_test = confusion_matrix(y_test, pred_test)

# 5-fold cross-validated recall.
# NOTE(review): this runs over the full features/target, which includes the
# held-out test rows — confirm that is intended.
scores = cross_val_score(pipeline, features, target, cv=5, scoring='recall')
print(f'Cross-validation scores: {scores}')
print(f"Matriz de confusão - Treino:\n{confusion_matrix_train}")
print(f"Matriz de confusão - Teste:\n{confusion_matrix_test}")

Cross-validation scores: [0.42105263 0.39473684 0.44736842 0.36842105 0.39473684]
Matriz de confusão - Treino:
[[674  16]
 [ 80  53]]
Matriz de confusão - Teste:
[[279  17]
 [ 28  29]]


#### RandomForest

In [11]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical)
    ]
)

# Pipeline chaining the preprocessor and the classifier, so the
# preprocessing is re-fitted together with the model on whatever data it is
# trained on.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# X_train / X_test / y_train / y_test come from the stratified train/test
# split cell earlier in the notebook; the split is defined in one place only.

# Fit on the training partition.
pipeline.fit(X_train, y_train)

# Predict on both partitions.
pred_train = pipeline.predict(X_train)
pred_test = pipeline.predict(X_test)

# Confusion matrices for the training and test partitions.
confusion_matrix_train = confusion_matrix(y_train, pred_train)
confusion_matrix_test = confusion_matrix(y_test, pred_test)

# 5-fold cross-validated recall.
# NOTE(review): this runs over the full features/target, which includes the
# held-out test rows — confirm that is intended.
scores = cross_val_score(pipeline, features, target, cv=5, scoring='recall')
print(f'Cross-validation scores: {scores}')
print(f"Matriz de confusão - Treino:\n{confusion_matrix_train}")
print(f"Matriz de confusão - Teste:\n{confusion_matrix_test}")

Cross-validation scores: [0.15789474 0.18421053 0.18421053 0.23684211 0.10526316]
Matriz de confusão - Treino:
[[690   0]
 [  0 133]]
Matriz de confusão - Teste:
[[295   1]
 [ 46  11]]


#### KNN

In [12]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical)
    ]
)

# Pipeline chaining the preprocessor and the classifier, so the
# preprocessing is re-fitted together with the model on whatever data it is
# trained on.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier())
])

# X_train / X_test / y_train / y_test come from the stratified train/test
# split cell earlier in the notebook; the split is defined in one place only.

# Fit on the training partition.
pipeline.fit(X_train, y_train)

# Predict on both partitions.
pred_train = pipeline.predict(X_train)
pred_test = pipeline.predict(X_test)

# Confusion matrices for the training and test partitions.
confusion_matrix_train = confusion_matrix(y_train, pred_train)
confusion_matrix_test = confusion_matrix(y_test, pred_test)

# 5-fold cross-validated recall.
# NOTE(review): this runs over the full features/target, which includes the
# held-out test rows — confirm that is intended.
scores = cross_val_score(pipeline, features, target, cv=5, scoring='recall')
print(f'Cross-validation scores: {scores}')
print(f"Matriz de confusão - Treino:\n{confusion_matrix_train}")
print(f"Matriz de confusão - Teste:\n{confusion_matrix_test}")

Cross-validation scores: [0.13157895 0.10526316 0.18421053 0.13157895 0.07894737]
Matriz de confusão - Treino:
[[689   1]
 [107  26]]
Matriz de confusão - Teste:
[[285  11]
 [ 52   5]]
