In [170]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import joblib
import os

In [171]:
# x_train_full = pd.read_csv("data/processed/x_train.csv")
# y_train_full = pd.read_csv("data/processed/y_train.csv")
# y = y_train_full.values.ravel()

In [172]:
# selector = SelectKBest(score_func=f_classif, k=12)
# selector.fit(x_train_full, y)
# top_features = x_train_full.columns[selector.get_support()]
# print(top_features)

In [173]:
# x = x_train_full[top_features]

In [174]:
# x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.2, random_state=42)

In [175]:
x_train = pd.read_csv("data/processed/x_train.csv")
x_test = pd.read_csv("data/processed/x_test.csv")

y_train = pd.read_csv("data/processed/y_train.csv").values.ravel()
y_test = pd.read_csv("data/processed/y_test.csv").values.ravel()


In [176]:
smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x_train, y_train)

In [177]:
# scaler = StandardScaler()
# x_train = scaler.fit_transform(x_res)
# x_test = scaler.transform(x_test)

In [178]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_macro = make_scorer(f1_score, average='macro')

In [179]:
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'random_state': [42]
}

rf_grid = GridSearchCV(RandomForestClassifier(n_jobs=-1), rf_param_grid, scoring=f1_macro, cv=cv, n_jobs=-1, verbose=2)
# rf_grid.fit(x_train, y_res)
rf_grid.fit(x_res, y_res)
rf_best = rf_grid.best_estimator_
print("\nMelhores parâmetros RF:", rf_grid.best_params_)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=42; total time=   2.4s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=42; total time=   3.3s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=42; total time=   5.3s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=42; total time=   5.4s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200, random_state=42; total time=   6.6s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=42; total time=   6.5s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100, random_state=42; tot

In [180]:
joblib.dump(rf_best, "models/rf.pkl")

['models/rf.pkl']

In [181]:
lr_param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['lbfgs','liblinear'],
    'max_iter': [1000],
    'random_state': [42]
}

lr_grid = GridSearchCV(LogisticRegression(), lr_param_grid, scoring=f1_macro, cv=cv, n_jobs=-1, verbose=2)
# lr_grid.fit(x_train, y_res)
lr_grid.fit(x_res, y_res)
lr_best = lr_grid.best_estimator_
print("\nMelhores parâmetros LR:", lr_grid.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END C=0.01, max_iter=1000, random_state=42, solver=lbfgs; total time=   0.1s
[CV] END C=0.01, max_iter=1000, random_state=42, solver=lbfgs; total time=   0.1s
[CV] END C=0.01, max_iter=1000, random_state=42, solver=lbfgs; total time=   0.1s
[CV] END C=0.01, max_iter=1000, random_state=42, solver=lbfgs; total time=   0.1s
[CV] END C=0.01, max_iter=1000, random_state=42, solver=lbfgs; total time=   0.1s
[CV] END C=0.01, max_iter=1000, random_state=42, solver=liblinear; total time=   0.2s
[CV] END C=0.1, max_iter=1000, random_state=42, solver=lbfgs; total time=   0.1s
[CV] END C=0.1, max_iter=1000, random_state=42, solver=lbfgs; total time=   0.1s
[CV] END C=0.1, max_iter=1000, random_state=42, solver=lbfgs; total time=   0.1s
[CV] END C=0.1, max_iter=1000, random_state=42, solver=lbfgs; total time=   0.1s
[CV] END C=0.1, max_iter=1000, random_state=42, solver=lbfgs; total time=   0.1s
[CV] END C=0.01, max_iter=1000, random_

In [182]:
joblib.dump(lr_best, "models/lr.pkl")

['models/lr.pkl']

In [184]:
from sklearn.metrics import classification_report, confusion_matrix

# Random Forest
rf_pred_train = rf_best.predict(x_res)
print("\nRandom Forest - Classification Report (Treino)")
print(classification_report(y_res, rf_pred_train))
print("Confusion Matrix (Treino):")
print(confusion_matrix(y_res, rf_pred_train))

# Logistic Regression
lr_pred_train = lr_best.predict(x_res)
print("\nLogistic Regression - Classification Report (Treino)")
print(classification_report(y_res, lr_pred_train))
print("Confusion Matrix (Treino):")
print(confusion_matrix(y_res, lr_pred_train))



Random Forest - Classification Report (Treino)
              precision    recall  f1-score   support

           0       0.91      0.92      0.92     17402
           1       0.92      0.91      0.92     17402

    accuracy                           0.92     34804
   macro avg       0.92      0.92      0.92     34804
weighted avg       0.92      0.92      0.92     34804

Confusion Matrix (Treino):
[[16069  1333]
 [ 1499 15903]]

Logistic Regression - Classification Report (Treino)
              precision    recall  f1-score   support

           0       0.68      0.80      0.73     17402
           1       0.76      0.62      0.68     17402

    accuracy                           0.71     34804
   macro avg       0.72      0.71      0.71     34804
weighted avg       0.72      0.71      0.71     34804

Confusion Matrix (Treino):
[[13915  3487]
 [ 6631 10771]]


In [185]:
# Random Forest - Teste
rf_pred_test = rf_best.predict(x_test)
print("\nRandom Forest - Classification Report (Teste)")
print(classification_report(y_test, rf_pred_test))
print("Confusion Matrix (Teste):")
print(confusion_matrix(y_test, rf_pred_test))

# Logistic Regression - Teste
lr_pred_test = lr_best.predict(x_test)
print("\nLogistic Regression - Classification Report (Teste)")
print(classification_report(y_test, lr_pred_test))
print("Confusion Matrix (Teste):")
print(confusion_matrix(y_test, lr_pred_test))


Random Forest - Classification Report (Teste)
              precision    recall  f1-score   support

           0       0.86      0.83      0.84      4350
           1       0.48      0.54      0.51      1264

    accuracy                           0.76      5614
   macro avg       0.67      0.68      0.68      5614
weighted avg       0.77      0.76      0.77      5614

Confusion Matrix (Teste):
[[3601  749]
 [ 581  683]]

Logistic Regression - Classification Report (Teste)
              precision    recall  f1-score   support

           0       0.87      0.80      0.83      4350
           1       0.46      0.59      0.52      1264

    accuracy                           0.75      5614
   macro avg       0.67      0.70      0.68      5614
weighted avg       0.78      0.75      0.76      5614

Confusion Matrix (Teste):
[[3480  870]
 [ 513  751]]
