In [31]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import os

In [32]:
# Read the preprocessed training arrays from disk.
X_train = np.load(file="data/processed/X_train.npy")
# read_csv yields a 2-D table; flatten it to the 1-D vector used as y below.
y_train = np.ravel(pd.read_csv("data/processed/y_train.csv").values)

In [33]:
def train_classifier(model, param_grid, X, y, cv=5, scoring='f1', n_jobs=-1):
    """Tune ``model`` with an exhaustive grid search and return the refit winner.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style estimator handed to ``GridSearchCV``.
    param_grid : dict or list of dict
        Hyper-parameter grid forwarded unchanged to ``GridSearchCV``.
    X, y : array-like
        Training features and targets passed to ``grid.fit``.
    cv : int or CV splitter, default 5
        Cross-validation strategy used by the search.
    scoring : str or callable, default 'f1'
        Single metric used both to rank candidates and for the returned score.
    n_jobs : int, default -1
        Parallelism forwarded to ``GridSearchCV`` (-1 uses all processors).

    Returns
    -------
    tuple
        ``(best_model, mean_cv_score, best_params)`` where ``best_model`` is
        the estimator refit on all of ``X``/``y`` with the best parameters,
        ``mean_cv_score`` is its mean cross-validated score and
        ``best_params`` is the winning parameter dict.
    """
    grid = GridSearchCV(
        model,
        param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
    )
    grid.fit(X, y)

    # The search already stores the mean cross-validated score of the winning
    # candidate, so a second cross_val_score pass over the same folds would only
    # repeat work (and could drift from the selecting score for unseeded models).
    # NOTE: this score was also used to choose the parameters, so it is an
    # optimistic estimate; use held-out data for an unbiased figure.
    mean_cv_score = grid.best_score_

    return grid.best_estimator_, mean_cv_score, grid.best_params_


In [34]:
# Search space for the decision tree: 4 * 3 * 3 * 1 = 36 candidate settings.
dt_params = dict(
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    class_weight=['balanced'],
)

# Search space for the random forest: 2 * 3 * 2 * 2 * 1 = 24 candidate settings.
rf_params = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    class_weight=['balanced'],
)


In [35]:
# Tune each model family on the training data; both estimators use a fixed seed (42).
best_dt, f1_dt, best_params_dt = train_classifier(
    model=DecisionTreeClassifier(random_state=42),
    param_grid=dt_params,
    X=X_train,
    y=y_train,
)
best_rf, f1_rf, best_params_rf = train_classifier(
    model=RandomForestClassifier(random_state=42),
    param_grid=rf_params,
    X=X_train,
    y=y_train,
)

# Summarise the winning parameters and the cross-validated F1 of each model.
print(
    "Decision Tree - Melhores parâmetros:", best_params_dt,
    "| F1-score CV:", f1_dt,
)
print(
    "Random Forest - Melhores parâmetros:", best_params_rf,
    "| F1-score CV:", f1_rf,
)


Decision Tree - Melhores parâmetros: {'class_weight': 'balanced', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2} | F1-score CV: 0.9996937212863706
Random Forest - Melhores parâmetros: {'class_weight': 'balanced', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100} | F1-score CV: 0.9996937212863706


In [36]:
def _print_train_report(model_name, y_true, y_pred):
    """Print the classification report and confusion matrix for one model."""
    print(f"{model_name} Classification Report:\n", classification_report(y_true, y_pred))
    print(f"{model_name} Confusion Matrix:\n", confusion_matrix(y_true, y_pred))


# NOTE: predictions are made on X_train, i.e. the same data the models were
# fitted on, so these are in-sample metrics rather than a generalisation estimate.
y_pred_dt = best_dt.predict(X_train)
_print_train_report("Decision Tree", y_train, y_pred_dt)

y_pred_rf = best_rf.predict(X_train)
_print_train_report("Random Forest", y_train, y_pred_rf)


Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1353
           1       1.00      1.00      1.00      1631

    accuracy                           1.00      2984
   macro avg       1.00      1.00      1.00      2984
weighted avg       1.00      1.00      1.00      2984

Decision Tree Confusion Matrix:
 [[1353    0]
 [   0 1631]]
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1353
           1       1.00      1.00      1.00      1631

    accuracy                           1.00      2984
   macro avg       1.00      1.00      1.00      2984
weighted avg       1.00      1.00      1.00      2984

Random Forest Confusion Matrix:
 [[1353    0]
 [   0 1631]]


In [37]:
# Side-by-side table of true targets and Decision Tree predictions (training data).
comparison_dt = pd.DataFrame(dict(y_real=y_train, y_pred=y_pred_dt))
print("Decision Tree - Comparação alvo x predição (treino):")
print(comparison_dt.head())

# Same side-by-side table for the Random Forest predictions.
comparison_rf = pd.DataFrame(dict(y_real=y_train, y_pred=y_pred_rf))
print("Random Forest - Comparação alvo x predição (treino):")
print(comparison_rf.head())


Decision Tree - Comparação alvo x predição (treino):
   y_real  y_pred
0       0       0
1       0       0
2       0       0
3       0       0
4       0       0
Random Forest - Comparação alvo x predição (treino):
   y_real  y_pred
0       0       0
1       0       0
2       0       0
3       0       0
4       0       0


In [38]:
import os
import joblib

# Make sure the output directory exists (no error if it is already there).
os.makedirs(name="models", exist_ok=True)

# Persist both tuned estimators with joblib.
joblib.dump(
    value=best_dt,
    filename="models/decision_tree_model.pkl",
)
joblib.dump(
    value=best_rf,
    filename="models/random_forest_model.pkl",
)

print("Modelos salvos em 'models/'")


Modelos salvos em 'models/'
