In [126]:
import os
import pandas as pd
from functions import get_all_origins, find_pattern_for_quantity, convert_to_grams, relation_qnt_preco, remove_spaces, clean_text, local
import re
import numpy as np
import warnings
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings("ignore")

import ast
# Show every row and column when a frame is displayed (no truncation).
# These options only need to be set once per kernel session.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Load the two tables. Further down, df_x is used as the feature matrix and
# df_y supplies the target column.
# NOTE(review): get_all_origins comes from the project-local `functions`
# module; what the "model_x" / "model_y" keys select is not visible here.
df_x = get_all_origins("model_x")
df_y = get_all_origins("model_y")

# Replace the 'brand' column with the integer labels produced by LabelEncoder.
encoder = LabelEncoder()
data_encoder = encoder.fit_transform(df_x['brand'].values)
df_x['brand'] = data_encoder

# Remove the "title" and "ref" columns from both frames; errors="ignore"
# keeps this a no-op for a frame that does not have one of them.
excluded_columns = ["title", "ref"]
df_x = df_x.drop(columns=excluded_columns, errors="ignore")
df_y = df_y.drop(columns=excluded_columns, errors="ignore")

In [132]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from imblearn.pipeline import Pipeline as imbpipeline  # Para evitar confusão com sklearn.pipeline.Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np

def evaluate_model_kfold(model, X, y, n_splits=3, random_state=42, smote_k_neighbors=1):
    """Cross-validate ``model`` with SMOTE oversampling and return mean metrics.

    For every stratified fold, a SMOTE + model pipeline is fitted on the
    training part and scored on the held-out part. The imblearn pipeline
    applies the sampler only while fitting, so the held-out fold is scored
    on data that has not been resampled.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit``, ``predict`` and ``predict_proba``.
        The object itself is placed in the pipeline, so it is refitted on
        every fold and is left fitted on the last fold's training data.
    X : positionally indexable table
        Feature rows; must support ``.iloc`` (e.g. a pandas DataFrame).
    y : positionally indexable labels
        Target values; must support ``.iloc`` (e.g. a pandas Series).
        Column 1 of ``predict_proba`` is used as the score for AUC-ROC and
        the precision/recall/F1 scorers are called with their defaults, so
        a binary target is expected.
    n_splits : int, default 3
        Number of stratified folds.
    random_state : int, default 42
        Seed used for both the fold shuffling and SMOTE.
    smote_k_neighbors : int, default 1
        ``k_neighbors`` passed to SMOTE.

    Returns
    -------
    dict
        Mean over folds of each metric, keyed by 'AUC-ROC', 'Accuracy',
        'Precision', 'Recall' and 'F1-Score'.
    """
    print("==========================================>")
    # Stratified, shuffled folds.
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Per-fold scores, keyed by the metric name used in the returned dict.
    fold_scores = {
        'AUC-ROC': [],
        'Accuracy': [],
        'Precision': [],
        'Recall': [],
        'F1-Score': [],
    }

    for train_index, test_index in cv.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Pipeline of SMOTE followed by the supplied model.
        smote = SMOTE(random_state=random_state, k_neighbors=smote_k_neighbors)
        pipeline = imbpipeline([('smote', smote), ('model', model)])

        # Train on this fold's training part.
        pipeline.fit(X_train, y_train)

        # Hard predictions for the threshold metrics, column-1 probabilities
        # for AUC-ROC.
        y_pred = pipeline.predict(X_test)
        y_proba = pipeline.predict_proba(X_test)[:, 1]

        fold_scores['AUC-ROC'].append(roc_auc_score(y_test, y_proba))
        fold_scores['Accuracy'].append(accuracy_score(y_test, y_pred))
        fold_scores['Precision'].append(precision_score(y_test, y_pred))
        fold_scores['Recall'].append(recall_score(y_test, y_pred))
        fold_scores['F1-Score'].append(f1_score(y_test, y_pred))

    # Average every metric across the folds.
    results = {metric: np.mean(scores) for metric, scores in fold_scores.items()}

    return results

results = []
target = "whey_target"

X = df_x
y = df_y[target]

# Ratio of negative to positive rows, used as XGBoost's scale_pos_weight.
positive_count = y.sum()
negative_count = len(y) - positive_count

# NOTE(review): every model below is also wrapped in SMOTE inside
# evaluate_model_kfold, so class_weight='balanced' / scale_pos_weight
# compensate for imbalance a second time — confirm this is intended.
extra_trees = ExtraTreesClassifier(n_estimators=200, random_state=42, class_weight='balanced')
# NOTE(review): tree_method='gpu_hist' presumably requires a GPU-enabled
# XGBoost build, and both it and use_label_encoder are deprecated in recent
# XGBoost releases — verify against the installed version.
xgb = XGBClassifier(scale_pos_weight=negative_count / positive_count, use_label_encoder=False, eval_metric='logloss', tree_method='gpu_hist')
random_forest = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')
decision_tree = DecisionTreeClassifier(random_state=42, class_weight='balanced')

# Evaluate each model.
models = [extra_trees, xgb, random_forest, decision_tree]
model_names = ['ExtraTrees', 'XGBClassifier', "RandomForest", "DecisionTree"]

for model, name in zip(models, model_names):
    try:
        result = evaluate_model_kfold(model, X, y)
    except Exception as exc:
        # Best effort: one failing model must not abort the comparison, but
        # the failure is reported instead of being silently dropped.
        # Catching Exception (not a bare except) lets Ctrl-C interrupt the run.
        print(f"{name} failed on target {target!r}: {type(exc).__name__}: {exc}")
        continue
    result['Model'] = name
    print(name)
    result['Target'] = target
    print(target)
    results.append(result)

# Collect the per-model results into a DataFrame for easier viewing.
results_df = pd.DataFrame(results)
print(results_df)


ExtraTrees
whey_target
XGBClassifier
whey_target
RandomForest
whey_target
DecisionTree
whey_target
    AUC-ROC  Accuracy  Precision    Recall  F1-Score          Model  \
0  0.987995  0.958282   0.911318  0.907804  0.908859     ExtraTrees   
1  0.981485  0.950497   0.853341  0.947729  0.897927  XGBClassifier   
2  0.988128  0.957575   0.900084  0.917035  0.908318   RandomForest   
3  0.926217  0.944836   0.871000  0.892343  0.881346   DecisionTree   

        Target  
0  whey_target  
1  whey_target  
2  whey_target  
3  whey_target  


In [133]:
# Rebuild the summary table from the collected per-model results and display
# it ordered by mean recall, lowest first (sort_values sorts ascending by
# default).
# Candidate models: ExtraTrees, XGBClassifier, RandomForest, DecisionTree.
results_df = pd.DataFrame(results)
results_df.sort_values(by='Recall')

Unnamed: 0,AUC-ROC,Accuracy,Precision,Recall,F1-Score,Model,Target
3,0.926217,0.944836,0.871,0.892343,0.881346,DecisionTree,whey_target
0,0.987995,0.958282,0.911318,0.907804,0.908859,ExtraTrees,whey_target
2,0.988128,0.957575,0.900084,0.917035,0.908318,RandomForest,whey_target
1,0.981485,0.950497,0.853341,0.947729,0.897927,XGBClassifier,whey_target
