In [None]:
"""
Membros do grupo:

Alonso Batista de Oliveira Júnior
André Moreira de Carvalho
Gustavo Castro Candeia
Halex Maciel Silva Vieira
Welbert Luiz Silva Junior

"""


import logging
import os
import warnings
from typing import Tuple, Dict, Any, List

import gensim
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.exceptions import NotFittedError
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix, cohen_kappa_score, f1_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# Suppress every warning for the rest of the session.
# NOTE(review): with no category given this also hides deprecation and
# fit-failure warnings raised by the libraries imported above — consider
# narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")

In [None]:
# File-system locations used throughout the notebook (relative paths).
DATA_FILE_PATH = '../data/cleaned_dataset.csv'
MODEL_DIR = '../models'
LOG_DIR = '../logs'

# logging's file handler does not create missing parent directories, so make
# sure the log folder exists before basicConfig opens the file.
os.makedirs(LOG_DIR, exist_ok=True)

# Write log records to a file inside LOG_DIR.
logging.basicConfig(filename=f'{LOG_DIR}/training.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Candidate estimators and their hyperparameter search spaces.
# Every parameter name carries the 'clf__' prefix because the estimator is
# wrapped in a Pipeline whose single step is named 'clf'.
MODEL_CONFIGS: Dict[str, Dict[str, Any]] = {
    'RandomForest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'clf__n_estimators': [100, 300, 500],
            'clf__max_depth': [10, 30, 50],
            'clf__min_samples_split': [2, 5, 10],
            'clf__min_samples_leaf': [1, 2, 4],
            # 'auto' was an alias of 'sqrt' for classifiers; it was deprecated
            # in scikit-learn 1.1 and removed in 1.3, where it makes the fit
            # fail. Dropping it keeps the effective search space unchanged.
            'clf__max_features': ['sqrt', 'log2']
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': {
            'clf__n_estimators': [100, 300, 500],
            'clf__learning_rate': [0.01, 0.1, 0.001],
            'clf__max_depth': [3, 5, 7],
            'clf__subsample': [0.7, 0.8, 1.0]
        }
    }
}

In [None]:
def load_data(filepath: str) -> pd.DataFrame:
    """Read the text and label columns of the dataset from a CSV file.

    Args:
        filepath: Location of the CSV file to read.

    Returns:
        A DataFrame holding only the 'Clean_Text_LSTM' and 'Label' columns.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist; the failure is
            logged before being re-raised.
    """
    wanted_columns = ['Clean_Text_LSTM', 'Label']
    try:
        frame = pd.read_csv(filepath, usecols=wanted_columns)
    except FileNotFoundError:
        logging.error(f"Arquivo não encontrado: {filepath}")
        raise
    return frame

def prepare_data(df: pd.DataFrame) -> Tuple[List[np.ndarray], LabelEncoder]:
    """Tokenise the texts, encode the labels and split into train/test sets.

    The input DataFrame is not modified.

    Args:
        df: DataFrame with a 'Clean_Text_LSTM' text column and a 'Label' column.

    Returns:
        A 2-tuple ``(splits, encoder)`` where ``splits`` is the list
        ``[X_train, X_test, y_train, y_test]`` produced by ``train_test_split``
        (80/20, stratified on the labels, ``random_state=42``) and ``encoder``
        is the fitted LabelEncoder. Each X entry is a list of whitespace tokens.
    """
    # Derive a new Series instead of assigning back into `df`, so the caller's
    # frame keeps its original (possibly missing) values.
    clean_text = df['Clean_Text_LSTM'].fillna('').astype(str)
    texts = clean_text.map(str.split).values

    encoder = LabelEncoder()
    labels = encoder.fit_transform(df['Label'])

    # Show the label -> integer code mapping; codes follow the order of classes_.
    print("Classes codificadas e seus códigos:")
    for code, label in enumerate(encoder.classes_):
        print(f"Label '{label}' é codificado como {code}")
    print("")

    return train_test_split(texts, labels, test_size=0.2, random_state=42, stratify=labels), encoder

def train_word2vec(texts: List[List[str]], vector_size: int = 300, window: int = 5, min_count: int = 2) -> Word2Vec:
    """Train a gensim Word2Vec model on tokenised texts.

    Args:
        texts: Tokenised documents, one list of tokens per document.
        vector_size: Forwarded to Word2Vec as ``vector_size``.
        window: Forwarded to Word2Vec as ``window``.
        min_count: Forwarded to Word2Vec as ``min_count``.

    Returns:
        The trained Word2Vec model (always built with ``workers=4``).
    """
    hyperparameters = {
        'vector_size': vector_size,
        'window': window,
        'min_count': min_count,
        'workers': 4,
    }
    return Word2Vec(sentences=texts, **hyperparameters)

def text_to_word2vec(texts: List[List[str]], model: "Word2Vec") -> np.ndarray:
    """Embed each tokenised text as the mean of its known word vectors.

    Args:
        texts: Tokenised documents, one list of tokens per document.
        model: Object exposing ``wv`` (supports ``word in wv`` and ``wv[word]``)
            and ``vector_size``, e.g. a trained Word2Vec model.

    Returns:
        A 2-D array with one row per document and ``model.vector_size``
        columns. Documents with no token present in ``model.wv`` get an
        all-zero row. An empty ``texts`` yields shape ``(0, vector_size)``.
    """
    def get_vector(text):
        # Tokens missing from the embedding vocabulary are skipped.
        vectors = [model.wv[word] for word in text if word in model.wv]
        return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

    document_vectors = [get_vector(text) for text in texts]
    if not document_vectors:
        # np.array([]) would be 1-D with shape (0,); keep the feature axis so
        # downstream estimators still receive a 2-D matrix.
        return np.zeros((0, model.vector_size))
    return np.array(document_vectors)

def create_pipeline(model: Any) -> Pipeline:
    """Wrap an estimator in a one-step Pipeline.

    Args:
        model: The estimator to wrap.

    Returns:
        A Pipeline whose only step is named 'clf' and holds ``model``.
    """
    steps = [('clf', model)]
    return Pipeline(steps)

def additional_metrics(y_test: np.ndarray, predicted_probabilities: np.ndarray, predictions: np.ndarray) -> Dict[str, Any]:
    """Compute confusion matrix, weighted F1, Cohen's kappa and ROC-AUC.

    Args:
        y_test: True labels.
        predicted_probabilities: Class-probability matrix with one column per
            class.
        predictions: Predicted labels.

    Returns:
        Dict with keys 'confusion_matrix', 'f1_score', 'kappa_score' and
        'roc_auc_per_class'. Despite its name, 'roc_auc_per_class' holds a
        single weighted one-vs-rest ROC-AUC value, not one value per class.
    """
    conf_matrix = confusion_matrix(y_test, predictions)
    f1 = f1_score(y_test, predictions, average='weighted')
    kappa = cohen_kappa_score(y_test, predictions)

    scores = np.asarray(predicted_probabilities)
    if scores.ndim == 2 and scores.shape[1] == 2:
        # For binary targets roc_auc_score requires a 1-D score for the class
        # with the greater label; the two-column matrix would raise.
        scores = scores[:, 1]
    roc_auc = roc_auc_score(y_test, scores, multi_class='ovr', average='weighted')

    return {
        "confusion_matrix": conf_matrix,
        "f1_score": f1,
        "kappa_score": kappa,
        "roc_auc_per_class": roc_auc
    }

In [None]:
def evaluate_model(model: Pipeline, X_test: np.ndarray, y_test: np.ndarray) -> Dict[str, Any]:
    """Score a fitted model on held-out data.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Feature matrix of the evaluation set.
        y_test: True labels of the evaluation set.

    Returns:
        Dict with 'classification_report' (as a dict), 'accuracy' and
        'roc_auc', plus every key returned by ``additional_metrics``.

    Raises:
        NotFittedError: If the model has not been fitted (logged, re-raised).
        Exception: Any other evaluation error is logged and re-raised.
    """
    try:
        predictions = model.predict(X_test)
        predicted_probabilities = model.predict_proba(X_test)

        add_metrics = additional_metrics(y_test, predicted_probabilities, predictions)

        base_metrics = {
            'classification_report': classification_report(y_test, predictions, output_dict=True),
            'accuracy': accuracy_score(y_test, predictions),
            # additional_metrics already computes the weighted one-vs-rest
            # ROC-AUC with the same inputs; reuse it instead of scoring twice
            # so the two keys can never disagree.
            'roc_auc': add_metrics['roc_auc_per_class']
        }
        base_metrics.update(add_metrics)
        return base_metrics
    except NotFittedError:
        logging.error("Modelo não ajustado")
        raise
    except Exception as e:
        logging.error(f"Erro ao avaliar o modelo: {e}")
        raise

In [None]:
def plot_training_performance(cv_results, model_name):
    """Plot mean cross-validation scores (±1 std) per hyperparameter candidate.

    Args:
        cv_results: Mapping with 'mean_test_score' / 'std_test_score' and,
            optionally, 'mean_train_score' / 'std_train_score' (for example
            the ``cv_results_`` attribute of a fitted search object).
        model_name: Name shown in the figure title.

    The train curve is skipped when the train-score keys are absent, which is
    the case when the search was run without ``return_train_score=True``.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    curves = [
        ('test', 'Média da Pontuação de Teste'),
        ('train', 'Média da Pontuação de Treino'),
    ]
    for split, label in curves:
        mean_key = f'mean_{split}_score'
        std_key = f'std_{split}_score'
        if mean_key not in cv_results or std_key not in cv_results:
            continue

        # Convert to arrays so the ±std arithmetic also works for plain lists.
        mean = np.asarray(cv_results[mean_key], dtype=float)
        std = np.asarray(cv_results[std_key], dtype=float)
        candidates = np.arange(len(mean))

        (line,) = ax.plot(candidates, mean, label=label)
        # Tie the band colour to its line so the pairing is unambiguous.
        ax.fill_between(candidates, mean - std, mean + std, alpha=0.2, color=line.get_color())

    ax.set_title(f'Desempenho do Treinamento do {model_name}')
    ax.set_xlabel('Índice de Combinação de Hiperparâmetros')
    ax.set_ylabel('Pontuação')
    ax.legend()
    plt.show()

def plot_confusion_matrix(conf_matrix, classes):
    """Draw a confusion matrix as an annotated heatmap.

    Args:
        conf_matrix: Matrix of counts to display (formatted as integers).
        classes: Tick labels used on both axes.
    """
    heatmap_options = dict(
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=classes,
        yticklabels=classes,
    )

    plt.figure(figsize=(10, 7))
    sns.heatmap(conf_matrix, **heatmap_options)
    plt.title('Matriz de Confusão')
    plt.xlabel('Predito')
    plt.ylabel('Real')
    plt.show()

In [None]:
def main() -> None:
    """Run the end-to-end training workflow.

    Loads and splits the dataset, trains Word2Vec on the training texts,
    embeds both splits, tunes every entry of MODEL_CONFIGS with a randomized
    search, evaluates each winner on the held-out split, saves all artifacts
    under MODEL_DIR and plots the confusion matrix of the most accurate model.
    """
    df = load_data(DATA_FILE_PATH)
    (X_train, X_test, y_train, y_test), label_encoder = prepare_data(df)

    # joblib.dump does not create missing directories.
    os.makedirs(MODEL_DIR, exist_ok=True)
    joblib.dump(label_encoder, f'{MODEL_DIR}/label_encoder.joblib')

    word2vec_model = train_word2vec(X_train)
    # The classifiers are trained on these embeddings, so the embedding model
    # must be saved too or the classifiers cannot be applied to new text.
    word2vec_path = f'{MODEL_DIR}/word2vec.model'
    word2vec_model.save(word2vec_path)
    logging.info(f"Modelo Word2Vec salvo em {word2vec_path}")

    X_train_w2v = text_to_word2vec(X_train, word2vec_model)
    X_test_w2v = text_to_word2vec(X_test, word2vec_model)

    model_performances = []
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for name, config in tqdm(MODEL_CONFIGS.items(), desc="Treinando modelos"):
        pipeline = create_pipeline(config['model'])
        # random_state fixes which hyperparameter candidates get sampled, so
        # re-running the notebook explores the same combinations.
        search = RandomizedSearchCV(
            pipeline,
            config['params'],
            cv=kfold,
            scoring='accuracy',
            n_jobs=-1,
            verbose=2,
            return_train_score=True,
            random_state=42,
        )
        search.fit(X_train_w2v, y_train)

        # Plot training performance
        plot_training_performance(search.cv_results_, name)

        metrics = evaluate_model(search.best_estimator_, X_test_w2v, y_test)
        logging.info(f"Relatório de classificação para {name}:\n{metrics['classification_report']}")
        model_performances.append(
            (name, metrics['accuracy'], metrics['roc_auc'], search.best_estimator_, metrics['confusion_matrix'])
        )
        logging.info(f"Melhores hiperparâmetros para {name}: {search.best_params_}")
        logging.info(f"Acurácia para {name}: {metrics['accuracy']:.4f}")
        logging.info(f"ROC-AUC para {name}: {metrics['roc_auc']:.4f}")
        logging.info(f"Melhor modelo para {name}: {search.best_estimator_}\n")

    if not model_performances:
        # Nothing was trained (empty MODEL_CONFIGS); there is no best model.
        logging.warning("Nenhum modelo foi treinado")
        return

    # Rank by test accuracy, best first; the rank becomes part of the file name.
    model_performances.sort(key=lambda x: x[1], reverse=True)
    for i, (name, acc, auc, model, _) in enumerate(model_performances, 1):
        model_path = f'{MODEL_DIR}/top_{i}_{name}_model.joblib'
        joblib.dump(model, model_path)
        logging.info(f"Modelo {name} salvo com acurácia de {acc:.4f} em {model_path}")

    # Plot the confusion matrix of the best model; evaluate_model already
    # computed it on the same test split, so no second predict pass is needed.
    best_conf_matrix = model_performances[0][4]
    plot_confusion_matrix(best_conf_matrix, label_encoder.classes_)

In [None]:
# Start the training workflow only when this code runs as the main module
# (not when it is imported).
if __name__ == "__main__":
    main()