In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import time
import pickle
import logging
import re

from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from tensorflow.keras.callbacks import EarlyStopping
# Using the compatible Adam optimizer:
from tensorflow.keras.optimizers import Adam 
from sklearn.utils import resample 


# Silence tokenizer warnings: only ERROR and above are emitted by this logger.
_tokenizer_logger = logging.getLogger("transformers.tokenization_utils_base")
_tokenizer_logger.setLevel(logging.ERROR)

# --- Helper function definitions (confidence intervals and evaluation) ---
def ci95(data):
    """Return a 95% confidence interval for the mean of ``data``.

    Uses the normal approximation ``mean +/- 1.96 * SEM``. The standard error
    of the mean is derived from the *sample* standard deviation (``ddof=1``),
    which is the unbiased choice when ``data`` is a small sample such as one
    score per cross-validation fold.

    Args:
        data: One-dimensional sequence of numeric observations.

    Returns:
        A ``(lower, upper)`` tuple. Both bounds are NaN when fewer than two
        observations are given, because the spread cannot be estimated.
    """
    data = np.asarray(data, dtype=float)
    n_obs = data.size
    if n_obs < 2:
        # No dispersion estimate is possible; avoid a misleading zero-width
        # interval (n == 1) and NumPy's empty-slice warnings (n == 0).
        return (np.nan, np.nan)

    mean = np.mean(data)
    std_err = np.std(data, ddof=1) / np.sqrt(n_obs)
    # 1.96 is the two-sided 95% z-score of the standard normal distribution.
    half_width = 1.96 * std_err
    return (mean - half_width, mean + half_width)

def bootstrap_ci(y_true, y_pred, metric_func, n_iterations=1000, alpha=0.95,
                 random_state=None):
    """Estimate a percentile-bootstrap confidence interval for a metric.

    The paired samples ``(y_true[i], y_pred[i])`` are resampled with
    replacement ``n_iterations`` times and ``metric_func`` is evaluated on
    each resample; the interval is read from the percentiles of those scores.

    Args:
        y_true: Ground-truth labels (list, NumPy array or pandas Series).
        y_pred: Predicted labels, same length as ``y_true``.
        metric_func: Callable invoked as
            ``metric_func(y_true, y_pred, average='weighted', zero_division=0)``.
        n_iterations: Number of bootstrap resamples.
        alpha: Confidence level of the interval (0.95 gives a 95% interval).
        random_state: Optional seed for reproducible resampling. When ``None``
            the global NumPy random state is used.

    Returns:
        A ``(lower, upper)`` tuple, or ``(nan, nan)`` when the input is empty
        or no resample produced a score.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in length.
    """
    # Convert up front so indexing below is positional: a plain list cannot be
    # indexed by an integer array, and a pandas Series would look up labels.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_size = len(y_true)

    if len(y_pred) != n_size:
        raise ValueError(
            f"y_true and y_pred must have the same length "
            f"({n_size} != {len(y_pred)})"
        )
    if n_size == 0:
        # randint(0, 0, ...) would raise; there is nothing to resample.
        return (np.nan, np.nan)

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    scores = []
    for _ in range(n_iterations):
        indices = rng.randint(0, n_size, n_size)
        try:
            score = metric_func(
                y_true[indices], y_pred[indices],
                average='weighted', zero_division=0,
            )
        except ValueError:
            # Skip resamples on which the metric cannot be computed.
            continue
        scores.append(score)

    if not scores:
        return (np.nan, np.nan)

    # Two-sided interval: cut (1 - alpha) / 2 of the mass from each tail.
    lower_pct = ((1.0 - alpha) / 2.0) * 100
    upper_pct = (alpha + ((1.0 - alpha) / 2.0)) * 100
    lower, upper = np.percentile(np.array(scores), [lower_pct, upper_pct])

    return (lower, upper)

def evaluate_roberta_model(model_name, y_test, y_pred, pred_time_ms, train_time, cv_scores, test_result):
    """Consolidate cross-validation and final-test results into one row.

    Args:
        model_name: Label stored in the ``'Model'`` column.
        y_test: Ground-truth labels of the test set.
        y_pred: Predicted labels for the test set.
        pred_time_ms: Total prediction time in milliseconds; it is divided by
            ``len(y_test)`` to report a per-sample figure.
        train_time: Training time in seconds.
        cv_scores: Mapping from metric name to a list of per-fold scores.
            Must contain the keys ``'accuracy'`` and ``'f1_weighted'``.
        test_result: DataFrame of previously collected results.

    Returns:
        A new DataFrame: ``test_result`` with one appended row (index reset).
    """

    def _pct_interval(bounds):
        # Render a (lower, upper) fraction pair as a percentage string.
        return f"({round(bounds[0]*100, 2)}, {round(bounds[1]*100, 2)})"

    # 1. Final test-set metrics. Only accuracy and weighted F1 are reported,
    #    so precision/recall are no longer computed and discarded.
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # 2. Per-metric summary of the CV fold scores.
    cv_mean = {k: np.mean(v) for k, v in cv_scores.items()}
    cv_std = {k: np.std(v) for k, v in cv_scores.items()}
    cv_ci = {k: ci95(v) for k, v in cv_scores.items()}  # ci95 is defined in this file

    final_results = {
        'Model': model_name,
        'Train Time (s)': round(train_time, 2),
        'Predict Time (ms/sample)': round(pred_time_ms / len(y_test), 2),
        'Accuracy (Test)': round(acc * 100, 4),
        'F1 Score (Test)': round(f1 * 100, 4),

        'CV_Acc_Mean': round(cv_mean['accuracy'] * 100, 4),
        'CV_Acc_Std': round(cv_std['accuracy'] * 100, 4),
        'CV_Acc_CI95': _pct_interval(cv_ci['accuracy']),

        'CV_F1_Mean': round(cv_mean['f1_weighted'] * 100, 4),
        'CV_F1_Std': round(cv_std['f1_weighted'] * 100, 4),
        'CV_F1_CI95': _pct_interval(cv_ci['f1_weighted']),
    }

    # Append the new row to the accumulated results.
    return pd.concat([test_result, pd.DataFrame([final_results])], ignore_index=True)


# --------------------------------------------------------------------------------
PATH_DATA = './'

## --- 1. Load data and fold indices from disk ---
print("--- 1. Carregando Dados do Disco ---")
try:
    # Resampled training data (used for CV and for the final training run).
    df_train_resampled = pd.read_csv(f'{PATH_DATA}train_resampled_full.csv')
    X_train_resampled = df_train_resampled['content_corrected']
    y_train_resampled = df_train_resampled['target']

    # Original test data (used for the final evaluation).
    df_test_original = pd.read_csv(f'{PATH_DATA}test_original_full.csv')
    X_test_original = df_test_original['content_corrected']
    y_test_original = df_test_original['target']

    # K-fold indices computed on the resampled training data.
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only load an index file that was produced by a trusted run.
    file_path_kfolds = f'{PATH_DATA}kfolds_resampled_indices.pkl'
    with open(file_path_kfolds, 'rb') as f:
        loaded_kfolds_indices = pickle.load(f)

    print("‚úÖ Todos os dados e √≠ndices carregados com sucesso.")

except FileNotFoundError as e:
    print(f"‚ùå ERRO: Arquivo n√£o encontrado. Certifique-se de que os dados foram salvos na pasta './'. Detalhe: {e}")
    # Abort with a non-zero status. `exit()` is a helper injected for
    # interactive sessions and reports success; SystemExit is a builtin and
    # lets callers/CI see that the run failed.
    raise SystemExit(1) from e

## --- 2. Global configuration and variable preparation ---
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
num_classes = len(np.unique(y_train_resampled))
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
PATIENCE = 10
K_FOLDS = len(loaded_kfolds_indices)

# Containers for the per-fold CV metrics.
cv_scores = {
    "accuracy": [], "precision_weighted": [], "recall_weighted": [], "f1_weighted": []
}
total_cv_train_time = 0.0
# Accumulator DataFrame for consolidated results.
test_result = pd.DataFrame()

# --------------------------------------------------------------------------------
# CV data (resampled set).
# read_csv parses empty / NA-like text cells as float NaN, which the tokenizer
# rejects. Fill them with '' and force str instead of dropping rows, so the
# positions referenced by the stored fold indices stay aligned.
X_CV_content = X_train_resampled.fillna('').astype(str).tolist()  # list of strings for tokenization
y_CV = y_train_resampled.values  # NumPy array of labels

# Test data (original set), sanitized the same way.
X_test_content = X_test_original.fillna('').astype(str).tolist()
y_test = y_test_original.values

print(f"Iniciando CV de {K_FOLDS} folds em {len(X_CV_content)} amostras reamostradas.")
# --------------------------------------------------------------------------------

## --- 3. K-fold cross-validation loop ---
print("\n--- 3. Iniciando a valida√ß√£o cruzada nos dados reamostrados ---")

# Tokenize the whole CV corpus once. Every text is padded/truncated to the same
# fixed length independently of the others, so slicing these arrays per fold
# yields the same inputs as re-tokenizing each fold, without the repeated work.
cv_encodings = tokenizer(X_CV_content, max_length=128, padding='max_length', truncation=True, return_tensors='tf')
X_CV_ids = cv_encodings['input_ids'].numpy()
X_CV_mask = cv_encodings['attention_mask'].numpy()

for fold_num, (train_index, val_index) in enumerate(loaded_kfolds_indices):
    print(f"\n--- Fold {fold_num + 1}/{K_FOLDS} ---")

    # A. Select the pre-tokenized rows and labels of the current fold.
    X_train_fold_ids = X_CV_ids[train_index]
    X_train_fold_mask = X_CV_mask[train_index]
    y_train_fold = y_CV[train_index]

    X_val_fold_ids = X_CV_ids[val_index]
    X_val_fold_mask = X_CV_mask[val_index]
    y_val_fold = y_CV[val_index]

    # B. Re-instantiate and compile the model so each fold starts from the
    #    pretrained weights.
    tf.keras.backend.clear_session()
    model = TFRobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)

    # tf.keras.optimizers.Adam is referenced explicitly (the original author
    # notes this avoids an AttributeError).
    optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

    # The model outputs raw logits, hence from_logits=True.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy')]
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    # C. Build the fold datasets (only the training data is shuffled).
    train_fold_dataset = tf.data.Dataset.from_tensor_slices(
        ({'input_ids': X_train_fold_ids, 'attention_mask': X_train_fold_mask}, y_train_fold)
    ).shuffle(len(y_train_fold)).batch(BATCH_SIZE)

    val_fold_dataset = tf.data.Dataset.from_tensor_slices(
        ({'input_ids': X_val_fold_ids, 'attention_mask': X_val_fold_mask}, y_val_fold)
    ).batch(BATCH_SIZE)

    # D. Fine-tune on the fold; early stopping on validation accuracy decides
    #    the effective number of epochs and restores the best weights.
    early_stop_fold = EarlyStopping(monitor='val_accuracy', mode='max', patience=PATIENCE, restore_best_weights=True, verbose=0)

    start_time_fold = time.time()
    model.fit(train_fold_dataset, validation_data=val_fold_dataset, epochs=1000, callbacks=[early_stop_fold], verbose=0)
    train_time_fold = time.time() - start_time_fold
    total_cv_train_time += train_time_fold

    # E. Evaluate on the fold's validation split.
    y_val_pred_logits = model.predict(val_fold_dataset, verbose=0).logits
    y_val_pred = np.argmax(y_val_pred_logits, axis=1)

    acc = accuracy_score(y_val_fold, y_val_pred)
    prec = precision_score(y_val_fold, y_val_pred, average='weighted', zero_division=0)
    rec = recall_score(y_val_fold, y_val_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_val_fold, y_val_pred, average='weighted', zero_division=0)

    cv_scores["accuracy"].append(acc)
    cv_scores["precision_weighted"].append(prec)
    cv_scores["recall_weighted"].append(rec)
    cv_scores["f1_weighted"].append(f1)

    print(f"M√©tricas do Fold {fold_num+1} (Tempo: {train_time_fold:.2f} s): Accuracy={acc:.4f}, F1={f1:.4f}")

print("\n‚úÖ Cross-validation Conclu√≠da.")
# Guard against an empty fold list so the summary does not divide by zero.
average_cv_train_time = total_cv_train_time / K_FOLDS if K_FOLDS else 0.0

## --- 4. Final model training (X_train_resampled + y_train_resampled) ---

print("\n--- 4. Re-treinando Modelo Final na Base Reamostrada Completa ---")

# scikit-learn is already a dependency of this file; imported here so this
# section is self-contained.
from sklearn.model_selection import train_test_split

# A. Tokenize the full training corpus and the test corpus.
final_train_encodings = tokenizer(X_CV_content, max_length=128, padding='max_length', truncation=True, return_tensors='tf')
X_final_train_ids = final_train_encodings['input_ids'].numpy()
X_final_train_mask = final_train_encodings['attention_mask'].numpy()
y_final_train = y_CV

test_encodings = tokenizer(X_test_content, max_length=128, padding='max_length', truncation=True, return_tensors='tf')
X_final_test_ids = test_encodings['input_ids'].numpy()
X_final_test_mask = test_encodings['attention_mask'].numpy()
y_final_test = y_test

# B. Recreate and compile the final model.
tf.keras.backend.clear_session()
final_model = TFRobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
# Fresh loss/metric objects: the ones built inside the CV loop belong to a
# session that was just cleared (and do not exist if no fold was run).
final_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
)

# C. Build the datasets.
# Early stopping must not look at the test set: with restore_best_weights the
# checkpoint would be selected on the very data used for the final report,
# inflating the reported test metrics. A stratified slice of the training data
# is held out for early stopping instead.
# NOTE(review): if the resampled set contains duplicated rows, the hold-out may
# share samples with the fit split — confirm how the resampling was done.
FINAL_VAL_FRACTION = 0.1
fit_idx, holdout_idx = train_test_split(
    np.arange(len(y_final_train)),
    test_size=FINAL_VAL_FRACTION,
    stratify=y_final_train,
    random_state=42,
)

final_train_dataset = tf.data.Dataset.from_tensor_slices(
    ({'input_ids': X_final_train_ids[fit_idx], 'attention_mask': X_final_train_mask[fit_idx]}, y_final_train[fit_idx])
).shuffle(len(fit_idx)).batch(BATCH_SIZE)

final_val_dataset = tf.data.Dataset.from_tensor_slices(
    ({'input_ids': X_final_train_ids[holdout_idx], 'attention_mask': X_final_train_mask[holdout_idx]}, y_final_train[holdout_idx])
).batch(BATCH_SIZE)

# The test dataset is only consumed by the evaluation step further below.
final_test_dataset = tf.data.Dataset.from_tensor_slices(
    ({'input_ids': X_final_test_ids, 'attention_mask': X_final_test_mask}, y_final_test)
).batch(BATCH_SIZE)

# D. Train with early stopping on the held-out validation slice.
early_stop_final = EarlyStopping(
    monitor='val_accuracy', mode='max', patience=PATIENCE, restore_best_weights=True, verbose=1)

start_time_final_train = time.time()
final_model.fit(
    final_train_dataset,
    validation_data=final_val_dataset,
    epochs=1000,
    callbacks=[early_stop_final],
    verbose=1)
final_train_time = time.time() - start_time_final_train

## --- 5. Final evaluation (original test set) ---
print("\n--- 5. Predi√ß√£o e Avalia√ß√£o no Conjunto de Teste Original ---")

# Time one full inference pass over the test dataset, in milliseconds.
start_time_predict = time.time()
predictions = final_model.predict(final_test_dataset, verbose=0)
elapsed_predict_s = time.time() - start_time_predict
pred_time_ms = elapsed_predict_s * 1000

# Predicted class = index of the largest logit in each row.
y_pred_logits = predictions.logits
y_pred_test = np.argmax(y_pred_logits, axis=1)

## --- 6. Consolidate the metrics and display them ---
model_display_name = f'RoBERTa (base) {K_FOLDS}-fold CV (Reamostrado)'

evaluation_kwargs = dict(
    model_name=model_display_name,
    y_test=y_final_test,
    y_pred=y_pred_test,
    pred_time_ms=pred_time_ms,
    train_time=final_train_time,
    cv_scores=cv_scores,
    test_result=test_result,
)
test_result = evaluate_roberta_model(**evaluation_kwargs)

print("\n--- Resultados Finais Consolidados (M√©dia CV, Desvio Padr√£o e CI 95%) ---")
# Transposed so each metric is printed on its own line.
print(test_result.T)

## --- 7. Persist the final model and tokenizer ---
save_directory = "./roberta_classifier_final_cv_resampled"
# Both artifacts go to the same directory: model first, then tokenizer.
for artifact in (final_model, tokenizer):
    artifact.save_pretrained(save_directory)
print(f"\nModelo final e tokenizer salvos em {save_directory}")

In [None]:

# Install TensorFlow, Keras and Transformers at fixed, explicitly pinned versions.
!pip install "tensorflow==2.15.0" "keras==2.15.0" "transformers==4.35.2"

In [None]:
# -y answers pip's confirmation prompt automatically; a notebook `!` command
# has no interactive stdin to answer it.
!pip uninstall -y tf-keras
!pip uninstall -y keras
!pip uninstall -y tensorflow-addons