In [None]:
# Modelo entrenado en Google Colab
#  Montar Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Verifica que los archivos estén disponibles
ls "/content/drive/My Drive/dataset"

In [None]:
pip install transformers

In [None]:
# Biblioteca para crear barras de progreso
pip install tqdm

In [None]:
# Notebook para entrenar modelo BERT de toxicidad en Google Colab

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import time
import os
from tqdm import tqdm

# Use the GPU when one is available; otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Usando dispositivo: {DEVICE}")

# File locations.
# Adjust DRIVE_PATH to wherever the files were uploaded in Google Drive.
DRIVE_PATH = "/content/drive/My Drive/dataset"  # includes the "dataset" folder
TRAIN_PATH = f"{DRIVE_PATH}/train.csv"
TEST_PATH = f"{DRIVE_PATH}/test.csv"
MODEL_PATH = f"{DRIVE_PATH}/bert_toxicity_model"

# Configuration
MAX_LEN = 128          # token length every comment is padded/truncated to
BATCH_SIZE = 32        # raised to take advantage of the GPU
EPOCHS = 3
LEARNING_RATE = 2e-5
SEED = 42              # single seed shared by every source of randomness below

# Seed NumPy and PyTorch so that weight initialisation and DataLoader shuffling
# are repeatable across "Restart & Run All"; previously only the split was seeded.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Load data
print("Cargando datos...")
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

# Target columns: one binary label per toxicity category.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Hold out 10% of the training data for validation.
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=SEED)

# Custom Dataset class
class ToxicityDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Args:
        texts: Sequence of comments (list, NumPy array or pandas Series).
        labels: Optional per-comment label rows (list of lists, NumPy array or
            pandas DataFrame). When omitted, items carry no ``'labels'`` entry.
        tokenizer: Callable tokenizer accepting the keyword arguments used in
            ``__getitem__`` and returning ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` tensors.
        max_len: Length every sequence is padded/truncated to.

    Raises:
        ValueError: If ``labels`` is given and its length differs from ``texts``.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_len=128):
        # Convert to plain arrays so that ``[idx]`` is always positional.
        # A pandas Series/DataFrame indexes by label, which breaks once the
        # index has been shuffled (e.g. after train_test_split).
        self.texts = np.asarray(texts, dtype=object)
        self.labels = None if labels is None else np.asarray(labels)
        if self.labels is not None and len(self.labels) != len(self.texts):
            raise ValueError(
                f"texts and labels differ in length: "
                f"{len(self.texts)} != {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of comments."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded tensors (and labels, if any) for position ``idx``."""
        # str() keeps non-string cells from crashing the tokenizer.
        text = str(self.texts[idx])

        # Calling the tokenizer directly replaces the deprecated encode_plus().
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' adds a leading batch dimension; flatten drops it
        # so the DataLoader can stack items itself.
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten()
        }

        if self.labels is not None:
            # Float labels, since they are fed to the model's loss as multi-label targets.
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)

        return item

# Load the pretrained tokenizer and the multi-label classification model.
print("Cargando tokenizador y modelo BERT...")
PRETRAINED_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_NAME,
    num_labels=len(label_columns),
    problem_type="multi_label_classification",
)
model.to(DEVICE)


def _build_dataset(frame, with_labels=True):
    """Wrap a DataFrame's comments (and, optionally, its label columns) in a ToxicityDataset."""
    label_values = frame[label_columns].values if with_labels else None
    return ToxicityDataset(
        texts=frame['comment_text'].values,
        labels=label_values,
        tokenizer=tokenizer,
        max_len=MAX_LEN,
    )


# Build the datasets; the test split is built without labels.
print("Preparando datasets...")
train_dataset = _build_dataset(train_df)
val_dataset = _build_dataset(val_df)
test_dataset = _build_dataset(test_df, with_labels=False)

# Build the dataloaders; only the training loader shuffles.
loader_kwargs = {'batch_size': BATCH_SIZE}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, **loader_kwargs)
test_loader = DataLoader(test_dataset, **loader_kwargs)

# Optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Training function
def train_epoch(model, data_loader, optimizer, device):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Model that accepts ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels`` and whose output exposes ``.loss``.
        data_loader: Iterable of batches (dicts holding those four tensors).
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        The mean of the per-batch loss values.
    """
    model.train()
    batch_losses = []
    tensor_keys = ('input_ids', 'attention_mask', 'token_type_ids', 'labels')

    bar = tqdm(data_loader, desc="Entrenando", leave=True)
    for step_batch in bar:
        optimizer.zero_grad()

        # Move exactly the tensors the model consumes onto the target device.
        model_inputs = {key: step_batch[key].to(device) for key in tensor_keys}
        loss = model(**model_inputs).loss

        # Record the scalar loss and show it on the progress bar.
        loss_value = loss.item()
        batch_losses.append(loss_value)
        bar.set_postfix({"loss": f"{loss_value:.4f}"})

        loss.backward()
        optimizer.step()

    return np.mean(batch_losses)

# Evaluation function
def evaluate(model, data_loader, device):
    """Predict per-label probabilities and, when labels are present, report AUC-ROC.

    Args:
        model: Model that accepts ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` and whose output exposes ``.logits``.
        data_loader: Iterable of batches; a ``'labels'`` entry is optional.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        ``(mean_auc, predictions)``. ``predictions`` holds the sigmoid
        probabilities, one row per sample and one column per entry of the
        module-level ``label_columns``. ``mean_auc`` is ``None`` when the
        loader yields no labels; otherwise it is the mean AUC-ROC over the
        labels for which the metric is defined (NaN if it is defined for none).
    """
    model.eval()
    prob_batches = []
    label_batches = []

    progress_bar = tqdm(data_loader, desc="Evaluando", leave=True)

    with torch.no_grad():
        for batch in progress_bar:
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                token_type_ids=batch['token_type_ids'].to(device)
            )

            # Independent sigmoid per label: this is multi-label, not softmax.
            prob_batches.append(torch.sigmoid(outputs.logits).cpu().numpy())

            if 'labels' in batch:
                label_batches.append(batch['labels'].cpu().numpy())

    if prob_batches:
        predictions = np.concatenate(prob_batches)
    else:
        # Keep the result 2-D for an empty loader so callers can still slice columns.
        predictions = np.empty((0, len(label_columns)), dtype=np.float32)

    if not label_batches:
        return None, predictions

    actual_labels = np.concatenate(label_batches)
    auc_scores = []

    for i, label_name in enumerate(label_columns):
        y_true = actual_labels[:, i]

        # AUC-ROC is undefined when only one class is present. Depending on the
        # scikit-learn version roc_auc_score raises or returns NaN, so skip such
        # labels explicitly rather than crash or poison the mean.
        if np.unique(y_true).size < 2:
            print(f"AUC-ROC para {label_name}: no definido (una sola clase presente)")
            continue

        auc = roc_auc_score(y_true, predictions[:, i])
        auc_scores.append(auc)
        print(f"AUC-ROC para {label_name}: {auc:.4f}")

    mean_auc = np.mean(auc_scores) if auc_scores else float('nan')
    print(f"AUC-ROC promedio: {mean_auc:.4f}")
    return mean_auc, predictions

# Training
print("Iniciando entrenamiento...")
best_auc = 0
best_model_saved = False  # True once this run has written a checkpoint to MODEL_PATH

for epoch in range(EPOCHS):
    print(f"Época {epoch + 1}/{EPOCHS}")
    start_time = time.time()

    train_loss = train_epoch(model, train_loader, optimizer, DEVICE)
    print(f"Pérdida de entrenamiento: {train_loss:.4f}")

    print("Evaluando...")
    val_auc, _ = evaluate(model, val_loader, DEVICE)

    print(f"Tiempo de época: {time.time() - start_time:.2f} segundos")

    # Keep only the best checkpoint; the None guard covers a loader without labels.
    if val_auc is not None and val_auc > best_auc:
        best_auc = val_auc
        # Create the directory if it does not exist yet.
        os.makedirs(MODEL_PATH, exist_ok=True)
        # Save model and tokenizer side by side so the folder is self-contained.
        model.save_pretrained(MODEL_PATH)
        tokenizer.save_pretrained(MODEL_PATH)
        best_model_saved = True
        print(f"Modelo guardado con AUC: {val_auc:.4f}")

# Predict with the best checkpoint, not with whatever weights the last epoch
# left in memory. Reload only if this run actually saved one, so that a stale
# checkpoint from an earlier run is never picked up.
if best_model_saved:
    print(f"Cargando el mejor modelo guardado (AUC: {best_auc:.4f})...")
    model = BertForSequenceClassification.from_pretrained(MODEL_PATH)
    model.to(DEVICE)

# Generate predictions for the test set.
print("Generando predicciones para el conjunto de prueba...")
_, test_predictions = evaluate(model, test_loader, DEVICE)

# Build the submission frame: the id column followed by one probability column per label.
submission_df = pd.DataFrame({
    'id': test_df['id']
})

for i, label in enumerate(label_columns):
    submission_df[label] = test_predictions[:, i]

# Save the predictions.
submission_df.to_csv(f"{DRIVE_PATH}/bert_submission.csv", index=False)
print("Predicciones guardadas en bert_submission.csv")

# Download the predictions (optional). Imported here because it is Colab-only.
from google.colab import files
files.download(f"{DRIVE_PATH}/bert_submission.csv")