# CLASIFICADOR DE CORREO ELECTRÓNICO SPAM MEDIANTE FINE-TUNING DE UN MODELO PREENTRENADO


In [None]:
# Authenticate with the Hugging Face Hub; the CLI prompts interactively for an access token.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `hf auth whoami` to get more information or `hf auth logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The tok

# 🛠️ Paso 1: Instalar librerías necesarias

In [None]:
!pip install datasets==3.5.0 transformers==4.48.3 evaluate==0.4.5



# Cargar un dataset

In [None]:
from datasets import load_dataset

# Fetch the SMS spam dataset from the Hugging Face Hub by its repository id.
ds = load_dataset(path="ucirvine/sms_spam")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Display the loaded DatasetDict (splits, column names and row counts).
ds

DatasetDict({
    train: Dataset({
        features: ['sms', 'label'],
        num_rows: 5574
    })
})

## DIVIDIR EL DATASET EN TRAIN Y TEST

In [None]:
# Hold out 20% of the rows for evaluation.
# A fixed seed makes the split (and every metric computed on it) reproducible;
# stratifying on "label" keeps the ham/spam ratio the same in both subsets.
# NOTE: this rebinds `ds`, so re-run the loading cell before re-running this one.
ds = ds["train"].train_test_split(test_size=0.2, seed=42, stratify_by_column="label")
ds

DatasetDict({
    train: Dataset({
        features: ['sms', 'label'],
        num_rows: 4459
    })
    test: Dataset({
        features: ['sms', 'label'],
        num_rows: 1115
    })
})

## TOKENIZACIÓN

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    """Tokenize the ``sms`` text column of a batch of examples.

    Texts longer than the tokenizer's maximum length are truncated. No padding
    is applied here: the ``DataCollatorWithPadding`` used for training pads
    each batch to the length of its longest example, which avoids padding
    every short SMS up to the model's full maximum length.

    Args:
        examples: Batch of dataset rows containing an ``"sms"`` column.

    Returns:
        The tokenizer output for the batch (unpadded).
    """
    return tokenizer(examples["sms"], truncation=True)


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [None]:
# Keep direct handles on the raw (untokenized) splits.
ds_train, ds_test = ds["train"], ds["test"]

# Run the tokenizer over each split in batches, train first and then test.
tokenized_train, tokenized_test = (
    ds[split_name].map(tokenize_function, batched=True)
    for split_name in ("train", "test")
)

Map:   0%|          | 0/4459 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

## DATA LOADERS

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of a batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Expose only the model inputs and the target column, formatted as torch tensors.
for tokenized_split in (tokenized_train, tokenized_test):
    tokenized_split.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Cargar el modelo para clasificación

In [None]:
from transformers import AutoModelForSequenceClassification

# Load BERT with a new classification head for binary classification (2 classes).
# id2label / label2id are stored in the model config, so the saved and published
# model reports "ham" / "spam" instead of the generic LABEL_0 / LABEL_1.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    id2label={0: "ham", 1: "spam"},
    label2id={"ham": 0, "spam": 1},
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Parámetros de entrenamiento

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",             # Folder where checkpoints and results are written.
    eval_strategy="epoch",              # Evaluate at the end of every epoch (replaces the deprecated `evaluation_strategy`).
    save_strategy="epoch",              # Save a checkpoint at the end of every epoch (must match eval_strategy for load_best_model_at_end).
    logging_dir="./logs",               # Directory for the training logs.
    per_device_train_batch_size=8,      # Training batch size per device.
    per_device_eval_batch_size=8,       # Evaluation batch size per device.
    num_train_epochs=3,                 # Total number of passes over the training data.
    weight_decay=0.01,                  # Weight-decay regularisation to limit overfitting.
    logging_steps=10,                   # Log the training loss every 10 steps.
    load_best_model_at_end=True,        # Reload the best evaluated checkpoint once training finishes.
    save_total_limit=2                  # Keep at most 2 checkpoints on disk to save space.
)



# Función de métricas

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each row
            is the arg-max of its logits along axis 1.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``;
        precision, recall and F1 use ``average="binary"``.
    """
    scores, targets = eval_pred
    predicted = np.argmax(scores, axis=1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        targets, predicted, average="binary"
    )
    return {
        "accuracy": accuracy_score(targets, predicted),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Entrenar el modelo con Trainer

In [None]:
from transformers import Trainer

# Wire the model, data, collator and metrics together.
# `processing_class` replaces the deprecated `tokenizer` argument of Trainer;
# the tokenizer is still saved alongside the model checkpoints.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start fine-tuning; returns a TrainOutput with the global step and training loss.
trainer.train()

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmax-ponce[0m ([33mmax-ponce-tecsup[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0411,0.043126,0.991928,0.981013,0.962733,0.971787
2,0.0917,0.060628,0.989238,0.962733,0.962733,0.962733
3,0.0001,0.072707,0.990135,0.974684,0.956522,0.965517


TrainOutput(global_step=1674, training_loss=0.038466544096627114, metrics={'train_runtime': 1356.5971, 'train_samples_per_second': 9.861, 'train_steps_per_second': 1.234, 'total_flos': 3519636587550720.0, 'train_loss': 0.038466544096627114, 'epoch': 3.0})

# 📈 Paso 8: Evaluar el modelo

In [None]:
# Evaluate the model held by the trainer on the evaluation dataset it was built with
# and display the resulting metrics dictionary.
trainer.evaluate()

{'eval_loss': 0.04312586411833763,
 'eval_accuracy': 0.9919282511210762,
 'eval_precision': 0.9810126582278481,
 'eval_recall': 0.9627329192546584,
 'eval_f1': 0.9717868338557993,
 'eval_runtime': 30.9796,
 'eval_samples_per_second': 35.991,
 'eval_steps_per_second': 4.519,
 'epoch': 3.0}

# PROBAR CON UN CORREO NUEVO SI ES SPAM O NO


In [None]:
import torch

# Define a new email to test
new_email = "Congratulations! You've won a free vacation. Click here to claim."

# Use the fine-tuned model held by the trainer and switch it to inference mode
# so dropout is disabled and the prediction is deterministic.
inference_model = trainer.model
inference_model.eval()

# Tokenize the new email; a single sequence needs no padding.
tokenized_email = tokenizer(new_email, truncation=True, return_tensors="pt")

# Move the inputs to the device the model actually lives on
# (rather than guessing from CUDA availability).
device = inference_model.device
tokenized_email = {k: v.to(device) for k, v in tokenized_email.items()}

# Gradients are not needed for prediction; skipping them saves memory and time.
with torch.no_grad():
    outputs = inference_model(**tokenized_email)

# Get the predicted class (0 for ham, 1 for spam)
predicted_class = torch.argmax(outputs.logits, dim=1).item()

# Map the predicted class to a label
labels = ["ham", "spam"]
predicted_label = labels[predicted_class]

print(f"The email is classified as: {predicted_label}")

The email is classified as: spam


# Guardar el modelo para publicarlo en Hugging Face

In [None]:
# Local directory that receives both the model weights/config and the tokenizer files.
# NOTE(review): the name says "imdb" although the model is trained on SMS spam — confirm the intended name.
export_dir = "mponcetf-bert-imdb-finetuned"

trainer.save_model(export_dir)
# Last expression of the cell: displays the tuple of tokenizer file paths that were written.
tokenizer.save_pretrained(export_dir)

('mponcetf-bert-imdb-finetuned/tokenizer_config.json',
 'mponcetf-bert-imdb-finetuned/special_tokens_map.json',
 'mponcetf-bert-imdb-finetuned/vocab.txt',
 'mponcetf-bert-imdb-finetuned/added_tokens.json',
 'mponcetf-bert-imdb-finetuned/tokenizer.json')

In [None]:
# Authenticate with the Hugging Face Hub before uploading; the CLI prompts for an access token.
# NOTE(review): a login already runs in the first cell of this notebook; this second prompt
# looks redundant if the saved token has write permission — confirm.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `hf auth whoami` to get more information or `hf auth logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The tok

In [None]:
# Upload the fine-tuned model and its tokenizer to the Hugging Face Hub.
# `Trainer.push_to_hub` takes the *commit message* as its first positional argument and
# derives the repository name from `output_dir`, so passing the model name there uploads
# to "<user>/results". Pushing the model and tokenizer directly targets the intended repo.
hub_repo_id = "mponcetf-bert-imdb-finetuned"
trainer.model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /content/results/training_args.bin    : 100%|##########| 5.30kB / 5.30kB            

  /content/results/model.safetensors    :   0%|          | 14.2kB /  438MB            

CommitInfo(commit_url='https://huggingface.co/MaxFPonce/results/commit/58dfd4f0f4356e2849c18217e595b7572830102b', commit_message='mponcetf-bert-imdb-finetuned', commit_description='', oid='58dfd4f0f4356e2849c18217e595b7572830102b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/MaxFPonce/results', endpoint='https://huggingface.co', repo_type='model', repo_id='MaxFPonce/results'), pr_revision=None, pr_num=None)