In [2]:
!pip install torch transformers scikit-learn


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [1]:
import pandas as pd
# Load the news CSV from its (hard-coded) Colab path into a DataFrame
csv_path = "/content/noticias_fake_true.csv"
df = pd.read_csv(csv_path)


In [2]:
import torch # build and train deep-learning neural network models
import torch.nn as nn # neural network layers and loss functions
import torch.optim as optim # optimizers that update the model weights
from torch.utils.data import DataLoader, Dataset # custom datasets and mini-batch loading for efficient training
from transformers import BertTokenizer, BertForSequenceClassification #  Hugging Face transformers library; gives access to BERT
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Configuration: compute device, checkpoint name and matching tokenizer
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Preparación de los datos
class NewsDataset(Dataset):
    """Dataset that tokenizes one text per item, lazily, on access.

    Args:
        texts: Indexable sequence of raw texts. Entries that are not strings
            are tolerated: ``None`` and NaN become the empty string, anything
            else is converted with ``str()``.
        labels: Indexable sequence of integer class labels, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_len: Length every encoded sequence is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    @staticmethod
    def _as_text(value):
        """Coerce a raw cell value into a string the tokenizer can accept."""
        if isinstance(value, str):
            return value
        # Missing CSV cells arrive as None or float NaN; NaN is the only float
        # that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for item ``idx``."""
        encoding = self.tokenizer(
            self._as_text(self.texts[idx]),
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            # The tokenizer output carries a leading batch axis of size 1; drop it
            # so the DataLoader can add its own batch dimension.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Train/test split (80/20, fixed seed)
all_texts = df['text'].tolist()
all_labels = df['label'].tolist()
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts, all_labels, random_state=42, test_size=0.2
)

# Datasets and their mini-batch loaders (only the training loader shuffles)
train_dataset = NewsDataset(train_texts, train_labels, tokenizer)
test_dataset = NewsDataset(test_texts, test_labels, tokenizer)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

# Model, optimizer and loss
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model = model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# Entrenamiento
def train_model(model, train_loader, optimizer, criterion, device, epochs=3):
    """Fine-tune ``model`` on ``train_loader`` and print the mean loss per epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits`` (and ``.loss`` when ``labels`` is passed).
        train_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Loss callable applied as ``criterion(logits, labels)``.
            Pass ``None`` to use the loss the model computes itself from ``labels``.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_loader``.

    Returns:
        List with the mean training loss of each epoch (NaN for an epoch that
        saw no batches).
    """
    model.train()
    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            if criterion is not None:
                # Honour the caller-supplied loss instead of silently ignoring it.
                outputs = model(input_ids, attention_mask=attention_mask)
                loss = criterion(outputs.logits, labels)
            else:
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        # Batches are counted while iterating so an empty loader does not raise
        # ZeroDivisionError and loaders without __len__ are supported.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        epoch_losses.append(mean_loss)
        print(f"Epoch {epoch+1}, Loss: {mean_loss}")
    return epoch_losses

# Fine-tune with the function's default number of epochs
train_model(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
)

# Evaluación
def evaluate_model(model, test_loader, device):
    """Predict every batch of ``test_loader`` and print accuracy and a per-class report.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        test_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the batch tensors are moved to.
    """
    model.eval()
    y_pred = []
    y_true = []
    # Inference only: gradients are not tracked.
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            logits = model(input_ids, attention_mask=attention_mask).logits
            # The predicted class is the index of the largest logit in each row.
            y_pred.extend(torch.argmax(logits, dim=1).cpu().numpy())
            y_true.extend(labels.cpu().numpy())
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))

# Report metrics on the held-out test split
evaluate_model(model=model, test_loader=test_loader, device=device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.009456310622747722


KeyboardInterrupt: 

In [3]:
import torch # build and train deep-learning neural network models
from torch.utils.data import DataLoader, Dataset # dataset base class and mini-batch loader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Use the GPU when one is available; the model is moved to this device below
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Dispositivo usado:", device)


# Preprocessing: keep only the text and the label
y = df['label']
X = df['text'].fillna(" ")  # null texts are replaced with a single space

# Train/test split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Load a lighter model: DistilBERT
checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model = model.to(device)

# Tokenizar los textos
def encode_texts(texts, tokenizer, max_length=256):
    """Tokenize all ``texts`` in a single tokenizer call.

    Args:
        texts: Iterable of texts; it is materialized into a list first.
        tokenizer: Callable accepting the list plus the ``padding``,
            ``truncation``, ``max_length`` and ``return_tensors`` keywords.
        max_length: Truncation limit forwarded to the tokenizer.

    Returns:
        Whatever ``tokenizer`` returns (requested with ``return_tensors="pt"``).
    """
    text_list = list(texts)
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(text_list, **tokenizer_options)

# Tokenize each split up front, training split first
train_encodings, test_encodings = (
    encode_texts(split, tokenizer) for split in (X_train, X_test)
)

# Crear Dataset personalizado
class NewsDataset(Dataset):
    """Dataset over already-tokenized texts.

    Args:
        encodings: Mapping from field name to an indexable holding one row per
            example.
        labels: Labels read positionally through ``.iloc`` (e.g. a pandas Series).
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of labels."""
        n_examples = len(self.labels)
        return n_examples

    def __getitem__(self, idx):
        """Return row ``idx`` of every encoding field plus its label under ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        # Positional lookup: the label index need not start at 0.
        sample['labels'] = torch.tensor(self.labels.iloc[idx])
        return sample

train_dataset = NewsDataset(train_encodings, y_train)
test_dataset = NewsDataset(test_encodings, y_test)

# Use a smaller batch size to avoid running out of memory
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4)

# Optimizer and loss function.
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are spelled out to keep what are believed to be the old class's
# defaults (eps=1e-6, weight_decay=0.0) - TODO confirm against the installed version.
# NOTE(review): the `AdamW` name imported from transformers at the top of this
# cell is no longer used and should be dropped from that import.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
loss_fn = torch.nn.CrossEntropyLoss()

# Función de entrenamiento
def train_model(model, train_loader, optimizer, loss_fn, device, epochs=2):
    """Train ``model`` for ``epochs`` passes and print the mean loss of each pass.

    Args:
        model: Module called with the non-label batch fields as keyword
            arguments; its output must expose ``.logits``.
        train_loader: Sized iterable of batches (mappings of tensors) that
            include a ``'labels'`` entry.
        optimizer: Optimizer bound to ``model``'s parameters.
        loss_fn: Callable applied as ``loss_fn(logits, labels)``.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_loader``.
    """
    model.train()
    for epoch_number in range(1, epochs + 1):
        print(f"Época {epoch_number}/{epochs}")
        running_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            # Everything except the labels is forwarded to the model.
            features = {}
            for name, tensor in batch.items():
                if name != 'labels':
                    features[name] = tensor.to(device)
            targets = batch['labels'].to(device)
            logits = model(**features).logits
            batch_loss = loss_fn(logits, targets)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()
        print(f"Pérdida promedio: {running_loss / len(train_loader)}")

# Train the model (2 epochs for a quick trial run)
train_model(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    device=device,
    epochs=2,
)

#  Evaluar el modelo
def evaluate_model(model, test_loader, device):
    """Predict every batch of ``test_loader`` and print accuracy and a classification report.

    Args:
        model: Module called with the non-label batch fields as keyword
            arguments; its output must expose ``.logits``.
        test_loader: Iterable of batches (mappings of tensors) that include a
            ``'labels'`` entry.
        device: Device the batch tensors are moved to.
    """
    model.eval()
    y_pred = []
    y_true = []
    # Inference only: gradients are not tracked.
    with torch.no_grad():
        for batch in test_loader:
            features = {}
            for name, tensor in batch.items():
                if name != 'labels':
                    features[name] = tensor.to(device)
            targets = batch['labels'].to(device)
            logits = model(**features).logits
            # The predicted class is the index of the largest logit in each row.
            y_pred.extend(torch.argmax(logits, dim=1).cpu().numpy())
            y_true.extend(targets.cpu().numpy())

    accuracy = accuracy_score(y_true, y_pred)
    print(f"Precisión del modelo: {accuracy:.4f}")
    print("Reporte de clasificación:")
    print(classification_report(y_true, y_pred))

# Run the evaluation on the held-out test split
evaluate_model(model=model, test_loader=test_loader, device=device)


Dispositivo usado: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Época 1/2
Pérdida promedio: 0.007961040178913678
Época 2/2
Pérdida promedio: 0.0011333290111552625
Precisión del modelo: 0.9994
Reporte de clasificación:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4650
           1       1.00      1.00      1.00      4330

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



🔹 Precisión (Precision): De todos los ejemplos que el modelo asignó a una clase, proporción que realmente pertenece a ella (verdaderos positivos / (verdaderos positivos + falsos positivos)).

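🔹 Recall: De todos los ejemplos que realmente pertenecen a una clase, proporción que el modelo identifica correctamente (verdaderos positivos / (verdaderos positivos + falsos negativos)).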

🔹 F1-score: Media armónica entre precisión y recall (valor clave para balance entre ambas métricas).

🔹 Support: Número de ejemplos de cada clase en los datos de prueba.

En este caso:

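Clase 0 (Noticias Falsas) y Clase 1 (Noticias Verdaderas) muestran 1.00 en todas las métricas. Estos valores están redondeados a dos decimales: la exactitud real es 0.9994, es decir, alrededor de 5 errores en 8980 ejemplos.
El modelo casi no está cometiendo errores, lo cual es extremadamente raro en un problema del mundo real.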

In [6]:
import torch
from sklearn.metrics import accuracy_score

def evaluate(model, dataloader, device):
    """Return the accuracy of ``model`` over every batch in ``dataloader``.

    Args:
        model: Module called with the non-label batch fields as keyword
            arguments; its output must expose ``.logits``.
        dataloader: Iterable of batches (mappings of tensors) that include a
            ``'labels'`` entry.
        device: Device the batch tensors are moved to.

    Returns:
        The value of ``accuracy_score`` computed on the collected true and
        predicted labels.
    """
    model.eval()
    y_pred = []
    y_true = []

    # Inference only: gradients are not tracked.
    with torch.no_grad():
        for batch in dataloader:
            features = {}
            for name, tensor in batch.items():
                if name != 'labels':
                    features[name] = tensor.to(device)
            targets = batch['labels'].to(device)
            logits = model(**features).logits
            y_pred.extend(torch.argmax(logits, dim=1).cpu().numpy())
            y_true.extend(targets.cpu().numpy())

    return accuracy_score(y_true, y_pred)

# Same device selection for both evaluations
eval_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Accuracy on the training split
train_accuracy = evaluate(model, train_loader, eval_device)

# Accuracy on the test split
test_accuracy = evaluate(model, test_loader, eval_device)

# Print results
print(f"📊 Precisión en entrenamiento: {train_accuracy:.4f}")
print(f"📊 Precisión en prueba: {test_accuracy:.4f}")


📊 Precisión en entrenamiento: 0.9997
📊 Precisión en prueba: 0.9994




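Ambas exactitudes (accuracy) son extremadamente altas y casi iguales (0.9997 en entrenamiento y 0.9994 en prueba).
No hay una diferencia significativa entre la exactitud en entrenamiento y en prueba.
El modelo no está sobreajustando gravemente, ya que generaliza bien a los datos de prueba.
Aun así, un resultado tan cercano al 100% puede deberse a una fuga de información en los datos (por ejemplo, textos duplicados entre entrenamiento y prueba, o pistas triviales en el texto que delatan la clase), por lo que conviene verificarlo antes de dar el resultado por válido.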