In [1]:
import accelerate
print(accelerate.__version__)

0.26.1


In [2]:
import transformers
print(transformers.__version__)

4.51.2


In [3]:
import sys
print(sys.executable)

C:\Users\Usuario\anaconda3\envs\bart_env_gpu\python.exe


### Se carga la gpu pero finalmente no se puede utilizar ya que da errores. 

In [16]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

True
NVIDIA GeForce GTX 1050


### Tras haber configurado la gpu y importado ciertos paquetes clave, se lee el csv con las noticias

In [17]:
### Tras haber configurado la gpu y importado ciertos paquetes clave, se lee el csv con las noticias

import pandas as pd

ruta = 'bbc_data.csv'
# Leer un archivo CSV (si ya lo tienes subido)
df = pd.read_csv(ruta)

# Ver las primeras filas
df.head()

Unnamed: 0,data,labels
0,Musicians to tackle US red tape Musicians gro...,entertainment
1,"U2s desire to be number one U2, who have won ...",entertainment
2,Rocker Doherty in on-stage fight Rock singer ...,entertainment
3,Snicket tops US box office chart The film ada...,entertainment
4,"Oceans Twelve raids box office Oceans Twelve,...",entertainment


In [18]:
from transformers import pipeline
import pandas as pd
from tqdm import tqdm

### Se utiliza el modelo distilbart-cnn-12-6 ya entrenado para resumir noticias de nuestros conjunto de datos y así tener datos con los que hacer fine tuning a un modelo que se adecúe tanto a nuestra noticias como a los resúmenes.

In [22]:
### Se utiliza el modelo distilbart-cnn-12-6 ya entrenado para resumir noticias de nuestros conjunto de datos y así tener datos con los que hacer fine tuning a un modelo que se adecúe tanto a nuestra noticias como a los resúmenes.

import pandas as pd
import torch
from transformers import pipeline
from tqdm import tqdm

# 1. Cargar el dataset original y quedarnos solo con las primeras 150 noticias
df = pd.read_csv("pseudo_summary_dataset.csv").head(150)

# 2. Forzar uso de CPU (la GPU no tiene suficiente memoria)
device = -1
print(f"✅ Usando CPU (procesando 150 textos para mayor velocidad)")

# 3. Crear pipeline de resumen con modelo ligero
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    device=device
)

# 4. Generar resúmenes
resumenes = []
for texto in tqdm(df["data"], desc="🔄 Generando resúmenes"):
    try:
        texto = texto[:1024]  # truncamos si el texto es muy largo
        resumen = summarizer(texto, max_length=60, min_length=20, do_sample=False)[0]["summary_text"]
    except Exception as e:
        resumen = f"ERROR: {str(e)}"
    resumenes.append(resumen)

# 5. Guardar dataset reducido con los resúmenes generados
df["summary"] = resumenes
df.to_csv("pseudo_summary_dataset_150.csv", index=False)
print("✅ ¡Resúmenes generados y guardados en 'pseudo_summary_dataset_150.csv'!")

✅ Usando CPU (procesando 150 textos para mayor velocidad)


Device set to use cpu
🔄 Generando resúmenes: 100%|████████████████████████████████████████████████████████| 150/150 [10:51<00:00,  4.35s/it]


✅ ¡Resúmenes generados y guardados en 'pseudo_summary_dataset_150.csv'!


### Se comprueba que no hay valores erróneos que puedan dar problemas en el entrenamiento

In [9]:
### Se comprueba que no hay valores erróneos que puedan dar problemas en el entrenamiento

from datasets import Dataset
print(df[['data', 'summary']].isnull().sum())  # ← no debe haber NAs
print(df.dtypes)  # ← ambas deben ser string


data       0
summary    0
dtype: int64
data       object
summary    object
dtype: object


### Se utiliza el conjunto de datos con noticias y sus respectivos resúmenes para hacer fine tuning al mismo modelo distilbart-cnn-12-6 y de esta manera que sea mas personalizado. 

In [23]:
### Se utiliza el conjunto de datos con noticias y sus respectivos resúmenes para hacer fine tuning al mismo modelo distilbart-cnn-12-6 y de esta manera que sea mas personalizado. 

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.optim import AdamW
from tqdm import tqdm

# 1. Cargar el dataset
df = pd.read_csv("pseudo_summary_dataset_150.csv")  # columnas: 'data' y 'summary'

# 2. Tokenizer y modelo
model_name = "sshleifer/distilbart-cnn-12-6"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.train()

# 3. Forzar modo CPU
device = torch.device("cpu")
print(f"📍 Usando dispositivo: {device}")
model.to(device)

# ✅ Congelar todas las capas excepto la última del decodificador
for name, param in model.model.named_parameters():
    if "decoder.layers" in name:
        layer_num = int(name.split("decoder.layers.")[1].split(".")[0])
        if layer_num < len(model.model.decoder.layers) - 1:
            param.requires_grad = False  # congelar todas excepto la última
        else:
            param.requires_grad = True   # descongelar solo la última
    else:
        param.requires_grad = False  # congelar todo lo demás (encoder, embeddings, etc.)


# 5. Dataset personalizado
class SummaryDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer, max_input_len=512, max_target_len=64):
        self.inputs = texts
        self.targets = summaries
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_target_len = max_target_len

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = str(self.inputs[idx])
        target_text = str(self.targets[idx])
        input_enc = self.tokenizer(
            input_text, max_length=self.max_input_len, padding="max_length", truncation=True, return_tensors="pt"
        )
        target_enc = self.tokenizer(
            target_text, max_length=self.max_target_len, padding="max_length", truncation=True, return_tensors="pt"
        )
        return {
            "input_ids": input_enc["input_ids"].squeeze(),
            "attention_mask": input_enc["attention_mask"].squeeze(),
            "labels": target_enc["input_ids"].squeeze()
        }

# 6. DataLoader
dataset = SummaryDataset(df["data"], df["summary"], tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# 7. Optimizador
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-5)

# 8. Entrenamiento
epochs = 4
for epoch in range(epochs):
    print(f"\n🔁 Epoch {epoch+1}/{epochs}")
    epoch_loss = 0
    for batch in tqdm(dataloader, desc="Entrenando"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        epoch_loss += loss.item()

    print(f"✅ Pérdida media epoch {epoch+1}: {epoch_loss / len(dataloader):.4f}")

# 9. Guardar modelo fine-tuned
model.save_pretrained("distilbart_custom_model_frozen")
tokenizer.save_pretrained("distilbart_custom_model_frozen")
print("🎉 ¡Modelo personalizado con capas congeladas guardado en 'distilbart_custom_model_frozen'!")


📍 Usando dispositivo: cpu

🔁 Epoch 1/4


Entrenando: 100%|██████████████████████████████████████████████████████████████████████| 75/75 [03:19<00:00,  2.67s/it]


✅ Pérdida media epoch 1: 1.3927

🔁 Epoch 2/4


Entrenando: 100%|██████████████████████████████████████████████████████████████████████| 75/75 [03:09<00:00,  2.53s/it]


✅ Pérdida media epoch 2: 0.4797

🔁 Epoch 3/4


Entrenando: 100%|██████████████████████████████████████████████████████████████████████| 75/75 [03:11<00:00,  2.55s/it]


✅ Pérdida media epoch 3: 0.3404

🔁 Epoch 4/4


Entrenando: 100%|██████████████████████████████████████████████████████████████████████| 75/75 [03:06<00:00,  2.49s/it]


✅ Pérdida media epoch 4: 0.2945
🎉 ¡Modelo personalizado con capas congeladas guardado en 'distilbart_custom_model_frozen'!


### Se prueban diferentes noticias del conjunto de datos. Resume bien pero hay que tener en cuenta que eran noticias que han participado en el entrenamiento y hace completamente el mismo resumen, lo cual dice que ha aprendido exactamente como se hacen los resumenes de nuestro conjunto de datos (overfitting)

In [31]:
### Se prueban diferentes noticias del conjunto de datos. Resume bien pero hay que tener en cuenta que eran noticias que han participado en el entrenamiento y hace completamente el mismo resumen, lo cual dice que ha aprendido exactamente como se hacen los resumenes de nuestro conjunto de datos (overfitting)

from transformers import BartTokenizer, BartForConditionalGeneration
import torch
import pandas as pd

# 1. Ruta local al modelo y tokenizer en formato POSIX (muy importante evitar backslashes o errores de path)
model_path = "C:/Users/Usuario/AnalisisDatosNoEstructurados/ProyectoAdne_2/distilbart_custom_model_frozen"

# 2. Cargar tokenizer y modelo fine-tuned desde la carpeta local
tokenizer = BartTokenizer.from_pretrained(model_path, local_files_only=True)
model = BartForConditionalGeneration.from_pretrained(model_path, local_files_only=True)

# 3. Dispositivo (si tu GPU no da, se usará CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

print("✅ Modelo cargado correctamente")

# 4. Cargar dataset reducido
df = pd.read_csv("pseudo_summary_dataset_150.csv")

# 5. Seleccionar una noticia cualquiera del dataset
idx = 46  # Cambia este índice para ver otras noticias
texto = df.loc[idx, "data"]
resumen_real = df.loc[idx, "summary"]

# 6. Tokenizar y generar resumen
inputs = tokenizer(
    texto,
    return_tensors="pt",
    max_length=1024,
    truncation=True
).to(device)

summary_ids = model.generate(
    inputs["input_ids"],
    max_length=60,
    min_length=20,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True
)

resumen_generado = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# 7. Mostrar resultados
print(f"\n📰 Texto original (idx {idx}):\n{texto.strip()}")
print(f"\n📋 Resumen real:\n{resumen_real.strip()}")
print(f"\n📌 Resumen generado:\n{resumen_generado.strip()}")




✅ Modelo cargado correctamente





📰 Texto original (idx 46):
Grammys honour soul star Charles  The memory of soul legend Ray Charles dominated the music worlds leading music ceremony on Sunday as he was given eight posthumous Grammy Awards.  Charles, who died in 2004, got honours including record and album of the year, while Alicia Keys and actor Jamie Foxx performed a musical tribute to him. R&B star Keys won four awards herself at the Grammy ceremony in Los Angeles. U2, Usher, Norah Jones and Kanye West got three each. West led the race going into the ceremony with 10 nominations.  Charles last album, Genius Loves Company, a collection of duets that has sold more than two million copies, was named album of the year and best pop vocal album. His song Here We Go Again with Norah Jones won record of the year and best pop vocal collaboration, while Heaven Help Us All with Gladys Knight picked up best gospel performance. Jones said: "Im glad hes getting recognised, because of who he is and how much I love him." Actor Jam

### Se utilizan noticias generadas sintéticamente para ver que tal resume y lo hace correctamente. Todo parece indicar que el modelo es lo suficientemente robusto debido a que solo se ha entrenado la última capa, para resumir correctamente cualquier tipo de noticia

In [33]:
### Se utilizan noticias generadas sintéticamente para ver que tal resume y lo hace correctamente. Todo parece indicar que el modelo es lo suficientemente robusto debido a que solo se ha entrenado la última capa, para resumir correctamente cualquier tipo de noticia

from transformers import BartTokenizer, BartForConditionalGeneration
import torch

# Ruta al modelo fine-tuned local
model_path = "C:/Users/Usuario/AnalisisDatosNoEstructurados/ProyectoAdne_2/distilbart_custom_model_frozen"

# Cargar tokenizer y modelo
tokenizer = BartTokenizer.from_pretrained(model_path, local_files_only=True)
model = BartForConditionalGeneration.from_pretrained(model_path, local_files_only=True)

# Seleccionar dispositivo
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Noticia inventada en inglés
fake_news = """
In a surprising turn of events, Galician scientists have discovered a new subatomic particle 
that could completely reshape our understanding of the universe. The discovery, made in an underground laboratory in Lugo, 
has been described as "revolutionary" by international experts, although further experiments are still being conducted 
to confirm its properties. The research team, led by Dr. Carmela Piñeiro, claims the particle interacts with dark matter 
in ways never seen before. The government has already announced plans to financially support the development of new experiments 
to explore this phenomenon.
"""

# Tokenizar
inputs = tokenizer(
    fake_news,
    return_tensors="pt",
    max_length=1024,
    truncation=True
).to(device)

# Generar resumen
summary_ids = model.generate(
    inputs["input_ids"],
    max_length=60,
    min_length=20,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True
)

# Decodificar
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Mostrar resultados
print("\n📰 Fake news:\n", fake_news.strip())
print("\n📌 Generated summary:\n", summary.strip())



📰 Fake news:
 In a surprising turn of events, Galician scientists have discovered a new subatomic particle 
that could completely reshape our understanding of the universe. The discovery, made in an underground laboratory in Lugo, 
has been described as "revolutionary" by international experts, although further experiments are still being conducted 
to confirm its properties. The research team, led by Dr. Carmela Piñeiro, claims the particle interacts with dark matter 
in ways never seen before. The government has already announced plans to financially support the development of new experiments 
to explore this phenomenon.

📌 Generated summary:
 Galician scientists have discovered a new subatomic particle that could completely reshape our understanding of the universe . The discovery, made in an underground laboratory in Lugo, has been described as "revolutionary" by international experts . The government has already announced plans to financially support the development of


In [36]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

# Ruta al modelo fine-tuned local
model_path = "C:/Users/Usuario/AnalisisDatosNoEstructurados/ProyectoAdne_2/distilbart_custom_model_frozen"

# Cargar tokenizer y modelo
tokenizer = BartTokenizer.from_pretrained(model_path, local_files_only=True)
model = BartForConditionalGeneration.from_pretrained(model_path, local_files_only=True)

# Seleccionar dispositivo
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

new_fake_news = """
The International Space Agency has announced the successful deployment of the first solar-powered station on the moon. 
The facility, built in collaboration with European and Asian partners, is expected to provide continuous energy 
to future lunar missions. Scientists believe this marks a major milestone in sustainable space exploration 
and opens the door for permanent human presence on the moon by the 2030s.
"""

# Tokenizar
inputs = tokenizer(
    new_fake_news,
    return_tensors="pt",
    max_length=1024,
    truncation=True
).to(device)

# Generar resumen
summary_ids = model.generate(
    inputs["input_ids"],
    max_length=60,
    min_length=20,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True
)

# Decodificar
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Mostrar resultados
print("\n📰 Fake news:\n", new_fake_news.strip())
print("\n📌 Generated summary:\n", summary.strip())



📰 Fake news:
 The International Space Agency has announced the successful deployment of the first solar-powered station on the moon. 
The facility, built in collaboration with European and Asian partners, is expected to provide continuous energy 
to future lunar missions. Scientists believe this marks a major milestone in sustainable space exploration 
and opens the door for permanent human presence on the moon by the 2030s.

📌 Generated summary:
 International Space Agency has announced the successful deployment of the first solar-powered station on the moon . The facility is expected to provide continuous energy to future lunar missions . Scientists believe this marks a major milestone in sustainable space exploration .


### Se prueba con una noticia más larga para probar si su rendimiento no se ve afectado

In [37]:
### Se prueba con una noticia más larga para probar si su rendimiento no se ve afectado

from transformers import BartTokenizer, BartForConditionalGeneration
import torch

# Ruta al modelo fine-tuned local
model_path = "C:/Users/Usuario/AnalisisDatosNoEstructurados/ProyectoAdne_2/distilbart_custom_model_frozen"

# Cargar tokenizer y modelo
tokenizer = BartTokenizer.from_pretrained(model_path, local_files_only=True)
model = BartForConditionalGeneration.from_pretrained(model_path, local_files_only=True)

# Seleccionar dispositivo
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

long_fake_news = """
In a remarkable turn of events, an international coalition of environmental scientists and engineers has unveiled a groundbreaking technology 
capable of capturing and transforming atmospheric CO2 into clean energy at a previously unimaginable scale. 
The system, which was developed in secret over the past decade, uses a series of advanced quantum reactors that 
break down carbon dioxide molecules and restructure them into hydrogen-based fuels, emitting only water vapor in the process.

The first pilot plant, built near the coast of Norway, has already started operating at 40% efficiency and is expected 
to scale up to full capacity by the end of the year. This announcement has sent ripples through global markets, with 
oil prices plummeting and green tech stocks surging. Leaders around the world have expressed both excitement and caution, 
highlighting the need for transparent regulation and fair access to what could be one of the most disruptive innovations of the century.

Nobel Prize nominee Dr. Lina Matsuda, one of the lead architects of the project, stated that the team's goal is to 
“reset the planet’s carbon balance within a generation.” Meanwhile, critics warn that geopolitical tensions might arise 
if the technology is monopolized or weaponized. Nevertheless, hopes are high that this discovery will usher in a new era 
of sustainable progress.
"""

# Tokenizar
inputs = tokenizer(
    long_fake_news,
    return_tensors="pt",
    max_length=1024,
    truncation=True
).to(device)

# Generar resumen
summary_ids = model.generate(
    inputs["input_ids"],
    max_length=60,
    min_length=20,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True
)

# Decodificar
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Mostrar resultados
print("\n📰 Fake news:\n", long_fake_news.strip())
print("\n📌 Generated summary:\n", summary.strip())



📰 Fake news:
 In a remarkable turn of events, an international coalition of environmental scientists and engineers has unveiled a groundbreaking technology 
capable of capturing and transforming atmospheric CO2 into clean energy at a previously unimaginable scale. 
The system, which was developed in secret over the past decade, uses a series of advanced quantum reactors that 
break down carbon dioxide molecules and restructure them into hydrogen-based fuels, emitting only water vapor in the process.

The first pilot plant, built near the coast of Norway, has already started operating at 40% efficiency and is expected 
to scale up to full capacity by the end of the year. This announcement has sent ripples through global markets, with 
oil prices plummeting and green tech stocks surging. Leaders around the world have expressed both excitement and caution, 
highlighting the need for transparent regulation and fair access to what could be one of the most disruptive innovations of the cent