<a href="https://colab.research.google.com/github/LuViBeBe93/tesis/blob/main/T5_F1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers pandas torch scikit-learn bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import Trainer, TrainingArguments
import math
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from bert_score import score as bert_score

In [3]:
# Hiperparámetros
HYPERPARAMS = {
    "model_name": "t5-small",
    "max_length": 128,
    "batch_size": 2,
    "num_epochs": 10,
    "learning_rate": 5e-5,
    "num_beams": 5,
    "no_repeat_ngram_size": 2,
    "early_stopping": True,
    "logging_steps": 100,
    "output_dir": "./noticias_t5_model",
    "logging_dir": "./logs",
    "save_steps": 500,
    "save_total_limit": 2
}

In [4]:
# Cargar datos
from google.colab import files
uploaded = files.upload()

data = pd.read_excel('BASE_30122024.xlsx')
data = pd.DataFrame(data)

Saving BASE_30122024.xlsx to BASE_30122024.xlsx


In [5]:
# Convertir datos numéricos a texto
def datos_a_texto(row):
    texto_datos = (
        f"Año: {row.get('Año', 'NaN')}, Mes: {row.get('Mes', 'NaN')}, "
        f"Trimestre: {row.get('Trimestre', 'NaN')}, Frecuencia: {row.get('Frecuencia', 'NaN')}, "
        f"Año_comparación: {row.get('Año_comparación', 'NaN')}, País: {row.get('País', 'NaN')}, "
        f"Tendencia: {row.get('Tendencia', 'NaN')}, Sector: {row.get('Sector', 'NaN')}, "
        f"Indicador: {row.get('Indicador', 'NaN')}, Valor_actual: {row.get('Valor_actual', 'NaN')}, "
        f"Variación: {row.get('Variación', 'NaN')}, Valor_comparación: {row.get('Valor_comparación', 'NaN')}, "
        f"Entidad: {row.get('Entidad', 'NaN')}"
    )
    return texto_datos

data["entrada_texto"] = data.apply(datos_a_texto, axis=1)
data["texto_final"] = data["entrada_texto"] + " </s> Noticia: " + data["Noticia"]


In [6]:
# Clase Dataset personalizada
class NoticiasDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        entrada_texto = self.data.iloc[idx]["entrada_texto"]
        noticia = self.data.iloc[idx]["Noticia"]

        input_encoding = self.tokenizer(
            entrada_texto,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        target_encoding = self.tokenizer(
            noticia,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": input_encoding["input_ids"].flatten(),
            "attention_mask": input_encoding["attention_mask"].flatten(),
            "labels": target_encoding["input_ids"].flatten(),
        }

In [7]:
# Cargar tokenizador y modelo preentrenado
tokenizer = T5Tokenizer.from_pretrained(HYPERPARAMS["model_name"])
model = T5ForConditionalGeneration.from_pretrained(HYPERPARAMS["model_name"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [8]:
# Crear dataset y data loader con hiperparámetros
dataset = NoticiasDataset(data, tokenizer, max_length=HYPERPARAMS["max_length"])
train_loader = DataLoader(dataset, batch_size=HYPERPARAMS["batch_size"], shuffle=True)

In [9]:
# Argumentos de entrenamiento
training_args = TrainingArguments(
    output_dir=HYPERPARAMS["output_dir"],
    num_train_epochs=HYPERPARAMS["num_epochs"],
    per_device_train_batch_size=HYPERPARAMS["batch_size"],
    save_steps=HYPERPARAMS["save_steps"],
    save_total_limit=HYPERPARAMS["save_total_limit"],
    logging_dir=HYPERPARAMS["logging_dir"],
    logging_steps=HYPERPARAMS["logging_steps"],
    learning_rate=HYPERPARAMS["learning_rate"],
)

# Entrenador
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

# Entrenar modelo
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
100,2.6568
200,0.9221
300,0.717
400,0.692
500,0.5984
600,0.5853
700,0.5146
800,0.5606
900,0.5296
1000,0.4634


TrainOutput(global_step=15940, training_loss=0.22658571815550552, metrics={'train_runtime': 1466.6669, 'train_samples_per_second': 21.736, 'train_steps_per_second': 10.868, 'total_flos': 1078674157731840.0, 'train_loss': 0.22658571815550552, 'epoch': 10.0})

In [10]:
# Generar noticias con parámetros adicionales
def generar_noticia(model, tokenizer, datos):
    # Convertir los datos a texto
    entrada_texto = datos_a_texto(datos)

    # Codificar la entrada
    inputs = tokenizer.encode(entrada_texto, return_tensors="pt")

    # Mover los tensores al mismo dispositivo que el modelo
    inputs = inputs.to(model.device)  # Mover los inputs a la misma device que el modelo

    # Generar la salida con los parámetros dados
    outputs = model.generate(
        inputs,
        max_length=HYPERPARAMS["max_length"],
        num_beams=HYPERPARAMS["num_beams"],
        no_repeat_ngram_size=HYPERPARAMS["no_repeat_ngram_size"],
        early_stopping=HYPERPARAMS["early_stopping"],
        temperature=0.7,  # Control de la aleatoriedad (menor temperatura = menos aleatorio)
        top_p=0.9,  # Top-p sampling (para diversidad)
        do_sample=True  # Activa la aleatorización (en lugar de beam search)
    )

    # Decodificar la salida
    return tokenizer.decode(outputs[0], skip_special_tokens=True)



In [25]:
# Ejemplo 1
new_data = {
    'Año': '2024',
    'Mes': 'enero',
    'Trimestre': '',
    'Frecuencia': 'Mensual',
    'Año_comparación': '2023',
    'País': 'Colombia',
    'Tendencia': 'Crecimiento',
    'Sector': 'Economía',
    'Indicador': 'Inflación',
    'Valor_actual': '9%',
    'Variación': '1%',
    'Valor_comparación': '8%',
    'Entidad': 'DANE',
}

noticia = generar_noticia(model, tokenizer, new_data)
print("Noticia generada:", noticia)

Noticia generada: El DANE informó que la inflación en Colombia fue del 9 %.


In [26]:
# Ejemplo 1.1
new_data = {
    'Año': '2024',
    'Mes': 'enero',
    'Trimestre': '',
    'Frecuencia': 'Mensual',
    'Año_comparación': '2023',
    'País': 'Colombia',
    'Tendencia': 'Crecimiento',
    'Sector': 'Economía',
    'Indicador': 'Inflación',
    'Valor_actual': '9%',
    'Variación': '1%',
    'Valor_comparación': '8%',
    'Entidad': 'DANE',
}

noticia = generar_noticia(model, tokenizer, new_data)
print("Noticia generada:", noticia)

Noticia generada: El DANE reportó que la inflación anual en eero de 2024 fue del 9 %.


In [109]:
# Ejemplo 1.2
new_data = {
    'Año': '2024',
    'Mes': 'enero',
    'Trimestre': '',
    'Frecuencia': 'Mensual',
    'Año_comparación': '2023',
    'País': 'Colombia',
    'Tendencia': 'Crecimiento',
    'Sector': 'Economía',
    'Indicador': 'Inflación',
    'Valor_actual': '9%',
    'Variación': '1%',
    'Valor_comparación': '8%',
    'Entidad': 'DANE',
}

noticia = generar_noticia(model, tokenizer, new_data)
print("Noticia generada:", noticia)

Noticia generada: En enero de 2024, la tasa de inflación alcanzó el 9 %, lo que representó un incremento respecto al ao 2023.


In [110]:
# Ejemplo 2
new_data = {
    'Año': '2024',
    'Mes': '',
    'Trimestre': 'Primer',
    'Frecuencia': 'Trimestral',
    'Año_comparación': '',
    'País': 'Argentina',
    'Tendencia': 'Crecimiento',
    'Sector': 'Economía',
    'Indicador': 'PIB',
    'Valor_actual': '10%',
    'Variación': '',
    'Valor_comparación': '',
    'Entidad': '',
}

noticia = generar_noticia(model, tokenizer, new_data)
print("Noticia generada:", noticia)

Noticia generada: El Producto Interno Bruto (PIB) de Argentina crece un 10% en el primer trimestre de 2024


In [39]:
# Ejemplo 2.1
new_data = {
    'Año': '2024',
    'Mes': '',
    'Trimestre': 'Primer',
    'Frecuencia': 'Trimestral',
    'Año_comparación': '',
    'País': 'Argentina',
    'Tendencia': 'Crecimiento',
    'Sector': 'Economía',
    'Indicador': 'PIB',
    'Valor_actual': '10%',
    'Variación': '',
    'Valor_comparación': '',
    'Entidad': '',
}

noticia = generar_noticia(model, tokenizer, new_data)
print("Noticia generada:", noticia)

Noticia generada: Crecimiento del PIB de Argentina: 10% en el primer trimestre de 2024


In [40]:
# Ejemplo 2.2
new_data = {
    'Año': '2024',
    'Mes': '',
    'Trimestre': 'Primer',
    'Frecuencia': 'Trimestral',
    'Año_comparación': '',
    'País': 'Argentina',
    'Tendencia': 'Crecimiento',
    'Sector': 'Economía',
    'Indicador': 'PIB',
    'Valor_actual': '10%',
    'Variación': '',
    'Valor_comparación': '',
    'Entidad': '',
}

noticia = generar_noticia(model, tokenizer, new_data)
print("Noticia generada:", noticia)

Noticia generada: El PIB de Argentina creció un 10% en el primer trimestre de 2024, superando las expectativas del mercado.


In [41]:
# Ejemplo 3
new_data = {
    'Año': '2020',
    'Mes': 'mayo',
    'Trimestre': '',
    'Frecuencia': 'Mensual',
    'Año_comparación': '',
    'País': 'Bolivia',
    'Tendencia': 'Disminuye',
    'Sector': 'Economía',
    'Indicador': 'Tasa_desempleo',
    'Valor_actual': '3%',
    'Variación': '',
    'Valor_comparación': '',
    'Entidad': '',
}

noticia = generar_noticia(model, tokenizer, new_data)
print("Noticia generada:", noticia)

Noticia generada: Para mayo de 2020, la tasa de desempleo en Bolivia fue del 3%.


In [47]:
# Ejemplo 3.1
new_data = {
    'Año': '2020',
    'Mes': 'mayo',
    'Trimestre': '',
    'Frecuencia': 'Mensual',
    'Año_comparación': '',
    'País': 'Bolivia',
    'Tendencia': 'Disminuye',
    'Sector': 'Economía',
    'Indicador': 'Tasa_desempleo',
    'Valor_actual': '3%',
    'Variación': '',
    'Valor_comparación': '',
    'Entidad': '',
}

noticia = generar_noticia(model, tokenizer, new_data)
print("Noticia generada:", noticia)

Noticia generada: La tasa de desempleo en Bolivia bajó al 3 %, lo que representó un descenso frente al ao anterior.


In [45]:
# Ejemplo 3.2
new_data = {
    'Año': '2020',
    'Mes': 'mayo',
    'Trimestre': '',
    'Frecuencia': 'Mensual',
    'Año_comparación': '',
    'País': 'Bolivia',
    'Tendencia': 'Disminuye',
    'Sector': 'Economía',
    'Indicador': 'Tasa_desempleo',
    'Valor_actual': '3%',
    'Variación': '',
    'Valor_comparación': '',
    'Entidad': '',
}

noticia = generar_noticia(model, tokenizer, new_data)
print("Noticia generada:", noticia)

Noticia generada: Para mayo de 2020, la tasa de desempleo en Bolivia descendió a 3 %.


In [49]:
# Ejemplo 4
new_data = {
    'Año': '1993',
    'Mes': '',
    'Trimestre': '',
    'Frecuencia': 'Anual',
    'Año_comparación': '1992',
    'País': 'Nicaragua',
    'Tendencia': 'Disminuye',
    'Sector': 'Economía',
    'Indicador': 'Importaciones',
    'Valor_actual': '12000',
    'Variación': '1%',
    'Valor_comparación': '',
    'Entidad': '',
}

noticia = generar_noticia(model, tokenizer, new_data)
print("Noticia generada:", noticia)

Noticia generada: En 1993, las importaciones en Nicaragua sumaron 12000 millones de dólares, lo que representó una disminución del 1% respecto al ao 1992


In [58]:
# Ejemplo 4.1
new_data = {
    'Año': '1993',
    'Mes': '',
    'Trimestre': '',
    'Frecuencia': 'Anual',
    'Año_comparación': '1992',
    'País': 'Nicaragua',
    'Tendencia': 'Disminuye',
    'Sector': 'Economía',
    'Indicador': 'Importaciones',
    'Valor_actual': '12000',
    'Variación': '1%',
    'Valor_comparación': '',
    'Entidad': '',
}

noticia = generar_noticia(model, tokenizer, new_data)
print("Noticia generada:", noticia)

Noticia generada: El ao de 1993, las importaciones en Nicaragua sumaron 12000 millones de dólares, lo que representó una disminución del1%.


In [111]:
# Ejemplo 4.2
new_data = {
    'Año': '1993',
    'Mes': '',
    'Trimestre': '',
    'Frecuencia': 'Anual',
    'Año_comparación': '1992',
    'País': 'Nicaragua',
    'Tendencia': 'Disminuye',
    'Sector': 'Economía',
    'Indicador': 'Importaciones',
    'Valor_actual': '12000',
    'Variación': '1%',
    'Valor_comparación': '',
    'Entidad': '',
}

noticia = generar_noticia(model, tokenizer, new_data)
print("Noticia generada:", noticia)

Noticia generada: En 1993 las importaciones en Nicaragua sumaron 12000 millones de dólares, mostrando una cada del 1% respecto al ao 1992


In [60]:
# Ejemplo 5
new_data = {
    'Año': '1993',
    'Mes': '',
    'Trimestre': '',
    'Frecuencia': 'Anual',
    'Año_comparación': '1992',
    'País': 'Uruguay',
    'Tendencia': 'Disminuye',
    'Sector': 'Economía',
    'Indicador': 'Exportaciones',
    'Valor_actual': '10000',
    'Variación': '1%',
    'Valor_comparación': '',
    'Entidad': '',
}

noticia = generar_noticia(model, tokenizer, new_data)
print("Noticia generada:", noticia)

Noticia generada: Las exportaciones de Uruguay alcanzan los 10000 millones de dólares en 1993, una disminución del 1% respecto al ao 1992


In [66]:
# Ejemplo 5.1
new_data = {
    'Año': '1993',
    'Mes': '',
    'Trimestre': '',
    'Frecuencia': 'Anual',
    'Año_comparación': '1992',
    'País': 'Uruguay',
    'Tendencia': 'Disminuye',
    'Sector': 'Economía',
    'Indicador': 'Exportaciones',
    'Valor_actual': '10000',
    'Variación': '1%',
    'Valor_comparación': '',
    'Entidad': '',
}

noticia = generar_noticia(model, tokenizer, new_data)
print("Noticia generada:", noticia)

Noticia generada: Exportaciones alcanzan los 10000 millones de dólares en 1993, una caida del1% respecto al ao de 1992


In [70]:
# Ejemplo 5.2
new_data = {
    'Año': '1993',
    'Mes': '',
    'Trimestre': '',
    'Frecuencia': 'Anual',
    'Año_comparación': '1992',
    'País': 'Uruguay',
    'Tendencia': 'Disminuye',
    'Sector': 'Economía',
    'Indicador': 'Exportaciones',
    'Valor_actual': '10000',
    'Variación': '1%',
    'Valor_comparación': '',
    'Entidad': '',
}

noticia = generar_noticia(model, tokenizer, new_data)
print("Noticia generada:", noticia)

Noticia generada: Exportaciones de Uruguay alcanzó los 10000 millones de dólares en 1993


In [71]:
# Ejemplo 6
new_data = {
    'Año': '2015',
    'Mes': 'noviembre',
    'Trimestre': '',
    'Frecuencia': 'Mensual',
    'Año_comparación': '',
    'País': 'Chile',
    'Tendencia': 'Disminuye',
    'Sector': 'Economía',
    'Indicador': 'Precio_petróleo',
    'Valor_actual': '81',
    'Variación': '',
    'Valor_comparación': '',
    'Entidad': '',
}

noticia = generar_noticia(model, tokenizer, new_data)
print("Noticia generada:", noticia)

Noticia generada: El precio del barril de petróleo alcanzó los 81 dólares en noviembre de 2015


In [72]:
# Ejemplo 6.1
new_data = {
    'Año': '2015',
    'Mes': 'noviembre',
    'Trimestre': '',
    'Frecuencia': 'Mensual',
    'Año_comparación': '',
    'País': 'Chile',
    'Tendencia': 'Disminuye',
    'Sector': 'Economía',
    'Indicador': 'Precio_petróleo',
    'Valor_actual': '81',
    'Variación': '',
    'Valor_comparación': '',
    'Entidad': '',
}

noticia = generar_noticia(model, tokenizer, new_data)
print("Noticia generada:", noticia)

Noticia generada: El precio del petróleo alcanzó un 81 dólares por barril en noviembre de 2015


In [112]:
# Ejemplo 6.2
new_data = {
    'Año': '2015',
    'Mes': 'noviembre',
    'Trimestre': '',
    'Frecuencia': 'Mensual',
    'Año_comparación': '',
    'País': 'Chile',
    'Tendencia': 'Disminuye',
    'Sector': 'Economía',
    'Indicador': 'Precio_petróleo',
    'Valor_actual': '81',
    'Variación': '',
    'Valor_comparación': '',
    'Entidad': '',
}

noticia = generar_noticia(model, tokenizer, new_data)
print("Noticia generada:", noticia)

Noticia generada: El precio del petróleo alcanzó los 81 dólares en noviembre de 2015


In [76]:
# Ejemplo 7
new_data = {
    'Año': '2015',
    'Mes': 'noviembre',
    'Trimestre': '',
    'Frecuencia': 'Mensual',
    'Año_comparación': '',
    'País': 'Italia',
    'Tendencia': 'Disminuye',
    'Sector': 'Economía',
    'Indicador': 'Inversión_extranjera_directa',
    'Valor_actual': '19000',
    'Variación': '',
    'Valor_comparación': '',
    'Entidad': '',
}

noticia = generar_noticia(model, tokenizer, new_data)
print("Noticia generada:", noticia)

Noticia generada: La IED en Italia alcanzó los 19000 millones de dólares


In [78]:
# Ejemplo 7.1
new_data = {
    'Año': '2015',
    'Mes': 'noviembre',
    'Trimestre': '',
    'Frecuencia': 'Mensual',
    'Año_comparación': '',
    'País': 'Italia',
    'Tendencia': 'Disminuye',
    'Sector': 'Economía',
    'Indicador': 'Inversión_extranjera_directa',
    'Valor_actual': '19000',
    'Variación': '',
    'Valor_comparación': '',
    'Entidad': '',
}

noticia = generar_noticia(model, tokenizer, new_data)
print("Noticia generada:", noticia)

Noticia generada: La inversión extranjera directa en Italia alcanzó los 19000 millones de dólares au noviembre de 2015, una disminución del mercado.


In [79]:
# Ejemplo 7.2
new_data = {
    'Año': '2015',
    'Mes': 'noviembre',
    'Trimestre': '',
    'Frecuencia': 'Mensual',
    'Año_comparación': '',
    'País': 'Italia',
    'Tendencia': 'Disminuye',
    'Sector': 'Economía',
    'Indicador': 'Inversión_extranjera_directa',
    'Valor_actual': '19000',
    'Variación': '',
    'Valor_comparación': '',
    'Entidad': '',
}

noticia = generar_noticia(model, tokenizer, new_data)
print("Noticia generada:", noticia)

Noticia generada: La inversión extranjera directa (IED) en Italia alcanzó el 19000 millones de dólares a noviembre de 2015
