In [None]:
import os
import re
import torch
import unicodedata
from torch.utils.data import Dataset, DataLoader
from transformers import BartForConditionalGeneration, BartTokenizer, get_scheduler, MarianMTModel, MarianTokenizer
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import pandas as pd
from collections import Counter

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
def jaccard_similarity(str1, str2):
    """Return the Jaccard index of the two strings' word sets.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A ∩ B| / |A ∪ B|`` over the resulting sets of tokens.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float in [0, 1], or the integer 0 when neither string has tokens.
    """
    vocab_a = {token for token in str1.lower().split()}
    vocab_b = {token for token in str2.lower().split()}

    combined = vocab_a | vocab_b
    # Two empty strings have no union; report zero overlap rather than dividing by zero.
    if len(combined) == 0:
        return 0

    shared = vocab_a & vocab_b
    return len(shared) / len(combined)

In [None]:
class SummarizationDataset(Dataset):
    """Article/summary pairs tokenized on access for seq2seq fine-tuning.

    Each item is a dict with ``input_ids`` and ``attention_mask`` for the
    article and ``labels`` for the summary. Every tensor is padded or
    truncated to a fixed length (``max_input_length`` / ``max_summary_length``).

    Args:
        texts: Sequence of article strings.
        summaries: Sequence of reference summaries, aligned with ``texts``.
        tokenizer: Callable tokenizer exposing ``pad_token_id``; it is called
            with ``max_length``, ``padding``, ``truncation`` and
            ``return_tensors`` keyword arguments.
        max_input_length: Token length every article is padded/truncated to.
        max_summary_length: Token length every summary is padded/truncated to.
    """

    def __init__(self, texts, summaries, tokenizer, max_input_length=1024, max_summary_length=150):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_summary_length = max_summary_length

    def __len__(self):
        """Number of article/summary pairs (driven by ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize pair ``idx`` and return its 1-D ``input_ids``, ``attention_mask`` and ``labels``."""
        text = self.texts[idx]
        summary = self.summaries[idx]

        input_enc = self.tokenizer(
            text,
            max_length=self.max_input_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        summary_enc = self.tokenizer(
            summary,
            max_length=self.max_summary_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # squeeze(0) removes only the leading batch axis added by return_tensors='pt'.
        # A bare squeeze() would also drop the sequence axis when its length is 1.
        labels = summary_enc.input_ids.squeeze(0)
        # Padding positions become -100 so the loss ignores them; masked_fill
        # returns a new tensor instead of mutating the tokenizer's output.
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

        return {
            'input_ids': input_enc.input_ids.squeeze(0),
            'attention_mask': input_enc.attention_mask.squeeze(0),
            'labels': labels
        }

In [None]:
# --------- CONFIGURATION ---------

# Checkpoint to fine-tune; point this at another BioBART model if you have one
model_name = "GanjinZero/biobart-base"

# Load the tokenizer first, then the model, both from the same checkpoint
print("Cargando tokenizer y modelo...")
tokenizer, model = (
    BartTokenizer.from_pretrained(model_name),
    BartForConditionalGeneration.from_pretrained(model_name),
)


# Text-cleaning helper
def clean_text(text):
    """Normalize a raw string into single-spaced text with a restricted character set.

    Steps, in order: NFKC normalization, line breaks and tabs to spaces,
    removal of every character in a Unicode "C" (control/format/...) category,
    replacement of anything outside the allowed set (ASCII letters and digits,
    Spanish accented letters, basic punctuation, space) by a space, whitespace
    collapsing, and trimming.

    Args:
        text: Value to clean. Anything that is not a ``str`` yields ``""``.

    Returns:
        The cleaned string.
    """
    if not isinstance(text, str):
        return ""

    # Unicode normalization (compatibility forms, composed accents, ...)
    normalized = unicodedata.normalize('NFKC', text)

    # Line breaks and tabs become spaces before control characters are removed
    normalized = re.sub(r'[\r\n\t]+', ' ', normalized)

    # Drop every character in a "C*" Unicode category
    printable = ''.join(
        filter(lambda ch: not unicodedata.category(ch).startswith('C'), normalized)
    )

    # Anything outside the allowed characters is replaced by a space
    allowed_only = re.sub(r'[^a-zA-Z0-9áéíóúÁÉÍÓÚüÜñÑ.,;:()\-\'\" ]+', ' ', printable)

    # Collapse whitespace runs and trim both ends
    return re.sub(r'\s+', ' ', allowed_only).strip()

# --- Load the Excel index and read each article's text ---
file_path = "./dataset_con_documento.xlsx"
df = pd.read_excel(file_path)

# Folder that holds the converted .txt articles
base_path = "./txt_convertidos"

texts = []
summaries = []

# Walk the two columns in lockstep: 'documento' names the .txt file and
# 'abstract' holds the reference summary for that same row.
for file_name, abstract in zip(df['documento'], df['abstract']):
    with open(os.path.join(base_path, file_name), 'r', encoding='utf-8') as f:
        texts.append(f.read())
    summaries.append(abstract)

print(f"Cargados {len(texts)} artículos y {len(summaries)} abstracts.")

# Clean both the articles and the summaries
texts = list(map(clean_text, texts))
summaries = list(map(clean_text, summaries))

Cargando tokenizer y modelo...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Cargados 110 artículos y 110 abstracts.


In [None]:
# Training hyperparameters
max_input_length = 1024    # token length articles are padded/truncated to
max_summary_length = 350   # token length reference summaries are padded/truncated to
batch_size = 8
num_epochs = 12
learning_rate = 4e-5

In [None]:
# --- Train / validation / test split ---

# Step 1: hold out 15% of all pairs as the test set
train_val_texts, test_texts, train_val_summaries, test_summaries = train_test_split(
    texts,
    summaries,
    random_state=42,
    test_size=0.15,
)

# Step 2: take 20% of what remains as the validation set
train_texts, val_texts, train_summaries, val_summaries = train_test_split(
    train_val_texts,
    train_val_summaries,
    random_state=42,
    test_size=0.2,
)

print(f'Tamaño train: {len(train_texts)}')
print(f'Tamaño val: {len(val_texts)}')
print(f'Tamaño test: {len(test_texts)}')

Tamaño train: 74
Tamaño val: 19
Tamaño test: 17


In [None]:
# Build the datasets
train_dataset = SummarizationDataset(train_texts, train_summaries, tokenizer, max_input_length, max_summary_length)
val_dataset = SummarizationDataset(val_texts, val_summaries, tokenizer, max_input_length, max_summary_length)
test_dataset = SummarizationDataset(test_texts, test_summaries, tokenizer, max_input_length, max_summary_length)

# Build the DataLoaders; only the training split is shuffled
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Move the model to the target device BEFORE building the optimizer, as the
# PyTorch optimizer docs recommend. `device` is the one selected in the
# imports/setup cell, so it is not recomputed here. Because this call is no
# longer the last expression of the cell, the module repr is not dumped as output.
model.to(device)

optimizer = AdamW(model.parameters(), lr=learning_rate)

# The scheduler is stepped once per batch (see train_epoch), so the schedule
# spans batches-per-epoch x epochs steps, decaying linearly with no warm-up.
num_training_steps = len(train_loader) * num_epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)



BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_lay

In [None]:

# ----- TRAINING FUNCTION -----
def train_epoch(model, dataloader, optimizer, lr_scheduler, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    For every batch: gradients are cleared, the batch tensors are moved to
    ``device``, the loss reported by the model is back-propagated, and both the
    optimizer and the learning-rate scheduler take one step.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss``.
        dataloader: Sized iterable of dict batches holding those three tensors.
        optimizer: Optimizer updating the model's parameters.
        lr_scheduler: Scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    tensor_keys = ('input_ids', 'attention_mask', 'labels')
    batch_losses = []

    model.train()
    for batch in dataloader:
        optimizer.zero_grad()
        model_inputs = {key: batch[key].to(device) for key in tensor_keys}

        batch_loss = model(**model_inputs).loss
        batch_loss.backward()
        optimizer.step()
        lr_scheduler.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(dataloader)


# ----- EVALUATION FUNCTION -----
def eval_epoch(model, dataloader, device):
    """Return the mean batch loss of ``model`` over ``dataloader`` without training.

    The model is switched to eval mode and every forward pass runs under
    ``torch.no_grad()``, so no gradients are recorded.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss``.
        dataloader: Sized iterable of dict batches holding those three tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    tensor_keys = ('input_ids', 'attention_mask', 'labels')
    batch_losses = []

    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            model_inputs = {key: batch[key].to(device) for key in tensor_keys}
            batch_losses.append(model(**model_inputs).loss.item())

    return sum(batch_losses) / len(dataloader)





In [None]:
# --------- TRAINING ---------
# Each epoch: one optimisation pass over the training split, then a loss-only
# pass over the validation split; both mean losses are reported.
for epoch in range(num_epochs):
    train_loss = train_epoch(
        model=model,
        dataloader=train_loader,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        device=device,
    )
    val_loss = eval_epoch(model=model, dataloader=val_loader, device=device)
    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.4f} | Validation Loss: {val_loss:.4f}")



Epoch 1 | Train Loss: 1.3395 | Validation Loss: 0.5990
Epoch 2 | Train Loss: 0.4862 | Validation Loss: 0.4691
Epoch 3 | Train Loss: 0.3306 | Validation Loss: 0.4473
Epoch 4 | Train Loss: 0.2796 | Validation Loss: 0.4068
Epoch 5 | Train Loss: 0.2727 | Validation Loss: 0.3799
Epoch 6 | Train Loss: 0.2192 | Validation Loss: 0.3739
Epoch 7 | Train Loss: 0.1961 | Validation Loss: 0.3725
Epoch 8 | Train Loss: 0.2095 | Validation Loss: 0.3693
Epoch 9 | Train Loss: 0.1741 | Validation Loss: 0.3676
Epoch 10 | Train Loss: 0.1751 | Validation Loss: 0.3696
Epoch 11 | Train Loss: 0.1634 | Validation Loss: 0.3690
Epoch 12 | Train Loss: 0.2801 | Validation Loss: 0.3682


In [None]:
# --------- SUMMARY GENERATION ---------
def generate_summary(text, model, tokenizer, device,
                     max_length=max_input_length,
                     num_beams=10,
                     length_penalty=1.2,
                     early_stopping=True):
    """Generate a summary of ``text`` with beam search and return it as a string.

    Args:
        text: Article to summarize; tokenized and truncated to
            ``max_input_length`` tokens.
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer used both to encode ``text`` and to decode the output.
        device: Device the encoded inputs are moved to.
        max_length: Upper bound on the generated sequence length, in tokens.
        num_beams: Beam width.
        length_penalty: Length penalty forwarded to ``generate``.
        early_stopping: Early-stopping setting forwarded to ``generate``.

    Returns:
        The first generated sequence decoded with special tokens removed.

    NOTE(review): the ``max_length`` default is the *input* budget
    (``max_input_length``), not ``max_summary_length``. It is kept for
    backward compatibility; pass ``max_length`` explicitly to cap summaries
    at the length used for training targets.
    """
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", max_length=max_input_length, truncation=True).to(device)
    summary_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=max_length,
        num_beams=num_beams,
        # Both arguments are now forwarded to generate(): length_penalty used to be
        # dropped and early_stopping was hard-coded to True.
        length_penalty=length_penalty,
        early_stopping=early_stopping
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# --------- TEST-SET EVALUATION ---------
def evaluate_test_set(test_texts, test_summaries, model, tokenizer, device):
    """Generate a summary per test article and report Jaccard similarity to its reference.

    For each article the generated summary, the reference summary and their
    token-set Jaccard similarity are printed, followed by the average over
    the whole set. Uses the notebook-level ``generate_summary`` and
    ``jaccard_similarity`` helpers.

    Args:
        test_texts: Sequence of article strings.
        test_summaries: Sequence of reference summaries, aligned with ``test_texts``.
        model: Seq2seq model passed to ``generate_summary``.
        tokenizer: Tokenizer passed to ``generate_summary``.
        device: Device passed to ``generate_summary``.

    Returns:
        The average Jaccard similarity, or ``None`` when the test set is empty.

    Raises:
        ValueError: If ``test_texts`` and ``test_summaries`` differ in length.
    """
    if len(test_texts) != len(test_summaries):
        raise ValueError(
            "test_texts and test_summaries differ in length: "
            f"{len(test_texts)} != {len(test_summaries)}"
        )

    model.eval()

    # Nothing to average: report it instead of dividing by zero below.
    if len(test_texts) == 0:
        print("No hay artículos de test para evaluar.")
        return None

    similarities = []
    for i, text in enumerate(test_texts):
        gen_summary = generate_summary(text, model, tokenizer, device)
        jaccard = jaccard_similarity(gen_summary, test_summaries[i])
        similarities.append(jaccard)
        print(f"Artículo {i+1}: Jaccard similarity = {jaccard:.4f}")
        print(f"Resumen generado: {gen_summary}")
        print(f"Resumen original: {test_summaries[i]}\n")

    avg_sim = sum(similarities) / len(similarities)
    print(f"Similitud Jaccard promedio en test: {avg_sim:.4f}")
    return avg_sim


# Evaluate the fine-tuned model on the held-out test split; prints each
# generated/reference pair with its Jaccard similarity, then the average.
evaluate_test_set(test_texts, test_summaries, model, tokenizer, device)

Artículo 1: Jaccard similarity = 0.1095
Resumen generado: LIMCas (low- to intermediate-grade invasive mammary carcinomas with discohesive tumor cells with single-file infiltrative growth patterns dispersed in the fibrous stroma2. The differences between IDC and ILC, from clinicopathological features to prognostic outcomes, have been extensively reported in the literature, sometimes with conflicting results3 5. More recently, attention has turned to the molecular and evolutionary differ- ences between the two entities and their precursor lesions, laying the foundations for personalized management of breast cancer.
Resumen original: This study describes lobular-like invasive mammary carcinomas (LLIMCas), a group of low- to intermediate-grade invasive mammary carcinomas with discohesive, diffusely infiltrative cells showing retained circumferential membranous immunoreactivity for both E-cadherin and p120. We analyzed the clinical-pathologic features of 166 LLIMCas compared to 104 classica

In [None]:
!pip install rouge_score



In [None]:
from rouge_score import rouge_scorer

# Shared scorer for ROUGE-1, ROUGE-2 and ROUGE-L with stemming enabled; used by evaluate_rouge.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def evaluate_rouge(references, predictions):
    """Average ROUGE-1 / ROUGE-2 / ROUGE-L F-measures over aligned summary pairs.

    Each pair is scored with the notebook-level ``scorer`` as
    ``scorer.score(reference, prediction)``.

    Args:
        references: Iterable of reference summaries.
        predictions: Iterable of generated summaries, aligned with ``references``.

    Returns:
        Dict mapping ``'rouge1'``, ``'rouge2'`` and ``'rougeL'`` to the mean
        F-measure. All three are ``0.0`` when there are no pairs to score.

    Raises:
        ValueError: If the two inputs differ in length (``zip`` would
            otherwise silently drop the unmatched tail).
    """
    # Materialize once so generators can be measured and then iterated.
    references = list(references)
    predictions = list(predictions)
    if len(references) != len(predictions):
        raise ValueError(
            "references and predictions differ in length: "
            f"{len(references)} != {len(predictions)}"
        )

    scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    # No pairs: return zeros instead of dividing by zero.
    if not references:
        return {key: 0.0 for key in scores}

    for ref, pred in zip(references, predictions):
        score = scorer.score(ref, pred)
        for key in scores:
            scores[key].append(score[key].fmeasure)
    avg_scores = {key: sum(vals) / len(vals) for key, vals in scores.items()}
    return avg_scores

# Example: score the fine-tuned model on the held-out test split
predicted_summaries = list(
    map(lambda article: generate_summary(article, model, tokenizer, device), test_texts)
)
rouge_scores = evaluate_rouge(references=test_summaries, predictions=predicted_summaries)

print("ROUGE Scores:", rouge_scores)


ROUGE Scores: {'rouge1': 0.497814861682714, 'rouge2': 0.42088737906258367, 'rougeL': 0.4650868964691566}


In [None]:
# Directory that receives both the fine-tuned model and its tokenizer
output_dir = "./model_finetuned_biobart"

# Save the model
model.save_pretrained(output_dir)

# Save the tokenizer alongside the model, in the same directory
tokenizer.save_pretrained(output_dir)



('./model_finetuned_biobart/tokenizer_config.json',
 './model_finetuned_biobart/special_tokens_map.json',
 './model_finetuned_biobart/vocab.json',
 './model_finetuned_biobart/merges.txt',
 './model_finetuned_biobart/added_tokens.json')