In [None]:
import torch
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset
from nltk.translate.bleu_score import sentence_bleu
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
import os
# Opt out of Weights & Biases logging; set here, ahead of the cells that build the Trainer.
os.environ.update({"WANDB_DISABLED": "true"})


: 

In [None]:
# Load the source CSV; later cells read its 'original_text' and 'summary' columns.
df = pd.read_csv(
    filepath_or_buffer='Open-Patients-With-Summaries.csv',
)

In [None]:
# Pick the compute device once: "cuda" when a CUDA GPU is usable, "cpu" otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Ask PyTorch to release cached, currently unused GPU memory before the model is loaded.
torch.cuda.empty_cache()

In [None]:
# Load the pretrained Pegasus checkpoint together with its tokenizer.
model_name = 'google/pegasus-large'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
# Place the model on the device selected in the `device` cell rather than a
# hard-coded 'cuda', so this cell also runs on machines without a GPU.
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of source texts and reference summaries for training.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping with a list of source texts under
            'original_text' and a list of reference summaries under 'summary'.

    Returns:
        dict: The tokenizer's fields for the source texts (padded/truncated to
        256 tokens) plus 'labels': the summary token ids (padded/truncated to
        128 tokens) with every padding id replaced by -100.

    Notes:
        Plain Python lists are returned and nothing is moved to a GPU here.
        Preprocessing therefore works on CPU-only machines; placing batches on
        the training device is left to the training loop.
    """
    features = dict(tokenizer(
        examples['original_text'], padding="max_length", truncation=True, max_length=256
    ))
    summaries = tokenizer(
        examples['summary'], padding="max_length", truncation=True, max_length=128
    )

    # -100 is the label value the loss skips, so padded positions add nothing to it.
    pad_id = tokenizer.pad_token_id
    features["labels"] = [
        [(token_id if token_id != pad_id else -100) for token_id in sequence]
        for sequence in summaries["input_ids"]
    ]
    return features

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Wrap both pandas splits as `Dataset` objects (train first, then eval).
train_dataset, eval_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

In [None]:
# Tokenize both splits in batches with `preprocess_function` (train first, then eval).
train_dataset, eval_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, eval_dataset)
)

In [None]:
# Collator handed to the Trainer to assemble tokenized examples into batches.
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
    padding=True,
)

In [None]:

# Fine-tuning configuration: one epoch, per-device batch size 1 with gradients
# accumulated over 4 steps, checkpoint and evaluation at the end of the epoch.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_steps=500,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    logging_dir='./logs',
    # Mixed precision is only requested when a CUDA device exists, so the
    # cell does not error out on CPU-only machines.
    fp16=torch.cuda.is_available(),
    # The string "none" switches all reporting integrations off. The Python
    # value None is resolved to the library default instead, which (depending
    # on the installed transformers version) can enable every integration.
    report_to="none")


# Wire model, data and collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator)

# Run the fine-tuning loop.
trainer.train()

In [None]:
# Write the fine-tuned weights and the tokenizer files into the same directory.
save_path = './pegasus-finetuned'
trainer.save_model(output_dir=save_path)
tokenizer.save_pretrained(save_directory=save_path)

In [None]:
def calculate_metrics(row):
    """Generate a summary for one record and score it against its reference.

    Args:
        row: Mapping-like record (for example a DataFrame row) holding the
            source text under 'original_text' and the reference summary
            under 'summary'.

    Returns:
        tuple: ``(bleu, predicted_summary)`` where ``bleu`` is the
        sentence-level BLEU of the whitespace-tokenized prediction against the
        whitespace-tokenized reference, and ``predicted_summary`` is the
        decoded model output.
    """
    # A single sequence needs no padding. The encoding is moved to whatever
    # device the model lives on so the two can never disagree.
    encoded = tokenizer(
        row['original_text'], return_tensors="pt", truncation=True, max_length=256
    ).to(model.device)

    # The attention mask is passed explicitly so generation only attends to
    # real tokens.
    generated_ids = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=128,
        num_beams=5,
        early_stopping=True,
    )
    predicted_summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    bleu = sentence_bleu([row['summary'].split()], predicted_summary.split())

    return bleu, predicted_summary

In [None]:
# Generate and score every test row; each returned (bleu, summary) pair is
# expanded into its own two-column row of the result frame.
test_metrics = test_df.apply(
    calculate_metrics,
    axis='columns',
    result_type='expand',
)
test_metrics.columns = ['bleu_score', 'predicted_summary']

In [None]:
# Attach the freshly computed metrics to the test rows (aligned on the index).
# Columns that already carry the same names are dropped first, so re-running
# this cell replaces them instead of accumulating duplicate column names.
test_df = pd.concat(
    [test_df.drop(columns=test_metrics.columns, errors='ignore'), test_metrics],
    axis=1,
)

In [None]:
# Flag every column whose name appears again further to the right. Dropping
# the flagged columns keeps exactly one column per name: the last occurrence,
# i.e. the copy appended most recently.
duplicate_columns = test_df.columns.duplicated(keep='last')

test_df = test_df.loc[:, ~duplicate_columns]

print(test_df.columns)

In [None]:
# Mean of the per-row BLEU scores in the test frame.
avg_bleu = test_df.loc[:, 'bleu_score'].mean()
print(f"Average Test BLEU Score: {avg_bleu:.4f}")

In [None]:
# Sanity check: show the first few scores, then the column's dtype.
bleu_scores = test_df['bleu_score']
print(bleu_scores.head())
print(bleu_scores.dtype)

In [None]:
# Persist the test frame (including the metric columns) without the index column.
output_path = './Test-Predicted-With-Metrics-Pegasus.csv'
test_df.to_csv(path_or_buf=output_path, index=False)

In [None]:
# Confirm where the results file was written.
print(f"Test results saved to {output_path}")

In [None]:
# Read the saved results file back from disk.
df_results = pd.read_csv(
    filepath_or_buffer='Test-Predicted-With-Metrics-Pegasus.csv',
)

In [None]:
# Preview the first rows of the reloaded results.
df_results.head()

In [None]:
# The results file written by this notebook stores BLEU under 'bleu_score'.
# 'bleu_score_predicted' is only used as a fallback for files that carry the
# score under that name instead.
bleu_column_name = (
    'bleu_score' if 'bleu_score' in df_results.columns else 'bleu_score_predicted'
)
avg_bleu = df_results[bleu_column_name].mean()
print(f"Average Test BLEU Score: {avg_bleu:.4f}")

In [None]:
# Preview the ROUGE columns of the test results (not the full dataset `df`,
# which also contains the training rows).
df_results[['rouge1','rouge2','rougeL']].head()

In [None]:
# Average each ROUGE column over the test results only. These values are
# reported as "Test" scores, so they must not include the training rows
# that are present in the full dataset `df`.
avg_rouge1, avg_rouge2, avg_rougeL = df_results[['rouge1','rouge2','rougeL']].mean()

In [None]:
# Report the three ROUGE averages, one per line.
print(
    f"Average Test Rouge1 Score: {avg_rouge1:.4f}",
    f"Average Test Rouge2 Score: {avg_rouge2:.4f}",
    f"Average Test RougeL Score: {avg_rougeL:.4f}",
    sep="\n",
)