In [None]:
import os

import torch
from datasets import load_dataset
from rouge_score import rouge_scorer
from sacrebleu import corpus_bleu
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments

# Hub identifier of the base checkpoint; used for the tokenizer and as the
# fallback model when no previously fine-tuned model is found on disk.
pretrained_model_name = "google/pegasus-xsum"
# Directory checked for a previously fine-tuned model to continue from.
trained_model_dir = r"/content/drive/MyDrive/trained_model"
# Directory the newly fine-tuned model and tokenizer are written to after training.
# NOTE(review): this is a Windows drive path while trained_model_dir is a
# POSIX (Colab/Drive-looking) path — confirm both are valid on the machine
# that actually runs this script.
new_trained_model_dir = r"E:\NIKHIL\ML\Text Summerizer Using Deep Learning\models\pegasus-fine_tuned_model"  # Raw string keeps the backslashes literal

def model_exists(model_dir):
    """Return True if ``model_dir`` looks like a saved model directory.

    A bare directory is not enough: an empty or partially written folder
    would make ``from_pretrained`` fail instead of falling back to the base
    checkpoint. ``save_pretrained`` always writes a ``config.json``, so its
    presence is used as the marker of a loadable model.

    Args:
        model_dir: Filesystem path of the candidate model directory.

    Returns:
        bool: True when ``model_dir`` is a directory containing ``config.json``.
    """
    return os.path.isdir(model_dir) and os.path.isfile(
        os.path.join(model_dir, "config.json")
    )

# The tokenizer always comes from the base checkpoint
tokenizer = PegasusTokenizer.from_pretrained(pretrained_model_name)

# Decide which weights to start from: a previously trained model if one is
# available, otherwise the base pre-trained checkpoint
if model_exists(trained_model_dir):
    print("Loading trained model...")
    model_source = trained_model_dir
else:
    print("Loading pre-trained model...")
    model_source = pretrained_model_name
model = PegasusForConditionalGeneration.from_pretrained(model_source)

# Fetch the CNN/Daily Mail dataset (configuration "3.0.0")
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Keep only the first 60 training rows and the first 10 validation rows
train_dataset = dataset["train"].select(range(60))
eval_dataset = dataset["validation"].select(range(10))
def preprocess_function(examples):
    """Tokenize a batch of articles and highlights into model inputs and labels.

    Args:
        examples: Batch mapping with "article" and "highlights" entries
            (lists of strings, since the function is mapped with batched=True).

    Returns:
        The tokenized articles (truncated/padded to 512 tokens) with an added
        "labels" entry holding the tokenized highlights (truncated/padded to
        128 tokens). Padding positions in the labels are replaced by -100 so
        the loss ignores them instead of training the model to emit padding.
    """
    inputs = tokenizer(examples["article"], truncation=True, padding="max_length", max_length=512)
    targets = tokenizer(examples["highlights"], truncation=True, padding="max_length", max_length=128)

    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in targets["input_ids"]
    ]
    return inputs


# Apply the preprocessing function to the datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

# Remove the raw text columns; only the tokenized fields are fed to the model
train_dataset = train_dataset.remove_columns(["article", "highlights", "id"])
eval_dataset = eval_dataset.remove_columns(["article", "highlights", "id"])


def compute_metrics(pred):
    """Calculate ROUGE and SacreBLEU scores for an evaluation pass.

    The predicted token at each position is the argmax of the returned
    scores, i.e. these are per-position predictions rather than the output
    of ``model.generate``.

    Args:
        pred: Evaluation result exposing ``predictions`` and ``label_ids``.

    Returns:
        dict: Scalar metrics under the keys "eval_rouge1", "eval_rouge2",
        "eval_rougeL" (mean F-measure) and "eval_sacrebleu".
    """
    predictions, labels = pred.predictions, pred.label_ids

    # Some models return a tuple of outputs; the first entry is used as the scores.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    ignored = labels == -100  # positions that were masked out of the loss

    pred_ids = predictions.argmax(-1)  # Get the predicted token IDs
    # Predictions at ignored label positions carry no meaning; blank them so
    # they are dropped during decoding rather than scored as real output.
    pred_ids[ignored] = pad_id

    # -100 is not a valid vocabulary id, so restore the pad token before decoding.
    labels = labels.copy()
    labels[ignored] = pad_id

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_str = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = [rouge.score(l, p) for l, p in zip(labels_str, pred_str)]

    # An empty evaluation set would otherwise divide by zero below.
    if not rouge_scores:
        return {"eval_rouge1": 0.0, "eval_rouge2": 0.0, "eval_rougeL": 0.0, "eval_sacrebleu": 0.0}

    bleu = corpus_bleu(pred_str, [labels_str])

    def mean_fmeasure(rouge_type):
        """Average the F-measure of one ROUGE variant over all examples."""
        return sum(score[rouge_type].fmeasure for score in rouge_scores) / len(rouge_scores)

    # Logging scalar values only
    return {
        "eval_rouge1": mean_fmeasure('rouge1'),
        "eval_rouge2": mean_fmeasure('rouge2'),
        "eval_rougeL": mean_fmeasure('rougeL'),
        "eval_sacrebleu": bleu.score,
    }

# Define the training arguments, tuned for low memory usage.
# NOTE(review): only 60 training examples are selected earlier in this file,
# so with a per-device batch of 1 and 64 accumulation steps an epoch appears
# to yield a single optimizer update; the every-10-steps logging/saving/
# evaluation below would then never trigger during training. Confirm this is
# intended or lower gradient_accumulation_steps.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,  # One example per forward pass keeps memory low
    per_device_eval_batch_size=1,  # Same for evaluation
    num_train_epochs=1,  # Single pass over the training subset
    gradient_accumulation_steps=64,  # Simulate a larger batch size
    logging_dir="./logs",
    logging_steps=10,  # Log every 10 optimizer steps
    save_steps=10,  # Checkpoint every 10 optimizer steps
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=10,
    # Mixed precision requires a CUDA device; requesting it unconditionally
    # fails on CPU-only machines, so enable it only when a GPU is present.
    # NOTE(review): the Pegasus model docs mention FP16 as unsupported —
    # watch for NaN losses and consider disabling this if they appear.
    fp16=torch.cuda.is_available(),
    disable_tqdm=True,
)

# Collect everything the Trainer needs, then build it
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
    "compute_metrics": compute_metrics,  # custom ROUGE / SacreBLEU metrics
}
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning
trainer.train()

# Persist the fine-tuned model, then the tokenizer, to the separate output directory
for saveable in (model, tokenizer):
    saveable.save_pretrained(new_trained_model_dir)

# Run a final evaluation pass and show the resulting metrics dictionary
results = trainer.evaluate()
print(results)
