In [None]:
from transformers import MT5Tokenizer, MT5ForConditionalGeneration, Trainer, TrainingArguments
!pip install transformers
!pip install datasets
from google.colab import drive
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import os



In [None]:
# Make Google Drive visible to the runtime; the dataset lives there.
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/Cleaned_Golden_Data.xlsx'

# Read the spreadsheet and keep only the two columns used downstream:
# the source document ('text') and its reference summary ('summary').
df = pd.read_excel(file_path)[['text', 'summary']]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.25)

# Wrap each pandas split as a Hugging Face Dataset (train first, then test).
train_dataset, test_dataset = (
    Dataset.from_pandas(split_df) for split_df in (train_df, test_df)
)

In [None]:
# Alternative starting point (disabled): load the public hub checkpoint instead
# of the Drive copy by pointing both `from_pretrained` calls at
# "eslamxm/mt5-base-finetuned-urdu".

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Directory on Drive holding a previously fine-tuned model and its tokenizer.
save_path = "/content/drive/MyDrive/mt5_model_tuning_on50k"

# Tokenizer first, then the seq2seq model, both from the same directory.
tokenizer = AutoTokenizer.from_pretrained(save_path)
model = AutoModelForSeq2SeqLM.from_pretrained(save_path)

print("Fine-tuned model loaded successfully!")




Fine-tuned model loaded successfully!


In [None]:
max_length = 512  # Token cap applied to both source texts and target summaries

def preprocess_data(examples):
    """Tokenize a batch of text/summary pairs for seq2seq training.

    Args:
        examples: Batch mapping with "text" and "summary" columns (lists of
            values, as passed by `Dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output for the texts, with an added "labels" entry that
        holds the token ids of the summaries.

    Sequences are truncated to `max_length` but deliberately NOT padded here.
    Padding the targets to a fixed length would put pad-token ids into
    "labels", and those positions would then be scored by the loss. Leaving
    the sequences unpadded lets `DataCollatorForSeq2Seq` pad each batch to its
    longest member and fill label padding with -100, which the loss ignores.
    """
    # Ensure all inputs and targets are strings and replace missing values with ""
    inputs = [str(text) if pd.notna(text) else "" for text in examples["text"]]
    targets = [str(summary) if pd.notna(summary) else "" for summary in examples["summary"]]

    # Tokenize inputs (truncate only; padding is done per batch by the collator)
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True)

    # Tokenize targets using `text_target` so they are encoded as decoder-side text
    labels = tokenizer(text_target=targets, max_length=max_length, truncation=True)

    # Add labels to model inputs
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Run the batched preprocessing over both splits (train first, then test),
# rebinding each name to its tokenized counterpart.
train_dataset, test_dataset = (
    split.map(preprocess_data, batched=True)
    for split in (train_dataset, test_dataset)
)


Map:   0%|          | 0/147 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [None]:
# Configure TrainingArguments
output_dir = "/content/drive/MyDrive/training-results-mt5-urdu-summarization"  # Model output directory
os.makedirs(output_dir, exist_ok=True)

batch_size = 4  # Per-device batch size; adjust based on available GPU memory

# Requesting bf16 on hardware without bfloat16 support makes TrainingArguments
# raise, so only enable it when the current GPU reports support. Otherwise
# training falls back to full precision.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir=output_dir,
    # NOTE(review): newer transformers releases appear to rename this argument
    # to `eval_strategy`; confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Gradients are accumulated over 4 batches, i.e. 16 examples per device
    # for each optimizer step.
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    weight_decay=0.01,
    save_total_limit=1,
    # NOTE(review): a run that finishes in fewer than 2000 optimizer steps
    # never reaches a step-based checkpoint with these settings.
    save_strategy="steps",
    save_steps=2000,
    bf16=use_bf16,
)

# Batch assembly: pad every batch to the length of its longest example.
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding="longest",
)



In [None]:
# Bundle the model, hyperparameters, tokenized splits and collator into a Trainer.
trainer = Trainer(
    data_collator=data_collator,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

# Run fine-tuning. This stays the last expression of the cell so that the
# returned TrainOutput is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss
0,No log,1.556414
1,No log,1.541973


TrainOutput(global_step=18, training_loss=1.5174279742770724, metrics={'train_runtime': 34.2884, 'train_samples_per_second': 8.574, 'train_steps_per_second': 0.525, 'total_flos': 344126239801344.0, 'train_loss': 1.5174279742770724, 'epoch': 1.945945945945946})

In [None]:
# Persist the model weights and the tokenizer files into one Drive directory.
# Note: this rebinds `save_path`, which earlier held the directory the
# checkpoint was loaded from.
save_path = "/content/drive/MyDrive/mt5_final_tuned"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)
print(f"Model and tokenizer saved to {save_path}")

Model and tokenizer saved to /content/drive/MyDrive/mt5_final_tuned


In [None]:
# Evaluation - Generate Summaries for Test Data
import re  # Used to strip leftover sentinel tokens from the decoded text
def generate_summary(text):
    """Generate a summary for a single document with the loaded model.

    Args:
        text: Source document. Missing values (None/NaN) and blank strings
            return an empty summary immediately, so they are not formatted
            into the prompt as the literal string "nan".

    Returns:
        The decoded summary with special tokens skipped, any `<extra_id_N>`
        sentinel text removed, and surrounding whitespace stripped.
    """
    if pd.isna(text) or not str(text).strip():
        return ""

    # NOTE(review): the "summarize: " prefix is added here, but
    # `preprocess_data` tokenizes the raw text without it. Confirm which form
    # the checkpoint was trained with and align the two.
    # Inputs go to whatever device the model is on instead of assuming "cuda".
    inputs = tokenizer(
        f"summarize: {text}", return_tensors="pt", max_length=512, truncation=True
    ).to(model.device)

    summary_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],  # pass the mask explicitly
        max_length=512,  # Upper bound on the generated length, in tokens
        min_length=30,  # Lower bound on the generated length, in tokens
        num_beams=8,  # Beam search width
        no_repeat_ngram_size=3,  # No 3-gram may appear twice in the output
        length_penalty=1.2,  # Values above 1.0 favour longer beam hypotheses
        early_stopping=True,
    )

    # Decode the generated tokens
    generated_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Remove any sentinel tokens (<extra_id_0>, <extra_id_1>, ...) that survive decoding
    cleaned_summary = re.sub(r"<extra_id_\d+>", "", generated_text).strip()

    return cleaned_summary



# Evaluate on a Sample of the Test Set
# Use the GPU when one is present; otherwise keep the model on the CPU.
eval_device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(eval_device)
model.eval()  # Switch off dropout so generation is deterministic

# Sample at most 50 rows; asking `sample` for more rows than the split holds raises.
sample_size = min(50, len(test_df))
sampled_test_df = test_df.sample(n=sample_size, random_state=42)

# One record per sampled row: the source text, its reference summary and the
# summary produced by the model.
results = []
for _, row in sampled_test_df.iterrows():
    text = row["text"]
    reference_summary = row["summary"]
    generated_summary = generate_summary(text)
    results.append({
        "Original Text": text,
        "Reference Summary": reference_summary,
        "Generated Summary": generated_summary,
    })

# Save Results to Excel
results_df = pd.DataFrame(results)
results_df.to_excel("/content/drive/MyDrive/mT5_final_trained.xlsx", index=False)
print("Summaries saved to mT5_final_trained.xlsx")




Summaries saved to mT5_final_trained.xlsx
