# Multilingual Text Summarizer using MT5

---



In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

## 1. Installations

In [4]:
# !pip install pandas numpy torch scikit-learn transformers sentencepiece evaluate rouge_score accelerate

## 2. Imports

In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
import re
import os
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk

from transformers import MT5Tokenizer, MT5ForConditionalGeneration, DataCollatorForSeq2Seq

from transformers.modeling_utils import PreTrainedModel
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union

import evaluate
import traceback
import gc


In [6]:
nltk.download('punkt', quiet=True)

True

## 3. Configuration

In [7]:
MODEL_NAME_OR_PATH = "google/mt5-small"
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 128
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
TEST_BATCH_SIZE = 8
NUM_TRAIN_EPOCHS = 2
LEARNING_RATE = 3e-5
ADAM_EPSILON = 1e-7
WEIGHT_DECAY = 0.01
MAX_GRAD_NORM = 1.0

OUTPUT_DIR = "./mt5_custom"
COMBINED_TRAIN_DATA_PATH = "combined_train_data.csv"
COMBINED_TEST_DATA_PATH = "combined_test_data.csv"
DATA_DIR = "/content/drive/MyDrive/sps/data"
MIN_SUMMARY_WORDS = 4
LOG_INTERVAL = 100

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## 4. Data Loading and Preprocessing Functions

In [8]:
def clean_text_series_for_metrics(text):
    text = str(text).lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.strip()
    return text

def create_combined_datasets_if_not_exist():
    if not os.path.exists(DATA_DIR):
        os.makedirs(DATA_DIR)
        print(f"Created directory: {DATA_DIR}. Please upload your CSV files here.")
    if os.path.exists(COMBINED_TRAIN_DATA_PATH) and os.path.exists(COMBINED_TEST_DATA_PATH):
        print(f"Found existing combined datasets: {COMBINED_TRAIN_DATA_PATH} and {COMBINED_TEST_DATA_PATH}")
        return
    print(f"Combined datasets not found. Attempting to create them from CSVs in '{DATA_DIR}/' directory...")
    train_files_langs = {
        "english_train.csv": "en", "hindi_train.csv": "hi",
        "gujrati_train.csv": "gu", "bengali_train.csv": "bn",
    }
    test_files_langs = {
        "english_test.csv": "en", "hindi_test.csv": "hi",
        "gujrati_test.csv": "gu", "bengali_test.csv": "bn",
    }
    def process_files(file_lang_map, output_path, dataset_type):
        if os.path.exists(output_path):
            print(f"Found existing combined {dataset_type} dataset: {output_path}")
            return
        all_data = []
        for file_name, lang in file_lang_map.items():
            full_file_path = os.path.join(DATA_DIR, file_name)
            if not os.path.exists(full_file_path):
                print(f"Missing file: {full_file_path}")
                continue
            try:
                df = pd.read_csv(full_file_path, encoding='utf-8', on_bad_lines='skip')
                if 'Article' not in df.columns or 'Summary' not in df.columns:
                    print(f" 'Article' or 'Summary' column missing in {file_name}. Skipping.")
                    continue
                df['Article'] = df['Article'].astype(str).str.strip()
                df['Summary'] = df['Summary'].astype(str).str.strip()
                df.dropna(subset=['Article', 'Summary'], inplace=True)
                df = df[df['Article'].str.len() > 0]
                df = df[df['Summary'].str.len() > 0]
                df = df[df['Summary'].apply(lambda x: len(x.split()) >= MIN_SUMMARY_WORDS)]
                if df.empty:
                    continue
                df['lang'] = lang
                all_data.append(df[['Article', 'Summary', 'lang']])
                print(f"Loaded and processed for {dataset_type}: {file_name}, kept {len(df)} rows.")
            except Exception as e:
                print(f"Error processing {file_name}: {e}")
        if not all_data:
            print(f"No data loaded for {dataset_type} after filtering.")
            return
        combined_df = pd.concat(all_data, ignore_index=True)
        if combined_df.empty:
            print(f"Combined {dataset_type} dataset is empty. Cannot save.")
            return
        combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)
        combined_df.to_csv(output_path, index=False)
        print(f"Combined {dataset_type} dataset created and saved to: {output_path} with {len(combined_df)} rows.")
    process_files(train_files_langs, COMBINED_TRAIN_DATA_PATH, "train")
    process_files(test_files_langs, COMBINED_TEST_DATA_PATH, "test")

def load_data(data_path):
    if not os.path.exists(data_path):
        print(f"Data file not found: {data_path}")
        if data_path == COMBINED_TRAIN_DATA_PATH or data_path == COMBINED_TEST_DATA_PATH:
             create_combined_datasets_if_not_exist()
             if not os.path.exists(data_path):
                 return None
        else:
            return None
    try:
        df = pd.read_csv(data_path)
        print(f"Successfully loaded data from {data_path}, shape: {df.shape}")
        return df
    except pd.errors.EmptyDataError:
        print(f"Warning: Data file {data_path} is empty.")
        return None

## 5. Tokenizer and Model Loading

In [9]:
print(f"Loading tokenizer from: {MODEL_NAME_OR_PATH}")
tokenizer = MT5Tokenizer.from_pretrained(MODEL_NAME_OR_PATH)

print(f"Loading pre-trained MT5 model for fine-tuning from: {MODEL_NAME_OR_PATH}")
try:
    model = MT5ForConditionalGeneration.from_pretrained(MODEL_NAME_OR_PATH)
    model.to(DEVICE)
    print(f"Tokenizer and pre-trained model loaded successfully to {DEVICE}.")
except Exception as e:
    print(f"CRITICAL ERROR during model loading: {e}")
    traceback.print_exc()

Loading tokenizer from: google/mt5-small


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
You are using the default legacy behaviour of the <class 'transformers.models.mt5.tokenization_mt5.MT5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Loading pre-trained MT5 model for fine-tuning from: google/mt5-small


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Tokenizer and pre-trained model loaded successfully to cuda.


## 6. Summarization Dataset Class

In [10]:
def prefix_by_lang(lang):
    return f"summarize in {lang}: "

class SummarizationDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_input_len, max_target_len):
        self.data = dataframe.copy()
        self.data['Article'] = self.data['Article'].astype(str).str.strip()
        self.data['Summary'] = self.data['Summary'].astype(str).str.strip()
        initial_len = len(self.data)
        self.data = self.data[self.data['Article'].str.len() > 0]
        self.data = self.data[self.data['Summary'].str.len() > 0]
        self.data = self.data[self.data['Summary'].apply(lambda x: len(x.split()) >= MIN_SUMMARY_WORDS)]
        self.data = self.data.reset_index(drop=True)
        if len(self.data) < initial_len:
            print(f"SummarizationDataset: Initialized with {len(self.data)} rows after filtering {initial_len - len(self.data)} empty/short entries.")
        if len(self.data) == 0:
            print("CRITICAL: SummarizationDataset is empty after filtering. No data available.")
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_target_len = max_target_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        if index >= len(self.data):
            raise IndexError("Index out of bounds in SummarizationDataset")
        row = self.data.iloc[index]
        article_lang = row['lang']
        article_text_raw = str(row['Article'])
        summary_text_raw = str(row['Summary'])
        input_text = prefix_by_lang(article_lang) + article_text_raw
        target_text = summary_text_raw
        input_enc = self.tokenizer(input_text, max_length=self.max_input_len, padding='do_not_pad', truncation=True, return_tensors="pt")
        target_enc = self.tokenizer(target_text, max_length=self.max_target_len, padding='do_not_pad', truncation=True, return_tensors="pt")
        input_ids = input_enc["input_ids"].squeeze(0)
        attention_mask = input_enc["attention_mask"].squeeze(0)
        labels = target_enc["input_ids"].squeeze(0).clone()
        if labels.ndim == 0: labels = labels.unsqueeze(0)
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

## 7. Custom Data Collator

In [11]:
class CustomSummarizationCollator:
    def __init__(self, tokenizer: PreTrainedTokenizerBase, model: Optional[PreTrainedModel] = None, label_pad_token_id: int = -100, pad_to_multiple_of: Optional[int] = None):
        self.tokenizer = tokenizer
        self.model = model
        self.label_pad_token_id = label_pad_token_id
        self.pad_to_multiple_of = pad_to_multiple_of

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        input_ids_list = [feature["input_ids"].tolist() for feature in features]
        attention_mask_list = [feature["attention_mask"].tolist() for feature in features]

        padded_inputs = self.tokenizer.pad(
            {"input_ids": input_ids_list},
            padding="longest",
            max_length=MAX_INPUT_LENGTH,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt"
        )
        padded_attention_masks = self.tokenizer.pad(
            {"input_ids": attention_mask_list},
            padding="longest",
            max_length=MAX_INPUT_LENGTH,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt"
        )["input_ids"]

        batch = {
            "input_ids": padded_inputs["input_ids"],
            "attention_mask": padded_attention_masks
        }

        if "labels" in features[0] and features[0]["labels"] is not None:
            labels_list = [feature["labels"] for feature in features]
            max_label_len = max(len(l) for l in labels_list)
            if self.pad_to_multiple_of is not None:
                max_label_len = (
                    (max_label_len + self.pad_to_multiple_of - 1)
                    // self.pad_to_multiple_of
                    * self.pad_to_multiple_of
                )

            padded_labels_list = []
            for label_tensor in labels_list:
                padding_len = max_label_len - len(label_tensor)
                padded_tensor = torch.cat([
                    label_tensor,
                    torch.full((padding_len,), self.tokenizer.pad_token_id, dtype=label_tensor.dtype)
                ])
                padded_labels_list.append(padded_tensor)

            labels_tensor = torch.stack(padded_labels_list)

            if self.model is not None and hasattr(self.model, "prepare_decoder_input_ids_from_labels"):
                batch["decoder_input_ids"] = self.model.prepare_decoder_input_ids_from_labels(labels=labels_tensor.clone()) # Pass a clone for safety
            else:
                shifted_labels = labels_tensor.new_zeros(labels_tensor.shape)
                shifted_labels[..., 1:] = labels_tensor[..., :-1].clone()
                shifted_labels[..., 0] = self.tokenizer.pad_token_id
                batch["decoder_input_ids"] = shifted_labels

            labels_tensor[labels_tensor == self.tokenizer.pad_token_id] = self.label_pad_token_id
            batch["labels"] = labels_tensor

        return batch

## 8. Data Preparation

In [12]:
create_combined_datasets_if_not_exist()

df_train_val = load_data(COMBINED_TRAIN_DATA_PATH)
df_test = load_data(COMBINED_TEST_DATA_PATH)

if df_train_val is None or df_train_val.empty:
    print("STOPPING: Training data could not be loaded or is empty.")
else:
    train_df, val_df = train_test_split(df_train_val, test_size=0.1, random_state=42)
    train_dataset = SummarizationDataset(train_df, tokenizer, MAX_INPUT_LENGTH, MAX_TARGET_LENGTH)
    val_dataset = SummarizationDataset(val_df, tokenizer, MAX_INPUT_LENGTH, MAX_TARGET_LENGTH)

    if len(train_dataset) == 0:
        print("STOPPING: Training dataset is empty after SummarizationDataset initialization.")
    if len(val_dataset) == 0 and NUM_TRAIN_EPOCHS > 0:
        print("Warning: Validation dataset is empty. Evaluation will be skipped or may fail if attempted.")

    custom_collator = CustomSummarizationCollator(tokenizer=tokenizer, model=model, label_pad_token_id=-100)

    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        label_pad_token_id=-100,
        pad_to_multiple_of=8
    )

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        collate_fn=data_collator,
        shuffle=True,
        num_workers=2,
        pin_memory=True
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=VALID_BATCH_SIZE,
        collate_fn=data_collator,
        num_workers=2,
        pin_memory=True
    ) if val_dataset and len(val_dataset) > 0 else None

    print(f"Train DataLoader: {len(train_dataloader)} batches")

    train_dataloader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, collate_fn=custom_collator, shuffle=True, num_workers=2, pin_memory=True)
    val_dataloader = DataLoader(val_dataset, batch_size=VALID_BATCH_SIZE, collate_fn=custom_collator, num_workers=2, pin_memory=True) if val_dataset and len(val_dataset) > 0 else None

    print(f"Train DataLoader: {len(train_dataloader)} batches")
    if val_dataloader:
        print(f"Validation DataLoader: {len(val_dataloader)} batches")
    else:
        print("Validation DataLoader is empty or not created.")

Combined datasets not found. Attempting to create them from CSVs in '/content/drive/MyDrive/sps/data/' directory...
Loaded and processed for train: english_train.csv, kept 28341 rows.
Loaded and processed for train: hindi_train.csv, kept 21224 rows.
Loaded and processed for train: gujrati_train.csv, kept 33625 rows.
Loaded and processed for train: bengali_train.csv, kept 12356 rows.
Combined train dataset created and saved to: combined_train_data.csv with 95546 rows.
Loaded and processed for test: english_test.csv, kept 2895 rows.
Loaded and processed for test: hindi_test.csv, kept 3000 rows.
Loaded and processed for test: gujrati_test.csv, kept 2999 rows.
Loaded and processed for test: bengali_test.csv, kept 2951 rows.
Combined test dataset created and saved to: combined_test_data.csv with 11845 rows.
Successfully loaded data from combined_train_data.csv, shape: (95546, 3)
Successfully loaded data from combined_test_data.csv, shape: (11845, 3)
Train DataLoader: 10749 batches
Train Dat

## 9. Metrics Calculation Functions

In [13]:
rouge_metric = evaluate.load("rouge")

def calculate_jaccard(str1, str2):
    s1_cleaned = clean_text_series_for_metrics(str1)
    s2_cleaned = clean_text_series_for_metrics(str2)
    tokens1 = set(nltk.word_tokenize(s1_cleaned))
    tokens2 = set(nltk.word_tokenize(s2_cleaned))
    if not tokens1 and not tokens2: return 1.0
    if not tokens1 or not tokens2: return 0.0
    return len(tokens1.intersection(tokens2)) / len(tokens1.union(tokens2))

def calculate_cosine_tfidf(list_of_references, list_of_predictions):
    if not list_of_references or not list_of_predictions or len(list_of_references) != len(list_of_predictions):
        return 0.0
    cleaned_refs = [clean_text_series_for_metrics(s) for s in list_of_references if str(s).strip()]
    cleaned_preds = [clean_text_series_for_metrics(s) for s in list_of_predictions if str(s).strip()]
    if not cleaned_refs or not cleaned_preds: return 0.0
    vectorizer = TfidfVectorizer()
    corpus = cleaned_refs + cleaned_preds
    try:
        vectorizer.fit(corpus)
    except ValueError:
        print("Warning: TF-IDF Vectorizer could not be fitted (empty corpus after cleaning?).")
        return 0.0
    total_similarity = 0
    count = 0
    for i in range(len(list_of_references)):
        ref = clean_text_series_for_metrics(list_of_references[i])
        pred = clean_text_series_for_metrics(list_of_predictions[i])
        if not ref or not pred: continue
        try:
            tfidf_ref = vectorizer.transform([ref])
            tfidf_pred = vectorizer.transform([pred])
            sim = cosine_similarity(tfidf_ref, tfidf_pred)[0, 0]
            total_similarity += sim
            count += 1
        except ValueError:
            print(f"Skipping cosine for pair due to empty vector: REF='{ref}', PRED='{pred}'")
            continue
    return (total_similarity / count) * 100 if count > 0 else 0.0

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

## 10. Training and Evaluation Loop

In [14]:
def train_model(model, train_dataloader, val_dataloader, optimizer, scheduler, num_epochs, device):
    best_val_rouge1 = -1.0
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch + 1}/{num_epochs} ---")

        model.train()
        total_train_loss = 0
        for batch_idx, batch in enumerate(train_dataloader):
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            decoder_input_ids = batch.get('decoder_input_ids', None)
            if decoder_input_ids is not None:
                decoder_input_ids = decoder_input_ids.to(device)

            if (labels != -100).sum() == 0:
                if batch_idx % LOG_INTERVAL == 0 or batch_idx == len(train_dataloader) -1:
                    print(f"  WARNING: No valid labels in training batch {batch_idx}. Skipping.")
                continue

            try:
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, decoder_input_ids=decoder_input_ids)
                loss = outputs.loss

                if loss is None or torch.isnan(loss) or torch.isinf(loss):
                    print(f"  WARNING: Invalid loss (None, NaN or Inf) at training batch {batch_idx}. Loss: {loss}. Skipping update.")
                    continue

                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
                optimizer.step()
                if scheduler:
                    scheduler.step()

                total_train_loss += loss.item()
                if batch_idx % LOG_INTERVAL == 0 or batch_idx == len(train_dataloader) - 1:
                    print(f"  Epoch {epoch+1}, Batch {batch_idx+1}/{len(train_dataloader)}, Loss: {loss.item():.4f}")

            except Exception as e:
                print(f"  Error during training step {batch_idx}: {e}")
                traceback.print_exc()
                continue

        avg_train_loss = total_train_loss / len(train_dataloader) if len(train_dataloader) > 0 else float('nan')
        print(f"Average Training Loss for Epoch {epoch+1}: {avg_train_loss:.4f}")

        if val_dataloader and len(val_dataloader) > 0:
            print(f"Starting validation for Epoch {epoch+1}...")
            val_metrics = evaluate_model(model, val_dataloader, tokenizer, device, is_test_set=False)
            print(f"Validation ROUGE-1: {val_metrics.get('rouge1', 0.0):.4f}, ROUGE-L: {val_metrics.get('rougeL', 0.0):.4f}, Eval Loss: {val_metrics.get('eval_loss', float('nan')):.4f}")
            current_rouge1 = val_metrics.get('rouge1', -1.0)
            if current_rouge1 > best_val_rouge1:
                best_val_rouge1 = current_rouge1
                print(f"New best validation ROUGE-1: {best_val_rouge1:.4f}. Saving model...")
                model.save_pretrained(OUTPUT_DIR)
                tokenizer.save_pretrained(OUTPUT_DIR)
                print(f"Model saved to {OUTPUT_DIR}")
        else:
            print("Skipping validation as val_dataloader is empty or not provided.")
            print(f"Saving model at end of epoch {epoch+1}...")
            model.save_pretrained(OUTPUT_DIR)
            tokenizer.save_pretrained(OUTPUT_DIR)
            print(f"Model saved to {OUTPUT_DIR}")

        gc.collect()
        if DEVICE == torch.device("cuda"):
            torch.cuda.empty_cache()

    print("Training finished.")
    return model

def evaluate_model(model, dataloader, tokenizer, device, is_test_set=True):
    model.eval()
    all_decoded_preds = []
    all_decoded_labels = []
    total_eval_loss = 0

    print(f"Evaluating on {'Test' if is_test_set else 'Validation'} Set...")
    for batch_idx, batch in enumerate(dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        decoder_input_ids = batch.get('decoder_input_ids', None)
        if decoder_input_ids is not None:
            decoder_input_ids = decoder_input_ids.to(device)

        with torch.no_grad():
            try:
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, decoder_input_ids=decoder_input_ids)
                loss = outputs.loss
                if loss is not None and not (torch.isnan(loss) or torch.isinf(loss)):
                     total_eval_loss += loss.item()
                else:
                    if batch_idx % (LOG_INTERVAL // 5 if LOG_INTERVAL > 5 else 1) == 0:
                        print(f"  Warning: Invalid loss ({loss}) during evaluation batch {batch_idx}.")

                generated_ids = model.generate(
                    input_ids=input_ids, attention_mask=attention_mask,
                    max_length=MAX_TARGET_LENGTH, num_beams=4, early_stopping=True
                )
                decoded_preds_batch = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

                labels_for_decode = labels.clone()
                labels_for_decode[labels_for_decode == -100] = tokenizer.pad_token_id
                decoded_labels_batch = tokenizer.batch_decode(labels_for_decode, skip_special_tokens=True)

                all_decoded_preds.extend([pred.strip() for pred in decoded_preds_batch])
                all_decoded_labels.extend([label.strip() for label in decoded_labels_batch])
            except Exception as e:
                print(f"  Error during evaluation batch {batch_idx}: {e}")
                traceback.print_exc()
                batch_size = input_ids.size(0)
                all_decoded_preds.extend(["<error>"] * batch_size)
                all_decoded_labels.extend(["<error>"] * batch_size)
                continue
        if batch_idx % LOG_INTERVAL == 0 or batch_idx == len(dataloader) -1:
            print(f"  Evaluated batch {batch_idx+1}/{len(dataloader)}")

    avg_eval_loss = total_eval_loss / len(dataloader) if len(dataloader) > 0 and total_eval_loss > 0 else float('nan')
    print(f"Average Evaluation Loss: {avg_eval_loss:.4f}")

    rouge_preds = [pred if pred else "<empty>" for pred in all_decoded_preds]
    rouge_labels = [label if label else "<empty>" for label in all_decoded_labels]

    if not rouge_preds or not rouge_labels:
        print("Warning: No valid predictions or labels for ROUGE calculation.")
        rouge_scores = {k: 0.0 for k in ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']}
    else:
        rouge_results = rouge_metric.compute(predictions=rouge_preds, references=rouge_labels, use_stemmer=True)
        rouge_scores = {k: round(v * 100, 4) for k, v in rouge_results.items()}
    print("ROUGE Scores (%):")
    for key, value in rouge_scores.items(): print(f"  {key}: {value}")

    final_metrics = {**rouge_scores, "eval_loss": avg_eval_loss}

    if is_test_set:
        jaccard_cosine_preds = [clean_text_series_for_metrics(p) for p in all_decoded_preds]
        jaccard_cosine_labels = [clean_text_series_for_metrics(l) for l in all_decoded_labels]

        jaccard_scores = [calculate_jaccard(ref, pred) for ref, pred in zip(jaccard_cosine_labels, jaccard_cosine_preds)]
        avg_jaccard = np.mean(jaccard_scores) * 100 if jaccard_scores else 0.0
        print(f"Average Jaccard Similarity (%): {avg_jaccard:.4f}")
        final_metrics["jaccard_similarity"] = avg_jaccard

        avg_cosine_tfidf = calculate_cosine_tfidf(jaccard_cosine_labels, jaccard_cosine_preds)
        print(f"Average Cosine Similarity (TF-IDF) (%): {avg_cosine_tfidf:.4f}")
        final_metrics["cosine_similarity_tfidf"] = avg_cosine_tfidf

        if not os.path.exists(OUTPUT_DIR):
            os.makedirs(OUTPUT_DIR)
        test_metrics_path = os.path.join(OUTPUT_DIR, "pytorch_test_set_evaluation_metrics.json")
        with open(test_metrics_path, "w") as f:
            json.dump(final_metrics, f, indent=4)
        print(f"Test set evaluation metrics saved to {test_metrics_path}")

    return final_metrics

## 11. Training

In [15]:
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=ADAM_EPSILON, weight_decay=WEIGHT_DECAY)
total_training_steps = len(train_dataloader) * NUM_TRAIN_EPOCHS
num_warmup_steps = int(0.1 * total_training_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=total_training_steps)

print(f"Starting manual PyTorch training for {NUM_TRAIN_EPOCHS} epochs...")
model.to(DEVICE)
trained_model = train_model(model, train_dataloader, val_dataloader, optimizer, scheduler, NUM_TRAIN_EPOCHS, DEVICE)


Starting manual PyTorch training for 2 epochs...

--- Epoch 1/2 ---


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


  Epoch 1, Batch 1/10749, Loss: 24.9995
  Epoch 1, Batch 101/10749, Loss: 23.1464
  Epoch 1, Batch 201/10749, Loss: 21.8705
  Epoch 1, Batch 301/10749, Loss: 19.4559
  Epoch 1, Batch 401/10749, Loss: 18.6259
  Epoch 1, Batch 501/10749, Loss: 14.6488
  Epoch 1, Batch 601/10749, Loss: 8.8869
  Epoch 1, Batch 701/10749, Loss: 8.9392
  Epoch 1, Batch 801/10749, Loss: 6.1225
  Epoch 1, Batch 901/10749, Loss: 5.4463
  Error during training step 912: CUDA out of memory. Tried to allocate 978.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 240.12 MiB is free. Process 10232 has 14.50 GiB memory in use. Of the allocated memory 9.57 GiB is allocated by PyTorch, and 4.81 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Traceback (most recent call last):
  File "<ipython-input-14-41964471e466>", line 35, in train_model
    loss.backward()
  File "/usr/local/lib/python3.11/dist-packages/torch/_tensor.py", line 626, in backward
    torch.autograd.backward(
  File "/usr/local/lib/python3.11/dist-packages/torch/autograd/__init__.py", line 347, in backward
    _engine_run_backward(
  File "/usr/local/lib/python3.11/dist-packages/torch/autograd/graph.py", line 823, in _engine_run_backward
    return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 978.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 240.12 MiB is free. Process 10232 has 14.50 GiB memory in use. Of the allocated memory 9.57 GiB is allocated by PyTorch, and 4.81 GiB is reserved by PyTorch but unallocated. If reserved but una

  Epoch 1, Batch 1001/10749, Loss: 3.9656
  Epoch 1, Batch 1101/10749, Loss: 3.8716
  Epoch 1, Batch 1201/10749, Loss: 3.3998
  Epoch 1, Batch 1301/10749, Loss: 2.2738
  Epoch 1, Batch 1401/10749, Loss: 3.1151
  Epoch 1, Batch 1501/10749, Loss: 2.7641
  Epoch 1, Batch 1601/10749, Loss: 3.1281
  Epoch 1, Batch 1701/10749, Loss: 2.8757
  Epoch 1, Batch 1801/10749, Loss: 2.4014
  Epoch 1, Batch 1901/10749, Loss: 2.6033
  Epoch 1, Batch 2001/10749, Loss: 2.5205
  Epoch 1, Batch 2101/10749, Loss: 2.4598
  Epoch 1, Batch 2201/10749, Loss: 2.2768
  Epoch 1, Batch 2301/10749, Loss: 1.7264
  Epoch 1, Batch 2401/10749, Loss: 2.2556
  Epoch 1, Batch 2501/10749, Loss: 2.5622
  Epoch 1, Batch 2601/10749, Loss: 2.2825
  Epoch 1, Batch 2701/10749, Loss: 2.9973
  Epoch 1, Batch 2801/10749, Loss: 2.4012
  Epoch 1, Batch 2901/10749, Loss: 2.2386
  Epoch 1, Batch 3001/10749, Loss: 2.9494
  Epoch 1, Batch 3101/10749, Loss: 2.2174
  Epoch 1, Batch 3201/10749, Loss: 2.2931
  Epoch 1, Batch 3301/10749, Loss:

## 12. Evaluation on Test Set

In [18]:
print("\n--- Loading best saved model for final test set evaluation ---")
final_model = MT5ForConditionalGeneration.from_pretrained(OUTPUT_DIR)
final_model.to(DEVICE)

test_dataset = SummarizationDataset(df_test, tokenizer, MAX_INPUT_LENGTH, MAX_TARGET_LENGTH)
data_collator = DataCollatorForSeq2Seq(
                tokenizer=tokenizer,
                model=final_model,
                label_pad_token_id=-100,
                pad_to_multiple_of=8
            )


--- Loading best saved model for final test set evaluation ---


In [20]:
if 'final_model' in locals() and 'test_dataset' in locals() and len(test_dataset) > 0:
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=TEST_BATCH_SIZE,
        collate_fn=data_collator,
        num_workers=2,
        pin_memory=True
    )
    print(f"Test DataLoader: {len(test_dataloader)} batches")
    test_metrics = evaluate_model(final_model, test_dataloader, tokenizer, DEVICE, is_test_set=True)
    print("\n--- Final Test Set Evaluation Metrics ---")
    for key, value in test_metrics.items():
        print(f"{key}: {value:.4f}")
else:
    print("\n--- Skipping test set evaluation ---")
    if 'df_test' not in locals() or df_test is None or df_test.empty:
        print("Test data (df_test) is not loaded or is empty.")
    elif 'test_dataset' not in locals() or len(test_dataset) == 0:
         print("Test dataset is empty after initialization.")
    elif 'final_model' not in locals():
         print("Model was not loaded or trained successfully.")
    else:
        print("Unknown reason for skipping test evaluation.")

# Cleanup
del train_dataloader, val_dataloader, test_dataloader, train_dataset, val_dataset, test_dataset, df_train_val, df_test
gc.collect()
if DEVICE == torch.device("cuda"):
    torch.cuda.empty_cache()
print("Cleanup complete.")

Test DataLoader: 1481 batches
Evaluating on Test Set...


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


  Evaluated batch 1/1481
  Evaluated batch 101/1481
  Evaluated batch 201/1481
  Evaluated batch 301/1481
  Evaluated batch 401/1481
  Evaluated batch 501/1481
  Evaluated batch 601/1481
  Evaluated batch 701/1481
  Evaluated batch 801/1481
  Evaluated batch 901/1481
  Evaluated batch 1001/1481
  Evaluated batch 1101/1481
  Evaluated batch 1201/1481
  Evaluated batch 1301/1481
  Evaluated batch 1401/1481
  Evaluated batch 1481/1481
Average Evaluation Loss: 1.6685
ROUGE Scores (%):
  rouge1: 16.7807
  rouge2: 8.0862
  rougeL: 15.0775
  rougeLsum: 15.0772


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


## 13. Example Prediction

In [19]:
print("\n--- Example Prediction using the fine-tuned model ---")
loaded_model = MT5ForConditionalGeneration.from_pretrained(OUTPUT_DIR)
loaded_model.to(DEVICE)
loaded_model.eval()
loaded_tokenizer = MT5Tokenizer.from_pretrained(OUTPUT_DIR)

sample_article_en = "summarize in en: Several research groups have been working on developing new types of batteries that could store more energy and charge faster. One promising approach involves using solid-state electrolytes instead of liquid ones, which could improve safety and energy density. These advancements are crucial for the future of electric vehicles and portable electronics."
sample_article_hi = "summarize in hi: कई शोध समूह नई प्रकार की बैटरियों को विकसित करने पर काम कर रहे हैं जो अधिक ऊर्जा संग्रहीत कर सकें और तेजी से चार्ज हो सकें। एक आशाजनक दृष्टिकोण में तरल इलेक्ट्रोलाइट्स के बजाय ठोस-अवस्था वाले इलेक्ट्रोलाइट्स का उपयोग करना शामिल है, जिससे सुरक्षा और ऊर्जा घनत्व में सुधार हो सकता है। ये प्रगति इलेक्ट्रिक वाहनों और पोर्टेबल इलेक्ट्रॉनिक्स के भविष्य के लिए महत्वपूर्ण हैं।"

for lang_code, sample_article in [("en", sample_article_en), ("hi", sample_article_hi)]:
    print(f"\nInput Article ({lang_code}): {sample_article.replace(f'summarize in {lang_code}: ', '')}")
    inputs = loaded_tokenizer(sample_article, return_tensors="pt", max_length=MAX_INPUT_LENGTH, truncation=True, padding=True).to(DEVICE)

    with torch.no_grad():
        summary_ids = loaded_model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            num_beams=4,
            max_length=MAX_TARGET_LENGTH,
            early_stopping=True
        )
    generated_summary = loaded_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    print(f"Generated Summary ({lang_code}): {generated_summary}")



--- Example Prediction using the fine-tuned model ---

Input Article (en): Several research groups have been working on developing new types of batteries that could store more energy and charge faster. One promising approach involves using solid-state electrolytes instead of liquid ones, which could improve safety and energy density. These advancements are crucial for the future of electric vehicles and portable electronics.
Generated Summary (en): These advancements are crucial for the future of electric vehicles and portable electronics.

Input Article (hi): कई शोध समूह नई प्रकार की बैटरियों को विकसित करने पर काम कर रहे हैं जो अधिक ऊर्जा संग्रहीत कर सकें और तेजी से चार्ज हो सकें। एक आशाजनक दृष्टिकोण में तरल इलेक्ट्रोलाइट्स के बजाय ठोस-अवस्था वाले इलेक्ट्रोलाइट्स का उपयोग करना शामिल है, जिससे सुरक्षा और ऊर्जा घनत्व में सुधार हो सकता है। ये प्रगति इलेक्ट्रिक वाहनों और पोर्टेबल इलेक्ट्रॉनिक्स के भविष्य के लिए महत्वपूर्ण हैं।
Generated Summary (hi): नई शोध समूह नई प्रकार की बैटरियों को 