<a href="https://www.kaggle.com/code/mukaffimoin/bengali-news-summarization-mt5?scriptVersionId=158333739" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
from datasets import Dataset



In [None]:
# Read the parallel corpus: line i of article.txt pairs with line i of summary.txt.
# The encoding is stated explicitly because the files hold Bengali text and the
# platform default encoding is not guaranteed to be UTF-8.
# Lines are split on newline characters only. str.splitlines() additionally
# breaks on characters such as U+2028, U+0085 or form feed; one of those inside
# an article would shift every following article/summary pair out of alignment.
with open("/kaggle/input/bengali-news-summarization-dataset/Bengali-News-Summarization-Dataset/article.txt", "r", encoding="utf-8") as f:
    articles = [line.rstrip("\n") for line in f]


with open("/kaggle/input/bengali-news-summarization-dataset/Bengali-News-Summarization-Dataset/summary.txt", "r", encoding="utf-8") as f:
    summaries = [line.rstrip("\n") for line in f]

In [None]:
import warnings

# Pair each article with the summary read from the same line number.
# zip() stops at the shorter list, so a count mismatch would otherwise drop
# trailing rows without any sign that the two files are out of step.
if len(articles) != len(summaries):
    warnings.warn(
        f"article/summary count mismatch ({len(articles)} vs {len(summaries)}); "
        "unpaired trailing entries are dropped"
    )
df = pd.DataFrame(list(zip(articles, summaries)), columns=["article", "summary"])

In [None]:
df

In [None]:
# DataFrame.dropna() returns a new frame; assign it back so the rows are
# actually removed from `df` instead of only being displayed.
df = df.dropna()

In [None]:
df

In [None]:
from sklearn.model_selection import train_test_split
# 70% train / 30% held out; the held-out part is then divided 35% validation /
# 65% test (roughly 10.5% and 19.5% of the full data).
# A fixed seed keeps the three splits identical across runs, so a model that is
# reloaded after a kernel restart is not evaluated on rows it was trained on.
df_train, df_test = train_test_split(df, test_size=0.30, shuffle=True, random_state=42)
df_val, df_test = train_test_split(df_test, test_size=0.65, shuffle=True, random_state=42)

In [None]:
df_train

In [None]:
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

def _draw_length_histogram(position, lengths, color, label, xlabel, font):
    """Draw one length histogram in slot `position` of a 1x2 subplot grid."""
    plt.subplot(1, 2, position)
    plt.hist(lengths, bins=40, color=color, edgecolor='black', alpha=0.7, label=label)
    plt.grid(linestyle='--', alpha=0.6)
    plt.xlabel(xlabel, fontsize=10, fontproperties=font, color='black')
    plt.ylabel("Frequency", fontsize=10, fontproperties=font, color='black')


def visualize_text_length(data, title):
    """Show side-by-side histograms of article and summary lengths.

    Args:
        data: DataFrame with an 'article' and a 'summary' column. Lengths are
            taken with ``len()`` on each entry.
        title: Name of the dataset, inserted into the figure title.

    The lengths are kept in local Series. `data` is not modified, so calling
    this on a split produced by train_test_split neither adds helper columns
    to it nor triggers pandas' chained-assignment warning.
    """
    article_lengths = data['article'].apply(len)
    summary_lengths = data['summary'].apply(len)

    plt.figure(figsize=(8, 4))

    custom_font = FontProperties(family='serif', style='normal', size=14, weight='bold')

    _draw_length_histogram(
        1, article_lengths, 'cornflowerblue', 'Bangla News Article',
        "Bangla News Article Text Length", custom_font,
    )
    _draw_length_histogram(
        2, summary_lengths, 'firebrick', 'Bangla News Summary',
        "News Summary Text Length", custom_font,
    )

    plt.suptitle(f'Text Length Distribution for {title}', fontsize=12, fontproperties=custom_font, color='black')

    plt.tight_layout()

    # Show the plot
    plt.show()



# **Visualize text length distribution for each dataset**

In [None]:
visualize_text_length(df_train, 'Training Dataset')

In [None]:
visualize_text_length(df_test , 'Test Dataset')

In [None]:
visualize_text_length(df_val , 'Validation Dataset')

In [None]:
from wordcloud import WordCloud

def create_wordcloud(
    data,
    column,
    title,
    *,
    font_path="/kaggle/input/fonts-paths/Siyam Rupali Regular.ttf",
    regexp=r"[\w\u0980-\u09FF][\w\u0980-\u09FF\u200C\u200D']*",
):
    """Render a word cloud for one text column of `data`.

    Args:
        data: DataFrame holding the text.
        column: Name of the column whose entries are joined with spaces and
            fed to the word cloud.
        title: Dataset name, inserted into the plot title.
        font_path: Path to a font that can render Bangla glyphs. Defaults to
            the "Siyam Rupali" font used by this notebook.
        regexp: Token pattern passed to WordCloud. The default treats the
            whole Bengali Unicode block (plus ZWNJ/ZWJ inside a word) as word
            characters. Python's ``\\w`` alone does not match Bengali
            dependent vowel signs or hasanta, so WordCloud's built-in pattern
            would cut Bengali words into fragments.
    """
    # collocations=False disables the detection of collocations (multi-word
    # phrases) so the cloud focuses on individual words.
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='black',
        font_path=font_path,
        colormap='rainbow',
        collocations=False,
        regexp=regexp,
    ).generate(' '.join(data[column]))

    plt.figure(figsize=(8, 4))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for {column} in {title}', fontsize=16, color='black')
    plt.axis('off')
    plt.show()


In [None]:
create_wordcloud(df_train, 'article', 'Training Set')

In [None]:
create_wordcloud(df_train, 'article', 'Training Set')

In [None]:
create_wordcloud(df_train, 'summary', 'Training Set')

In [None]:
!pip install git+https://github.com/csebuetnlp/normalizer

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from normalizer import normalize # pip install git+https://github.com/csebuetnlp/normalizer

In [None]:
import torch
from normalizer import normalize
from transformers import MT5ForConditionalGeneration, AutoTokenizer ,DataCollatorForSeq2Seq, Trainer, TrainingArguments
import os


# Checkpoint identifier shared by the model and its tokenizer so the two stay in sync.
model_name = "google/mt5-small" 
# Load the pretrained weights and the matching tokenizer for that checkpoint.
model = MT5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
from torch.utils.data import Dataset, DataLoader
class Seq2SeqDataset(Dataset):
    """Map-style dataset of (article, summary) pairs tokenized on access.

    Args:
        data: DataFrame with an 'article' and a 'summary' column. Both columns
            are passed through ``normalize`` once, at construction time.
        tokenizer: Callable tokenizer. It is invoked with ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keyword
            arguments and must return a mapping with 'input_ids' and
            'attention_mask'.
        max_length: Length every input and label sequence is truncated or
            padded to.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.input_text = data['article'].apply(normalize).tolist()
        self.labels = data['summary'].apply(normalize).tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.input_text)

    def _encode(self, text):
        """Tokenize one string, truncated/padded to exactly ``max_length`` ids."""
        return self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

    def __getitem__(self, idx):
        """Return the tokenized sample at `idx`.

        Returns:
            dict with 'input_ids' and 'attention_mask' for the article and
            'labels' holding the token ids of the summary.
        """
        input_encodings = self._encode(self.input_text[idx])
        label_encodings = self._encode(self.labels[idx])

        # The tokenizer is expected to return tensors with a leading batch
        # axis of size 1. squeeze(0) removes only that axis; a bare squeeze()
        # would also drop the sequence axis when max_length == 1.
        return {
            'input_ids': input_encodings['input_ids'].squeeze(0),
            'attention_mask': input_encodings['attention_mask'].squeeze(0),
            'labels': label_encodings['input_ids'].squeeze(0),
        }

In [None]:
# Collator for samples that are already padded to a fixed length as PyTorch tensors
class MyDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
    """Stack pre-padded samples into a batch and mask label padding for the loss.

    This override replaces the parent's padding logic entirely: every feature
    is expected to carry 'input_ids', 'attention_mask' and 'labels' of equal
    length, so batching is a plain stack.
    """

    # Target value skipped by PyTorch's cross-entropy loss (its default ignore_index).
    IGNORE_INDEX = -100

    def __call__(self, features, return_tensors=None):
        """Build one training/evaluation batch.

        Args:
            features: Sequence of dicts with 'input_ids', 'attention_mask'
                and 'labels' (tensors, or lists of ints for 'labels').
            return_tensors: Accepted for signature compatibility with the
                parent class; the batch is always made of PyTorch tensors.

        Returns:
            dict of stacked tensors. Label positions holding the tokenizer's
            pad id are replaced by ``IGNORE_INDEX``.
        """
        batch = {
            "input_ids": torch.stack([feature["input_ids"] for feature in features]),
            "attention_mask": torch.stack([feature["attention_mask"] for feature in features]),
        }

        label_rows = [feature["labels"] for feature in features]
        if isinstance(label_rows[0], torch.Tensor):
            labels = torch.stack(label_rows)
        else:
            # Convert the list of lists to a PyTorch tensor
            labels = torch.tensor(label_rows)

        # Padding in the labels must not count toward the loss. Without this
        # mask the model is also trained to predict the pad token at every
        # padded position, which dominates the loss for short summaries.
        pad_id = self.tokenizer.pad_token_id
        if pad_id is not None:
            labels = labels.masked_fill(labels == pad_id, self.IGNORE_INDEX)
        batch["labels"] = labels

        return batch

In [None]:
# Wrap each split in a Seq2SeqDataset (default max_length)
train_dataset = Seq2SeqDataset(data=df_train, tokenizer=tokenizer)
test_dataset = Seq2SeqDataset(data=df_test, tokenizer=tokenizer)
validation_dataset = Seq2SeqDataset(data=df_val, tokenizer=tokenizer)

# Batch each split; only the training loader reshuffles its samples
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=16)
validation_dataloader = DataLoader(dataset=validation_dataset, shuffle=False, batch_size=16)


In [None]:
# Select the compute device: GPU when CUDA is available, otherwise CPU.
# This only picks the device; the model itself is not moved here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
device

In [None]:
# AdamW over all model parameters, with decoupled weight decay
custom_optimizer = torch.optim.AdamW(
    params=model.parameters(),
    weight_decay=0.01,
    eps=1e-8,
    lr=1e-3,
)

In [None]:
# Define the TrainingArguments for fine-tuning.
# NOTE(review): learning_rate and weight_decay repeat the values given to
# custom_optimizer. That optimizer object is handed to Trainer explicitly, so
# these two presumably only matter if Trainer builds its own optimizer — confirm.
training_args = TrainingArguments(
    output_dir='/kaggle/working/',
    num_train_epochs=15,
    per_device_train_batch_size=5,
    gradient_accumulation_steps=8,  # 5 x 8 = 40 samples per optimizer step per device, assuming standard accumulation
    evaluation_strategy="epoch",
    save_total_limit=1,
    save_steps=5000,
    learning_rate=1e-3,
    do_train=True,
    do_eval=True,
    remove_unused_columns=False,
    push_to_hub=False,
    report_to="none",
    load_best_model_at_end=False,
    lr_scheduler_type="cosine_with_restarts",
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='/kaggle/working/',  # same directory as output_dir
    logging_steps=200,
    
)


In [None]:
# Create a data collator for sequence-to-sequence tasks.
# NOTE(review): MyDataCollatorForSeq2Seq overrides __call__ and, as defined in
# this notebook, never reads `padding`, `max_length` or `label_pad_token_id`,
# so those three arguments have no effect on the batches it produces.
data_collator = MyDataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=False,
    max_length=80,
    label_pad_token_id=tokenizer.pad_token_id,
)

In [None]:
# Create the Trainer: trains on train_dataset and evaluates on validation_dataset.
# The test split is not given to the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    # (optimizer, scheduler): the optimizer is supplied, the scheduler slot is None.
    # Presumably Trainer then builds the scheduler from training_args — confirm.
    optimizers=(custom_optimizer, None),
)

In [None]:
# Fine-tune the model
trainer.train()

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Save the model.
# NOTE(review): save_pretrained presumably creates a directory at this path,
# so "mt5_model.pt" would be a folder name rather than a single .pt file — confirm.
model.save_pretrained("/kaggle/working/mt5_model.pt")

# Save the tokenizer (same caveat: the ".json" path is presumably a directory).
tokenizer.save_pretrained("/kaggle/working/mt5_tokenizer.json")

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the saved model from the path written by the save cell.
# This rebinds the global `model`, replacing the object that was trained above.
model = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/working/mt5_model.pt")

# Load the tokenizer from the path written by the save cell (rebinds `tokenizer`).
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/mt5_tokenizer.json")


In [None]:
!pip install datasets

In [None]:
!pip install python-Levenshtein

In [None]:
!pip install jiwer

In [None]:
# Move the model to the device (CPU or GPU)
model.to(device)

In [None]:
!pip install rouge-score

In [None]:
!pip install evaluate

In [None]:
import torch
import Levenshtein
from evaluate import load
# Define the move_to_device function
def move_to_device(batch, device):
    """Recursively move every tensor found in `batch` onto `device`.

    Dicts and lists are rebuilt with their entries processed the same way.
    Tensors are moved with ``Tensor.to``. Any other value, tuples included,
    is returned untouched.
    """
    if isinstance(batch, dict):
        return {name: move_to_device(entry, device) for name, entry in batch.items()}
    if isinstance(batch, list):
        return [move_to_device(entry, device) for entry in batch]
    if isinstance(batch, torch.Tensor):
        return batch.to(device)
    # Not a tensor, list, or dict: hand it back unchanged
    return batch

# Load the evaluation metrics: Character Error Rate (CER), Word Error Rate (WER) and Exact Match (EM)
cer_metric = load("cer")
wer_metric = load("wer")

exact_match_metric = load("exact_match")

# Load BLEU and ROUGE metrics
# NOTE(review): rouge_metric is not referenced again in the visible part of this
# notebook; the ROUGE numbers further down come from rouge_score.RougeScorer.
bleu_metric = load("bleu")
rouge_metric = load('rouge')

# Initialize lists to store generated summaries and their reference summaries
generated_summarisaions = []
references = []

# Generate summaries for the test dataset
for batch in test_dataloader:
    # Move the batch to the same device the model was placed on. Using `device`
    # instead of a hard-coded 'cuda' keeps this loop working on CPU-only machines.
    batch = move_to_device(batch, device)

    input_text = batch['input_ids']  # token ids of the (padded) articles
    labels = batch['labels']  # token ids of the reference summaries

    # The inputs are padded to a fixed length, so the attention mask is passed
    # along; without it the encoder would attend to the padding positions.
    summarisaion_ids = model.generate(
        input_text,
        attention_mask=batch['attention_mask'],
        max_length=512,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
    )

    # Move the generated ids to CPU to decode
    summarisaion_ids = summarisaion_ids.to('cpu')

    generated_summarisaion = tokenizer.batch_decode(summarisaion_ids, skip_special_tokens=True)

    generated_summarisaions.extend(generated_summarisaion)
    references.extend(tokenizer.batch_decode(labels, skip_special_tokens=True))  # Decoding the label IDs

# batch_decode already returns plain Python strings, so no further conversion
# of generated_summarisaions is needed before computing the metrics.



In [None]:
print("Number of generated summarisaions:", len(generated_summarisaions))
print("Number of references:", len(references))

In [None]:
# Every metric below scores the same (prediction, reference) pairs
scoring_inputs = {"predictions": generated_summarisaions, "references": references}

# Character Error Rate (CER) and Word Error Rate (WER)
results_CER = cer_metric.compute(**scoring_inputs)
results_WER = wer_metric.compute(**scoring_inputs)

# Exact Match (EM)
results_em = exact_match_metric.compute(**scoring_inputs)

# Bilingual Evaluation Understudy (BLEU)
results_bleu = bleu_metric.compute(**scoring_inputs)

# Levenshtein distance of each generated summary to its reference
levenshtein_distances = list(map(Levenshtein.distance, generated_summarisaions, references))


In [None]:
print(results_CER)
print(results_WER)
print(results_em)
print(results_bleu)
# print(levenshtein_distances)

In [None]:
!pip install unidecode

In [None]:
from rouge_score import rouge_scorer
from unidecode import unidecode

# Initialize the Rouge scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False)
# Define a function to preprocess and tokenize Bengali text
def preprocess_text(text):
    """Return `text` passed through unidecode with whitespace normalized.

    Runs of whitespace collapse to a single space, and leading and trailing
    whitespace is removed.
    """
    ascii_text = unidecode(text)
    return ' '.join(ascii_text.split())

def _mean_or_zero(values):
    """Return the arithmetic mean of `values`, or 0.0 when the list is empty."""
    return sum(values) / len(values) if values else 0.0


# Per-example scores, one list per (ROUGE variant, statistic) pair
rouge1_f1_scores = []
rouge1_precision_scores = []
rouge1_recall_scores = []
rouge2_f1_scores = []
rouge2_precision_scores = []
rouge2_recall_scores = []
rougeL_f1_scores = []
rougeL_precision_scores = []
rougeL_recall_scores = []

for ref, pred in zip(references, generated_summarisaions):
    candidate = preprocess_text(pred)
    # `ref` is already one decoded string. Joining it with ' ' would insert a
    # space between every character, so the reference would be scored as
    # single characters against a word-level candidate.
    reference = preprocess_text(ref)
    scores = scorer.score(reference, candidate)

    rouge1_f1_scores.append(scores['rouge1'].fmeasure)
    rouge1_precision_scores.append(scores['rouge1'].precision)
    rouge1_recall_scores.append(scores['rouge1'].recall)
    rouge2_f1_scores.append(scores['rouge2'].fmeasure)
    rouge2_precision_scores.append(scores['rouge2'].precision)
    rouge2_recall_scores.append(scores['rouge2'].recall)
    rougeL_f1_scores.append(scores['rougeL'].fmeasure)
    rougeL_precision_scores.append(scores['rougeL'].precision)
    rougeL_recall_scores.append(scores['rougeL'].recall)

# Average over all examples (0.0 when nothing was scored, instead of a ZeroDivisionError)
avg_rouge1_f1 = _mean_or_zero(rouge1_f1_scores)
avg_rouge1_precision = _mean_or_zero(rouge1_precision_scores)
avg_rouge1_recall = _mean_or_zero(rouge1_recall_scores)
avg_rouge2_f1 = _mean_or_zero(rouge2_f1_scores)
avg_rouge2_precision = _mean_or_zero(rouge2_precision_scores)
avg_rouge2_recall = _mean_or_zero(rouge2_recall_scores)
avg_rougeL_f1 = _mean_or_zero(rougeL_f1_scores)
avg_rougeL_precision = _mean_or_zero(rougeL_precision_scores)
avg_rougeL_recall = _mean_or_zero(rougeL_recall_scores)


In [None]:
# Print the average scores
print("Average Rouge-1 F1 Score:", avg_rouge1_f1)
print("Average Rouge-1 Precision:", avg_rouge1_precision)
print("Average Rouge-1 Recall:", avg_rouge1_recall)

print("Average Rouge-2 F1 Score:", avg_rouge2_f1)
print("Average Rouge-2 Precision:", avg_rouge2_precision)
print("Average Rouge-2 Recall:", avg_rouge2_recall)

print("Average Rouge-L F1 Score:", avg_rougeL_f1)
print("Average Rouge-L Precision:", avg_rougeL_precision)
print("Average Rouge-L Recall:", avg_rougeL_recall)
