# Summarization (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

Installing libraries.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
!pip install huggingface_hub 
!pip install rouge
!pip install nltk

Work around an encoding error in Google Colab by forcing the preferred locale encoding to UTF-8.

In [None]:
import locale
def getpreferredencoding(do_setlocale=True):
    """Report UTF-8 as the preferred encoding, whatever the system locale says.

    ``do_setlocale`` is accepted only so the signature matches
    ``locale.getpreferredencoding``; its value is ignored.
    """
    return "UTF-8"


# Swap the stdlib lookup for the stub above so that any later code asking for
# the preferred encoding receives "UTF-8".
setattr(locale, "getpreferredencoding", getpreferredencoding)

Mount Google Drive.

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Setting up Git.

In [None]:
# Optional: set your own Git identity before pushing (uncomment and fill in).
# Do not commit real personal email addresses or usernames in a shared notebook.
# !git config --global user.email "you@example.com"
# !git config --global user.name "your-username"

Logging into the Hugging Face Hub.

In [None]:
# from huggingface_hub import notebook_login

# notebook_login()

Setting up directories.

In [None]:
import os

# Folder that holds the Urdu dataset files. "/content/" is the Colab working
# directory; switch to "/content/drive/MyDrive/dataset/" when reading from a
# mounted Google Drive.
DATA_DIR = "/content/"

# JSON Lines files, one per split. These names are read by the cell that calls
# load_dataset, so they must be defined for the notebook to run top to bottom.
train = os.path.join(DATA_DIR, "urdu_train.jsonl")
test = os.path.join(DATA_DIR, "urdu_test.jsonl")
val = os.path.join(DATA_DIR, "urdu_val.jsonl")

# CSV files located in the "SmallerDataset" sub-folder. os.path.join avoids the
# doubled slash that plain string concatenation would produce here.
SMALL_DATA_DIR = os.path.join(DATA_DIR, "SmallerDataset")
ntrain = os.path.join(SMALL_DATA_DIR, "urdu_train.csv")
ntest = os.path.join(SMALL_DATA_DIR, "urdu_test.csv")
nval = os.path.join(SMALL_DATA_DIR, "urdu_val.csv")

In [None]:
import torch

In [None]:
from datasets import load_dataset, Dataset 
# Map each split name to its JSON Lines file, then load all three splits with
# one load_dataset call.
split_files = {"train": train, "test": test, "val": val}
urdu_dataset = load_dataset("json", data_files=split_files)

print(urdu_dataset)

# Exploring the dataset

In [None]:
def show_samples(dataset, num_samples=3, seed=42):
    """Print the title and summary of a few shuffled training examples.

    Args:
        dataset: Mapping with a "train" entry that supports ``shuffle(seed=...)``
            and ``select(indices)``; each example is indexed by "title" and
            "summary".
        num_samples: Number of examples to print.
        seed: Seed forwarded to ``shuffle``.
    """
    shuffled_train = dataset["train"].shuffle(seed=seed)
    picked = shuffled_train.select(range(num_samples))
    for example in picked:
        # Title line is preceded by a blank line to separate the examples.
        print(f"\n'>> Title: {example['title']}'")
        print(f"'>> Summary: {example['summary']}'")

show_samples(urdu_dataset)

In [None]:
# Called with no arguments; presumably restores the default output format of
# every split - confirm against the datasets documentation.
urdu_dataset.reset_format()

In [None]:
# Bind a second name to the same object (no copy is made); the following cell
# reads the splits through `urdu_news`.
urdu_news = urdu_dataset

In [None]:
from datasets import concatenate_datasets, DatasetDict

# Rebuild every split as a (single-source) concatenation, shuffle it with a
# fixed seed, and collect the results in a fresh DatasetDict.
urdu_news_dataset = DatasetDict(
    {
        split: concatenate_datasets([split_data]).shuffle(seed=42)
        for split, split_data in urdu_news.items()
    }
)

show_samples(urdu_news_dataset)

Removing very short titles (two words or fewer).

In [None]:
def _title_has_more_than_two_words(example):
    """Return True when the whitespace-split title has more than two words."""
    return len(example["title"].split()) > 2


# Keep only the examples whose title passes the length check above.
urdu_news_dataset = urdu_news_dataset.filter(_title_has_more_than_two_words)
print(urdu_news_dataset)

Loading Model.

In [None]:
from transformers import AutoTokenizer

# Hub identifier of the pretrained checkpoint; later cells reuse this name to
# load the model and to build the output directory name.
model_checkpoint = "google/mt5-small"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
inputs = tokenizer("مجھے ہنگر گیمز پڑھنا پسند تھا!")
inputs

In [None]:
tokenizer.convert_ids_to_tokens(inputs.input_ids)

Preprocessing

In [None]:
# Token limits passed as `max_length` to the tokenizer in preprocess_function.
# NOTE(review): 11230 and 36 look like measured maximum lengths of the input
# and target texts plus a small margin - confirm where they come from. An
# input limit this large means inputs are effectively never truncated.
max_input_length = 11230 + 10
max_target_length = 36 + 2


def preprocess_function(examples):
    """Tokenize a batch: "summary" becomes the model input, "title" the labels.

    Args:
        examples: Batch indexed by "summary" and "title".

    Returns:
        The tokenizer output for the "summary" texts, truncated to
        ``max_input_length``, with an extra "labels" entry holding the
        ``input_ids`` of the "title" texts truncated to ``max_target_length``.
    """
    encoded = tokenizer(
        examples["summary"], truncation=True, max_length=max_input_length
    )
    target_ids = tokenizer(
        examples["title"], truncation=True, max_length=max_target_length
    )["input_ids"]
    encoded["labels"] = target_ids
    return encoded

In [None]:
tokenized_datasets = urdu_news_dataset.map(preprocess_function, batched=True)

In [None]:
generated_summary = "پانچ کروڑ کہاں گئے؟"
reference_summary = "پاچ کروڑ کہاں گئے؟"

In [None]:
import evaluate

rouge_score = evaluate.load("rouge")

In [None]:
scores = rouge_score.compute(
    predictions=[generated_summary], references=[reference_summary]
)
scores

In [None]:
from rouge import Rouge

generated_summary = "پانچ کروڑ کہاں گئے؟"
reference_summary = "پنچ کروڑ کہاں گئے؟"

def calc_Rouge(generated_summary, reference_summary):
  """Return averaged ROUGE values, keeping only the 'r' entry of each metric.

  Args:
    generated_summary: Hypothesis text(s) handed to ``Rouge.get_scores``.
    reference_summary: Reference text(s) aligned with the hypotheses.

  Returns:
    Dict keyed by the metric names that ``get_scores(..., avg=True)`` returns,
    each mapped to that metric's ``'r'`` value (recall in the ``rouge``
    package).
  """
  averaged = Rouge().get_scores(generated_summary, reference_summary, avg=True)
  return {metric: parts['r'] for metric, parts in averaged.items()}
scores = calc_Rouge(generated_summary, reference_summary)
print("ROUGE Scores:", scores)


In [None]:
scores["rouge-1"]

In [None]:
import nltk

# sent_tokenize needs the Punkt sentence models. Recent NLTK releases look for
# them under "punkt_tab" while older ones use "punkt", so request both; a
# resource the installed release does not know is reported by download()
# rather than raised.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

In [None]:
from nltk.tokenize import sent_tokenize
import re

def extract_first_n_sentences(Summary, n=3):
    """Return the first ``n`` sentences of an Urdu text.

    The text is split on the Urdu full stop "۔". Surrounding whitespace is
    stripped from every piece, and empty pieces (for example the one after a
    trailing full stop) are dropped so they do not count as sentences.

    Args:
        Summary: Text to split.
        n: Maximum number of sentences to return.

    Returns:
        List of at most ``n`` sentence strings, without the full stop.
    """
    pieces = (piece.strip() for piece in Summary.split("۔"))
    return [sentence for sentence in pieces if sentence][:n]

def three_sentence_summary(Summary):
    """Join the first three extracted sentences of ``Summary`` with newlines."""
    leading_sentences = extract_first_n_sentences(Summary)[:3]
    return "\n".join(leading_sentences)

print(three_sentence_summary(urdu_news_dataset["train"][1]["summary"]))

In [None]:
def evaluate_baseline(dataset, metric):
    """Score the lead-three-sentences baseline against the reference titles.

    Args:
        dataset: Split indexed by "summary" (source texts) and "title"
            (references).
        metric: Name of the metric to compute; only "rouge" is implemented.

    Returns:
        The dict produced by ``calc_Rouge`` for the baseline summaries.

    Raises:
        ValueError: If ``metric`` is anything other than "rouge". Previously
            the function fell through and returned None, which only failed
            later when the result was indexed.
    """
    if metric != "rouge":
        raise ValueError(
            f"Unsupported metric {metric!r}; only 'rouge' is implemented."
        )
    summaries = [three_sentence_summary(Summary) for Summary in dataset["summary"]]
    # Hand calc_Rouge two plain lists: the object returned by dataset["title"]
    # is not guaranteed to be a list, and the rouge scorer is expected to
    # receive hypotheses and references of the same container type.
    references = list(dataset["title"])
    return calc_Rouge(summaries, references)

In [None]:
import pandas as pd
metric = "rouge"
rouge_names = ["rouge-1", "rouge-2", "rouge-l"]

# Score the baseline on the validation split.
score = evaluate_baseline(urdu_news_dataset["val"], metric)

# Express each value as a percentage rounded to two decimals.
rouge_dict = {name: round(score[name] * 100, 2) for name in rouge_names}
rouge_dict

In [None]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
from huggingface_hub import notebook_login

notebook_login()

Define Model Parameters

In [None]:
from transformers import Seq2SeqTrainingArguments

import inspect

batch_size = 8
num_train_epochs = 8
# Log the training loss roughly once per epoch. Clamp to at least 1 so the
# interval never becomes 0 when the training split is smaller than one batch.
logging_steps = max(1, len(tokenized_datasets["train"]) // batch_size)
model_name = model_checkpoint.split("/")[-1]

# Newer transformers releases call the per-epoch evaluation option
# `eval_strategy`, older ones `evaluation_strategy`; use whichever name the
# installed release accepts so the cell runs on both.
_init_params = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
_eval_strategy_arg = (
    "eval_strategy" if "eval_strategy" in _init_params else "evaluation_strategy"
)

args = Seq2SeqTrainingArguments(
    # NOTE(review): the "amazon-en-es" suffix does not describe the Urdu news
    # data used here; it is kept unchanged because it also determines the
    # output/Hub repository name - rename deliberately if desired.
    output_dir=f"{model_name}-finetuned-amazon-en-es",
    **{_eval_strategy_arg: "epoch"},
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    # Generate text during evaluation so compute_metrics can score it.
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=True,
)

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Score generated titles against the reference titles with ROUGE.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays.

    Returns:
        Dict with the values returned by ``calc_Rouge``, scaled to percentages
        and rounded to four decimals.
    """
    predictions, labels = eval_pred
    # Predictions can arrive as a tuple whose first element holds the ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 marks padded/ignored positions and is not a valid token id, so swap
    # it for the pad token in BOTH arrays before decoding.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode generated and reference token ids into text.
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Put each sentence found by sent_tokenize on its own line.
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # The rouge scorer raises ValueError for an empty hypothesis. Substitute a
    # placeholder token that cannot match any reference word, so an empty
    # generation scores zero instead of aborting the whole evaluation.
    decoded_preds = [pred if pred else "<empty>" for pred in decoded_preds]
    # Compute the ROUGE values and convert them to rounded percentages.
    result = calc_Rouge(decoded_preds, decoded_labels)
    return {key: round(value * 100, 4) for key, value in result.items()}

In [None]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Drop the original text columns so only the tokenizer outputs remain. Only
# columns that are still present are removed, which makes this cell safe to
# re-run (removing an already-removed column would raise).
raw_columns = urdu_news_dataset["train"].column_names
present_columns = tokenized_datasets["train"].column_names
columns_to_drop = [name for name in raw_columns if name in present_columns]
if columns_to_drop:
    tokenized_datasets = tokenized_datasets.remove_columns(columns_to_drop)

In [None]:
features = [tokenized_datasets["train"][i] for i in range(2)]
data_collator(features)

In [None]:
from transformers import Seq2SeqTrainer

# Assemble the trainer from the model, the training arguments, the tokenized
# splits, the padding collator and the ROUGE-based metric function.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    # The "val" split is used for evaluation; "test" is not passed here.
    eval_dataset=tokenized_datasets["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Train model

In [None]:
trainer.train()

Evaluation

In [None]:
trainer.evaluate()