In [None]:
pip install transformers datasets
pip install bert_score
pip install rouge_score
pip install evaluate

In [None]:
import numpy as np
import pandas as pd
import os
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import torch
import evaluate
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments,EarlyStoppingCallback
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# List the files in the dataset directory.
# NOTE(review): `sbhatti_news_summarization_path` is not assigned in any cell shown
# here, so it must already exist in the kernel (presumably the dataset download
# directory — confirm) for this to work after a kernel restart.
os.listdir(sbhatti_news_summarization_path)

['data.csv']

In [None]:
# Read the raw articles/summaries table from the dataset directory.
data_csv_path = os.path.join(sbhatti_news_summarization_path, 'data.csv')
df = pd.read_csv(data_csv_path)
df

Unnamed: 0.1,Unnamed: 0,ID,Content,Summary,Dataset
0,0,f49ee725a0360aa6881ed1f7999cc531885dd06a,New York police are concerned drones could bec...,Police have investigated criminals who have ri...,CNN/Daily Mail
1,1,808fe317a53fbd3130c9b7563341a7eea6d15e94,By . Ryan Lipman . Perhaps Australian porn sta...,Porn star Angela White secretly filmed sex act...,CNN/Daily Mail
2,2,98fd67bd343e58bc4e275bbb5a4ea454ec827c0d,"This was, Sergio Garcia conceded, much like be...",American draws inspiration from fellow country...,CNN/Daily Mail
3,3,e12b5bd7056287049d9ec98e41dbb287bd19a981,An Ebola outbreak that began in Guinea four mo...,World Health Organisation: 635 infections and ...,CNN/Daily Mail
4,4,b83e8bcfcd51419849160e789b6658b21a9aedcd,By . Associated Press and Daily Mail Reporter ...,A sinkhole opened up at 5:15am this morning in...,CNN/Daily Mail
...,...,...,...,...,...
870516,870516,5d28cab74ffb4ea584cbb857d64a72a2157bf19f,The state of Oregon is ready to throw in the t...,Oregon is the only one of 16 states that has f...,CNN/Daily Mail
870517,870517,6f3e12375fc400cf9dc3ad77b8191226e740e293,"MADRID, Spain (CNN) -- A 92-year-old woman wit...","Two women, both from Uruguay, arrested after a...",CNN/Daily Mail
870518,870518,,A day after a 40-year-old man miraculously sur...,– Rescuers in Niagara Falls still haven't foun...,Multi-News
870519,870519,9af32ebbdd03e1d543d5493e93b4ac8c8e489851,"By . Deni Kirkova . PUBLISHED: . 09:27 EST, 23...","Women browse, evaluate and shop through an onl...",CNN/Daily Mail


In [None]:
# Data-quality report: fully duplicated rows and articles without any text.
print(df.duplicated().sum())
print(df["Content"].isna().sum())

# Keep only the model input and target columns, and drop rows where either is
# missing: a missing summary would otherwise reach the later cleaning cells,
# where `text.replace(...)` fails on a non-string value.
# A new frame is built instead of mutating `df` in place.
df = df[["Content", "Summary"]].dropna(subset=["Content", "Summary"])
print(len(df))
df.head()

0
34
870487


Unnamed: 0,Content,Summary
0,New York police are concerned drones could bec...,Police have investigated criminals who have ri...
1,By . Ryan Lipman . Perhaps Australian porn sta...,Porn star Angela White secretly filmed sex act...
2,"This was, Sergio Garcia conceded, much like be...",American draws inspiration from fellow country...
3,An Ebola outbreak that began in Guinea four mo...,World Health Organisation: 635 infections and ...
4,By . Associated Press and Daily Mail Reporter ...,A sinkhole opened up at 5:15am this morning in...


In [None]:
# Keep only articles shorter than 1,000,000 characters, then draw a
# reproducible random sample of 50,000 of them.
is_short_enough = df["Content"].str.len() < 1000000
df_filtered = df.loc[is_short_enough]
df1 = df_filtered.sample(n=50000, random_state=42).copy()
len(df1)

50000

Articles of 1,000,000 characters or more are excluded because 1,000,000 characters is spaCy's default processing limit (`nlp.max_length`), which is already far longer than any ordinary news article (well over 100,000 words). The data is then limited to a random sample of 50,000 entries due to resource constraints, because summarization over long text inputs consumes a significant amount of RAM.


# Cleaning & Preprocessing

In [None]:
pip install -U spacy[cuda118]



In [None]:
import re

import spacy
from tqdm import tqdm

# The GPU has to be activated *before* any pipeline is loaded: spaCy documents
# that require_gpu() should be called right after importing spaCy and before
# spacy.load(); activating it afterwards does not move an already loaded pipeline.
gpu_active = spacy.require_gpu()  # raises if no GPU is available

nlp = spacy.load("en_core_web_sm")

# Show whether GPU activation succeeded (this was the cell's displayed result).
gpu_active

True

In [None]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 20 % of the sampled articles (with their summaries).
x_train, x_holdout, y_train, y_holdout = train_test_split(
    df1["Content"], df1["Summary"], test_size=0.2, random_state=0
)

# Stage 2: divide the hold-out part evenly into a test and a validation split.
x_test, x_val, y_test, y_val = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=0
)

print("Shape of X_train: ", x_train.shape)
print("Shape of X_val: ", x_val.shape)
print("Shape of X_test: ", x_test.shape)

Shape of X_train:  (40000,)
Shape of X_val:  (5000,)
Shape of X_test:  (5000,)


Summarization needs semantically clean, well-structured text. Unlike classification preprocessing, we preserve punctuation, stop words, and the natural flow of the sentences, and only remove distractions such as e-mail addresses, URLs, @-mentions, and unusual characters.

In [None]:
def preprocess(doc):
    """Clean one tokenized document into a single space-joined string.

    Tokens flagged as e-mail addresses or URLs are dropped. In every remaining
    token, ``@mentions`` are removed, runs of whitespace collapse to one space,
    and every character other than word characters, whitespace and the light
    punctuation ``. , ! ? ' ]`` is stripped. Tokens that end up empty after
    this cleaning (e.g. ``(`` or ``--``) are skipped, so they no longer leave
    doubled spaces in the output.

    Args:
        doc: Iterable of token-like objects exposing ``text``, ``like_email``
            and ``like_url`` (e.g. a spaCy ``Doc``).

    Returns:
        str: The surviving cleaned tokens joined by single spaces; ``""`` when
        no token survives.
    """
    cleaned_tokens = []
    for token in doc:
        if token.like_email or token.like_url:
            continue  # removing email & URL

        token_text = token.text.strip()
        if not token_text:
            continue  # pure-whitespace token

        # Remove mentions, excessive spaces, and strange characters except for light punctuation.
        # NOTE(review): the character class keeps `]` but not `[` — confirm that this is intended.
        token_text = re.sub(r"@\w+", "", token_text)
        token_text = re.sub(r'\s+', ' ', token_text)
        token_text = re.sub(r"[^\w\s.,!?'\]]", '', token_text)

        # Cleaning can empty a token completely; appending "" would put two
        # consecutive separators into the joined result.
        token_text = token_text.strip()
        if not token_text:
            continue

        cleaned_tokens.append(token_text)

    return " ".join(cleaned_tokens)

In [None]:
x_train_clean = []
y_train_clean = []
train_failures = []  # (position, error) of every article that could not be cleaned

# Stream the training articles through spaCy in batches and clean each one,
# keeping every cleaned article aligned with its summary.
train_docs = nlp.pipe(x_train, batch_size=256)
for position, (doc, label) in enumerate(
    tqdm(zip(train_docs, y_train), total=len(x_train), desc="Processing texts")
):
    try:
        processed_text = preprocess(doc)
    except Exception as exc:
        # Best effort: skip the article together with its summary, but record
        # the failure so a systematic error cannot silently shrink the dataset.
        train_failures.append((position, repr(exc)))
        continue
    x_train_clean.append(processed_text)
    y_train_clean.append(label)

if train_failures:
    print(
        f"Skipped {len(train_failures)} of {len(x_train)} training texts; "
        f"first error: {train_failures[0][1]}"
    )

Processing texts: 100%|██████████| 40000/40000 [31:30<00:00, 21.15it/s]


In [None]:
# Put every reference summary on a single line.
y_train_clean = [summary.replace('\n', ' ') for summary in y_train_clean]

# Pair each cleaned article with its summary, one column per field.
df_train = pd.DataFrame({"text": x_train_clean, "summ": y_train_clean})
df_train.head()

Unnamed: 0,text,summ
0,Victorious Lulu singing in the Eurovision con...,Spokesman for the annual festival said Scotlan...
1,London CNN -- British architect Marcus Lee i...,London Open House weekend offers chance to vis...
2,CNN -- A Dallas Cowboys scouting assistant s...,Rich Behm one of three Cowboys staffers seriou...
3,"KEY WEST , Florida CNN -- It 's one of the f...","New York has a crystal ball, Atlanta has a pea..."
4,By . Daily Mail Reporter . PUBLISHED . 0934 E...,Josh Brent faces charges of intoxication mansl...


In [None]:
x_val_clean = []
y_val_clean = []
val_failures = []  # (position, error) of every article that could not be cleaned

# Stream the validation articles through spaCy in batches and clean each one,
# keeping every cleaned article aligned with its summary.
val_docs = nlp.pipe(x_val, batch_size=256)
for position, (doc, label) in enumerate(
    tqdm(zip(val_docs, y_val), total=len(x_val), desc="Processing texts")
):
    try:
        processed_text = preprocess(doc)
    except Exception as exc:
        # Best effort: skip the article together with its summary, but record
        # the failure so a systematic error cannot silently shrink the dataset.
        val_failures.append((position, repr(exc)))
        continue
    x_val_clean.append(processed_text)
    y_val_clean.append(label)

if val_failures:
    print(
        f"Skipped {len(val_failures)} of {len(x_val)} validation texts; "
        f"first error: {val_failures[0][1]}"
    )

Processing texts: 100%|██████████| 5000/5000 [08:09<00:00, 10.21it/s]


In [None]:
# Put every reference summary on a single line.
y_val_clean = [summary.replace('\n', ' ') for summary in y_val_clean]

# One row per (cleaned article, summary) pair.
val_rows = list(zip(x_val_clean, y_val_clean))
df_val = pd.DataFrame(val_rows, columns=["text", "summ"])
df_val.head()

Unnamed: 0,text,summ
0,CNN Syrian President Bashar al Assad denie...,NEW: 37 people are killed throughout Syria on ...
1,CNN Wimbledon holds a special place in the ...,"Pete Sampras admits he ""hated"" playing on Wimb..."
2,CNN Wish you could play video games at work...,More companies are encouraging fun as a busine...
3,"By . Ruth Styles . PUBLISHED . 1322 EST , 8 D...",skyn ICELAND's Arctic Elixir contains stress-b...
4,With a trail of light sparkling away in the ni...,The long streak of light was seen in the sky a...


In [None]:
x_test_clean = []
y_test_clean = []
test_failures = []  # (position, error) of every article that could not be cleaned

# Stream the test articles through spaCy in batches and clean each one,
# keeping every cleaned article aligned with its summary.
test_docs = nlp.pipe(x_test, batch_size=256)
for position, (doc, label) in enumerate(
    tqdm(zip(test_docs, y_test), total=len(x_test), desc="Processing texts")
):
    try:
        processed_text = preprocess(doc)
    except Exception as exc:
        # Best effort: skip the article together with its summary, but record
        # the failure so a systematic error cannot silently shrink the dataset.
        test_failures.append((position, repr(exc)))
        continue
    x_test_clean.append(processed_text)
    y_test_clean.append(label)

if test_failures:
    print(
        f"Skipped {len(test_failures)} of {len(x_test)} test texts; "
        f"first error: {test_failures[0][1]}"
    )

Processing texts: 100%|██████████| 5000/5000 [03:56<00:00, 21.16it/s]


In [None]:
# Put every reference summary on a single line. This must iterate over the
# *test* labels (not the validation ones) so that each test article stays
# paired with its own summary.
y_test_clean = [text.replace('\n', ' ') for text in y_test_clean]

# Pair each cleaned test article with its summary, one column per field.
df_test = pd.DataFrame({"text": x_test_clean, "summ": y_test_clean})
df_test.head()

Unnamed: 0,text,summ
0,The hosts finished on 174 - 4 despite the effo...,Glamorgan remain top of the T20 Blast Southern...
1,Cristiano Ronaldo gave the visitors the lead w...,Real Madrid were made to battle for a victory ...
2,By . Travelmail Reporter . Malaysia Airline sa...,Flight number MH17 will no longer be used for ...
3,I can not remember when I last needed a birth ...,British journalist and former BBC India corres...
4,Officiating was better this Saturday but there...,Saturday's Premier League action saw three cle...


In [None]:
# Write each cleaned split to its own CSV file (without the index column).
for split_frame, csv_name in (
    (df_train, "train.csv"),
    (df_val, "val.csv"),
    (df_test, "test.csv"),
):
    split_frame.to_csv(csv_name, index=False)

# Building Models

Link: https://lopezyse.medium.com/abstractive-text-summarization-in-python-comparing-transformer-models-25e382606fe9

I selected facebook/bart-large-cnn as it was ranked best overall in a Medium article where the author compared top summarization models; BART showed superior fluency and faithfulness.

In [None]:
# Tokenizer and model are loaded from the same pretrained checkpoint.
checkpoint = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(checkpoint)
model = BartForConditionalGeneration.from_pretrained(checkpoint)

In [None]:
def count_words(text):
    """Return the number of tokens `word_tokenize` produces for ``text``."""
    tokens = word_tokenize(text)
    return len(tokens)

# Work on a copy of the validation split, with one word-count column for the
# articles and one for the summaries.
df_val2 = df_val.assign(
    word_count_ori=df_val["text"].apply(count_words),
    word_count_sum=df_val["summ"].apply(count_words),
)

print("mean of word_count_ori", df_val2["word_count_ori"].mean())
print("mean of word_count_sum", df_val2["word_count_sum"].mean())

mean of word_count_ori 743.1292
mean of word_count_sum 59.7036


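On average, an article in the validation split is about 743 words long and its summary about 59 words — that is, hundreds of words, not thousands.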

In [None]:
# Wrap each pandas split in a Hugging Face `Dataset`.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_frame) for split_frame in (df_train, df_val, df_test)
)

In [None]:
MAX_INPUT = 1024 ## Set max input length to 1024 to fully utilize BART's capacity and capture long context for better summarization
MAX_TARGET = 128


def tokenize_batch(example):
    """Tokenize a batch of articles and their reference summaries.

    The function has its own name so that it does not overwrite the spaCy
    cleaning function `preprocess(doc)` defined earlier in the notebook;
    re-running a cleaning cell after this one would otherwise call the wrong
    function.

    Args:
        example: Batch mapping with a ``"text"`` list (articles) and a
            ``"summ"`` list (reference summaries), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the articles (truncated/padded to
        ``MAX_INPUT`` tokens) with an added ``"labels"`` entry holding the
        summary token ids (truncated/padded to ``MAX_TARGET`` tokens).
    """
    inputs = tokenizer(
        example["text"],
        max_length=MAX_INPUT,
        truncation=True,
        padding="max_length"
    )
    targets = tokenizer(
        example["summ"],
        max_length=MAX_TARGET,
        truncation=True,
        padding="max_length"
    )
    # NOTE(review): the labels are padded with the tokenizer's pad id rather
    # than -100, so the loss presumably also counts padding positions — confirm,
    # and mask them if so (label decoding then has to map -100 back to the pad id).
    inputs["labels"] = targets["input_ids"]
    return inputs


tokenized_train = train_dataset.map(tokenize_batch, batched=True)
tokenized_val = val_dataset.map(tokenize_batch, batched=True)
tokenized_test = test_dataset.map(tokenize_batch, batched=True)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [None]:
# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Stop training early once the monitored metric has failed to improve for two evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

training_args = Seq2SeqTrainingArguments(
    output_dir="./bart-summarizer",
    # --- optimisation ---
    learning_rate=2e-5,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # --- evaluate and checkpoint once per epoch; the best checkpoint is chosen by eval_loss ---
    eval_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,
    predict_with_generate=True,  # important for summarization
    # --- logging ---
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=100,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    callbacks=[early_stopping],
)

  trainer = Seq2SeqTrainer(


In [None]:
# Run fine-tuning with the `trainer` built in the previous cell; the cell shows
# the per-epoch training/validation loss table and returns a TrainOutput.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.7002,0.681965
2,0.6001,0.686382
3,0.5102,0.705182


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=7500, training_loss=0.6169107930501302, metrics={'train_runtime': 2385.2858, 'train_samples_per_second': 83.847, 'train_steps_per_second': 5.24, 'total_flos': 2.6005255225344e+17, 'train_loss': 0.6169107930501302, 'epoch': 3.0})

In [None]:
# Evaluate on the tokenized test split; returns a dict of metrics
# (eval_loss plus runtime/throughput figures).
trainer.evaluate(tokenized_test)

{'eval_loss': 0.6858211755752563,
 'eval_model_preparation_time': 0.0054,
 'eval_runtime': 192.8022,
 'eval_samples_per_second': 25.933,
 'eval_steps_per_second': 6.483}

Based on the evaluation results from the test set, a loss value of 0.68 was obtained, which indicates fairly good model performance. However, during the prediction phase, an Out of Memory (OOM) error occurred, rendering the model unusable in its current state. Additionally, due to the expiration of the Colab Pro subscription, I had to retrain the model using a smaller dataset and reduce the batch size to 4—down from 16 previously used in Colab Pro—to accommodate the limited RAM available in the standard environment.


In [None]:
def _half_sample(frame):
    """Return a reproducible random 50 % sample of ``frame`` with a fresh 0..n-1 index."""
    return frame.sample(frac=0.5, random_state=42).reset_index(drop=True)

# Each split is replaced by its own half, so re-running this cell halves it again.
df_train = _half_sample(df_train)
df_val = _half_sample(df_val)
df_test = _half_sample(df_test)

for split_frame in (df_train, df_val, df_test):
    print(len(split_frame))

20000
2500
2500


In [None]:
# Re-wrap the (now halved) pandas splits as Hugging Face datasets.
# NOTE(review): in the cells shown, these datasets are not tokenized again and
# are not handed to a new trainer before the next `trainer.train()` call.
train_dataset, val_dataset, test_dataset = map(
    Dataset.from_pandas, (df_train, df_val, df_test)
)

In [None]:
# Second training run.
# NOTE(review): `trainer` still holds the model object and the
# `tokenized_train` / `tokenized_val` datasets it was constructed with. The
# halved datasets created above are not tokenized or passed to a new trainer in
# the cells shown, and the model is not re-initialised from the pretrained
# checkpoint — confirm this run really trains a fresh model on the reduced data.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.7192,0.726967
2,0.6061,0.736166
3,0.4086,0.780275


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=15000, training_loss=0.5821676137288412, metrics={'train_runtime': 17766.6609, 'train_samples_per_second': 5.629, 'train_steps_per_second': 1.407, 'total_flos': 1.3002627612672e+17, 'train_loss': 0.5821676137288412, 'epoch': 3.0})

In [None]:
# Evaluate the current model on the tokenized test split; returns a dict of
# metrics (eval_loss plus runtime/throughput figures).
trainer.evaluate(tokenized_test)

{'eval_loss': 0.739427924156189,
 'eval_runtime': 202.2875,
 'eval_samples_per_second': 12.359,
 'eval_steps_per_second': 3.09,
 'epoch': 3.0}

The test loss increased from 0.68 for the previous model (trained on 40,000 examples, i.e. the training split of the 50,000-article sample) to 0.74 for the new model, which was trained on only half of that data (20,000 examples). This decline in performance is expected, as reducing the training set size limits the model’s ability to generalize and understand the data effectively.


# Prediction

In [None]:
# Run prediction on the first five tokenized test examples only.
small_test = tokenized_test.select(range(5))
test_results = trainer.predict(small_test)

# Predicted ids and reference label ids, kept for decoding.
preds, labels = test_results.predictions, test_results.label_ids

In [None]:
# The id arrays returned by the trainer may contain -100 (the value used to pad
# and ignore positions), which the tokenizer cannot decode. Map any such
# entries back to the pad token before decoding; arrays without -100 are unchanged.
pred_ids = preds[0] if isinstance(preds, tuple) else preds
pred_ids = np.where(pred_ids != -100, pred_ids, tokenizer.pad_token_id)
label_ids = np.where(labels != -100, labels, tokenizer.pad_token_id)

decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

In [None]:
# Display the decoded model summaries for the selected test examples.
decoded_preds

["Bristol's new mayor will be elected at the end of May as part of a new council-based mayor for the Bristol and Gloucestershire area.It is part of the devolution deal that takes decision-making from Westminster to our region on important issues.",
 'Four Chelsea fans have been banned from attending football matches for a racist incident on the Paris Metro.Three men and one woman have been given a five-year ban for their part in the incident, which saw a French passenger pushed off the train. The court heard that the incident had "tarnished the reputation of English football in Europe".',
 "Ford has developed a 'drunk suit' that simulates the effects of being intoxicated . The suit consists of a unbalanced ankle and wrist weights, braces to restrict joint movement, and 'beer goggles' that blur eyesight . Also included is a pair of headphones to impair hearing . Suit is being used for the company's Driving Skills for Life program .",
 '– The White House press secretary, Sean Spicer, has

In [None]:
# Display the decoded reference summaries for comparison with the predictions.
decoded_labels

['On 4 May 2017, voters in some parts of the West of England will choose their first elected mayor.',
 'Four Chelsea fans accused of refusing to allow a black man on to the Paris Metro have been banned from attending football matches for up to five years.',
 "Suit developed by Ford for its Driving Skills for Life program . Uses 'beer googles,' headphones, weights, and braces to simulate intoxication . Suit makes it difficult to perform even simple motor skills .",
 '– Sean Spicer has had another try at walking back his comments about Hitler not using chemical weapons—and this time, he didn\'t use the phrase "Holocaust centers." The White House press secretary told CNN\'s Wolf Blitzer Tuesday evening that he was "trying to make a point about the heinous acts that Assad had made against his own people last week." "Frankly, I mistakenly made an inappropriate and insensitive reference to the Holocaust, for which there is no comparison," Spicer said. "And for that I apologize. It was a mist

**Quick look:**
The predicted summaries generally capture the main idea of each article, but they tend to be too verbose or overly literal, often mirroring the source structure rather than summarizing concisely. In contrast, the reference labels are shorter, sharper, and more abstract, distilling key points efficiently.

Some critical details are missing in the predictions (e.g., specific consequences, tone clarification, or concluding remarks). This shows that while the model grasps the general context, it still struggles with information prioritization and compression, which are essential in high-quality summarization.

**Evaluation Metrics Explanation:**

ROUGE (Recall-Oriented Understudy for Gisting Evaluation):

* Measures n-gram overlap between the predicted and reference summaries.
* Common variants: ROUGE-1 (unigrams), ROUGE-2 (bigrams), and ROUGE-L (longest common subsequence).
* Higher scores indicate better overlap and thus better summary quality.

METEOR (Metric for Evaluation of Translation with Explicit ORdering):

* Focuses on precision and recall of aligned segments (stems, synonyms, etc.) between prediction and reference.
* Designed to correlate better with human judgment than BLEU.
* Score ranges from 0 to 1; higher is better.

BERTScore: (the best metric to consider)

* Uses contextual embeddings (from BERT) to compare semantic similarity between predictions and references.
* Considers meaning, not just surface-level overlap.
* Returns Precision, Recall, and F1; typically we report F1. Higher F1 indicates better semantic similarity.

In [None]:
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
bertscore = evaluate.load("bertscore")

# ROUGE
# rougeLsum is computed per sentence and expects the sentences of each summary
# to be separated by newlines; without that split it just repeats rougeL.
# Newlines do not affect rouge1 / rouge2 / rougeL.
rouge_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
rouge_refs = ["\n".join(sent_tokenize(ref.strip())) for ref in decoded_labels]
rouge_result = rouge.compute(predictions=rouge_preds, references=rouge_refs, use_stemmer=True)
print({k: round(v * 100, 2) for k, v in rouge_result.items()})

# METEOR
meteor_result = meteor.compute(predictions=decoded_preds, references=decoded_labels)
print("METEOR:", round(meteor_result["meteor"] * 100, 2))

# BERTScore (mean F1 over the evaluated examples)
bertscore_result = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
print("BERTScore:", round(np.mean(bertscore_result["f1"]) * 100, 2))


{'rouge1': 36.51, 'rouge2': 13.02, 'rougeL': 19.18, 'rougeLsum': 19.18}
METEOR: 33.09


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore: 87.39


**Result Explanation:**

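Based on the evaluation results (computed on only the 5 test examples selected in `small_test`, so they are indicative rather than statistically reliable):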

- ROUGE-1: 36.51
This indicates that about 36.51% of the unigrams (individual words) in the model's summaries match the reference summaries. This is a fairly good score, showing the model captures key vocabulary well.

- ROUGE-2: 13.02
Measures bigram overlap (sequences of two words). A score of 13.02 is moderate and expected. It reflects the model's ability to retain contextual flow, which is generally harder.

- ROUGE-L & ROUGE-Lsum: 19.18
These measure the longest common subsequence between the prediction and reference. A score of 19.18 indicates that the model partially preserves sentence structure, but there's room for improvement.

- METEOR: 33.09
This is a relatively strong score, as METEOR also accounts for synonyms, stemming, and word order. It suggests the model understands not only surface-level words but also meaning and variations.

- BERTScore: 87.39
This is a high score, showing that the generated summaries are semantically close to the references. BERTScore compares deep contextual embeddings, so this suggests strong content alignment even if wording differs.

## saving model

In [None]:
# Write the model and the tokenizer files into the same directory so that the
# directory can be reloaded on its own.
best_model_dir = "./best_model"
model.save_pretrained(best_model_dir)
tokenizer.save_pretrained(best_model_dir)

('./best_model/tokenizer_config.json',
 './best_model/special_tokens_map.json',
 './best_model/vocab.json',
 './best_model/merges.txt',
 './best_model/added_tokens.json')

In [None]:
import shutil

# Pack the saved model directory into best_model.zip in the working directory.
shutil.make_archive(base_name="best_model", format="zip", root_dir="./best_model")

# Offer the archive as a browser download (Colab only).
from google.colab import files
files.download("best_model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import zipfile

# Unpack the model archive into the local `best_model` directory.
with zipfile.ZipFile("best_model.zip", mode="r") as model_archive:
    model_archive.extractall(path="best_model")


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Reload model and tokenizer from the saved directory; this rebinds the
# notebook-level `model` and `tokenizer` names.
saved_model_dir = "./best_model"
model = AutoModelForSeq2SeqLM.from_pretrained(saved_model_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)

