In [5]:
# Safe way to access Kaggle dataset without exposing your key
from google.colab import files
import os

# Prompt user to upload kaggle.json securely at runtime
print("Please upload your kaggle.json file (from Kaggle account > Settings).")
files.upload()  # this opens a file upload dialog

# Set up Kaggle API access
os.makedirs('/root/.kaggle', exist_ok=True)
!cp kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

Please upload your kaggle.json file (from Kaggle account > Settings).


Saving kaggle.json to kaggle.json


In [6]:
# Download the dataset
!kaggle datasets download -d gowrishankarp/newspaper-text-summarization-cnn-dailymail

# Unzip the dataset
!unzip newspaper-text-summarization-cnn-dailymail.zip

Dataset URL: https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail
License(s): CC0-1.0
Downloading newspaper-text-summarization-cnn-dailymail.zip to /content
 98% 491M/503M [00:10<00:00, 63.5MB/s]
100% 503M/503M [00:10<00:00, 51.8MB/s]
Archive:  newspaper-text-summarization-cnn-dailymail.zip
  inflating: cnn_dailymail/test.csv  
  inflating: cnn_dailymail/train.csv  
  inflating: cnn_dailymail/validation.csv  


**Import Libraries**

In [7]:
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

**Load the Dataset**

In [8]:
train_df = pd.read_csv('/content/cnn_dailymail/train.csv')
valid_df = pd.read_csv('/content/cnn_dailymail/validation.csv')
test_df = pd.read_csv('/content/cnn_dailymail/test.csv')

print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(valid_df)}")
print(f"Test samples: {len(test_df)}")

Training samples: 287113
Validation samples: 13368
Test samples: 11490


**Preprocessing**

In [9]:
def preprocess_text(text):
    # Basic cleaning: remove newlines and extra spaces
    return text.replace('\n', ' ').strip()

**Extractive Summarization using spaCy**

In [10]:
nlp = spacy.load('en_core_web_sm')

def extractive_summary(text, per=0.2):
    doc = nlp(text)
    word_frequencies = {}
    for word in doc:
        if word.text.lower() not in STOP_WORDS and word.text not in punctuation:
            word_frequencies[word.text.lower()] = word_frequencies.get(word.text.lower(), 0) + 1
    max_freq = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] /= max_freq

    sentence_scores = {}
    for sent in doc.sents:
        for word in sent:
            if word.text.lower() in word_frequencies:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[word.text.lower()]

    select_length = int(len(sentence_scores) * per)
    summary_sentences = nlargest(select_length, sentence_scores, key=sentence_scores.get)
    summary = ' '.join([sent.text for sent in summary_sentences])
    return summary

**Abstractive Summarization using HuggingFace T5-small model**

In [11]:
model_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def abstractive_summary(text, max_length=150, min_length=40):
    preprocess_text = "summarize: " + text.strip().replace("\n", " ")
    tokenized_text = tokenizer.encode(preprocess_text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(tokenized_text,
                                 num_beams=4,
                                 no_repeat_ngram_size=2,
                                 length_penalty=2.0,
                                 min_length=min_length,
                                 max_length=max_length,
                                 early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

**Evaluation using ROUGE**

In [12]:
!pip install rouge-score

from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def evaluate_summary(reference, generated):
    scores = scorer.score(reference, generated)
    return scores

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=e137d84b48a0e6d2d2b0aeba0964e6ef0d9fb26b74846ae94d2b5455f204ba37
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


**Test the Sample Article**

In [13]:
# Choose an example article and its reference summary from the dataset
article = preprocess_text(test_df['article'][0])
reference_summary = preprocess_text(test_df['highlights'][0])

In [14]:
# Extractive summary
ext_summary = extractive_summary(article, per=0.1)

# Abstractive summary
abs_summary = abstractive_summary(article)

print("Reference summary:")
print(reference_summary)
print("\nExtractive summary:")
print(ext_summary)
print("\nAbstractive summary:")
print(abs_summary)

# Evaluate extractive summary
print("\nROUGE scores for Extractive summary:")
print(evaluate_summary(reference_summary, ext_summary))

# Evaluate abstractive summary
print("\nROUGE scores for Abstractive summary:")
print(evaluate_summary(reference_summary, abs_summary))

Reference summary:
Experts question if  packed out planes are putting passengers at risk . U.S consumer advisory group says minimum space must be stipulated . Safety tests conducted on planes with more leg room than airlines offer .

Extractive summary:
While United Airlines has 30 inches of space, Gulf Air economy seats have between 29 and 32 inches, Air Asia offers 29 inches and Spirit Airlines offers just 28 inches.

Abstractive summary:
some experts are questioning if shrinking space on planes is putting our health and safety in danger. this week, a consumer advisory group set up by the department of transportation said that while the government is happy to set standards for animals flying on airplanes, it doesn't stipulate minimum amount of space for humans. 'i am going to have to fight for space in the overhead lockers and crashing elbows.'

ROUGE scores for Extractive summary:
{'rouge1': Score(precision=0.1, recall=0.08823529411764706, fmeasure=0.09375000000000001), 'rouge2': Sc

**Basic Fine-Tuning Setup for T5**

In [2]:
!pip install transformers datasets sentencepiece

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import Dataset

In [15]:
# This is a very basic fine-tuning setup outline.
# For full fine-tuning you need to prepare your dataset in huggingface Dataset format.


# Prepare a small dataset (example for demo)
data = {'article': [train_df['article'][0]], 'summary': [train_df['highlights'][0]]}
dataset = Dataset.from_dict(data)

def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples['article']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['summary'], max_length=150, truncation=True, padding='max_length')

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    num_train_epochs=1,
    logging_steps=10,
    evaluation_strategy="no",
    save_steps=10,
    save_total_limit=1,
    remove_unused_columns=False,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)


trainer.train()

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mqureshikhansa710[0m ([33mqureshikhansa710-university-of-engineering-and-technolog[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss


TrainOutput(global_step=1, training_loss=13.017017364501953, metrics={'train_runtime': 296.3927, 'train_samples_per_second': 0.003, 'train_steps_per_second': 0.003, 'total_flos': 135341801472.0, 'train_loss': 13.017017364501953, 'epoch': 1.0})