In [1]:
# The CNN/DailyMail Dataset
# The CNN/DailyMail dataset consists of around 300,000 pairs of news articles and
# their corresponding summaries, composed from the bullet points that CNN and the
# DailyMail attach to their articles.
# summaries are abstractive and not extractive

In [3]:
!pip install -U transformers
!pip install -U accelerate
!pip install -U datasets
!pip install -U bertviz
!pip install -U umap-learn
!pip install -U sentencepiece
!pip install -U urllib3
!pip install py7zr

Collecting transformers
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.44.2-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.42.4
    Uninstalling transformers-4.42.4:
      Successfully uninstalled transformers-4.42.4
Successfully installed transformers-4.44.2
Collecting accelerate
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
 

In [5]:
import os
os.environ["HF_TOKEN"] = "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"

In [6]:
from datasets import load_dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

In [7]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})


In [8]:
print(dataset['train'][0])

{'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office char

In [9]:
dataset['train'][2]['article'][:300]

'MINNEAPOLIS, Minnesota (CNN) -- Drivers who were on the Minneapolis bridge when it collapsed told harrowing tales of survival. "The whole bridge from one side of the Mississippi to the other just completely gave way, fell all the way down," survivor Gary Babineau told CNN. "I probably had a 30-, 35-'

In [17]:
from transformers import AutoTokenizer

# Load a tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large")


In [20]:
def tokenize_function(examples):
    inputs = tokenizer(examples['article'], max_length=1024, truncation=True)
    outputs = tokenizer(examples['highlights'], max_length=150, truncation=True)
    inputs['labels'] = outputs['input_ids']
    return inputs


In [21]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [22]:
print(tokenized_datasets['train'][0])


{'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office char

In [23]:
from torch.utils.data import Dataset
import torch

class CustomDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        # Retrieve items by index
        item = {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx], dtype=torch.long),
            'labels': torch.tensor(self.encodings['labels'][idx], dtype=torch.long)
        }
        return item

    def __len__(self):
        # The length is determined by the number of samples in input_ids
        return len(self.encodings['input_ids'])


In [24]:
print(tokenized_datasets['train'].features)
print(tokenized_datasets['train'][0])


{'article': Value(dtype='string', id=None), 'highlights': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}
{'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an

In [26]:
from torch.utils.data import DataLoader
import cProfile
import pstats

def profile_data_loader():
    train_loader = DataLoader(CustomDataset(tokenized_datasets['train']), batch_size=16, shuffle=True)
    eval_loader = DataLoader(CustomDataset(tokenized_datasets['validation']), batch_size=16)

cProfile.run('profile_data_loader()', 'restats')

p = pstats.Stats('restats')
p.sort_stats('cumulative').print_stats(10)


Sat Aug 24 19:10:12 2024    restats

         308 function calls (220 primitive calls) in 287.695 seconds

   Ordered by: cumulative time
   List reduced from 52 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000  287.695  287.695 {built-in method builtins.exec}
        1    0.000    0.000  287.695  287.695 <string>:1(<module>)
        1    0.000    0.000  287.695  287.695 <ipython-input-26-ce45eefcee45>:5(profile_data_loader)
        2    0.000    0.000  287.694  143.847 /usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py:226(__init__)
        1    0.000    0.000  287.691  287.691 /usr/local/lib/python3.10/dist-packages/torch/utils/data/sampler.py:132(__init__)
        2    0.000    0.000  287.691  143.845 /usr/local/lib/python3.10/dist-packages/torch/utils/data/sampler.py:145(num_samples)
      4/2    0.000    0.000  287.691  143.845 {built-in method builtins.len}
        2    7.302    

<pstats.Stats at 0x79a3f0b15720>

In [29]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_name = "t5-small"  # or any other suitable model
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)


In [31]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./output",  # Specify the output directory
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch'
)



In [32]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=CustomDataset(tokenized_datasets['train']),
    eval_dataset=CustomDataset(tokenized_datasets['validation'])
)


In [None]:
trainer.train()


In [None]:
eval_results = trainer.evaluate()
print(eval_results)


In [None]:
inputs = tokenizer("Your input text here", return_tensors="pt")
outputs = model.generate(inputs['input_ids'])
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
