In [None]:
!pip install transformers
!pip install keras_nlp
!pip install datasets
!pip install huggingface-hub
!pip install nltk
!pip install rouge-score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.0-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers)
  Using cached huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m70.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras_nlp
  Downloading

In [None]:
import os
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
import nltk
from nltk.corpus import stopwords

# Only log error messages from TensorFlow's logger
tf.get_logger().setLevel(logging.ERROR)

# Set TOKENIZERS_PARALLELISM to "false" for this process.
# NOTE(review): presumably meant to silence the Hugging Face tokenizers
# parallelism warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Suppress every Python warning for the rest of the session; this also hides
# deprecation notices raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

## Define Certain Variables

In [None]:
# Fraction of the loaded data used for each split: the split cell passes this
# value as both `train_size` and `test_size`
TRAIN_TEST_SPLIT = 0.1

# Maximum length of the input to the model (longer documents are truncated
# when they are tokenized)
MAX_INPUT_LENGTH = 1024

# Minimum length of the output by the model
# NOTE(review): not referenced in the visible part of the notebook — confirm it is used further down
MIN_TARGET_LENGTH = 5
# Maximum length of the output by the model (also the truncation length of the
# tokenized reference summaries)
MAX_TARGET_LENGTH = 128

# Batch-size for training our model (also used for the test and generation datasets)
BATCH_SIZE = 8

# Learning-rate for training our model (passed to the Adam optimizer)
LEARNING_RATE = 2e-5

# Maximum number of epochs we will train the model for
MAX_EPOCHS = 10

# This notebook is built on the t5-small checkpoint from the Hugging Face Model Hub
MODEL_CHECKPOINT = "t5-small"

## Load the dataset

The dataset consists of BBC articles and accompanying single-sentence summaries. Specifically, each article is prefaced with an introductory sentence (aka summary) which is professionally written, typically by the author of the article. The dataset has 226,711 articles divided into training (90%, 204,045), validation (5%, 11,332), and test (5%, 11,334) sets. The dataset is available for download from [here](https://www.kaggle.com/pariza/bbc-news-summary); the cell below loads its training split from the Hugging Face Hub under the name `xsum`.

The dataset has the following fields:

* **document**: the original BBC article to be summarized
* **summary**: the single sentence summary of the BBC article
* **id**: ID of the document-summary pair

In [None]:
from datasets import load_dataset, load_metric

# Load only the "train" split of XSum; it is re-split into smaller train/test
# subsets further down
raw_datasets = load_dataset("xsum", split="train")
print(raw_datasets)

Downloading builder script:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

Downloading and preparing dataset xsum/default to /root/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

Dataset xsum downloaded and prepared to /root/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71. Subsequent calls will reuse this data.
Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 204045
})


In [None]:
# Peek at the first record (a dict with the `document`, `summary` and `id` fields)
raw_datasets[0]

 'summary': 'Clean-up operations are continuing across the Scottish Borders and Dumfries and Galloway after flooding caused by Storm Frank.',
 'id': '35232142'}

In [None]:
# (number of rows, number of columns) of the loaded split
raw_datasets.shape

(204045, 3)

In [None]:
# Split the dataset: randomly draw two disjoint subsets of the loaded data,
# TRAIN_TEST_SPLIT of the rows for "train" and another TRAIN_TEST_SPLIT for
# "test". The fixed seed makes the split (and therefore every score computed
# from it) reproducible across runs.
# Note: this rebinds `raw_datasets` from a single Dataset to a DatasetDict, so
# re-run the loading cell before running this cell a second time.
raw_datasets = raw_datasets.train_test_split(
    train_size=TRAIN_TEST_SPLIT, test_size=TRAIN_TEST_SPLIT, seed=42
)

In [None]:
# Disabled alternative to the split above, kept for reference only.
# NOTE(review): `train_test_split` returns a dict-like `DatasetDict`, so the
# tuple unpacking below would bind the split names rather than the splits —
# verify before re-enabling.
# # Split the dataset into train and test sets
# train_size = int(len(raw_datasets) * TRAIN_TEST_SPLIT)
# test_size = len(raw_datasets) - train_size
# train_dataset, test_dataset = raw_datasets.train_test_split(
#     test_size=test_size
# )

`len(raw_datasets)` computes the number of examples in the `raw_datasets` dataset. `TRAIN_TEST_SPLIT` is a constant defined earlier; in the commented-out cell above it is the fraction of the data (0.1, i.e. 10%) to be used for training the model.

`train_size` calculates the number of examples to be used for training by multiplying the length of the `raw_datasets` dataset by the `TRAIN_TEST_SPLIT` value and casting the result to an integer.

`test_size` calculates the number of examples that will be used for testing by subtracting `train_size` from the original length of the `raw_datasets` dataset.

Then the `train_test_split()` function is called on the `raw_datasets` dataset with the `test_size` parameter, which specifies the size of the test set. No seed is passed, so the data is shuffled differently on every run before it is split. The function returns the train and test splits, which are meant to be stored in the variables `train_dataset` and `test_dataset` respectively.

Finally, the code splits the dataset into two parts: a training set and a test set, which can be used to train a machine learning model and evaluate its accuracy respectively.

## Data Pre-processing

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

# Tokenizer that matches the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# The listed T5 checkpoints get "summarize: " prepended to every document
# during preprocessing; any other checkpoint gets no prefix
prefix = (
    "summarize: "
    if MODEL_CHECKPOINT in ("t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b")
    else ""
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

I initialized a `tokenizer` object with a pre-trained model specified in the `MODEL_CHECKPOINT` variable.

Next, the code checks if the `MODEL_CHECKPOINT` variable is set to any of the T5 models: `"t5-small"`, `"t5-base"`, `"t5-large"`, `"t5-3b"`, `"t5-11b"`. If it is, the `prefix` variable is set to `"summarize: "`. Otherwise, `prefix` remains an empty string.

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of records into model inputs and labels.

    Args:
        examples: A batch in the column-oriented layout passed by
            `map(..., batched=True)`; the `"document"` and `"summary"` columns
            are read.

    Returns:
        The tokenizer output for the prefixed documents (truncated to
        `MAX_INPUT_LENGTH`), with the token ids of the summaries (truncated to
        `MAX_TARGET_LENGTH`) added under the `"labels"` key.
    """
    inputs = [prefix + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)

    # Tokenize the summaries as targets. `text_target=` replaces the deprecated
    # `as_target_tokenizer()` context manager; a separate call is needed so the
    # targets get their own (shorter) max_length.
    labels = tokenizer(
        text_target=examples["summary"], max_length=MAX_TARGET_LENGTH, truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

This code defines a function called `preprocess_function` that takes a dictionary of examples as input. The function first prepends a prefix string to each document in the examples. It then tokenizes the prefixed documents into a format suitable for input to a model, using the previously created tokenizer with a maximum input length of `MAX_INPUT_LENGTH`. The function also tokenizes the summaries in the examples using the same tokenizer and a maximum target length of `MAX_TARGET_LENGTH`, but in "target" mode. The function then adds the summary labels to the tokenized model inputs and returns the resulting dictionary of model inputs.

In [None]:
# Tokenize in batches; `map` is applied to every split held in `raw_datasets`
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
# tokenized_datasets = train_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/20404 [00:00<?, ? examples/s]

Map:   0%|          | 0/20405 [00:00<?, ? examples/s]

This applies the function to every example of every split in the dataset, so our training and test data are preprocessed with a single command.

## Defining the model

In [None]:
# Load the pre-trained TensorFlow sequence-to-sequence model for MODEL_CHECKPOINT
model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/242M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

For training Sequence to Sequence models, we need a special kind of data collator, which will not only pad the inputs to the maximum length in the batch, but also the labels. Thus, we use the `DataCollatorForSeq2Seq` provided by the Hugging Face Transformers library on our dataset. The `return_tensors='tf'` ensures that we get `tf.Tensor` objects back.

In [None]:
# Collator that pads both the inputs and the labels of each batch and returns
# TensorFlow tensors
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [None]:
def _as_tf_dataset(split, shuffle):
    """Convert one tokenized split into a batched, collated `tf.data.Dataset`.

    Args:
        split: A tokenized split (e.g. `tokenized_datasets["train"]`).
        shuffle: Whether `to_tf_dataset` should shuffle the examples.

    Returns:
        The dataset produced by `split.to_tf_dataset(...)` with the shared
        batch size, column selection and collator.
    """
    return split.to_tf_dataset(
        batch_size=BATCH_SIZE,
        columns=["input_ids", "attention_mask", "labels"],
        shuffle=shuffle,
        collate_fn=data_collator,
    )


train_dataset = _as_tf_dataset(tokenized_datasets["train"], shuffle=True)
test_dataset = _as_tf_dataset(tokenized_datasets["test"], shuffle=False)

# Small random sample of the test split used for generation-based evaluation.
# The seed keeps the sample identical across runs so scores stay comparable;
# `min` keeps the cell working when the test split has fewer than 200 rows.
generation_split = tokenized_datasets["test"].shuffle(seed=42)
generation_split = generation_split.select(range(min(200, len(generation_split))))
generation_dataset = _as_tf_dataset(generation_split, shuffle=False)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


The code creates three TensorFlow datasets for a natural language processing task. The first dataset is the `training dataset`, which shuffles the data and batches it using the specified batch size. The second dataset is the `test dataset`, which does not shuffle the data and also batches it using the specified batch size. The final dataset is a `generation dataset`, which shuffles the `test data`, selects a subset of 200 examples from it, and is used to generate text. All datasets contain columns for input IDs, attention masks and labels, and use the specified `data_collator` function to collate the data into a batch.

## Building and Compiling the model

In [None]:
# Adam with the configured learning rate. No `loss` is passed to compile(), so
# the model's internal loss computation is used.
optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


## Training and Evaluating the model

To evaluate our model on-the-fly while training, we will define `metric_fn`, which calculates the `ROUGE` score between the ground-truth summaries and the predictions.

In [None]:
import keras_nlp

# ROUGE-L metric object used by `metric_fn` to score the decoded predictions
# against the decoded labels
rouge_l = keras_nlp.metrics.RougeL()

def metric_fn(eval_predictions):
    """Compute the ROUGE-L F1 score for one evaluation pass.

    Args:
        eval_predictions: A `(predictions, labels)` pair of token-id arrays.
            Negative label ids mark masked positions.

    Returns:
        A dict `{"RougeL": <f1 score>}`.
    """
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace masked label tokens with the pad id so they can be decoded. A new
    # array is built instead of editing the caller's labels in place.
    labels = np.asarray(labels)
    labels = np.where(labels < 0, tokenizer.pad_token_id, labels)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # `rouge_l` is a stateful metric: calling it adds to its running totals.
    # Reset it first so the score reflects this evaluation pass only rather
    # than an average over all passes so far.
    rouge_l.reset_state()
    result = rouge_l(decoded_labels, decoded_predictions)

    # We report only the F1 score, you can use other aggregation metrics as well
    return {"RougeL": result["f1_score"]}

In [None]:
# Now we can finally start training our model!
from transformers.keras_callbacks import KerasMetricCallback

# Scores summaries generated for `generation_dataset` with `metric_fn` during
# training (predict_with_generate=True makes the callback call generate())
metric_callback = KerasMetricCallback(
    metric_fn, eval_dataset=generation_dataset, predict_with_generate=True
)

callbacks = [metric_callback]

# For now we will use our test set as our validation_data.
# The epoch count comes from the MAX_EPOCHS constant in the configuration cell
# instead of a separate hard-coded literal.
model.fit(
    train_dataset, validation_data=test_dataset, epochs=MAX_EPOCHS, callbacks=callbacks
)



Epoch 1/10
 469/2551 [====>.........................] - ETA: 25:19:41 - loss: 3.1138