In [26]:
# Dependencies: uncomment and run once on a fresh environment (e.g. Colab).
# !pip install transformers
# !pip install keras_nlp
# !pip install datasets
# !pip install huggingface-hub
# !pip install nltk
# !pip install rouge-score
# !pip install evaluate

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq,AdamWeightDecay, create_optimizer 
import nltk
from nltk.corpus import stopwords

# Only log error messages: raise the TensorFlow logger threshold to ERROR.
import logging
tf.get_logger().setLevel(logging.ERROR)

# Set the TOKENIZERS_PARALLELISM environment variable to "false" for this process.
# NOTE(review): presumably silences the tokenizers parallelism warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

## Loading the dataset

### About Dataset
[BIGPATENT](https://huggingface.co/datasets/big_patent) is a dataset consisting of 1.3 million records of U.S. patent documents along with human-written abstractive summaries. Each U.S. patent application is filed under a Cooperative Patent Classification (CPC) code. There are nine such classification categories:

* a: Human Necessities
* b: Performing Operations; Transporting
* c: Chemistry; Metallurgy
* **d: Textiles; Paper**
* **e: Fixed Constructions**
* **f: Mechanical Engineering; Lighting; Heating; Weapons; Blasting**
* g: Physics
* h: Electricity
* y: General tagging of new or cross-sectional technology

I will be working with 3 of the 9 classification categories: d, e and f (shown in bold above).

In [3]:
# Load only the train split of BIGPATENT, restricted to CPC codes d, e and f.
df = load_dataset(
    "big_patent",
    codes=["d", "e", "f"],
    split="train",
)

Downloading builder script:   0%|          | 0.00/5.50k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/22.9k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.70k [00:00<?, ?B/s]

Downloading and preparing dataset big_patent/d+e+f to /root/.cache/huggingface/datasets/big_patent/d+e+f-322b48a9529e6b49/2.1.2/bc8ec8bdf469c0da5fef04becd32bb3b0b34df0b0baa088ae1237628dd7a9caa...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/9.13G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/506M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/508M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset big_patent downloaded and prepared to /root/.cache/huggingface/datasets/big_patent/d+e+f-322b48a9529e6b49/2.1.2/bc8ec8bdf469c0da5fef04becd32bb3b0b34df0b0baa088ae1237628dd7a9caa. Subsequent calls will reuse this data.


In [4]:
# Show the dataset summary (feature names and row count).
print(df)

Dataset({
    features: ['description', 'abstract'],
    num_rows: 130175
})


In [5]:
# Hold out 30% of the records as the test split. The seed is fixed so that
# re-running the notebook reproduces the same train/test partition.
df = df.train_test_split(test_size=0.3, seed=42)

### Define Certain Variables

In [6]:
# Maximum number of tokens kept from each input description
# (passed as the tokenizer's max_length with truncation enabled).
MAX_INPUT_LENGTH = 1024  

# Minimum length of the output by the model.
# NOTE(review): not referenced by any cell visible in this notebook — confirm intended use.
MIN_TARGET_LENGTH = 5  
# Maximum length of the output by the model.
# NOTE(review): intended as the token limit for the target abstracts — confirm
# that preprocessing reads this constant.
MAX_TARGET_LENGTH = 512  

# Batch size used when building the train, test and validation tf.data sets.
BATCH_SIZE = 16

## T5 Model

#### Data preprocessing

In [7]:
MODEL_CHECKPOINT = "t5-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Only the original T5 checkpoints get the "summarize: " task prefix;
# any other checkpoint is fed the raw description.
_T5_CHECKPOINTS = ("t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b")
prefix = "summarize: " if MODEL_CHECKPOINT in _T5_CHECKPOINTS else ""


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [8]:
def preprocess_function(examples):
    """Tokenize a batch of patent records for seq2seq training.

    Args:
        examples: A batch mapping with a "description" list (source texts)
            and an "abstract" list (target summaries).

    Returns:
        The tokenizer output for the prefixed descriptions, truncated to
        MAX_INPUT_LENGTH tokens, with an added "labels" entry holding the
        token ids of the abstracts truncated to MAX_TARGET_LENGTH tokens.
    """
    inputs = [prefix + description for description in examples["description"]]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)

    # `text_target` tokenizes the summaries as targets; it replaces the
    # deprecated `as_target_tokenizer()` context manager. The length limit
    # comes from the notebook constant rather than a hard-coded 512.
    labels = tokenizer(
        text_target=examples["abstract"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [9]:
# Tokenize both splits; batched=True hands preprocess_function lists of records.
tokenized_df = df.map(
    preprocess_function,
    batched=True,
)

Map:   0%|          | 0/91122 [00:00<?, ? examples/s]

Map:   0%|          | 0/39053 [00:00<?, ? examples/s]

### Define the Model

In [17]:
# Load the pretrained TensorFlow seq2seq model for the selected checkpoint.
model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [18]:
# Pads inputs and labels per batch. The collator must receive the model object
# itself (not the checkpoint name) so it can derive decoder_input_ids from labels.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [19]:
# Arguments shared by all three tf.data pipelines.
_tf_dataset_kwargs = dict(
    batch_size=BATCH_SIZE,
    columns=["input_ids", "attention_mask", "labels"],
    collate_fn=data_collator,
)

# Training batches are shuffled; the test batches keep dataset order.
tf_train_set = tokenized_df["train"].to_tf_dataset(shuffle=True, **_tf_dataset_kwargs)

tf_test_set = tokenized_df["test"].to_tf_dataset(shuffle=False, **_tf_dataset_kwargs)

# 200-example sample of the test split. The shuffle is seeded so the same
# subset is drawn on every run and metric values stay comparable.
tf_validation_set = (
    tokenized_df["test"]
    .shuffle(seed=42)
    .select(range(200))
    .to_tf_dataset(shuffle=False, **_tf_dataset_kwargs)
)

### Compiling the Model

In [20]:
# AdamWeightDecay is already imported in the notebook's import cell, so it is
# not re-imported here.
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
# No loss is passed: the model's internal loss computation is used.
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


----

### Train and Evaluate the Model

In [23]:
import evaluate

# Load the ROUGE metric through the `evaluate` library.
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute the ROUGE-L score of generated summaries against references.

    Args:
        eval_pred: A `(predictions, labels)` pair of token-id arrays. Label
            positions that were masked out hold negative ids.

    Returns:
        A dict with the single key "RougeL" mapping to the ROUGE-L score.
    """
    predictions, labels = eval_pred
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Masked label positions carry negative ids, which cannot be decoded; swap
    # in the pad id. np.where builds a new array, so the caller's labels are
    # left untouched.
    labels = np.asarray(labels)
    labels = np.where(labels < 0, tokenizer.pad_token_id, labels)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # An `evaluate` metric is not callable: scores come from `compute(...)`,
    # whose result is keyed by "rouge1", "rouge2", "rougeL" and "rougeLsum".
    result = rouge.compute(predictions=decoded_predictions, references=decoded_labels)

    # Only ROUGE-L is reported; other keys of `result` are available if needed.
    return {"RougeL": result["rougeL"]}

-----------

In [24]:
# compute the ROUGE score from the predictions
from transformers.keras_callbacks import  KerasMetricCallback

# Callback that scores summaries generated for tf_validation_set with
# compute_metrics; predict_with_generate=True makes it decode via generation.
metric_callback = KerasMetricCallback(
    eval_dataset=tf_validation_set,
    metric_fn=compute_metrics,
    predict_with_generate=True,
)

# Callback list handed to model.fit.
callbacks = [metric_callback]



In [27]:
# Fine-tune for one epoch, validating on the full test split with the metric
# callback attached.
# NOTE(review): the recorded run of this cell ended in ResourceExhaustedError.
# Presumably BATCH_SIZE and/or MAX_INPUT_LENGTH are too large for the available
# accelerator memory — confirm and reduce them before re-running.
model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=1, callbacks=callbacks)

ResourceExhaustedError: ignored

## MT5 Model

## BART Model

## PEGASUS Model

## GPT-2 Model