In [10]:
# Optional one-time setup: third-party packages to install on a fresh
# environment. Every line below is commented out, so running this cell
# installs nothing; uncomment the lines you need. (In Jupyter, `%pip`
# rather than `!pip` targets the running kernel's own environment.)
# !pip install transformers
# !pip install keras_nlp
# !pip install datasets
# !pip install huggingface-hub
# !pip install nltk
# !pip install rouge-score

In [5]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
import nltk
from nltk.corpus import stopwords

# Keep cell output readable: restrict TensorFlow's logger to ERROR-level
# messages and silence Python warnings for the rest of the session.
import logging
import warnings

tf.get_logger().setLevel(logging.ERROR)

# NOTE(review): presumably read by the Hugging Face tokenizers library to
# turn off its internal parallelism -- confirm against its documentation.
os.environ.update(TOKENIZERS_PARALLELISM="false")

# Blanket filter: every warning category is ignored from here on.
warnings.filterwarnings(action='ignore')

## Loading the dataset

### About Dataset
[BIGPATENT](https://huggingface.co/datasets/big_patent) is a dataset consisting of 1.3 million records of U.S. patent documents along with human-written abstractive summaries. Each U.S. patent application is filed under a Cooperative Patent Classification (CPC) code. There are nine such classification categories:

* a: Human Necessities
* b: Performing Operations; Transporting
* c: Chemistry; Metallurgy
* d: Textiles; Paper
* e: Fixed Constructions
* f: Mechanical Engineering; Lighting; Heating; Weapons; Blasting
* g: Physics
* h: Electricity
* y: General tagging of new or cross-sectional technology

In [6]:
from datasets import load_dataset

# Load only the "f" and "g" CPC categories of BIGPATENT rather than the
# whole corpus. The first run downloads and prepares the data; the
# result is bound to `df` for the cells that follow.
df = load_dataset(
    path="big_patent",
    codes=["f", "g"],
)

Downloading builder script:   0%|          | 0.00/5.50k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/22.9k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.70k [00:00<?, ?B/s]

Downloading and preparing dataset big_patent/f+g to /root/.cache/huggingface/datasets/big_patent/f+g-fc8909cd3f463c02/2.1.2/bc8ec8bdf469c0da5fef04becd32bb3b0b34df0b0baa088ae1237628dd7a9caa...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/9.13G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/506M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/508M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset big_patent downloaded and prepared to /root/.cache/huggingface/datasets/big_patent/f+g-fc8909cd3f463c02/2.1.2/bc8ec8bdf469c0da5fef04becd32bb3b0b34df0b0baa088ae1237628dd7a9caa. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Print a summary of the loaded object: its splits, the feature names of
# each split, and the number of rows per split.
print(df)

DatasetDict({
    train: Dataset({
        features: ['description', 'abstract'],
        num_rows: 344503
    })
    validation: Dataset({
        features: ['description', 'abstract'],
        num_rows: 19139
    })
    test: Dataset({
        features: ['description', 'abstract'],
        num_rows: 19140
    })
})


In [8]:
# Pull the three splits out of the loaded dataset in one unpacking step so
# later cells can refer to each split by its own name.
df_train, df_val, df_test = (
    df[split_name] for split_name in ('train', 'validation', 'test')
)

In [9]:
# Size of the training split, displayed as (rows, columns).
df_train.shape

(344503, 2)

In [11]:
# Size of the validation split, displayed as (rows, columns).
df_val.shape

(19139, 2)

In [12]:
# Size of the test split, displayed as (rows, columns).
df_test.shape

(19140, 2)

In [13]:
# Look at the first training record. It is displayed as a dict keyed by
# feature name; the patent 'description' is long, so expect a large output.
df_train[0]

{'description': 'FIELD OF THE INVENTION \n     The present invention generally relates to cooking ranges, and more particularly relates to cooking ranges for use on recreational vehicles. \n     BACKGROUND OF THE INVENTION \n     Modern recreational vehicles include many of the modern amenities of free-standing homes. It is not uncommon for the recreational vehicle to include, in addition to sleeping and living quarters, a full-service kitchen as well. One of the appliances which is typically provided in such a recreational vehicle kitchen, is a range which combines the functions of a convection oven with a stove-top having individual burners. \n     As with many manufacturing processes, recreational vehicles are manufactured in assembly line fashion where it is necessary to minimize the labor required, and thus time and cost required, for manufacturing each vehicle. Every facet of the assembly process is under scrutiny with improvements constantly being implemented, discovered and sou