Method 1 -- By using BartTokenizer, BartForConditionalGeneration

Combined code

In [9]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import torch
from transformers import BartTokenizer, BartForConditionalGeneration


# List of patent numbers
patent_numbers = ["US11172208", "US11342001", "US9161032", "US11581022", "US10499066", "US10783609", "US11284055", "US10123027", "US11523135", "US8204134"]

patent_data = {}

for patent_number in patent_numbers:
    url = f"https://patents.google.com/patent/{patent_number}"
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    
    # Extract abstract text
    abstract_texts = soup.find_all('div', class_='abstract')
    abstract = ' '.join([abstract_text.get_text().strip() for abstract_text in abstract_texts])
    
    # Extract claims text
    claim_texts = soup.find_all('div', class_='claim-text')
    claims = ' '.join([claim_text.get_text().strip() for claim_text in claim_texts])
    
    # Combine abstract and claims separately
    combined_data = {
        "Abstract": abstract,
        "Claims": claims
    }
    
    # Store the combined data for the patent number
    patent_data[patent_number] = combined_data

# Initialize the BART model and tokenizer
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# Iterate over each patent number and its data
for patent_number, data in patent_data.items():
  print(f"Patent Number: {patent_number}")
  #print(f"Abstract: {data['Abstract']}\n")
  #print(f"Claims: {data['Claims']}\n")

    # Combine abstract and claims separately
  combined_text = data['Abstract'] + " " + data['Claims']

# Tokenize the combined text
  input_ids = tokenizer.encode(combined_text, truncation=True, max_length=1024, padding='longest', return_tensors='pt')

    # Generate the summary
  summary_ids = model.generate(input_ids, num_beams=4, max_length=150, early_stopping=True)
  summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Print the summary
  print("Summary:")
  print(summary)
  print("--------------------\n")


Patent Number: US11172208
Summary:
A method, apparatus and computer program products are provided for capturing omnidirectional video with one or more cameras, streaming the video over a network, and rendering the video on a Head Mounted Display (HMD) One example method includes selecting a primary viewport, encoding a first representation, and encoding a second representation that provides a gradual picture quality change from a background viewport to the primary view port.
--------------------

Patent Number: US11342001
Summary:
According to an example embodiment, a technique for zooming one or more images of a video stream into corresponding images of the video signal is provided. The technique comprises: receiving the video stream, a plurality of audio signals and audiovisual metadata that defines a spatial relationship between images. determining presence of at least a first sound source and a second sound source.
--------------------

Patent Number: US9161032
Summary:
The present

Method 1 -- By using BartTokenizer, BartForConditionalGeneration (Here, tried with chunks)

In [11]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import torch
from transformers import BartTokenizer, BartForConditionalGeneration


# List of patent numbers
patent_numbers = ["US11172208"]

patent_data = {}

for patent_number in patent_numbers:
    url = f"https://patents.google.com/patent/{patent_number}"
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    
    # Extract abstract text
    abstract_texts = soup.find_all('div', class_='abstract')
    abstract = ' '.join([abstract_text.get_text().strip() for abstract_text in abstract_texts])
    
    # Extract claims text
    claim_texts = soup.find_all('div', class_='claim-text')
    claims = ' '.join([claim_text.get_text().strip() for claim_text in claim_texts])
    
    # Combine abstract and claims separately
    combined_data = {
        "Abstract": abstract,
        "Claims": claims
    }
    
    # Store the combined data for the patent number
    patent_data[patent_number] = combined_data

# Function to summarize a given text
def summarize_text(text):
  model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
  tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
  inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors="pt")
  summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=150, early_stopping=True)
  summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
  return summary

# Function to divide long text into chunks and summarize each chunk
def summarize_long_text(long_text, chunk_size=500):
  chunks = [long_text[i:i+chunk_size] for i in range(0, len(long_text), chunk_size)]
  summaries = [summarize_text(chunk) for chunk in chunks]
  final_summary = " ".join(summaries)
  return final_summary

# Iterate over each patent number and its data
for patent_number, data in patent_data.items():
  print(f"Patent Number: {patent_number}")
  #print(f"Abstract: {data['Abstract']}\n")
  #print(f"Claims: {data['Claims']}\n")

    # Combine abstract and claims separately
  combined_text = data['Abstract'] + " " + data['Claims']

# Tokenize the combined text
  final_summary = summarize_long_text(combined_text)
    # Print the summary
  print("Summary:")
  print(final_summary)
  print("--------------------\n")


Patent Number: US11172208
Summary:
A method, apparatus and computer program products are provided for capturing omnidirectional video with one or more cameras, streaming the video over a network, and rendering the video on a Head Mounted Display (HMD) One example method includes selecting a primary viewport, encoding a first representation that covers thePrimary viewport. and encoding a second representation that provides a gradual picture quality change from a background viewport to the primary viewport. 1. A method comprising: selecting a primary. viewport;encoding, at a first quality, a first representation of the. primary. 2. An encoding method that encodes a second. representation of a backgroundviewport, the quality at which the first. representation is encoded being higher than the second quality. A boundary region for the first representation that is adjacent to the first. representation, is spherically covered by the background viewport, and provides for an increasing picture 

Method 2 - Here, we are using the Summarization model by diving the input text into small chunks with one sample example

In [1]:
!pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m109.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4.29.2


In [2]:
from transformers import pipeline
from bs4 import BeautifulSoup
import requests

In [3]:
summarizer = pipeline("summarization")


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import torch


# List of patent numbers
patent_numbers = ["US11172208"]

patent_data = {}

for patent_number in patent_numbers:
    url = f"https://patents.google.com/patent/{patent_number}"
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    
    # Extract abstract text
    abstract_texts = soup.find_all('div', class_='abstract')
    abstract = ' '.join([abstract_text.get_text().strip() for abstract_text in abstract_texts])
    
    # Extract claims text
    claim_texts = soup.find_all('div', class_='claim-text')
    claims = ' '.join([claim_text.get_text().strip() for claim_text in claim_texts])
    
    # Combine abstract and claims separately
    combined_data = {
        "Abstract": abstract,
        "Claims": claims
    }
    
    # Store the combined data for the patent number
    patent_data[patent_number] = combined_data

# Iterate over each patent number and its data
for patent_number, data in patent_data.items():
  print(f"Patent Number: {patent_number}")
  #print(f"Abstract: {data['Abstract']}\n")
  #print(f"Claims: {data['Claims']}\n")

    # Combine abstract and claims separately
  combined_text = data['Abstract'] + " " + data['Claims']

print(combined_text)

In [5]:
max_chunk = 500
combined_text = combined_text.replace('.', '.<eos>')
combined_text = combined_text.replace('?', '?<eos>')
combined_text = combined_text.replace('!', '!<eos>')

In [8]:
sentences = combined_text.split('<eos>')
current_chunk = 0 
chunks = []
for sentence in sentences:
    if len(chunks) == current_chunk + 1: 
        if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
            chunks[current_chunk].extend(sentence.split(' '))
        else:
            current_chunk += 1
            chunks.append(sentence.split(' '))
    else:
        print(current_chunk)
        chunks.append(sentence.split(' '))

for chunk_id in range(len(chunks)):
    chunks[chunk_id] = ' '.join(chunks[chunk_id])

0


In [9]:
len(chunks)

7

In [12]:
res = summarizer(chunks, max_length=100, min_length=30, do_sample=False)


In [13]:
res[0]

{'summary_text': ' A method, apparatus and computer program products are provided for capturing omnidirectional video with one or more cameras, streaming the video over a network, and rendering the video on a Head Mounted Display (HMD)'}

In [14]:
text = ' '.join([summ['summary_text'] for summ in res])


In [15]:
text

' A method, apparatus and computer program products are provided for capturing omnidirectional video with one or more cameras, streaming the video over a network, and rendering the video on a Head Mounted Display (HMD)  An apparatus is configured to select a primary viewport and encode a second representation of the background viewport . An increasing picture quality gradient is achieved by decreasing a quantization parameter across pixels between the first representation and the second representation .  A method comprising:.\xa0obtaining a primary viewport;. selecting, obtaining, and decoding, from a first . quality encoding, a first representation of the primary . viewport, and obtaining a background viewport . A boundary region for the first representation that is adjacent to the first . representation, is spherically covered by the background view . The increasing picture quality gradient is achieved by decreasing a quantization . parameter across said pixels between the . first re