In [None]:
import json
import requests
from bs4 import BeautifulSoup as bs

# Load the locally saved search result instead of calling the Google Patent API.
# The context manager closes the file handle as soon as parsing is done, and the
# explicit encoding keeps decoding independent of the platform's locale default.
with open('Google Patent API Test/daten.json', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Collect the publication number of every patent in the search result.
publication_ids = [
    result["patent"]["publication_number"]
    for cluster in data["results"]["cluster"]
    for result in cluster["result"]
]

# Seconds to wait for Google Patents before giving up on a request; without a
# timeout, requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30


def _extract_section_text(soup, itemprop):
    """Return the cleaned text of the ``<section>`` with the given ``itemprop``.

    The first ``<h2>`` inside the section, every element carrying the
    ``notranslate`` class and every ``<aside>`` element are removed before the
    text is extracted.

    Args:
        soup: Parsed BeautifulSoup document of a patent page. The removed tags
            are decomposed, so the document is modified in place.
        itemprop: Value of the section's ``itemprop`` attribute, e.g.
            ``"description"`` or ``"claims"``.

    Returns:
        The section's stripped strings joined together, or ``False`` when the
        section is missing or contains no text.
    """
    section = soup.find('section', itemprop=itemprop)
    if section is None:
        return False

    # Drop the section heading so it does not end up in the extracted text.
    h2_tag = section.find('h2')
    if h2_tag is not None:
        h2_tag.decompose()

    # Remove all 'notranslate' class items first, then any remaining <aside> elements.
    for unwanted_tag in section.find_all(class_='notranslate'):
        unwanted_tag.decompose()
    for unwanted_tag in section.find_all('aside'):
        unwanted_tag.decompose()

    # NOTE(review): the strings are joined without a separator, so text from
    # adjacent elements runs together - confirm this is what the prompt needs.
    text = "".join(section.stripped_strings)
    return text if text else False


# Scraped patent data keyed by publication number. Every entry holds "title",
# "abstract", "description" and "claims"; a field is False when it was not found.
patent_data = {}

for publication_id in publication_ids:

    # The search result may list a publication more than once; scrape it only once.
    if publication_id in patent_data:
        continue

    # Google Patents link for this publication
    url = "https://patents.google.com/patent/" + publication_id + "/en"

    # NOTE(review): a non-2xx response is still parsed and typically yields an
    # entry whose fields are all False - confirm whether that should be an error.
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    soup = bs(response.content, 'html.parser')

    # Title: strip the surrounding whitespace and embedded newlines.
    title_span = soup.find('span', itemprop='title')
    if title_span is not None:
        title = title_span.get_text().replace("\n", "").strip()
    else:
        title = False

    # Abstract: taken verbatim from the abstract <div>.
    abstract_div = soup.find('div', class_='abstract')
    abstract = abstract_div.get_text() if abstract_div is not None else False

    # Description and claims share the same clean-up and extraction logic.
    description = _extract_section_text(soup, 'description')
    claims = _extract_section_text(soup, 'claims')

    patent_data[publication_id] = {
        "title": title,
        "abstract": abstract,
        "description": description,
        "claims": claims,
    }

In [None]:
# Build the prompt text: one line per patent of the form
#   <publication id>: "<abstract>"
# Patents without an abstract (falsy value) are left out.

prompt_lines = []

for patent_id, patent_info in patent_data.items():
    abstract_text = patent_info['abstract']
    if not abstract_text:
        # Nothing to quote for this patent
        continue
    prompt_lines.append(f'{patent_id}: "{abstract_text}"\n')

abstract_prompt = "".join(prompt_lines)