In [19]:
import json
import requests
from bs4 import BeautifulSoup as bs
import os

# Reading locally saved file instead of calling Google Patent API

f = open('Google Patent API Test/daten.json')

data = json.load(f)

In [20]:
# !!!! VOR DEM GOOGLE API LOOP !!!!

# Creating empty dictionary for the patent data to be added

patent_data = {}

In [21]:
# IN DEN GOOGLE API LOOP

# Checking if there's already an entry for the patent ID, if not, a new entry will be made including the PDF link

keywords_list = ["coffee", "vehicle access system", "wind turbine"]
patent_api_key = os.environ['GOOGLE_PATENT_API_KEY']

for keyword in keywords_list:
    url_base = "https://serpapi.com/search.html?engine=google_patents"
    query = keyword.replace(" ", "+")
    url = url_base + "&q=" + query + "&api_key=" + patent_api_key

    response = requests.get(url)

    if response.status_code == 200:
                data = response.json() #write json-answer in var
                for cluster in data["results"]["cluster"]:
                    for result in cluster["result"]:
                        id = result["patent"]["publication_number"]
                        if id not in patent_data.keys():
                            patent_data[id] = {
                                "pdf": result["patent"]["pdf"],
                            }
    else:
        print(f"Error with API request: Status code {response.status_code}")

In [22]:
print(patent_data)

{'EP2654522B1': {'pdf': '14/4b/44/78718c1eea410b/EP2654522B1.pdf'}, 'CA3104644C': {'pdf': '84/9c/1e/c02783fbdf39e5/CA3104644C.pdf'}, 'US10405690B2': {'pdf': '5c/c6/6c/ea5952be6ba227/US10405690.pdf'}, 'US7217908B2': {'pdf': 'e6/4e/e4/6d955df2224b2e/US7217908.pdf'}, 'AU2013208992B2': {'pdf': '69/f9/23/a07045138ceeb1/AU2013208992B2.pdf'}, 'AU2010202827A1': {'pdf': '13/2f/25/6a7fb121d39d1d/AU2010202827A1.pdf'}, 'EP3897309A1': {'pdf': ''}, 'RU2766609C2': {'pdf': 'ab/3c/bb/d5d4b21616f08c/RU2766609C2.pdf'}, 'US7337704B2': {'pdf': '11/df/38/54e57c305f4d3e/US7337704.pdf'}, 'EP2531079B1': {'pdf': '65/3b/6b/08ff14ebd7c6f9/EP2531079B1.pdf'}, 'US11721215B2': {'pdf': '25/74/cd/f97d22acbd7d56/US11721215.pdf'}, 'CN117002435A': {'pdf': 'a0/a1/4d/fc9e2165edc80e/CN117002435A.pdf'}, 'US10442300B2': {'pdf': 'b5/fa/c3/a61e3040b6023d/US10442300.pdf'}, 'CN109017763B': {'pdf': 'e6/19/8c/6766d9c607fd57/CN109017763B.pdf'}, 'US11445343B2': {'pdf': '86/6e/e4/ec8daaa58afdb7/US11445343.pdf'}, 'US11587370B2': {'pdf':

In [None]:
for cluster in data["results"]["cluster"]:
    for result in cluster["result"]:
        id = result["patent"]["publication_number"]
        if id not in patent_data.keys():
            patent_data[id] = {
                "pdf": result["patent"]["pdf"],
            }

In [23]:
# NACH DEM GOOGLE API LOOP


# Scraping abstracts, descriptions and claims of those patents

# Iterating through publication IDs

for id in patent_data.keys():
        
    # generating Google Patent links for each ID

    url = "https://patents.google.com/patent/" + id + "/en"

    response = requests.get(url)
    html_content = response.content
    soup = bs(html_content, 'html.parser')


    # Scraping Title

    title_span = soup.find('span', itemprop='title')

    if title_span is not None:
        title = title_span.get_text()

        # Removing weird ending of title
        to_remove = "\n"
        title = title.replace(to_remove, "").strip()
    else:
        title = False


    # Scraping Abstract
        
    abstract_div = soup.find('div', class_='abstract')

    if abstract_div is not None:
        abstract = abstract_div.get_text()
    else:
        abstract = False


    # Scraping Description

    description_section = soup.find('section', itemprop='description')

    if description_section:

        # Removing H2 from section
        h2_tag = description_section.find('h2')
        if h2_tag:
            h2_tag.decompose()
            
        # Removing all 'notranslate' class items
        for notranslate_tag in description_section.find_all(class_='notranslate'):
            notranslate_tag.decompose()
            
        # Removing all <aside> elements
        for aside_tag in description_section.find_all('aside'):
            aside_tag.decompose()

        # Extracting and joining the text
        description = "".join(description_section.stripped_strings)
        if description == "":
            description = False

    else:
        description = False
        

    # Scraping Claims

    description_section = soup.find('section', itemprop='claims')

    if description_section:

        # Removing H2 from section
        h2_tag = description_section.find('h2')
        if h2_tag:
            h2_tag.decompose()
            
        # Removing all 'notranslate' class items
        for notranslate_tag in description_section.find_all(class_='notranslate'):
            notranslate_tag.decompose()
            
        # Removing all <aside> elements
        for aside_tag in description_section.find_all('aside'):
            aside_tag.decompose()

        # Extracting and joining the text
        claims = "".join(description_section.stripped_strings)
        if claims == "":
            claims = False

    else:
        claims = False
        
    patent_data[id].update({
        "title": title,
        "abstract": abstract,
        "description": description,
        "claims": claims
    })

In [24]:
patent_data

{'EP2654522B1': {'pdf': '14/4b/44/78718c1eea410b/EP2654522B1.pdf',
  'title': 'Brewed beverage appliance and method',
  'abstract': False,
  'description': 'TECHNICAL FIELDThe present invention relates to brewed beverage appliances and, more particularly, to a brewed beverage appliance adapted to brew coffee from a pre-packaged, disposable container and filter combination.BACKGROUND OF THE INVENTIONVarious known coffee making appliances involve adding of coffee beans in one of various forms (i.e., ground or unground) to a container that is part of a machine in which heated water is delivered to the container and passes therethrough. The container typically includes a filtering mechanism so that heated water exiting the container is in the form of brewed coffee. Certain appliances require pre-ground coffee beans to be added in the form of "grounds." Other appliances are designed to accept whole coffee beans into a hopper or opening and include mechanisms that grind the beans into a grou

In [25]:
patent_data.keys()

dict_keys(['EP2654522B1', 'CA3104644C', 'US10405690B2', 'US7217908B2', 'AU2013208992B2', 'AU2010202827A1', 'EP3897309A1', 'RU2766609C2', 'US7337704B2', 'EP2531079B1', 'US11721215B2', 'CN117002435A', 'US10442300B2', 'CN109017763B', 'US11445343B2', 'US11587370B2', 'JP7011009B2', 'US10706648B2', 'US11782455B1', 'US20230356721A1', 'US8025476B2', 'US8328514B2', 'US7303369B2', 'US7821148B2', 'US8738192B2', 'US8932024B2', 'US8030790B2', 'US8794903B2', 'US8492918B1', 'CA2666269C'])

In [6]:
# extracting patent ids + abstracts for further prompt usage

abstract_prompt = ""

for patent_id, patent_info in patent_data.items():
    # Check if there is an abstract for the patent
    if patent_info['abstract']:
        abstract_prompt = abstract_prompt + f'{patent_id}: "{patent_info["abstract"]}"\n'

In [13]:
# Mockup Keyword list for testing purposes

keywords_list = ["coffee", "vehicle access system", "wind turbine"]

In [10]:
import os
patent_api_key = os.environ['GOOGLE_PATENT_API_KEY']

In [17]:
for keyword in keywords_list:
    url_base = "https://serpapi.com/search.html?engine=google_patents"
    query = keyword.replace(" ", "+")
    url = url_base + "&q=" + query + "&api_key=" + patent_api_key

    print(url)

https://serpapi.com/search.html?engine=google_patents&q=coffee&api_key=7661f86900cb42b2782c2af5c95142574b959dd5546c98491652475ea91e278e
https://serpapi.com/search.html?engine=google_patents&q=vehicle+access+system&api_key=7661f86900cb42b2782c2af5c95142574b959dd5546c98491652475ea91e278e
https://serpapi.com/search.html?engine=google_patents&q=wind+turbine&api_key=7661f86900cb42b2782c2af5c95142574b959dd5546c98491652475ea91e278e
