In [1]:
#from langchain.output_parsers import StructuredOutputParser
from langchain.output_parsers import ResponseSchema
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os
import openai
from dotenv import load_dotenv, find_dotenv
import json
import re
import tiktoken
from Levenshtein import distance
import requests
import serpapi
import pandas as pd
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from pymongo import MongoClient
from pymongo.server_api import ServerApi


In [6]:
#Initial OpenAI API
openai.api_key = os.environ['OPENAI_API_KEY']
llm_model = "gpt-4"

def get_completion(prompt, model=llm_model):
    messages = [{"role": "system", "content": "Act as a computer system and only say the output without any explenations."}, {"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0,
    )
    return response.choices[0].message["content"]

In [4]:
#Initial our own example abstract
loader = PyPDFLoader("docs/Biometric Vehicle Access System.pdf")
example_abstract = loader.load_and_split()

In [117]:
prompt1 = f"""```{example_abstract}```\
The abstract above describes a concept for a novel invention.\
I would like to search a patent database to find out whether \
there are already patents for such a concept. Name 5 key words that I can \
use for the search. \
Optimize the phrases to get back more results. Result as python string.
"""

keywords = get_completion(prompt1)

print(keywords)

"Biometric Vehicle Access System", "biometric authentication", "vehicle security", "biometric sensors", "anti-spoofing measures"


In [161]:
#cast the results (key words) from string to list
keywords_list = []

splitstring = keywords.split(", ") #split the key words
for i in splitstring:
    keywords_list.append(i[1:-1]) #remove the quotation marks
print(keywords_list)
print(type(keywords_list))

['Biometric Vehicle Access System', 'biometric authentication', 'vehicle security', 'biometric sensors', 'anti-spoofing measures']
<class 'list'>


In [86]:
prompt2 = f"""```{example_abstract}```\
The abstract above describes a concept for a novel invention.\
I would like to search a patent database to find out whether \
there are already patents for such a concept. Please give me a python string for the codes of the 5 most relevant \
USPTO classifications to a possible patent for this concept. 
"""

classifications = get_completion(prompt2)

print(classifications)

"G06K9/00", "B60R25/10", "E05B49/00", "G07C9/00", "H04L29/06"


In [140]:
#cast the results (classifications) from string to list
class_list = []

new_string = classifications.replace(",", "",9999) #remove commas
splitstring = new_string.split() #split the classes
for i in splitstring:
    class_list.append(i[1:-1]) #remove the quotation marks
print(class_list)
print(type(class_list))

['G06K9/00', 'B60R25/10', 'E05B49/00', 'G07C9/00', 'H04L29/06']
<class 'list'>


In [None]:
#initialization of base vars for the following loop
patent_api_key = os.environ['GOOGLE_PATENT_API_KEY']
pdf_list = []
count = 0
patent_base_url = "https://patentimages.storage.googleapis.com/" #just to complete the url

In [166]:
#Loop for multiple Google Patents API calls with Key Words
for i in keywords_list:
    openai_response = i #Search String for Google Patents
    url_base = "https://serpapi.com/search.html?engine=google_patents"
    query = openai_response.replace(" ", "+")
    url = url_base + "&q=" + query + "&api_key=" + patent_api_key

    # API call Google Patents
    response = requests.get(url)

    # Check if API call was successful
    if response.status_code == 200:
        data = response.json() #write json-answer in var
        print("API Call für '",openai_response,"' erfolgreich:",data)
    else:
        print(f"Error with API request: Status code {response.status_code}")

    for a in data['results']['cluster'][0]['result']:
        loader = PyPDFLoader(patent_base_url+a['patent']['pdf'])
        pdf = loader.load_and_split()
        count+=1
        if pdf == []: #Falls PDF nicht maschinenlesbar ist, dieses überspringen; vllt. noch extra Liste anlegen mit atypischen PDFs
            print("PDF nicht Maschinenlesbar")
        else:
            pdf_list.append(pdf[0])
            print("PDF #",count,"erfolgreich zur Liste hinzugefügt.")

API Call für ' Biometric Vehicle Access System ' erfolgreich: {'results': {'total_num_results': 127313, 'total_num_pages': 100, 'many_results': False, 'num_page': 0, 'cluster': [{'result': [{'id': 'patent/US9376090B2/en', 'rank': 0, 'patent': {'title': ' Method for authenticating a driver in a motor vehicle', 'snippet': ' The invention relates to a method for authenticating a driver ( 2 ) in a motor vehicle ( 1 ) by means of a recognition device ( 10 ) disposed in the motor vehicle ( 1 ) for collecting actual data ( 50 ) of the driver ( 2 ) which are transmitted during the authentication to a checking device ( 20 ) disposed in &hellip;', 'priority_date': '2012-07-18', 'filing_date': '2013-06-27', 'grant_date': '2016-06-28', 'publication_date': '2016-06-28', 'inventor': 'Sven Gennermann', 'assignee': 'Huf Hülsbeck & Fürst Gmbh & Co. Kg', 'publication_number': 'US9376090B2', 'language': 'en', 'thumbnail': '1f/d9/bc/382b74c5cc642c/US09376090-20160628-D00000.png', 'pdf': 'bc/e5/72/2a7f1119



PDF # 20 erfolgreich zur Liste hinzugefügt.
API Call für ' vehicle security ' erfolgreich: {'results': {'total_num_results': 121539, 'total_num_pages': 100, 'many_results': False, 'num_page': 0, 'cluster': [{'result': [{'id': 'patent/CA2714179C/en', 'rank': 0, 'patent': {'title': ' Anti-theft security device for a vehicle coupling', 'snippet': ' A theft security device for a vehicle coupling located in a vehicle is described, in particular a fifth wheel coupling (1). The device comprises a security apparatus (2) and a control apparatus (3) which are electrically connected together. The control apparatus (3) is connected to at least one &hellip;', 'priority_date': '2008-02-27', 'filing_date': '2009-02-25', 'grant_date': '2016-04-05', 'publication_date': '2016-04-05', 'inventor': 'Jose Manuel Algueera Gallego', 'assignee': 'Jost-Werke Gmbh', 'publication_number': 'CA2714179C', 'language': 'en', 'thumbnail': '', 'pdf': 'b8/e2/a1/d1922f2748e7b6/CA2714179C.pdf', 'family_metadata': {'aggrega

In [171]:
#Loop for multiple Google Patents API calls with Classifications
for i in class_list:
    openai_response = i #Search String for Google Patents
    url_base = "https://serpapi.com/search.html?engine=google_patents"
    query = openai_response.replace(" ", "+")
    url = url_base + "&q=" + query + "&api_key=" + patent_api_key

    # API call Google Patents
    response = requests.get(url)

    # Check if API call was successful
    if response.status_code == 200:
        data = response.json() #write json-answer in var
        print("API Call für '",openai_response,"' erfolgreich:",data)
    else:
        print(f"Error with API request: Status code {response.status_code}")

    for a in data['results']['cluster'][0]['result']:
        loader = PyPDFLoader(patent_base_url+a['patent']['pdf'])
        pdf = loader.load_and_split()
        count+=1
        if pdf == []: #Falls PDF nicht maschinenlesbar ist, dieses überspringen; vllt. noch extra Liste anlegen mit atypischen PDFs
            print("PDF nicht Maschinenlesbar")
        else:
            pdf_list.append(pdf[0])
            print("PDF #",count,"erfolgreich zur Liste hinzugefügt.")

API Call für ' G06K9/00 ' erfolgreich: {'results': {'total_num_results': 7055, 'total_num_pages': 100, 'many_results': False, 'num_page': 0, 'cluster': [{'result': [{'id': 'patent/CN111401257B/en', 'rank': 0, 'patent': {'title': ' Face recognition method based on cosine loss under non-constraint condition', 'snippet': ' The invention belongs to the technical field of image processing, and particularly relates to a cosine loss based unconstrained condition face recognition method. S1, acquiring an image to be recognized, and performing multi-scale transformation on the image to be recognized to obtain an image &hellip;', 'priority_date': '2020-03-17', 'filing_date': '2020-03-17', 'grant_date': '2022-10-04', 'publication_date': '2022-10-04', 'inventor': '董恩增', 'assignee': '天津理工大学', 'publication_number': 'CN111401257B', 'language': 'en', 'thumbnail': 'ee/67/b0/8d79bd079e9de9/HDA0002415058060000011.png', 'pdf': '2f/2f/80/4f15606d61214b/CN111401257B.pdf', 'figures': [{'thumbnail': 'fa/63/c8



PDF # 59 erfolgreich zur Liste hinzugefügt.
PDF # 60 erfolgreich zur Liste hinzugefügt.
API Call für ' B60R25/10 ' erfolgreich: {'results': {'total_num_results': 41383, 'total_num_pages': 100, 'many_results': False, 'num_page': 0, 'cluster': [{'result': [{'id': 'patent/US9102294B2/en', 'rank': 0, 'patent': {'title': ' Real-time vehicle alarm communication system', 'snippet': ' A vehicle alarm communication system is provided, comprising a handheld electronic device and a vehicle system that utilizes a global positioning satellite (GPS) network and a cellular network to communicate the status of a parked vehicle. The vehicle system comprises a vehicle alarm that monitors &hellip;', 'priority_date': '2012-11-20', 'filing_date': '2013-11-20', 'grant_date': '2015-08-11', 'publication_date': '2015-08-11', 'inventor': 'Penny R. Oliver', 'assignee': 'Penny R. Oliver', 'publication_number': 'US9102294B2', 'language': 'en', 'thumbnail': 'bb/9c/d0/3bdb4734924d7c/US09102294-20150811-D00000.png', 

ValueError: Check the url of your file; returned status code 403

Status Code 403 heisst, dass das Nutzungslimit überschritten wurde

In [172]:
print(len(pdf_list))

76


In [174]:
#Login MongoDB with User and specific database
uri = "mongodb+srv://timmey:faB8MFdyyb7zWvVr@llm-ttt.8kqrnka.mongodb.net/?retryWrites=true&w=majority"

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

DB_NAME = "llm-ttt"
COLLECTION_NAME = "pdfresults"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_index"

MONGODB_COLLECTION = client[DB_NAME][COLLECTION_NAME]

# insert the documents in MongoDB Atlas with their embedding
vector_search = MongoDBAtlasVectorSearch.from_documents(
    documents=pdf_list, 
    embedding=OpenAIEmbeddings(disallowed_special=()),
    collection=MONGODB_COLLECTION,
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)

In [175]:
# Perform a similarity search with Score between the embedding of the query and the embeddings of the documents
query = str(example_abstract)

results = vector_search.similarity_search_with_score(
    query=query,
    k=5,
)

# Display results
for result in results:
    print(result)

(Document(page_content='THE TWO TONTTITUNTUM DURANTE US 20180201225A1 ( 19 ) United States ( 12 ) Patent Application Publication ( 10 ) Pub . No . : US 2018 / 0201225 A1 Farges ( 43 ) Pub . Date : Jul . 19 , 2018 \n ( 54 ) METHOD FOR AUTHORIZING A DRIVER TO ACTIVATE AT LEAST ONE SYSTEM OF \n A VEHICLE , BASED ON A BIOMETRIC \n AUTHENTICATION PROCESS ) U . S . Ci . \n CPC . . . . . . . . . . . B60R 25 / 25 ( 2013 . 01 ) ; BOOR 25 / 252 \n ( 2013 . 01 ) ; B6OR 2325 / 205 ( 2013 . 01 ) \n ( 57 ) ABSTRACT ( 71 ) Applicant : DURA AUTOMOTIVE HOLDINGS U . K . , LTD , Birmingham ( GB ) \n ( 72 ) Inventor : Thomas Farges , Gometz Le Chatel \n ( FR ) \n ( 21 ) Appl . No . : 15 / 872 , 172 \n ( 22 ) Filed : Jan . 16 , 2018 \n ( 30 ) Foreign Application Priority Data \n Jan . 16 , 2017 ( EP ) . . . . . . . . . . . . . . . . 17151558 . 8 \n Publication Classification \n ( 51 ) Int . Ci . B60R 25 / 25 ( 2006 . 01 ) A method for authorizing a driver to activate at least one system of a vehicle , incl

In [176]:
for result in results:
    print("Übereinstimmung:",round(result[1]*100,2),"%; Quelle:", result[0].metadata['source'])


Übereinstimmung: 91.09 %; Quelle: https://patentimages.storage.googleapis.com/56/ed/6b/a756c070d08e60/US20180201225A1.pdf
Übereinstimmung: 90.53 %; Quelle: https://patentimages.storage.googleapis.com/66/1c/53/dc4a8713967522/US9654468.pdf
Übereinstimmung: 90.36 %; Quelle: https://patentimages.storage.googleapis.com/4f/e4/20/f1d019f5b3d37c/US8694793.pdf
Übereinstimmung: 90.36 %; Quelle: https://patentimages.storage.googleapis.com/4f/e4/20/f1d019f5b3d37c/US8694793.pdf
Übereinstimmung: 90.24 %; Quelle: https://patentimages.storage.googleapis.com/f7/4b/67/1a4ad0e3f4f652/US8952781.pdf
