In [202]:
#from langchain.output_parsers import StructuredOutputParser
from langchain.output_parsers import ResponseSchema
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os
import openai
from dotenv import load_dotenv, find_dotenv
import json
import re
import tiktoken
from Levenshtein import distance
import requests
import serpapi
import pandas as pd
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from pymongo import MongoClient
from pymongo.server_api import ServerApi


In [203]:
#Initial OpenAI API
openai.api_key = os.environ['OPENAI_API_KEY']
llm_model = "gpt-4"

def get_completion(prompt, model=llm_model):
    messages = [{"role": "system", "content": "Act as a computer system and only say the output without any explenations."}, {"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0,
    )
    return response.choices[0].message["content"]

In [204]:
#Initial our own example abstract
loader = PyPDFLoader("docs/Biometric Vehicle Access System.pdf")
example_abstract = loader.load_and_split()

In [205]:
prompt1 = f"""```{example_abstract}```\
The abstract above describes a concept for a novel invention.\
I would like to search a patent database to find out whether \
there are already patents for such a concept. Name 5 key words that I can \
use for the search. \
Optimize the phrases to get back more results. Result as python string.
"""

keywords = get_completion(prompt1)

print(keywords)

"Biometric Vehicle Access System", "biometric authentication", "vehicle security", "biometric sensors", "anti-spoofing measures"


In [206]:
#cast the results (key words) from string to list
keywords_list = []

splitstring = keywords.split(", ") #split the key words
for i in splitstring:
    keywords_list.append(i[1:-1]) #remove the quotation marks
print(keywords_list)
print(type(keywords_list))

['Biometric Vehicle Access System', 'biometric authentication', 'vehicle security', 'biometric sensors', 'anti-spoofing measures']
<class 'list'>


In [207]:
prompt2 = f"""```{example_abstract}```\
The abstract above describes a concept for a novel invention.\
I would like to search a patent database to find out whether \
there are already patents for such a concept. Please give me a python string for the codes of the 5 most relevant \
USPTO classifications to a possible patent for this concept. 
"""

classifications = get_completion(prompt2)

print(classifications)

"G06K9/00", "B60R25/10", "E05B49/00", "G07C9/00", "H04L29/06"


In [208]:
#cast the results (classifications) from string to list
class_list = []

new_string = classifications.replace(",", "",9999) #remove commas
splitstring = new_string.split() #split the classes
for i in splitstring:
    class_list.append(i[1:-1]) #remove the quotation marks
print(class_list)
print(type(class_list))

['G06K9/00', 'B60R25/10', 'E05B49/00', 'G07C9/00', 'H04L29/06']
<class 'list'>


In [209]:
#initialization of base vars for the following loop
patent_api_key = os.environ['GOOGLE_PATENT_API_KEY']
pdf_list = []
count = 0
patent_base_url = "https://patentimages.storage.googleapis.com/" #just to complete the url

In [259]:
short_keywords_list = keywords_list[3:]
short_class_list = class_list[3:]
print(short_class_list)

['G07C9/00', 'H04L29/06']


In [211]:
#Loop for multiple Google Patents API calls with Key Words
for i in short_keywords_list:
    openai_response = i #Search String for Google Patents
    url_base = "https://serpapi.com/search.html?engine=google_patents"
    query = openai_response.replace(" ", "+")
    url = url_base + "&q=" + query + "&api_key=" + patent_api_key

    # API call Google Patents
    response = requests.get(url)

    # Check if API call was successful
    if response.status_code == 200:
        data = response.json() #write json-answer in var
        print("API Call für '",openai_response,"' erfolgreich:",data)
    else:
        print(f"Error with API request: Status code {response.status_code}")

    for a in data['results']['cluster'][0]['result']:
        loader = PyPDFLoader(patent_base_url+a['patent']['pdf'])
        pdf = loader.load_and_split()
        count+=1
        if pdf == []: #Falls PDF nicht maschinenlesbar ist, dieses überspringen; vllt. noch extra Liste anlegen mit atypischen PDFs
            print("PDF nicht Maschinenlesbar")
        else:
            pdf_list.append(pdf[0])
            print("PDF #",count,"erfolgreich zur Liste hinzugefügt.")

API Call für ' biometric sensors ' erfolgreich: {'results': {'total_num_results': 125048, 'total_num_pages': 100, 'many_results': False, 'num_page': 0, 'cluster': [{'result': [{'id': 'patent/US9043048B2/en', 'rank': 0, 'patent': {'title': ' RF biometric ignition control system', 'snippet': ' What is claimed is: 1. A vehicle electronic ignition control arrangement, comprising: a keyfob including: a first <b>biometric sensor</b> configured to sense a first identifying biological characteristic of a user; and a transmitter configured to transmit an air-borne signal indicative of the first &hellip;', 'priority_date': '2011-10-13', 'filing_date': '2011-10-13', 'grant_date': '2015-05-26', 'publication_date': '2015-05-26', 'inventor': 'Jesus Manotas, JR.', 'assignee': 'Panasonic Automotive Systems Company Of America, Division Of Panasonic &hellip;', 'publication_number': 'US9043048B2', 'language': 'en', 'thumbnail': '56/90/d2/f186b4bb002cc7/US09043048-20150526-D00000.png', 'pdf': 'f0/46/13/0b

In [270]:
#Loop for multiple Google Patents API calls with Classifications
for i in short_class_list:
    openai_response = i #Search String for Google Patents
    url_base = "https://serpapi.com/search.html?engine=google_patents"
    query = openai_response.replace(" ", "+")
    url = url_base + "&q=" + query + "&api_key=" + patent_api_key

    # API call Google Patents
    response = requests.get(url)

    # Check if API call was successful
    if response.status_code == 200:
        data = response.json() #write json-answer in var
        print("API Call für '",openai_response,"' erfolgreich:",data)
    else:
        print(f"Error with API request: Status code {response.status_code}")

    for a in data['results']['cluster'][0]['result']:
        print("Lese Link ein:",patent_base_url+a['patent']['pdf'])
        if not a['patent']['pdf']:
            print("No URL found in meta data for PDF #",count,"This PDF will be skipped.")
            continue
        loader = PyPDFLoader(patent_base_url+a['patent']['pdf'])
        pdf = loader.load_and_split()
        count+=1
        if pdf == []: #Falls PDF nicht maschinenlesbar ist, dieses überspringen; vllt. noch extra Liste anlegen mit atypischen PDFs
            print("PDF nicht Maschinenlesbar")
        else:
            pdf_list.append(pdf[0])
            print("PDF #",count,"erfolgreich zur Liste hinzugefügt.")

API Call für ' G07C9/00 ' erfolgreich: {'results': {'total_num_results': 177766, 'total_num_pages': 100, 'many_results': False, 'num_page': 0, 'cluster': [{'result': [{'id': 'patent/US8965287B2/en', 'rank': 0, 'patent': {'title': ' Battery powered passive keyless entry system for premise entry', 'snippet': ' A passive keyless entry (PKE) system, comprising a DC power source and a base station with a housing that includes a first portion being made of a first material that shields radio frequency (RF) signaling and a second portion being made of a second material that permits RF signaling, is &hellip;', 'priority_date': '2011-04-01', 'filing_date': '2012-04-02', 'grant_date': '2015-02-24', 'publication_date': '2015-02-24', 'inventor': 'Tony Lam', 'assignee': 'Tony Lam', 'publication_number': 'US8965287B2', 'language': 'en', 'thumbnail': 'ba/12/af/6489ee4baa4d68/US08965287-20150224-D00000.png', 'pdf': 'e6/c3/6d/7941b6d4a6f17c/US8965287.pdf', 'figures': [{'thumbnail': '58/42/a2/0bec2d9a36

In [271]:
print(len(pdf_list))

123


In [None]:
#Login MongoDB with User and specific database
uri = "mongodb+srv://timmey:faB8MFdyyb7zWvVr@llm-ttt.8kqrnka.mongodb.net/?retryWrites=true&w=majority"

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

DB_NAME = "llm-ttt"
COLLECTION_NAME = "pdfresults"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_index"

MONGODB_COLLECTION = client[DB_NAME][COLLECTION_NAME]

# insert the documents in MongoDB Atlas with their embedding
vector_search = MongoDBAtlasVectorSearch.from_documents(
    documents=pdf_list, 
    embedding=OpenAIEmbeddings(disallowed_special=()),
    collection=MONGODB_COLLECTION,
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)

In [None]:
# Perform a similarity search with Score between the embedding of the query and the embeddings of the documents
query = str(example_abstract)

results = vector_search.similarity_search_with_score(
    query=query,
    k=5,
)

# Display results
for result in results:
    print(result)

(Document(page_content='THE TWO TONTTITUNTUM DURANTE US 20180201225A1 ( 19 ) United States ( 12 ) Patent Application Publication ( 10 ) Pub . No . : US 2018 / 0201225 A1 Farges ( 43 ) Pub . Date : Jul . 19 , 2018 \n ( 54 ) METHOD FOR AUTHORIZING A DRIVER TO ACTIVATE AT LEAST ONE SYSTEM OF \n A VEHICLE , BASED ON A BIOMETRIC \n AUTHENTICATION PROCESS ) U . S . Ci . \n CPC . . . . . . . . . . . B60R 25 / 25 ( 2013 . 01 ) ; BOOR 25 / 252 \n ( 2013 . 01 ) ; B6OR 2325 / 205 ( 2013 . 01 ) \n ( 57 ) ABSTRACT ( 71 ) Applicant : DURA AUTOMOTIVE HOLDINGS U . K . , LTD , Birmingham ( GB ) \n ( 72 ) Inventor : Thomas Farges , Gometz Le Chatel \n ( FR ) \n ( 21 ) Appl . No . : 15 / 872 , 172 \n ( 22 ) Filed : Jan . 16 , 2018 \n ( 30 ) Foreign Application Priority Data \n Jan . 16 , 2017 ( EP ) . . . . . . . . . . . . . . . . 17151558 . 8 \n Publication Classification \n ( 51 ) Int . Ci . B60R 25 / 25 ( 2006 . 01 ) A method for authorizing a driver to activate at least one system of a vehicle , incl

In [None]:
for result in results:
    print("Übereinstimmung:",round(result[1]*100,2),"%; Quelle:", result[0].metadata['source'])


Übereinstimmung: 91.09 %; Quelle: https://patentimages.storage.googleapis.com/56/ed/6b/a756c070d08e60/US20180201225A1.pdf
Übereinstimmung: 91.09 %; Quelle: https://patentimages.storage.googleapis.com/56/ed/6b/a756c070d08e60/US20180201225A1.pdf
Übereinstimmung: 91.09 %; Quelle: https://patentimages.storage.googleapis.com/56/ed/6b/a756c070d08e60/US20180201225A1.pdf
Übereinstimmung: 90.54 %; Quelle: https://patentimages.storage.googleapis.com/66/1c/53/dc4a8713967522/US9654468.pdf
Übereinstimmung: 90.54 %; Quelle: https://patentimages.storage.googleapis.com/66/1c/53/dc4a8713967522/US9654468.pdf
