In [1]:
#from langchain.output_parsers import StructuredOutputParser
from langchain.output_parsers import ResponseSchema
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os
import openai
from dotenv import load_dotenv, find_dotenv
import json
import re
import tiktoken
from Levenshtein import distance
import requests
import serpapi
import pandas as pd
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from pymongo import MongoClient
from pymongo.server_api import ServerApi


In [6]:
#Initial OpenAI API
openai.api_key = os.environ['OPENAI_API_KEY']
llm_model = "gpt-4"

def get_completion(prompt, model=llm_model):
    messages = [{"role": "system", "content": "Act as a computer system and only say the output without any explenations."}, {"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0,
    )
    return response.choices[0].message["content"]

In [4]:
#Initial our own example abstract
loader = PyPDFLoader("docs/Biometric Vehicle Access System.pdf")
example_abstract = loader.load_and_split()

In [117]:
prompt1 = f"""```{example_abstract}```\
The abstract above describes a concept for a novel invention.\
I would like to search a patent database to find out whether \
there are already patents for such a concept. Name 5 key words that I can \
use for the search. \
Optimize the phrases to get back more results. Result as python string.
"""

keywords = get_completion(prompt1)

print(keywords)

"Biometric Vehicle Access System", "biometric authentication", "vehicle security", "biometric sensors", "anti-spoofing measures"


In [141]:
#cast the results (key words) from string to list
keywords_list = []

splitstring = keywords.split(", ") #split the key words
for i in splitstring:
    keywords_list.append(i[1:-1]) #remove the quotation marks
print(keywords_list)
print(type(keywords_list))

['Biometric Vehicle Access System', 'biometric authentication', 'vehicle security', 'biometric sensors', 'anti-spoofing measures']
<class 'list'>


In [86]:
prompt2 = f"""```{example_abstract}```\
The abstract above describes a concept for a novel invention.\
I would like to search a patent database to find out whether \
there are already patents for such a concept. Please give me a python string for the codes of the 5 most relevant \
USPTO classifications to a possible patent for this concept. 
"""

classifications = get_completion(prompt2)

print(classifications)

"G06K9/00", "B60R25/10", "E05B49/00", "G07C9/00", "H04L29/06"


In [140]:
#cast the results (classifications) from string to list
class_list = []

new_string = classifications.replace(",", "",9999) #remove commas
splitstring = new_string.split() #split the classes
for i in splitstring:
    class_list.append(i[1:-1]) #remove the quotation marks
print(class_list)
print(type(class_list))

['G06K9/00', 'B60R25/10', 'E05B49/00', 'G07C9/00', 'H04L29/06']
<class 'list'>


In [7]:
#Bindeglied was dasfür sorgt, dass die Ergebnisse (Key Words und Classes) an die Google API weitergegeben werden, damit diese in einer Schleife alle durchgehen kann.

In [10]:
#Initial Google Patents API
patent_api_key = os.environ['GOOGLE_PATENT_API_KEY']
openai_response = 'Anti-spoofing measures in vehicle access' #Search String for Google Patents
url_base = "https://serpapi.com/search.html?engine=google_patents"
query = openai_response.replace(" ", "+")
url = url_base + "&q=" + query + "&api_key=" + patent_api_key

In [11]:
# API call Google Patents
response = requests.get(url)

# Check if API call was successful
if response.status_code == 200:
    # extract JSON data from answer
    data = response.json()

    # save JSON data in a file
    filename = "data_dump/" + query + ".json"
    with open(filename, 'w') as file:
        json.dump(data, file, indent=4)
else:
    print(f"Error with API request: Status code {response.status_code}")

In [None]:
#Print the results of the current data dump
patent_base_url = "https://patentimages.storage.googleapis.com/" #just to complete the url

f = open("data_dump/"+query+".json")
data = json.load(f)

#print(data['results']['cluster'][0]['result'][0]['patent']['title'])
counter = 0
for i in data['results']['cluster'][0]['result']:
    counter += 1
    print("#"+str(counter),"Titel:",i['patent']['title'])
    print("PDF: "+patent_base_url+i['patent']['pdf'])



In [None]:
#Initial PDF List: Open the first page of each PDF result and load it in a new list named pdf_list 
pdf_list = []
for i in data['results']['cluster'][0]['result']:
    loader = PyPDFLoader(patent_base_url+i['patent']['pdf'])
    pdf = loader.load_and_split()
    if pdf == []: #Falls PDF nicht maschinenlesbar ist, dieses überspringen; vllt. noch extra Liste anlegen mit atypischen PDFs
        print("PDF nicht Maschinenlesbar")
    else:
        pdf_list.append(pdf[0])

In [24]:
#Login MongoDB with User and specific database
uri = "mongodb+srv://timmey:faB8MFdyyb7zWvVr@llm-ttt.8kqrnka.mongodb.net/?retryWrites=true&w=majority"

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

DB_NAME = "llm-ttt"
COLLECTION_NAME = "pdfresults"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_index"

MONGODB_COLLECTION = client[DB_NAME][COLLECTION_NAME]

# insert the documents in MongoDB Atlas with their embedding
vector_search = MongoDBAtlasVectorSearch.from_documents(
    documents=pdf_list, 
    embedding=OpenAIEmbeddings(disallowed_special=()),
    collection=MONGODB_COLLECTION,
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)

In [None]:
# Perform a similarity search with Score between the embedding of the query and the embeddings of the documents
query = str(example_abstract)

results = vector_search.similarity_search_with_score(
    query=query,
    k=5,
)

# Display results
for result in results:
    print(result)

In [46]:
for result in results:
    print("Übereinstimmung:",round(result[1]*100,2),"%; Quelle:", result[0].metadata['source'])


Übereinstimmung: 91.21 %; Quelle: https://patentimages.storage.googleapis.com/57/b7/b5/c60393e41d106c/US10246055.pdf
Übereinstimmung: 90.62 %; Quelle: https://patentimages.storage.googleapis.com/60/66/91/e867fbe2549321/US9242619.pdf
Übereinstimmung: 89.74 %; Quelle: https://patentimages.storage.googleapis.com/9b/bd/80/715a925ac80f13/US20200307519A1.pdf
Übereinstimmung: 89.72 %; Quelle: https://patentimages.storage.googleapis.com/7c/30/05/2dc2f874122207/US9095285.pdf
Übereinstimmung: 89.72 %; Quelle: https://patentimages.storage.googleapis.com/98/f0/22/ae169ef67be168/US9432632.pdf


In [99]:
x = MONGODB_COLLECTION.delete_many({})

print(x.deleted_count, " documents deleted.")

10  documents deleted.
