In [13]:
#from langchain.output_parsers import StructuredOutputParser
from langchain.output_parsers import ResponseSchema
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os
import openai
from dotenv import load_dotenv, find_dotenv
from Levenshtein import distance
import requests
import pandas as pd
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from pymongo import MongoClient
from pymongo.server_api import ServerApi


In [14]:
# Initialise the OpenAI API.
# Load variables from a local .env file (if one is found) into the process
# environment first; without this call the dotenv helpers imported above are
# unused and the lookup below fails when the key only lives in .env.
# Variables that are already set in the environment are not overridden.
load_dotenv(find_dotenv())
openai.api_key = os.environ['OPENAI_API_KEY']
llm_model = "gpt-4"  # default chat model used by get_completion()

def get_completion(prompt: str, model: str = llm_model) -> str:
    """Send one user prompt to the OpenAI chat completion API and return the reply text.

    Args:
        prompt: Text sent as the user message.
        model: Name of the chat model to query; defaults to ``llm_model``.

    Returns:
        The ``content`` of the first choice in the API response.
    """
    messages = [
        # The system message asks for bare output only, because the following
        # cells parse the reply as a list of quoted values.
        {"role": "system", "content": "Act as a computer system and only say the output without any explanations."},
        {"role": "user", "content": prompt},
    ]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0,  # request the least random sampling
    )
    return response.choices[0].message["content"]

In [15]:
# Load our own example abstract from a local PDF.
# NOTE(review): load_and_split() appears to return a list of document chunks
# (later cells index such a result with [0] and compare it with []) — confirm.
loader = PyPDFLoader("docs/Biometric Vehicle Access System.pdf")
example_abstract = loader.load_and_split()

In [16]:
# Put only the extracted text of the abstract into the prompt. Interpolating
# the load_and_split() result directly would insert the Python repr of the
# list (Document wrappers and metadata) instead of the abstract itself.
abstract_text = "\n".join(page.page_content for page in example_abstract)

# Ask the model for five search key words derived from the abstract.
prompt1 = f"""
The following abstract describes a concept for a novel invention:\
```{abstract_text}```\
Name 5 key words based on this abstract, that I can use for the search in a patent database. \
Optimize the key words to get back more results. Result as python string.
"""

keywords = get_completion(prompt1)

print(keywords)

"Biometric Vehicle Access System", "biometric authentication", "vehicle security", "biometric sensors", "anti-spoofing measures"


In [17]:
# Convert the model's answer (a single string) into a list of key words.
# Splitting on the bare comma and stripping whitespace, quotation marks and
# list brackets from each part also accepts answers such as `"a","b"`,
# `['a', 'b']` or unquoted values, not only the exact form `"a", "b"`.
keywords_list = [
    keyword
    for keyword in (part.strip(" \t\r\n[]\"'") for part in keywords.split(","))
    if keyword  # drop empty parts, e.g. from a trailing comma
]
print(keywords_list)

['Biometric Vehicle Access System', 'biometric authentication', 'vehicle security', 'biometric sensors', 'anti-spoofing measures']


In [18]:
# Put only the extracted text of the abstract into the prompt. Interpolating
# the load_and_split() result directly would insert the Python repr of the
# list (Document wrappers and metadata) instead of the abstract itself.
abstract_text = "\n".join(page.page_content for page in example_abstract)

# Ask the model for the five most relevant CPC classification codes.
prompt2 = f"""
The following abstract describes a concept for a novel invention:\
```{abstract_text}```\
Name 5 CPC classifications based on this abstract, that I can use for the search in a patent database. \
Please give me a python string for the codes of the 5 most relevant \
CPC classifications to a possible patent. 
"""

classifications = get_completion(prompt2)

print(classifications)

"G06K9/00", "B60R25/10", "G07C9/00", "H04L63/08", "G06F21/32"


In [19]:
# Convert the model's answer (a single string) into a list of CPC codes.
# Split on commas rather than on whitespace: a code written in the spaced
# notation (e.g. "G06K 9/00") must stay one entry. Whitespace, quotation marks
# and list brackets around each part are stripped.
class_list = [
    code
    for code in (part.strip(" \t\r\n[]\"'") for part in classifications.split(","))
    if code  # drop empty parts, e.g. from a trailing comma
]
print(class_list)

['G06K9/00', 'B60R25/10', 'G07C9/00', 'H04L63/08', 'G06F21/32']


In [20]:
# Shared state for the patent search loops in the following cells.
# API key read from the environment; it is sent with every serpapi.com request.
patent_api_key = os.environ['GOOGLE_PATENT_API_KEY']
pdf_list = []  # receives the first element of every successfully parsed patent PDF
count = 0  # running number of patent PDFs loaded so far; used in the progress messages
patent_base_url = "https://patentimages.storage.googleapis.com/"  # prefix prepended to the 'pdf' path found in each search hit

In [21]:
# Keep only the entries from index 3 onwards of each list for the searches below
# (presumably to limit the number of API calls and PDF downloads — confirm).
short_keywords_list = keywords_list[3:]
short_class_list = class_list[3:]
print(short_class_list)

['H04L63/08', 'G06F21/32']


In [22]:
# Run one Google Patents search (via serpapi.com) per key word and collect the
# first element of every machine-readable result PDF in pdf_list.
for search_term in short_keywords_list:
    url_base = "https://serpapi.com/search.html?engine=google_patents"

    # API call Google Patents. Passing the values through `params` lets requests
    # URL-encode them, so spaces and reserved characters such as '&' or '/' in a
    # search term cannot corrupt the query string.
    response = requests.get(url_base, params={"q": search_term, "api_key": patent_api_key})

    # Skip this term when the call failed: continuing would hit an undefined
    # `data` or silently re-process the results of the previous term.
    if response.status_code != 200:
        print(f"Error with API request: Status code {response.status_code}")
        continue

    data = response.json()  # parsed JSON answer
    print("API Call für '",search_term,"' erfolgreich:",data)

    # Walk down to the list of hits without assuming every level is present.
    clusters = (data.get('results') or {}).get('cluster') or []
    hits = (clusters[0].get('result') or []) if clusters else []

    for hit in hits:
        pdf_path = (hit.get('patent') or {}).get('pdf')
        if not pdf_path:  # no PDF path in the meta data of this hit
            print("No URL found in meta data for PDF #",count,"This PDF will be skipped.")
            continue
        loader = PyPDFLoader(patent_base_url + pdf_path)
        pdf = loader.load_and_split()
        count += 1
        if pdf == []:
            # The PDF is not machine-readable, so skip it (a separate list of
            # such atypical PDFs could be kept here).
            print("PDF nicht Maschinenlesbar")
        else:
            pdf_list.append(pdf[0])
            print("PDF #",count,"erfolgreich zur Liste hinzugefügt.")

API Call für ' biometric sensors ' erfolgreich: {'results': {'total_num_results': 125048, 'total_num_pages': 100, 'many_results': False, 'num_page': 0, 'cluster': [{'result': [{'id': 'patent/US9043048B2/en', 'rank': 0, 'patent': {'title': ' RF biometric ignition control system', 'snippet': ' What is claimed is: 1. A vehicle electronic ignition control arrangement, comprising: a keyfob including: a first <b>biometric sensor</b> configured to sense a first identifying biological characteristic of a user; and a transmitter configured to transmit an air-borne signal indicative of the first &hellip;', 'priority_date': '2011-10-13', 'filing_date': '2011-10-13', 'grant_date': '2015-05-26', 'publication_date': '2015-05-26', 'inventor': 'Jesus Manotas, JR.', 'assignee': 'Panasonic Automotive Systems Company Of America, Division Of Panasonic &hellip;', 'publication_number': 'US9043048B2', 'language': 'en', 'thumbnail': '56/90/d2/f186b4bb002cc7/US09043048-20150526-D00000.png', 'pdf': 'f0/46/13/0b



PDF # 13 erfolgreich zur Liste hinzugefügt.
PDF # 14 erfolgreich zur Liste hinzugefügt.
PDF # 15 erfolgreich zur Liste hinzugefügt.
PDF # 16 erfolgreich zur Liste hinzugefügt.
PDF # 17 erfolgreich zur Liste hinzugefügt.
PDF # 18 erfolgreich zur Liste hinzugefügt.
PDF # 19 erfolgreich zur Liste hinzugefügt.
PDF # 20 erfolgreich zur Liste hinzugefügt.


In [23]:
# Run one Google Patents search (via serpapi.com) per CPC classification and
# collect the first element of every machine-readable result PDF in pdf_list.
for search_term in short_class_list:
    url_base = "https://serpapi.com/search.html?engine=google_patents"

    # API call Google Patents. Passing the values through `params` lets requests
    # URL-encode them, so spaces and reserved characters such as '&' or '/' in a
    # search term cannot corrupt the query string.
    response = requests.get(url_base, params={"q": search_term, "api_key": patent_api_key})

    # Skip this term when the call failed: continuing would hit an undefined
    # `data` or silently re-process the results of the previous term.
    if response.status_code != 200:
        print(f"Error with API request: Status code {response.status_code}")
        continue

    data = response.json()  # parsed JSON answer
    print("API Call für '",search_term,"' erfolgreich:",data)

    # Walk down to the list of hits without assuming every level is present.
    clusters = (data.get('results') or {}).get('cluster') or []
    hits = (clusters[0].get('result') or []) if clusters else []

    for hit in hits:
        pdf_path = (hit.get('patent') or {}).get('pdf')
        # Check for a missing path BEFORE building the link: concatenating the
        # base URL with a missing value would raise instead of skipping the hit.
        if not pdf_path:
            print("No URL found in meta data for PDF #",count,"This PDF will be skipped.")
            continue
        print("Lese Link ein:",patent_base_url+pdf_path)
        loader = PyPDFLoader(patent_base_url + pdf_path)
        pdf = loader.load_and_split()
        count += 1
        if pdf == []:
            # The PDF is not machine-readable, so skip it (a separate list of
            # such atypical PDFs could be kept here).
            print("PDF nicht Maschinenlesbar")
        else:
            pdf_list.append(pdf[0])
            print("PDF #",count,"erfolgreich zur Liste hinzugefügt.")

API Call für ' H04L63/08 ' erfolgreich: {'results': {'total_num_results': 117392, 'total_num_pages': 100, 'many_results': False, 'num_page': 0, 'cluster': [{'result': [{'id': 'patent/US9203841B2/en', 'rank': 0, 'patent': {'title': ' Secure authentication in a multi-party system', 'snippet': ' An authentication server transmits a random number to and receives a other information from a service provider. Later, the first random number is received from a requester and a provider identifier, the received other information and provider authentication policy requirements are transmitted to &hellip;', 'priority_date': '2012-04-01', 'filing_date': '2013-03-28', 'grant_date': '2015-12-01', 'publication_date': '2015-12-01', 'inventor': 'Michael Neuman', 'assignee': 'Authentify, Inc.', 'publication_number': 'US9203841B2', 'language': 'en', 'thumbnail': '15/6b/7f/5f2ffa6e69350e/US09203841-20151201-D00000.png', 'pdf': '96/81/cc/10ddaffcfcad68/US9203841.pdf', 'figures': [{'thumbnail': '5f/5d/0f/a24c

In [24]:
# Report how many collected PDF documents are about to be stored and embedded.
print(len(pdf_list),"PDFs will be added to the vector database.")

37 PDFs will be added to the vector database.


In [25]:
# Connect to MongoDB Atlas.
# The connection string contains the database user's password, so it is read
# from the environment like the other secrets in this notebook instead of being
# written into the source. Expected form:
#   mongodb+srv://<user>:<password>@<cluster-host>/?retryWrites=true&w=majority
# NOTE(review): a password that was previously committed in this cell must be
# treated as leaked and rotated.
uri = os.environ['MONGODB_URI']

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

DB_NAME = "llm-ttt"
COLLECTION_NAME = "pdfresults"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_index"

MONGODB_COLLECTION = client[DB_NAME][COLLECTION_NAME]

# Insert the collected documents into the collection together with their embeddings
vector_search = MongoDBAtlasVectorSearch.from_documents(
    documents=pdf_list,
    embedding=OpenAIEmbeddings(disallowed_special=()),
    collection=MONGODB_COLLECTION,
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)

In [26]:
# Perform a similarity search with score between the embedding of the query and
# the embeddings of the stored documents.
# The query is the plain text of the example abstract. str() on the
# load_and_split() result would embed the Python repr of the list (Document
# wrappers and metadata) rather than the abstract itself.
query = "\n".join(page.page_content for page in example_abstract)

results = vector_search.similarity_search_with_score(
    query=query,
    k=5,  # number of best matches to return
)

# Display results
for result in results:
    print(result)

In [27]:
# Print each match as a percentage (score * 100, two decimals) with the source of the matched document.
for document, score in results:
    percentage = round(score*100,2)
    print("Übereinstimmung:",percentage,"%; Quelle:", document.metadata['source'])


In [28]:
# Empty the collection: an empty filter matches every document in it.
delete_result = MONGODB_COLLECTION.delete_many({})
print(delete_result.deleted_count, " documents deleted.")

37  documents deleted.


In [None]:
import gradio as gr
import time
def my_function(x, progress=gr.Progress()):
    """Demo callback that reports progress while waiting and returns its input unchanged.

    Args:
        x: Value passed in by the interface; returned as is.
        progress: Gradio progress tracker used to report the current state.

    Returns:
        The unmodified ``x``.
    """
    progress(0, desc="Starting...")
    time.sleep(1)
    # 100 tracked steps of 0.1 s each; the loop value itself is not needed.
    for _ in progress.tqdm(range(100)):
        time.sleep(0.1)
    return x
# Wrap my_function in an interface with one input and one output text box,
# enable the queue and start the app.
gr.Interface(my_function, gr.Textbox(), gr.Textbox()).queue().launch()

          #startresult.change(test, [startresult, startresult2], endresult)
            #inp = gr.Textbox(placeholder="What is your name?")
            #out = gr.Textbox()
            #inp.change(test, inp, out) 