In [2]:
#from langchain.output_parsers import StructuredOutputParser
from langchain.output_parsers import ResponseSchema
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os
import openai
from dotenv import load_dotenv, find_dotenv
import json
import re
import tiktoken
from Levenshtein import distance
import requests
import serpapi
import pandas as pd
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from pymongo import MongoClient
from pymongo.server_api import ServerApi


In [3]:
#Initial OpenAI API
openai.api_key = os.environ['OPENAI_API_KEY']
llm_model = "gpt-4"

def get_completion(prompt, model=llm_model):
    messages = [{"role": "system", "content": "Act as a computer system and only say the output without any explenations."}, {"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0,
    )
    return response.choices[0].message["content"]

In [4]:
#Initial our own example abstract
loader = PyPDFLoader("docs/Biometric Vehicle Access System.pdf")
example_abstract = loader.load_and_split()

In [8]:
prompt1 = f"""
The following abstract descripes a concept for a novel invention:\
```{example_abstract}```\
Name 5 key words based on this abstract, that I can use for the search in a patent database. \
Optimize the key words to get back more results. Result as python string.
"""

keywords = get_completion(prompt1)

print(keywords)

"Biometric Vehicle Access System", "biometric authentication", "vehicle security", "biometric sensors", "smart vehicles"


In [11]:
#cast the results (key words) from string to list
keywords_list = []

splitstring = keywords.split(", ") #split the key words
for i in splitstring:
    keywords_list.append(i[1:-1]) #remove the quotation marks
print(keywords_list)

['Biometric Vehicle Access System', 'biometric authentication', 'vehicle security', 'biometric sensors', 'smart vehicles']


In [10]:
prompt2 = f"""
The following abstract descripes a concept for a novel invention:\
```{example_abstract}```\
Name 5 CPC classifications based on this abstract, that I can use for the search in a patent database. \
Please give me a python string for the codes of the 5 most relevant \
CPC classifications to a possible patent. 
"""

classifications = get_completion(prompt2)

print(classifications)

"G06K9/00", "B60R25/10", "G06F21/32", "G07C9/00", "H04L29/06"


In [12]:
#cast the results (classifications) from string to list
class_list = []

new_string = classifications.replace(",", "",9999) #remove commas
splitstring = new_string.split() #split the classes
for i in splitstring:
    class_list.append(i[1:-1]) #remove the quotation marks
print(class_list)

['G06K9/00', 'B60R25/10', 'G06F21/32', 'G07C9/00', 'H04L29/06']


In [279]:
#initialization of base vars for the following loop
patent_api_key = os.environ['GOOGLE_PATENT_API_KEY']
pdf_list = []
count = 0
patent_base_url = "https://patentimages.storage.googleapis.com/" #just to complete the url

In [280]:
short_keywords_list = keywords_list[3:]
short_class_list = class_list[3:]
print(short_class_list)

['G07C9/00', 'H04L29/06']


In [281]:
#Loop for multiple Google Patents API calls with Key Words
for i in short_keywords_list:
    openai_response = i #Search String for Google Patents
    url_base = "https://serpapi.com/search.html?engine=google_patents"
    query = openai_response.replace(" ", "+")
    url = url_base + "&q=" + query + "&api_key=" + patent_api_key

    # API call Google Patents
    response = requests.get(url)

    # Check if API call was successful
    if response.status_code == 200:
        data = response.json() #write json-answer in var
        print("API Call für '",openai_response,"' erfolgreich:",data)
    else:
        print(f"Error with API request: Status code {response.status_code}")

    for a in data['results']['cluster'][0]['result']:
        if not a['patent']['pdf']: #control if there is an existing url in the meta data
            print("No URL found in meta data for PDF #",count,"This PDF will be skipped.")
            continue
        loader = PyPDFLoader(patent_base_url+a['patent']['pdf'])
        pdf = loader.load_and_split()
        count+=1
        if pdf == []: #Falls PDF nicht maschinenlesbar ist, dieses überspringen; vllt. noch extra Liste anlegen mit atypischen PDFs
            print("PDF nicht Maschinenlesbar")
        else:
            pdf_list.append(pdf[0])
            print("PDF #",count,"erfolgreich zur Liste hinzugefügt.")

API Call für ' biometric sensors ' erfolgreich: {'results': {'total_num_results': 125048, 'total_num_pages': 100, 'many_results': False, 'num_page': 0, 'cluster': [{'result': [{'id': 'patent/US9043048B2/en', 'rank': 0, 'patent': {'title': ' RF biometric ignition control system', 'snippet': ' What is claimed is: 1. A vehicle electronic ignition control arrangement, comprising: a keyfob including: a first <b>biometric sensor</b> configured to sense a first identifying biological characteristic of a user; and a transmitter configured to transmit an air-borne signal indicative of the first &hellip;', 'priority_date': '2011-10-13', 'filing_date': '2011-10-13', 'grant_date': '2015-05-26', 'publication_date': '2015-05-26', 'inventor': 'Jesus Manotas, JR.', 'assignee': 'Panasonic Automotive Systems Company Of America, Division Of Panasonic &hellip;', 'publication_number': 'US9043048B2', 'language': 'en', 'thumbnail': '56/90/d2/f186b4bb002cc7/US09043048-20150526-D00000.png', 'pdf': 'f0/46/13/0b



PDF # 16 erfolgreich zur Liste hinzugefügt.
PDF # 17 erfolgreich zur Liste hinzugefügt.
PDF # 18 erfolgreich zur Liste hinzugefügt.
PDF # 19 erfolgreich zur Liste hinzugefügt.
PDF # 20 erfolgreich zur Liste hinzugefügt.


In [282]:
#Loop for multiple Google Patents API calls with Classifications
for i in short_class_list:
    openai_response = i #Search String for Google Patents
    url_base = "https://serpapi.com/search.html?engine=google_patents"
    query = openai_response.replace(" ", "+")
    url = url_base + "&q=" + query + "&api_key=" + patent_api_key

    # API call Google Patents
    response = requests.get(url)

    # Check if API call was successful
    if response.status_code == 200:
        data = response.json() #write json-answer in var
        print("API Call für '",openai_response,"' erfolgreich:",data)
    else:
        print(f"Error with API request: Status code {response.status_code}")

    for a in data['results']['cluster'][0]['result']:
        print("Lese Link ein:",patent_base_url+a['patent']['pdf'])
        if not a['patent']['pdf']: #control if there is an existing url in the meta data
            print("No URL found in meta data for PDF #",count,"This PDF will be skipped.")
            continue
        loader = PyPDFLoader(patent_base_url+a['patent']['pdf'])
        pdf = loader.load_and_split()
        count+=1
        if pdf == []: #Falls PDF nicht maschinenlesbar ist, dieses überspringen; vllt. noch extra Liste anlegen mit atypischen PDFs
            print("PDF nicht Maschinenlesbar")
        else:
            pdf_list.append(pdf[0])
            print("PDF #",count,"erfolgreich zur Liste hinzugefügt.")

API Call für ' G07C9/00 ' erfolgreich: {'results': {'total_num_results': 177766, 'total_num_pages': 100, 'many_results': False, 'num_page': 0, 'cluster': [{'result': [{'id': 'patent/US8965287B2/en', 'rank': 0, 'patent': {'title': ' Battery powered passive keyless entry system for premise entry', 'snippet': ' A passive keyless entry (PKE) system, comprising a DC power source and a base station with a housing that includes a first portion being made of a first material that shields radio frequency (RF) signaling and a second portion being made of a second material that permits RF signaling, is &hellip;', 'priority_date': '2011-04-01', 'filing_date': '2012-04-02', 'grant_date': '2015-02-24', 'publication_date': '2015-02-24', 'inventor': 'Tony Lam', 'assignee': 'Tony Lam', 'publication_number': 'US8965287B2', 'language': 'en', 'thumbnail': 'ba/12/af/6489ee4baa4d68/US08965287-20150224-D00000.png', 'pdf': 'e6/c3/6d/7941b6d4a6f17c/US8965287.pdf', 'figures': [{'thumbnail': '58/42/a2/0bec2d9a36

In [292]:
print(len(pdf_list),"PDFs will be added to the vector database.")

37 PDFs will be added to the vector database.


In [288]:
#Login MongoDB with User and specific database
uri = "mongodb+srv://timmey:faB8MFdyyb7zWvVr@llm-ttt.8kqrnka.mongodb.net/?retryWrites=true&w=majority"

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

DB_NAME = "llm-ttt"
COLLECTION_NAME = "pdfresults"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_index"

MONGODB_COLLECTION = client[DB_NAME][COLLECTION_NAME]

# insert the documents in MongoDB Atlas with their embedding
vector_search = MongoDBAtlasVectorSearch.from_documents(
    documents=pdf_list, 
    embedding=OpenAIEmbeddings(disallowed_special=()),
    collection=MONGODB_COLLECTION,
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)

In [289]:
# Perform a similarity search with Score between the embedding of the query and the embeddings of the documents
query = str(example_abstract)

results = vector_search.similarity_search_with_score(
    query=query,
    k=5,
)

# Display results
for result in results:
    print(result)

(Document(page_content='USOO8694.793B2 \n (12) United States Patent (10) Patent No.: US 8,694.793 B2 \n Evans (45) Date of Patent: Apr. 8, 2014 \n (54) BIOMETRIC ACCESS CONTROL SE A RE E. s et al are, Jr. et al. TRANSACTIONS 5,870,723 A 2/1999 Pare, Jr. et al. \n 5,870,895 A 2/1999 Fally (75) Inventor: James Douglas Evans, Livermore, CA 5,982,914 A 1 1/1999 Lee et al. \n (US) 6,012,039 A 1/2000 Hoffman et al. \n D425,873 S 5/2000 Anderson et al. \n 6,131.464 A 10/2000 Pare, Jr. et al. (73) Assignee: Visa U.S.A. Inc., San Francisco, CA 6,154,879 A 1 1/2000 Pare, Jr. et al. \n (US) 6,182,892 B1* 2/2001 Angelo et al. ................ 235,380 \n (*) Notice: Subject to any disclaimer, the term of this (Continued) \n patent is extended or adjusted under 35 FOREIGN PATENT DOCUMENTS U.S.C. 154(b) by 1124 days. \n AU T5O154 B2 T 2002 (21) Appl. No.: 12/001.502 KR 10-2001-0054151 A 7, 2001 \n (22) Filed: Dec. 11, 2007 (Continued) \n OTHER PUBLICATIONS \n (65) Prior Publication Data \n Internatio

In [290]:
for result in results:
    print("Übereinstimmung:",round(result[1]*100,2),"%; Quelle:", result[0].metadata['source'])


Übereinstimmung: 90.36 %; Quelle: https://patentimages.storage.googleapis.com/4f/e4/20/f1d019f5b3d37c/US8694793.pdf
Übereinstimmung: 90.23 %; Quelle: https://patentimages.storage.googleapis.com/f7/4b/67/1a4ad0e3f4f652/US8952781.pdf
Übereinstimmung: 90.15 %; Quelle: https://patentimages.storage.googleapis.com/f0/46/13/0b433a0b59b7fa/US9043048.pdf
Übereinstimmung: 89.89 %; Quelle: https://patentimages.storage.googleapis.com/ac/6a/97/40391305f14e74/US6877097.pdf
Übereinstimmung: 89.26 %; Quelle: https://patentimages.storage.googleapis.com/cd/0b/31/56362354d472bf/US9672674.pdf


In [293]:
#clear the vector database
x = MONGODB_COLLECTION.delete_many({})
print(x.deleted_count, " documents deleted.")

37  documents deleted.
