In [13]:
#from langchain.output_parsers import StructuredOutputParser
from langchain.output_parsers import ResponseSchema
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os
import openai
from dotenv import load_dotenv, find_dotenv
import json
import re
import tiktoken
from Levenshtein import distance


In [14]:
openai.api_key = os.environ['OPENAI_API_KEY']
llm_model = "gpt-4"

def get_completion(prompt, model=llm_model):
    messages = [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0,
    )
    return response.choices[0].message["content"]

In [15]:
loader = PyPDFLoader("docs/Biometric Vehicle Access System.pdf")
example_abstract = loader.load_and_split()

In [30]:
enc = tiktoken.get_encoding("cl100k_base")

In [16]:
prompt1 = f"""```{example_abstract}```\
The abstract above describes a concept for a novel invention.\
I would like to search a patent database to find out whether \
there are already patents for such a concept. Name 5 phrases that I can \
use for the search. Each phrase should contain between 5 to 10 words. \
Optimize the phrases to get back more results.
"""

keywords = get_completion(prompt1)

print(keywords)

1. "Biometric Vehicle Access System technology"
2. "Advanced biometric authentication in vehicles"
3. "Biometric sensors in door handles and ignition systems"
4. "Anti-spoofing measures in vehicle access"
5. "Integration of biometrics in smart vehicles"


In [23]:
prompt2 = f"""```{example_abstract}```\
The abstract above describes a concept for a novel invention.\
I would like to search a patent database to find out whether \
there are already patents for such a concept. Please list me the codes of the 5 most relevant \
USPTO classifications to a possible patent for this concept without explanations for the codes.
"""

classifications = get_completion(prompt2)

print(classifications)

1. G06K9/00
2. B60R25/10
3. G07C9/00
4. E05B49/00
5. G06F21/32


In [24]:
f = open('data_dump/test.json')
data = json.load(f)

for i in data['organic_results']:

    print("Result", i['position'], ": ", i['title'])
    # print(i['snippet'], "\n")
    print(i['pdf'], "\n")

Result 1 :  Method and system for localizing parts of an object in an image for computer …
https://patentimages.storage.googleapis.com/a2/95/38/bcc3b288e56669/US9275273.pdf 

Result 2 :  Adaptive multi-modal integrated biometric identification and surveillance …
https://patentimages.storage.googleapis.com/98/f0/22/ae169ef67be168/US9432632.pdf 

Result 3 :  Spoof detection for biometric authentication
https://patentimages.storage.googleapis.com/84/29/92/5fe2a153298d2d/US9971920.pdf 

Result 4 :  Liveness testing methods and apparatuses and image processing methods and …
https://patentimages.storage.googleapis.com/b1/98/34/94c48fd1f99eeb/US11151397.pdf 

Result 5 :  Biometrics based on locally consistent features
https://patentimages.storage.googleapis.com/a6/11/c7/5583de7c8fb29d/US9060688.pdf 

Result 6 :  System and method for detecting the authenticity of products
https://patentimages.storage.googleapis.com/9c/e5/fc/eb90d460e518de/US10956732.pdf 

Result 7 :  System and process for au

In [26]:
pdf_list = []
for i in data['organic_results']:
    loader = PyPDFLoader(i['pdf'])
    pdf = loader.load_and_split()
    pdf_list.append(pdf[0])

In [34]:
counter = 0
count_tokens = 0
for i in pdf_list:
    counter += 1
    abstract = str(i)
    removeHeadline = abstract.find("ABSTRACT")
    abstractOhneHeadline = abstract[removeHeadline+12:] #+12 um Überschrift "ABSTRACT \n " ebenfalls zu entfernen
    #print(abstractOhneHeadline)
    split_abstr = abstractOhneHeadline.split('\\n ')
    abstractOhneN = "".join(split_abstr)
    finddigit = re.search(r"\d",abstractOhneN)
    x = finddigit.start()
    clean_abstract = abstractOhneN[:x]
    if (clean_abstract.count("") > 5):
        print("#",counter,"Clean Abstract:",clean_abstract)
        prompt = f"""```{example_abstract}```\
        Compare the abstract from the text above with the following abstract. Just tell me without explanation, how similar they are in percentage.\
        ```{clean_abstract}```
        """
        response_abstract_comparison = get_completion(prompt)
        erg = enc.encode(prompt)
        tokens = len(erg)
        print("Abstract",counter,"matchin:",response_abstract_comparison,"-> Used Tokens:",tokens)
        count_tokens += tokens
        print("-------------------")
    else:
        print("#",counter,"Failed reading PDF.")
        prompt_fail = f"""
        The following text is mixed up with random words.\
        Extract the abstract.\
        Print the abstract without any additional explanation.\
        \
        ```{i}```
        """

        reconstructed_abstract = get_completion(prompt_fail)
        erg = enc.encode(prompt_fail)
        tokens = len(erg)
        print("Failed abstract reconstructet. Used Tokens:",tokens)
        count_tokens += tokens

        print("#",counter,"Reconstructed Abstract:",reconstructed_abstract)
        prompt = f"""```{example_abstract}```\
        Compare the abstract from the text above with the following abstract. Just tell me without explanation, how similar they are in percentage.\
        ```{reconstructed_abstract}```
        """
        response_abstract_comparison = get_completion(prompt)
        erg = enc.encode(prompt)
        tokens = len(erg)
        print("Abstract",counter,"matchin:",response_abstract_comparison,"-> Used Tokens:",tokens)
        count_tokens += tokens
        print("-------------------")
print("\nInsgesamt Tokens für diese Schleife:",count_tokens)

# 1 Clean Abstract: A system is provided for localizing parts of an object in an image by training local detectors using labeled image exem plars with fiducial points corresponding to parts within the image. Each local detector generates a detector score corre sponding to the likelihood that a desired part is located at a given location within the image exemplar. A non-parametric global model of the locations of the fiducial points is gener ated for each of at least a portion of the image exemplars. An input image is analyzed using the trained local detectors, and a Bayesian objective function is derived for the input image from the non-parametric model and detector scores. The Bayesian objective function is optimized using a consensus of global models, and an output is generated with locations of the fiducial points labeled within the object in the image. 
Abstract 1 matchin: 10% -> Used Tokens: 529
-------------------
# 2 Clean Abstract: A Surveillance system is provided that include

In [36]:
print(len(pdf_list))

0.1
