In [20]:
from langchain.schema import Document
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from langchain_openai import OpenAIEmbeddings
from pymongo import MongoClient
from pymongo.server_api import ServerApi

In [31]:
patent_data = {
    "LD620FU7":{
        "pdf": "link.pdf",
        "title": "Richtiger Titel",
        "abstract": "Lorem marmelade",
        "description": "amet",
        "claim": "est",
    },
    "TE95BBR7":{
        "pdf": "link.pdf",
        "title": "Test Title",
        "abstract": "Lorem ipsum",
        "description": "dolor sit",
        "claim": "Consecetur",
    },
    "NAZE296N":{
        "pdf": "link.pdf",
        "title": "Frühstücks Überschrift",
        "abstract": "Banase Kaffee kakao Nutella Toastbrot",
        "description": "Frühstücksei",
        "claim": "Orangensaft",
    }
}

In [37]:
patent_list = []

for patent_id, data in patent_data.items():
    page_content = f"{data['title']} {data['abstract']} {data['description']} {data['claim']}"
    metadata = {"patent_id": patent_id}
    patent_list.append(Document(page_content=page_content, metadata=metadata))

In [38]:
patent_list

[Document(page_content='Richtiger Titel Lorem marmelade amet est', metadata={'patent_id': 'LD620FU7'}),
 Document(page_content='Test Title Lorem ipsum dolor sit Consecetur', metadata={'patent_id': 'TE95BBR7'}),
 Document(page_content='Frühstücks Überschrift Banase Kaffee kakao Nutella Toastbrot Frühstücksei Orangensaft', metadata={'patent_id': 'NAZE296N'})]

In [51]:
content = "Lorem ipsum"

#Login MongoDB with User and specific database
uri = "mongodb+srv://timmey:faB8MFdyyb7zWvVr@llm-ttt.8kqrnka.mongodb.net/?retryWrites=true&w=majority"


# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

DB_NAME = "llm-ttt"
COLLECTION_NAME = "pdfresults"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_index"

MONGODB_COLLECTION = client[DB_NAME][COLLECTION_NAME]       

# insert the documents in MongoDB Atlas with their embedding
vector_search = MongoDBAtlasVectorSearch.from_documents(
    documents=patent_list,
    embedding=OpenAIEmbeddings(disallowed_special=()),
    collection=MONGODB_COLLECTION,
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)
# Perform a similarity search with Score between the embedding of the query and the embeddings of the documents
#progress(0.9, desc="Compare the patents")
query = str(content)

results = vector_search.similarity_search_with_score(
    query=query,
    k=20, #Output for the top n results
)

# Display results
#for result in results:
#    print(result)

formatted_results = []
formatted_result = ""
for result in results:
    formatted_result = ("ID: {}; Übereinstimmung: {}%".format(result[0].metadata['patent_id'],round(result[1] * 100, 2)))
# Append the formatted result to the list
    formatted_results.append(formatted_result)
        
#result.live(formatted_result)
print(formatted_results)

['ID: TE95BBR7; Übereinstimmung: 93.92%', 'ID: TE95BBR7; Übereinstimmung: 93.92%', 'ID: TE95BBR7; Übereinstimmung: 93.92%', 'ID: LD620FU7; Übereinstimmung: 93.01%', 'ID: LD620FU7; Übereinstimmung: 93.01%']


In [54]:
vector_result = {}

for result in results:
    vector_result[result[0].metadata['patent_id']] = result[1]


vector_result

{'TE95BBR7': 0.939172625541687, 'LD620FU7': 0.9301292896270752}

In [18]:
def clear_db(): #clear the vector database
    uri = "mongodb+srv://timmey:faB8MFdyyb7zWvVr@llm-ttt.8kqrnka.mongodb.net/?retryWrites=true&w=majority"

    # Create a new client and connect to the server
    client = MongoClient(uri, server_api=ServerApi('1'))

    DB_NAME = "llm-ttt"
    COLLECTION_NAME = "pdfresults"
    ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_index"

    MONGODB_COLLECTION = client[DB_NAME][COLLECTION_NAME]
    
    x = MONGODB_COLLECTION.delete_many({})
    delete = str(x.deleted_count) + " documents deleted."
    return delete

In [53]:
clear_db()

'12 documents deleted.'