In [2]:
!pip install langchain-community pypdf

Collecting pypdf
  Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Downloading pypdf-5.1.0-py3-none-any.whl (297 kB)
Installing collected packages: pypdf
Successfully installed pypdf-5.1.0




In [18]:
import getpass
import os
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from dotenv import load_dotenv
from langchain_core.vectorstores import InMemoryVectorStore

# Load variables from a local .env file first so that keys stored there are
# visible to the checks below.
load_dotenv()

# Enable LangSmith tracing for this session.
os.environ["LANGSMITH_TRACING"] = "true"

# Secrets must never be hard-coded in the notebook: read them from the
# environment (or .env) and fall back to an interactive prompt when missing.
# Any key that was previously committed in this cell should be treated as
# leaked and rotated.
if not os.environ.get("LANGSMITH_API_KEY"):
    os.environ["LANGSMITH_API_KEY"] = getpass.getpass("Enter your LangSmith API key: ")
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")
openai_api_key = os.environ["OPENAI_API_KEY"]

# Choosing llm model
llm = ChatOpenAI(model="gpt-4o-mini")

# Choosing embeddings model
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Vector store (kept in memory only; rebuilt every time this cell runs)
vector_store = InMemoryVectorStore(embeddings)

In [3]:
# Two small hand-written sample documents that share the same source label.
# Each Document gets its own metadata dict.
documents = [
    Document(page_content=text, metadata={"source": "mammal-pets-doc"})
    for text in (
        "Dogs are great companions, known for their loyalty and friendliness.",
        "Cats are independent pets that often enjoy their own space.",
    )
]

In [6]:
from langchain_community.document_loaders import PyPDFLoader

# Path of the contract PDF, relative to the notebook's working directory.
file_path = "LTC-Merradias-Ltd.-Solas-Express-Inc..pdf"

# Load the PDF into a list of Document objects; the recorded metadata shows
# one entry per page, tagged with 'source', 'page' and 'page_label'.
loader = PyPDFLoader(file_path=file_path)
docs = loader.load()



6


In [10]:
# Sanity-check the load: number of loaded Document objects, then a preview
# (first 200 characters) and the metadata of the first one.
print("Length of the document:", len(docs))
print("Description:", docs[0].page_content[:200], end="\n\n")
print("Metadata:", docs[0].metadata)

Length of the document: 6
Description: Merradias Corporate Ltd.                            Est 1974 
 
Long-term Fish Supply and Distribution Contract 
This Long -term Fish Purchase and Distribution Contract (the „Contract“) is made into a

Metadata: {'source': 'LTC-Merradias-Ltd.-Solas-Express-Inc..pdf', 'page': 0, 'page_label': '1'}


In [16]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings gathered in one place so they are easy to tune.
# add_start_index=True stores each chunk's offset in its metadata under
# 'start_index' (visible in the search results further down).
splitter_options = {
    "chunk_size": 250,
    "chunk_overlap": 50,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break the loaded pages into small, overlapping chunks for embedding.
all_splits = text_splitter.split_documents(docs)

# Number of chunks produced.
len(all_splits)

48

In [17]:
# Embed the first two chunks to confirm the embedding model is reachable and
# returns vectors of a consistent dimensionality.
vector_1 = embeddings.embed_query(all_splits[0].page_content)
vector_2 = embeddings.embed_query(all_splits[1].page_content)

assert len(vector_2) == len(vector_1)

# Show the dimensionality followed by the first ten components.
print("Generated vectors of length", len(vector_1), end="\n\n")
print(vector_1[:10])

Generated vectors of length 3072

[0.017686249688267708, 0.012169010005891323, 0.0008824669639579952, -0.006344825495034456, -0.011360851116478443, -0.031797949224710464, -0.032233111560344696, 0.013894119299948215, -0.045847486704587936, 0.04404466971755028]


In [19]:
# Index every chunk in the vector store.
# Deterministic ids ("<source>#<position>") make this cell idempotent: without
# explicit ids, re-running the cell would insert every chunk again under fresh
# ids and skew the similarity results; with them, a re-run overwrites the
# existing entries instead.
ids = vector_store.add_documents(
    documents=all_splits,
    ids=[
        f"{split.metadata.get('source', 'document')}#{position}"
        for position, split in enumerate(all_splits)
    ],
)

In [23]:
# Synchronous similarity search: fetch the chunks closest to the question
# and show the top-ranked one.
results = vector_store.similarity_search(
    query="What are some highlighted information that I should consider?",
)

print(results[0])

page_content='-Demand appropriate Purchase Price reduction.' metadata={'source': 'LTC-Merradias-Ltd.-Solas-Express-Inc..pdf', 'page': 2, 'page_label': '3', 'start_index': 1819}


In [22]:
results = await vector_store.asimilarity_search("What are some highlighted information that I should consider?")

print(results[0])

page_content='-Demand appropriate Purchase Price reduction.' metadata={'source': 'LTC-Merradias-Ltd.-Solas-Express-Inc..pdf', 'page': 2, 'page_label': '3', 'start_index': 1819}


In [26]:
# Search again, this time also retrieving a score for each hit.
# NOTE(review): presumably a higher score means a closer match for this
# store — confirm against the InMemoryVectorStore documentation.
results = vector_store.similarity_search_with_score(
    query="Buyer shall give written notice of the defect to the Seller and not later than within how many days?",
)

# Each hit is a (document, score) pair; unpack the best one.
doc, score = results[0]
print("Score:", score, end="\n\n")
print(doc)

Score: 0.7429429727422725

page_content='4. Should the Buyer discover any defects during the Warranty Period, the Buyer shall give 
written notice of the defect to the Seller and not later than within 15 days after such defect had' metadata={'source': 'LTC-Merradias-Ltd.-Solas-Express-Inc..pdf', 'page': 2, 'page_label': '3', 'start_index': 1396}
