Following Instructions at https://python.langchain.com/docs/tutorials/retrievers/

In [1]:
from langchain_core.documents import Document


# One way of feeding data
documents = [
    Document(
        page_content="Dogs are great companions, known for their loyalty and friendliness.",
        metadata={"source": "mammal-pets-doc"},
    ),
    Document(
        page_content="Cats are independent pets that often enjoy their own space.",
        metadata={"source": "mammal-pets-doc"},
    ),
]

In [None]:
from langchain_community.document_loaders import TextLoader

# TODO: Use a different text, better suited to RAG
# Another way of feeding data
file_path = "./gullivers-travels.txt"
loader = TextLoader(file_path, encoding='utf-8')

docs = loader.load()

print(len(docs))

1


In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Using Gullivers Travels


text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
all_splits = text_splitter.split_documents(docs)

len(all_splits)

print(all_splits)



In [4]:
import getpass
import os
from dotenv import load_dotenv

load_dotenv()
if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [5]:
vector_1 = embeddings.embed_query(all_splits[0].page_content)
vector_2 = embeddings.embed_query(all_splits[1].page_content)

assert len(vector_1) == len(vector_2)
print(f"Generated vectors of length {len(vector_1)}\n")
print(vector_1[:10])

Generated vectors of length 3072

[-0.01735343411564827, 0.02396426722407341, -0.004915381781756878, 0.009431833401322365, -0.0010373948607593775, -0.01607116125524044, 0.018535975366830826, 0.006265330594033003, -0.03598914295434952, 0.055280234664678574]


In [6]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams


client = QdrantClient(":memory:")
# You must know the embedding dimension. Replace 1536 with your model's actual dimension.
client.recreate_collection(
    collection_name="test",
    vectors_config=VectorParams(size=3072, distance=Distance.COSINE),
)

# Now it’s safe to pass into LangChain
vector_store = QdrantVectorStore(
    client=client,
    collection_name="test",
    embedding=embeddings,
)

  client.recreate_collection(


In [7]:
ids = vector_store.add_documents(documents=all_splits)

Ok, so Gullivers Travels admittedly isn't the best piece of written work for Vector DB searching. 
The text doesn't exactly list out a set of facts or easily understandable statements. ¯\\\_(ツ)_/¯

I'll replace that eventually


In [8]:
results = await vector_store.asimilarity_search("giant")

print(results[0])

page_content='This writer went through all the usual topics of European moralists,
showing how diminutive, contemptible, and helpless an animal was man in
his own nature; how unable to defend himself from inclemencies of the
air, or the fury of wild beasts; how much he was excelled by one
creature in strength, by another in speed, by a third in foresight, by a
fourth in industry. He added, that nature was degenerated in these
latter declining ages of the world, and could now produce only small
births, in comparison to those in ancient times. He said, it was very
reasonable to think, not only that the species of men were originally
much larger, but also, that there must have been giants in former ages;
which as it is asserted by history and tradition, so it hath been
confirmed by huge bones and skulls, casually dug up in several parts of
the kingdom, far exceeding the common dwindled race of man in our days.
He argued, that the very laws of nature absolutely required we should' metadata