In [None]:
import os
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
    TextSplitter,
    TokenTextSplitter,
)
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Every path used below hangs off the directory the notebook is run from.
current_dir = os.getcwd()
db_dir = os.path.join(current_dir, "db")  # parent folder of the persisted vector stores
file_path = os.path.join(current_dir, "books", "odyssey.txt")  # text file to index

In [None]:
# Load the book from disk as LangChain documents.
# The encoding is pinned rather than left to the platform default (which is not
# UTF-8 on every system), so the cell behaves the same everywhere. "utf-8-sig"
# additionally drops the byte-order mark this file starts with, so the BOM no
# longer ends up inside the first chunk.
loader = TextLoader(file_path=file_path, encoding="utf-8-sig")
documents = loader.load()

In [None]:
# Split the loaded documents into chunks: target size 1000, no overlap between
# neighbouring chunks. `texts` is what gets embedded further down.
text_splitter = CharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
texts = text_splitter.split_documents(documents)



In [None]:
# Quick sanity check on the split: how many chunks there are, then the first
# chunk in full as a sample.
print("\n--- Document Chunks Information ---")
chunk_count = len(texts)
print(f"Number of document chunks: {chunk_count}")
sample_chunk = texts[0].page_content
print(f"Sample chunk:\n{sample_chunk}\n")


--- Document Chunks Information ---
Number of document chunks: 826
Sample chunk:
﻿The Project Gutenberg eBook of The Odyssey
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: The Odyssey

Author: Homer

Translator: Samuel Butler

Release date: April 1, 1999 [eBook #1727]
                Most recently updated: December 2, 2023

Language: English

Credits: Jim Tinsley and David Widger


*** START OF THE PROJECT GUTENBERG EBOOK THE ODYSSEY ***


[Illustration]


The Odyssey

by Homer

rendered into English prose for the use of those who cannot read the
original

Contents



In [None]:
def create_vector_store(docs, embeddings, store_name):
    """Build and persist a Chroma store for ``docs`` unless its folder already exists.

    The store is written to ``<db_dir>/<store_name>``, where ``db_dir`` is the
    notebook-level variable. "Already exists" is decided solely by that path
    being present on disk; the folder's contents are not inspected.

    Args:
        docs: Documents passed to ``Chroma.from_documents``.
        embeddings: Embedding object passed to ``Chroma.from_documents``.
        store_name: Name of the sub-folder under ``db_dir`` for this store.

    Returns:
        None. Progress is reported with ``print``.
    """
    target_dir = os.path.join(db_dir, store_name)

    # Guard clause: an existing folder is treated as an already-built store.
    if os.path.exists(target_dir):
        print(
            f"Vector store {store_name} already exists. No need to initialize.")
        return

    print(f"\n--- Creating vector store {store_name} ---")
    Chroma.from_documents(docs, embeddings, persist_directory=target_dir)
    print(f"--- Finished creating vector store {store_name} ---")

In [None]:
# Hugging Face embeddings
# The embedding model is pulled from the Hugging Face hub by name.
# Note: running a Hugging Face model locally incurs no direct cost other than
# your own computational resources.
# Note: other models are listed at https://huggingface.co/models?other=embeddings
print("\n--- Using Hugging Face Transformers ---")
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
huggingface_embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)


--- Using Hugging Face Transformers ---


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Embed the chunks with the Hugging Face model and persist them under
# db/chroma_db_huggingface (skipped when that folder already exists).
create_vector_store(texts, huggingface_embeddings, "chroma_db_huggingface")
# Only Hugging Face embeddings are used in this notebook, so the completion
# message names just that provider.
print("Embedding demonstration for Hugging Face completed.")


--- Creating vector store chroma_db_huggingface ---
--- Finished creating vector store chroma_db_huggingface ---
Embedding demonstrations for OpenAI and Hugging Face completed.


In [None]:
def query_vector_store(store_name, query, embedding_function, k=3, score_threshold=0.1):
    """Search a persisted Chroma store, print the matches, and return them.

    The store is opened from ``<db_dir>/<store_name>``, where ``db_dir`` is the
    notebook-level variable, and queried through a retriever using the
    ``"similarity_score_threshold"`` search type.

    Args:
        store_name: Name of the sub-folder under ``db_dir`` that holds the store.
        query: Query text passed to the retriever.
        embedding_function: Embedding object passed to ``Chroma``.
            NOTE(review): presumably this must be the same model that built the
            store -- confirm.
        k: Value forwarded as ``search_kwargs["k"]``. Defaults to 3, the value
            that used to be hard-coded.
        score_threshold: Value forwarded as ``search_kwargs["score_threshold"]``.
            Defaults to 0.1, the value that used to be hard-coded.

    Returns:
        The list of documents produced by the retriever, or an empty list when
        the store folder does not exist.
    """
    persistent_directory = os.path.join(db_dir, store_name)
    if not os.path.exists(persistent_directory):
        print(f"Vector store {store_name} does not exist.")
        return []

    print(f"\n--- Querying the Vector Store {store_name} ---")
    db = Chroma(
        persist_directory=persistent_directory,
        embedding_function=embedding_function,
    )
    retriever = db.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"k": k, "score_threshold": score_threshold},
    )
    relevant_docs = retriever.invoke(query)

    # Display the relevant results with metadata
    print(f"\n--- Relevant Documents for {store_name} ---")
    if not relevant_docs:
        # Say so explicitly; otherwise an empty result is just a bare header.
        print("No documents met the score threshold.")
    for i, doc in enumerate(relevant_docs, 1):
        print(f"Document {i}:\n{doc.page_content}\n")
        if doc.metadata:
            print(f"Source: {doc.metadata.get('source', 'Unknown')}\n")
    return relevant_docs

In [None]:
# The question to look up in the book.
query = "Who is Odysseus' wife?"

# Search the Hugging Face-backed vector store with that question.
query_vector_store(
    store_name="chroma_db_huggingface",
    query=query,
    embedding_function=huggingface_embeddings,
)

print("Querying demonstrations completed.")


--- Querying the Vector Store chroma_db_huggingface ---

--- Relevant Documents for chroma_db_huggingface ---
Document 1:
Thus did he speak, and they went on board even as he had said. But as
Telemachus was thus busied, praying also and sacrificing to Minerva in
the ship’s stern, there came to him a man from a distant country, a
seer, who was flying from Argos because he had killed a man. He was
descended from Melampus, who used to live in Pylos, the land of sheep;
he was rich and owned a great house, but he was driven into exile by
the great and powerful king Neleus. Neleus seized his goods and held
them for a whole year, during which he was a close prisoner in the
house of king Phylacus, and in much distress of mind both on account of
the daughter of Neleus and because he was haunted by a great sorrow
that dread Erinys had laid upon him. In the end, however, he escaped
with his life, drove the cattle from Phylace to Pylos, avenged the
wrong that had been done him, and gave the daugh

In [None]:
import shutil

# Zip the vector-store folder so it can be fetched as a single file.
# Both the archive name and the folder being zipped are derived from db_dir
# rather than a literal '/content/db', so this cell follows wherever the stores
# were actually written. The archive is created as '<db_dir>.zip', which is
# '/content/db.zip' whenever the working directory is '/content'.
shutil.make_archive(db_dir, 'zip', db_dir)

'/content/db.zip'

In [None]:
from google.colab import files

# Hand the zipped vector stores to the browser as a download (needs google.colab).
# The path is built from db_dir -- the folder the stores are written under --
# instead of repeating a literal; it equals '/content/db.zip' whenever the
# working directory is '/content'.
files.download(f"{db_dir}.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>