In [None]:
import os
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
import warnings

# Suppress every warning for the remainder of the kernel session so the cell
# outputs stay readable.
# NOTE(review): this also hides deprecation warnings from the imported
# libraries; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")

In [5]:
# Resolve the input text file and the Chroma persistence directory relative to
# the directory the kernel was started in.
current_dir = os.getcwd()
file_path, presisdent_directory = (
    os.path.join(current_dir, *parts)
    for parts in (("books", "odyssey.txt"), ("db", "chroma_db"))
)

In [6]:
# Build the Chroma vector store once; later runs reuse what is already on disk.
# A directory that exists but is empty (for example, left behind by an
# interrupted run) holds no index, so it is treated like a missing directory.
store_is_missing = not (
    os.path.isdir(presisdent_directory) and os.listdir(presisdent_directory)
)

if store_is_missing:
    print("Persistent directory does not exist. Initializing vector store...")

    # Fail early with a clear message when the source text is not where we expect it.
    if not os.path.exists(file_path):
        raise FileNotFoundError(
            f"The file {file_path} does not exist. Please check the path."
        )

    # Decode explicitly instead of relying on the platform default encoding.
    # "utf-8-sig" reads UTF-8 and drops a leading byte-order mark, which would
    # otherwise survive as an invisible character at the start of the first chunk.
    # NOTE(review): assumes the book file is UTF-8 encoded - confirm for other inputs.
    loader = TextLoader(file_path, encoding="utf-8-sig")
    documents = loader.load()

    # Split the loaded text into chunks of roughly 1000 characters, no overlap.
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)

    # An empty input would otherwise surface as an opaque IndexError below.
    if not docs:
        raise ValueError(f"No text chunks were produced from {file_path}.")

    print("\n--- Document Chunks Information ---")
    print(f"Number of document chunks: {len(docs)}")
    print(f"Sample chunk:\n{docs[0].page_content}\n")

    # Instantiate the sentence-transformers embedding model used to vectorise chunks.
    print("\n--- Creating embeddings ---")
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    print("\n--- Embedding Created ---")

    # Embed every chunk and persist the resulting store under presisdent_directory.
    print("\n--- Creating vector store ---")
    db = Chroma.from_documents(docs, embeddings, persist_directory=presisdent_directory)
    print("\n--- Vector store created ---")

else:
    print("Vector store already exists. No need to initialize.")



Persistent directory does not exist. Initializing vector store...

--- Document Chunks Information ---
Number of document chunks: 826
Sample chunk:
﻿The Project Gutenberg eBook of The Odyssey
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: The Odyssey

Author: Homer

Translator: Samuel Butler

Release date: April 1, 1999 [eBook #1727]
                Most recently updated: December 2, 2023

Language: English

Credits: Jim Tinsley and David Widger


*** START OF THE PROJECT GUTENBERG EBOOK THE ODYSSEY ***


[Illustration]


The Odyssey

by Homer

rendered into English prose for the use 

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


--- Embedding Created ---

--- Creating vector store ---

--- Vector store created ---


In [7]:
import shutil

# Zip the whole db directory so the persisted vector store can be downloaded
# as a single file; the call's return value (the archive path) is the cell output.
shutil.make_archive(base_name='/content/db', format='zip', root_dir='/content/db')

'/content/db.zip'

In [8]:
from google.colab import files

# Hand the zipped vector store to the browser as a download (Colab only).
archive_path = '/content/db.zip'
files.download(archive_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### 1b_rag_basics

In [12]:
# Locate the Chroma store persisted earlier, relative to the kernel's working directory.
current_dir = os.getcwd()
chroma_subpath = ("db", "chroma_db")
presisdent_directory = os.path.join(current_dir, *chroma_subpath)

In [13]:
# Use the same embedding model that was used when the store was built, so
# query vectors are comparable with the stored document vectors.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

In [14]:
# Open the vector store persisted under presisdent_directory, embedding
# incoming queries with the model configured above.
db = Chroma(
    embedding_function=embeddings,
    persist_directory=presisdent_directory,
)

In [15]:
# Retrieval settings: request at most 3 documents and apply a 0.4 score threshold.
retrieval_settings = {"k": 3, "score_threshold": 0.4}

# Expose the store as a retriever that filters results by similarity score.
retreiver = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs=retrieval_settings,
)

In [16]:
# Natural-language question that is handed to the retriever.
query = "Who is Odysseus' wife?"

In [17]:
# Run the query through the retriever; the result is iterated below as a
# sequence of documents exposing page_content and metadata.
relevant_docs = retreiver.invoke(query)

In [20]:
# Show each retrieved chunk, numbered from 1, followed by its source path
# whenever the chunk carries metadata.
print("\n--- Relevant Documents ---")
for position, hit in enumerate(relevant_docs, start=1):
    print(f"Document {position}:\n{hit.page_content}\n")
    if not hit.metadata:
        continue
    origin = hit.metadata.get('source', 'Unknown')
    print(f"Source: {origin}\n")


--- Relevant Documents ---
Document 1:
[96] [ I have already said in a note on bk. xi. 186 that at this point
of Ulysses’ voyage Telemachus could only be between eleven and twelve
years old.]

[97] [ Is the writer a man or a woman?]

[98] [ Cf. “Il.” iv. 521, {Greek}. The Odyssean line reads, {Greek}.
The famous dactylism, therefore, of the Odyssean line was probably
suggested by that of the Ileadic rather than by a desire to accommodate
sound to sense. At any rate the double coincidence of a dactylic line,
and an ending {Greek}, seems conclusive as to the familiarity of the
writer of the “Odyssey” with the Iliadic line.]

Source: /content/books/odyssey.txt

