In [2]:
!unzip /content/books.zip
!unzip /content/db.zip

Archive:  /content/books.zip
   creating: books/
  inflating: books/adventures_of_huckleberry_finn.txt  
  inflating: books/adventures_of_sherlock_holmes.txt  
  inflating: books/declaration_of_independence_of_the_united_states.txt  
  inflating: books/frankenstein.txt  
  inflating: books/iliad.txt         
  inflating: books/langchain_demo.txt  
  inflating: books/moby_dick.txt     
  inflating: books/odyssey.txt       
  inflating: books/pride_and_prejudice.txt  
  inflating: books/romeo_and_juliet.txt  
  inflating: books/romeo_and_juliet (1).txt  
  inflating: books/strange_case_of_dr_jekyll_and_mr_hyde.txt  
  inflating: books/tale_of_two_cities.txt  
  inflating: books/ulysses.txt       
  inflating: books/us_bill_of_rights.txt  
  inflating: books/war_and_peace.txt  
Archive:  /content/db.zip
   creating: db/
   creating: db/chroma_db/
   creating: db/chroma_db/f681035a-17e3-44db-a5ad-6f5df64368c0/
  inflating: db/chroma_db/f681035a-17e3-44db-a5ad-6f5df64368c0/data_level0.bin  

In [1]:
!pip install -U langchain_community chromadb

  Attempting uninstall: opentelemetry-sdk
    Found existing installation: opentelemetry-sdk 1.16.0
    Uninstalling opentelemetry-sdk-1.16.0:
      Successfully uninstalled opentelemetry-sdk-1.16.0
  Attempting uninstall: langchain-core
    Found existing installation: langchain-core 0.3.33
    Uninstalling langchain-core-0.3.33:
      Successfully uninstalled langchain-core-0.3.33
  Attempting uninstall: langchain-text-splitters
    Found existing installation: langchain-text-splitters 0.3.5
    Uninstalling langchain-text-splitters-0.3.5:
      Successfully uninstalled langchain-text-splitters-0.3.5
  Attempting uninstall: langchain
    Found existing installation: langchain 0.3.17
    Uninstalling langchain-0.3.17:
      Successfully uninstalled langchain-0.3.17
Successfully installed asgiref-3.8.1 backoff-2.2.1 bcrypt-4.2.1 build-1.2.2.post1 chroma-hnswlib-0.7.6 chromadb-0.6.3 coloredlogs-15.0.1 dataclasses-json-0.6.7 durationpy-0.9 fastapi-0.115.8 httptools-0.6.4 httpx-sse-0.4.0 

In [3]:
import os
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
import warnings

warnings.filterwarnings("ignore")

In [None]:
current_dir = os.getcwd()
books_dir = os.path.join(current_dir, "books")
db_dir = os.path.join(current_dir, "db")
persistent_directory = os.path.join(db_dir, "chroma_db_with_metadata")

In [5]:
print(f"Books directory: {books_dir}")
print(f"Persistent directory: {persistent_directory}")

Books directory: /content/books
Persistent directory: /content/db/chroma_db_with_metadata


In [6]:
if not os.path.exists(persistent_directory):
  print("Persistent directory does not exist. Initializing vector store...")

  if not os.path.exists(books_dir):
    raise ValueError(f"Books directory does not exist: {books_dir}")

  book_files = [f for f in os.listdir(books_dir) if f.endswith(".txt")]

  documents = []
  for book_file in book_files:
    loader = TextLoader(os.path.join(books_dir, book_file))
    book_docs = loader.load()

    for doc in book_docs:
      doc.metadata = {"source":book_file}
      documents.append(doc)

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    dos = text_splitter.split_documents(documents)

    print("\n--- Document Chunks Information ---")
    print(f"Number of document chunks: {len(dos)}")

    print("\n--- Creating embeddings ---")

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    print("\n--- Finished creating embeddings ---")

    print("\n--- Creating and persisting vector store ---")
    db = Chroma.from_documents(dos, embeddings, persist_directory=persistent_directory)
    print("\n--- Finished creating and persisting vector store ---")

else:
    print("Vector store already exists. No need to initialize.")




--- Document Chunks Information ---
Number of document chunks: 8378

--- Creating embeddings ---

--- Finished creating embeddings ---

--- Creating and persisting vector store ---





--- Finished creating and persisting vector store ---





--- Document Chunks Information ---
Number of document chunks: 12122

--- Creating embeddings ---

--- Finished creating embeddings ---

--- Creating and persisting vector store ---





--- Finished creating and persisting vector store ---





--- Document Chunks Information ---
Number of document chunks: 13045

--- Creating embeddings ---

--- Finished creating embeddings ---

--- Creating and persisting vector store ---

--- Finished creating and persisting vector store ---


In [7]:
import shutil
shutil.make_archive('/content/db', 'zip', '/content/db')

'/content/db.zip'

In [8]:
from google.colab import files
files.download('/content/db.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### 2b_rag_basics_metadata

In [9]:
current_dir = os.getcwd()
db_dir = os.path.join(current_dir, "db")
persistent_directory = os.path.join(db_dir, "chroma_db_with_metadata")

In [10]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [14]:
db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)

In [15]:
retreiver = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold": 0.1},
)

In [16]:
relevant_docs = retreiver.invoke("What is the capital of France?")

In [18]:
print("\n--- Relevant Documents ---")
for i, doc in enumerate(relevant_docs, 1):
    print(f"Document {i}:\n{doc.page_content}\n")
    print(f"Source: {doc.metadata['source']}\n")


--- Relevant Documents ---
Document 1:
The captain was so naïvely and good-humoredly gay, so real, and so
pleased with himself that Pierre almost winked back as he looked merrily
at him. Probably the word “gallant” turned the captain’s thoughts to the
state of Moscow.

“Apropos, tell me please, is it true that the women have all left
Moscow? What a queer idea! What had they to be afraid of?”

“Would not the French ladies leave Paris if the Russians entered it?”
asked Pierre.

“Ha, ha, ha!” The Frenchman emitted a merry, sanguine chuckle, patting
Pierre on the shoulder. “What a thing to say!” he exclaimed. “Paris?...
But Paris, Paris...”

“Paris—the capital of the world,” Pierre finished his remark for him.

The captain looked at Pierre. He had a habit of stopping short in the
middle of his talk and gazing intently with his laughing, kindly eyes.

Source: war_and_peace.txt

Document 2:
The captain was so naïvely and good-humoredly gay, so real, and so
pleased with himself that Pierre a