In [None]:
# LangChain distinguishes between plain text (just a string) and a Document
# (a string wrapped in an object that also carries metadata).
# Chroma.from_documents, used below, expects Document objects, so the raw string
# is wrapped in a Document first.

In [1]:
import os
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document

# Load environment variables (load_dotenv is expected to pick up a local .env file).
load_dotenv()

# Location of the embedding model, read from the environment.
MINIML_MODEL_PATH = os.getenv("MINIML_MODEL_PATH")
if not MINIML_MODEL_PATH:
    # os.getenv returns None when the variable is missing; stop here with an
    # actionable message instead of handing None/"" to HuggingFaceEmbeddings below.
    raise RuntimeError(
        "MINIML_MODEL_PATH is not set; define it in the environment or in a .env file."
    )

sample_text = '''Artificial Intelligence refers to the simulation of human intelligence in machines that are programmed to think like humans and mimic their actions.
        AI is used in fields such as healthcare (diagnostic tools), finance (fraud detection), and customer service (chatbots).
        Key types include Narrow AI (focused on specific tasks like voice assistants) and General AI (hypothetical AI capable of performing any intellectual task a human can do).
        '''

# 1. Wrap the raw string in a Document object.
# TextLoader is meant for files; an in-memory string is wrapped directly, with a
# "source" metadata entry recording where the text came from.
doc_object = Document(page_content=sample_text, metadata={"source": "manual_input"})

# 2. Chunk with split_documents so the pieces stay Document objects.
# chunk_size / chunk_overlap are the splitter's size limits (200 with an overlap of 30).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=30)
chunks = text_splitter.split_documents([doc_object])

# 3. Initialize the embedding model from the configured (local) path.
embeddings = HuggingFaceEmbeddings(model_name=MINIML_MODEL_PATH)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

In [10]:
import os

# The Chroma database lives under ./resources/databases/db1/chroma_db, resolved
# against the current working directory of the notebook kernel (or script).
current_dir = os.getcwd()
persist_directory = os.path.join(current_dir, "resources", "databases", "db1", "chroma_db")

# Create the directory tree up front; exist_ok=True keeps this safe on re-runs.
os.makedirs(persist_directory, exist_ok=True)

# 4. Create and persist the Vector DB
# 'chunks' is a list of Document objects, which is what from_documents expects.
#
# Each chunk gets a stable, position-based id. Without explicit ids, every re-run
# of this cell stores the same chunks again under new ids, so the persisted
# collection keeps growing with duplicates.
# NOTE(review): this relies on the store writing by id (overwriting an existing id
# rather than appending) - confirm against the installed langchain_community version.
chunk_ids = [f"chunk-{position}" for position in range(len(chunks))]

vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory=persist_directory,
)
print("Vector database created and stored locally.")



Vector database created and stored locally.


In [11]:
# 5. Simple retrieval: ask the vector store for the chunks closest to a question
query = "What are the machines where AI is used?"
docs = vector_db.similarity_search(query)

# Show only the first hit; nothing is printed when the search returns no documents.
for best_match in docs[:1]:
    print(f"Top result: {best_match.page_content}")

Top result: AI is used in fields such as healthcare (diagnostic tools), finance (fraud detection), and customer service (chatbots).


In [12]:
## Let's store the same chunks using FAISS
# FAISS is the engine that does the similarity math, while LangChain provides the "container" (the Document objects) that holds your data.


In [15]:
from langchain_community.vectorstores import FAISS
# Keep the FAISS index in its own 'faiss_index' folder next to the Chroma one.
# Saving into persist_directory would drop the FAISS files into the Chroma
# database directory and contradict the message printed below.
faiss_directory = os.path.join(os.getcwd(), "resources", "databases", "db1", "faiss_index")
os.makedirs(faiss_directory, exist_ok=True)

# Note: this rebinds vector_db, so later cells query the FAISS store instead of Chroma.
vector_db = FAISS.from_documents(chunks, embeddings)
vector_db.save_local(faiss_directory)
print("FAISS index created and saved to 'faiss_index' folder.")

FAISS index created and saved to 'faiss_index' folder.


In [16]:
# 5. Retrieval: request only the single closest chunk (k=1) from whichever store
# vector_db currently refers to (the FAISS one when the cells run in order).
query = "What are the programmed where AI is used?"
docs = vector_db.similarity_search(query, k=1)

# Print the match if the search produced one; otherwise stay silent.
if len(docs) > 0:
    top_chunk = docs[0]
    print(f"\nTop result: {top_chunk.page_content}")


Top result: AI is used in fields such as healthcare (diagnostic tools), finance (fraud detection), and customer service (chatbots).


In [None]:
# Display the list of Document chunks produced earlier by text_splitter.split_documents.
chunks