In [None]:
import os
from dotenv import load_dotenv
from openai import OpenAI
import glob
import tiktoken 
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import shutil

In [None]:
# Read settings from the .env file; override=True asks python-dotenv to replace
# variables that are already set in the process environment.
load_dotenv(override=True)

# Groq is reached through its OpenAI-compatible endpoint, so the OpenAI client is reused below.
GROQ_BASE_URL = "https://api.groq.com/openai/v1"
api_key = os.getenv("GROQ_API_KEY")

# Report whether a key was found, echoing only its first four characters.
print(
    f"API key is Found starting with {api_key[:4]} and Good to Go!"
    if api_key
    else "There is an issue with API key."
)

MODEL = "openai/gpt-oss-120b"
groq = OpenAI(base_url=GROQ_BASE_URL, api_key=api_key)

# Name of the on-disk directory for the vector database.
db_name = "vector_db"


In [None]:
# Read every Markdown file under knowledge-base/ (recursively) into one string
# so the overall size of the knowledge base can be measured.
knowledge_base_path = "knowledge-base/**/*.md"

# glob returns matches in arbitrary, filesystem-dependent order; sorting makes the
# concatenated text identical from one run (or machine) to the next.
files = sorted(glob.glob(knowledge_base_path, recursive=True))
print(f"Found {len(files)} files in the knowledge base.")

file_texts = []
for file_path in files:
    with open(file_path, mode="r", encoding="utf-8") as f:
        file_texts.append(f.read())

# Every file's text is followed by a blank-line separator (including the last one),
# and the pieces are joined once instead of growing a string inside the loop.
entire_knowledge_base = "".join(text + "\n\n" for text in file_texts)

print(f"Total character in knowledge base: {len(entire_knowledge_base)}")

In [None]:
# Approximate the size of the whole knowledge base in tokens, using tiktoken's
# "o200k_base" encoding; the token list itself is kept in `tokens`.
encoding = tiktoken.get_encoding("o200k_base")
token_count = len(tokens := encoding.encode(entire_knowledge_base))

print(f"Total tokens approx for {MODEL}: {token_count}")

In [None]:
# Load the knowledge base with LangChain's loaders, tagging every document with
# the name of the top-level folder it came from.

# "knowledge-base/*" also matches plain files placed directly in knowledge-base/;
# DirectoryLoader only accepts directories, so anything else is filtered out here.
# Sorting keeps the load order (and therefore documents[0]) stable across runs.
folders = sorted(
    path for path in glob.glob("knowledge-base/*") if os.path.isdir(path)
)

documents = []
for folder in folders:
    # The folder's own name becomes the document type stored in the metadata.
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs={'encoding': 'utf-8'})
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
    documents.extend(folder_docs)

print(f"Loaded {len(documents)} documents")

In [None]:
# Show the first loaded document as this cell's output (raises IndexError if nothing was loaded).
documents[0]

In [None]:
# Break the loaded documents into overlapping chunks with a recursive character splitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # size setting handed to the splitter
    chunk_overlap=200,  # overlap setting between neighbouring chunks
)
chunks = text_splitter.split_documents(documents)

# Summary first, then the first chunk as a sanity check of what the splitter produced.
print(f"Divided into {len(chunks)} chunks.")
print(f"First Chunk:\n\n{chunks[0]}")

In [None]:
# Embed the chunks with a Hugging Face model and persist them in a fresh Chroma store

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Start from a clean slate: wipe whatever an earlier run left at db_name.
# (An alternative considered here was dropping the collection instead, via
#  Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection().)
if os.path.exists(db_name):
    shutil.rmtree(db_name)

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)

# The count is read from the store's underlying collection object.
print(f"Vector store created with {vectorstore._collection.count()} documents.")

In [None]:
# Inspect the collection behind the vector store: how many vectors it holds and how long each one is.
collection = vectorstore._collection

# Pull a single record together with its embedding and measure that embedding's length.
one_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = one_record["embeddings"][0]
dimensions = len(sample_embedding)

count = collection.count()
print(f"There are {count} vectors with {dimensions} dimensions in the vector store")