In [None]:
!pip install sentence-transformers langchain chromadb pypdf faiss-cpu  langchain_community scikit-learn matplotlib seaborn numpy

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS,Chroma
import os

In [None]:
# Colab-only step: open the upload widget and use the first uploaded file
# as the PDF to index.
from google.colab import files

uploaded = files.upload()  # mapping keyed by uploaded file name

# If nothing was uploaded (e.g. the dialog was cancelled), fail with a clear
# message instead of an IndexError on the first-key lookup.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose a PDF.")

pdf_path = next(iter(uploaded))
print(f"Uploaded file: {pdf_path}")

In [None]:
# Read the uploaded PDF into LangChain documents; the count is reported as pages.
loader = PyPDFLoader(pdf_path)
documents = loader.load()

print(f"Total pages loaded: {len(documents)}")

In [None]:
# Split the loaded documents into small overlapping chunks for embedding.
# chunk_size / chunk_overlap are measured by the splitter's length function
# (presumably characters with the default settings — confirm).
splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)

chunks = splitter.split_documents(documents)
print("Total chunks created:", len(chunks))

# A PDF with no extractable text (e.g. scanned images) yields zero chunks;
# stop here with a clear message rather than failing later when chunks[0] is accessed.
if not chunks:
    raise ValueError("No text chunks were produced; the PDF may contain no extractable text.")

In [None]:
# Sanity check: show the first 50 characters of the first chunk.
first_chunk_preview = chunks[0].page_content[:50]
print(first_chunk_preview)

In [None]:
# Embedding model handed to the FAISS and Chroma vector stores in this notebook.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

**FAISS DB**

In [None]:
# Embed every chunk with the shared embedding model and build a FAISS vector store.
faiss_db = FAISS.from_documents(documents=chunks, embedding=embedding_model)
print("FAISS Vector DB created!")

In [None]:
# Ask the FAISS store for the three chunks most similar to the question.
query = "Who is K.khajabee"
results = faiss_db.similarity_search(query, k=3)

# Print up to 200 characters of each hit, followed by a separator line.
for hit in results:
    print(hit.page_content[:200], "---", sep="\n")

**Chroma Vector DB**

In [None]:
# Directory passed to Chroma as its persist location.
chroma_dir = "./chroma_db"
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that could race with another process.
os.makedirs(chroma_dir, exist_ok=True)

# Embed the chunks and store them in a Chroma collection persisted under chroma_dir.
# NOTE(review): re-running this cell against an existing directory presumably
# adds the same chunks again — confirm, and clear the directory if duplicate hits appear.
chroma_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory=chroma_dir,
)

print("Chroma DB created and persisted!")

In [None]:
# Ask the Chroma store for the three chunks most similar to the question.
query = "Which type of jobs r suits to K.khajabee?"
results = chroma_db.similarity_search(query, k=3)

# Print up to 50 characters of each hit, followed by a separator line.
for hit in results:
    print(hit.page_content[:50], "---", sep="\n")