# Ingest and Embed Documents for RAG using LangChain + FAISS

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import os

In [None]:
# Step 1: Read the source PDF from disk.
# `documents` is the loader's output and is the input to the splitting step.
pdf_path = "data/sample_doc.pdf"
loader = PyPDFLoader(file_path=pdf_path)
documents = loader.load()

In [None]:
# Step 2: Break the loaded documents into smaller chunks.
# chunk_size=500 caps each chunk; chunk_overlap=50 lets neighbouring chunks
# share a little text so context is not cut off hard at chunk boundaries.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = text_splitter.split_documents(documents)

In [None]:
# Step 3: Set up the embedding model used to vectorize the chunks.
# This only instantiates the model wrapper; the chunks themselves are
# embedded when the vector store is built from `docs` and `embedding`.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding = HuggingFaceEmbeddings(model_name=embedding_model_name)

In [None]:
# Step 4: Embed the chunks and build an in-memory FAISS vector store.
# Fail early with a clear message when there is nothing to index (for
# example, a scanned PDF with no extractable text). Building the store from
# an empty chunk list would otherwise fail inside the library with a much
# less helpful error.
if not docs:
    raise ValueError(
        f"No text chunks were produced from {pdf_path!r}; nothing to index."
    )
db = FAISS.from_documents(docs, embedding)

In [None]:
# Step 5: Persist the FAISS index to a local directory.
# The target folder is named once and reused for both the directory
# creation and the save call so the two cannot drift apart.
index_dir = "faiss_index"
os.makedirs(index_dir, exist_ok=True)
db.save_local(index_dir)
print("✅ Ingestion complete: FAISS index saved.")