# Ingest and Embed Documents for RAG using LangChain + PGVector

In [None]:
import os

from langchain.document_loaders import PyPDFDirectoryLoader, PyPDFLoader, WebBaseLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector

!pip install pgvector pypdf psycopg langchain lxml_html_clean

In [None]:
# Notebook-wide settings for the vector store.

# Name of the PGVector collection the ingested chunks are written to.
COLLECTION_NAME = "documents_test"

# SQLAlchemy-style URL (psycopg driver) of the PostgreSQL instance backing
# the vector store.
# NOTE(review): the user and password are embedded in this literal — confirm
# that is acceptable for wherever this notebook is shared.
CONNECTION_STRING = (
    "postgresql+psycopg://vectordb:vectordb"
    "@postgresql-service.ic-shared-rag-llm.svc.cluster.local:5432/vectordb"
)

# Version label; not referenced by any other cell visible in this notebook.
product_version = "2-latest"

In [None]:
# Step 1: Load PDF document
# pypdf is installed by the `!pip install` line in the notebook's first cell,
# and `os` / `PyPDFLoader` come from the notebook's import cell.

pdf_path = "sample_doc.pdf"  # Update path accordingly

# Report once, up front, whether the file is where we expect it, so a wrong
# path is obvious before the loader is invoked.
print("Exists:", os.path.exists(pdf_path))

# `documents` is whatever the loader returns for this file; it is the input
# to the text-splitting step.
loader = PyPDFLoader(pdf_path)
documents = loader.load()

In [None]:
# Step 2: Break the loaded documents into overlapping chunks
# Each chunk is capped at `chunk_size` and shares `chunk_overlap` with its
# neighbour (both measured by the splitter's length function — characters
# unless configured otherwise), so text cut at a boundary still appears
# whole in one of the two chunks.
_splitter_settings = {"chunk_size": 500, "chunk_overlap": 50}
text_splitter = RecursiveCharacterTextSplitter(**_splitter_settings)

# `docs` is the chunk list consumed by the vector-store step.
docs = text_splitter.split_documents(documents)

In [None]:
# Step 3: Create the embedding model used for the chunks
# This cell only constructs the model wrapper; no chunk is embedded here.
# The instance is passed to the vector store when the chunks are stored.
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# Step 4: Store the chunks in the PGVector (PostgreSQL) vector store
# Passes the chunks and the embedding model to PGVector.from_documents,
# targeting collection COLLECTION_NAME at CONNECTION_STRING.
db = PGVector.from_documents(
    documents=docs,
    embedding=embedding,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
    pre_delete_collection=True # This deletes existing collection and its data, use carefully!
)

In [None]:
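# Step 5 (disabled): FAISS-only local save, kept for reference. The store used above is PGVector on PostgreSQL, so there is no local index to save.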
#os.makedirs("faiss_index", exist_ok=True)
#db.save_local("faiss_index")
#print("✅ Ingestion complete: FAISS index saved.")

In [None]:
# Sanity check: run one sample question against the vector store.
query = "How to install a cluster on Azure ?"
# The result is iterated as (document, score) pairs by the printing cell.
# NOTE(review): whether a lower or a higher score means a closer match depends
# on the store's distance strategy — confirm before interpreting the numbers.
docs_with_score = db.similarity_search_with_score(query)

In [None]:
# Print each hit as: separator, score, chunk text, separator.
for doc, score in docs_with_score:
    ruler = "-" * 80  # 80-column separator framing each result
    print(ruler)
    print("Score: ", score)
    print(doc.page_content)
    print(ruler)