In [None]:
import os
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS


In [None]:
# Configuration: input document, embedding model, and output locations.
# All paths are relative to the notebook's working directory.
pdf_path = "./data/ISLP_website.pdf"  # PDF that gets loaded and indexed
embedding_model = "sentence-transformers/all-MiniLM-l6-v2"  # model_name for HuggingFaceEmbeddings
embeddings_folder = "./content/"  # Use "./content/" for local environments; used as the embeddings cache_folder
# Derive the index location from embeddings_folder instead of repeating the
# "./content/" literal, so changing the folder cannot leave the index behind.
# With the value above this evaluates to "./content/faiss_index".
faiss_index_path = os.path.join(embeddings_folder, "faiss_index")

In [None]:
# Load the PDF at pdf_path and split it into overlapping text chunks.
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# `docs` (the chunked documents) is what the later cell embeds and indexes.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=150)
docs = text_splitter.split_documents(documents)

# Fail here, with a message that names the input file, rather than handing an
# empty chunk list to index construction in a later cell.
if not docs:
    raise ValueError(f"No text chunks could be extracted from {pdf_path!r}")

In [None]:
# Embed the chunks in `docs`, build a FAISS vector store, and save it to disk.
#
# embedding_model, embeddings_folder and faiss_index_path are defined once in
# the configuration cell near the top of the notebook. They are deliberately
# not re-assigned here, so that cell stays the single place to change them.
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model,
    cache_folder=embeddings_folder,
)

vector_db = FAISS.from_documents(docs, embeddings)
vector_db.save_local(faiss_index_path)
print(f"FAISS index saved to {faiss_index_path}")