In [10]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.faiss import DistanceStrategy
from langchain_community.document_loaders import DataFrameLoader
from langchain_community.embeddings import HuggingFaceEmbeddings

In [11]:
# Notebook configuration. Both paths are relative, so they resolve against
# the working directory the kernel was started in.

# CSV file read with pandas to obtain the resume table.
DATA_PATH: str = "../data/main-data/synthetic-resumes.csv"
# Folder the FAISS vector store is saved into once it has been built.
FAISS_PATH: str = "../vectorstore"
# Model name handed to HuggingFaceEmbeddings to embed the text chunks.
EMBEDDING_MODEL: str = "sentence-transformers/all-MiniLM-L6-v2"

In [12]:
# Read the resume table and wrap it in a loader that takes each document's
# text from the "Resume" column.
df = pd.read_csv(DATA_PATH)
loader = DataFrameLoader(df, page_content_column="Resume")

# Chunks are capped at a length of 1024, with 500 shared between neighbouring
# chunks so text cut at a boundary still appears intact in one of them.
# NOTE(review): length is presumably a character count (the splitter's
# default length function) - confirm for the installed version.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=500)

# Embedding model used to vectorise the chunks; pinned to the CPU device.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
    model_kwargs=dict(device="cpu"),
)

In [13]:
# Materialise the loader's documents, then cut them into overlapping chunks
# with the splitter configured earlier in the notebook.
documents = loader.load()
document_chunks = text_splitter.split_documents(documents=documents)

In [14]:
# Embed every chunk and build the FAISS store from the results.
# NOTE(review): confirm how the installed LangChain version treats
# DistanceStrategy.COSINE when creating the index - some releases only
# special-case inner product and otherwise build an L2 index, which matches
# cosine ranking only if the embeddings are unit-normalised.
vectorstore_db = FAISS.from_documents(
    documents=document_chunks,
    embedding=embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

# Write the store to disk under FAISS_PATH so it can be reloaded later.
vectorstore_db.save_local(FAISS_PATH)