In [4]:
import pandas as pd
import faiss
# import faiss.contrib.torch_utils  # Enable GPU support for FAISS
import os
import torch
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

import pickle

In [17]:
# Pick the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("device: ", device)

# Load the MiniLM sentence encoder and place it on the selected device.
model = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2',
    device=device,
).to(device)


device:  cuda


In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_english_text(text, chunk_size=200, chunk_overlap=50):
    """Split English text into overlapping, cleaned chunks.

    The text is split with a ``RecursiveCharacterTextSplitter`` that tries
    paragraph breaks first, then sentence punctuation, commas, spaces and
    finally single newlines. Chunk length is measured in characters (``len``).

    Args:
        text: The string to split.
        chunk_size: Maximum chunk length passed to the splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter.

    Returns:
        A list of non-empty chunk strings with leading spaces, periods,
        commas and newlines removed.
    """
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", ".", "?", "!", ",", " ", "\n"],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=False,
    )
    # Leading separator residue is stripped from each chunk (presumably the
    # splitter leaves the separator at the start of the following chunk --
    # TODO confirm against the installed langchain version).
    # NOTE(review): "?" and "!" are separators too but are not stripped here;
    # left unchanged to keep existing chunk text stable.
    stripped = (chunk.lstrip(" .,\n") for chunk in splitter.split_text(text))
    # A chunk made only of stripped characters collapses to "", which would
    # otherwise be embedded and indexed as a meaningless vector.
    return [chunk for chunk in stripped if chunk]

In [19]:
# Read the paragraph corpus and keep every non-null entry of the "Content"
# column as a plain Python list.
paras_csv_path = (
    "../../Dataset_code_csvs/hotpotQA/hotpotQA_dataset_versions/"
    "5884paras_600queries/English/5884_paras.csv"
)
df = pd.read_csv(paras_csv_path)
content_column = df["Content"]
paragraphs = content_column.dropna().tolist()

In [20]:
# Sanity check: number of non-null paragraphs available for chunking.
len(paragraphs)

5884

In [21]:
# Build a FAISS IndexFlatL2 (CPU version) over the embeddings of every
# paragraph chunk, then persist the index and the chunk texts. Row i of the
# index corresponds to all_chunks[i].
index_save_path = "../vector_db/paragraphs/5884_paras/5884_paras_index.faiss"
chunks_save_path = "../data_storage/parachunks/5884_para_chunks/5884_parachunks.pkl"

# Create the output folders before the expensive embedding step so a missing
# directory cannot discard the computed index at write time.
for out_path in (index_save_path, chunks_save_path):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

embedding_dim = model.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(embedding_dim)

# Collect the chunks of all paragraphs first; paragraphs that yield no chunks
# simply contribute nothing.
all_chunks = []
for para in paragraphs:
    all_chunks.extend(split_english_text(para))

# Encode everything in one call so the model batches across paragraphs instead
# of running one tiny forward pass per paragraph. Skipped when there is nothing
# to encode, leaving an empty index.
if all_chunks:
    embeddings = model.encode(
        all_chunks,
        convert_to_numpy=True,
        show_progress_bar=True,
    )
    index.add(embeddings)

# The pickled chunk list is only useful if it lines up with the index rows.
assert index.ntotal == len(all_chunks), (
    f"index has {index.ntotal} vectors but {len(all_chunks)} chunks were collected"
)

# Save FAISS index to disk
faiss.write_index(index, index_save_path)

# Save the chunk texts alongside it, in the same order as the index rows.
with open(chunks_save_path, "wb") as f:
    pickle.dump(all_chunks, f)