In [7]:
import pandas as pd
import faiss
# import faiss.contrib.torch_utils  # Enable GPU support for FAISS
import os
import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import pickle

In [17]:
# === Paths ===
# Input CSV and the column inside it that holds the Urdu text to index.
csv_path = "../../../Dataset_code_csvs/hotpotQA/hotpotQA_dataset_versions/5884paras_599queries/Urdu/5884_paras_translated.csv"  # replace with your actual CSV path
column_name = "Translated_Content"
# Output artefacts: the FAISS index file and the pickled list of chunk strings.
faiss_index_path = "../../vector_db/paragraphs/5884_paras/5884_paras_faiss_index.index"
chunks_path = "../../data_storage/Paragraph_chunks/5884_paragraphs/5884_chunks.pkl"

# === Parameters ===
# Chunk length and overlap, both measured in characters by the splitter.
chunk_size = 250
chunk_overlap = 50

# Create the parent directory of EVERY output file up front. The index and the
# chunk pickle live in different directories, so both must exist before the
# save step runs; creating only the index directory makes the pickle write fail
# on a fresh checkout.
for _output_path in (faiss_index_path, chunks_path):
    _output_dir = os.path.dirname(_output_path)
    # A bare filename has an empty dirname, which os.makedirs rejects.
    if _output_dir:
        os.makedirs(_output_dir, exist_ok=True)


In [18]:
from sentence_transformers import SentenceTransformer

# Run on the GPU when torch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# NOTE(review): the published model card for intfloat/e5-large describes it as
# an English-only checkpoint. Since the corpus here is Urdu, confirm that this
# is the intended model (a multilingual E5 variant may fit better). Queries
# must be embedded with whatever model is used here.
model = SentenceTransformer("intfloat/e5-large", device=device)

# Configure for Urdu text
# Upper bound on the encoded sequence length.
model.max_seq_length = 512
# Intended to keep Urdu characters untouched.
# NOTE(review): unverified whether setting this attribute after loading changes
# the tokenizer's behaviour - confirm.
model.tokenizer.do_lower_case = False

print("✅ SentenceTransformer model loaded")

✅ SentenceTransformer model loaded


In [19]:
# Split text into overlapping chunks
def split_urdu_text(text, chunk_size=250, chunk_overlap=50, separators=None):
    """Split one passage into overlapping chunks bounded by character count.

    Args:
        text: Passage to split. ``None`` and empty or whitespace-only strings
            produce an empty list instead of being handed to the splitter.
        chunk_size: Target maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Overlap, in characters, passed to the splitter.
        separators: Optional ordered sequence of separator strings handed to
            ``RecursiveCharacterTextSplitter``. When omitted, the original
            list is used: Urdu full stop, newline, ASCII comma, space.

    Returns:
        The list of chunk strings produced by the splitter, or ``[]`` when
        there is nothing to split.
    """
    # Nothing to split: avoid building a splitter and avoid emitting chunks
    # that would later be embedded as empty passages.
    if text is None or (isinstance(text, str) and not text.strip()):
        return []

    if separators is None:
        # NOTE(review): Urdu prose normally uses the Arabic comma "،" (U+060C);
        # this default only contains the ASCII comma. It is kept unchanged so
        # existing indexes stay reproducible - pass `separators` to opt in.
        separators = ["۔", "\n", ",", " "]

    splitter = RecursiveCharacterTextSplitter(
        separators=list(separators),
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return splitter.split_text(text)

# Embed a list of texts; no "passage: " prefix is added unless one is requested
def embed_texts(texts, prefix=""):
    """Encode texts with the notebook-level ``model`` into normalized embeddings.

    Args:
        texts: A list of strings (or a single string) to embed.
        prefix: Optional string prepended to every text before encoding. The
            default ``""`` leaves the texts untouched, matching the original
            behaviour of this function.
            NOTE(review): the E5 model card recommends prefixing indexed
            documents with ``"passage: "`` and queries with ``"query: "``.
            Confirm how queries are embedded before enabling a prefix here,
            because both sides must follow the same convention.

    Returns:
        Whatever ``model.encode`` returns for the inputs, requested with
        ``normalize_embeddings=True``. The indexing loop passes the result
        directly to ``index.add``.
    """
    if prefix:
        if isinstance(texts, str):
            # A lone string must not be iterated character by character.
            texts = f"{prefix}{texts}"
        else:
            texts = [f"{prefix}{text}" for text in texts]
    # SentenceTransformer handles batching and normalization of the inputs.
    return model.encode(texts, normalize_embeddings=True, show_progress_bar=False)


In [21]:
# Load the source paragraphs; rows with a missing value in the text column are
# dropped before splitting.
df = pd.read_csv(csv_path)
all_chunks = []

for text in tqdm(df[column_name].dropna().tolist(), desc="Splitting"):
    chunks = split_urdu_text(text, chunk_size, chunk_overlap)
    all_chunks.extend(chunks)

print(f"✅ Total chunks: {len(all_chunks)}")

# Inner-product index: embed_texts requests normalized embeddings, so
# IndexFlatIP is used instead of IndexFlatL2.
# The vector width is read from the loaded model rather than hard-coded, so the
# index cannot silently disagree with the encoder if the model is swapped.
dimension = model.get_sentence_embedding_dimension()
index = faiss.IndexFlatIP(dimension)

# Embed and add in small batches to keep memory bounded.
batch_size = 32
for i in tqdm(range(0, len(all_chunks), batch_size), desc="Embedding"):
    batch = all_chunks[i:i + batch_size]
    embeddings = embed_texts(batch)
    index.add(embeddings)

# Vectors are added in the same order as all_chunks, and the chunk list is
# saved separately from the index; a count mismatch would mean the two no
# longer line up.
assert index.ntotal == len(all_chunks), (
    f"index holds {index.ntotal} vectors but there are {len(all_chunks)} chunks"
)

print(f"✅ FAISS index built with {index.ntotal} vectors")

Splitting: 100%|██████████| 5884/5884 [00:00<00:00, 47062.66it/s]


✅ Total chunks: 13306


Embedding: 100%|██████████| 416/416 [04:59<00:00,  1.39it/s]

✅ FAISS index built with 13306 vectors





In [22]:
# Persist the vector index first, then the chunk list that was indexed.
faiss.write_index(index, faiss_index_path)

# The chunk strings are pickled as a plain list.
with open(chunks_path, mode="wb") as chunk_file:
    pickle.dump(all_chunks, chunk_file)

# Report both destinations only after both writes have succeeded.
print(f"✅ Saved FAISS index to {faiss_index_path}")
print(f"✅ Saved chunks to {chunks_path}")


✅ Saved FAISS index to ../../vector_db/paragraphs/5884_paras/5884_paras_faiss_index.index
✅ Saved chunks to ../../data_storage/Paragraph_chunks/5884_paragraphs/5884_chunks.pkl
