In [1]:
import pandas as pd
import faiss
import faiss.contrib.torch_utils  # Enable GPU support for FAISS
import os
import torch
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

import pickle

In [2]:


# Check and set device
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load model from local or HuggingFace and move to GPU
model_path = 'E:/Study/Thesis work/Multihop for Urdu/model_weights/embedding_model'
if os.path.exists(model_path):
    model = SentenceTransformer(model_path, device=device)
else:
    model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', device=device)
    model.save(model_path)



In [3]:
# Urdu text splitter
def split_urdu_text(text, chunk_size=250, chunk_overlap=50):
    splitter = RecursiveCharacterTextSplitter(
        separators=["۔", "\n", ",", " "],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    return splitter.split_text(text)



In [4]:
# Load CSV
df = pd.read_csv("E:\\Study\\Thesis work\\Multihop for Urdu\\Dataset\\Hotpotqa\\1000_paras_100_queries\\translated_file_for_100_paras_corrected_Format.csv")  
paragraphs = df["Translated_Content"].dropna().tolist()


In [5]:
len(paragraphs)

985

In [6]:
# Initialize FAISS IndexFlatL2 (CPU version)
embedding_dim = model.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(embedding_dim)

all_chunks = []

for para in paragraphs:
    chunks = split_urdu_text(para)
    if not chunks:
        continue
    embeddings = model.encode(chunks, convert_to_numpy=True)
    index.add(embeddings)
    all_chunks.extend(chunks)


# Save FAISS index to disk
faiss.write_index(index, "E:\\Study\\Thesis work\\Multihop for Urdu\\vector_db\\paragraphs\\urdu_faiss_index_for_100_para.index")

chunks_save_path = "E:\\Study\\Thesis work\\Multihop for Urdu\\data_storage\\parachunks\\urdu_chunks_for_100_para.pkl"
with open(chunks_save_path, "wb") as f:
    pickle.dump(all_chunks, f)

In [7]:
print(f"Number of vectors indexed: {index.ntotal}")


Number of vectors indexed: 2238
