In [1]:
import json
import torch
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Embedding backbone. MODEL_NAME is the checkpoint identifier handed to the
# transformers Auto* loaders below; any replacement must be an identifier that
# both AutoTokenizer and AutoModel can resolve.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer, model = (
    AutoTokenizer.from_pretrained(MODEL_NAME),
    AutoModel.from_pretrained(MODEL_NAME),
)

In [3]:
# Read the FAQ JSON document from disk and parse it into `faq_data`.
# Later cells index the result as faq_data["faqs"].
file_path = r"C:\Users\gvais\OneDrive\Desktop\faq\bharatpe_faqs.json"  # absolute path on the author's machine
with open(file_path, mode="r", encoding="utf-8") as faq_file:
    raw_json = faq_file.read()
faq_data = json.loads(raw_json)

In [4]:
# Convert text into a vector embedding (attention-mask-weighted mean pooling)
def generate_embedding(text):
    """Embed ``text`` by mean-pooling the model's final hidden states.

    The text is tokenized with truncation and padding enabled, run through the
    module-level ``model`` with gradient tracking disabled, and the per-token
    vectors are averaged. The average is weighted by the tokenizer's attention
    mask, so padding positions (present whenever a batch of texts with
    different lengths is passed) do not dilute the embedding.

    Args:
        text: A string, or a list of strings to embed as one batch.

    Returns:
        A flat list of floats when the batch holds a single text (size-1
        dimensions are squeezed away), otherwise one list of floats per text.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)

    token_vectors = outputs.last_hidden_state  # expected shape: (batch, seq_len, hidden)
    # 0/1 weights broadcast over the hidden dimension: 1 = real token, 0 = padding.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_vectors.dtype)
    summed = (token_vectors * mask).sum(dim=1)
    # Clamp so a row without any real tokens cannot cause a division by zero.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / token_counts).squeeze().tolist()

In [5]:
# Build one record per FAQ entry: the question text plus the embedding of the
# question and answer joined by a single space.
embeddings = [
    {
        "question": faq["question"],
        "embedding": generate_embedding(faq["question"] + " " + faq["answer"]),
    }
    for faq in faq_data["faqs"]
]


In [6]:
# Serialize the embedding records as indented JSON and write them to disk,
# then report where the file was written.
embedding_file = r"C:\Users\gvais\OneDrive\Desktop\faq\faq_embeddings.json"
serialized = json.dumps(embeddings, indent=4)
with open(embedding_file, mode="w", encoding="utf-8") as out_file:
    out_file.write(serialized)

print(f" Embeddings saved at: {embedding_file}")

 Embeddings saved at: C:\Users\gvais\OneDrive\Desktop\faq\faq_embeddings.json
