In [14]:
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
import json
from sqlalchemy import create_engine
from sqlalchemy import text
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
from InstructorEmbedding import INSTRUCTOR
import uuid
import numpy as np  



In [15]:
load_dotenv()

DB_URL = "postgresql://postgres:pgAdmin@localhost:5432/SherpalDB_V2"
COLLECTION_NAME = "math_v2_questiontext_reasoning_instructor_xl"
QDRANT_HOST = "localhost"
QDRANT_PORT = 32770

In [16]:
engine = create_engine(DB_URL)
# model = SentenceTransformer("BAAI/bge-m3")  # Or use 'instructor-xl'
model = SentenceTransformer('hkunlp/instructor-xl')

client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)


In [17]:
def fetch_questions():
    query = text("""
        SELECT 
        q.id,
        q.question_text,
        q.passage,
        q.question_type,
        q.rationale,
        q.options,
        q.correct_answer,
        q.hint,
        q.prompt_id,
        p.name AS prompt_name
    FROM public.api_tempquestion q
    JOIN public.api_tempprompt p ON q.prompt_id = p.id
    FROM public.api_tempquestion q
    JOIN public.api_tempprompt p ON q.prompt_id = p.id
    WHERE p.name LIKE 'PID-M%'
    order by p.name asc
    """)
    with engine.connect() as conn:
        result = conn.execute(query)
        return [dict(row._mapping) for row in result]

Creating a Collection for instructor-XL model


In [18]:
VECTOR_SIZE = model.get_sentence_embedding_dimension()

def create_qdrant_collection():
    if client.collection_exists(collection_name=COLLECTION_NAME):
        print(f"Collection '{COLLECTION_NAME}' already exists. Deleting it...")
        client.delete_collection(collection_name=COLLECTION_NAME)

    print(f"Creating collection '{COLLECTION_NAME}'...")
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE)
    )
    print(f"Collection '{COLLECTION_NAME}' created successfully.")


Creating a Collection for beg-m3 model


In [19]:

# VECTOR_SIZE = model.get_sentence_embedding_dimension()

# def create_qdrant_collection():
#     test_vector = model.encode("test")
#     vector_size = len(test_vector) 
#     print("vector size =", vector_size)
#     client.recreate_collection(
#         collection_name=COLLECTION_NAME,
#         vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
#     )

In [20]:
def parse_prompt_name(prompt_name):
    parts = prompt_name.split('-')
    
    if len(parts) < 6:
        raise ValueError("Invalid prompt name format. Expected 6 parts separated by '-'.")

    return {
        'subject': parts[1],
        'domain': parts[2],
        'skill': parts[3],
        'subskill': parts[4],
        'difficulty': parts[5]
    }

In [21]:
from tqdm import tqdm
import uuid
import json
from qdrant_client.models import PointStruct

def extract_reasoning(rationale):
    if isinstance(rationale, dict):
        return rationale.get("reasoning", "")
    return ""

def format_field(value):
    if isinstance(value, dict):
        return json.dumps(value, indent=2)
    elif isinstance(value, list):
        return ", ".join(map(str, value))
    elif value is not None:
        return str(value).strip()
    return ""

Generating embeddings for instructor-XL model


In [9]:

def generate_embeddings_and_upload(questions):
    for q in tqdm(questions, desc="Processing Questions", unit="question"):
        prompt_meta = parse_prompt_name(q['prompt_name'])

        task_instruction = str(q["prompt_name"]).strip() if q.get("prompt_name") else ""

        question_text = format_field(q.get("question_text"))
        reasoning_text = extract_reasoning(q.get("rationale"))
        input_text = " | ".join(part for part in [question_text, reasoning_text] if part.strip())
        # input_text = question_text

        vector = model.encode([[task_instruction, input_text]])[0].tolist()


        point = PointStruct(
            id=str(uuid.uuid4()),
            vector=vector,
            payload={"question_id": q["id"], "prompt_name": q["prompt_name"], **prompt_meta}
        )


        # Debug Info
        # print("\n--- Embedding Info ---")
        # print("Instruction (metadata):", task_instruction)
        # print("Input text:", input_text)
        # print("Metadata payload:", json.dumps(point.payload, indent=2))
        # print("--- End ---\n")


        client.upsert(collection_name=COLLECTION_NAME, points=[point])


Generating embeddings for beg-m3 model


In [None]:

# def generate_embeddings_and_upload(questions):
#     for q in tqdm(questions, desc="Processing Questions", unit="question"):
#         prompt_meta = parse_prompt_name(q['prompt_name'])
        
#         metadata_values = [str(q["prompt_name"])] if q.get("prompt_name") else []


#         question_text = format_field(q.get("question_text"))
#         reasoning_text = extract_reasoning(q.get("rationale"))

#         input_text_parts = metadata_values + [question_text, reasoning_text]
#         # input_text_parts = metadata_values + [question_text]

#         input_text = " | ".join(part for part in input_text_parts if part.strip())

#         # Debug print
#         # print("\n--- Embedding Info ---")
#         # print("Metadata:", json.dumps({
#         #     "question_id": q["id"],
#         #     "prompt_name": q["prompt_name"],
#         #     **prompt_meta
#         # }, indent=2))
#         # print("Input text to embed:", input_text)
#         # print("--- End ---\n")

#         # Uncomment these lines to enable embedding and uploading
#         vector = model.encode(input_text).tolist()
#         point = PointStruct(
#             id=str(uuid.uuid4()),
#             vector=vector,
#             payload={ "question_id": q["id"], "prompt_name": q["prompt_name"], **prompt_meta }
#         )
#         client.upsert(collection_name=COLLECTION_NAME, points=[point])


In [24]:
print("Fetching questions...")
questions = fetch_questions()
print(len(questions))


Fetching questions...
1814


In [25]:
print("Creating collection in Qdrant...")
create_qdrant_collection()

Creating collection in Qdrant...
Collection 'math_v2_questiontext_reasoning_instructor_xl' already exists. Deleting it...
Creating collection 'math_v2_questiontext_reasoning_instructor_xl'...
Collection 'math_v2_questiontext_reasoning_instructor_xl' created successfully.


In [26]:
print("Generating embeddings and uploading...")
generate_embeddings_and_upload(questions)

Generating embeddings and uploading...


Processing Questions: 100%|██████████| 1814/1814 [1:27:28<00:00,  2.89s/question]
