# RAG Question-to-Topic Mapping Pipeline

1. Setup & Install Dependencies

In [1]:
# Install required libraries
!pip install openai pandas tqdm




2. Enter OpenAI API Key

In [2]:
import getpass
import os

import openai

# Use OPENAI_API_KEY when it is already set, so "Restart & Run All" can run unattended;
# otherwise prompt with getpass, which keeps the key hidden and out of the saved notebook.
_api_key = os.environ.get("OPENAI_API_KEY", "").strip() or getpass.getpass("Enter your OpenAI API key: ").strip()
if not _api_key:
    # Fail here with a clear message instead of at the first API call further down.
    raise ValueError("An OpenAI API key is required to run this notebook.")
openai.api_key = _api_key
del _api_key  # do not keep the secret in a notebook-level variable


Enter your OpenAI API key: ··········


3. Data Loading

In [5]:
import pandas as pd
import json

# Mount Google Drive if your files are in Drive (skip if uploading manually)
# from google.colab import drive
# drive.mount('/content/drive')

# Load questions
questions_df = pd.read_excel("/content/Assessments Test 3-6-25 - sample - testing - 10 questions.xlsx")
print("Loaded questions:", questions_df.shape)
# The embedding and rationale cells read the "Question" column; fail early if it is absent.
if "Question" not in questions_df.columns:
    raise KeyError(f'Questions file has no "Question" column; found: {list(questions_df.columns)}')

# Load topics metadata
# JSON text is UTF-8; an explicit encoding avoids depending on the platform default.
with open("/content/topics_metadata.json", "r", encoding="utf-8") as f:
    topics = json.load(f)

# Load ground truth (evaluation)
eval_df = pd.read_excel("/content/evaluation_file.xlsx")
print("Loaded evaluation:", eval_df.shape)
# The evaluation cell merges on "input_question" and compares against "correct_topic".
_missing_eval_cols = {"input_question", "correct_topic"} - set(eval_df.columns)
if _missing_eval_cols:
    raise KeyError(f"Evaluation file is missing column(s): {sorted(_missing_eval_cols)}")


Loaded questions: (9, 14)
Loaded evaluation: (9, 3)


4. Parse Topic Metadata

In [6]:
# Extract topic names and descriptions
def _describe_subtopics(subtopics):
    """Flatten one topic's subtopic examples into a single description string.

    A dict becomes "key: value" pairs joined by ". "; a list/tuple becomes its
    items joined by ". "; a plain string is used as-is (trimmed). Anything else
    (including None) yields an empty description.
    """
    if isinstance(subtopics, dict):
        return ". ".join([f"{k}: {v}" for k, v in subtopics.items()])
    if isinstance(subtopics, (list, tuple)):
        return ". ".join(str(item) for item in subtopics)
    if isinstance(subtopics, str):
        return subtopics.strip()
    return ""


topic_names = []
topic_descriptions = []
for t in topics:
    topic_names.append(t["Topics"])
    # Both spellings of the key (with and without an embedded newline) are checked.
    subtopics = t.get("Examples of Subtopics that would be\nincluded") or t.get("Examples of Subtopics that would be included")
    topic_descriptions.append(_describe_subtopics(subtopics))

# Topics without a description are embedded from their name alone, which weakens retrieval.
_n_undescribed = sum(1 for desc in topic_descriptions if not desc)
if _n_undescribed:
    print(f"Warning: {_n_undescribed} topic(s) have no subtopic description.")

# Full text for embeddings
topic_fulltexts = [f"{name}. {desc}" for name, desc in zip(topic_names, topic_descriptions)]
print(f"Extracted {len(topic_names)} topics.")


Extracted 24 topics.


5. Generate OpenAI Embeddings

In [7]:
from tqdm import tqdm

def get_embedding(text, model="text-embedding-3-small"):
    """Request an embedding for a single text and return its vector.

    Args:
        text: The text to embed; it is sent as a one-element input list.
        model: Name of the embedding model passed to the API.

    Returns:
        The `embedding` field of the first item in the API response.
    """
    api_result = openai.embeddings.create(model=model, input=[text])
    first_item = api_result.data[0]
    return first_item.embedding

# Topic embeddings: one API call per topic text, in topic order
topic_embeddings = [
    get_embedding(topic_text)
    for topic_text in tqdm(topic_fulltexts, desc="Embedding topics")
]

# Question embeddings: each question is converted to str before being embedded
question_embeddings = [
    get_embedding(str(question))
    for question in tqdm(questions_df["Question"], desc="Embedding questions")
]


Embedding topics: 100%|██████████| 24/24 [00:10<00:00,  2.27it/s]
Embedding questions: 100%|██████████| 9/9 [00:03<00:00,  2.68it/s]


6. Semantic Retrieval: Find Top-N Topics for Each Question

In [8]:
import numpy as np

def cosine_similarity(a, b):
    """Return the cosine similarity between two vectors as a Python float.

    Args:
        a: First vector (any sequence convertible to a 1-D float array).
        b: Second vector, same length as `a`.

    Returns:
        dot(a, b) / (|a| * |b|), or 0.0 when either vector has zero norm
        (instead of dividing by zero and returning NaN).
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0.0:
        # A zero vector has no direction; treat it as unrelated rather than NaN,
        # which would otherwise sort to the top of an argsort-based ranking.
        return 0.0
    return float(np.dot(a, b) / denom)

def retrieve_top_n(question_emb, topic_embeddings, N=3):
    """Rank topics by cosine similarity to a question and return the best N.

    Args:
        question_emb: Embedding vector of the question.
        topic_embeddings: Sequence of topic embedding vectors, one per topic,
            each the same length as `question_emb`.
        N: Maximum number of topics to return. Values <= 0 return nothing.

    Returns:
        A tuple `(idx, sims)`: `idx` is a NumPy array of topic indices ordered
        from most to least similar, and `sims` is the list of the matching
        similarity scores as Python floats.
    """
    query = np.asarray(question_emb, dtype=float)
    matrix = np.asarray(topic_embeddings, dtype=float)
    # A slice of [-0:] would select every topic, so handle N <= 0 (and no topics) explicitly.
    if N <= 0 or matrix.size == 0:
        return np.array([], dtype=np.intp), []

    # All similarities in one pass: row-wise dot products over the product of norms.
    dots = matrix @ query
    denom = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    # Zero-norm vectors score 0.0 instead of NaN, which argsort would place last (i.e. "best").
    sims = np.divide(dots, denom, out=np.zeros_like(dots), where=denom > 0)

    idx = np.argsort(sims)[-N:][::-1]
    return idx, [float(sims[i]) for i in idx]


7. Generate Rationale (GPT-4o) & Map Outputs

In [11]:
# Chat model that writes the rationale, and how many candidate topics to retrieve per question.
RATIONALE_MODEL = "gpt-4o"
TOP_N_TOPICS = 3

results = []

for q_text, q_emb in zip(questions_df["Question"], question_embeddings):
    idxs, _sims = retrieve_top_n(q_emb, topic_embeddings, N=TOP_N_TOPICS)
    retrieved = [topic_names[j] for j in idxs]
    # The highest-similarity topic is taken as the prediction.
    top_topic = retrieved[0]
    # Generate a rationale for the predicted topic using RATIONALE_MODEL
    prompt = f"""You are a real estate exam coach.
Given the question: "{q_text}" and the topic: "{top_topic}" ({topic_descriptions[idxs[0]]}),
in 2 sentences, explain why this topic matches the question best."""
    response = openai.chat.completions.create(
        model=RATIONALE_MODEL,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=100,
        temperature=0.2
    )
    # message.content may be None (e.g. a refusal); fall back to an empty rationale
    # instead of raising AttributeError on .strip().
    rationale = (response.choices[0].message.content or "").strip()
    results.append({
        "input_question": q_text,
        "predicted_topic": top_topic,
        "retrieved_topics": retrieved,
        "rationale": rationale
    })

# Build the frame once from the collected records, then persist it for download/evaluation.
results_df = pd.DataFrame(results)
results_df.to_excel("rag_question_topic_mapping.xlsx", index=False)
print("Results saved to rag_question_topic_mapping.xlsx")
results_df.head()


Results saved to rag_question_topic_mapping.xlsx


Unnamed: 0,input_question,predicted_topic,retrieved_topics,rationale
0,The majority of new licensees begin their care...,Licensing Requirements,"[Licensing Requirements, Licensing Law and Bro...","The topic ""Licensing Requirements"" matches the..."
1,What do most new real estate licensees do afte...,Licensing Requirements,"[Licensing Requirements, Out-of-State Brokers ...","The topic ""Licensing Requirements"" matches the..."
2,A licensee who is interested in retail stores ...,Special Areas of Practice,"[Special Areas of Practice, Licensing Law and ...","The topic ""Special Areas of Practice"" includes..."
3,"When considering a career in real estate, it i...",Licensing Law and Brokerage Operations,"[Licensing Law and Brokerage Operations, Pract...","The topic ""Licensing Law and Brokerage Operati..."
4,What should a licensee look for when selecting...,Licensing Law and Brokerage Operations,"[Licensing Law and Brokerage Operations, Pract...","The topic ""Licensing Law and Brokerage Operati..."


8. Evaluate Accuracy

In [14]:
# Merge on 'input_question'
merged = pd.merge(results_df, eval_df, on="input_question", how="left")

# A left merge leaves correct_topic empty for questions that are missing from eval_df;
# report them instead of silently scoring them as wrong.
n_unmatched = int(merged["correct_topic"].isna().sum())
if n_unmatched:
    print(f"Warning: {n_unmatched} row(s) have no ground-truth topic and are counted as incorrect.")


def _normalize_topic(series):
    """Return topic labels trimmed and case-folded so formatting differences are not scored as errors."""
    return series.astype("string").str.strip().str.casefold()


predicted_norm = _normalize_topic(merged["predicted_topic"])
correct_norm = _normalize_topic(merged["correct_topic"])

# Compute accuracy (missing ground truth compares as not-equal)
is_correct = (predicted_norm == correct_norm).fillna(False).astype(bool)
accuracy = is_correct.mean()
print(f"Accuracy: {accuracy*100:.2f}%")

# Retrieval hit rate: is the correct topic anywhere among the retrieved candidates?
# This separates retrieval misses from ranking misses when top-1 accuracy is low.
retrieval_hits = [
    isinstance(correct, str) and correct in {str(name).strip().casefold() for name in candidates}
    for correct, candidates in zip(correct_norm, merged["retrieved_topics"])
]
if retrieval_hits:
    print(f"Correct topic among retrieved candidates: {sum(retrieval_hits) / len(retrieval_hits) * 100:.2f}%")

# Display results for inspection
print(merged[["input_question", "predicted_topic", "correct_topic", "retrieved_topics", "rationale"]])



Accuracy: 11.11%
                                      input_question  \
0  The majority of new licensees begin their care...   
1  What do most new real estate licensees do afte...   
2  A licensee who is interested in retail stores ...   
3  When considering a career in real estate, it i...   
4  What should a licensee look for when selecting...   
5  Which type of brokerage coordinator is respons...   
6  The advantage of choosing a small independent ...   
7           Large independent real estate firms are:   
8  The opportunity to operate under the name and ...   

                          predicted_topic              correct_topic  \
0                  Licensing Requirements    Practice of Real Estate   
1                  Licensing Requirements    Practice of Real Estate   
2               Special Areas of Practice  Special Areas of Practice   
3  Licensing Law and Brokerage Operations    Practice of Real Estate   
4  Licensing Law and Brokerage Operations    Practice of Real 

9. Download Results File (Colab Only)

In [None]:
# If using Colab and want to download the file.
# google.colab only exists inside Colab; elsewhere, skip the download instead of
# failing so that "Run All" still completes.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab; the results file is rag_question_topic_mapping.xlsx")
else:
    files.download("rag_question_topic_mapping.xlsx")
