In [34]:
import os
import random
from dotenv import load_dotenv
# ✅ Step 1: Load environment variables
load_dotenv()
# Set OPENAI_API_KEY in the .env file; do not hardcode the key in this notebook.

# ✅ Step 2: Import libraries
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain import hub
import openai
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

# Read the API key from the environment (load_dotenv() was called earlier in
# this cell) and hand it to both the openai module and the embedding client.
openai.api_key = os.environ.get("OPENAI_API_KEY")
embedding = OpenAIEmbeddings(openai_api_key=openai.api_key)

# NOTE(review): allow_dangerous_deserialization=True opts in to loading the
# serialized docstore from disk; only point this at an index directory that
# you created yourself or otherwise trust.
vectorstore = FAISS.load_local(
    "nba_vector_db_semantic",
    embeddings=embedding,
    allow_dangerous_deserialization=True,
)

In [35]:
import json, pandas as pd

In [40]:
# Pull every stored document out of the FAISS docstore.
# NOTE(review): `_dict` is a private attribute of the docstore object and may
# change between library releases — confirm after upgrading LangChain.
documents = list(vectorstore.docstore._dict.values())

SAMPLE_SEED = 788  # fixed seed so the same documents are drawn on every run
SAMPLE_SIZE = 80   # upper bound on the number of documents to sample

random.seed(SAMPLE_SEED)
# random.sample raises ValueError when asked for more items than exist, so cap
# the request at the number of documents actually available.
selected_docs = random.sample(documents, min(SAMPLE_SIZE, len(documents)))


In [41]:
# Parser applied to the model's reply at the end of the chain.
parser = JsonOutputParser()

# Chat model used to write the question/answer pairs.
llm = ChatOpenAI(
    model="gpt-3.5-turbo-16k",
    temperature=0.7,
)

In [42]:
# Build the question-generation chain and run it over every sampled document.
# Requires `selected_docs`, `llm` and `parser` from the earlier cells.

prompt = PromptTemplate.from_template(
    """
You are an expert in generating high-quality, diverse basketball-related questions.

This passage is about the player **{player}**:
{document}

Generate **ONE** clear and relevant question that includes the player’s name "{player}". Your question should be based on the passage, and must fall into one of the following types:

1. **Simple factual** — e.g., biographical facts, team history, awards.
2. **Statistical** — must include the **year** when referencing any stats.
3. **Reasoning-based** — requires connecting multiple facts or drawing a conclusion.
4. **Comparative** — comparing the player with teammates, other seasons, or opponents.
5. **Event-specific** — referencing a game, season, or milestone moment.

**Important:**
- If you ask a question about statistics, explicitly include the year.
- The question must be **answerable from the document**.
- Make sure the question is natural, specific, and unambiguous.

Respond ONLY in JSON format:
{{
  "question": "...",
  "answer": "..."
}}
    """
)

# The chain is invoked with a dict that already has the "document" and "player"
# keys the prompt expects, so it is piped straight into the prompt. Wrapping it
# in {"document": RunnablePassthrough(), "player": RunnablePassthrough()} would
# hand the *entire* input dict to both placeholders, so the prompt would
# contain the dict's repr instead of the passage text and the player's name.
qa_chain = prompt | llm | parser

results = []
for doc in selected_docs:
    player = doc.metadata["player"]
    qa = qa_chain.invoke({"document": doc.page_content, "player": player})

    # The parser returns whatever JSON the model produced; skip replies that
    # are not an object with a string "question" instead of raising.
    question = qa.get("question") if isinstance(qa, dict) else None
    if not isinstance(question, str):
        continue

    # filter out any that failed to include the name
    if player.lower() in question.lower():
        qa["context"] = doc.page_content
        results.append(qa)


In [43]:
# Write the generated Q&A records as indented JSON, keeping non-ASCII
# characters unescaped.
with open("nba_generated_qa.json", "w", encoding="utf-8") as json_file:
    json_file.write(json.dumps(results, ensure_ascii=False, indent=2))

# Write the same records as a CSV table without the index column, using the
# "utf-8-sig" encoding.
df = pd.DataFrame(results)
df.to_csv("nba_generated_qa.csv", index=False, encoding="utf-8-sig")
