## Generate questions with OpenAI

In [1]:
import openai 
from openai import OpenAI
import pickle
import json

In [5]:
# Client object used by generate_questions() for chat-completion requests.
# NOTE(review): OPENAI_API_KEY is not defined anywhere in the visible file;
# it presumably has to be set in an earlier cell, otherwise this line
# raises NameError. Confirm where the key is expected to come from.
openai_client = OpenAI(api_key = OPENAI_API_KEY)

## Step 1: create a unique ID for each question

In [2]:
import requests
import hashlib
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps the cell from hanging forever if the host never answers.
docs_response = requests.get(docs_url, timeout=30)
# Fail with a clear HTTP error instead of a confusing JSON decode error when
# the server returns an error page (404, 5xx, rate limiting, ...).
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into one list: every entry of
# documents_raw has a 'course' name and a 'documents' list, and each document
# gets the name of its course stored under the 'course' key.
documents = []

for course in documents_raw:
    for doc in course['documents']:
        doc['course'] = course['course']
        documents.append(doc)

In [3]:
def generate_unique_question_id(doc):
    """Return a short, deterministic id for a document.

    The id is the first 10 hex characters of the MD5 digest of the
    document's course, its question and the first 10 characters of its text.

    Args:
        doc: mapping with string values under 'course', 'question' and 'text'.

    Returns:
        A 10-character lowercase hexadecimal string.

    Raises:
        KeyError: if one of the three keys is missing from ``doc``.
    """
    # Double quotes outside, single quotes inside: reusing the same quote
    # character inside an f-string is a SyntaxError before Python 3.12.
    fingerprint = f"{doc['course']} - {doc['question']} - {doc['text'][:10]}"
    # Named `digest` rather than `id` so the builtin id() is not shadowed.
    digest = hashlib.md5(fingerprint.encode('utf-8')).hexdigest()
    return digest[:10]

In [4]:
# Store the generated id on every document, in place, under the 'id' key.
for doc in documents:
    doc.update(id=generate_unique_question_id(doc))

In [6]:
# Write the documents list to document.json as indented UTF-8 JSON, keeping
# non-ASCII characters as-is rather than \u-escaping them.
with open("document.json", mode="w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(documents, ensure_ascii=False, indent=4))

In [9]:
from  collections import defaultdict
# Bucket the documents by their id; a bucket holding more than one document
# means several documents ended up with the same id.
grouped = defaultdict(list)
for d in documents:
    bucket = grouped[d['id']]
    bucket.append(d)

## Step 2: create a prompt

In [10]:
# Prompt filled in by generate_questions() via str.format: {section},
# {question} and {text} are replaced with the fields of one document.
# The example output uses double quotes so that it really is the JSON array
# the instruction asks for (single-quoted items are not valid JSON).
prompt_template = ''' 
You are a student. Based on the record below, generate 5 new user questions that are as short as possible. 

Record:
section: {section}
question: {question}
answer: {text}

Return only a JSON array of the questions, without any code blocks or extra text, in this format:
["question1", "question2", "question3", "question4", "question5"]
'''.strip()

In [11]:
def generate_questions(doc):
    """Ask the chat model for new user questions about one document.

    The fields of ``doc`` are substituted into ``prompt_template`` and the
    result is sent as a single user message to the ``gpt-4o-mini`` model
    through ``openai_client``.

    Args:
        doc: mapping supplying the placeholders used by ``prompt_template``.

    Returns:
        The message content of the first choice in the response, unparsed.
    """
    user_message = {'role': 'user', 'content': prompt_template.format(**doc)}
    completion = openai_client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )
    return completion.choices[0].message.content

In [15]:
# Raw model reply per document id. A document whose id already has a reply
# is skipped, so each id triggers at most one generate_questions() call.
results = {}
for doc in documents:
    doc_id = doc['id']
    if doc_id not in results:
        results[doc_id] = generate_questions(doc)

In [25]:
# Lookup table from document id to the course that document belongs to.
# When several documents share an id, the last one in `documents` wins.
id_course_pairs = ((doc['id'], doc['course']) for doc in documents)
doc_index = dict(id_course_pairs)

In [59]:
import re
def safe_parse_list(text):
    """Parse a model reply that is expected to contain a list of questions.

    The reply is first read as JSON (the format the prompt asks for). If that
    fails it is read as a Python list literal, after escaping apostrophes that
    sit inside words (e.g. "I'm") so they do not end a single-quoted item
    early. A trailing comma before the closing bracket is tolerated.

    Args:
        text: raw reply string.

    Returns:
        The parsed items as a list, or an empty list when the reply cannot be
        parsed or does not describe a list.
    """
    # Imported locally because nothing else in this file imports `ast`;
    # without it ast.literal_eval raised NameError, which the previous broad
    # `except Exception` silently turned into [] for every input.
    import ast

    text = text.strip()
    # Remove trailing commas before closing bracket
    text = re.sub(r",\s*\]", "]", text)

    try:
        parsed = json.loads(text)
    except ValueError:
        # Not valid JSON (e.g. single-quoted items): fall back to a Python
        # literal, escaping apostrophes inside words like "I'm" first.
        escaped = re.sub(r"(?<=\w)'(?=\w)", r"\\'", text)
        try:
            parsed = ast.literal_eval(escaped)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return []

    # Callers iterate over the result, so anything that is not a sequence of
    # items (a bare string, number, dict, ...) counts as a failed parse.
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return []

In [76]:
# Flatten the per-document replies into (question, doc_id, course) rows.
# The loop variable is named doc_id rather than id: at module level a
# variable called `id` replaces the builtin id() for the rest of the session.
finl_results = []
for doc_id, raw_questions in results.items():
    course = doc_index[doc_id]
    # A reply that cannot be parsed yields an empty list and adds no rows.
    for q in safe_parse_list(raw_questions):
        finl_results.append((q, doc_id, course))

In [74]:
import pandas as pd

In [78]:
# One row per generated question, with the id and course of its source document.
output_columns = ['question', 'doc_id', 'course']
finl_csv = pd.DataFrame(data=finl_results, columns=output_columns)

In [79]:
# Save the generated questions next to the notebook.
# NOTE(review): no `index=` argument is passed, so pandas' default applies
# and the row index is presumably written as an extra unnamed first column;
# confirm that downstream readers expect it.
finl_csv.to_csv('prepared_question_evaluation.csv')