# 🤖 Generate Ukrainian QA Dataset with GPT-4
This notebook generates question–answer pairs from chunks of Ukrainian academic text using OpenAI GPT-4 and saves them as an Alpaca-style JSONL dataset.

In [None]:
!pip install tqdm
!pip install --upgrade openai

In [None]:
import json
import os
from getpass import getpass

from openai import OpenAI
from tqdm import tqdm

# Read the OpenAI API key from the OPENAI_API_KEY environment variable, falling back
# to a hidden interactive prompt, so the secret is never saved in the notebook source.
key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
client = OpenAI(api_key=key)


In [None]:
# Load text chunks
try:
    from google.colab import files
except ImportError:
    # Outside Colab there is no upload widget; chunks.txt must already be in the
    # working directory.
    files = None

# In Colab, files.upload() stores the uploaded file(s) in the working directory and
# returns a {filename: contents} mapping. Read the name it reports so the file that
# was actually uploaded is the one loaded, whatever it is called.
uploaded = files.upload() if files is not None else {}  # load chunks.txt
chunks_path = next(iter(uploaded), "chunks.txt")

# read file: one chunk per line; lines of 30 characters or fewer (after stripping)
# are skipped
with open(chunks_path, "r", encoding="utf-8") as f:
    stripped_lines = (line.strip() for line in f)
    chunks = [line for line in stripped_lines if len(line) > 30]

print(f"Loaded {len(chunks)} chunks")

In [None]:
def build_prompt(text):
    """Build the QA-generation prompt for a single text chunk.

    Args:
        text: The chunk to embed under the trailing "Text:" section. It is
            interpolated with default string formatting.

    Returns:
        The full prompt string: a leading newline, the task instructions, the
        required JSON output format, and finally the chunk followed by a newline.
    """
    prompt_lines = [
        "",  # the prompt starts with a newline
        "You are a highly skilled academic assistant. Read the following Ukrainian technical academic text and generate 3–5 **meaningful**, **detailed** QA pairs. Each question should be specific, based on the actual content (not general), and each answer should be concise but technically accurate.",
        "",
        "Format your output as a JSON list with elements like:",
        '{"question": "...", "answer": "..."}',
        "",
        "Only return the JSON list — no comments or explanations.",
        "",
        "Text:",
        f"{text}",
        "",  # the prompt ends with a newline
    ]
    return "\n".join(prompt_lines)

In [None]:
def _parse_qa_json(content):
    """Turn a raw model reply into a list of well-formed QA dicts.

    Raises json.JSONDecodeError when no JSON can be recovered from the reply.
    """
    raw = (content or "").strip()
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        # The reply may wrap the list in Markdown code fences or surrounding prose;
        # retry on the outermost [...] span before giving up.
        start, end = raw.find("["), raw.rfind("]")
        if start == -1 or end <= start:
            raise
        parsed = json.loads(raw[start:end + 1])

    if isinstance(parsed, dict):
        # A single QA object instead of a list: treat it as a one-element list.
        parsed = [parsed]
    if not isinstance(parsed, list):
        return []
    # Keep only items the caller can index with 'question' and 'answer'.
    return [
        item for item in parsed
        if isinstance(item, dict) and "question" in item and "answer" in item
    ]


def generate_qa(text, model="gpt-4", temperature=0.5):
    """Ask the chat model for QA pairs about ``text`` and return them parsed.

    Args:
        text: The chunk to generate questions and answers for.
        model: Chat model name passed to the API (defaults to the original "gpt-4").
        temperature: Sampling temperature passed to the API (defaults to 0.5).

    Returns:
        A list of dicts, each containing at least the keys "question" and
        "answer". Returns an empty list if the request fails or the reply
        cannot be parsed as JSON; the error is printed rather than raised.
    """
    prompt = build_prompt(text)
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature
        )
        return _parse_qa_json(response.choices[0].message.content)
    except Exception as e:
        # Deliberately best-effort: one failed chunk must not abort a long batch run.
        print(f"Error: {e}")
        return []

In [None]:
MAX_CHUNKS = 100  # upper bound on how many chunks are sent to the API in one run

alpaca_data = []
# Write every record as soon as it is produced so that a failure part-way through
# the run (quota, network, interrupted kernel) keeps the pairs generated so far.
with open('qa_dataset.jsonl', 'w', encoding='utf-8') as f:
    for chunk in tqdm(chunks[:MAX_CHUNKS]):
        qa_pairs = generate_qa(chunk)
        for pair in qa_pairs:
            # Skip malformed items instead of aborting the whole run with a
            # KeyError/TypeError on a single bad model reply.
            if not (isinstance(pair, dict) and 'question' in pair and 'answer' in pair):
                continue
            item = {
                "instruction": "Answer the question based on the text.",
                "input": f"Text: {chunk}\nQuestion: {pair['question']}",
                "output": pair['answer']
            }
            alpaca_data.append(item)
            f.write(json.dumps(item, ensure_ascii=False) + '\n')
        # Push this chunk's records to disk before the next (slow) API call.
        f.flush()

print("✅ Saved as qa_dataset.jsonl")