# 🦷 Q&A Generation for RAFT Dataset

This notebook generates question-answer pairs from dental text chunks.

**Input:** `chunks.jsonl` from notebook 01
**Output:** `qa_pairs.jsonl` for RAFT formatting

**API:** Uses Groq (FREE) or OpenAI for generation

## 1. Setup

In [None]:
# Install dependencies
!pip install -q groq openai tqdm

In [None]:
# Attach Google Drive so the dataset folder is reachable from this Colab runtime.
import os

from google.colab import drive

drive.mount('/content/drive')

# Input (chunks from notebook 01) and output (Q&A pairs) live side by side
# in one Drive folder.
DATA_DIR = "/content/drive/MyDrive/RAFT_dental_data"
CHUNKS_FILE = DATA_DIR + "/chunks.jsonl"
OUTPUT_FILE = DATA_DIR + "/qa_pairs.jsonl"

# Report up front whether the input file is where the later cells expect it.
print(
    "‚úì Found chunks file"
    if os.path.exists(CHUNKS_FILE)
    else "‚úó chunks.jsonl not found. Run notebook 01 first!"
)

In [None]:
# API Keys - Choose ONE:
# Option 1: Groq (FREE - recommended)
# Get your key at: https://console.groq.com/keys
GROQ_API_KEY = ""  # @param {type:"string"}

# Option 2: OpenAI (paid but higher quality)
OPENAI_API_KEY = ""  # @param {type:"string"}

# Select provider
USE_GROQ = True  # Set False to use OpenAI


def resolve_provider(use_groq, groq_key, openai_key):
    """Decide which API provider the notebook can actually use.

    Args:
        use_groq: The user's preference; True means "prefer Groq".
        groq_key: Groq API key as entered (may be empty or whitespace).
        openai_key: OpenAI API key as entered (may be empty or whitespace).

    Returns:
        "groq" when Groq is preferred and has a key, "openai" when an OpenAI
        key is available otherwise, and None when no usable key was entered.
    """
    if use_groq and groq_key.strip():
        return "groq"
    if openai_key.strip():
        return "openai"
    return None


_provider = resolve_provider(USE_GROQ, GROQ_API_KEY, OPENAI_API_KEY)

if _provider == "groq":
    # Keys pasted into a form field often carry stray whitespace.
    os.environ["GROQ_API_KEY"] = GROQ_API_KEY.strip()
    print("✓ Using Groq API (FREE)")
elif _provider == "openai":
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY.strip()
    # Keep the flag in sync with the provider announced below: the generator
    # is constructed from USE_GROQ, so leaving it True here would build a
    # Groq client even though only an OpenAI key is configured.
    USE_GROQ = False
    print("✓ Using OpenAI API")
else:
    print("⚠ No API key provided. Enter your key above.")

## 2. Load Chunks

In [None]:
import json
from tqdm.notebook import tqdm

# Load all chunks: one JSON object per line. Blank lines (e.g. a trailing
# newline at the end of the file) are skipped; a line that is not valid JSON
# still raises, because a corrupt input file should fail loudly here.
chunks = []
with open(CHUNKS_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:
            chunks.append(json.loads(line))

print(f"Loaded {len(chunks)} chunks")

# Sample chunk (only when there is one to show; an empty file would
# otherwise raise IndexError).
if chunks:
    print("\nSample chunk:")
    # .get() mirrors how QAGenerator.generate reads these optional fields.
    print(f"Source: {chunks[0].get('source', 'Unknown')}")
    print(f"Category: {chunks[0].get('category', 'Unknown')}")
    print(f"Text preview: {chunks[0]['text'][:300]}...")
else:
    print("\nNo chunks loaded. Check the output of notebook 01.")

## 3. Q&A Generator

In [None]:
import time
import re
from typing import List, Dict, Optional

# Prompts
#
# SYSTEM_PROMPT is sent unchanged as the system message. It is not passed
# through str.format, so the literal braces in its JSON example are safe.
# It asks for a JSON array of objects with "question", "answer",
# "difficulty" and "type" keys, and for quotes from the passage to be wrapped
# in ##begin_quote## / ##end_quote## markers.
SYSTEM_PROMPT = """You are an expert dental educator creating study materials for dental students.
Generate high-quality question-answer pairs from the provided dental textbook content.

Requirements:
1. Generate 3-5 questions per passage
2. Questions should test understanding, not just recall
3. Include a mix of: conceptual, clinical application, and comparison questions
4. Answers MUST include direct quotes from the text using ##begin_quote## and ##end_quote## markers
5. Answers should explain the concept, not just quote

Output format (JSON array):
[
  {
    "question": "What are the indications for...",
    "answer": "According to the text, ##begin_quote##exact quote here##end_quote##, this means that...",
    "difficulty": "medium",
    "type": "conceptual"
  }
]

Question types: conceptual, clinical, procedural, comparison, definition
Difficulty levels: easy, medium, hard"""

# USER_PROMPT_TEMPLATE is filled in with str.format for each chunk; its
# placeholders are {source}, {category}, {page_number} and {text}.
USER_PROMPT_TEMPLATE = """Generate Q&A pairs from this dental textbook passage:

Source: {source}
Category: {category}
Page: {page_number}

Content:
{text}

Generate 3-5 question-answer pairs. Return ONLY valid JSON array."""


class QAGenerator:
    """Generate Q&A pairs from text chunks through a chat-completions API.

    Either the Groq or the OpenAI client is used, chosen at construction
    time. Both clients read their API key from the environment.
    """

    # Default model per provider. Groq has retired "llama-3.1-70b-versatile";
    # "llama-3.3-70b-versatile" is its documented replacement.
    DEFAULT_GROQ_MODEL = "llama-3.3-70b-versatile"
    DEFAULT_OPENAI_MODEL = "gpt-4-turbo-preview"

    # Keys under which a model may wrap the array when it returns an object.
    _WRAPPER_KEYS = ("questions", "qa_pairs")

    # Appended to the user prompt in OpenAI JSON mode. That mode always yields
    # a top-level JSON object, so the model must be told which key to put the
    # array under; otherwise the parser has nothing reliable to look for.
    _JSON_OBJECT_HINT = (
        '\n\nThe response must be a single JSON object of the form '
        '{"qa_pairs": [ ... ]}, with the array of question-answer pairs as '
        'the value of "qa_pairs".'
    )

    def __init__(self, use_groq: bool = True, model: Optional[str] = None):
        """Create the API client.

        Args:
            use_groq: True for Groq, False for OpenAI.
            model: Optional model name overriding the provider default.
        """
        self.use_groq = use_groq

        # Imported lazily so only the selected provider's SDK is required.
        if use_groq:
            from groq import Groq
            self.client = Groq()
            default_model = self.DEFAULT_GROQ_MODEL
        else:
            from openai import OpenAI
            self.client = OpenAI()
            default_model = self.DEFAULT_OPENAI_MODEL

        self.model = model or default_model
        # Seconds to pause after each successful request.
        self.rate_limit_delay = 1.0 if use_groq else 0.5

    def generate(self, chunk: Dict) -> List[Dict]:
        """Generate Q&A pairs from a chunk.

        Args:
            chunk: Mapping with a required 'text' key and optional
                'source', 'category', 'page_number' and 'chunk_id' keys.

        Returns:
            A list of Q&A dicts, each tagged with the chunk's metadata and
            its full text under 'context_text'. An empty list is returned
            when the API call fails or nothing usable could be parsed.

        Raises:
            KeyError: If the chunk has no 'text' key.
        """
        user_prompt = USER_PROMPT_TEMPLATE.format(
            source=chunk.get('source', 'Unknown'),
            category=chunk.get('category', 'Unknown'),
            page_number=chunk.get('page_number', 0),
            text=chunk['text']
        )

        request = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            "temperature": 0.7,
            "max_tokens": 2000,
        }
        if not self.use_groq:
            request["response_format"] = {"type": "json_object"}
            request["messages"][1]["content"] = user_prompt + self._JSON_OBJECT_HINT

        try:
            response = self.client.chat.completions.create(**request)
            content = response.choices[0].message.content
            qa_pairs = self._parse_response(content)

            # Add metadata
            for qa in qa_pairs:
                qa['chunk_id'] = chunk.get('chunk_id', '')
                qa['source'] = chunk.get('source', '')
                qa['category'] = chunk.get('category', '')
                qa['page_number'] = chunk.get('page_number', 0)
                qa['context_text'] = chunk['text']

            time.sleep(self.rate_limit_delay)
            return qa_pairs

        except Exception as e:
            # Best effort: one failing chunk must not abort a long run.
            print(f"Error generating Q&A: {e}")
            return []

    def _parse_response(self, content: Optional[str]) -> List[Dict]:
        """Parse LLM response to extract Q&A pairs.

        The whole response is tried as JSON first, then the contents of the
        first markdown code fence, then the widest ``[ { ... } ]`` span.

        Args:
            content: Raw message text; may be None or empty.

        Returns:
            A list containing only dict items. Empty when nothing parseable
            is found.
        """
        if not content or not isinstance(content, str):
            return []

        candidates = [content]

        fenced = re.search(r'```(?:json)?\s*([\s\S]*?)```', content)
        if fenced:
            candidates.append(fenced.group(1))

        bracketed = re.search(r'\[\s*\{[\s\S]*\}\s*\]', content)
        if bracketed:
            candidates.append(bracketed.group(0))

        for candidate in candidates:
            try:
                data = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            pairs = self._extract_pairs(data)
            if pairs:
                return pairs

        return []

    @classmethod
    def _extract_pairs(cls, data) -> List[Dict]:
        """Normalise decoded JSON into a list of Q&A dicts.

        Accepts a bare array, an object wrapping the array under a known key,
        a single Q&A object, or an object whose only list value is the array.
        Non-dict items are dropped so callers can safely assign keys on every
        element.
        """
        if isinstance(data, dict):
            for key in cls._WRAPPER_KEYS:
                if isinstance(data.get(key), list):
                    data = data[key]
                    break
            else:
                if "question" in data:
                    data = [data]
                else:
                    list_values = [v for v in data.values() if isinstance(v, list)]
                    if len(list_values) != 1:
                        return []
                    data = list_values[0]

        if not isinstance(data, list):
            return []
        return [item for item in data if isinstance(item, dict)]


# Build the generator used by the cells below, for the provider chosen in
# the API-key cell.
generator = QAGenerator(USE_GROQ)
print(f"‚úì Initialized {generator.model}")

## 4. Test Generation

In [None]:
# Smoke test: run the generator on the first chunk only and show the result
# before committing to the full run.
test_chunk = chunks[0]
print(f"Testing with: {test_chunk['source']}")
print(f"Category: {test_chunk['category']}", end="\n\n")

test_qa = generator.generate(test_chunk)
print(f"Generated {len(test_qa)} Q&A pairs:\n")

for number, pair in enumerate(test_qa, start=1):
    question = pair.get('question', 'N/A')
    print(f"Q{number}: {question}")

    # Answers can be long; show only the first 200 characters.
    answer_preview = pair.get('answer', 'N/A')[:200]
    print(f"A{number}: {answer_preview}...")

    kind = pair.get('type', 'N/A')
    level = pair.get('difficulty', 'N/A')
    print(f"Type: {kind}, Difficulty: {level}")
    print()

## 5. Generate All Q&A Pairs

In [None]:
# Configuration
MAX_CHUNKS = None  # Set to number to limit, None for all
# Resume mode: when True and the output file exists, its pairs are kept and
# chunks whose chunk_id already appears there are skipped. When False,
# nothing is loaded, so the generation cell starts the file from scratch.
SKIP_EXISTING = True
CHECKPOINT_EVERY = 100  # Save progress every N chunks

# Check for existing progress
existing_qa = []
processed_chunk_ids = set()

if SKIP_EXISTING and os.path.exists(OUTPUT_FILE):
    unreadable_lines = 0
    with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                qa = json.loads(line)
            except json.JSONDecodeError:
                # Typically a final line cut short by an interrupted run.
                unreadable_lines += 1
                continue
            if not isinstance(qa, dict):
                unreadable_lines += 1
                continue
            existing_qa.append(qa)
            processed_chunk_ids.add(qa.get('chunk_id', ''))
    print(f"Found {len(existing_qa)} existing Q&A pairs")
    print(f"Skipping {len(processed_chunk_ids)} already processed chunks")
    if unreadable_lines:
        print(f"Ignored {unreadable_lines} unreadable line(s) in {OUTPUT_FILE}")

# Filter chunks to process
chunks_to_process = [c for c in chunks if c.get('chunk_id', '') not in processed_chunk_ids]
if MAX_CHUNKS:
    chunks_to_process = chunks_to_process[:MAX_CHUNKS]

print(f"\nChunks to process: {len(chunks_to_process)}")
# Rough estimate: the prompt asks for 3-5 pairs per chunk.
print(f"Estimated Q&A pairs: {len(chunks_to_process) * 4}")

In [None]:
# Generate Q&A pairs
#
# Pairs are appended to OUTPUT_FILE as they are produced so an interrupted
# run can be resumed. `failed` counts every chunk that contributed nothing:
# either an exception escaped generate(), or generate() returned an empty
# list (it reports API and parsing errors itself and returns []).
all_qa = existing_qa.copy()
failed = 0

with open(OUTPUT_FILE, 'a' if existing_qa else 'w', encoding='utf-8') as f:
    for i, chunk in enumerate(tqdm(chunks_to_process, desc="Generating Q&A")):
        try:
            qa_pairs = generator.generate(chunk)

            if qa_pairs:
                # One write per chunk, so a chunk is not left half-written
                # (and then treated as done on resume) if a later pair fails.
                f.write("".join(
                    json.dumps(qa, ensure_ascii=False) + "\n" for qa in qa_pairs
                ))
                all_qa.extend(qa_pairs)

        except Exception as e:
            failed += 1
            tqdm.write(f"Failed on chunk {chunk.get('chunk_id', i)}: {e}")
        else:
            if not qa_pairs:
                failed += 1
                tqdm.write(f"No Q&A pairs produced for chunk {chunk.get('chunk_id', i)}")

        # Checkpoint (runs after failed chunks too; a non-positive
        # CHECKPOINT_EVERY disables it instead of dividing by zero).
        if CHECKPOINT_EVERY and CHECKPOINT_EVERY > 0 and (i + 1) % CHECKPOINT_EVERY == 0:
            f.flush()
            tqdm.write(f"Checkpoint: {len(all_qa)} Q&A pairs saved")

print(f"\n✓ Generated {len(all_qa)} total Q&A pairs")
print(f"Failed chunks: {failed}")

## 6. Analyze Results

In [None]:
# Load final Q&A pairs
from collections import Counter

qa_pairs = []
unreadable_lines = 0
with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            qa_pairs.append(json.loads(line))
        except json.JSONDecodeError:
            # E.g. a final line cut short by an interrupted run.
            unreadable_lines += 1

print(f"Total Q&A pairs: {len(qa_pairs)}")
if unreadable_lines:
    print(f"Ignored {unreadable_lines} unreadable line(s)")

# Distribution by category (top 10)
print("\nBy Category:")
category_counts = Counter(qa.get('category', 'Unknown') for qa in qa_pairs)

for cat, count in category_counts.most_common(10):
    print(f"  {cat}: {count}")

# Distribution by type
print("\nBy Question Type:")
type_counts = Counter(qa.get('type', 'unknown') for qa in qa_pairs)

for qtype, count in type_counts.most_common():
    print(f"  {qtype}: {count}")

# Check citation markers. Only string answers can carry the marker; a model
# may occasionally emit null or a non-string answer.
with_citations = sum(
    1 for qa in qa_pairs
    if isinstance(qa.get('answer', ''), str) and '##begin_quote##' in qa.get('answer', '')
)
# Guard the percentage: an empty result set would otherwise divide by zero.
citation_rate = 100 * with_citations / len(qa_pairs) if qa_pairs else 0.0
print(f"\nAnswers with citations: {with_citations}/{len(qa_pairs)} ({citation_rate:.1f}%)")

In [None]:
# Spot-check: print up to three randomly chosen pairs for manual review.
import random

print("Sample Q&A pairs:\n")
sample_size = min(3, len(qa_pairs))
divider = "-" * 50
for picked in random.sample(qa_pairs, sample_size):
    category = picked.get('category', 'N/A')
    print(f"Category: {category}")

    question = picked.get('question', 'N/A')
    print(f"Q: {question}")

    # Show only the first 300 characters of the answer.
    answer_preview = picked.get('answer', 'N/A')[:300]
    print(f"A: {answer_preview}...")

    print(divider)

## 7. Summary

In [None]:
# Final report for this notebook: size of the output file plus the counts
# computed in the analysis cell.
file_size = os.path.getsize(OUTPUT_FILE) / (1024**2)  # bytes -> MiB

# Guard the percentage: with no Q&A pairs the division would raise
# ZeroDivisionError and hide the rest of the summary.
summary_citation_rate = 100 * with_citations / len(qa_pairs) if qa_pairs else 0.0

print("="*60)
print("Q&A GENERATION COMPLETE")
print("="*60)
print(f"📝 Total Q&A pairs: {len(qa_pairs)}")
print(f"📁 Output file: {OUTPUT_FILE}")
print(f"💾 File size: {file_size:.1f} MB")
print(f"📊 Categories covered: {len(category_counts)}")
print(f"✅ Citation rate: {summary_citation_rate:.1f}%")
print("="*60)
print("Next: Run 03_raft_dataset_creation.ipynb")
print("="*60)