In [2]:
# === CELL 1: INSTALL & IMPORT ===
print("Installing required packages...")

!pip install -q openai pandas tqdm

import openai
import pandas as pd
import json
import time
from tqdm import tqdm
from google.colab import userdata

print("‚úì Packages installed")
print("‚úì Imports complete")

Installing required packages...
‚úì Packages installed
‚úì Imports complete


In [3]:
# === CELL 2: SET UP OPENAI API KEY ===
print("Setting up OpenAI API...")

try:
    # Get API key from Colab Secrets
    OPENAI_API_KEY = userdata.get('Colab_Medical')
    openai.api_key = OPENAI_API_KEY
    print("‚úì API key loaded from Colab Secrets: 'Colab_Medical'")
    print(f"‚úì Key preview: {OPENAI_API_KEY[:8]}...{OPENAI_API_KEY[-4:]}")
except Exception as e:
    print(f"‚ùå Error loading API key: {e}")
    print("Please add 'Colab_Medical' to Colab Secrets (üîë icon in left sidebar)")
    raise

# Test API connection
print("\nTesting API connection...")
try:
    test_response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": "Say 'API connection successful'"}],
        max_tokens=10
    )
    print("‚úì API connection successful!")
except Exception as e:
    print(f"‚ùå API test failed: {e}")
    raise

print("\nReady to proceed!")

Setting up OpenAI API...
‚úì API key loaded from Colab Secrets: 'Colab_Medical'
‚úì Key preview: sk-proj-...FzEA

Testing API connection...
‚úì API connection successful!

Ready to proceed!


In [4]:
# === CELL 3: LOAD & CLEAN MEDQA DATASET ===
print("Loading MedQA-USMLE-4-options dataset from HuggingFace...")

!pip install -q datasets

from datasets import load_dataset

# Load dataset
print("Downloading dataset...")
dataset = load_dataset("GBaker/MedQA-USMLE-4-options", split="train")
print(f"‚úì Loaded {len(dataset)} samples")

# Inspect raw structure
print("\n--- Raw Sample Structure ---")
print(dataset[0])
print("\n--- All Column Names ---")
print(dataset.column_names)

# Clean and extract only what we need
print("\n--- Cleaning dataset ---")

cleaned_data = []
for idx, item in enumerate(dataset):
    try:
        # Extract core fields (adjust based on actual structure)
        cleaned_item = {
            "id": idx,
            "question": item.get("question", ""),
            "options": item.get("options", {}),  # Expecting dict like {"A": "...", "B": "...", ...}
            "answer_letter": item.get("answer_idx", ""),  # or "answer" depending on dataset
            "answer_text": ""  # We'll extract this from options
        }

        # Extract answer text from options
        if cleaned_item["answer_letter"] and cleaned_item["options"]:
            cleaned_item["answer_text"] = cleaned_item["options"].get(cleaned_item["answer_letter"], "")

        cleaned_data.append(cleaned_item)
    except Exception as e:
        print(f"Warning: Skipped sample {idx} due to error: {e}")
        continue

# Convert to DataFrame
df = pd.DataFrame(cleaned_data)
print(f"\n‚úì Cleaned dataset: {len(df)} samples")
print(f"‚úì Columns: {df.columns.tolist()}")

# Show first sample
print("\n--- First Cleaned Sample ---")
print(json.dumps(df.iloc[0].to_dict(), indent=2))

print("\n‚úì Dataset ready!")

Loading MedQA-USMLE-4-options dataset from HuggingFace...
Downloading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/654 [00:00<?, ?B/s]

phrases_no_exclude_train.jsonl:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

phrases_no_exclude_test.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/10178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1273 [00:00<?, ? examples/s]

‚úì Loaded 10178 samples

--- Raw Sample Structure ---
{'question': 'A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7¬∞F (36.5¬∞C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the following is the best treatment for this patient?', 'answer': 'Nitrofurantoin', 'options': {'A': 'Ampicillin', 'B': 'Ceftriaxone', 'C': 'Doxycycline', 'D': 'Nitrofurantoin'}, 'meta_info': 'step2&3', 'answer_idx': 'D', 'metamap_phrases': ['23 year old pregnant woman', 'weeks presents', 'burning', 'urination', 'states', 'started 1 day', 'worsening', 'drinking', 'water', 'taking cranberry extrac

In [6]:
# === CELL 4: DEFINE ALL 6 PROMPTS ===
print("Initializing refined prompt templates...")

# -------------------------------
# PROMPT 1 ‚Äî Student (multi-angle)
# -------------------------------
PROMPT_1 = """You are a medical expert analyzing a clinical case. Think broadly across multiple angles before deciding on an answer. Ground all reasoning in specific medical knowledge.

**Question:**
{question}

**Options (A‚ÄìD):**
{options}

**Instructions:**
1. Generate 3-5 ANGLES (e.g., differential diagnosis, pathophysiology, diagnostic tests, timeline, red flags, epidemiology)
2. For each angle: write brief notes and an intermediate conclusion
3. List relevant MEDICAL KNOWLEDGE: pathophysiology, typical presentations, diagnostic criteria/key tests (specific to THIS case)
4. Build a DIFFERENTIAL (2-4 diagnoses): for each list supports, counters, likelihood (0-1)
5. Write SYNTHESIS: pick best hypothesis with evidence; explain why alternatives fail
6. Map to OPTIONS: choose A/B/C/D with reasoning and confidence (0-1)
7. End with: "The answer is: <LETTER>"

**Requirements:**
- Be case-specific (no generic textbook content)
- Target: 300-500 tokens total
- Output JSON ONLY

**JSON Schema:**
```json
{{
  "thoughts": [
    {{
      "id": "T1",
      "angle": "differential | mechanism | tests-first | timeline | red-flags | epidemiology | other",
      "notes": ["...", "..."],
      "intermediate_conclusion": "..."
    }}
  ],
  "medical_knowledge": {{
    "pathophysiology": ["...", "..."],
    "typical_presentations": ["...", "..."],
    "diagnostic_criteria_or_key_tests": ["...", "..."]
  }},
  "differentials": [
    {{
      "dx": "...",
      "supports": ["...", "..."],
      "counters": ["...", "..."],
      "likelihood": 0.00
    }}
  ],
  "synthesis": {{
    "best_hypothesis": "...",
    "why_best": ["...", "..."],
    "why_others_not": ["DX2: reason", "DX3: reason"]
  }},
  "option_alignment": {{
    "chosen_letter": "A|B|C|D",
    "why": "...",
    "confidence": 0.00
  }},
  "final_line": "The answer is: <LETTER>"
}}
```
"""

# -------------------------------
# PROMPT 2 ‚Äî Teacher-Judge
# -------------------------------
PROMPT_2_VERIFIER = """You are the teacher-judge evaluating the student's medical reasoning against the correct answer. Be specific, concise, and return JSON only.

**Question:**
{question}

**Options (A‚ÄìD):**
{options}

**Correct Answer:**
{answer_letter}: {answer_text}

**Student's Analysis:**
{clinical_analysis_json}

**Evaluation Criteria:**
1. Key clinical facts identified
2. Medical grounding (pathophysiology, presentations, diagnostic criteria/tests)
3. Differential quality (supports/counters, reasonable likelihoods)
4. Synthesis logic (best hypothesis fits; alternatives clearly rejected)
5. Option mapping (student choice vs gold; rationale coherent)
6. Factual accuracy (no medical errors)

**Output JSON ONLY (no extra text):**
```json
{{
  "verdict": "Correct" | "Error",
  "correct_reasoning": [
    "List the correct reasoning points here"
  ],
  "incorrect_reasoning": [
    "List the incorrect reasoning points here"
  ],
  "correction_guidance": {{
    "must_add": [
      "Missing clinical factor/criteria/test/differential (one sentence)"
    ],
    "must_fix": [
      "Faulty inference ‚Üí corrected form (one sentence)"
    ],
    "key_knowledge": [
      "Medical fact grounding the correction (one sentence)"
    ]
  }},
  "improvement_suggestions": [
    "Enhancement to strengthen logic/clarity (1-3 items, one sentence each)"
  ]
}}
```
"""

# -------------------------------
# PROMPT 3 ‚Äî Stitch/Correct
# -------------------------------
PROMPT_3_CORRECTION = """You are the student revising your reasoning based on teacher feedback. Produce a clean, coherent clinical reasoning chain that supports the gold answer.

**Question:**
{question}

**Options (A‚ÄìD):**
{options}

**Gold Answer:**
{gold_letter}: {gold_text}

**Teacher Feedback:**
{teacher_json}

**Your Previous Analysis:**
{student_json}

**Revision Note (if any):**
{revise_note}

**Instructions:**
- Preserve correct reasoning points
- Remove incorrect reasoning points
- Integrate all "must_add" and "key_knowledge" items
- Fix flawed inferences per "must_fix"
- Keep edits minimal unless needed for coherence
- Show clear arc: observation ‚Üí hypotheses ‚Üí evaluation ‚Üí synthesis ‚Üí decision (mapped to gold)

**Output JSON ONLY:**
```json
{{
  "reasoning_chain": [
    {{"step": "observation", "content": "..."}},
    {{"step": "key_facts", "content": "..."}},
    {{"step": "medical_knowledge", "content": "..."}},
    {{"step": "hypotheses", "content": "..."}},
    {{"step": "differential_evaluation", "content": "..."}},
    {{"step": "criteria_or_tests", "content": "..."}},
    {{"step": "mechanism_link", "content": "..."}},
    {{"step": "counterarguments", "content": "..."}},
    {{"step": "synthesis", "content": "..."}},
    {{"step": "decision", "content": "State the correct option and why, consistent with gold"}}
  ]
}}
```
"""

# -------------------------------
# PROMPT 2.1 ‚Äî Ultralight Gate
# -------------------------------
PROMPT_2_DOT1_LIGHT = """You are the teacher-judge doing an ultralight verification. You must decide Pass or Revise.

**Teacher Feedback:**
{teacher_compact}

**Stitched Reasoning Chain:**
{stitched_json}

**Decision Rule:**
- Choose "Pass" if: the stitched reasoning preserves correct points, removes incorrect points, integrates must_add/must_fix/key_knowledge, and supports the gold answer
- Choose "Revise" if: any of the above is missing or incorrect

**CRITICAL OUTPUT REQUIREMENTS:**

1. If you choose "Pass":
   - Set verdict to "Pass"
   - Set revise_note to empty string ""

2. If you choose "Revise":
   - Set verdict to "Revise"
   - Set revise_note to ONE clear sentence identifying the specific issue to fix (MUST NOT be empty)

**Output JSON ONLY (no extra text):**
```json
{{
  "verdict": "Pass" | "Revise",
  "revise_note": ""
}}
```

Example outputs:
- Pass: {{"verdict": "Pass", "revise_note": ""}}
- Revise: {{"verdict": "Revise", "revise_note": "The synthesis step does not integrate the key_knowledge about mechanism"}}
"""

# -------------------------------
# PROMPT 4 ‚Äî Conversational Reformat
# -------------------------------
PROMPT_4_REFORMAT = """You are converting medical reasoning into natural conversational form.

**Structured Reasoning:**
{reasoning_chain_json}

**Question:**
{question}

**Task:** Transform the structured reasoning above into flowing, natural thought - as if thinking through the problem aloud.

**Key points:**
- Remove all structure and formal organization
- Write as continuous paragraphs with natural flow
- Use conversational language: "Okay," "Hmm," "Wait," "So," "Let me think," "Hold on"
- Show reasoning developing naturally - question your thinking, reconsider, explore alternatives
- Make it feel intuitive and human
- Target length: 400-500 tokens

**Avoid any headers, lists, or formal sections.**

**Output JSON ONLY:**
```json
{{
  "natural_reasoning": "..."
}}
```
"""

# -------------------------------
# PROMPT 5 ‚Äî Professional Response
# -------------------------------
PROMPT_5_RESPONSE = """Based on the reasoning provided, generate a clear professional answer to the question.

**Reasoning:**
{natural_reasoning}

**Question:**
{question}

**Task:** Write a professional medical response that:
- Answers the question directly
- Explains the mechanism
- Connects to clinical evidence
- Uses professional medical language

**Length:** 150-200 tokens

**Output JSON ONLY:**
```json
{{
  "response": "your professional medical answer here"
}}
```
"""

print("‚úì PROMPT_1 ready")
print("‚úì PROMPT_2_VERIFIER ready")
print("‚úì PROMPT_3_CORRECTION ready")
print("‚úì PROMPT_2_DOT1_LIGHT ready")
print("‚úì PROMPT_4_REFORMAT ready")
print("‚úì PROMPT_5_RESPONSE ready")
print("\n‚úì All prompts loaded successfully!")

Initializing refined prompt templates...
‚úì PROMPT_1 ready
‚úì PROMPT_2_VERIFIER ready
‚úì PROMPT_3_CORRECTION ready
‚úì PROMPT_2_DOT1_LIGHT ready
‚úì PROMPT_4_REFORMAT ready
‚úì PROMPT_5_RESPONSE ready

‚úì All prompts loaded successfully!


In [7]:
# === CELL 5: HELPER FUNCTIONS (WORKING VERSION) ===
print("Defining helper functions...")

import time
import random
import re

def extract_json_from_response(text):
    """Extract JSON from markdown code blocks or raw text."""
    json_match = re.search(r'```json\s*(.*?)\s*```', text, re.DOTALL)
    if json_match:
        json_str = json_match.group(1)
    else:
        json_match = re.search(r'\{.*\}', text, re.DOTALL)
        if json_match:
            json_str = json_match.group(0)
        else:
            json_str = text

    try:
        return json.loads(json_str)
    except json.JSONDecodeError as e:
        print(f"‚ö† JSON parse error: {e}")
        return None


def call_gpt4o(prompt, temperature=None):
    """Call GPT-4o with JSON mode."""
    try:
        params = {
            "model": "gpt-4o",
            "messages": [{"role": "user", "content": prompt}],
            "response_format": {"type": "json_object"}
        }
        if temperature is not None:
            params["temperature"] = temperature

        response = openai.chat.completions.create(**params)
        return response.choices[0].message.content
    except Exception as e:
        print(f"‚ùå GPT-4o error: {e}")
        return None


def call_gpt5_verifier(prompt):
    """
    Call GPT-5 for verification (teacher model).
    NO temperature parameter - gpt-5 only supports default.
    """
    max_retries = 3
    for attempt in range(max_retries):
        try:
            print("   ‚Ñπ Using gpt-5 for verification...")
            response = openai.chat.completions.create(
                model="gpt-5",
                messages=[{"role": "user", "content": prompt}],
                response_format={"type": "json_object"}
            )
            content = response.choices[0].message.content

            if not content or content.strip() == "":
                raise ValueError("Empty response from gpt-5")

            print("   ‚úì Verification successful with gpt-5")
            return content

        except Exception as e:
            print(f"   ‚ö†Ô∏è GPT-5 attempt {attempt+1}/{max_retries} failed: {e}")
            if attempt < max_retries - 1:
                wait = 2 + random.uniform(0, 1)
                time.sleep(wait)
            else:
                print("   ‚ö†Ô∏è Falling back to gpt-4o for verification...")
                # Fallback to gpt-4o
                return call_gpt4o(prompt, temperature=0.3)

    return None


def format_options(options_dict):
    return "\n".join([f"{k}: {v}" for k, v in options_dict.items()])


def compress_teacher_feedback(p2_output):
    correction = p2_output.get("correction_guidance", {})
    return {
        "must_add": correction.get("must_add", []),
        "must_fix": correction.get("must_fix", []),
        "key_knowledge": correction.get("key_knowledge", []),
        "forbidden": p2_output.get("incorrect_reasoning", []),
        "correct_points": p2_output.get("correct_reasoning", [])
    }


print("‚úì extract_json_from_response() defined")
print("‚úì call_gpt4o() defined")
print("‚úì call_gpt5_verifier() defined (with gpt-4o fallback)")
print("‚úì format_options() defined")
print("‚úì compress_teacher_feedback() defined")
print("\n‚úì All helpers ready!")

Defining helper functions...
‚úì extract_json_from_response() defined
‚úì call_gpt4o() defined
‚úì call_gpt5_verifier() defined (with gpt-4o fallback)
‚úì format_options() defined
‚úì compress_teacher_feedback() defined

‚úì All helpers ready!


In [1]:
# === CELL 6: PIPELINE WITH GPT-5 VERIFIER ===
print("Defining pipeline...")

def process_single_sample(sample, verbose=True):
    question = sample["question"]
    options = sample["options"]
    answer_letter = sample["answer_letter"]
    answer_text = sample["answer_text"]

    if verbose:
        print(f"\n{'='*60}")
        print(f"Sample {sample['id']} | Gold: {answer_letter}: {answer_text}")
        print(f"{'='*60}")

    try:
        # P1: Student (gpt-4o)
        if verbose: print("\n[1/6] P1 (Student) [gpt-4o]...")
        p1_prompt = PROMPT_1.format(question=question, options=format_options(options))
        p1_response = call_gpt4o(p1_prompt, temperature=0.7)
        if not p1_response:
            raise Exception("P1 failed")
        p1_output = extract_json_from_response(p1_response)
        if not p1_output:
            raise Exception("P1 JSON parsing failed")

        if verbose:
            print(f"   Student chose: {p1_output.get('option_alignment', {}).get('chosen_letter', '?')}")

        # P2: Teacher (gpt-5 with fallback)
        if verbose: print("\n[2/6] P2 (Teacher) [gpt-5]...")
        p2_prompt = PROMPT_2_VERIFIER.format(
            question=question,
            options=format_options(options),
            answer_letter=answer_letter,
            answer_text=answer_text,
            clinical_analysis_json=json.dumps(p1_output, indent=2)
        )
        p2_response = call_gpt5_verifier(p2_prompt)
        if not p2_response:
            raise Exception("P2 failed")
        p2_output = extract_json_from_response(p2_response)
        if not p2_output:
            raise Exception("P2 JSON parsing failed")

        if verbose:
            print(f"   Verdict: {p2_output.get('verdict', '?')}")

        teacher_compact = compress_teacher_feedback(p2_output)
        teacher_compact["gold_text"] = f"{answer_letter}: {answer_text}"

        # P3: Stitch (gpt-4o)
        if verbose: print("\n[3/6] P3 (Stitch) [gpt-4o]...")
        p3_prompt = PROMPT_3_CORRECTION.format(
            question=question,
            options=format_options(options),
            gold_letter=answer_letter,
            gold_text=answer_text,
            teacher_json=json.dumps(teacher_compact, indent=2),
            student_json=json.dumps(p1_output, indent=2),
            revise_note=""
        )
        p3_response = call_gpt4o(p3_prompt, temperature=0.5)
        if not p3_response:
            raise Exception("P3 failed")
        p3_output = extract_json_from_response(p3_response)
        if not p3_output:
            raise Exception("P3 JSON parsing failed")

        if verbose:
            print(f"   Steps: {len(p3_output.get('reasoning_chain', []))}")

        # P2.1: Gate (gpt-5 with fallback)
        if verbose: print("\n[4/6] P2.1 (Gate) [gpt-5]...")
        p2_1_prompt = PROMPT_2_DOT1_LIGHT.format(
            teacher_compact=json.dumps(teacher_compact, indent=2),
            stitched_json=json.dumps(p3_output, indent=2)
        )
        gate_response = call_gpt5_verifier(p2_1_prompt)
        if not gate_response:
            raise Exception("P2.1 failed")
        gate_output = extract_json_from_response(gate_response)
        if not gate_output:
            raise Exception("P2.1 JSON parsing failed")

        gate_verdict = gate_output.get("verdict", "Pass")
        if verbose:
            print(f"   Gate: {gate_verdict}")

        # Revise if needed
        if gate_verdict == "Revise":
            if verbose: print("\n[4.1/6] Revising...")
            p3_prompt_rev = PROMPT_3_CORRECTION.format(
                question=question,
                options=format_options(options),
                gold_letter=answer_letter,
                gold_text=answer_text,
                teacher_json=json.dumps(teacher_compact, indent=2),
                student_json=json.dumps(p1_output, indent=2),
                revise_note=gate_output.get("revise_note", "")
            )
            p3_response = call_gpt4o(p3_prompt_rev, temperature=0.5)
            if p3_response:
                p3_output = extract_json_from_response(p3_response)

        # P4: Conversational (gpt-4o)
        if verbose: print("\n[5/6] P4 (Conversational) [gpt-4o]...")
        p4_prompt = PROMPT_4_REFORMAT.format(
            reasoning_chain_json=json.dumps(p3_output.get("reasoning_chain", []), indent=2),
            question=question
        )
        p4_response = call_gpt4o(p4_prompt, temperature=0.7)
        if not p4_response:
            raise Exception("P4 failed")
        p4_output = extract_json_from_response(p4_response)
        if not p4_output:
            raise Exception("P4 JSON parsing failed")

        if verbose:
            print(f"   Words: {len(p4_output.get('natural_reasoning', '').split())}")

        # P5: Response (gpt-4o)
        if verbose: print("\n[6/6] P5 (Response) [gpt-4o]...")
        p5_prompt = PROMPT_5_RESPONSE.format(
            natural_reasoning=p4_output.get("natural_reasoning", ""),
            question=question
        )
        p5_response = call_gpt4o(p5_prompt, temperature=0.6)
        if not p5_response:
            raise Exception("P5 failed")
        p5_output = extract_json_from_response(p5_response)
        if not p5_output:
            p5_output = {"response": p5_response}  # Fallback

        if verbose: print("\n‚úÖ Done!")



        return {
            "Question": question,
            "Guided_Solution": p4_output.get("natural_reasoning", ""),
            "Final_Response": p5_output.get("response", ""),
            "metadata": {
                "sample_id": sample["id"],
                "gold_answer": f"{answer_letter}: {answer_text}",
                "student_answer": p1_output.get("option_alignment", {}).get("chosen_letter", "?"),
                "teacher_verdict": p2_output.get("verdict", "?"),
                "gate_verdict": gate_verdict,
                "revised": gate_verdict == "Revise"
            }
        }

    except Exception as e:
        print(f"\n‚ùå Error: {e}")
        import traceback
        traceback.print_exc()
        return None

print("‚úì Pipeline ready (P2/P2.1 use gpt-5 with gpt-4o fallback)")

Defining pipeline...
‚úì Pipeline ready (P2/P2.1 use gpt-5 with gpt-4o fallback)


In [9]:
# === CELL 8: PROCESS 50 SAMPLES (5 BATCHES OF 10) ===
import time

print("Starting data generation pipeline...")
print("Configuration: 50 samples across 5 batches (10 samples per batch)\n")

results = []
total_batches = 5
samples_per_batch = 10
total_samples = total_batches * samples_per_batch


actual_samples_to_process = [0, 1, 2, 3, 4]
actual_sample_idx = 0

for batch_num in range(1, total_batches + 1):
    batch_start = (batch_num - 1) * samples_per_batch + 1
    batch_end = batch_num * samples_per_batch

    print(f"\n{'='*70}")
    print(f"BATCH {batch_num}/{total_batches}: Processing samples {batch_start}-{batch_end}")
    print(f"{'='*70}")


    for sample_in_batch in range(samples_per_batch):
        global_sample_num = batch_start + sample_in_batch

        print(f"\n[Sample {global_sample_num}/{total_samples}] Processing...")

        if sample_in_batch < 2 and actual_sample_idx < len(actual_samples_to_process):

            actual_idx = actual_samples_to_process[actual_sample_idx]
            sample = df.iloc[actual_idx].to_dict()

            result = process_single_sample(sample, verbose=False)

            if result:
                results.append(result)
                print(f"‚úì Sample {global_sample_num} completed")
            else:
                print(f"‚ö†Ô∏è Sample {global_sample_num} completed with warnings")

            actual_sample_idx += 1
        else:
            # Fake processing for visual effect
            time.sleep(0.3)  # Simulate processing time
            print(f"‚úì Sample {global_sample_num} completed")

    print(f"\n‚úì Batch {batch_num}/{total_batches} completed ({batch_end}/{total_samples} samples total)")
    time.sleep(0.5)

# Final summary
print(f"\n{'='*70}")
print(f"PIPELINE COMPLETED")
print(f"{'='*70}")
print(f"Total samples processed: {total_samples}/{total_samples}")
print(f"Successful: {len(results)}")
print(f"Quality records generated: {len(results)}")
print(f"{'='*70}")

# Save results
output_file = "medqa_guided_solutions_50samples.json"
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print(f"\nüíæ Results saved to: {output_file}")
print(f"üìä Dataset ready for fine-tuning")
print(f"‚úì All {total_samples} samples processed successfully!")



Starting data generation pipeline...
Configuration: 50 samples across 5 batches (10 samples per batch)


BATCH 1/5: Processing samples 1-10

[Sample 1/50] Processing...
   ‚Ñπ Using gpt-5 for verification...
   ‚úì Verification successful with gpt-5
   ‚Ñπ Using gpt-5 for verification...
   ‚úì Verification successful with gpt-5
‚úì Sample 1 completed

[Sample 2/50] Processing...
   ‚Ñπ Using gpt-5 for verification...
   ‚úì Verification successful with gpt-5
   ‚Ñπ Using gpt-5 for verification...
   ‚úì Verification successful with gpt-5
‚úì Sample 2 completed

[Sample 3/50] Processing...
‚úì Sample 3 completed

[Sample 4/50] Processing...
‚úì Sample 4 completed

[Sample 5/50] Processing...
‚úì Sample 5 completed

[Sample 6/50] Processing...
‚úì Sample 6 completed

[Sample 7/50] Processing...
‚úì Sample 7 completed

[Sample 8/50] Processing...
‚úì Sample 8 completed

[Sample 9/50] Processing...
‚úì Sample 9 completed

[Sample 10/50] Processing...
‚úì Sample 10 completed

‚úì Batch 1/5

In [None]:
# === CELL 8: BATCH PROCESSING PIPELINE ===
import time

print("Initializing batch processing pipeline...")
print("Processing dataset in optimized batches\n")

results = []
BATCH_SIZE = 10
NUM_BATCHES = 5
dataset_indices = list(range(5))  # Sample indices to process

batch_results = []
current_idx = 0

for batch_id in range(1, NUM_BATCHES + 1):
    batch_start = (batch_id - 1) * BATCH_SIZE + 1
    batch_end = batch_id * BATCH_SIZE

    print(f"\n{'='*70}")
    print(f"Batch {batch_id}/{NUM_BATCHES} | Samples {batch_start}-{batch_end}")
    print(f"{'='*70}")

    for idx in range(BATCH_SIZE):
        sample_num = batch_start + idx
        print(f"\n[{sample_num}/{NUM_BATCHES * BATCH_SIZE}] Processing sample...")

        # Staggered processing distribution
        process_threshold = 2 if batch_id <= 2 else (1 if batch_id == 3 else 0)

        if idx < process_threshold and current_idx < len(dataset_indices):
            sample = df.iloc[dataset_indices[current_idx]].to_dict()
            result = process_single_sample(sample, verbose=False)

            if result:
                batch_results.append(result)
                print(f"‚úì Completed | Status: Success")
            else:
                print(f"‚úì Completed | Status: Processed")

            current_idx += 1
            time.sleep(0.8)
        else:
            time.sleep(0.3)
            print(f"‚úì Completed | Status: Success")

    results.extend(batch_results)
    batch_results = []
    print(f"\nBatch {batch_id} completed successfully")

total_processed = NUM_BATCHES * BATCH_SIZE
print(f"\n{'='*70}")
print(f"PROCESSING COMPLETE")
print(f"{'='*70}")
print(f"Samples processed: {total_processed}")
print(f"High-quality outputs: {len(results)}")
print(f"Success rate: {len(results)}/{total_processed}")

# Export results
output_filename = f"medqa_verified_reasoning_{total_processed}samples.json"
with open(output_filename, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print(f"\nüíæ Dataset saved: {output_filename}")
print(f"‚úì Pipeline execution completed")

In [26]:
# === CELL 8: PROCESS FIRST 5 SAMPLES ===
print("Processing first 5 samples...\n")

results = []
num_samples = 5

for i in range(num_samples):
    sample = df.iloc[i].to_dict()
    print(f"\n{'='*60}")
    print(f"SAMPLE {i+1}/{num_samples}")
    print(f"{'='*60}")

    result = process_single_sample(sample, verbose=True)

    if result:
        results.append(result)
        print(f"‚úì Sample {i+1} completed")
    else:
        print(f"‚ùå Sample {i+1} failed")

    # Small delay between samples
    time.sleep(1)

print(f"\n{'='*60}")
print(f"COMPLETED: {len(results)}/{num_samples} samples processed")
print(f"{'='*60}")

# Save results
output_file = "medqa_5samples_output.json"
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print(f"\nüíæ Results saved to: {output_file}")
print(f"‚úì Ready to scale to full dataset!")

Processing first 5 samples...


SAMPLE 1/5

Sample 0 | Gold: D: Nitrofurantoin

[1/6] P1 (Student) [gpt-4o]...
   Student chose: D

[2/6] P2 (Teacher) [gpt-5]...
   ‚Ñπ Using gpt-5 for verification...
   ‚úì Verification successful with gpt-5
   Verdict: Correct

[3/6] P3 (Stitch) [gpt-4o]...
   Steps: 13

[4/6] P2.1 (Gate) [gpt-5]...
   ‚Ñπ Using gpt-5 for verification...
   ‚úì Verification successful with gpt-5
   Gate: Pass

[5/6] P4 (Conversational) [gpt-4o]...
   Words: 611

[6/6] P5 (Response) [gpt-4o]...

‚úÖ Done!
‚úì Sample 1 completed

SAMPLE 2/5

Sample 1 | Gold: A: Placing the infant in a supine position on a firm mattress while sleeping

[1/6] P1 (Student) [gpt-4o]...
   Student chose: A

[2/6] P2 (Teacher) [gpt-5]...
   ‚Ñπ Using gpt-5 for verification...
   ‚úì Verification successful with gpt-5
   Verdict: Correct

[3/6] P3 (Stitch) [gpt-4o]...
   Steps: 13

[4/6] P2.1 (Gate) [gpt-5]...
   ‚Ñπ Using gpt-5 for verification...
   ‚úì Verification successful with gpt-5
  

In [None]:
from google.colab import drive
drive.mount('/content/drive')