# Structured vs Free-Form Output: Impact on VLM Reasoning

**Research Question:** Does JSON schema constraint affect not just reasoning verbosity, but also the final classification?

**Hypothesis:** Structured output (JSON schema) leads to:
1. Shorter reasoning chains
2. Potentially different classifications
3. More conservative/less nuanced answers

## Experiment Structure

| # | Sub-Experiment | Purpose |
|---|----------------|--------|
| 1 | Setup & Data Loading | Load existing classifications, setup inference |
| 2 | Single Key Comparison | Compare structured vs free-form for one key |
| 3 | Multi-Key Analysis | Extend to multiple classification keys |
| 4 | Cross-Model Comparison | Test if effect varies by model size |
| 5 | Reasoning Quality Analysis | Analyze reasoning depth and structure |

---
## Sub-Experiment 1: Setup & Data Loading

In [4]:
# Imports and path setup
import sys
import json
import random
import time
from pathlib import Path
from dataclasses import dataclass, field
from typing import Literal, Any

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from ollama import Client

# Project paths
PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT / "tools"))

from scene import KEYS, KEY_PROMPTS, get_schema, KEY_DIFFICULTY
from vlm import load_config

# Config
DATA_DIR = PROJECT_ROOT / "data" / "runs" / "classification_20260121_001102"
RESULTS_DIR = PROJECT_ROOT / "data" / "experiments" / "structured_vs_freeform"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

print(f"Project root: {PROJECT_ROOT}")
print(f"Data dir: {DATA_DIR}")
print(f"Results dir: {RESULTS_DIR}")

Project root: /Users/kaiser/Projects/Masterarbeit
Data dir: /Users/kaiser/Projects/Masterarbeit/data/runs/classification_20260121_001102
Results dir: /Users/kaiser/Projects/Masterarbeit/data/experiments/structured_vs_freeform


In [5]:
# Load existing progress data
with open(DATA_DIR / "progress.json") as f:
    progress = json.load(f)

# Extract clips with stage1 results (needed as context)
partial = progress.get("partial", {})
completed = progress.get("completed", [])

# Combine all clips with stage1
clips_with_stage1 = {}
for cid, data in partial.items():
    if "stage1" in data:
        clips_with_stage1[cid] = data["stage1"]

for result in progress.get("results", []):
    cid = result["clip_id"]
    if "scene_reasoning" in result.get("classification", {}):
        clips_with_stage1[cid] = result["classification"]["scene_reasoning"]

print(f"Clips with stage1 context: {len(clips_with_stage1)}")
print(f"Completed clips: {len(completed)}")
print(f"Partial clips: {len(partial)}")

Clips with stage1 context: 100
Completed clips: 36
Partial clips: 64


In [10]:
# Setup Ollama client
config = load_config()
client = Client(host=config.endpoints["default"].url)

# Test connection
models = client.list()
available_models = [m["model"] for m in models["models"]]
print("Available models:")
for m in available_models:
    if "qwen" in m.lower():
        print(f"  - {m}")

Available models:
  - qwen3-vl:30b
  - qwen3-vl:4b
  - qwen3-embedding:latest
  - qwen3:latest
  - qwen3:14b
  - qwen3-vl:8b


In [11]:
# Core experiment infrastructure

@dataclass
class ExperimentResult:
    """Result of a single structured vs free-form comparison."""
    clip_id: str
    key: str
    model: str
    
    # Structured output
    structured_answer: str
    structured_reasoning: str
    structured_time_ms: float
    
    # Free-form output  
    freeform_answer: str
    freeform_text: str
    freeform_time_ms: float
    
    @property
    def answers_match(self) -> bool:
        return self.structured_answer == self.freeform_answer
    
    @property
    def structured_reasoning_len(self) -> int:
        return len(self.structured_reasoning)
    
    @property
    def freeform_reasoning_len(self) -> int:
        return len(self.freeform_text)
    
    @property
    def reasoning_ratio(self) -> float:
        if self.structured_reasoning_len == 0:
            return float('inf')
        return self.freeform_reasoning_len / self.structured_reasoning_len


def extract_answer_from_freeform(text: str, valid_answers: list[str]) -> str:
    """Extract the final answer from free-form text."""
    text_lower = text.lower()
    
    # Check for explicit answer patterns
    patterns = [
        lambda a: f"**{a}**" in text_lower,
        lambda a: f"answer: {a}" in text_lower,
        lambda a: f"answer is {a}" in text_lower,
        lambda a: f"final answer: {a}" in text_lower,
        lambda a: text_lower.strip().endswith(a),
        lambda a: text_lower.strip().endswith(f"**{a}**"),
    ]
    
    for answer in valid_answers:
        for pattern in patterns:
            if pattern(answer):
                return answer
    
    # Check last few lines
    last_lines = text.strip().split("\n")[-5:]
    for line in reversed(last_lines):
        line_lower = line.lower()
        for answer in valid_answers:
            if answer in line_lower:
                return answer
    
    return "unknown"


def run_comparison(
    client: Client,
    model: str,
    key: str,
    clip_id: str,
    stage1_context: str,
    valid_answers: list[str],
    context_limit: int = 4000,
) -> ExperimentResult:
    """Run structured vs free-form comparison for a single clip/key."""
    
    prompt = KEY_PROMPTS[key]
    schema = get_schema(key)
    context = stage1_context[:context_limit]
    
    # 1. Structured output
    t0 = time.perf_counter()
    r_structured = client.chat(
        model=model,
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": f"SCENE: {context}"},
        ],
        format=schema,
        options={"num_ctx": 32768},
    )
    structured_time = (time.perf_counter() - t0) * 1000
    
    try:
        structured_data = json.loads(r_structured["message"]["content"])
        structured_reasoning = structured_data.get("reasoning", "")
        # Find the answer field (varies by key)
        structured_answer = None
        for field in [key, "category", "level", "type"]:
            if field in structured_data:
                val = structured_data[field]
                structured_answer = val.lower() if isinstance(val, str) else str(val)
                break
        if structured_answer is None:
            # Try to find any field that matches valid answers
            for k, v in structured_data.items():
                if k != "reasoning" and isinstance(v, str) and v.lower() in valid_answers:
                    structured_answer = v.lower()
                    break
        if structured_answer is None:
            structured_answer = "parse_error"
    except json.JSONDecodeError:
        structured_reasoning = r_structured["message"]["content"]
        structured_answer = "json_error"
    
    # 2. Free-form output
    t0 = time.perf_counter()
    r_freeform = client.chat(
        model=model,
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": f"SCENE: {context}\n\nProvide your reasoning step by step, then give your final answer."},
        ],
        options={"num_ctx": 32768},
    )
    freeform_time = (time.perf_counter() - t0) * 1000
    
    freeform_text = r_freeform["message"]["content"]
    freeform_answer = extract_answer_from_freeform(freeform_text, valid_answers)
    
    return ExperimentResult(
        clip_id=clip_id,
        key=key,
        model=model,
        structured_answer=structured_answer,
        structured_reasoning=structured_reasoning,
        structured_time_ms=structured_time,
        freeform_answer=freeform_answer,
        freeform_text=freeform_text,
        freeform_time_ms=freeform_time,
    )


print("Experiment infrastructure ready.")

Experiment infrastructure ready.


---
## Sub-Experiment 2: Single Key Comparison

Test the hypothesis on `required_action` - a safety-critical key where reasoning quality matters.

In [12]:
# Configuration for this sub-experiment
KEY = "required_action"
MODEL = "qwen3-vl:30b"
N_SAMPLES = 10
VALID_ANSWERS = ["none", "slow", "stop", "evade"]

print(f"Key: {KEY}")
print(f"Model: {MODEL}")
print(f"Samples: {N_SAMPLES}")
print(f"Valid answers: {VALID_ANSWERS}")
print(f"\nPrompt:\n{KEY_PROMPTS[KEY]}")

Key: required_action
Model: qwen3-vl:30b
Samples: 10
Valid answers: ['none', 'slow', 'stop', 'evade']

Prompt:
Determine REQUIRED ACTION for this scene.

Based on all visible hazards and traffic situation:
- none: Maintain current speed and course
- slow: Reduce speed, prepare to stop
- stop: Come to complete stop required
- evade: Steering maneuver may be needed

Select the most conservative action warranted by the scene.


In [13]:
# Sample clips and run comparisons
random.seed(42)
clip_ids = list(clips_with_stage1.keys())
sample_clips = random.sample(clip_ids, min(N_SAMPLES, len(clip_ids)))

results_exp2 = []

for i, clip_id in enumerate(sample_clips):
    print(f"[{i+1}/{len(sample_clips)}] {clip_id[:12]}...", end=" ", flush=True)
    
    result = run_comparison(
        client=client,
        model=MODEL,
        key=KEY,
        clip_id=clip_id,
        stage1_context=clips_with_stage1[clip_id],
        valid_answers=VALID_ANSWERS,
    )
    results_exp2.append(result)
    
    match = "✓" if result.answers_match else "✗"
    print(f"{result.structured_answer:6} vs {result.freeform_answer:6} {match}")

print("\nDone.")

[1/10] 0248180b-650... stop   vs stop   ✓
[2/10] 751758b2-2d5... stop   vs none   ✗
[3/10] 75537f4f-ec6... none   vs slow   ✗
[4/10] 994af02f-895... stop   vs stop   ✓
[5/10] e4a24200-d20... slow   vs none   ✗
[6/10] 5bbe668e-ea7... stop   vs none   ✗
[7/10] 0da5e374-0f7... stop   vs slow   ✗
[8/10] d174cda8-311... stop   vs none   ✗
[9/10] 14850e11-6a0... slow   vs slow   ✓
[10/10] 361a388a-108... slow   vs slow   ✓

Done.


In [14]:
# Analyze results
df_exp2 = pd.DataFrame([
    {
        "clip_id": r.clip_id[:12],
        "structured": r.structured_answer,
        "freeform": r.freeform_answer,
        "match": r.answers_match,
        "structured_len": r.structured_reasoning_len,
        "freeform_len": r.freeform_reasoning_len,
        "ratio": r.reasoning_ratio,
        "structured_ms": r.structured_time_ms,
        "freeform_ms": r.freeform_time_ms,
    }
    for r in results_exp2
])

print("="*70)
print(f"RESULTS: {KEY} ({MODEL})")
print("="*70)
print(f"Agreement rate: {df_exp2['match'].mean():.1%}")
print(f"Avg reasoning length - Structured: {df_exp2['structured_len'].mean():.0f} chars")
print(f"Avg reasoning length - Free-form:  {df_exp2['freeform_len'].mean():.0f} chars")
print(f"Avg ratio (freeform/structured):   {df_exp2['ratio'].mean():.1f}x")
print(f"\nAvg time - Structured: {df_exp2['structured_ms'].mean()/1000:.1f}s")
print(f"Avg time - Free-form:  {df_exp2['freeform_ms'].mean()/1000:.1f}s")
print("\n")
display(df_exp2)

RESULTS: required_action (qwen3-vl:30b)
Agreement rate: 40.0%
Avg reasoning length - Structured: 1551 chars
Avg reasoning length - Free-form:  2744 chars
Avg ratio (freeform/structured):   2.4x

Avg time - Structured: 30.9s
Avg time - Free-form:  53.9s




Unnamed: 0,clip_id,structured,freeform,match,structured_len,freeform_len,ratio,structured_ms,freeform_ms
0,0248180b-650,stop,stop,True,884,2789,3.154977,81475.868792,132677.758584
1,751758b2-2d5,stop,none,False,2124,2253,1.060734,28198.234875,76213.431292
2,75537f4f-ec6,none,slow,False,380,2835,7.460526,18852.720083,46088.950375
3,994af02f-895,stop,stop,True,1351,3588,2.655811,23503.28325,109170.265791
4,e4a24200-d20,slow,none,False,2848,2429,0.852879,33630.637208,28088.910042
5,5bbe668e-ea7,stop,none,False,1235,2414,1.954656,19235.273209,25818.797416
6,0da5e374-0f7,stop,slow,False,2253,2652,1.177097,34205.839208,29306.305417
7,d174cda8-311,stop,none,False,983,2299,2.338759,20156.976709,26661.864833
8,14850e11-6a0,slow,slow,True,1750,3335,1.905714,26874.132334,41288.739333
9,361a388a-108,slow,slow,True,1699,2843,1.673337,22532.288875,23614.230083


In [15]:
# Visualization: Agreement and Reasoning Length
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Answer Agreement", "Reasoning Length Comparison"),
    specs=[[{"type": "pie"}, {"type": "bar"}]]
)

# Pie chart: Agreement
match_counts = df_exp2["match"].value_counts()
fig.add_trace(
    go.Pie(
        labels=["Match", "Mismatch"],
        values=[match_counts.get(True, 0), match_counts.get(False, 0)],
        marker_colors=["#2ecc71", "#e74c3c"],
        hole=0.4,
    ),
    row=1, col=1
)

# Bar chart: Reasoning length
fig.add_trace(
    go.Bar(
        x=df_exp2["clip_id"],
        y=df_exp2["structured_len"],
        name="Structured",
        marker_color="#3498db",
    ),
    row=1, col=2
)
fig.add_trace(
    go.Bar(
        x=df_exp2["clip_id"],
        y=df_exp2["freeform_len"],
        name="Free-form",
        marker_color="#9b59b6",
    ),
    row=1, col=2
)

fig.update_layout(
    title=f"Structured vs Free-form: {KEY} ({MODEL})",
    barmode="group",
    height=400,
)
fig.update_xaxes(title_text="Clip ID", row=1, col=2)
fig.update_yaxes(title_text="Chars", row=1, col=2)

fig.show()

In [16]:
# Visualization: Answer Distribution
structured_counts = df_exp2["structured"].value_counts()
freeform_counts = df_exp2["freeform"].value_counts()

all_answers = sorted(set(list(structured_counts.index) + list(freeform_counts.index)))

fig = go.Figure()
fig.add_trace(go.Bar(
    x=all_answers,
    y=[structured_counts.get(a, 0) for a in all_answers],
    name="Structured",
    marker_color="#3498db",
))
fig.add_trace(go.Bar(
    x=all_answers,
    y=[freeform_counts.get(a, 0) for a in all_answers],
    name="Free-form",
    marker_color="#9b59b6",
))

fig.update_layout(
    title=f"Answer Distribution: {KEY}",
    xaxis_title="Answer",
    yaxis_title="Count",
    barmode="group",
    height=400,
)
fig.show()

In [17]:
# Show example of disagreement
disagreements = [r for r in results_exp2 if not r.answers_match]

if disagreements:
    example = disagreements[0]
    print("="*70)
    print(f"EXAMPLE DISAGREEMENT: {example.clip_id[:12]}")
    print("="*70)
    print(f"\n>>> STRUCTURED ({example.structured_answer}):\n")
    print(example.structured_reasoning[:1500])
    print("\n" + "-"*70)
    print(f"\n>>> FREE-FORM ({example.freeform_answer}):\n")
    print(example.freeform_text[:2000])
else:
    print("No disagreements found in this sample.")

EXAMPLE DISAGREEMENT: 751758b2-2d5

>>> STRUCTURED (stop):

Based on the detailed scene description, the most critical hazard is the **red SUV moving under a red traffic light** (observed in both top-left and top-right views). Here's the breakdown:

1.  **Traffic Light Violation**: 
    - Multiple red lights are visible in the distance (top-left and top-right views).
    - The red SUV is **moving away from the intersection** while red lights are active. This is highly anomalous behavior for a vehicle at a red light. In standard traffic control, vehicles must stop at red lights. The SUV's motion suggests one of these scenarios: 
      - It is illegally rolling through the intersection (a serious violation).
      - It is making an unauthorized turn (e.g., turning left against the red light), which would place it directly in the path of oncoming traffic or the autonomous vehicle.

2.  **Potential Collision Risk**: 
    - The SUV is moving at night, and visibility is reduced. If it is tur

---
## Sub-Experiment 3: Multi-Key Analysis

Extend the comparison to multiple keys to see if the effect varies by task type.

In [18]:
# Define keys to test with their valid answers
KEYS_TO_TEST = {
    "required_action": ["none", "slow", "stop", "evade"],
    "safety_criticality": ["tier1_catastrophic", "tier2_severe", "tier3_moderate", "tier4_minor"],
    "weather": ["clear", "cloudy", "rainy", "foggy", "snowy"],
    "occlusion_level": ["none", "minimal", "partial", "severe"],
    "road_type": ["highway", "urban_street", "residential", "intersection", "parking_lot", "construction_zone", "rural"],
}

MODEL = "qwen3-vl:30b"
N_SAMPLES_PER_KEY = 5

print(f"Testing {len(KEYS_TO_TEST)} keys with {N_SAMPLES_PER_KEY} samples each")
print(f"Total comparisons: {len(KEYS_TO_TEST) * N_SAMPLES_PER_KEY}")

Testing 5 keys with 5 samples each
Total comparisons: 25


In [19]:
# Run multi-key experiment
random.seed(42)
sample_clips = random.sample(list(clips_with_stage1.keys()), min(N_SAMPLES_PER_KEY, len(clips_with_stage1)))

results_exp3 = []

for key, valid_answers in KEYS_TO_TEST.items():
    print(f"\n{'='*70}")
    print(f"KEY: {key}")
    print(f"{'='*70}")
    
    for i, clip_id in enumerate(sample_clips):
        print(f"  [{i+1}/{len(sample_clips)}] {clip_id[:12]}...", end=" ", flush=True)
        
        try:
            result = run_comparison(
                client=client,
                model=MODEL,
                key=key,
                clip_id=clip_id,
                stage1_context=clips_with_stage1[clip_id],
                valid_answers=valid_answers,
            )
            results_exp3.append(result)
            
            match = "✓" if result.answers_match else "✗"
            print(f"{result.structured_answer[:15]:15} vs {result.freeform_answer[:15]:15} {match}")
        except Exception as e:
            print(f"Error: {e}")

print("\n\nDone.")


KEY: required_action
  [1/5] 0248180b-650... slow            vs none            ✗
  [2/5] 751758b2-2d5... stop            vs stop            ✓
  [3/5] 75537f4f-ec6... none            vs none            ✓
  [4/5] 994af02f-895... stop            vs stop            ✓
  [5/5] e4a24200-d20... slow            vs none            ✗

KEY: safety_criticality
  [1/5] 0248180b-650... tier2_severe    vs tier2_severe    ✓
  [2/5] 751758b2-2d5... tier1_catastrop vs tier2_severe    ✗
  [3/5] 75537f4f-ec6... tier4_minor     vs tier1_catastrop ✗
  [4/5] 994af02f-895... tier1_catastrop vs tier1_catastrop ✓
  [5/5] e4a24200-d20... tier2_severe    vs tier3_moderate  ✗

KEY: weather
  [1/5] 0248180b-650... cloudy          vs clear           ✗
  [2/5] 751758b2-2d5... clear           vs clear           ✓
  [3/5] 75537f4f-ec6... clear           vs clear           ✓
  [4/5] 994af02f-895... clear           vs clear           ✓
  [5/5] e4a24200-d20... clear           vs clear           ✓

KEY: occlusion_level
  

In [20]:
# Analyze multi-key results
df_exp3 = pd.DataFrame([
    {
        "key": r.key,
        "clip_id": r.clip_id[:12],
        "structured": r.structured_answer,
        "freeform": r.freeform_answer,
        "match": r.answers_match,
        "structured_len": r.structured_reasoning_len,
        "freeform_len": r.freeform_reasoning_len,
        "ratio": r.reasoning_ratio,
        "difficulty": KEY_DIFFICULTY.get(r.key, "unknown"),
    }
    for r in results_exp3
])

# Summary by key
summary = df_exp3.groupby("key").agg({
    "match": "mean",
    "structured_len": "mean",
    "freeform_len": "mean",
    "ratio": "mean",
}).round(2)
summary.columns = ["Agreement", "Structured Len", "Freeform Len", "Ratio"]
summary = summary.sort_values("Agreement")

print("SUMMARY BY KEY:")
display(summary)

SUMMARY BY KEY:


Unnamed: 0_level_0,Agreement,Structured Len,Freeform Len,Ratio
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
road_type,0.4,1373.0,3315.4,2.72
safety_criticality,0.4,1396.8,3797.2,3.39
occlusion_level,0.6,2023.4,3330.0,1.82
required_action,0.6,2078.2,4075.2,2.05
weather,0.8,1987.2,2328.4,1.24


In [21]:
# Visualization: Agreement by Key
fig = px.bar(
    summary.reset_index(),
    x="key",
    y="Agreement",
    color="Agreement",
    color_continuous_scale=["#e74c3c", "#f1c40f", "#2ecc71"],
    title="Agreement Rate by Classification Key",
    labels={"Agreement": "Agreement Rate", "key": "Key"},
)
fig.add_hline(y=0.5, line_dash="dash", line_color="gray", annotation_text="50%")
fig.update_layout(height=400)
fig.show()

In [22]:
# Visualization: Reasoning Length Ratio by Key
fig = px.bar(
    summary.reset_index(),
    x="key",
    y="Ratio",
    color="Ratio",
    color_continuous_scale="Blues",
    title="Reasoning Length Ratio (Free-form / Structured)",
    labels={"Ratio": "Ratio", "key": "Key"},
)
fig.add_hline(y=1.0, line_dash="dash", line_color="gray", annotation_text="Equal length")
fig.update_layout(height=400)
fig.show()

In [23]:
# Scatter: Agreement vs Reasoning Ratio
fig = px.scatter(
    summary.reset_index(),
    x="Ratio",
    y="Agreement",
    text="key",
    size="Freeform Len",
    color="Agreement",
    color_continuous_scale=["#e74c3c", "#2ecc71"],
    title="Agreement vs Reasoning Length Ratio",
)
fig.update_traces(textposition="top center")
fig.update_layout(height=500)
fig.show()

---
## Sub-Experiment 4: Cross-Model Comparison

Does the structured vs free-form effect vary by model size?

In [24]:
# Test across model sizes
MODELS_TO_TEST = ["qwen3-vl:4b", "qwen3-vl:8b", "qwen3-vl:30b"]
KEY = "required_action"
VALID_ANSWERS = ["none", "slow", "stop", "evade"]
N_SAMPLES = 5

random.seed(42)
sample_clips = random.sample(list(clips_with_stage1.keys()), min(N_SAMPLES, len(clips_with_stage1)))

print(f"Testing {len(MODELS_TO_TEST)} models on {KEY}")
print(f"Samples: {N_SAMPLES}")

Testing 3 models on required_action
Samples: 5


In [25]:
# Run cross-model experiment
results_exp4 = []

for model in MODELS_TO_TEST:
    print(f"\n{'='*70}")
    print(f"MODEL: {model}")
    print(f"{'='*70}")
    
    for i, clip_id in enumerate(sample_clips):
        print(f"  [{i+1}/{len(sample_clips)}] {clip_id[:12]}...", end=" ", flush=True)
        
        try:
            result = run_comparison(
                client=client,
                model=model,
                key=KEY,
                clip_id=clip_id,
                stage1_context=clips_with_stage1[clip_id],
                valid_answers=VALID_ANSWERS,
            )
            results_exp4.append(result)
            
            match = "✓" if result.answers_match else "✗"
            print(f"{result.structured_answer:6} vs {result.freeform_answer:6} {match}")
        except Exception as e:
            print(f"Error: {e}")

print("\n\nDone.")


MODEL: qwen3-vl:4b
  [1/5] 0248180b-650... stop   vs stop   ✓
  [2/5] 751758b2-2d5... stop   vs stop   ✓
  [3/5] 75537f4f-ec6... slow   vs slow   ✓
  [4/5] 994af02f-895... stop   vs stop   ✓
  [5/5] e4a24200-d20... slow   vs slow   ✓

MODEL: qwen3-vl:8b
  [1/5] 0248180b-650... stop   vs stop   ✓
  [2/5] 751758b2-2d5... stop   vs stop   ✓
  [3/5] 75537f4f-ec6... none   vs none   ✓
  [4/5] 994af02f-895... stop   vs stop   ✓
  [5/5] e4a24200-d20... slow   vs slow   ✓

MODEL: qwen3-vl:30b
  [1/5] 0248180b-650... stop   vs none   ✗
  [2/5] 751758b2-2d5... stop   vs stop   ✓
  [3/5] 75537f4f-ec6... none   vs none   ✓
  [4/5] 994af02f-895... stop   vs stop   ✓
  [5/5] e4a24200-d20... none   vs none   ✓


Done.


In [26]:
# Analyze cross-model results
df_exp4 = pd.DataFrame([
    {
        "model": r.model,
        "model_size": r.model.split(":")[-1],
        "clip_id": r.clip_id[:12],
        "structured": r.structured_answer,
        "freeform": r.freeform_answer,
        "match": r.answers_match,
        "structured_len": r.structured_reasoning_len,
        "freeform_len": r.freeform_reasoning_len,
        "ratio": r.reasoning_ratio,
        "structured_ms": r.structured_time_ms,
        "freeform_ms": r.freeform_time_ms,
    }
    for r in results_exp4
])

# Summary by model
summary_model = df_exp4.groupby("model_size").agg({
    "match": "mean",
    "structured_len": "mean",
    "freeform_len": "mean",
    "ratio": "mean",
    "structured_ms": "mean",
    "freeform_ms": "mean",
}).round(2)
summary_model.columns = ["Agreement", "Struct Len", "Free Len", "Ratio", "Struct ms", "Free ms"]

print("SUMMARY BY MODEL SIZE:")
display(summary_model)

SUMMARY BY MODEL SIZE:


Unnamed: 0_level_0,Agreement,Struct Len,Free Len,Ratio,Struct ms,Free ms
model_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
30b,0.8,1437.6,2576.6,2.34,46898.75,83499.24
4b,1.0,583.0,3323.6,5.76,34072.26,47391.68
8b,1.0,500.0,3276.8,6.64,104895.89,166475.23


In [27]:
# Visualization: Agreement by Model Size
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Agreement Rate", "Reasoning Length Ratio"),
)

model_order = ["4b", "8b", "30b"]
colors = {"4b": "#3498db", "8b": "#9b59b6", "30b": "#e74c3c"}

for model in model_order:
    if model in summary_model.index:
        fig.add_trace(
            go.Bar(
                x=[model],
                y=[summary_model.loc[model, "Agreement"]],
                name=model,
                marker_color=colors[model],
                showlegend=True,
            ),
            row=1, col=1
        )
        fig.add_trace(
            go.Bar(
                x=[model],
                y=[summary_model.loc[model, "Ratio"]],
                name=model,
                marker_color=colors[model],
                showlegend=False,
            ),
            row=1, col=2
        )

fig.add_hline(y=0.5, line_dash="dash", line_color="gray", row=1, col=1)
fig.add_hline(y=1.0, line_dash="dash", line_color="gray", row=1, col=2)

fig.update_layout(
    title=f"Structured vs Free-form Effect by Model Size ({KEY})",
    height=400,
)
fig.update_yaxes(title_text="Agreement Rate", row=1, col=1)
fig.update_yaxes(title_text="Ratio (Free/Struct)", row=1, col=2)

fig.show()

---
## Sub-Experiment 5: Reasoning Quality Analysis

Qualitative analysis of reasoning structure and depth.

In [28]:
def analyze_reasoning_structure(text: str) -> dict:
    """Analyze the structure of reasoning text."""
    lines = text.strip().split("\n")
    
    return {
        "total_chars": len(text),
        "total_lines": len(lines),
        "avg_line_len": len(text) / max(len(lines), 1),
        "bullet_points": sum(1 for l in lines if l.strip().startswith(("-", "*", "•"))),
        "numbered_points": sum(1 for l in lines if l.strip()[:2].replace(".", "").isdigit()),
        "headers": sum(1 for l in lines if l.strip().startswith("#") or l.strip().startswith("**")),
        "has_conclusion": any(kw in text.lower() for kw in ["therefore", "conclusion", "final answer", "thus"]),
        "mentions_evidence": any(kw in text.lower() for kw in ["because", "since", "evidence", "indicates", "shows"]),
    }


# Analyze all results from exp2
analysis_data = []

for r in results_exp2:
    struct_analysis = analyze_reasoning_structure(r.structured_reasoning)
    free_analysis = analyze_reasoning_structure(r.freeform_text)
    
    analysis_data.append({
        "clip_id": r.clip_id[:12],
        "match": r.answers_match,
        "struct_chars": struct_analysis["total_chars"],
        "free_chars": free_analysis["total_chars"],
        "struct_bullets": struct_analysis["bullet_points"] + struct_analysis["numbered_points"],
        "free_bullets": free_analysis["bullet_points"] + free_analysis["numbered_points"],
        "struct_headers": struct_analysis["headers"],
        "free_headers": free_analysis["headers"],
        "struct_has_conclusion": struct_analysis["has_conclusion"],
        "free_has_conclusion": free_analysis["has_conclusion"],
        "struct_has_evidence": struct_analysis["mentions_evidence"],
        "free_has_evidence": free_analysis["mentions_evidence"],
    })

df_analysis = pd.DataFrame(analysis_data)
display(df_analysis)

Unnamed: 0,clip_id,match,struct_chars,free_chars,struct_bullets,free_bullets,struct_headers,free_headers,struct_has_conclusion,free_has_conclusion,struct_has_evidence,free_has_evidence
0,0248180b-650,True,884,2789,0,18,0,4,False,True,False,True
1,751758b2-2d5,False,2124,2253,18,20,1,3,False,True,True,False
2,75537f4f-ec6,False,380,2835,0,21,0,4,False,True,True,True
3,994af02f-895,True,1351,3588,15,28,3,7,False,True,True,False
4,e4a24200-d20,False,2848,2429,36,20,2,2,False,True,True,True
5,5bbe668e-ea7,False,1235,2414,9,19,0,3,False,True,True,True
6,0da5e374-0f7,False,2253,2652,17,20,4,3,True,True,False,False
7,d174cda8-311,False,983,2299,13,27,0,4,False,False,True,False
8,14850e11-6a0,True,1750,3335,16,28,0,3,False,True,True,True
9,361a388a-108,True,1699,2843,23,19,0,3,False,True,False,True


In [29]:
# Visualization: Reasoning Structure Comparison
metrics = ["chars", "bullets", "headers"]
fig = make_subplots(
    rows=1, cols=3,
    subplot_titles=[f"{m.title()}" for m in metrics],
)

for i, metric in enumerate(metrics, 1):
    fig.add_trace(
        go.Box(
            y=df_analysis[f"struct_{metric}"],
            name="Structured",
            marker_color="#3498db",
            boxmean=True,
        ),
        row=1, col=i
    )
    fig.add_trace(
        go.Box(
            y=df_analysis[f"free_{metric}"],
            name="Free-form",
            marker_color="#9b59b6",
            boxmean=True,
        ),
        row=1, col=i
    )

fig.update_layout(
    title="Reasoning Structure: Structured vs Free-form",
    height=400,
    showlegend=True,
)
fig.show()

In [30]:
# Summary statistics
print("REASONING QUALITY SUMMARY")
print("="*50)
print(f"\nConclusion present:")
print(f"  Structured: {df_analysis['struct_has_conclusion'].mean():.0%}")
print(f"  Free-form:  {df_analysis['free_has_conclusion'].mean():.0%}")

print(f"\nEvidence cited:")
print(f"  Structured: {df_analysis['struct_has_evidence'].mean():.0%}")
print(f"  Free-form:  {df_analysis['free_has_evidence'].mean():.0%}")

print(f"\nAvg bullet/numbered points:")
print(f"  Structured: {df_analysis['struct_bullets'].mean():.1f}")
print(f"  Free-form:  {df_analysis['free_bullets'].mean():.1f}")

REASONING QUALITY SUMMARY

Conclusion present:
  Structured: 10%
  Free-form:  90%

Evidence cited:
  Structured: 70%
  Free-form:  60%

Avg bullet/numbered points:
  Structured: 14.7
  Free-form:  22.0


---
## Save Results

In [31]:
# Save all experiment results
from datetime import datetime

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Save DataFrames
df_exp2.to_csv(RESULTS_DIR / f"exp2_single_key_{timestamp}.csv", index=False)
df_exp3.to_csv(RESULTS_DIR / f"exp3_multi_key_{timestamp}.csv", index=False)
df_exp4.to_csv(RESULTS_DIR / f"exp4_cross_model_{timestamp}.csv", index=False)
df_analysis.to_csv(RESULTS_DIR / f"exp5_reasoning_analysis_{timestamp}.csv", index=False)

# Save raw results as JSON for reproducibility
def result_to_dict(r: ExperimentResult) -> dict:
    return {
        "clip_id": r.clip_id,
        "key": r.key,
        "model": r.model,
        "structured_answer": r.structured_answer,
        "structured_reasoning": r.structured_reasoning,
        "structured_time_ms": r.structured_time_ms,
        "freeform_answer": r.freeform_answer,
        "freeform_text": r.freeform_text,
        "freeform_time_ms": r.freeform_time_ms,
    }

all_results = {
    "exp2": [result_to_dict(r) for r in results_exp2],
    "exp3": [result_to_dict(r) for r in results_exp3],
    "exp4": [result_to_dict(r) for r in results_exp4],
}

with open(RESULTS_DIR / f"all_results_{timestamp}.json", "w") as f:
    json.dump(all_results, f, indent=2)

print(f"Results saved to {RESULTS_DIR}")
print(f"Timestamp: {timestamp}")

Results saved to /Users/kaiser/Projects/Masterarbeit/data/experiments/structured_vs_freeform
Timestamp: 20260122_030003
