# Deep Analysis: Temporal Reasoning Failures in Language Models

Something interesting is happening with temporal reasoning across different models. Llama-4 17B gets 0% accuracy on temporal window detection - it never stops even when the window has closed. That's not noise, that's systematic.

The question is why. Is it architecture? Training data? Scale? Let's run experiments to find out.

## Strategy

1. **Load and analyze existing experimental results** - understand the failure patterns
2. **Test explicit prompting** - can we force models to recognize temporal constraints?
3. **Analyze response patterns qualitatively** - what are models actually saying?
4. **Test architectural hypotheses** - does attention to temporal markers differ?
5. **Probe for temporal understanding** - can models reason about time when asked directly?

The goal is empirical characterization of what's going wrong.

In [None]:
#pip install transformers bitsandbytes

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
from scipy import stats
import json
from collections import Counter, defaultdict
import re

# Optional heavy dependencies for the live model-testing cells.
# TRANSFORMERS_AVAILABLE gates every cell that loads a model.
try:
    from transformers import AutoTokenizer, AutoModelForCausalLM
    import torch
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    print("Transformers not available - will skip model testing sections")
else:
    TRANSFORMERS_AVAILABLE = True
    # bitsandbytes is imported separately so that its absence does not switch
    # off model testing: the visible cells load models in float16 and never
    # request quantized weights.
    try:
        import bitsandbytes
    except (ImportError, RuntimeError) as exc:
        print(f"bitsandbytes not available ({exc}) - continuing without it")




In [None]:
# Paper-style matplotlib defaults: serif text, compact font sizes, thin lines.
# Everything is applied through a single rcParams.update call.

# Font sizes (points) referenced by the entries below
SMALL_SIZE = 9
MEDIUM_SIZE = 10
BIGGER_SIZE = 12

plt.rcParams.update({
    # Text rendering - set 'text.usetex' to True if you have LaTeX installed
    'text.usetex': False,
    'font.family': 'serif',
    'font.serif': ['Times New Roman', 'DejaVu Serif'],
    'mathtext.fontset': 'stix',        # math fonts that match Times

    # Font sizes
    'font.size': MEDIUM_SIZE,          # default text
    'axes.titlesize': BIGGER_SIZE,     # axes title
    'axes.labelsize': MEDIUM_SIZE,     # x and y labels
    'xtick.labelsize': SMALL_SIZE,     # tick labels
    'ytick.labelsize': SMALL_SIZE,     # tick labels
    'legend.fontsize': SMALL_SIZE,     # legend
    'figure.titlesize': BIGGER_SIZE,   # figure title

    # Line widths and marker sizes
    'lines.linewidth': 1.5,
    'lines.markersize': 6,
    'axes.linewidth': 0.8,
    'grid.linewidth': 0.5,

    # Figure size - single column is typically 3.5 inches, double column 7
    'figure.figsize': (6, 4),
    'figure.dpi': 150,
})

# Colors - colorblind-friendly palette (the hex values of seaborn's
# "colorblind" set). Each name now maps to the hue it actually describes:
# previously the labels were shifted against the values (e.g. 'gray' was a
# sky blue and 'pink' was a yellow) and the palette's red and pink were missing.
# All previously defined keys are kept; 'yellow' and 'cyan' are additions.
colors = {
    'blue': '#0173B2',
    'orange': '#DE8F05',
    'green': '#029E73',
    'red': '#D55E00',
    'purple': '#CC78BC',
    'brown': '#CA9161',
    'pink': '#FBAFE4',
    'gray': '#949494',
    'yellow': '#ECE133',
    'cyan': '#56B4E9',
}

# Axes, grid and legend appearance, applied in one rcParams.update call
plt.rcParams.update({
    # Light grid on a white canvas
    'axes.grid': True,
    'grid.alpha': 0.3,
    'axes.facecolor': 'white',
    'figure.facecolor': 'white',

    # Hide the top and right spines for a cleaner look
    'axes.spines.top': False,
    'axes.spines.right': False,

    # Boxed, slightly transparent legend with a pale border
    'legend.frameon': True,
    'legend.framealpha': 0.9,
    'legend.fancybox': False,
    'legend.edgecolor': '0.8',
})

# Keyword arguments for plt.savefig when exporting high-quality figures
# (PDF for papers, PNG for presentations)
save_params = dict(dpi=300, bbox_inches='tight', pad_inches=0.05, format='pdf')

for message in (
    "Academic plotting style configured.",
    "Use plt.savefig('figure.pdf', **save_params) for papers.",
):
    print(message)

## Part 1: Load and Analyze Existing Results

First, let's understand what we already know from the experiments.

In [None]:
# Read the result tables of both experiments, then print a short inventory
# (row count and the distinct values of the grouping columns) for each.
exp1_results, exp2_results = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/First experiment1_urgency_results.csv',
        '/content/First experiment2_window_results.csv',
    )
)

inventory = [
    "Experiment 1: Urgency Prioritization",
    f"Total scenarios: {len(exp1_results)}",
    f"Models tested: {exp1_results['model'].unique()}",
    f"Urgency types: {exp1_results['urgency_type'].unique()}",
    "",
    "Experiment 2: Window Detection",
    f"Total scenarios: {len(exp2_results)}",
    f"Models tested: {exp2_results['model'].unique()}",
]
print("\n".join(inventory))

### Pattern 1: Scale Doesn't Explain Performance

Usually when you scale up models, performance improves. That's the whole story of deep learning - make it bigger, train on more data, get better results. But look at this:

In [None]:
# Compare urgency accuracy (Experiment 1) against model size.
# Sizes are in billions of parameters, keyed by the model names used in the
# results table.
model_sizes = {
    'Qwen 2.5 7B': 7,
    'DeepSeek-R1-Distill 7B': 7,
    # Mixture-of-experts model: ~12B active parameters (~52B total). The
    # active count is used here; this was previously entered as 7.
    'Jamba 1.5 Mini': 12,
    'RWKV-6 3B': 3,
    'Mamba-2.8B': 2.8
}

# Mean accuracy per (model, urgency type), with the size attached
urgency_perf = exp1_results.groupby(['model', 'urgency_type'])['correct'].mean().reset_index()
urgency_perf['size_b'] = urgency_perf['model'].map(model_sizes)

# Models missing from model_sizes get a NaN size and cannot be plotted;
# say so instead of dropping them silently.
unsized = sorted(urgency_perf.loc[urgency_perf['size_b'].isna(), 'model'].unique())
if unsized:
    print(f"Warning: no size recorded for {unsized}; omitted from the scatter plots.")

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

explicit = urgency_perf[urgency_perf['urgency_type'] == 'explicit']
implicit = urgency_perf[urgency_perf['urgency_type'] == 'implicit']

# One panel per urgency type; color=None keeps matplotlib's default color
panels = [
    (axes[0], explicit, 'Explicit', None),
    (axes[1], implicit, 'Implicit', 'orange'),
]
for ax, subset, label, color in panels:
    plotted = subset.dropna(subset=['size_b'])
    ax.scatter(plotted['size_b'], plotted['correct'], s=100, alpha=0.6, color=color)
    for _, row in plotted.iterrows():
        # Label each point with the first word of the model name
        ax.annotate(row['model'].split()[0], (row['size_b'], row['correct']),
                    xytext=(5, 5), textcoords='offset points', fontsize=8)
    ax.set_xlabel('Model Size (Billions of Parameters)')
    ax.set_ylabel(f'Accuracy on {label} Urgency')
    ax.set_title(f'Scale vs Performance: {label} Urgency')
    ax.axhline(0.25, color='red', linestyle='--', alpha=0.3, label='Random (25%)')
    ax.legend()

plt.tight_layout()
plt.savefig('/content/scale_vs_performance.png', dpi=300, bbox_inches='tight')
plt.show()

print("Key observation: Qwen 2.5 7B crushes everything despite not being the largest.")
print("This tells us it's not about scale - it's about what was learned during training.")

### Pattern 2: The 100% False Positive Problem

Some models NEVER stop. They score 0% on window detection, which corresponds to a 100% false positive rate: they keep recommending action even after the window has closed. That's the most striking finding.

In [None]:
# Per-model accuracy on window detection (Experiment 2), then a bar chart.
window_perf = exp2_results.groupby('model').agg({
    'correct': ['mean', 'sum', 'count']
}).round(4)

window_perf.columns = ['accuracy', 'correct_count', 'total']
# NOTE(review): this is the overall error rate (1 - accuracy). It is a true
# false positive rate only if every scenario has a closed window - confirm
# against the scenario set. Column name kept for compatibility.
window_perf['false_positive_rate'] = 1 - window_perf['accuracy']
window_perf = window_perf.reset_index()

print("Window Detection Performance:")
print(window_perf.to_string(index=False))
print()

# Visualize the bimodal distribution
fig, ax = plt.subplots(figsize=(10, 6))

models = window_perf['model'].values
accuracies = window_perf['accuracy'].values
# Named bar_colors so the palette dict `colors` from the plotting-config
# cell is not overwritten by this list.
bar_colors = ['green' if acc > 0.9 else 'red' if acc < 0.1 else 'orange' for acc in accuracies]

bars = ax.barh(models, accuracies, color=bar_colors, alpha=0.7)
ax.set_xlabel('Accuracy on Window Detection', fontsize=12)
ax.set_title('Bimodal Distribution: Models Either Get It or They Don\'t', fontsize=14, fontweight='bold')
ax.axvline(0.5, color='black', linestyle='--', alpha=0.3, label='50% baseline')

# Add value labels just right of each bar
for i, acc in enumerate(accuracies):
    ax.text(acc + 0.02, i, f'{acc:.1%}', va='center', fontsize=10)

ax.set_xlim([0, 1.1])
ax.legend()
plt.tight_layout()
plt.savefig('/content/window_detection_bimodal.png', dpi=300, bbox_inches='tight')
plt.show()

# NOTE: the summary below is hard-coded text, not derived from window_perf;
# re-check it against the table printed above whenever the data changes.
print("\nCritical Observation:")
print("Models fall into three categories:")
print("1. Perfect (100%): Qwen, DeepSeek - they GET temporal windows")
print("2. Complete failure (0%): Jamba, RWKV - they NEVER stop")
print("3. Partial (45%): Mamba - it's learning something but not reliably")
print("\nThis suggests discrete differences in training, not continuous scaling effects.")

### Pattern 3: Qualitative Analysis of Failure Modes

Let's look at what models actually say when they fail.

In [None]:
# Print sample responses from the models that fail and the models that pass
# window detection, to compare what they actually say.
failing_models = ['Jamba 1.5 Mini', 'RWKV-6 3B']
successful_models = ['Qwen 2.5 7B', 'DeepSeek-R1-Distill 7B']


def _preview(text, limit=200):
    """Return text cut to `limit` characters; '...' is added only if it was cut."""
    text = str(text)
    return text if len(text) <= limit else text[:limit] + "..."


def _print_samples(model_names, want_correct, show_expected, per_model=3):
    """Print up to `per_model` Experiment 2 rows per model whose 'correct' flag equals `want_correct`."""
    for model_name in model_names:
        model_rows = exp2_results[exp2_results['model'] == model_name]
        samples = model_rows[model_rows['correct'] == want_correct].head(per_model)

        print(f"\n{model_name}:")
        for _, row in samples.iterrows():
            print(f"  Scenario: {row['scenario_id']}")
            print(f"  Model choice: {row['model_choice']}")
            if show_expected:
                print(f"  Should have chosen: {row['correct_answer']}")
            if pd.notna(row['response']):
                print(f"  Response: {_preview(row['response'])}")
            print()


print("="*70)
print("QUALITATIVE RESPONSE ANALYSIS")
print("="*70)

# Sample incorrect responses from failing models
print("\nFAILING MODELS (0% accuracy) - Sample incorrect responses:")
print("-"*70)
_print_samples(failing_models, want_correct=False, show_expected=True)

print("\n" + "="*70)
print("SUCCESSFUL MODELS (100% accuracy) - Sample correct responses:")
print("-"*70)
_print_samples(successful_models, want_correct=True, show_expected=False)

### Pattern 4: Temporal Vocabulary Analysis

Do successful models use different temporal language than failing ones?

In [None]:
# Words and phrases treated as temporal markers when scanning responses
temporal_markers = [
    'too late', 'window', 'closed', 'passed', 'expired', 'deadline',
    'urgent', 'immediately', 'now', 'quickly', 'rapid', 'time',
    'already', 'no longer', 'cannot', 'impossible', 'missed'
]

def count_temporal_markers(text, markers=None):
    """Count whole-word, case-insensitive occurrences of temporal markers in a response.

    Args:
        text: The response to scan. Missing values (None / NaN) yield an empty
            result; any other non-string value is converted with str() first
            instead of raising.
        markers: Optional iterable of marker words/phrases. Defaults to the
            module-level ``temporal_markers`` list.

    Returns:
        dict mapping each marker that occurs at least once to its number of
        occurrences. Markers that never occur are left out.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return {}
        text = str(text)

    text_lower = text.lower()
    active_markers = temporal_markers if markers is None else markers

    counts = {}
    for marker in active_markers:
        # \b on both sides so that e.g. 'now' does not match inside 'nowhere'
        hits = len(re.findall(r'\b' + re.escape(marker.lower()) + r'\b', text_lower))
        if hits:
            counts[marker] = hits
    return counts

# Tag every Experiment 2 response with the temporal markers it contains.
exp2_results['temporal_markers'] = exp2_results['response'].apply(count_temporal_markers)
# len() of the per-response dict = number of DISTINCT markers present
# (repeated uses of the same marker count once).
exp2_results['num_temporal_markers'] = exp2_results['temporal_markers'].apply(len)

# Compare successful vs failing models
marker_analysis = exp2_results.groupby('model').agg({
    'num_temporal_markers': 'mean',
    'correct': 'mean'
}).round(3)
marker_analysis = marker_analysis.sort_values('correct', ascending=False)

print("Temporal Marker Usage vs Accuracy:")
print(marker_analysis)
print()


def _tally_markers(rows):
    """Sum the per-response marker counts of `rows` into one Counter."""
    tally = Counter()
    for found in rows['temporal_markers']:
        tally.update(found)
    return tally


# What specific markers do successful models use?
print("Specific markers used by successful models (Qwen, DeepSeek):")
successful_responses = exp2_results[exp2_results['model'].isin(successful_models)]
all_markers_successful = _tally_markers(successful_responses)
print(all_markers_successful.most_common(10))
print()

print("Specific markers used by failing models (Jamba, RWKV):")
failing_responses = exp2_results[exp2_results['model'].isin(failing_models)]
all_markers_failing = _tally_markers(failing_responses)
print(all_markers_failing.most_common(10))
print()

# Report the measured correlation in the title rather than asserting one.
marker_r = marker_analysis['num_temporal_markers'].corr(marker_analysis['correct'])
r_text = f"r = {marker_r:.2f}" if pd.notna(marker_r) else "r undefined"

# Visualization
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(marker_analysis['num_temporal_markers'], marker_analysis['correct'], s=100, alpha=0.6)
for model_name, row in marker_analysis.iterrows():
    ax.annotate(model_name.split()[0], (row['num_temporal_markers'], row['correct']),
                xytext=(5, 5), textcoords='offset points', fontsize=9)
ax.set_xlabel('Average # of Distinct Temporal Markers per Response')
ax.set_ylabel('Window Detection Accuracy')
ax.set_title(f'Temporal Marker Usage vs Window Detection Accuracy ({r_text}, n = {len(marker_analysis)} models)')
plt.tight_layout()
plt.savefig('/content/temporal_markers_correlation.png', dpi=300, bbox_inches='tight')
plt.show()

## Part 2: Test Explicit Prompting Interventions

Can we fix the failures with better prompting? If explicit instructions help, the model CAN reason about temporal constraints but didn't learn to apply them. If they don't help, it's more fundamental.

In [None]:
# Four prompts for the same building-fire scenario: window still open vs.
# already closed, each with and without a spelled-out temporal calculation.
# All four share one template, so only the differing parts are listed.
_FIRE_SITUATION = (
    "Situation: Building fire reported at 14:30. Fire department must arrive "
    "within 20 minutes of report to prevent structural collapse."
)


def _fire_prompt(now, eta_status, option_c, calc_steps=()):
    """Assemble one scenario prompt from its variable parts."""
    lines = [
        f"You are managing emergency response. Current time: {now}",
        "",
        _FIRE_SITUATION,
        "",
        f"Current time is {now}. The fire department is still {eta_status}.",
        "",
    ]
    if calc_steps:
        lines += ["CRITICAL: Before answering, calculate:", *calc_steps, ""]
    lines += [
        "Should you:",
        "A) Wait for fire department to handle it",
        "B) Evacuate adjacent buildings immediately",
        f"C) {option_c}",
        "D) Deploy aerial water support",
        "",
        "Answer with just the letter.",
    ]
    return "\n".join(lines)


_OPEN_C = "It's too late, building will collapse before arrival"
_CLOSED_C = "It's too late to prevent collapse, focus on evacuation"

test_scenarios = [
    {
        'id': 'baseline',
        'prompt': _fire_prompt("14:45", "en route", _OPEN_C),
        'correct_answer': 'A',
        'description': 'Baseline - no explicit temporal reasoning prompt'
    },
    {
        'id': 'explicit_check',
        'prompt': _fire_prompt("14:45", "en route", _OPEN_C, calc_steps=(
            "1. Time elapsed since report: 14:45 - 14:30 = 15 minutes",
            "2. Time remaining until deadline: 20 - 15 = 5 minutes",
            "3. Is the temporal window still open? YES/NO",
        )),
        'correct_answer': 'A',
        'description': 'Explicit temporal calculation required'
    },
    {
        'id': 'too_late_baseline',
        'prompt': _fire_prompt("14:55", "10 minutes away", _CLOSED_C),
        'correct_answer': 'C',
        'description': 'Too late - baseline'
    },
    {
        'id': 'too_late_explicit',
        'prompt': _fire_prompt("14:55", "10 minutes away", _CLOSED_C, calc_steps=(
            "1. Time elapsed since report: 14:55 - 14:30 = 25 minutes",
            "2. Required arrival time: 14:30 + 20 minutes = 14:50",
            "3. Current time: 14:55",
            "4. Is the temporal window still open? NO - deadline was 5 minutes ago",
            "5. Can the fire department still prevent collapse? NO",
        )),
        'correct_answer': 'C',
        'description': 'Too late - explicit temporal reasoning forced'
    }
]

print("Test scenarios created. These will test whether explicit prompting helps failing models.")
print(f"Total scenarios: {len(test_scenarios)}")
for entry in test_scenarios:
    print(f"  - {entry['id']}: {entry['description']}")

### Test Models with Different Prompting Strategies

Now let's see if explicit prompting actually helps. This is the key experiment.

In [None]:
if TRANSFORMERS_AVAILABLE:
    # Framework for running a Hugging Face causal LM over the test scenarios.

    def test_model_on_scenarios(model_name, scenarios, device='cuda'):
        """
        Test a model on temporal reasoning scenarios.

        The key question: does explicit prompting help models that fail?

        Args:
            model_name: Hugging Face model id passed to ``from_pretrained``.
            scenarios: Iterable of dicts with the keys 'id', 'prompt',
                'correct_answer' and 'description'.
            device: Fallback device for the tokenized inputs. The model is
                placed by ``device_map='auto'``, so inputs are sent to the
                device the loaded model reports and this value is only used
                if the model exposes no ``device`` attribute.

        Returns:
            A DataFrame with one row per scenario (scenario_id, description,
            correct_answer, model_answer, correct, full_response), or None if
            loading or generation raised; the error is printed in that case.
        """
        print(f"Testing {model_name}...")

        model = None
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16,
                device_map='auto'
            )
            # Inputs must sit where the model's first layers were placed,
            # which is not necessarily the `device` the caller named.
            input_device = getattr(model, 'device', device)

            results = []

            for scenario in scenarios:
                inputs = tokenizer(scenario['prompt'], return_tensors='pt').to(input_device)
                prompt_tokens = inputs['input_ids'].shape[1]

                with torch.no_grad():
                    outputs = model.generate(
                        **inputs,
                        max_new_tokens=256,
                        temperature=0.7,
                        do_sample=True,
                        pad_token_id=tokenizer.eos_token_id
                    )

                # Decode only the newly generated tokens. Slicing the decoded
                # text by len(prompt) breaks whenever decoding does not
                # reproduce the prompt character-for-character.
                response = tokenizer.decode(
                    outputs[0][prompt_tokens:], skip_special_tokens=True
                ).strip()

                # Extract answer: prefer an explicit "answer is X" / "Answer: X",
                # otherwise fall back to the first standalone capital A-D.
                answer_match = (
                    re.search(r'(?i:answer)\W*(?:is\W*)?([A-D])\b', response)
                    or re.search(r'\b([A-D])\b', response)
                )
                answer = answer_match.group(1) if answer_match else None

                results.append({
                    'scenario_id': scenario['id'],
                    'description': scenario['description'],
                    'correct_answer': scenario['correct_answer'],
                    'model_answer': answer,
                    'correct': answer == scenario['correct_answer'],
                    'full_response': response
                })

                print(f"  {scenario['id']}: {'✓' if results[-1]['correct'] else '✗'}")

            return pd.DataFrame(results)

        except Exception as e:
            # Best effort: report the failure and let the caller move on to
            # the next model.
            print(f"Error testing {model_name}: {e}")
            return None

        finally:
            # Clean up on both the success and the error path so that a failed
            # run does not keep its weights in GPU memory.
            del model
            torch.cuda.empty_cache()

    # Models to test - start with the ones that failed
    models_to_test = [
        'Qwen/Qwen2.5-7B-Instruct',  # This one succeeds - baseline
        # Add Jamba/RWKV/Llama-4 if available
    ]

    print("To test models, uncomment and run the test_model_on_scenarios function.")
    print("This requires GPU access and model downloads.")

else:
    print("Transformers not available - skipping model testing.")
    print("Install with: pip install transformers torch")

In [None]:
# Hugging Face model id -> display name used in the results tables
models_to_test = {
    # Models that succeeded
    'Qwen/Qwen2.5-7B-Instruct': 'Qwen 2.5 7B',
    'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B': 'DeepSeek-R1-Distill 7B',

    # Models that failed - these are the interesting ones
    # NOTE(review): the disabled id below is Jamba v0.1, not Jamba 1.5 Mini -
    # confirm the intended checkpoint before enabling it.
    # 'ai21labs/Jamba-v0.1': 'Jamba 1.5 Mini',
    # NOTE(review): these two ids may not load through AutoModelForCausalLM
    # as written (custom code / non-transformers checkpoint format) - verify.
    # A load failure is reported by test_model_on_scenarios and skipped.
    'RWKV/rwkv-6-world-3b': 'RWKV-6 3B',
    'state-spaces/mamba-2.8b': 'Mamba-2.8B',
}

# test_model_on_scenarios is only defined when transformers imported
# successfully, so calling it unconditionally would raise NameError.
if TRANSFORMERS_AVAILABLE:
    run_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    for model_id, display_name in models_to_test.items():
        print(f"\nTesting {display_name} ({model_id})...")
        results = test_model_on_scenarios(model_id, test_scenarios, device=run_device)
        if results is not None:
            results.to_csv(f'/content/{display_name.replace(" ", "_")}_prompting_test.csv', index=False)
else:
    print("Transformers not available - skipping prompting tests.")

## Part 3: Analyze Training Data Patterns

Models that succeed probably saw different training data. Let's hypothesize what patterns matter.

In [None]:
# Hypothesis: success on window detection depends on having seen certain
# kinds of temporal phrasing in training. Example phrasings, by category:
temporal_patterns_needed = dict(
    window_closure=[
        "It's too late to...",
        "The deadline has passed",
        "The window closed at...",
        "No longer possible because...",
        "Already expired",
        "Cannot be done after...",
    ],
    urgency_escalation=[
        "becoming more urgent",
        "time is running out",
        "must act immediately",
        "situation is deteriorating",
        "rapidly approaching deadline",
    ],
    temporal_calculation=[
        "X hours remaining",
        "deadline in Y minutes",
        "elapsed time is",
        "started at A, deadline at B",
        "current time minus start time",
    ],
    consequence_of_lateness=[
        "if we don't act by X, then Y",
        "missing the deadline means",
        "the opportunity will be lost",
        "after which point",
        "cannot recover if",
    ],
)

print("Hypothesis: Models need exposure to these temporal reasoning patterns during training:")
print()
for category_key in temporal_patterns_needed:
    heading = category_key.upper().replace('_', ' ')
    print(f"{heading}:")
    for example in temporal_patterns_needed[category_key]:
        print(f"  • {example}")
    print()

# Hard-coded prediction text; an empty entry stands for a blank line
prediction_lines = [
    "Prediction:",
    "Models that succeed (Qwen, DeepSeek) likely saw more examples of:",
    "1. Temporal window closure statements",
    "2. Explicit temporal calculations",
    "3. Consequences of acting too late",
    "",
    "Models that fail (Jamba, RWKV, Llama-4) likely saw:",
    "1. Urgency markers but not closure",
    "2. Action recommendations without temporal validation",
    "3. Pattern matching to 'take action' without checking viability",
]
for line in prediction_lines:
    print(line)

## Part 4: Test for Temporal Understanding with Direct Probing

Can models answer direct questions about temporal logic?

In [None]:
# Direct yes/no probes of temporal logic, independent of any decision task.
# NOTE(review): every expected answer is 'NO', so a model that always answers
# NO would score 100% - consider adding YES controls.
_PROBE_SPECS = (
    ("A task must be completed by 3:00 PM. It is currently 3:15 PM.\n"
     "Is it still possible to complete the task on time? Answer YES or NO.",
     'Basic temporal logic'),
    ("A delivery window is 2:00 PM to 4:00 PM. You want to place an order at 4:30 PM\n"
     "for same-day delivery in that window. Is this possible? Answer YES or NO.",
     'Window closure recognition'),
    ("Task A takes 30 minutes. Task B takes 45 minutes. You have 1 hour total.\n"
     "Can you complete both tasks? Answer YES or NO.",
     'Temporal arithmetic'),
    ("A meeting started at 10:00 AM and ended at 11:30 AM. Someone arrived at 11:45 AM.\n"
     "Did they make it to the meeting? Answer YES or NO.",
     'Post-deadline arrival'),
    ("An auction closes at exactly 5:00 PM. Your bid is submitted at 5:00:01 PM.\n"
     "Will your bid be accepted? Answer YES or NO.",
     'Precise temporal boundaries'),
)

probing_questions = [
    {'question': question_text, 'correct': 'NO', 'tests': skill}
    for question_text, skill in _PROBE_SPECS
]

print("Probing Questions for Direct Temporal Logic Testing:")
print("="*70)
number = 0
for item in probing_questions:
    number += 1
    print(f"\n{number}. {item['tests']}")
    print(f"   Question: {item['question']}")
    print(f"   Correct answer: {item['correct']}")

print("\n" + "="*70)
print("Hypothesis: Models that fail window detection will also fail these direct probes.")
print("If they pass these but fail the decision tasks, that's interesting - they CAN")
print("reason about time but don't apply it to decision-making.")

## Part 5: Architecture-Specific Patterns

Different architectures might process temporal information differently.

In [None]:
# Categorize models by architecture.
# Jamba interleaves attention layers with Mamba (state-space) layers, so it
# gets its own hybrid category instead of being counted as a pure transformer.
architecture_categories = {
    'Transformer': ['Qwen 2.5 7B', 'DeepSeek-R1-Distill 7B'],
    'Hybrid Transformer-SSM': ['Jamba 1.5 Mini'],
    'State Space Model': ['Mamba-2.8B'],
    'RNN-based': ['RWKV-6 3B']
}

# Map models to architectures (comprehension avoids rebinding `models`/`model`
# at notebook scope)
model_to_arch = {
    model_name: arch
    for arch, model_names in architecture_categories.items()
    for model_name in model_names
}

# Analyze performance by architecture
exp2_with_arch = exp2_results.copy()
exp2_with_arch['architecture'] = exp2_with_arch['model'].map(model_to_arch)

# Models without an architecture entry would be dropped by the groupby below
uncategorized = sorted(exp2_with_arch.loc[exp2_with_arch['architecture'].isna(), 'model'].unique())
if uncategorized:
    print(f"Warning: no architecture recorded for {uncategorized}; excluded from the table.")

arch_performance = exp2_with_arch.groupby(['architecture', 'model'])['correct'].mean().reset_index()

print("Window Detection Accuracy by Architecture:")
print("="*70)
for arch in architecture_categories:
    print(f"\n{arch}:")
    arch_data = arch_performance[arch_performance['architecture'] == arch]
    for _, row in arch_data.iterrows():
        status = "✓ PASS" if row['correct'] > 0.9 else "✗ FAIL" if row['correct'] < 0.1 else "~ PARTIAL"
        print(f"  {row['model']}: {row['correct']:.1%} {status}")

print("\n" + "="*70)
print("Observation: Jamba 1.5 Mini is a hybrid Transformer-SSM model, not a pure transformer.")
print("With only one or two models per architecture family, architecture and training data")
print("are confounded - this table alone cannot attribute the failures to either one.")

## Part 6: Correlation Analysis

What factors correlate with temporal reasoning success?

In [None]:
# Build one accuracy row per model across all tasks, then correlate the tasks.
# A model that was not run on a task keeps NaN for it. Filling those gaps
# with 0 would invent a 0% score and distort the correlations; DataFrame.corr
# skips NaN pairs instead.

# Experiment 1: Urgency (one column per urgency type)
exp1_summary = (
    exp1_results.groupby(['model', 'urgency_type'])['correct'].mean()
    .unstack()
    .reindex(columns=['explicit', 'implicit'])  # both columns exist even if a type is absent
)

# Experiment 2: Window detection
exp2_summary = exp2_results.groupby('model')['correct'].mean()

# Outer join keeps models that appear in only one of the experiments
perf_df = exp1_summary.rename(
    columns={'explicit': 'explicit_urgency', 'implicit': 'implicit_urgency'}
).join(exp2_summary.rename('window_detection'), how='outer')
perf_df.columns.name = None

# Same data as a nested dict: {model: {task: accuracy}}
model_performance = perf_df.to_dict(orient='index')

print("Complete Performance Matrix:")
print(perf_df.round(3))
print()

# Correlation analysis
correlations = perf_df.corr()
print("\nCorrelations between different temporal reasoning tasks:")
print(correlations.round(3))
print(f"(Computed over {len(perf_df)} models - treat as descriptive, not as a significance test.)")

# Visualize
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlations, annot=True, fmt='.3f', cmap='coolwarm', center=0,
            square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Correlation Between Temporal Reasoning Tasks', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('/content/task_correlations.png', dpi=150, bbox_inches='tight')
plt.show()

explicit_r = correlations.loc['explicit_urgency', 'window_detection']
implicit_r = correlations.loc['implicit_urgency', 'window_detection']

print("\nKey Finding:")
if pd.isna(explicit_r) and pd.isna(implicit_r):
    # Happens when a task has constant or missing accuracy across models;
    # that is not the same thing as the tasks being independent.
    print("Correlation with window detection is undefined (constant or missing accuracies).")
elif explicit_r > 0.7:
    print("Strong correlation between explicit urgency and window detection.")
    print("Models that handle explicit urgency also detect windows.")
elif implicit_r > 0.7:
    print("Strong correlation between implicit urgency and window detection.")
    print("Models that infer urgency also detect windows.")
else:
    print("Tasks are somewhat independent - different failure modes.")

## Part 7: The Llama-4 Mystery

Llama-4 17B performs worse than Qwen 2.5 7B. That's striking. Let's think about why.

In [None]:
# Clear GPU memory

torch.cuda.empty_cache()

# Delete model objects
#del models['Gemma 2 9B']
#del models['Phi-3.5 Mini']


# Force garbage collection
import gc
gc.collect()

# Check available space
!df -h

# If you need more space, remove cached models from Hugging Face
!rm -rf /root/.cache/huggingface/hub/*


In [None]:
# Colab-only: mount Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Llama-4 accuracies entered by hand (taken from a reported run, not computed
# in this notebook) and compared with Qwen's measured accuracies in perf_df.
llama4_urgency = {
    'explicit': 0.285714,
    'implicit': 0.240000
}
llama4_window = 0.0  # 0% accuracy, 100% false positive

# Compare to Qwen
qwen_urgency = perf_df.loc['Qwen 2.5 7B', ['explicit_urgency', 'implicit_urgency']]
qwen_window = perf_df.loc['Qwen 2.5 7B', 'window_detection']

comparison = pd.DataFrame({
    'Llama-4 17B': [llama4_urgency['explicit'], llama4_urgency['implicit'], llama4_window],
    'Qwen 2.5 7B': [qwen_urgency['explicit_urgency'], qwen_urgency['implicit_urgency'], qwen_window]
}, index=['Explicit Urgency', 'Implicit Urgency', 'Window Detection'])

print("Llama-4 17B vs Qwen 2.5 7B:")
print("="*70)
print(comparison)
print()

# Visualize the comparison as grouped bars
fig, ax = plt.subplots(figsize=(10, 6))
x = np.arange(len(comparison.index))
width = 0.35

bars1 = ax.bar(x - width/2, comparison['Llama-4 17B'], width, label='Llama-4 17B',
               color='#e74c3c', alpha=0.8)
bars2 = ax.bar(x + width/2, comparison['Qwen 2.5 7B'], width, label='Qwen 2.5 7B',
               color='#2ecc71', alpha=0.8)

ax.set_ylabel('Accuracy', fontsize=12)
ax.set_title('The Scale Paradox: Bigger Model, Worse Performance', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(comparison.index)
# Draw the baseline BEFORE building the legend: legend() only lists artists
# that already exist, so the old order left this label out.
ax.axhline(0.25, color='gray', linestyle='--', alpha=0.3, label='Random guess (25%)')
ax.legend()
ax.set_ylim([0, 1.1])

# Add value labels above each bar
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height + 0.02,
                f'{height:.1%}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.savefig('/content/llama4_vs_qwen.png', dpi=150, bbox_inches='tight')
plt.show()

print("Possible explanations for Llama-4's poor performance:")
print()
print("1. Training data distribution")
print("   - Llama-4 may have seen fewer examples of temporal window closure")
print("   - Training data may emphasize 'take action' without temporal validation")
print()
print("2. Optimization objective")
print("   - If trained primarily on next-token prediction without reasoning traces")
print("   - May have learned pattern matching without temporal logic")
print()
print("3. Post-training alignment")
print("   - RLHF may have reinforced 'helpful' responses (suggesting actions)")
print("   - Without balancing with 'it's too late' responses")
print()
print("4. Model capacity misallocation")
print("   - Larger model may have allocated capacity to other capabilities")
print("   - Temporal reasoning may not have been prioritized in training")