In [None]:
"""
Temporal Perception in Neural Architectures: Empirical Validation
================================================================

This notebook implements statistically rigorous experiments to test whether
different neural architectures exhibit varying degrees of temporal reasoning
capability.

Architectures tested:
1. GPT-2 style transformer (baseline)
2. Mamba (state space model)
3. RWKV (RNN alternative)
4. Hybrid architectures if available

Experiments:
1. Urgency-based action prioritization
2. Too-late window detection
3. Multi-agent temporal coordination (simplified)

Statistical approach:
- Power analysis for sample size determination
- Multiple comparison correction (Bonferroni)
- Effect size calculations (Cohen's d)
- Bootstrap confidence intervals
"""

In [None]:
# Clear GPU memory

#torch.cuda.empty_cache()

# Delete model objects
#del models['Gemma 2 9B']
#del models['Phi-3.5 Mini']


# Force a full garbage-collection pass so objects that are no longer
# referenced (e.g. model objects removed with `del` above) can be reclaimed.
import gc
gc.collect()

# Check available space
#!df -h

# If you need more space, remove cached models from Hugging Face
#!rm -rf /root/.cache/huggingface/hub/*


In [None]:
#!pip install -q transformers accelerate bitsandbytes
#!pip install -q mamba-ssm  # State space models
#!pip install -q causal-conv1d>=1.2.0

In [None]:
import torch
import torch.nn.functional as F
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline
)
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.stats.power import TTestIndPower
import plotly.graph_objects as go
import plotly.express as px
from dataclasses import dataclass
from typing import List, Dict, Tuple
import json
import random
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Set seeds for reproducibility: PyTorch, NumPy and the stdlib RNG are all
# seeded with the same value.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Check GPU: use CUDA when available, otherwise fall back to CPU, and report
# which device was selected.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is divided by 1e9 so the value is printed in decimal GB
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

In [None]:
# ============================================================================
# POWER ANALYSIS - DETERMINE REQUIRED SAMPLE SIZE
# ============================================================================

def calculate_required_sample_size(
    effect_size: float = 0.5,  # Medium effect size (Cohen's d)
    alpha: float = 0.05,        # Significance level
    power: float = 0.8,         # Statistical power
    num_groups: int = 4         # Number of architectures
):
    """
    Calculate required sample size for detecting differences between architectures.

    The per-group sample size is solved for a two-sided independent-samples
    t-test whose significance level is Bonferroni-corrected for all pairwise
    comparisons between ``num_groups`` groups.

    Cohen's d interpretations:
    - 0.2: Small effect
    - 0.5: Medium effect
    - 0.8: Large effect

    We use 0.5 (medium) because we expect meaningful but not massive differences.

    Args:
        effect_size: Expected standardized mean difference (Cohen's d).
        alpha: Family-wise significance level before correction.
        power: Desired statistical power.
        num_groups: Number of groups being compared pairwise (must be >= 2).

    Returns:
        Required number of samples per group, rounded up to an integer.

    Raises:
        ValueError: If ``num_groups`` is smaller than 2, because there is no
            pairwise comparison to power (previously this surfaced as a
            ZeroDivisionError from the Bonferroni correction).
    """
    if num_groups < 2:
        raise ValueError(
            f"num_groups must be at least 2 to compare groups, got {num_groups}"
        )

    # Number of pairwise comparisons: k * (k - 1) / 2. Computed once so the
    # value used for the power analysis and the value printed cannot drift apart.
    num_comparisons = num_groups * (num_groups - 1) / 2
    corrected_alpha = alpha / num_comparisons  # Bonferroni correction

    analysis = TTestIndPower()
    n_per_group = analysis.solve_power(
        effect_size=effect_size,
        alpha=corrected_alpha,
        power=power,
        alternative='two-sided'
    )

    # Round up
    n_per_group = int(np.ceil(n_per_group))

    print(f"Power Analysis Results:")
    print(f"  Effect size (Cohen's d): {effect_size}")
    print(f"  Significance level (α): {alpha}")
    print(f"  Statistical power: {power}")
    print(f"  Number of groups: {num_groups}")
    print(f"  Bonferroni-corrected α: {corrected_alpha:.4f}")
    print(f"  Required sample size per group: {n_per_group}")
    print(f"  Total scenarios needed: {n_per_group * num_groups}")

    return n_per_group

# Run power analysis with the function's defaults (d=0.5, α=0.05, power=0.8,
# 4 groups). The experiment runners use this value as the number of scenarios
# per model whenever `n_scenarios` is not passed explicitly.
SAMPLE_SIZE_PER_GROUP = calculate_required_sample_size()

# ============================================================================
# MODEL LOADING
# ============================================================================

@dataclass
class ModelConfig:
    """Static description of one model to load and evaluate."""
    name: str  # display name; used as the key in `models` and in result tables
    model_id: str  # identifier passed to `from_pretrained` for model and tokenizer
    architecture_type: str  # 'transformer', 'ssm', 'rnn', 'hybrid'
    max_length: int = 2048  # prompts are truncated to this many tokens

# Models to evaluate; each entry is loaded into an ArchitectureWrapper.
# NOTE(review): only a single transformer is configured here, so there is no
# second model/architecture for the pairwise comparisons — add 'ssm' / 'rnn'
# entries to run the cross-architecture comparison described in the header.
MODELS = [

    ModelConfig(
        name="Llama-4 17B",
        model_id="meta-llama/Llama-4-Scout-17B-16E-Instruct",
        architecture_type="transformer",
        max_length=4096
    ),
]

class ArchitectureWrapper:
    """Unified interface for different architectures.

    Loads a causal language model and its tokenizer from ``config.model_id``
    and exposes a single :meth:`generate_response` method. Load errors are
    caught in ``__init__``: ``model`` and ``tokenizer`` are then ``None`` and
    :meth:`generate_response` returns the sentinel ``"MODEL_LOAD_FAILED"``.
    """

    # Architecture types whose checkpoints are loaded with trust_remote_code=True.
    # Every other type (e.g. 'transformer', 'hybrid') is loaded without it.
    _REMOTE_CODE_ARCHITECTURES = ("ssm", "rnn")

    def __init__(self, config: "ModelConfig"):
        """Load the model and tokenizer described by ``config``.

        Args:
            config: Model description (name, model id, architecture type,
                maximum prompt length).
        """
        self.config = config
        self.device = device

        print(f"\nLoading {config.name}...")

        try:
            # All architectures use the same loading call; the only difference
            # is whether custom modelling code from the checkpoint is trusted.
            load_kwargs = {"torch_dtype": torch.float16, "device_map": "auto"}
            if config.architecture_type in self._REMOTE_CODE_ARCHITECTURES:
                load_kwargs["trust_remote_code"] = True

            self.model = AutoModelForCausalLM.from_pretrained(
                config.model_id,
                **load_kwargs
            )

            self.tokenizer = AutoTokenizer.from_pretrained(
                config.model_id,
                trust_remote_code=True
            )

            # Set pad token if needed (generation requires a pad token id)
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            self.model.eval()
            print(f"✓ {config.name} loaded successfully")

        except Exception as e:
            # Best-effort loading: report the failure and mark the wrapper as
            # unusable instead of aborting the whole notebook.
            print(f"✗ Failed to load {config.name}: {str(e)}")
            self.model = None
            self.tokenizer = None

    def generate_response(
        self,
        prompt: str,
        max_new_tokens: int = 256,
        temperature: float = 0.7
    ) -> str:
        """Generate response from model.

        Args:
            prompt: Text to continue; truncated to ``config.max_length`` tokens.
            max_new_tokens: Maximum number of tokens to generate.
            temperature: Sampling temperature. A value of 0 (or ``None``)
                selects deterministic greedy decoding, because sampling with a
                non-positive temperature is rejected by ``generate``.

        Returns:
            The stripped generated continuation (prompt tokens removed),
            ``"MODEL_LOAD_FAILED"`` if the model never loaded, or
            ``"GENERATION_FAILED"`` if generation raised an exception.
        """
        if self.model is None:
            return "MODEL_LOAD_FAILED"

        try:
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=self.config.max_length
            ).to(self.device)

            # Only sample (and only pass `temperature`) when it is positive;
            # otherwise fall back to greedy decoding.
            use_sampling = temperature is not None and temperature > 0
            generation_kwargs = {
                "max_new_tokens": max_new_tokens,
                "do_sample": use_sampling,
                "pad_token_id": self.tokenizer.pad_token_id,
            }
            if use_sampling:
                generation_kwargs["temperature"] = temperature

            with torch.no_grad():
                outputs = self.model.generate(**inputs, **generation_kwargs)

            # Drop the prompt tokens so only the newly generated text is decoded
            prompt_length = inputs['input_ids'].shape[1]
            response = self.tokenizer.decode(
                outputs[0][prompt_length:],
                skip_special_tokens=True
            )

            return response.strip()

        except Exception as e:
            print(f"Generation error: {str(e)}")
            return "GENERATION_FAILED"

# Load all models
# ArchitectureWrapper.__init__ catches load errors itself and leaves
# `wrapper.model` as None, so a constructed wrapper is not proof of success.
# Only keep wrappers that actually loaded; otherwise every response would be
# the "MODEL_LOAD_FAILED" sentinel and would be scored as a model answer.
models = {}
for config in MODELS:
    try:
        wrapper = ArchitectureWrapper(config)
    except Exception as e:
        print(f"Could not load {config.name}: {e}")
        continue

    if wrapper.model is None or wrapper.tokenizer is None:
        print(f"Skipping {config.name}: model failed to load")
        continue

    models[config.name] = wrapper

print(f"\n{len(models)} models loaded successfully")
if not models:
    print("WARNING: no models are available; the experiments cannot produce results.")

In [None]:
# ============================================================================
# EXPERIMENT 1: URGENCY-BASED ACTION PRIORITIZATION
# ============================================================================

class UrgencyScenario:
    """Generate scenarios with varying urgency profiles.

    Each generator returns a dict with the prompt text (``'scenario'``), the
    letter of the most urgent option (``'correct_answer'``), the urgency type
    (``'explicit'`` / ``'implicit'``), the domain, and the shuffled option
    records. Both generators reseed the global ``random`` module (the medical
    one also reseeds ``numpy.random``) with ``seed`` so the option order is
    reproducible per seed.
    """

    DOMAINS = ['medical', 'operational', 'crisis', 'project']

    @staticmethod
    def _most_urgent_index(options: List[Dict]) -> int:
        """Return the index of the option with the highest 'urgency_score'."""
        return max(range(len(options)),
                   key=lambda i: options[i]['urgency_score'])

    @staticmethod
    def _uppercase_first(text: str) -> str:
        """Upper-case only the first character, leaving the rest untouched.

        ``str.capitalize()`` is deliberately not used: it lower-cases the
        remainder of the string, which would turn an explicit "URGENT:"
        marker into "Urgent:".
        """
        return text[:1].upper() + text[1:]

    @staticmethod
    def generate_medical_scenario(seed: int) -> Dict:
        """Generate medical triage scenario.

        Args:
            seed: Seed controlling the order in which the patients are listed.

        Returns:
            Scenario dict; the correct answer is the patient with the highest
            urgency score, which is only inferable from the described change
            over time (no explicit urgency marker is present).
        """
        random.seed(seed)
        np.random.seed(seed)

        # Define patients with different urgency levels
        patients = [
            {
                'room': '203',
                'complaint': 'chronic pain level 6/10',
                'vitals': 'stable',
                'timeline': '45 minutes ago',
                'urgency_score': 2,  # Low
                'explicit_marker': False
            },
            {
                'room': '187',
                'complaint': 'mildly elevated white blood cell count',
                'vitals': 'no symptoms',
                'timeline': '10 minutes ago',
                'urgency_score': 1,  # Very low
                'explicit_marker': False
            },
            {
                'room': '145',
                'complaint': 'temperature rising from 100.2°F to 102.8°F over past hour',
                'vitals': 'otherwise stable',
                'timeline': 'just reported',
                'urgency_score': 4,  # High - temporal dynamics indicate urgency
                'explicit_marker': False
            },
            {
                'room': '201',
                'complaint': 'family question about discharge paperwork',
                'vitals': 'patient stable, discharge tomorrow',
                'timeline': 'waiting 30 minutes',
                'urgency_score': 1,  # Very low
                'explicit_marker': False
            }
        ]

        # Shuffle to avoid position bias
        random.shuffle(patients)

        # Find correct answer (highest urgency)
        correct_idx = UrgencyScenario._most_urgent_index(patients)

        # Build prompt; options are labelled A, B, C, D in shuffled order
        prompt = "You are managing a hospital unit. Four situations require attention:\n\n"

        for i, p in enumerate(patients):
            prompt += f"{chr(65+i)}: Patient in Room {p['room']} - "
            prompt += f"{p['complaint']}. Vitals: {p['vitals']}. "
            prompt += f"Reported {p['timeline']}.\n\n"

        prompt += "You can only address one situation immediately. "
        prompt += "Which do you prioritize? Respond with just the letter (A, B, C, or D) "
        prompt += "and a brief explanation of why.\n\nAnswer:"

        return {
            'scenario': prompt,
            'correct_answer': chr(65 + correct_idx),
            'urgency_type': 'implicit',  # Requires inferring from temporal dynamics
            'domain': 'medical',
            'patients': patients
        }

    @staticmethod
    def generate_crisis_scenario(seed: int) -> Dict:
        """Generate crisis response scenario with explicit urgency markers.

        Args:
            seed: Seed controlling the order in which the situations are listed.

        Returns:
            Scenario dict; ``'urgency_type'`` is ``'explicit'`` when any
            situation carries an explicit marker, otherwise ``'implicit'``.
        """
        random.seed(seed)

        situations = [
            {
                'description': 'power outage in server room, backup generators running',
                'timeline': '15 minutes ago',
                'urgency_score': 3,
                'explicit_marker': False
            },
            {
                'description': 'URGENT: smoke detector activated in chemical storage',
                'timeline': 'just now',
                'urgency_score': 5,
                'explicit_marker': True  # Explicit marker present
            },
            {
                'description': 'employee locked out of building, waiting at entrance',
                'timeline': '20 minutes ago',
                'urgency_score': 2,
                'explicit_marker': False
            },
            {
                'description': 'routine system update ready for installation',
                'timeline': 'scheduled for today',
                'urgency_score': 1,
                'explicit_marker': False
            }
        ]

        random.shuffle(situations)
        correct_idx = UrgencyScenario._most_urgent_index(situations)

        prompt = "You are the facility manager on duty. Four situations need attention:\n\n"

        for i, s in enumerate(situations):
            # Keep the original casing of the description so the all-caps
            # "URGENT:" marker reaches the model unchanged.
            prompt += f"{chr(65+i)}: {UrgencyScenario._uppercase_first(s['description'])}. "
            prompt += f"Reported {s['timeline']}.\n\n"

        prompt += "You must respond to one situation immediately. "
        prompt += "Which do you prioritize? Respond with just the letter (A, B, C, or D) "
        prompt += "and a brief explanation.\n\nAnswer:"

        has_explicit = any(s['explicit_marker'] for s in situations)

        return {
            'scenario': prompt,
            'correct_answer': chr(65 + correct_idx),
            'urgency_type': 'explicit' if has_explicit else 'implicit',
            'domain': 'crisis',
            'situations': situations
        }

def _extract_option_letter(response: str, options: str = "ABCD"):
    """Return the first stand-alone option letter in ``response``, or None.

    A letter only counts when it is a whole word (``"C"``, ``"C."``, ``"(C)"``,
    ``"C: because"``), so capitals inside words such as "Based", "Answer" or
    "Definitely" are not mistaken for a choice. The failure sentinels produced
    by the model wrapper never count as an answer.
    """
    import re  # local import so this block does not depend on the import cell

    if response in ("MODEL_LOAD_FAILED", "GENERATION_FAILED"):
        return None

    match = re.search(rf"\b([{re.escape(options)}])\b", response)
    return match.group(1) if match else None


def run_urgency_experiment(
    models: Dict[str, ArchitectureWrapper],
    n_scenarios: int = None
) -> pd.DataFrame:
    """
    Run Experiment 1: Urgency-based prioritization.

    Tests whether models can assess urgency appropriately based on:
    1. Explicit markers ("URGENT", "CRITICAL")
    2. Implicit temporal dynamics (rising temperature, rapid change)

    Args:
        models: Mapping of model name to loaded wrapper.
        n_scenarios: Number of scenarios per model; defaults to
            ``SAMPLE_SIZE_PER_GROUP``.

    Returns:
        One row per (model, scenario) with the scenario metadata, the parsed
        choice ('NONE' when no option letter could be found), whether it was
        correct, and the first 200 characters of the raw response.
    """
    if n_scenarios is None:
        n_scenarios = SAMPLE_SIZE_PER_GROUP

    print(f"\n{'='*70}")
    print(f"EXPERIMENT 1: URGENCY-BASED ACTION PRIORITIZATION")
    print(f"{'='*70}")
    print(f"Running {n_scenarios} scenarios per model...")

    results = []

    for model_name, model in models.items():
        print(f"\nTesting {model_name}...")

        for i in tqdm(range(n_scenarios)):
            # Generate mixed scenario types: even ids medical, odd ids crisis
            if i % 2 == 0:
                scenario = UrgencyScenario.generate_medical_scenario(seed=i)
            else:
                scenario = UrgencyScenario.generate_crisis_scenario(seed=i)

            # Get model response
            response = model.generate_response(
                scenario['scenario'],
                max_new_tokens=100,
                temperature=0.7
            )

            # Extract choice: first stand-alone letter A-D. Scanning single
            # characters instead would read "Based on..." as B and
            # "MODEL_LOAD_FAILED" as D.
            choice = _extract_option_letter(response)

            # Record result
            correct = (choice == scenario['correct_answer'])

            results.append({
                'model': model_name,
                'architecture': model.config.architecture_type,
                'scenario_id': i,
                'domain': scenario['domain'],
                'urgency_type': scenario['urgency_type'],
                'correct_answer': scenario['correct_answer'],
                'model_choice': choice if choice else 'NONE',
                'correct': correct,
                'response': response[:200]  # Store first 200 chars
            })

    df = pd.DataFrame(results)

    # Calculate statistics
    print("\n" + "="*70)
    print("RESULTS: Urgency Prioritization Accuracy")
    print("="*70)

    summary = df.groupby(['model', 'architecture', 'urgency_type'])['correct'].agg([
        ('accuracy', 'mean'),
        ('n', 'count'),
        ('correct_count', 'sum')
    ]).reset_index()

    print(summary.to_string(index=False))

    return df

In [None]:
# ============================================================================
# EXPERIMENT 2: TOO-LATE WINDOW DETECTION
# ============================================================================

class WindowScenario:
    """Factory for prompts that probe whether a temporal window has already closed."""

    @staticmethod
    def generate_trading_scenario(seed: int) -> Dict:
        """Build the stock-trading prompt.

        The price history ends above the limit price being considered, so the
        expected answer is 'NO'. ``seed`` reseeds the global ``random`` and
        ``numpy.random`` generators; the prompt text itself does not vary.
        """
        random.seed(seed)
        np.random.seed(seed)

        initial_price = 50.00
        target_price = 51.00

        # (minute, price, optional event) for every tick of the price trajectory
        ticks = [
            (0, initial_price, 'Breaking news expected at T=30min'),
            (10, 50.25, 'Can place market or limit order'),
            (20, 50.80, None),
            (28, 50.95, None),
            (30, 52.00, 'Breaking news drops, price jumps'),
            (31, 52.50, 'Price rising rapidly'),
        ]
        timeline = [
            {'time': minute, 'price': price, 'event': event}
            for minute, price, event in ticks
        ]

        # One prompt line per tick; the event text is appended only when present
        tick_lines = []
        for tick in timeline:
            suffix = f" - {tick['event']}" if tick['event'] else ""
            tick_lines.append(f"T={tick['time']} min: Price ${tick['price']:.2f}{suffix}\n")

        # Decision point: T=31, after price exceeded target
        prompt = "".join([
            "You are trading a stock position.\n\n",
            *tick_lines,
            "\nAt T=31 minutes, you consider: ",
            f"Should I place a limit order to buy at ${target_price:.2f}?\n\n",
            "Answer YES or NO and explain your reasoning.\n\nAnswer:",
        ])

        # Correct answer is NO - window has closed
        result = {}
        result['scenario'] = prompt
        result['correct_answer'] = 'NO'
        result['window_type'] = 'hard'  # Physically impossible now
        result['domain'] = 'trading'
        result['timeline'] = timeline
        return result

    @staticmethod
    def generate_project_scenario(seed: int) -> Dict:
        """Build the project-deadline prompt.

        The review date has already passed with the design unsubmitted, so the
        expected answer to "keep refining?" is 'NO'. ``seed`` reseeds the
        global ``random`` generator; the prompt text itself does not vary.
        """
        random.seed(seed)

        prompt_lines = (
            "You are managing a project with the following timeline:",
            "",
            "Day 1 (Monday): Design phase begins",
            "Day 3 (Wednesday): Design phase must be completed for review",
            "Day 5 (Friday): Implementation begins (requires approved design)",
            "Day 8 (Monday): Project deadline",
            "",
            "Current situation:",
            "- Today is Thursday (Day 4)",
            "- Design phase is 80% complete but not yet submitted for review",
            "- Review process takes minimum 2 days",
            "- Implementation will take 3 days minimum",
            "",
            "Question: Should we continue refining the design to make it perfect before submission?",
            "",
            "Answer YES or NO and explain your reasoning.",
            "",
            "Answer:",
        )
        prompt = "\n".join(prompt_lines)

        # Correct answer is NO - window for design refinement has closed
        # Must submit now to have any chance of meeting deadline
        result = {}
        result['scenario'] = prompt
        result['correct_answer'] = 'NO'
        result['window_type'] = 'soft'  # Technically possible but suboptimal
        result['domain'] = 'project'
        result['timeline'] = None
        return result

def _extract_yes_no(response: str) -> str:
    """Return 'YES' or 'NO' for the first whole-word yes/no in ``response``.

    Matching is case-insensitive and restricted to whole words, so "not",
    "now", "know" or "eyes" are not mistaken for an answer. Returns 'UNCLEAR'
    when neither word occurs.
    """
    import re  # local import so this block does not depend on the import cell

    match = re.search(r"\b(YES|NO)\b", response.upper())
    return match.group(1) if match else 'UNCLEAR'


def run_toolate_experiment(
    models: Dict[str, ArchitectureWrapper],
    n_scenarios: int = None
) -> pd.DataFrame:
    """
    Run Experiment 2: Too-late window detection.

    Tests whether models recognize when temporal windows have closed.

    Args:
        models: Mapping of model name to loaded wrapper.
        n_scenarios: Number of scenarios per model; defaults to
            ``SAMPLE_SIZE_PER_GROUP``.

    Returns:
        One row per (model, scenario) with the scenario metadata, the parsed
        choice ('YES', 'NO' or 'UNCLEAR'), whether it was correct, and the
        first 200 characters of the raw response.
    """
    if n_scenarios is None:
        n_scenarios = SAMPLE_SIZE_PER_GROUP

    print(f"\n{'='*70}")
    print(f"EXPERIMENT 2: TOO-LATE WINDOW DETECTION")
    print(f"{'='*70}")
    print(f"Running {n_scenarios} scenarios per model...")

    results = []

    for model_name, model in models.items():
        print(f"\nTesting {model_name}...")

        for i in tqdm(range(n_scenarios)):
            # Alternate between scenario types: even ids trading, odd ids project
            if i % 2 == 0:
                scenario = WindowScenario.generate_trading_scenario(seed=i)
            else:
                scenario = WindowScenario.generate_project_scenario(seed=i)

            # Get model response
            response = model.generate_response(
                scenario['scenario'],
                max_new_tokens=150,
                temperature=0.7
            )

            # Extract YES/NO from response: the first whole-word occurrence
            # wins; plain substring search would find "NO" inside "NOT"/"KNOW".
            choice = _extract_yes_no(response)

            correct = (choice == scenario['correct_answer'])

            results.append({
                'model': model_name,
                'architecture': model.config.architecture_type,
                'scenario_id': i,
                'domain': scenario['domain'],
                'window_type': scenario['window_type'],
                'correct_answer': scenario['correct_answer'],
                'model_choice': choice,
                'correct': correct,
                'response': response[:200]
            })

    df = pd.DataFrame(results)

    # Calculate statistics
    print("\n" + "="*70)
    print("RESULTS: Too-Late Detection Accuracy")
    print("="*70)

    summary = df.groupby(['model', 'architecture'])['correct'].agg([
        ('accuracy', 'mean'),
        ('n', 'count'),
        ('correct_count', 'sum')
    ]).reset_index()

    print(summary.to_string(index=False))

    # Calculate false positive rate (saying YES when should say NO).
    # Only explicit 'YES' answers count: 'UNCLEAR' responses are wrong but are
    # not a decision to continue, so `1 - accuracy` would overstate the rate.
    fp_df = df[df['correct_answer'] == 'NO'].copy()
    fp_df['false_positive'] = fp_df['model_choice'] == 'YES'
    fp_rate = fp_df.groupby('model')['false_positive'].mean().reset_index()
    fp_rate.columns = ['model', 'false_positive_rate']

    print("\nFalse Positive Rates (continuing action after window closed):")
    print(fp_rate.to_string(index=False))

    return df

In [None]:
# ============================================================================
# STATISTICAL ANALYSIS
# ============================================================================

def perform_statistical_analysis(
    df: pd.DataFrame,
    experiment_name: str
) -> Dict:
    """
    Perform comprehensive statistical analysis on experimental results.

    Args:
        df: Experiment results with at least a 'model' column and a boolean
            (or 0/1) 'correct' column.
        experiment_name: Label printed in the report header.

    Returns:
        Dict with:
        - 'descriptive_stats': per-model mean accuracy, std, n and 95% t-interval
        - 'pairwise_comparisons': independent-samples t-tests between models
          with Bonferroni-corrected significance and Cohen's d (empty when
          fewer than two models are present)
        - 'anova_f', 'anova_p': one-way ANOVA results (NaN when fewer than two
          models are present)
    """
    print(f"\n{'='*70}")
    print(f"STATISTICAL ANALYSIS: {experiment_name}")
    print(f"{'='*70}")

    descriptive_columns = ['model', 'mean_accuracy', 'std', 'n', 'ci_lower', 'ci_upper']
    pairwise_columns = ['comparison', 'mean_diff', 't_stat', 'p_value',
                        'significant', 'cohens_d']
    nan = float('nan')

    # Nothing to analyse (e.g. no model could be loaded)
    if df.empty or 'model' not in df.columns or 'correct' not in df.columns:
        print("\nNo results available - skipping statistical analysis.")
        return {
            'descriptive_stats': pd.DataFrame(columns=descriptive_columns),
            'pairwise_comparisons': pd.DataFrame(columns=pairwise_columns),
            'anova_f': nan,
            'anova_p': nan
        }

    # Group by model; cast the boolean outcomes to float once so every
    # downstream numpy/scipy routine receives numeric data
    model_accuracies = {
        model: group.to_numpy(dtype=float)
        for model, group in df.groupby('model')['correct']
    }
    model_names = list(model_accuracies.keys())

    # Calculate means and stds
    stats_summary = []
    for model in model_names:
        accuracies = model_accuracies[model]
        n = len(accuracies)
        mean_acc = np.mean(accuracies)

        if n > 1:
            std_acc = np.std(accuracies, ddof=1)
            # 95% confidence interval (Student t, n-1 degrees of freedom)
            ci = stats.t.interval(
                0.95,
                n-1,
                loc=mean_acc,
                scale=stats.sem(accuracies)
            )
        else:
            # A single observation has no sample variance
            std_acc = nan
            ci = (nan, nan)

        stats_summary.append({
            'model': model,
            'mean_accuracy': mean_acc,
            'std': std_acc,
            'n': n,
            'ci_lower': ci[0],
            'ci_upper': ci[1]
        })

    stats_df = pd.DataFrame(stats_summary, columns=descriptive_columns)
    print("\nDescriptive Statistics:")
    print(stats_df.to_string(index=False))

    # Between-model tests need at least two models; with a single model the
    # Bonferroni divisor would be zero and ANOVA is undefined.
    if len(model_names) < 2:
        print("\nFewer than two models - skipping pairwise comparisons and ANOVA.")
        return {
            'descriptive_stats': stats_df,
            'pairwise_comparisons': pd.DataFrame(columns=pairwise_columns),
            'anova_f': nan,
            'anova_p': nan
        }

    # Pairwise t-tests with Bonferroni correction
    num_comparisons = len(model_names) * (len(model_names) - 1) // 2
    bonferroni_alpha = 0.05 / num_comparisons

    print(f"\nPairwise Comparisons (Bonferroni-corrected α = {bonferroni_alpha:.4f}):")
    print("-" * 70)

    pairwise_results = []

    for i in range(len(model_names)):
        for j in range(i+1, len(model_names)):
            model_a = model_names[i]
            model_b = model_names[j]

            data_a = model_accuracies[model_a]
            data_b = model_accuracies[model_b]
            mean_diff = np.mean(data_a) - np.mean(data_b)

            # Independent samples t-test
            t_stat, p_value = stats.ttest_ind(data_a, data_b)

            # Cohen's d effect size (pooled standard deviation)
            pooled_std = np.sqrt(
                ((len(data_a) - 1) * np.var(data_a, ddof=1) +
                 (len(data_b) - 1) * np.var(data_b, ddof=1)) /
                (len(data_a) + len(data_b) - 2)
            )
            # Both groups can have zero variance (all answers identical);
            # the result is then inf/NaN rather than a floating-point warning.
            with np.errstate(divide='ignore', invalid='ignore'):
                cohens_d = np.float64(mean_diff) / np.float64(pooled_std)

            is_significant = bool(p_value < bonferroni_alpha)
            significant = "***" if is_significant else ""

            print(f"{model_a} vs {model_b}:")
            print(f"  Mean difference: {mean_diff:.4f}")
            print(f"  t-statistic: {t_stat:.4f}")
            print(f"  p-value: {p_value:.4f} {significant}")
            print(f"  Cohen's d: {cohens_d:.4f}")
            print()

            pairwise_results.append({
                'comparison': f"{model_a} vs {model_b}",
                'mean_diff': mean_diff,
                't_stat': t_stat,
                'p_value': p_value,
                'significant': is_significant,
                'cohens_d': cohens_d
            })

    pairwise_df = pd.DataFrame(pairwise_results, columns=pairwise_columns)

    # One-way ANOVA
    accuracy_groups = [model_accuracies[m] for m in model_names]
    f_stat, p_anova = stats.f_oneway(*accuracy_groups)

    print(f"One-Way ANOVA:")
    print(f"  F-statistic: {f_stat:.4f}")
    print(f"  p-value: {p_anova:.4f}")

    if p_anova < 0.05:
        print(f"  → Significant differences exist between models (p < 0.05)")
    else:
        print(f"  → No significant differences detected between models")

    return {
        'descriptive_stats': stats_df,
        'pairwise_comparisons': pairwise_df,
        'anova_f': f_stat,
        'anova_p': p_anova
    }

# ============================================================================
# VISUALIZATION
# ============================================================================

def visualize_results(
    experiment1_df: pd.DataFrame,
    experiment2_df: pd.DataFrame
):
    """Render three bar charts summarising both experiments.

    1. Experiment 1 accuracy per model, split by urgency type.
    2. Experiment 2 accuracy per model.
    3. Mean accuracy of both experiments per architecture type.

    Each figure is displayed with ``show()``; nothing is returned.
    """
    unit_range = [0, 1]  # accuracy axis limits shared by all figures

    # Experiment 1: Urgency prioritization by urgency type
    urgency_accuracy = (
        experiment1_df
        .groupby(['model', 'urgency_type'])['correct']
        .mean()
        .reset_index()
    )
    fig1 = px.bar(
        urgency_accuracy,
        x='model',
        y='correct',
        color='urgency_type',
        barmode='group',
        title='Experiment 1: Urgency Prioritization Accuracy by Type',
        labels={'correct': 'Accuracy', 'model': 'Model', 'urgency_type': 'Urgency Type'},
        color_discrete_map={'explicit': '#2ecc71', 'implicit': '#e74c3c'}
    )
    fig1.update_layout(yaxis_range=unit_range)
    fig1.show()

    # Experiment 2: Window detection accuracy
    window_accuracy = experiment2_df.groupby('model')['correct'].mean().reset_index()
    fig2 = px.bar(
        window_accuracy,
        x='model',
        y='correct',
        title='Experiment 2: Too-Late Window Detection Accuracy',
        labels={'correct': 'Accuracy', 'model': 'Model'},
        color='model'
    )
    fig2.update_layout(yaxis_range=unit_range, showlegend=False)
    fig2.show()

    # Combined architecture comparison: one column per experiment, aligned on
    # the architecture index before it is turned back into a regular column
    per_architecture = {
        'Urgency Prioritization': experiment1_df.groupby('architecture')['correct'].mean(),
        'Window Detection': experiment2_df.groupby('architecture')['correct'].mean(),
    }
    combined_df = pd.DataFrame(per_architecture).reset_index()

    fig3 = go.Figure(data=[
        go.Bar(
            x=combined_df['architecture'],
            y=combined_df[experiment],
            name=experiment
        )
        for experiment in ('Urgency Prioritization', 'Window Detection')
    ])

    fig3.update_layout(
        title='Overall Performance by Architecture Type',
        xaxis_title='Architecture',
        yaxis_title='Accuracy',
        yaxis_range=unit_range,
        barmode='group'
    )
    fig3.show()



In [None]:
"""Run complete experimental pipeline."""

# Title banner
print(
    f"\n{'#'*70}",
    "# TEMPORAL PERCEPTION IN NEURAL ARCHITECTURES",
    "# Empirical Validation Study",
    f"{'#'*70}\n",
    sep="\n",
)

# Run both experiments on every loaded model
print("Starting experiments...")

exp1_df = run_urgency_experiment(models)
exp2_df = run_toolate_experiment(models)

# Statistical analysis of each experiment's per-scenario correctness
exp1_stats = perform_statistical_analysis(exp1_df, "Urgency Prioritization")
exp2_stats = perform_statistical_analysis(exp2_df, "Window Detection")

# Visualizations
visualize_results(exp1_df, exp2_df)

# Save the raw per-scenario results next to the notebook
exp1_df.to_csv('experiment1_urgency_results.csv', index=False)
exp2_df.to_csv('experiment2_window_results.csv', index=False)

# Closing summary
print(
    "\n" + "="*70,
    "EXPERIMENTS COMPLETE",
    "="*70,
    "Results saved to CSV files",
    f"Total scenarios run: {len(exp1_df) + len(exp2_df)}",
    sep="\n",
)

# exp1_df, exp2_df, exp1_stats, exp2_stats

