In [1]:
# -*- coding: utf-8 -*-
"""
Enhanced SAIP Benchmarking System with Information Systems Theory & Econometric Analysis
- Comprehensive multi-model, multi-task analysis
- Information Systems Quality Model (DeLone & McLean)
- Technology Acceptance Model (TAM) integration
- Advanced econometric modeling
- Production efficiency frontier analysis
- Comprehensive result storage and visualization
"""

import os
import io
import time
import json
import math
import random
import warnings
import numpy as np
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
import scipy.stats as stats
from sklearn.model_selection import ParameterGrid
from sklearn.utils import resample
import statsmodels.api as sm
import patsy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.offline as pyo
from datetime import datetime
import xlsxwriter

# Keep notebook output readable: silence the three warning categories that the
# numeric / plotting stack emits most often.
for _ignored_category in (FutureWarning, UserWarning, RuntimeWarning):
    warnings.filterwarnings('ignore', category=_ignored_category)

# Global plotting defaults. The matplotlib style sheet is applied first and the
# seaborn palette second, so the palette is not reset by the style sheet.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

class EnhancedSAIPAnalyzer:
    """
    Enhanced SAIP Analyzer with Information Systems Theory and Econometric Models
    """
    
    def __init__(self):
        self.results_storage = {}
        self.econometric_models = {}
        self.is_quality_metrics = {}
        self.tam_metrics = {}
        self.production_frontier = None
        
    def setup_directories(self):
        """Create directories for saving results"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.output_dir = f"SAIP_Analysis_Results_{timestamp}"
        os.makedirs(self.output_dir, exist_ok=True)
        os.makedirs(f"{self.output_dir}/visualizations", exist_ok=True)
        os.makedirs(f"{self.output_dir}/models", exist_ok=True)
        os.makedirs(f"{self.output_dir}/data", exist_ok=True)
        return self.output_dir

    def load_and_prepare_enhanced_data(self, df_main_raw, df_hf_raw, df_ts_raw):
        """Enhanced data loading with comprehensive preprocessing"""
        print("üîÑ Enhanced Data Loading and Preparation...")
        
        # Time-series data cleaning
        df_ts_clean = df_ts_raw.copy()
        ts_column_mapping = {
            'Model': 'model',
            'Task': 'task', 
            'Sample': 'sample',
            'Tokens Generated': 'tokens_generated',
            'Tokens/sec': 'tokens_per_sec'
        }
        df_ts_clean.rename(columns=ts_column_mapping, inplace=True)

        # Enhanced data type conversion
        numeric_cols = ['latency_ms', 'tokens_generated', 'tokens_per_sec']
        for col in numeric_cols:
            if col in df_ts_clean.columns:
                df_ts_clean[col] = pd.to_numeric(df_ts_clean[col], errors='coerce')

        # Advanced outlier detection using IQR method
        initial_rows = len(df_ts_clean)
        for col in ['latency_ms', 'tokens_generated']:
            if col in df_ts_clean.columns:
                Q1 = df_ts_clean[col].quantile(0.25)
                Q3 = df_ts_clean[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                df_ts_clean = df_ts_clean[
                    (df_ts_clean[col] >= lower_bound) & 
                    (df_ts_clean[col] <= upper_bound)
                ]

        df_ts_clean.dropna(subset=['latency_ms', 'tokens_generated'], inplace=True)
        df_ts_clean = df_ts_clean[df_ts_clean['tokens_generated'] >= 1]
        
        print(f"   ‚úÖ Cleaned time-series data: {len(df_ts_clean)}/{initial_rows} valid rows")

        # Main data processing
        df_main = df_main_raw.copy()
        main_column_mapping = {
            '    Model': 'model',
            'Task': 'task',
            'Latency Avg (ms)': 'latency_avg_ms',
            'Latency P95 (ms)': 'latency_p95_ms',
            'Latency Std (ms)': 'latency_std_ms',
            'throughput_tokens_per_sec': 'throughput_tokens_per_sec',
            'ÌååÎùºÎØ∏ÌÑ∞_b': 'parameters_b',
            'ÏµúÎåÄ ÏãúÌÄÄÏä§ Í∏∏Ïù¥': 'max_context_length',
            'ÏòàÏÉÅ Ï†ÑÎ†• (W)': 'estimated_power_w',
            'CPU Memory Avg (MB)': 'cpu_memory_mb'
        }
        df_main.rename(columns=main_column_mapping, inplace=True)
        
        # Standardize model names
        for df in [df_main, df_ts_clean]:
            if 'model' in df.columns:
                df['model'] = df['model'].str.strip()

        # Create comprehensive analysis dataframe
        df_final_analysis = df_main.copy()
        
        # Convert numeric columns
        numeric_main_cols = ['latency_avg_ms', 'latency_p95_ms', 'latency_std_ms', 
                            'parameters_b', 'estimated_power_w', 'cpu_memory_mb',
                            'throughput_tokens_per_sec']
        
        for col in numeric_main_cols:
            if col in df_final_analysis.columns:
                if col == 'max_context_length':
                    df_final_analysis[col] = pd.to_numeric(
                        df_final_analysis[col].astype(str).str.replace(',', ''),
                        errors='coerce'
                    )
                else:
                    df_final_analysis[col] = pd.to_numeric(df_final_analysis[col], errors='coerce')

        print(f"   ‚úÖ Processed main data: {len(df_final_analysis)} records")
        print(f"   ‚úÖ Available models: {df_final_analysis['model'].nunique()}")
        print(f"   ‚úÖ Available tasks: {df_final_analysis['task'].nunique()}")
        
        return df_ts_clean, df_final_analysis

    def create_advanced_simulator(self, df_ts_clean, df_final_analysis):
        """Advanced simulator with machine learning-based prediction"""
        print("ü§ñ Creating Advanced ML-based Simulator...")
        
        # Build performance prediction models for each task
        self.task_models = {}
        tasks = df_ts_clean['task'].unique()
        
        for task in tasks:
            task_data = df_ts_clean[df_ts_clean['task'] == task].copy()
            if len(task_data) < 10:
                continue
                
            # Feature engineering
            task_data = task_data.merge(
                df_final_analysis[['model', 'task', 'parameters_b', 'estimated_power_w']],
                on=['model', 'task'], 
                how='left'
            )
            
            # Prepare features and targets
            feature_cols = ['tokens_generated', 'parameters_b']
            feature_cols = [col for col in feature_cols if col in task_data.columns]
            
            if len(feature_cols) > 0:
                X = task_data[feature_cols].fillna(task_data[feature_cols].median())
                y = task_data['latency_ms']
                
                # Train Random Forest model
                rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
                rf_model.fit(X, y)
                
                r2 = rf_model.score(X, y)
                self.task_models[task] = {
                    'model': rf_model,
                    'features': feature_cols,
                    'r2': r2,
                    'data': task_data
                }
                print(f"   ‚úÖ {task} model trained (R¬≤ = {r2:.3f})")

        def enhanced_generate(model_name, task, max_new_tokens=50, temperature=0.0, token_callback=None):
            """Enhanced generation simulation with ML prediction"""
            
            # Try ML prediction first
            if task in self.task_models:
                model_info = self.task_models[task]
                
                # Get model features
                model_data = df_final_analysis[
                    (df_final_analysis['model'] == model_name) & 
                    (df_final_analysis['task'] == task)
                ]
                
                if not model_data.empty:
                    features = []
                    for feat in model_info['features']:
                        if feat == 'tokens_generated':
                            features.append(max_new_tokens)
                        elif feat == 'parameters_b':
                            features.append(model_data['parameters_b'].iloc[0])
                        else:
                            features.append(0)  # Default value
                    
                    # Predict latency
                    try:
                        X_pred = np.array(features).reshape(1, -1)
                        predicted_latency = model_info['model'].predict(X_pred)[0]
                        
                        # Add some realistic variance
                        noise_factor = 0.1
                        predicted_latency *= (1 + np.random.normal(0, noise_factor))
                        
                        # Estimate other metrics
                        ttft_est = predicted_latency * 0.15
                        generation_time = predicted_latency - ttft_est
                        tbt_est = generation_time / max(1, max_new_tokens - 1)
                        
                        return {
                            "latency_ms": float(max(0, predicted_latency)),
                            "ttft_ms": float(max(0, ttft_est)),
                            "tbt_mean_ms": float(max(0, tbt_est)),
                            "tbt_p95_ms": float(max(0, tbt_est * 1.2)),
                            "tokens": int(max_new_tokens)
                        }
                    except Exception as e:
                        pass  # Fall back to original method
            
            # Fallback to original sampling method
            model_task_data = df_ts_clean[
                (df_ts_clean['model'] == model_name) & (df_ts_clean['task'] == task)
            ]
            
            if model_task_data.empty:
                model_task_data = df_ts_clean[df_ts_clean['model'] == model_name]
                if model_task_data.empty:
                    return "SIMULATION_ERROR: NO_DATA"

            sample_row = model_task_data.sample(1).iloc[0]
            actual_latency = sample_row['latency_ms']
            actual_tokens = sample_row['tokens_generated']

            # Scale to requested tokens
            scaling_factor = max_new_tokens / max(1, actual_tokens)
            simulated_latency = actual_latency * (scaling_factor ** 0.8)  # Sublinear scaling
            
            ttft_est = simulated_latency * 0.15
            generation_time = simulated_latency - ttft_est
            tbt_est = generation_time / max(1, max_new_tokens - 1)

            return {
                "latency_ms": float(max(0, simulated_latency)),
                "ttft_ms": float(max(0, ttft_est)),
                "tbt_mean_ms": float(max(0, tbt_est)),
                "tbt_p95_ms": float(max(0, tbt_est * 1.2)),
                "tokens": int(max_new_tokens)
            }

        def enhanced_power(model_name):
            """Enhanced power estimation"""
            try:
                model_data = df_final_analysis[df_final_analysis['model'] == model_name]
                if not model_data.empty:
                    base_power = model_data['estimated_power_w'].iloc[0]
                    if pd.notna(base_power):
                        # Add load-dependent variation
                        load_factor = np.random.uniform(0.7, 1.3)
                        return base_power * load_factor
                return np.nan
            except:
                return np.nan
                
        print("   ‚úÖ Enhanced simulator ready")
        return enhanced_generate, enhanced_power

    def information_systems_quality_analysis(self, df_results):
        """
        Information Systems Quality Analysis based on DeLone & McLean Model
        - System Quality: Performance, reliability, response time
        - Information Quality: Accuracy, completeness, consistency
        - Service Quality: Responsiveness, assurance, empathy
        """
        print("üìä Information Systems Quality Analysis (DeLone & McLean Model)...")
        
        is_quality = {}
        
        for model in df_results['model'].unique():
            model_data = df_results[df_results['model'] == model]
            
            if model_data.empty:
                continue
                
            # System Quality Metrics
            latency_mean = model_data['latency_ms'].mean()
            latency_std = model_data['latency_ms'].std()
            reliability = 1 / (1 + model_data['latency_ms'].std() / model_data['latency_ms'].mean())
            availability = len(model_data) / len(model_data)  # Simulated availability
            
            system_quality = {
                'performance_score': 1 / (1 + latency_mean / 1000),  # Normalized performance
                'reliability_score': reliability,
                'availability_score': availability,
                'response_time_score': 1 / (1 + latency_mean / 500)
            }
            
            # Information Quality Metrics
            consistency = 1 - (latency_std / latency_mean) if latency_mean > 0 else 0
            accuracy = 1 - min(model_data.get('error_rate', 0).mean() if 'error_rate' in model_data else 0, 1)
            completeness = len(model_data.dropna()) / len(model_data)
            
            information_quality = {
                'accuracy_score': accuracy,
                'consistency_score': max(0, consistency),
                'completeness_score': completeness,
                'timeliness_score': system_quality['response_time_score']
            }
            
            # Service Quality Metrics
            energy_efficiency = 1 / (1 + model_data.get('energy_Wh', 1).mean())
            cost_effectiveness = 1 / (1 + model_data.get('carbon_g', 1).mean())
            
            service_quality = {
                'responsiveness_score': system_quality['response_time_score'],
                'efficiency_score': energy_efficiency,
                'cost_effectiveness_score': cost_effectiveness,
                'user_satisfaction_score': (system_quality['performance_score'] + 
                                          information_quality['consistency_score']) / 2
            }
            
            # Overall IS Quality Score
            overall_score = (
                np.mean(list(system_quality.values())) * 0.4 +
                np.mean(list(information_quality.values())) * 0.3 +
                np.mean(list(service_quality.values())) * 0.3
            )
            
            is_quality[model] = {
                'system_quality': system_quality,
                'information_quality': information_quality,
                'service_quality': service_quality,
                'overall_is_quality_score': overall_score
            }
        
        self.is_quality_metrics = is_quality
        return is_quality

    def technology_acceptance_model_analysis(self, df_results, df_final_analysis):
        """
        Technology Acceptance Model (TAM) Analysis
        - Perceived Usefulness (PU)
        - Perceived Ease of Use (PEOU)
        - Behavioral Intention to Use (BI)
        """
        print("üéØ Technology Acceptance Model (TAM) Analysis...")
        
        tam_analysis = {}
        
        for model in df_results['model'].unique():
            model_data = df_results[df_results['model'] == model]
            model_info = df_final_analysis[df_final_analysis['model'] == model]
            
            if model_data.empty or model_info.empty:
                continue
            
            # Perceived Usefulness (PU) - based on performance and efficiency
            avg_latency = model_data['latency_ms'].mean()
            throughput = model_info['throughput_tokens_per_sec'].iloc[0] if 'throughput_tokens_per_sec' in model_info else 1
            energy_efficiency = 1 / (1 + model_data.get('energy_Wh', 1).mean())
            
            perceived_usefulness = {
                'performance_utility': 1 / (1 + avg_latency / 1000),
                'efficiency_utility': energy_efficiency,
                'throughput_utility': min(throughput / 50, 1),  # Normalized
                'overall_usefulness': 0
            }
            perceived_usefulness['overall_usefulness'] = np.mean([
                perceived_usefulness['performance_utility'],
                perceived_usefulness['efficiency_utility'],
                perceived_usefulness['throughput_utility']
            ])
            
            # Perceived Ease of Use (PEOU) - based on consistency and simplicity
            latency_cv = model_data['latency_ms'].std() / model_data['latency_ms'].mean()
            complexity = model_info['parameters_b'].iloc[0] if 'parameters_b' in model_info else 1
            memory_usage = model_info.get('cpu_memory_mb', pd.Series([1000])).iloc[0]
            
            perceived_ease_of_use = {
                'consistency_ease': 1 / (1 + latency_cv),
                'complexity_ease': 1 / (1 + complexity / 5),  # Normalized by 5B params
                'resource_ease': 1 / (1 + memory_usage / 10000),  # Normalized by 10GB
                'overall_ease_of_use': 0
            }
            perceived_ease_of_use['overall_ease_of_use'] = np.mean([
                perceived_ease_of_use['consistency_ease'],
                perceived_ease_of_use['complexity_ease'],
                perceived_ease_of_use['resource_ease']
            ])
            
            # Behavioral Intention to Use (BI) - TAM model prediction
            # BI = Œ±‚ÇÅ*PU + Œ±‚ÇÇ*PEOU + Œ±‚ÇÉ*PU*PEOU
            pu = perceived_usefulness['overall_usefulness']
            peou = perceived_ease_of_use['overall_ease_of_use']
            
            behavioral_intention = 0.6 * pu + 0.3 * peou + 0.1 * pu * peou
            
            tam_analysis[model] = {
                'perceived_usefulness': perceived_usefulness,
                'perceived_ease_of_use': perceived_ease_of_use,
                'behavioral_intention': behavioral_intention,
                'tam_score': behavioral_intention
            }
        
        self.tam_metrics = tam_analysis
        return tam_analysis

    def econometric_analysis(self, df_results, df_final_analysis):
        """
        Advanced Econometric Analysis
        - Production Function Estimation
        - Technical Efficiency Analysis
        - Cost Function Analysis
        - Panel Data Analysis
        """
        print("üìà Advanced Econometric Analysis...")
        
        econometric_results = {}
        
        # Prepare panel data - ensure proper data types
        panel_data = df_results.merge(
            df_final_analysis[['model', 'task', 'parameters_b', 'estimated_power_w']],
            on=['model', 'task'],
            how='left'
        )
        
        # Convert numeric columns to proper types
        numeric_columns = ['latency_ms', 'parameters_b', 'estimated_power_w', 'energy_Wh', 'tokens']
        for col in numeric_columns:
            if col in panel_data.columns:
                panel_data[col] = pd.to_numeric(panel_data[col], errors='coerce')
        
        # 1. Production Function Analysis (Cobb-Douglas)
        # Output = A * (Parameters)^Œ± * (Power)^Œ≤
        print("   üìä Estimating Production Function...")
        
        try:
            # Prepare variables for production function
            prod_data = panel_data.dropna(subset=['latency_ms', 'parameters_b', 'estimated_power_w'])
            
            if len(prod_data) > 10:
                # Ensure all data is numeric
                prod_data = prod_data.copy()
                prod_data['log_output'] = np.log(1 / prod_data['latency_ms'].astype(float))  # Higher output = lower latency
                prod_data['log_params'] = np.log(prod_data['parameters_b'].astype(float))
                prod_data['log_power'] = np.log(prod_data['estimated_power_w'].astype(float))
                
                # Estimate production function
                X_prod = prod_data[['log_params', 'log_power']].astype(float)
                X_prod = sm.add_constant(X_prod)
                y_prod = prod_data['log_output'].astype(float)
                
                prod_model = sm.OLS(y_prod, X_prod).fit()
                
                econometric_results['production_function'] = {
                    'model': prod_model,
                    'parameters_elasticity': prod_model.params.get('log_params', 0),
                    'power_elasticity': prod_model.params.get('log_power', 0),
                    'constant': prod_model.params.get('const', 0),
                    'r_squared': prod_model.rsquared,
                    'summary': prod_model.summary()
                }
                
                print(f"   ‚úÖ Production Function R¬≤ = {prod_model.rsquared:.3f}")
        except Exception as e:
            print(f"   ‚ö†Ô∏è Production Function estimation failed: {e}")
        
        # 2. Technical Efficiency Analysis using Data Envelopment Analysis (DEA) approach
        print("   üìä Technical Efficiency Analysis...")
        
        try:
            efficiency_data = panel_data.groupby('model').agg({
                'latency_ms': 'mean',
                'energy_Wh': lambda x: x.dropna().mean() if not x.dropna().empty else np.nan,
                'parameters_b': 'first',
                'tokens': 'mean'
            }).reset_index()
            
            # Ensure numeric types
            for col in ['latency_ms', 'energy_Wh', 'parameters_b', 'tokens']:
                if col in efficiency_data.columns:
                    efficiency_data[col] = pd.to_numeric(efficiency_data[col], errors='coerce')
            
            # Calculate efficiency scores (lower latency and energy = higher efficiency)
            min_latency = efficiency_data['latency_ms'].min()
            min_energy = efficiency_data['energy_Wh'].min()
            
            efficiency_data['latency_efficiency'] = min_latency / efficiency_data['latency_ms']
            efficiency_data['energy_efficiency'] = min_energy / efficiency_data['energy_Wh'].fillna(min_energy)
            efficiency_data['overall_efficiency'] = (efficiency_data['latency_efficiency'] * 0.6 + 
                                                   efficiency_data['energy_efficiency'] * 0.4)
            
            econometric_results['technical_efficiency'] = efficiency_data
            print(f"   ‚úÖ Technical Efficiency calculated for {len(efficiency_data)} models")
            
        except Exception as e:
            print(f"   ‚ö†Ô∏è Technical Efficiency analysis failed: {e}")
        
        # 3. Cost Function Analysis
        print("   üìä Cost Function Analysis...")
        
        try:
            # Cost = f(Output, Input Prices, Technology)
            cost_data = panel_data.dropna(subset=['energy_Wh', 'latency_ms', 'parameters_b'])
            
            if len(cost_data) > 10:
                # Ensure numeric types
                cost_data = cost_data.copy()
                cost_data['total_cost'] = cost_data['energy_Wh'].astype(float) * 0.1 + cost_data['parameters_b'].astype(float) * 0.01  # Simulated cost
                cost_data['log_cost'] = np.log(cost_data['total_cost'] + 1e-6)
                cost_data['log_output'] = np.log(1 / cost_data['latency_ms'].astype(float))
                
                X_cost = sm.add_constant(cost_data[['log_output']].astype(float))
                y_cost = cost_data['log_cost'].astype(float)
                
                cost_model = sm.OLS(y_cost, X_cost).fit()
                
                econometric_results['cost_function'] = {
                    'model': cost_model,
                    'output_elasticity': cost_model.params.get('log_output', 0),
                    'r_squared': cost_model.rsquared,
                    'summary': cost_model.summary()
                }
                
                print(f"   ‚úÖ Cost Function R¬≤ = {cost_model.rsquared:.3f}")
        except Exception as e:
            print(f"   ‚ö†Ô∏è Cost Function estimation failed: {e}")
        
        # 4. Panel Data Fixed Effects Model - FIXED VERSION
        print("   üìä Panel Data Analysis...")
        
        try:
            # Create dummy variables for models (fixed effects)
            panel_reg_data = panel_data.dropna(subset=['latency_ms', 'parameters_b'])
            
            if len(panel_reg_data) > 20:
                # Ensure numeric types for the main variables
                panel_reg_data = panel_reg_data.copy()
                panel_reg_data['latency_ms'] = panel_reg_data['latency_ms'].astype(float)
                panel_reg_data['parameters_b'] = panel_reg_data['parameters_b'].astype(float)
                
                # Create model dummies and ensure they are numeric
                model_dummies = pd.get_dummies(panel_reg_data['model'], prefix='model').astype(float)
                task_dummies = pd.get_dummies(panel_reg_data['task'], prefix='task').astype(float)
                
                # Combine all features and ensure they are numeric
                X_panel = pd.concat([
                    panel_reg_data[['parameters_b']],
                    model_dummies.iloc[:, :-1],  # Drop one dummy to avoid multicollinearity
                    task_dummies.iloc[:, :-1]
                ], axis=1)
                
                # Ensure all columns are numeric
                for col in X_panel.columns:
                    X_panel[col] = pd.to_numeric(X_panel[col], errors='coerce')
                
                X_panel = sm.add_constant(X_panel)
                y_panel = panel_reg_data['latency_ms'].astype(float)
                
                # Drop any remaining NaN values after conversion
                valid_mask = ~(X_panel.isna().any(axis=1) | y_panel.isna())
                X_panel_clean = X_panel[valid_mask]
                y_panel_clean = y_panel[valid_mask]
                
                if len(X_panel_clean) > 10:  # Ensure we have enough data
                    panel_model = sm.OLS(y_panel_clean, X_panel_clean).fit()
                    
                    econometric_results['panel_data_model'] = {
                        'model': panel_model,
                        'r_squared': panel_model.rsquared,
                        'summary': panel_model.summary()
                    }
                    
                    print(f"   ‚úÖ Panel Data Model R¬≤ = {panel_model.rsquared:.3f}")
                else:
                    print("   ‚ö†Ô∏è Not enough valid data for panel data analysis after cleaning")
            else:
                print("   ‚ö†Ô∏è Not enough data for panel data analysis")
                
        except Exception as e:
            print(f"   ‚ö†Ô∏è Panel Data analysis failed: {e}")
        
        self.econometric_models = econometric_results
        return econometric_results

    def comprehensive_simulation(self, generate_fn, power_fn, df_final_analysis, df_ts_clean,
                                 steps=100, saip_targets=None):
        """Run baseline and SAIP simulations for every model/task pair.

        Every (model, task) combination from ``df_final_analysis`` that has at
        least one row in ``df_ts_clean`` is simulated twice via
        ``simulate_saip_enhanced``: once without the policy ("baseline") and
        once with it ("saip"). A failure inside one combination is reported
        and does not stop the remaining ones.

        Args:
            generate_fn: Generation simulator passed to
                ``simulate_saip_enhanced``.
            power_fn: Power estimator, passed on as ``energy_fn``.
            df_final_analysis: Summary table providing the ``model`` and
                ``task`` values to iterate over.
            df_ts_clean: Per-sample table used to decide whether a combination
                has data.
            steps: Simulation steps per run (default 100).
            saip_targets: Targets for the SAIP run. Defaults to
                ``{"max_p95_ms": 2000, "max_cv": 0.3, "max_power_W": 15.0}``.

        Returns:
            tuple: ``(df_all_results, df_all_kpis)``. The first frame
            concatenates all non-empty runs, tagged with ``policy_type`` and
            ``model_task_combination``. The second holds one row of KPIs
            (from ``self.aggregate_enhanced_kpis``, defined outside the part of
            the file shown here) per model/task/policy, and only for
            combinations where both runs produced data. Either frame is empty
            when nothing was produced.
        """
        print("üöÄ Running Comprehensive Multi-Model, Multi-Task Simulation...")

        if saip_targets is None:
            saip_targets = {"max_p95_ms": 2000, "max_cv": 0.3, "max_power_W": 15.0}

        all_results = []
        all_kpis = []

        models = df_final_analysis['model'].unique()
        tasks = df_final_analysis['task'].unique()

        print(f"   üìã Analyzing {len(models)} models across {len(tasks)} tasks")

        total_combinations = len(models) * len(tasks)

        # The context manager closes the bar even if an interrupt escapes.
        with tqdm(total=total_combinations, desc="Simulating Combinations") as progress_bar:
            for model in models:
                for task in tasks:
                    has_data = not df_ts_clean[
                        (df_ts_clean['model'] == model) & (df_ts_clean['task'] == task)
                    ].empty

                    if has_data:
                        try:
                            # Both runs are finished before anything is stored,
                            # so a failing SAIP run discards its baseline too.
                            runs = [
                                ('baseline', self.simulate_saip_enhanced(
                                    generate_fn, model, task, steps=steps,
                                    energy_fn=power_fn, apply_policy=False
                                )),
                                ('saip', self.simulate_saip_enhanced(
                                    generate_fn, model, task, steps=steps,
                                    energy_fn=power_fn, apply_policy=True,
                                    targets=saip_targets
                                )),
                            ]

                            for policy_type, df in runs:
                                if not df.empty:
                                    df['policy_type'] = policy_type
                                    df['model_task_combination'] = f"{model}_{task}"
                                    all_results.append(df)

                            # KPIs are only comparable when both runs have data.
                            if all(not df.empty for _, df in runs):
                                for policy_type, df in runs:
                                    all_kpis.append(
                                        self.aggregate_enhanced_kpis(df, model, task, policy_type)
                                    )

                        except Exception as e:
                            print(f"   ‚ö†Ô∏è Error in {model}-{task}: {e}")

                    progress_bar.update(1)

        df_all_results = pd.concat(all_results, ignore_index=True) if all_results else pd.DataFrame()
        df_all_kpis = pd.DataFrame(all_kpis) if all_kpis else pd.DataFrame()

        print(f"   ‚úÖ Completed simulation: {len(df_all_results)} total observations")
        print(f"   ‚úÖ Generated KPIs for {len(df_all_kpis)} model-task-policy combinations")

        return df_all_results, df_all_kpis

    def simulate_saip_enhanced(self, generate_fn, model, task, steps=100, init_max_new_tokens=50,
                             init_temp=0.2, targets=None, energy_fn=None, apply_policy=True):
        """Simulate a sequence of generation requests, optionally steered by the SAIP policy.

        Every step calls ``generate_fn`` with the current ``max_new_tokens`` and
        ``temperature``, computes rolling latency statistics over the most
        recent successful steps and, when ``apply_policy`` is true, asks
        ``enhanced_saip_policy`` for the settings to use on the next step.

        Args:
            generate_fn: Callable invoked as
                ``generate_fn(model, task, max_new_tokens=..., temperature=...)``.
                It must return a mapping containing ``"latency_ms"``; a falsy
                or string result is treated as "no measurement" and skipped.
            model: Model identifier, forwarded to ``generate_fn`` and
                ``energy_fn`` and recorded in every row.
            task: Task identifier, forwarded to ``generate_fn`` and recorded.
            steps: Number of iterations to attempt.
            init_max_new_tokens: ``max_new_tokens`` used for the first step.
            init_temp: ``temperature`` used for the first step.
            targets: Threshold mapping handed to the policy. Defaults to
                ``{"max_p95_ms": 2000, "max_cv": 0.4, "max_power_W": 10.0}``.
            energy_fn: Optional callable ``energy_fn(model)`` whose result is
                used as the power draw (named ``power_W``). Without it power,
                energy and carbon are recorded as NaN.
            apply_policy: When false the settings stay at their initial
                values and no throttling sleep is performed.

        Returns:
            pandas.DataFrame with one row per successful step (empty when no
            step succeeded). Each row merges the generator output, the rolling
            state and the chosen actions. Because the actions are merged last,
            the recorded ``max_new_tokens`` / ``temperature`` are the values
            selected for the *next* step.

        Notes:
            A step that raises is skipped (best effort), but the number of
            failed steps and the last error are reported once at the end
            instead of being dropped silently.
        """
        if targets is None:
            targets = {"max_p95_ms": 2000, "max_cv": 0.4, "max_power_W": 10.0}

        carbon_factor = 0.45  # kg CO2 per kWh
        window_size = 30      # most recent latencies used for CV / P95

        history = []
        lat_hist = []
        max_nt = init_max_new_tokens
        temp = init_temp
        failed_steps = 0
        last_error = None

        for i in range(steps):
            try:
                # Run generation
                m = generate_fn(model, task, max_new_tokens=max_nt, temperature=temp)

                # Falsy or string results carry no measurement
                if not m or isinstance(m, str):
                    continue

                lat_hist.append(m["latency_ms"])
                power_W = energy_fn(model) if energy_fn else np.nan

                # Rolling statistics over the latest window of latencies
                lat_window = lat_hist[-window_size:]
                mean_latency = np.mean(lat_window)
                cv = np.std(lat_window) / mean_latency if mean_latency > 0 else 0
                # With fewer than 5 samples the current latency stands in for P95
                p95 = np.percentile(lat_window, 95) if len(lat_window) >= 5 else m["latency_ms"]

                state = {
                    "cv": cv, "p95_ms": p95, "power_W": power_W,
                    "max_new_tokens": max_nt, "temperature": temp
                }

                if apply_policy:
                    actions = self.enhanced_saip_policy(state, targets)
                    max_nt = actions["max_new_tokens"]
                    temp = actions["temperature"]
                    if actions["throttle_ms"] > 0:
                        time.sleep(actions["throttle_ms"] / 1000.0)
                else:
                    actions = {"max_new_tokens": max_nt, "throttle_ms": 0, "temperature": temp}

                # Energy (Wh) from power and request duration, then carbon in grams
                duration_s = m["latency_ms"] / 1000.0
                energy_Wh = power_W * (duration_s / 3600.0) if pd.notna(power_W) else np.nan
                carbon_g = (energy_Wh / 1000.0) * carbon_factor * 1e3 if pd.notna(energy_Wh) else np.nan

                history.append({
                    **m, **state, **actions,
                    "step": i, "model": model, "task": task,
                    "power_W_mean": power_W, "energy_Wh": energy_Wh, "carbon_g": carbon_g
                })

            except Exception as e:
                # Best effort: skip the step, but remember that it failed
                failed_steps += 1
                last_error = e
                continue

        if failed_steps:
            print(f"   Warning: {failed_steps}/{steps} simulation steps failed for "
                  f"{model}-{task} (last error: {last_error!r})")

        return pd.DataFrame(history)

    def enhanced_saip_policy(self, state, targets):
        """Pick the next generation settings from the current rolling state.

        Three signals are compared with their limits: ``p95_ms`` against
        ``max_p95_ms`` (default 2000), ``cv`` against ``max_cv`` (default 0.4)
        and ``power_W`` against ``max_power_W`` (default 10.0). A signal that
        is absent from ``state`` counts as 0.

        The response is graduated by the number of breached limits:
          * two or more: cut 12 tokens (floor 16), throttle 30 ms,
            lower the temperature by 0.15;
          * exactly one: cut 6 tokens (floor 20), throttle 15 ms,
            lower the temperature by 0.1;
          * none: add 2 tokens (ceiling 60), raise the temperature by 0.05
            (ceiling 0.5), no throttling.

        Returns:
            dict with the keys ``max_new_tokens``, ``throttle_ms`` and
            ``temperature``.
        """
        tokens = state.get("max_new_tokens", 50)
        temperature = state.get("temperature", 0.0)

        # (state key, targets key, fallback limit) for each monitored signal
        monitored = (
            ("p95_ms", "max_p95_ms", 2000),
            ("cv", "max_cv", 0.4),
            ("power_W", "max_power_W", 10.0),
        )
        breached = 0
        for signal, limit_key, fallback in monitored:
            if state.get(signal, 0) > targets.get(limit_key, fallback):
                breached += 1

        if breached == 0:
            # Everything within limits: recover gradually
            return {
                "max_new_tokens": min(60, tokens + 2),
                "throttle_ms": 0,
                "temperature": min(0.5, temperature + 0.05),
            }

        # (token floor, token cut, throttle cap, throttle step, temperature cut)
        if breached >= 2:
            floor, cut, cap, pause, cooling = 16, 12, 100, 30, 0.15
        else:
            floor, cut, cap, pause, cooling = 20, 6, 50, 15, 0.1

        return {
            "max_new_tokens": max(floor, tokens - cut),
            "throttle_ms": min(cap, 0 + pause),
            "temperature": max(0.0, temperature - cooling),
        }

    def aggregate_enhanced_kpis(self, df_runs, model_name, task_name, policy_type):
        """Enhanced KPI aggregation"""
        
        kpis = {
            "model": model_name,
            "task": task_name,
            "policy_type": policy_type
        }
        
        # Core performance metrics
        for metric in ["latency_ms", "ttft_ms", "tbt_mean_ms", "energy_Wh", "carbon_g"]:
            if metric in df_runs.columns:
                data = df_runs[metric].dropna()
                if not data.empty:
                    kpis[f"{metric}_mean"] = float(data.mean())
                    kpis[f"{metric}_std"] = float(data.std())
                    kpis[f"{metric}_p50"] = float(data.median())
                    kpis[f"{metric}_p95"] = float(np.percentile(data, 95))
                    kpis[f"{metric}_cv"] = float(data.std() / data.mean()) if data.mean() > 0 else np.nan
        
        # Policy effectiveness metrics
        if policy_type == "saip":
            kpis["policy_effectiveness"] = self.calculate_policy_effectiveness(df_runs)
        
        # Stability metrics
        kpis["stability_score"] = self.calculate_stability_score(df_runs)
        
        return kpis

    def calculate_policy_effectiveness(self, df_runs):
        """Score how well latency stayed controlled during a run.

        The score is ``1 / (1 + cv) * 1 / (1 + p95 / 1000)`` where ``cv`` is
        the coefficient of variation and ``p95`` the 95th percentile of the
        ``latency_ms`` column, so lower variability and lower tail latency
        both raise it.

        Missing latency values are dropped before any statistic is computed.
        ``np.percentile`` does not skip NaN, so a single missing value would
        otherwise turn the whole score into NaN while ``std``/``mean``
        silently ignore it.

        Args:
            df_runs: DataFrame with a ``latency_ms`` column.

        Returns:
            float score, or NaN when the column is missing, holds no usable
            value, or the computation fails.
        """
        try:
            latency = df_runs['latency_ms'].dropna()
            if latency.empty:
                return np.nan

            cv = latency.std() / latency.mean()
            p95 = np.percentile(latency, 95)

            # Effectiveness is higher when CV is lower and P95 is controlled
            effectiveness = 1 / (1 + cv) * (1 / (1 + p95 / 1000))
            return float(effectiveness)
        except Exception:
            # Best effort: an unusable frame yields "no score", but
            # KeyboardInterrupt / SystemExit are no longer swallowed
            return np.nan

    def calculate_stability_score(self, df_runs):
        """Combine latency and energy variability into one stability score.

        The score is ``1 / (1 + latency_cv + energy_cv)``, where each term is
        the coefficient of variation (std / mean) of ``latency_ms`` and
        ``energy_Wh``; lower variability gives a score closer to 1.

        Energy is optional. A missing ``energy_Wh`` column contributes 0, and
        so does a column whose CV cannot be computed (for example when every
        value is NaN). Previously that second case turned the whole score
        into NaN even though the latency data was fine.

        Args:
            df_runs: DataFrame with a ``latency_ms`` column and optionally an
                ``energy_Wh`` column.

        Returns:
            float score, or NaN when the latency CV is undefined or the
            computation fails.
        """
        try:
            latency_cv = df_runs['latency_ms'].std() / df_runs['latency_ms'].mean()

            energy_cv = 0
            if 'energy_Wh' in df_runs:
                candidate = df_runs['energy_Wh'].std() / df_runs['energy_Wh'].mean()
                # Treat an undefined energy CV like a missing energy column
                if pd.notna(candidate):
                    energy_cv = candidate

            # Lower CV means higher stability
            stability = 1 / (1 + latency_cv + energy_cv)
            return float(stability)
        except Exception:
            # Best effort: an unusable frame yields "no score", but
            # KeyboardInterrupt / SystemExit are no longer swallowed
            return np.nan

    def create_comprehensive_visualizations(self, df_results, df_kpis, output_dir):
        """Render the analysis charts as PNG files under ``<output_dir>/visualizations``.

        Up to five charts are written, each only when its input is available:
          1. ``performance_heatmap.png``   - baseline P95 latency by model and task
          2. ``policy_effectiveness.png``  - P95 improvement of SAIP over baseline
          3. ``is_quality_dashboard.png``  - scores from ``self.is_quality_metrics``
          4. ``tam_analysis.png``          - scatter from ``self.tam_metrics``
          5. ``technical_efficiency.png``  - ``self.econometric_models['technical_efficiency']``

        Every chart is best effort: a failure is printed and the remaining
        charts are still attempted. Each figure is closed even when plotting
        or saving raises, so a failed chart no longer leaves an open figure
        behind.

        Args:
            df_results: Raw simulation results (not used by any chart here).
            df_kpis: KPI table with ``model``, ``task``, ``policy_type`` and
                ``latency_ms_p95`` columns.
            output_dir: Root result directory; the ``visualizations``
                sub-directory is expected to exist already.
        """
        print("üìä Creating Comprehensive Visualizations...")

        # Set up plotting parameters
        plt.rcParams['figure.figsize'] = (12, 8)
        plt.rcParams['font.size'] = 10

        viz_dir = f"{output_dir}/visualizations"

        # 1. Performance Heatmap by Model and Task
        if not df_kpis.empty:
            try:
                self._plot_performance_heatmap(df_kpis, viz_dir)
                print("   ‚úÖ Performance heatmap saved")
            except Exception as e:
                print(f"   ‚ö†Ô∏è Performance heatmap failed: {e}")

        # 2. Policy Effectiveness Comparison
        if not df_kpis.empty and 'policy_type' in df_kpis.columns:
            try:
                if self._plot_policy_effectiveness(df_kpis, viz_dir):
                    print("   ‚úÖ Policy effectiveness chart saved")
            except Exception as e:
                print(f"   ‚ö†Ô∏è Policy effectiveness chart failed: {e}")

        # 3. Information Systems Quality Dashboard
        if getattr(self, 'is_quality_metrics', None):
            try:
                self._plot_is_quality_dashboard(viz_dir)
                print("   ‚úÖ IS Quality dashboard saved")
            except Exception as e:
                print(f"   ‚ö†Ô∏è IS Quality dashboard failed: {e}")

        # 4. Technology Acceptance Model Visualization
        if getattr(self, 'tam_metrics', None):
            try:
                self._plot_tam_analysis(viz_dir)
                print("   ‚úÖ TAM analysis chart saved")
            except Exception as e:
                print(f"   ‚ö†Ô∏è TAM analysis chart failed: {e}")

        # 5. Econometric Results Visualization
        if getattr(self, 'econometric_models', None):
            try:
                if 'technical_efficiency' in self.econometric_models:
                    self._plot_technical_efficiency(viz_dir)
                    print("   ‚úÖ Technical efficiency chart saved")
            except Exception as e:
                print(f"   ‚ö†Ô∏è Technical efficiency chart failed: {e}")

    def _draw_bar_panels(self, frame, axes, panels, ylabel):
        """Draw one per-model bar chart for each (column, title, color) panel onto ``axes``."""
        by_model = frame.set_index('model')
        for ax, (column, title, color) in zip(axes, panels):
            by_model[column].plot(kind='bar', ax=ax, color=color)
            ax.set_title(title)
            ax.set_ylabel(ylabel)
            ax.tick_params(axis='x', rotation=45)

    def _plot_performance_heatmap(self, df_kpis, viz_dir):
        """Save the baseline P95-latency heatmap (models as rows, tasks as columns)."""
        pivot_data = df_kpis[df_kpis['policy_type'] == 'baseline'].pivot_table(
            values='latency_ms_p95',
            index='model',
            columns='task',
            aggfunc='mean'
        )

        fig = plt.figure(figsize=(14, 10))
        try:
            sns.heatmap(pivot_data, annot=True, fmt='.0f', cmap='RdYlBu_r', cbar_kws={'label': 'P95 Latency (ms)'})
            plt.title('Model Performance Heatmap: P95 Latency by Model and Task', fontsize=16, pad=20)
            plt.xlabel('Task', fontsize=12)
            plt.ylabel('Model', fontsize=12)
            plt.xticks(rotation=45)
            plt.yticks(rotation=0)
            plt.tight_layout()
            plt.savefig(f"{viz_dir}/performance_heatmap.png", dpi=300, bbox_inches='tight')
        finally:
            plt.close(fig)

    def _plot_policy_effectiveness(self, df_kpis, viz_dir):
        """Save the SAIP-vs-baseline improvement chart; return False when either policy is absent."""
        policy_comparison = df_kpis.groupby(['model', 'policy_type'])['latency_ms_p95'].mean().reset_index()
        policy_pivot = policy_comparison.pivot(index='model', columns='policy_type', values='latency_ms_p95')

        if 'baseline' not in policy_pivot.columns or 'saip' not in policy_pivot.columns:
            return False

        # Positive values mean SAIP lowered the P95 latency relative to baseline
        policy_pivot['improvement'] = (policy_pivot['baseline'] - policy_pivot['saip']) / policy_pivot['baseline'] * 100

        fig = plt.figure(figsize=(14, 8))
        try:
            policy_pivot['improvement'].plot(kind='bar', color='steelblue', alpha=0.8)
            plt.title('SAIP Policy Effectiveness: P95 Latency Improvement by Model', fontsize=16, pad=20)
            plt.xlabel('Model', fontsize=12)
            plt.ylabel('Improvement (%)', fontsize=12)
            plt.xticks(rotation=45, ha='right')
            plt.axhline(y=0, color='red', linestyle='--', alpha=0.7)
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            plt.savefig(f"{viz_dir}/policy_effectiveness.png", dpi=300, bbox_inches='tight')
        finally:
            plt.close(fig)
        return True

    def _plot_is_quality_dashboard(self, viz_dir):
        """Save the 2x2 dashboard of per-model IS quality scores."""
        is_scores = []
        for model, metrics in self.is_quality_metrics.items():
            is_scores.append({
                'model': model,
                'system_quality': np.mean(list(metrics['system_quality'].values())),
                'information_quality': np.mean(list(metrics['information_quality'].values())),
                'service_quality': np.mean(list(metrics['service_quality'].values())),
                'overall_score': metrics['overall_is_quality_score']
            })

        df_is = pd.DataFrame(is_scores)

        # Panels in row-major order: top-left, top-right, bottom-left, bottom-right
        panels = [
            ('system_quality', 'System Quality Scores', 'lightblue'),
            ('information_quality', 'Information Quality Scores', 'lightgreen'),
            ('service_quality', 'Service Quality Scores', 'lightcoral'),
            ('overall_score', 'Overall IS Quality Scores', 'gold'),
        ]

        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        try:
            self._draw_bar_panels(df_is, axes.flat, panels, 'Score')
            plt.suptitle('Information Systems Quality Analysis Dashboard', fontsize=16)
            plt.tight_layout()
            plt.savefig(f"{viz_dir}/is_quality_dashboard.png", dpi=300, bbox_inches='tight')
        finally:
            plt.close(fig)

    def _plot_tam_analysis(self, viz_dir):
        """Save the TAM scatter plot (usefulness vs. ease of use, coloured by intention)."""
        tam_scores = []
        for model, metrics in self.tam_metrics.items():
            tam_scores.append({
                'model': model,
                'perceived_usefulness': metrics['perceived_usefulness']['overall_usefulness'],
                'perceived_ease_of_use': metrics['perceived_ease_of_use']['overall_ease_of_use'],
                'behavioral_intention': metrics['behavioral_intention'],
                'tam_score': metrics['tam_score']
            })

        df_tam = pd.DataFrame(tam_scores)

        fig = plt.figure(figsize=(12, 8))
        try:
            scatter = plt.scatter(df_tam['perceived_usefulness'], df_tam['perceived_ease_of_use'],
                                  c=df_tam['behavioral_intention'], s=100, alpha=0.7, cmap='viridis')

            # Label every point with its model name
            for i, model in enumerate(df_tam['model']):
                plt.annotate(model, (df_tam['perceived_usefulness'].iloc[i], df_tam['perceived_ease_of_use'].iloc[i]),
                             xytext=(5, 5), textcoords='offset points', fontsize=8)

            plt.colorbar(scatter, label='Behavioral Intention to Use')
            plt.xlabel('Perceived Usefulness', fontsize=12)
            plt.ylabel('Perceived Ease of Use', fontsize=12)
            plt.title('Technology Acceptance Model (TAM) Analysis', fontsize=16, pad=20)
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            plt.savefig(f"{viz_dir}/tam_analysis.png", dpi=300, bbox_inches='tight')
        finally:
            plt.close(fig)

    def _plot_technical_efficiency(self, viz_dir):
        """Save the three-panel technical efficiency chart."""
        eff_data = self.econometric_models['technical_efficiency']

        panels = [
            ('latency_efficiency', 'Latency Efficiency by Model', 'skyblue'),
            ('energy_efficiency', 'Energy Efficiency by Model', 'lightgreen'),
            ('overall_efficiency', 'Overall Technical Efficiency', 'orange'),
        ]

        fig, axes = plt.subplots(1, 3, figsize=(18, 6))
        try:
            self._draw_bar_panels(eff_data, axes, panels, 'Efficiency Score')
            plt.suptitle('Technical Efficiency Analysis', fontsize=16)
            plt.tight_layout()
            plt.savefig(f"{viz_dir}/technical_efficiency.png", dpi=300, bbox_inches='tight')
        finally:
            plt.close(fig)

    def save_comprehensive_results(self, df_results, df_kpis, output_dir):
        """Persist results as Excel, CSV, JSON and text summaries.

        Files written:
          * ``<output_dir>/data/comprehensive_results.xlsx`` with the sheets
            ``Raw_Results`` and ``KPI_Summary`` plus, when available,
            ``IS_Quality``, ``TAM_Analysis`` and ``Technical_Efficiency``;
          * ``<output_dir>/data/raw_results.csv`` and ``kpi_summary.csv``;
          * ``<output_dir>/data/comprehensive_analysis.json``;
          * ``<output_dir>/models/<name>_summary.txt`` for every entry of
            ``self.econometric_models`` that exposes ``['model'].summary()``.

        The JSON payload is made strictly valid before writing: NaN and
        infinite floats become ``null`` (bare ``NaN`` tokens are rejected by
        browser JSON parsers) and NumPy scalars/arrays become native Python
        values instead of being stringified by the ``default=str`` fallback.

        Args:
            df_results: Raw simulation results.
            df_kpis: KPI summary table.
            output_dir: Root result directory; ``data`` and ``models``
                sub-directories are created when missing.
        """
        print("üíæ Saving Comprehensive Results...")

        data_dir = f"{output_dir}/data"
        os.makedirs(data_dir, exist_ok=True)

        # 1. Save to Excel with multiple sheets
        excel_path = f"{data_dir}/comprehensive_results.xlsx"
        with pd.ExcelWriter(excel_path, engine='xlsxwriter') as writer:

            # Main results
            df_results.to_excel(writer, sheet_name='Raw_Results', index=False)
            df_kpis.to_excel(writer, sheet_name='KPI_Summary', index=False)

            # IS Quality metrics: one flat row per model
            if hasattr(self, 'is_quality_metrics') and self.is_quality_metrics:
                is_data = []
                for model, metrics in self.is_quality_metrics.items():
                    row = {'model': model}
                    row.update(metrics['system_quality'])
                    row.update(metrics['information_quality'])
                    row.update(metrics['service_quality'])
                    row['overall_is_quality_score'] = metrics['overall_is_quality_score']
                    is_data.append(row)
                pd.DataFrame(is_data).to_excel(writer, sheet_name='IS_Quality', index=False)

            # TAM metrics: one flat row per model
            if hasattr(self, 'tam_metrics') and self.tam_metrics:
                tam_data = []
                for model, metrics in self.tam_metrics.items():
                    row = {'model': model}
                    row.update(metrics['perceived_usefulness'])
                    row.update(metrics['perceived_ease_of_use'])
                    row['behavioral_intention'] = metrics['behavioral_intention']
                    row['tam_score'] = metrics['tam_score']
                    tam_data.append(row)
                pd.DataFrame(tam_data).to_excel(writer, sheet_name='TAM_Analysis', index=False)

            # Econometric results
            if hasattr(self, 'econometric_models') and self.econometric_models:
                if 'technical_efficiency' in self.econometric_models:
                    self.econometric_models['technical_efficiency'].to_excel(
                        writer, sheet_name='Technical_Efficiency', index=False
                    )

        print(f"   ‚úÖ Excel results saved: {excel_path}")

        # 2. Save to CSV files
        df_results.to_csv(f"{data_dir}/raw_results.csv", index=False)
        df_kpis.to_csv(f"{data_dir}/kpi_summary.csv", index=False)
        print("   ‚úÖ CSV files saved")

        # 3. Save to JSON for web applications
        results_json = {
            'summary': {
                'total_observations': len(df_results),
                'models_analyzed': df_results['model'].nunique() if 'model' in df_results else 0,
                'tasks_analyzed': df_results['task'].nunique() if 'task' in df_results else 0,
                'analysis_timestamp': datetime.now().isoformat()
            },
            'kpis': df_kpis.to_dict('records') if not df_kpis.empty else [],
            'is_quality': self.is_quality_metrics if hasattr(self, 'is_quality_metrics') else {},
            'tam_analysis': self.tam_metrics if hasattr(self, 'tam_metrics') else {}
        }

        with open(f"{data_dir}/comprehensive_analysis.json", 'w') as f:
            # default=str stays as the fallback for anything still not serialisable
            json.dump(self._json_safe(results_json), f, indent=2, default=str)
        print("   ‚úÖ JSON results saved")

        # 4. Save econometric model summaries
        if hasattr(self, 'econometric_models') and self.econometric_models:
            models_dir = f"{output_dir}/models"
            os.makedirs(models_dir, exist_ok=True)

            for model_name, model_data in self.econometric_models.items():
                if 'model' in model_data and hasattr(model_data['model'], 'summary'):
                    with open(f"{models_dir}/{model_name}_summary.txt", 'w') as f:
                        f.write(str(model_data['model'].summary()))

            print("   ‚úÖ Econometric model summaries saved")

    def _json_safe(self, value):
        """Return ``value`` with NaN/inf replaced by None and NumPy values made native, recursively."""
        if isinstance(value, dict):
            return {key: self._json_safe(item) for key, item in value.items()}
        if isinstance(value, (list, tuple)):
            return [self._json_safe(item) for item in value]
        if isinstance(value, np.ndarray):
            return self._json_safe(value.tolist())
        if isinstance(value, np.generic):
            value = value.item()
        if isinstance(value, float) and not math.isfinite(value):
            return None
        return value

def main(ts_path=None):
    """Run the complete SAIP benchmarking pipeline.

    Steps: create the output directories, load the embedded benchmark table
    and the time-series workbook, prepare the data, build the simulator, run
    the simulation, run the IS-quality / TAM / econometric analyses, write
    the charts and result files, and print a final summary.

    Args:
        ts_path: Path of the time-series Excel workbook. When omitted, the
            original hard-coded location below the user's ``Desktop`` folder
            is used.

    Returns:
        None. The function returns early (after printing a message) when the
        workbook cannot be read, data preparation fails, or the simulation
        yields no rows.
    """

    print("="*80)
    print("üöÄ Enhanced SAIP Benchmarking System")
    print("   Information Systems Theory + Econometric Analysis")
    print("="*80)

    # Initialize analyzer
    analyzer = EnhancedSAIPAnalyzer()
    output_dir = analyzer.setup_directories()

    # Load data (using the same in-memory data as before)
    print("\nüìÇ Loading Data...")

    # Data for df_main_raw (same as before)
    main_data_str = """
    Model	Task	Precision	Recall	f1_score	Latency Avg (ms)	Latency Std (ms)	Latency Min (ms)	Latency Max (ms)	Latency P10 (ms)	Latency P25 (ms)	Latency P50 (ms)	Latency P75 (ms)	Latency P90 (ms)	Latency P95 (ms)	Latency P99 (ms)	throughput_tokens_per_sec	Token Efficiency (tokens/sec)	CPU Memory Avg (MB)	ÌååÎùºÎØ∏ÌÑ∞_b	ÏµúÎåÄ ÏãúÌÄÄÏä§ Í∏∏Ïù¥	Î≥ëÎ†¨ ÌÅ¨Í∏∞	Ïª¥ÌååÏùº ÏÜåÏöî	Serialize(Ï¥à)	num_blocks	Best Batch	KV per token (MB)	ÏòàÏÉÅ Ïª®ÌÖçÏä§Ìä∏ 4K Ïãú KV (GB)	peak_npu_mem_Ìú¥	ÏòàÏÉÅ Ï†ÑÎ†• (W)	Tracing(Ï¥à)	Model Conversion(Ï¥à)	ÏµúÏ¢Ö Serialize(Ï¥à)	2K ÌÜ†ÌÅ∞(GB)	4K ÌÜ†ÌÅ∞(GB)	8K ÌÜ†ÌÅ∞(GB)	16K ÌÜ†ÌÅ∞(GB)
    DeepSeek-R1-Distill-Qwen-1.5B	NLU	0.57	1	0.7261	639	492.75	146.92	1431.13	182.27	183.34	564.66	1060.25	1413.59	1416.72	1426.1	35.43	34.73	8226.95	1.5	131072	1	1:44	3	9	1	0.35	1.4	6	13.5	2	36	2	5	6	8.5	13.5
    EXAONE-3.5-2.4B-Instruct	NLU	0.57	1	0.7261	75.59	4.85	72.18	102.25	73.88	74.47	74.66	74.92	75.31	77.23	98.58	26.46	26.54	9691.05	2.4	32768	1	2:05	3	1	1	0.42	1.7	7	16.5	2	46	1	6.5	7.2	10.5	17
    gemma-2b-it	NLU	0.57	1	0.7261	67.61	55.58	61.11	619.68	61.17	61.26	61.61	61.88	63.07	63.26	98.78	32.99	32.38	14093.04	2	8192	1	1:25	4	1	1.5	0.4	1.6	6.5	14.5	1.5	24	4	5.8	6.5	9.5	15.5
    Llama-3.2-3B-Instruct	NLU	0.57	1	0.7261	1400.24	1030.52	90.42	2481.65	106.11	107	2197.27	2219.14	2231.39	2245.64	2397.88	22.34	21.01	13352	3.2	32768	1	1:46	5	9	1	0.002	0.008	16	120	2	36	5	7.6	8.5	13	22
    Midm-2.0-Mini-Instruct	NLU	0.57	1	0.7261	115.62	6.57	103.8	164.88	109.02	114.98	115.5	116.59	117.21	123.73	134.91	17.3	17.34	9127.88	2	32768	1	2:43	3	1	1	0.46	1.8	7.5	17.5	4	63	1	5.8	6.5	9.5	15.5
    opt-2.7b	NLU	0.57	1	0.7261	1314.69	73.94	1259.12	1553.53	1264.56	1279.11	1281.13	1313.08	1443.6	1530.26	1543.72	38.03	38.14	8066	2.7	2048	1	1:34	3	1	1	0.45	1.8	7.2	15.5	4	22	2	6.5	7.2	10.5	17
    Qwen2.5-0.5B-Instruct	NLU	0.57	1	0.7261	550.55	81.81	273.27	885.9	428.81	563.91	581.57	582.53	584.12	588.68	649.71	84.88	85.05	4139.33	0.5	32768	1	0:39	1	1	3	0.18	0.7	3	9	2	32	0.5	3	3.7	5	7.6
    Qwen2.5-1.5B-Instruct	NLU	0.57	1	0.7261	244.01	440.3	54.32	1381.94	66.77	66.81	66.86	66.94	1337.52	1379.31	1381.42	34.96	30.85	7607.46	1.5	32768	1	1:12	2	1	1.5	0.32	1.3	5.8	13	2	38	1.5	5	6	8.5	13.5
    Qwen2.5-3B-Instruct	NLU	0.57	1	0.7261	1130.91	1029.74	95.37	2561.1	108.67	115.8	998.33	2246.19	2284.49	2290.96	2344.85	21.95	19.98	11784.5	3	32768	1	2:05	4	1	1	0.55	2.2	8	18	2	51	2	7.6	8.5	13	22
    DeepSeek-R1-Distill-Qwen-1.5B	QA	0.0089	0.48	0.0173	1362.23	195.48	442.8	1696.66	1202.29	1313.74	1440.25	1455.3	1486.74	1511.4	1692.69	35.14	35.2	8250.61	1.5	131072	1	1:44	3	9	1	0.35	1.4	6	13.5	2	36	2	5	6	8.5	13.5
    EXAONE-3.5-2.4B-Instruct	QA	0.0095	0.48	0.0184	1092.26	368.34	121.11	1710.39	525.48	802.69	1250.16	1321.26	1523.49	1535.27	1582.61	36.29	35.88	9712.48	2.4	32768	1	2:05	3	1	1	0.42	1.7	7	16.5	2	46	1	6.5	7.2	10.5	17
    gemma-2b-it	QA	0.0099	0.48	0.0192	711.26	223.91	293.32	1322.1	423.07	573.75	701.81	808.63	978.48	1141.22	1303.94	36.54	36.22	14100.66	2	8192	1	1:25	4	1	1.5	0.4	1.6	6.5	14.5	1.5	24	4	5.8	6.5	9.5	15.5
    Llama-3.2-3B-Instruct	QA	0.0097	0.48	0.0188	1621.36	671.81	245.42	2661.65	684.8	1036.15	1758.09	2269.7	2314.97	2336.89	2416.83	21.58	21.31	13374.42	3.2	32768	1	1:46	5	9	1	0.002	0.008	16	120	2	36	5	7.6	8.5	13	22
    Midm-2.0-Mini-Instruct	QA	0.0099	0.48	0.0192	1064.67	396	533.45	2307.55	684.26	777.68	964.41	1221.03	1637.91	1919.04	2132.56	22.7	22.54	9143.92	2	32768	1	2:43	3	1	1	0.46	1.8	7.5	17.5	4	63	1	5.8	6.5	9.5	15.5
    opt-2.7b	QA	0.0092	0.48	0.0178	1392.5	86.75	1285.18	1665.51	1321.21	1333.97	1371.68	1419.6	1531.78	1603.39	1656.68	35.91	36.03	8084.59	2.7	2048	1	1:34	3	1	1	0.45	1.8	7.2	15.5	4	22	2	6.5	7.2	10.5	17
    Qwen2.5-0.5B-Instruct	QA	0.0088	0.48	0.0172	585.66	105.11	149.29	1137.48	507.3	591.66	596.35	601.99	613.49	640.59	945.22	81.51	81.9	4158.77	0.5	32768	1	0:39	1	1	3	0.18	0.7	3	9	2	32	0.5	3	3.7	5	7.6
    Qwen2.5-1.5B-Instruct	QA	0.009	0.48	0.0175	1299.95	252.15	263.69	1495.1	768.12	1340.44	1402.14	1413.14	1445.91	1468.63	1489.8	35.69	35.6	7628.45	1.5	32768	1	1:12	2	1	1.5	0.32	1.3	5.8	13	2	38	1.5	5	6	8.5	13.5
    Qwen2.5-3B-Instruct	QA	0.009	0.48	0.0175	2206.83	406.22	610.79	2961.56	1638.76	2265.47	2330.8	2365.1	2411.44	2448.82	2720.45	21.26	21.27	11806.66	3	32768	1	2:05	4	1	1	0.55	2.2	8	18	2	51	2	7.6	8.5	13	22
    DeepSeek-R1-Distill-Qwen-1.5B	Summarization	0.0792	0.9046	0.1433	1614.53	147.35	1215.81	1968.53	1472.44	1523.43	1582.53	1685.63	1834.01	1891.5	1957.85	30.97	31.23	8265.59	1.5	131072	1	1:44	3	9	1	0.35	1.4	6	13.5	2	36	2	5	6	8.5	13.5
    EXAONE-3.5-2.4B-Instruct	Summarization	0.0799	0.9069	0.1446	1628.44	249.6	1267.73	2442.68	1362.03	1413.82	1602.61	1776.01	2018.68	2090.75	2231.78	30.7	31.37	9733.12	2.4	32768	1	2:05	3	1	1	0.42	1.7	7	16.5	2	46	1	6.5	7.2	10.5	17
    gemma-2b-it	Summarization	0.0797	0.9044	0.1441	1480.64	272.46	471.63	2061.3	1261.5	1375.2	1487.86	1654.11	1782.58	1866.11	1989.55	31.8	31.76	14114.01	2	8192	1	1:25	4	1	1.5	0.4	1.6	6.5	14.5	1.5	24	4	5.8	6.5	9.5	15.5
    Llama-3.2-3B-Instruct	Summarization	0.0795	0.9056	0.1439	2579.09	205.32	2280.68	3098.51	2355.64	2413.21	2535.3	2691.54	2893.23	3006.32	3081.16	19.39	19.5	13385.61	3.2	32768	1	1:46	5	9	1	0.002	0.008	16	120	2	36	5	7.6	8.5	13	22
    Midm-2.0-Mini-Instruct	Summarization	0.0796	0.9036	0.144	2443.11	326.29	1965.53	3766.72	2162.48	2208.67	2323.79	2591.71	2869.08	2993.27	3568.25	20.47	20.79	9169.49	2	32768	1	2:43	3	1	1	0.46	1.8	7.5	17.5	4	63	1	5.8	6.5	9.5	15.5
    opt-2.7b	Summarization	0.0813	0.8996	0.1468	1689.43	249.71	1338.42	2427.64	1428.19	1492.03	1642.76	1838.14	2071.59	2166.29	2342.45	29.6	30.2	8095.09	2.7	2048	1	1:34	3	1	1	0.45	1.8	7.2	15.5	4	22	2	6.5	7.2	10.5	17
    Qwen2.5-0.5B-Instruct	Summarization	0.0795	0.9056	0.1438	697.54	74.53	591.98	933.24	613.67	634.84	676.85	743.1	795.81	843.44	870.1	71.55	72.31	4169.39	0.5	32768	1	0:39	1	1	3	0.18	0.7	3	9	2	32	0.5	3	3.7	5	7.6
    Qwen2.5-1.5B-Instruct	Summarization	0.0798	0.908	0.1444	1533.75	155.2	1174.59	2022.64	1341.06	1447.21	1510.19	1609.57	1743.97	1804.24	2008.48	32.6	32.93	7644.25	1.5	32768	1	1:12	2	1	1.5	0.32	1.3	5.8	13	2	38	1.5	5	6	8.5	13.5
    Qwen2.5-3B-Instruct	Summarization	0.0799	0.9073	0.1445	2651.29	255.3	2319.95	3763.13	2375.9	2465.56	2594.74	2779.22	2989.27	3098.85	3462.91	18.86	19.02	11818.7	3	32768	1	2:05	4	1	1	0.55	2.2	8	18	2	51	2	7.6	8.5	13	22
    """
    df_main_raw = pd.read_csv(io.StringIO(main_data_str), sep='\t')

    # Load time-series data; fall back to the original Desktop location
    if ts_path is None:
        user_desktop = os.path.join(os.path.expanduser("~"), "Desktop")
        ts_path = os.path.join(user_desktop, r"ÎÖºÎ¨∏\ÌõÑÏÜçÎÖºÎ¨∏(16) sLLM ÌñâÎèôÎ™®Ìòï\[Î∂ÑÏÑù] Data_ÏãúÍ≥ÑÏó¥.xlsx")

    try:
        df_ts_raw = pd.read_excel(ts_path, sheet_name=0)
        print(f"   ‚úÖ Loaded {len(df_ts_raw)} time-series observations")
    except Exception as e:
        print(f"   ‚ùå Failed to load time-series data: {e}")
        return

    # HuggingFace data (minimal for this example)
    df_hf_raw = pd.DataFrame({'Ìï≠Î™©': ['placeholder'], 'value': ['placeholder']})

    # Data preparation
    df_ts_clean, df_final_analysis = analyzer.load_and_prepare_enhanced_data(df_main_raw, df_hf_raw, df_ts_raw)

    if df_ts_clean is None or df_final_analysis is None:
        print("‚ùå Data preparation failed")
        return

    # Create advanced simulator
    generate_fn, power_fn = analyzer.create_advanced_simulator(df_ts_clean, df_final_analysis)

    # Run comprehensive simulation
    df_results, df_kpis = analyzer.comprehensive_simulation(generate_fn, power_fn, df_final_analysis, df_ts_clean)

    if df_results.empty:
        print("‚ùå No simulation results generated")
        return

    # The three analyses below are called for what they leave on the analyzer
    # (the later steps read self.is_quality_metrics, self.tam_metrics and
    # self.econometric_models); their return values are not needed here.

    # Information Systems Quality Analysis
    analyzer.information_systems_quality_analysis(df_results)

    # Technology Acceptance Model Analysis
    analyzer.technology_acceptance_model_analysis(df_results, df_final_analysis)

    # Econometric Analysis
    analyzer.econometric_analysis(df_results, df_final_analysis)

    # Create visualizations
    analyzer.create_comprehensive_visualizations(df_results, df_kpis, output_dir)

    # Save all results
    analyzer.save_comprehensive_results(df_results, df_kpis, output_dir)

    # Final summary
    print("\n" + "="*80)
    print("üéâ Enhanced SAIP Analysis Complete!")
    print("="*80)
    print(f"üìä Total Observations: {len(df_results):,}")
    print(f"ü§ñ Models Analyzed: {df_results['model'].nunique()}")
    print(f"üìã Tasks Analyzed: {df_results['task'].nunique()}")
    print(f"üìà KPI Records: {len(df_kpis):,}")
    print(f"üìÅ Results saved to: {output_dir}")

    # Display top performers
    if not df_kpis.empty:
        baseline_kpis = df_kpis[df_kpis['policy_type'] == 'baseline']
        if not baseline_kpis.empty:
            # A missing or all-NaN KPI column must not crash the run here:
            # every result has already been written at this point
            best_latency = _row_with_min(baseline_kpis, 'latency_ms_p95')
            best_efficiency = _row_with_min(baseline_kpis, 'energy_Wh_mean')

            print(f"\nüèÜ Best Performance:")
            if best_latency is not None:
                print(f"   Lowest P95 Latency: {best_latency['model']} ({best_latency['latency_ms_p95']:.1f} ms)")
            if best_efficiency is not None:
                print(f"   Most Energy Efficient: {best_efficiency['model']} ({best_efficiency['energy_Wh_mean']:.4f} Wh/req)")

    print("\n‚úÖ Analysis completed successfully!")


def _row_with_min(frame, column):
    """Return the row of ``frame`` with the smallest non-NaN ``column`` value, or None if there is none."""
    if column not in frame.columns:
        return None
    values = frame[column].dropna()
    if values.empty:
        return None
    return frame.loc[values.idxmin()]

# Run the full pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

üöÄ Enhanced SAIP Benchmarking System
   Information Systems Theory + Econometric Analysis

üìÇ Loading Data...
   ‚úÖ Loaded 2700 time-series observations
üîÑ Enhanced Data Loading and Preparation...
   ‚úÖ Cleaned time-series data: 2692/2700 valid rows
   ‚úÖ Processed main data: 27 records
   ‚úÖ Available models: 9
   ‚úÖ Available tasks: 3
ü§ñ Creating Advanced ML-based Simulator...
   ‚úÖ NLU model trained (R¬≤ = 0.997)
   ‚úÖ QA model trained (R¬≤ = 0.957)
   ‚úÖ Summarization model trained (R¬≤ = 0.798)
   ‚úÖ Enhanced simulator ready
üöÄ Running Comprehensive Multi-Model, Multi-Task Simulation...
   üìã Analyzing 9 models across 3 tasks


Simulating Combinations: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 27/27 [00:59<00:00,  2.22s/it]


   ‚úÖ Completed simulation: 5400 total observations
   ‚úÖ Generated KPIs for 54 model-task-policy combinations
üìä Information Systems Quality Analysis (DeLone & McLean Model)...
üéØ Technology Acceptance Model (TAM) Analysis...
üìà Advanced Econometric Analysis...
   üìä Estimating Production Function...
   ‚úÖ Production Function R¬≤ = 0.283
   üìä Technical Efficiency Analysis...
   ‚úÖ Technical Efficiency calculated for 9 models
   üìä Cost Function Analysis...
   ‚úÖ Cost Function R¬≤ = 0.307
   üìä Panel Data Analysis...
   ‚úÖ Panel Data Model R¬≤ = 0.491
üìä Creating Comprehensive Visualizations...
   ‚úÖ Performance heatmap saved
   ‚úÖ Policy effectiveness chart saved
   ‚úÖ IS Quality dashboard saved
   ‚úÖ TAM analysis chart saved
   ‚úÖ Technical efficiency chart saved
üíæ Saving Comprehensive Results...
   ‚úÖ Excel results saved: SAIP_Analysis_Results_20251107_130251/data/comprehensive_results.xlsx
   ‚úÖ CSV files saved
   ‚úÖ JSON results saved
   ‚úÖ Econo