In [36]:
import os
from pathlib import Path
from typing import Dict, List, Tuple

import matplotlib.cm as cm
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from openpyxl.utils import get_column_letter

In [37]:
# Windows account name that main() interpolates into the OneDrive data path.
# Set the ATC_USER environment variable to run the notebook under another
# account; when it is unset the historical value is used.
user = os.environ.get('ATC_USER', 'Lilian')

# Global Matplotlib look-and-feel applied to every figure created afterwards.
plt.style.use('seaborn-v0_8-whitegrid')

# Fonts: sans-serif family, trying Arial first and DejaVu Sans second.
plt.rc('font', **{
    'family': 'sans-serif',
    'sans-serif': ['Arial', 'DejaVu Sans'],
    'size': 11,
})

# Axes titles/labels, and hide the top and right spines.
plt.rc('axes', titlesize=13, titleweight='bold', labelsize=11)
plt.rc('axes.spines', top=False, right=False)

# Tick labels share one size on both axes.
plt.rc(('xtick', 'ytick'), labelsize=10)

# Legend text and figure-level title.
plt.rc('legend', fontsize=10)
plt.rc('figure', titlesize=16, titleweight='bold')

def get_viridis_colors(n):
    """Return ``n`` colors sampled at evenly spaced positions of the Viridis colormap.

    Args:
        n: Number of colors requested.

    Returns:
        list: ``n`` entries, each the value returned by calling the colormap
        at a position in ``[0, 1]``. Empty when ``n <= 0``; for ``n == 1`` the
        single entry is the colormap evaluated at ``0.0``.
    """
    cmap = plt.colormaps['viridis']
    if n <= 1:
        # With one sample the step denominator (n - 1) is zero, so the even
        # spacing formula cannot be used; anchor the lone color at position 0.
        return [cmap(0.0)] if n == 1 else []
    return [cmap(i / (n - 1)) for i in range(n)]

def get_plasma_colors(n):
    """Return ``n`` colors sampled at evenly spaced positions of the Plasma colormap.

    Args:
        n: Number of colors requested.

    Returns:
        list: ``n`` entries, each the value returned by calling the colormap
        at a position in ``[0, 1]``. Empty when ``n <= 0``; for ``n == 1`` the
        single entry is the colormap evaluated at ``0.0``.
    """
    cmap = plt.colormaps['plasma']
    if n <= 1:
        # With one sample the step denominator (n - 1) is zero, so the even
        # spacing formula cannot be used; anchor the lone color at position 0.
        return [cmap(0.0)] if n == 1 else []
    return [cmap(i / (n - 1)) for i in range(n)]

# Anchor colors for the statistical-model dashboards (Viridis family).
COLORS_STATS = dict(
    primary='#440154',      # Viridis dark purple
    secondary='#21918c',    # Viridis teal
    light='#fde725',        # Viridis yellow
    palette=None,           # placeholder for a palette generated at run time
    cmap_name='viridis',
)

# Anchor colors for the machine-learning dashboards (Plasma family).
COLORS_ML = dict(
    primary='#0d0887',      # Plasma dark blue
    secondary='#cc4778',    # Plasma magenta
    light='#f0f921',        # Plasma yellow
    palette=None,           # placeholder for a palette generated at run time
    cmap_name='plasma',
)

# Neutral colors shared by both dashboards.
COLORS_GENERAL = dict(
    background='#F8F9FA',
    text='#2B2D42',
    grid='#E0E0E0',
)

In [38]:
class StatsModelAnalyzer:
    """Summarise the statistical-model (StatsForecast approach) picks of one workbook.

    ``load_data`` reads one ``<STATE>_Stats`` sheet per entry of ``self.states``
    and stacks them; every other method reads the ``recommended_model``,
    ``avg_cv_mae``, ``confidence`` and ``state`` columns of that combined frame.
    """

    # Column layout of the per-model table built by get_mae_statistics().
    _MAE_TABLE_COLUMNS = ('Model', 'Count', 'Mean_MAE', 'Median_MAE', 'Std_MAE',
                          'Min_MAE', 'Max_MAE', 'Q25_MAE', 'Q75_MAE')

    def __init__(self, file_path: str):
        """Store the workbook location; no file access happens here."""
        self.states = ['IN', 'IL', 'OH', 'MI']
        self.file_path = file_path
        self.file_name = Path(file_path).stem
        self.summary = None
        self.data = None

    def load_data(self) -> pd.DataFrame:
        """Read each ``<STATE>_Stats`` sheet, tag its rows with the state, and stack them."""
        per_state = [
            pd.read_excel(self.file_path, sheet_name=f'{code}_Stats').assign(state=code)
            for code in self.states
        ]
        self.data = pd.concat(per_state, ignore_index=True)
        return self.data

    def _count_pivot(self, group_keys, spread_column: str, count_name: str) -> pd.DataFrame:
        """Count rows per key pair, spread ``spread_column`` into columns, append ``Total``.

        The result is indexed by ``recommended_model`` and ordered by
        descending ``Total``; missing combinations are filled with 0.
        """
        table = (
            self.data
            .groupby(group_keys)
            .size()
            .reset_index(name=count_name)
            .pivot(index='recommended_model', columns=spread_column, values=count_name)
            .fillna(0)
            .astype(int)
        )
        table['Total'] = table.sum(axis=1)
        return table.sort_values('Total', ascending=False)

    def get_model_frequency(self) -> pd.DataFrame:
        """Return ``Model`` / ``Frequency`` / ``Percentage`` for each recommended model.

        ``Percentage`` is the frequency over the total number of rows, in
        percent, rounded to one decimal.
        """
        n_rows = len(self.data)
        table = self.data['recommended_model'].value_counts().reset_index()
        table.columns = ['Model', 'Frequency']
        share = table['Frequency'] / n_rows * 100
        table['Percentage'] = share.round(1)
        return table

    def get_model_frequency_by_state(self) -> pd.DataFrame:
        """Return model counts with one column per state plus a ``Total`` column."""
        return self._count_pivot(['state', 'recommended_model'], 'state', 'Frequency')

    def get_mae_statistics(self) -> pd.DataFrame:
        """Return per-model descriptive statistics of ``avg_cv_mae``.

        ``CV_MAE`` is the coefficient of variation (std / mean, in percent,
        one decimal). Rows are ordered by descending ``Count``.
        """
        def lower_quartile(values):
            return values.quantile(0.25)

        def upper_quartile(values):
            return values.quantile(0.75)

        per_model = self.data.groupby('recommended_model')['avg_cv_mae']
        table = per_model.agg([
            ('Count', 'count'),
            ('Mean_MAE', 'mean'),
            ('Median_MAE', 'median'),
            ('Std_MAE', 'std'),
            ('Min_MAE', 'min'),
            ('Max_MAE', 'max'),
            ('Q25_MAE', lower_quartile),
            ('Q75_MAE', upper_quartile),
        ]).reset_index()
        table.columns = list(self._MAE_TABLE_COLUMNS)
        variation = table['Std_MAE'] / table['Mean_MAE'] * 100
        table['CV_MAE'] = variation.round(1)
        return table.sort_values('Count', ascending=False)

    def get_mae_statistics_by_state(self) -> pd.DataFrame:
        """Return count, mean and median of ``avg_cv_mae`` per (state, model) pair."""
        per_pair = self.data.groupby(['state', 'recommended_model'])['avg_cv_mae']
        return per_pair.agg([
            ('Count', 'count'),
            ('Mean_MAE', 'mean'),
            ('Median_MAE', 'median'),
        ]).reset_index()

    def get_confidence_distribution(self) -> pd.DataFrame:
        """Return model counts with one column per confidence level plus ``Total``."""
        return self._count_pivot(['recommended_model', 'confidence'], 'confidence', 'Count')

    def get_overall_summary(self) -> Dict:
        """Return headline numbers: series count, model list and overall MAE statistics."""
        models = self.data['recommended_model']
        mae = self.data['avg_cv_mae']

        overview = {
            'total_series': len(self.data),
            'unique_models': models.nunique(),
            'models_list': models.unique().tolist(),
        }
        # Keys are overall_<stat>_mae, in this fixed order.
        for stat in ('mean', 'median', 'std', 'min', 'max'):
            overview[f'overall_{stat}_mae'] = getattr(mae, stat)()
        return overview

    def generate_full_summary(self) -> Dict:
        """Compute every summary table, cache the dict on ``self.summary`` and return it."""
        sections = (
            ('overall', self.get_overall_summary),
            ('model_frequency', self.get_model_frequency),
            ('model_frequency_by_state', self.get_model_frequency_by_state),
            ('mae_statistics', self.get_mae_statistics),
            ('mae_by_state', self.get_mae_statistics_by_state),
            ('confidence_distribution', self.get_confidence_distribution),
        )
        self.summary = {key: build() for key, build in sections}
        return self.summary

In [39]:
class MLModelAnalyzer:
    """Analyzer for Machine Learning Models (MLForecast approach).

    ``load_data`` reads one ``<STATE>_ML`` sheet per entry of ``self.states``
    and adds a ``best_mae`` column; the summary methods read the
    ``recommended_model``, ``best_mae``, ``confidence`` and ``state`` columns
    plus any ``<model>_mae`` column for the models in ``self.ml_models``.
    """

    # Column layout of the table returned by get_all_models_mae_comparison().
    _COMPARISON_COLUMNS = ['Model', 'Mean_MAE', 'Median_MAE', 'Std_MAE',
                           'Min_MAE', 'Max_MAE']

    def __init__(self, file_path: str):
        """Store the workbook location; no file access happens here."""
        self.file_path = file_path
        self.file_name = Path(file_path).stem
        self.states = ['IN', 'IL', 'OH', 'MI']
        self.ml_models = ['LightGBM', 'XGBoost', 'RandomForest', 'Ridge']
        self.data = None
        self.summary = None

    def load_data(self) -> pd.DataFrame:
        """Load and combine all ML sheets.

        Each sheet gets a ``state`` column and a ``best_mae`` column holding
        the MAE of the row's recommended model (see ``_get_recommended_mae``).
        The combined frame is stored on ``self.data`` and returned.
        """
        all_data = []

        for state in self.states:
            df = pd.read_excel(self.file_path, sheet_name=f'{state}_ML')
            df['state'] = state

            # Extract the MAE for the recommended model
            df['best_mae'] = df.apply(self._get_recommended_mae, axis=1)
            all_data.append(df)

        self.data = pd.concat(all_data, ignore_index=True)
        return self.data

    def _get_recommended_mae(self, row: pd.Series) -> float:
        """Return the MAE belonging to the row's recommended model.

        Looks up ``<recommended_model>_mae`` in the row; when that column is
        absent the row's ``cv_mae`` value is used instead.
        """
        model = row['recommended_model']
        mae_col = f"{model}_mae"
        if mae_col in row.index:
            return row[mae_col]
        return row['cv_mae']

    def get_model_frequency(self) -> pd.DataFrame:
        """Return ``Model`` / ``Frequency`` / ``Percentage`` for each recommended model.

        ``Percentage`` is the frequency over the total number of rows, in
        percent, rounded to one decimal.
        """
        freq = self.data['recommended_model'].value_counts().reset_index()
        freq.columns = ['Model', 'Frequency']
        freq['Percentage'] = (freq['Frequency'] / len(self.data) * 100).round(1)
        return freq

    def get_model_frequency_by_state(self) -> pd.DataFrame:
        """Return model counts with one column per state plus ``Total``, largest total first."""
        freq = self.data.groupby(['state', 'recommended_model']).size().reset_index(name='Frequency')
        pivot = freq.pivot(index='recommended_model', columns='state', values='Frequency').fillna(0).astype(int)
        pivot['Total'] = pivot.sum(axis=1)
        pivot = pivot.sort_values('Total', ascending=False)
        return pivot

    def get_mae_statistics(self) -> pd.DataFrame:
        """Return descriptive statistics of ``best_mae`` per model (when recommended).

        ``CV_MAE`` is the coefficient of variation (std / mean, in percent,
        one decimal). Rows are ordered by descending ``Count``.
        """
        stats = self.data.groupby('recommended_model')['best_mae'].agg([
            ('Count', 'count'),
            ('Mean_MAE', 'mean'),
            ('Median_MAE', 'median'),
            ('Std_MAE', 'std'),
            ('Min_MAE', 'min'),
            ('Max_MAE', 'max'),
            ('Q25_MAE', lambda x: x.quantile(0.25)),
            ('Q75_MAE', lambda x: x.quantile(0.75)),
        ]).reset_index()
        stats.columns = ['Model', 'Count', 'Mean_MAE', 'Median_MAE', 'Std_MAE',
                        'Min_MAE', 'Max_MAE', 'Q25_MAE', 'Q75_MAE']
        stats['CV_MAE'] = (stats['Std_MAE'] / stats['Mean_MAE'] * 100).round(1)
        return stats.sort_values('Count', ascending=False)

    def get_all_models_mae_comparison(self) -> pd.DataFrame:
        """Compare MAE across ALL ML models (not just recommended).

        Returns:
            One row per model of ``self.ml_models`` whose ``<model>_mae``
            column exists in the data, ordered by ascending ``Mean_MAE``.
            When none of those columns exist the result is an empty frame
            that still carries the expected columns.
        """
        results = []
        for model in self.ml_models:
            col = f'{model}_mae'
            if col in self.data.columns:
                values = self.data[col]
                results.append({
                    'Model': model,
                    'Mean_MAE': values.mean(),
                    'Median_MAE': values.median(),
                    'Std_MAE': values.std(),
                    'Min_MAE': values.min(),
                    'Max_MAE': values.max(),
                })

        # Passing the column list keeps 'Mean_MAE' present even when no model
        # column was found, so the sort below cannot fail on a bare frame.
        comparison = pd.DataFrame(results, columns=self._COMPARISON_COLUMNS)
        return comparison.sort_values('Mean_MAE')

    def get_mae_statistics_by_state(self) -> pd.DataFrame:
        """Return count, mean and median of ``best_mae`` per (state, model) pair."""
        stats = self.data.groupby(['state', 'recommended_model'])['best_mae'].agg([
            ('Count', 'count'),
            ('Mean_MAE', 'mean'),
            ('Median_MAE', 'median'),
        ]).reset_index()
        return stats

    def get_confidence_distribution(self) -> pd.DataFrame:
        """Return model counts with one column per confidence level plus ``Total``."""
        conf = self.data.groupby(['recommended_model', 'confidence']).size().reset_index(name='Count')
        pivot = conf.pivot(index='recommended_model', columns='confidence', values='Count').fillna(0).astype(int)
        pivot['Total'] = pivot.sum(axis=1)
        return pivot.sort_values('Total', ascending=False)

    def get_overall_summary(self) -> Dict:
        """Return headline numbers: series count, model list and overall ``best_mae`` statistics."""
        return {
            'total_series': len(self.data),
            'unique_models': self.data['recommended_model'].nunique(),
            'models_list': self.data['recommended_model'].unique().tolist(),
            'overall_mean_mae': self.data['best_mae'].mean(),
            'overall_median_mae': self.data['best_mae'].median(),
            'overall_std_mae': self.data['best_mae'].std(),
            'overall_min_mae': self.data['best_mae'].min(),
            'overall_max_mae': self.data['best_mae'].max(),
        }

    def generate_full_summary(self) -> Dict:
        """Compute every summary table, cache the dict on ``self.summary`` and return it."""
        self.summary = {
            'overall': self.get_overall_summary(),
            'model_frequency': self.get_model_frequency(),
            'model_frequency_by_state': self.get_model_frequency_by_state(),
            'mae_statistics': self.get_mae_statistics(),
            'all_models_comparison': self.get_all_models_mae_comparison(),
            'mae_by_state': self.get_mae_statistics_by_state(),
            'confidence_distribution': self.get_confidence_distribution(),
        }
        return self.summary

In [40]:
class WithinApproachVisualizer:
    """Creates visualizations for within-approach analysis.

    Each ``create_*_dashboard`` method lays out a 2x3 grid of panels from an
    analyzer's ``summary`` dict (and its raw ``data``) and saves it as an image.
    The ``_plot_*`` helpers each draw one panel on the axes they are given.
    """

    # Fill used for bars/wedges once the supplied color list runs out.
    _FALLBACK_COLOR = '#CCCCCC'
    # Panels that list models individually are capped at this many models.
    _MAX_MODELS_SHOWN = 6

    def __init__(self):
        self.colors_stats = COLORS_STATS
        self.colors_ml = COLORS_ML
        self.bg_color = COLORS_GENERAL['background']

    def _new_dashboard_figure(self):
        """Create the shared 18x10 figure and its 2x3 grid; returns ``(fig, gridspec)``."""
        fig = plt.figure(figsize=(18, 10), facecolor=self.bg_color)
        gs = fig.add_gridspec(2, 3, hspace=0.35, wspace=0.3,
                              left=0.06, right=0.98, top=0.90, bottom=0.08)
        return fig, gs

    def _save_dashboard(self, fig, output_path: str):
        """Write ``fig`` to ``output_path`` with the dashboard export settings."""
        fig.savefig(output_path, dpi=200, facecolor=self.bg_color,
                    bbox_inches='tight', pad_inches=0.3)

    @classmethod
    def _padded_colors(cls, colors: List, count: int) -> List:
        """Return exactly ``count`` colors: the first ones from ``colors``, then the fallback grey."""
        usable = min(count, len(colors))
        return list(colors[:usable]) + [cls._FALLBACK_COLOR] * (count - usable)

    def create_stats_dashboard(self, analyzer: 'StatsModelAnalyzer', output_path: str):
        """Create comprehensive dashboard for Stats models.

        Args:
            analyzer: Analyzer whose ``summary`` has already been generated
                and whose ``data`` holds ``recommended_model`` / ``avg_cv_mae``.
            output_path: Destination image path.
        """
        summary = analyzer.summary
        n_models = len(summary['model_frequency'])
        colors = get_viridis_colors(max(n_models, 6))

        fig, gs = self._new_dashboard_figure()
        try:
            # 1. Model Frequency (Donut)
            ax1 = fig.add_subplot(gs[0, 0])
            self._plot_frequency_donut(ax1, summary['model_frequency'],
                                       colors, 'Model Selection Frequency')

            # 2. Model Frequency by State (Grouped Bar)
            ax2 = fig.add_subplot(gs[0, 1:])
            self._plot_frequency_by_state(ax2, summary['model_frequency_by_state'],
                                          colors, 'Model Frequency by State')

            # 3. MAE Box Plot by Model
            ax3 = fig.add_subplot(gs[1, 0])
            self._plot_mae_boxplot(ax3, analyzer.data, 'recommended_model', 'avg_cv_mae',
                                   'viridis', 'MAE Distribution by Model')

            # 4. Confidence Distribution (Stacked Bar)
            ax4 = fig.add_subplot(gs[1, 1])
            self._plot_confidence_distribution(ax4, summary['confidence_distribution'],
                                               'Confidence Level Distribution')

            # 5. MAE Statistics Summary (Compact horizontal bar)
            ax5 = fig.add_subplot(gs[1, 2])
            self._plot_mae_summary_bar(ax5, summary['mae_statistics'], colors,
                                       'Mean MAE by Model')

            self._save_dashboard(fig, output_path)
        finally:
            # Close this specific figure even when a panel fails, so repeated
            # runs over several files do not accumulate open figures.
            plt.close(fig)
        print(f"Stats dashboard saved to: {output_path}")

    def create_ml_dashboard(self, analyzer: 'MLModelAnalyzer', output_path: str):
        """Create comprehensive dashboard for ML models.

        Args:
            analyzer: Analyzer whose ``summary`` has already been generated
                and whose ``data`` holds ``recommended_model`` / ``best_mae``.
            output_path: Destination image path.
        """
        summary = analyzer.summary
        n_models = len(summary['model_frequency'])
        colors = get_plasma_colors(max(n_models, 6))

        fig, gs = self._new_dashboard_figure()
        try:
            # 1. Model Frequency (Donut)
            ax1 = fig.add_subplot(gs[0, 0])
            self._plot_frequency_donut(ax1, summary['model_frequency'],
                                       colors, 'Model Selection Frequency')

            # 2. Model Frequency by State (Grouped Bar)
            ax2 = fig.add_subplot(gs[0, 1:])
            self._plot_frequency_by_state(ax2, summary['model_frequency_by_state'],
                                          colors, 'Model Frequency by State')

            # 3. All Models MAE Comparison (not just recommended)
            ax3 = fig.add_subplot(gs[1, 0])
            self._plot_all_models_comparison(ax3, summary['all_models_comparison'],
                                             colors, 'All Models MAE Comparison')

            # 4. Confidence Distribution (Stacked Bar)
            ax4 = fig.add_subplot(gs[1, 1])
            self._plot_confidence_distribution(ax4, summary['confidence_distribution'],
                                               'Confidence Level Distribution')

            # 5. MAE Box Plot by Model
            ax5 = fig.add_subplot(gs[1, 2])
            self._plot_mae_boxplot(ax5, analyzer.data, 'recommended_model', 'best_mae',
                                   'plasma', 'MAE Distribution by Model')

            self._save_dashboard(fig, output_path)
        finally:
            # Close this specific figure even when a panel fails, so repeated
            # runs over several files do not accumulate open figures.
            plt.close(fig)
        print(f"ML dashboard saved to: {output_path}")

    def _plot_frequency_donut(self, ax, freq_df: pd.DataFrame, colors: List, title: str):
        """Plot frequency as donut chart.

        Reads the ``Frequency`` and ``Model`` columns of ``freq_df``. Wedges
        below 5% of the total keep their slice but lose the percentage label;
        the grand total is printed in the centre of the ring.
        """
        sizes = freq_df['Frequency'].values
        labels = freq_df['Model'].values
        total = sum(sizes)  # computed once; used for the label filter and the centre text

        plot_colors = self._padded_colors(colors, len(sizes))

        wedges, _texts, autotexts = ax.pie(
            sizes, colors=plot_colors, autopct='%1.1f%%',
            startangle=90, pctdistance=0.75,
            wedgeprops=dict(width=0.5, edgecolor='white', linewidth=2),
            textprops={'fontsize': 9, 'fontweight': 'bold', 'color': 'white'}
        )

        # Hide small percentages
        for autotext, size in zip(autotexts, sizes):
            if size / total < 0.05:
                autotext.set_text('')

        centre_circle = plt.Circle((0, 0), 0.35, fc=self.bg_color)
        ax.add_patch(centre_circle)
        ax.text(0, 0, f'{total}', ha='center', va='center',
                fontsize=18, fontweight='bold', color=COLORS_GENERAL['text'])

        ax.set_title(title, pad=10, fontsize=12, fontweight='bold')
        ax.legend(wedges, labels, loc='center left', bbox_to_anchor=(1, 0.5),
                  fontsize=8, frameon=False)

    def _plot_frequency_by_state(self, ax, freq_df: pd.DataFrame, colors: List, title: str):
        """Plot frequency by state as grouped bar.

        ``freq_df`` is indexed by model with one column per state (a ``Total``
        column, if present, is ignored). One bar group is drawn per state.
        """
        states = [c for c in freq_df.columns if c != 'Total']
        models = freq_df.index.tolist()

        x = np.arange(len(states))
        # Each state's group spans 0.8 axis units, shared evenly by the models.
        # With no models there is nothing to divide by and nothing to draw.
        width = 0.8 / len(models) if models else 0.8

        plot_colors = self._padded_colors(colors, len(models))

        for i, (model, color) in enumerate(zip(models, plot_colors)):
            values = [freq_df.loc[model, state] for state in states]
            # Centre the group of bars on the state's tick position.
            offset = (i - len(models) / 2 + 0.5) * width
            ax.bar(x + offset, values, width, label=model, color=color,
                   edgecolor='white', linewidth=0.5)

        ax.set_ylabel('Frequency', fontsize=11)
        ax.set_title(title, pad=10, fontsize=12, fontweight='bold')
        ax.set_xticks(x)
        ax.set_xticklabels(states, fontsize=11, fontweight='bold')
        # No legend is drawn on this axes.
        ax.set_facecolor(self.bg_color)

    def _plot_mae_comparison(self, ax, mae_df: pd.DataFrame, color: str, title: str):
        """Plot MAE comparison with error bars.

        Draws one horizontal bar per row of ``mae_df`` (``Mean_MAE`` with
        ``Std_MAE`` as the error bar) and labels it with the mean and ``Count``.
        """
        df = mae_df.sort_values('Mean_MAE', ascending=True)
        largest_mean = df['Mean_MAE'].max()

        y_pos = np.arange(len(df))

        # Plot bars
        ax.barh(y_pos, df['Mean_MAE'], xerr=df['Std_MAE'],
                color=color, edgecolor='white', linewidth=1,
                capsize=3, error_kw={'elinewidth': 1, 'capthick': 1})

        ax.set_yticks(y_pos)
        ax.set_yticklabels(df['Model'], fontsize=10)
        ax.set_xlabel('Mean Absolute Error (MAE)', fontsize=11)
        ax.set_title(title, pad=10, fontsize=12, fontweight='bold')

        # Add value labels just past the end of each error bar
        for i, (mean, std, count) in enumerate(zip(df['Mean_MAE'], df['Std_MAE'], df['Count'])):
            ax.text(mean + std + largest_mean * 0.02, i,
                    f'{mean:,.0f} (n={count})', va='center', fontsize=9)

        ax.set_xlim(0, largest_mean + df['Std_MAE'].max() + largest_mean * 0.2)
        ax.set_facecolor(self.bg_color)

    def _plot_mae_boxplot(self, ax, data: pd.DataFrame, group_col: str,
                          value_col: str, cmap_name: str, title: str):
        """Plot MAE distribution as box plot with colormap.

        One box is drawn for each of the (at most six) most frequent values of
        ``group_col``; boxes are colored at evenly spaced colormap positions.
        """
        models = data[group_col].value_counts().index.tolist()[:self._MAX_MODELS_SHOWN]

        # Missing values are dropped per group: box statistics computed on an
        # array containing NaN are themselves NaN and the box is not drawn.
        box_data = [
            data.loc[data[group_col] == m, value_col].dropna().values
            for m in models
        ]

        bp = ax.boxplot(box_data, tick_labels=models, patch_artist=True, vert=True)

        # Apply colormap colors
        cmap = plt.colormaps[cmap_name]
        n = len(models)
        for i, patch in enumerate(bp['boxes']):
            patch.set_facecolor(cmap(i / (n - 1) if n > 1 else 0))
            patch.set_alpha(0.8)

        for element in ['whiskers', 'caps']:
            for item in bp[element]:
                item.set_color('#555555')
        for median in bp['medians']:
            median.set_color('#333333')
            median.set_linewidth(2)

        ax.set_ylabel('MAE', fontsize=10)
        ax.set_title(title, pad=10, fontsize=12, fontweight='bold')
        ax.tick_params(axis='x', rotation=45)
        ax.set_facecolor(self.bg_color)

    def _plot_mae_summary_bar(self, ax, mae_df: pd.DataFrame, colors: List, title: str):
        """Plot compact MAE summary as horizontal bar.

        One bar per row of ``mae_df`` (``Mean_MAE``), smallest mean at the
        bottom, each labelled with its mean and ``Count``.
        """
        df = mae_df.sort_values('Mean_MAE', ascending=True)
        largest_mean = df['Mean_MAE'].max()

        y_pos = np.arange(len(df))
        bar_colors = colors[:len(df)]

        ax.barh(y_pos, df['Mean_MAE'], color=bar_colors,
                edgecolor='white', linewidth=1.5, height=0.7)

        ax.set_yticks(y_pos)
        ax.set_yticklabels(df['Model'], fontsize=10)
        ax.set_xlabel('Mean MAE', fontsize=10)
        ax.set_title(title, pad=10, fontsize=12, fontweight='bold')

        # Add value labels
        for i, (mean, count) in enumerate(zip(df['Mean_MAE'], df['Count'])):
            ax.text(mean + largest_mean * 0.02, i,
                    f'{mean:,.0f} (n={count})', va='center', fontsize=9)

        # Extra 25% of headroom leaves space for the text labels.
        ax.set_xlim(0, largest_mean * 1.25)
        ax.set_facecolor(self.bg_color)

    def _plot_confidence_distribution(self, ax, conf_df: pd.DataFrame, title: str):
        """Plot confidence distribution as stacked bar.

        ``conf_df`` is indexed by model with one column per confidence level
        (a ``Total`` column, if present, is ignored). Only the first six
        models of the index are drawn.
        """
        conf_cols = [c for c in conf_df.columns if c != 'Total']
        models = conf_df.index.tolist()[:self._MAX_MODELS_SHOWN]

        conf_colors = {'High': '#2ECC71', 'Medium-High': '#82E0AA',
                       'Medium': '#F4D03F', 'Low': '#E74C3C'}

        x = np.arange(len(models))
        bottom = np.zeros(len(models))

        for conf in conf_cols:
            values = conf_df.loc[models, conf].to_numpy()
            ax.bar(x, values, bottom=bottom, label=conf,
                   color=conf_colors.get(conf, self._FALLBACK_COLOR),
                   edgecolor='white', linewidth=0.5)
            # Next level is stacked on top of everything drawn so far.
            bottom = bottom + values

        ax.set_ylabel('Count', fontsize=10)
        ax.set_title(title, pad=10, fontsize=12, fontweight='bold')
        ax.set_xticks(x)
        ax.set_xticklabels(models, fontsize=9, rotation=45, ha='right')
        ax.legend(loc='upper right', fontsize=8, framealpha=0.9)
        ax.set_facecolor(self.bg_color)

    def _plot_all_models_comparison(self, ax, comparison_df: pd.DataFrame,
                                    colors: List, title: str):
        """Plot comparison of ALL ML models (not just recommended).

        One horizontal bar per row of ``comparison_df`` (``Mean_MAE``),
        labelled with the rounded mean.
        """
        df = comparison_df.sort_values('Mean_MAE')
        largest_mean = df['Mean_MAE'].max()

        y_pos = np.arange(len(df))

        ax.barh(y_pos, df['Mean_MAE'], color=colors[:len(df)],
                edgecolor='white', linewidth=1.5, height=0.6)

        ax.set_yticks(y_pos)
        ax.set_yticklabels(df['Model'], fontsize=11, fontweight='bold')
        ax.set_xlabel('Mean MAE', fontsize=10)
        ax.set_title(title, pad=10, fontsize=12, fontweight='bold')

        for i, mean in enumerate(df['Mean_MAE']):
            ax.text(mean + largest_mean * 0.02, i,
                    f'{mean:,.0f}', va='center', fontsize=10, fontweight='bold')

        ax.set_xlim(0, largest_mean * 1.2)
        ax.set_facecolor(self.bg_color)

    def _plot_summary_table(self, ax, mae_df: pd.DataFrame, title: str):
        """Plot summary statistics as table.

        Renders the ``Model``, ``Count``, ``Mean_MAE``, ``Median_MAE``,
        ``Min_MAE``, ``Max_MAE`` and ``CV_MAE`` columns of ``mae_df`` as a
        formatted table on an axes with its frame switched off.
        """
        ax.axis('off')
        ax.set_title(title, pad=10, fontsize=12, fontweight='bold', loc='left')

        # Prepare table data
        table_data = mae_df[['Model', 'Count', 'Mean_MAE', 'Median_MAE', 'Min_MAE', 'Max_MAE', 'CV_MAE']].copy()
        for column in ('Mean_MAE', 'Median_MAE', 'Min_MAE', 'Max_MAE'):
            table_data[column] = table_data[column].apply(lambda x: f'{x:,.0f}')
        table_data['CV_MAE'] = table_data['CV_MAE'].apply(lambda x: f'{x:.1f}%')

        table = ax.table(
            cellText=table_data.values,
            colLabels=['Model', 'Count', 'Mean', 'Median', 'Min', 'Max', 'CV%'],
            loc='center',
            cellLoc='center',
        )

        table.auto_set_font_size(False)
        table.set_fontsize(9)
        table.scale(1.2, 1.8)

        # Style header (row 0 of the table holds the column labels)
        for i in range(len(table_data.columns)):
            table[(0, i)].set_facecolor('#2F5496')
            table[(0, i)].set_text_props(color='white', fontweight='bold')

In [41]:
class ExcelExporter:
    """Export analysis results to Excel.

    Both public exporters write an ``Overall_Summary`` sheet followed by one
    sheet per summary table, then save the workbook to ``output_path``.
    """

    # Row holding the column headers on every sheet (row 1 is the sheet title).
    _HEADER_ROW = 3
    # (label, key in summary['overall']) for the MAE rows of the overall sheet.
    _OVERALL_MAE_ROWS = (
        ('Overall Mean MAE', 'overall_mean_mae'),
        ('Overall Median MAE', 'overall_median_mae'),
        ('Overall Std MAE', 'overall_std_mae'),
        ('Overall Min MAE', 'overall_min_mae'),
        ('Overall Max MAE', 'overall_max_mae'),
    )

    @staticmethod
    def export_stats_analysis(analyzer: 'StatsModelAnalyzer', output_path: str):
        """Export Stats analysis to Excel.

        Args:
            analyzer: Analyzer whose ``summary`` dict has been generated.
            output_path: Destination ``.xlsx`` path.
        """
        wb = Workbook()
        summary = analyzer.summary

        header_fill = PatternFill('solid', fgColor='C47500')
        thin_border = ExcelExporter._thin_border()

        # Sheet 1: Overall Summary
        ExcelExporter._write_overall_sheet(
            wb.active, 'STATISTICAL MODELS - OVERALL SUMMARY',
            summary['overall'], header_fill, thin_border)

        # Sheets 2-4: one per summary table, in workbook order
        tables = [
            ('Model_Frequency', summary['model_frequency'],
             'MODEL SELECTION FREQUENCY'),
            ('MAE_Statistics', summary['mae_statistics'],
             'MAE STATISTICS BY MODEL'),
            # reset_index() turns the model index into a regular column
            ('Frequency_by_State', summary['model_frequency_by_state'].reset_index(),
             'MODEL FREQUENCY BY STATE'),
        ]
        for sheet_name, frame, title in tables:
            ExcelExporter._write_dataframe(wb.create_sheet(sheet_name), frame,
                                           header_fill, thin_border, title)

        wb.save(output_path)
        print(f"Stats Excel saved to: {output_path}")

    @staticmethod
    def export_ml_analysis(analyzer: 'MLModelAnalyzer', output_path: str):
        """Export ML analysis to Excel.

        Args:
            analyzer: Analyzer whose ``summary`` dict has been generated.
            output_path: Destination ``.xlsx`` path.
        """
        wb = Workbook()
        summary = analyzer.summary

        header_fill = PatternFill('solid', fgColor='2E86AB')
        thin_border = ExcelExporter._thin_border()

        # Sheet 1: Overall Summary
        ExcelExporter._write_overall_sheet(
            wb.active, 'MACHINE LEARNING MODELS - OVERALL SUMMARY',
            summary['overall'], header_fill, thin_border)

        # Sheets 2-5: one per summary table, in workbook order
        tables = [
            ('Model_Frequency', summary['model_frequency'],
             'MODEL SELECTION FREQUENCY'),
            ('All_Models_Comparison', summary['all_models_comparison'],
             'ALL ML MODELS MAE COMPARISON'),
            ('MAE_Statistics', summary['mae_statistics'],
             'MAE STATISTICS BY RECOMMENDED MODEL'),
            # reset_index() turns the model index into a regular column
            ('Frequency_by_State', summary['model_frequency_by_state'].reset_index(),
             'MODEL FREQUENCY BY STATE'),
        ]
        for sheet_name, frame, title in tables:
            ExcelExporter._write_dataframe(wb.create_sheet(sheet_name), frame,
                                           header_fill, thin_border, title)

        wb.save(output_path)
        print(f"ML Excel saved to: {output_path}")

    @staticmethod
    def _thin_border():
        """Return a border with a thin line on all four sides."""
        return Border(
            left=Side(style='thin'), right=Side(style='thin'),
            top=Side(style='thin'), bottom=Side(style='thin')
        )

    @staticmethod
    def _write_overall_sheet(ws, title: str, overall: Dict, header_fill, border):
        """Fill ``ws`` with the two-column Metric/Value overview.

        The sheet is renamed ``Overall_Summary``. MAE values are written as
        text formatted with thousands separators and two decimals.
        """
        ws.title = 'Overall_Summary'

        ws['A1'] = title
        ws['A1'].font = Font(bold=True, size=14)

        rows = [
            ['Metric', 'Value'],
            ['Total Series', overall['total_series']],
            ['Unique Models', overall['unique_models']],
        ]
        rows.extend([label, f"{overall[key]:,.2f}"]
                    for label, key in ExcelExporter._OVERALL_MAE_ROWS)

        header_font = Font(bold=True, color='FFFFFF')
        for row_idx, row_data in enumerate(rows, start=ExcelExporter._HEADER_ROW):
            for col_idx, value in enumerate(row_data, start=1):
                cell = ws.cell(row=row_idx, column=col_idx, value=value)
                cell.border = border
                if row_idx == ExcelExporter._HEADER_ROW:
                    cell.font = header_font
                    cell.fill = header_fill

        ws.column_dimensions['A'].width = 25
        ws.column_dimensions['B'].width = 20

    @staticmethod
    def _write_dataframe(ws, df: pd.DataFrame, header_fill, border, title: str):
        """Write dataframe to worksheet.

        Layout: title in ``A1``, column headers in row 3, data from row 4.
        Float values are rounded to two decimals; every used column is set
        to width 15.
        """
        ws['A1'] = title
        ws['A1'].font = Font(bold=True, size=14)

        # Headers
        header_font = Font(bold=True, color='FFFFFF')
        for col_idx, col in enumerate(df.columns, start=1):
            cell = ws.cell(row=ExcelExporter._HEADER_ROW, column=col_idx, value=col)
            cell.font = header_font
            cell.fill = header_fill
            cell.border = border

        # Data
        first_data_row = ExcelExporter._HEADER_ROW + 1
        for row_idx, (_, row) in enumerate(df.iterrows(), start=first_data_row):
            for col_idx, value in enumerate(row, start=1):
                if isinstance(value, float):
                    value = round(value, 2)
                cell = ws.cell(row=row_idx, column=col_idx, value=value)
                cell.border = border

        # Fixed width for every column. get_column_letter handles columns
        # beyond 'Z' ('AA', 'AB', ...), which plain chr() arithmetic cannot.
        for col_idx in range(1, len(df.columns) + 1):
            ws.column_dimensions[get_column_letter(col_idx)].width = 15


In [42]:
import os

def main():
    """Run the within-approach analysis for each comparison workbook.

    For every configured workbook the statistical models are analysed first
    and the machine-learning models second. Each pass prints the headline
    figures from the analyzer's summary, then hands the analyzer to the
    visualizer and the Excel exporter together with output paths inside the
    ``Results_first`` subfolder of the comparison folder.
    """
    # Inputs live in the COMPARISON folder; outputs go to its Results_first subfolder.
    comparison_folder = rf'C:\Users\{user}\OneDrive - purdue.edu\VS code\Data\ATC\COMPARISON'
    results_folder = os.path.join(comparison_folder, 'Results_first')

    # Create the output folder the first time it is needed.
    if not os.path.exists(results_folder):
        os.makedirs(results_folder)
        print(f"Created new folder: {results_folder}")

    # (input workbook, label used in headings and output file names)
    workbooks = [
        (os.path.join(comparison_folder, 'PANEL_COMPARISON_UR.xlsx'), 'Units_Reimbursed'),
        (os.path.join(comparison_folder, 'PANEL_COMPARISON_NoP.xlsx'), 'Num_Prescriptions'),
    ]

    def report_overall(summary):
        """Print the headline figures stored under the summary's 'overall' key."""
        overall = summary['overall']
        print(f"Total series: {overall['total_series']}")
        print(f"Models: {overall['models_list']}")
        print(f"Overall Mean MAE: {overall['overall_mean_mae']:,.2f}")

    rule = '=' * 70
    dashboards = WithinApproachVisualizer()

    for workbook_path, label in workbooks:
        print(f"\n{rule}")
        print(f"WITHIN-APPROACH ANALYSIS: {label}")
        print(rule)

        # ----- Statistical models -----
        print("\n--- Statistical Models Analysis ---")
        stats = StatsModelAnalyzer(workbook_path)
        stats.load_data()
        report_overall(stats.generate_full_summary())

        stats_png = os.path.join(results_folder, f"Stats_Analysis_{label}.png")
        stats_xlsx = os.path.join(results_folder, f"Stats_Analysis_{label}.xlsx")
        dashboards.create_stats_dashboard(stats, stats_png)
        ExcelExporter.export_stats_analysis(stats, stats_xlsx)
        print(f"  -> Saved Stats Excel: {stats_xlsx}")

        # ----- Machine-learning models -----
        print("\n--- Machine Learning Models Analysis ---")
        ml = MLModelAnalyzer(workbook_path)
        ml.load_data()
        report_overall(ml.generate_full_summary())

        ml_png = os.path.join(results_folder, f"ML_Analysis_{label}.png")
        ml_xlsx = os.path.join(results_folder, f"ML_Analysis_{label}.xlsx")
        dashboards.create_ml_dashboard(ml, ml_png)
        ExcelExporter.export_ml_analysis(ml, ml_xlsx)
        print(f"  -> Saved ML Excel:    {ml_xlsx}")

    print(f"\n{rule}")
    print(f"ANALYSIS COMPLETE. All files saved to: {results_folder}")
    print(rule)

# Entry point: run the full analysis when this code is executed directly
# (as a notebook cell or script) rather than imported as a module.
if __name__ == "__main__":
    main()


WITHIN-APPROACH ANALYSIS: Units_Reimbursed

--- Statistical Models Analysis ---
Total series: 666
Models: ['HistoricAverage', 'Naive', 'SARIMAX', 'WindowAverage', 'ARIMAX', 'SeasonalNaive']
Overall Mean MAE: 973,333.89
Stats dashboard saved to: C:\Users\Lilian\OneDrive - purdue.edu\VS code\Data\ATC\COMPARISON\Results_first\Stats_Analysis_Units_Reimbursed.png
Stats Excel saved to: C:\Users\Lilian\OneDrive - purdue.edu\VS code\Data\ATC\COMPARISON\Results_first\Stats_Analysis_Units_Reimbursed.xlsx
  -> Saved Stats Excel: C:\Users\Lilian\OneDrive - purdue.edu\VS code\Data\ATC\COMPARISON\Results_first\Stats_Analysis_Units_Reimbursed.xlsx

--- Machine Learning Models Analysis ---
Total series: 317
Models: ['RandomForest', 'Ridge', 'LightGBM', 'XGBoost']
Overall Mean MAE: 987,414.67
ML dashboard saved to: C:\Users\Lilian\OneDrive - purdue.edu\VS code\Data\ATC\COMPARISON\Results_first\ML_Analysis_Units_Reimbursed.png
ML Excel saved to: C:\Users\Lilian\OneDrive - purdue.edu\VS code\Data\ATC\CO