In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import os
from pathlib import Path
from pathlib import Path
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side

In [5]:
# Windows account name; main() interpolates it into the data-folder path.
user = 'Lilian'

# ---- Global matplotlib appearance (applies to every figure created below) ----
plt.style.use('seaborn-v0_8-whitegrid')

# Typeface and base text size.
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial', 'DejaVu Sans'],
    'font.size': 11,
})

# Axes / figure titles and axis labels.
plt.rcParams.update({
    'axes.titlesize': 13,
    'axes.titleweight': 'bold',
    'axes.labelsize': 11,
    'figure.titlesize': 16,
    'figure.titleweight': 'bold',
})

# Tick labels and legend text.
plt.rcParams.update({
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
})

# Hide the top and right frame lines of every axes.
plt.rcParams.update({
    'axes.spines.top': False,
    'axes.spines.right': False,
})

# Shared colour palette, looked up by the plotting functions.
COLORS = dict(
    ml='#2E86AB',           # blue   - ML approach
    stats='#F18F01',        # orange - statistical approach
    gray='#8D99AE',         # gray   - gray zone
    ml_light='#A8D5E5',
    stats_light='#FCCF7D',
    background='#F8F9FA',   # figure / axes background
    text='#2B2D42',         # text colour for titles and footnotes
)

In [6]:
class ForecastComparisonPipeline:
    """Compare ML and statistical forecast recommendations series by series.

    For each state in ``self.states`` the workbook at ``file_path`` is read
    from two sheets, ``<STATE>_ML`` and ``<STATE>_Stats``. The recommended
    model of each sheet is matched on ``unique_id`` and the two MAE values are
    compared. Differences smaller than ``gray_zone_pct`` percent of the mean of
    both MAEs are labelled ``'Gray Zone'`` instead of declaring a winner.

    Attributes set by the pipeline:
        results_df: Combined comparison table; created by ``run_comparison``.
        failed_states: ``{state: exception}`` for states that could not be
            processed in the last ``run_comparison`` call.
    """

    def __init__(self, file_path: str, gray_zone_pct: float = 5.0):
        """
        Initialize pipeline.

        Args:
            file_path: Path to Excel file
            gray_zone_pct: If |MAE_stats - MAE_ml| < X% of avg(MAE_stats, MAE_ml),
                          classify as "Gray Zone" (practical equivalence)
        """
        self.file_path = file_path
        self.file_name = Path(file_path).stem
        self.states = ['IN', 'IL', 'OH', 'MI']
        self.gray_zone_pct = gray_zone_pct
        self.results = {}
        self.failed_states = {}

    def _get_ml_best_mae(self, row: pd.Series) -> float:
        """Return the MAE of the recommended model for one ML-sheet row.

        Looks for a column named ``<recommended_model>_mae``; when the sheet
        has no such column the row's ``cv_mae`` value is returned instead.
        """
        model = row['recommended_model']
        mae_col = f"{model}_mae"
        if mae_col in row.index:
            return row[mae_col]
        return row['cv_mae']

    def _determine_best_approach(self, row: pd.Series) -> str:
        """Classify one comparison row as ``'ML'``, ``'Stats'`` or ``'Gray Zone'``.

        Args:
            row: Row providing ``ml_mae`` and ``stats_mae``.

        Returns:
            ``'Gray Zone'`` when the absolute MAE difference is below
            ``gray_zone_pct`` percent of the mean MAE, or when both MAEs are
            exactly equal; otherwise the approach with the lower MAE.
            When one MAE is missing the other approach is returned; when both
            are missing the result is ``'Stats'``.
        """
        ml_mae = row['ml_mae']
        stats_mae = row['stats_mae']

        ml_missing = pd.isna(ml_mae)
        stats_missing = pd.isna(stats_mae)
        if ml_missing or stats_missing:
            # Every comparison against NaN is False, so without this guard a
            # missing Stats MAE would still fall through to 'Stats'.
            return 'ML' if (stats_missing and not ml_missing) else 'Stats'

        avg_mae = (ml_mae + stats_mae) / 2
        threshold = avg_mae * (self.gray_zone_pct / 100)
        mae_diff = abs(ml_mae - stats_mae)

        # `mae_diff == 0` covers exact ties whose threshold is also 0 (both
        # MAEs are 0, or gray_zone_pct is 0): neither approach is better.
        if mae_diff < threshold or mae_diff == 0:
            return 'Gray Zone'
        elif ml_mae < stats_mae:
            return 'ML'
        else:
            return 'Stats'

    def _determine_best_model(self, row: pd.Series) -> str:
        """Return the winning model name for one row.

        For gray-zone rows both names are joined with an "approximately
        equal" sign. The sign is written as an escape sequence so the text
        does not depend on how this source file is encoded.
        """
        if row['best_approach'] == 'Gray Zone':
            return f"{row['ml_model']} \u2248 {row['stats_model']}"
        elif row['best_approach'] == 'ML':
            return row['ml_model']
        else:
            return row['stats_model']

    def _determine_best_mae(self, row: pd.Series) -> float:
        """Return the MAE associated with ``row['best_approach']``.

        Gray-zone rows report the mean of both MAEs; otherwise the MAE of the
        winning approach is returned. Reading the winner's column (rather than
        ``min`` of both) keeps the result well defined when one MAE is NaN.
        """
        approach = row['best_approach']
        if approach == 'Gray Zone':
            return (row['ml_mae'] + row['stats_mae']) / 2
        elif approach == 'ML':
            return row['ml_mae']
        else:
            return row['stats_mae']

    def process_state(self, state: str) -> pd.DataFrame:
        """Build the ML-vs-Stats comparison table for one state.

        Args:
            state: State code; sheets ``f'{state}_ML'`` and
                ``f'{state}_Stats'`` are read from ``self.file_path``.

        Returns:
            One row per ``unique_id`` present in both sheets, with the model,
            MAE and confidence of each approach, the difference metrics, the
            winner columns (``best_approach``, ``best_model``, ``best_mae``)
            and a ``state`` column.
        """
        ml_df = pd.read_excel(self.file_path, sheet_name=f'{state}_ML')
        stats_df = pd.read_excel(self.file_path, sheet_name=f'{state}_Stats')

        ml_df['ml_best_mae'] = ml_df.apply(self._get_ml_best_mae, axis=1)
        ml_processed = ml_df[['unique_id', 'recommended_model', 'ml_best_mae', 'confidence']].copy()
        ml_processed.columns = ['unique_id', 'ml_model', 'ml_mae', 'ml_confidence']

        stats_processed = stats_df[['unique_id', 'recommended_model', 'avg_cv_mae', 'confidence']].copy()
        stats_processed.columns = ['unique_id', 'stats_model', 'stats_mae', 'stats_confidence']

        # Inner join: series missing from either sheet are dropped.
        comparison = pd.merge(ml_processed, stats_processed, on='unique_id', how='inner')

        # Difference metrics. mae_difference > 0 means ML has the lower MAE.
        comparison['mae_difference'] = comparison['stats_mae'] - comparison['ml_mae']
        comparison['abs_mae_difference'] = comparison['mae_difference'].abs()
        comparison['avg_mae'] = (comparison['ml_mae'] + comparison['stats_mae']) / 2
        comparison['pct_difference'] = (comparison['abs_mae_difference'] / comparison['avg_mae']) * 100

        # Winner columns; order matters because the last two read best_approach.
        comparison['best_approach'] = comparison.apply(self._determine_best_approach, axis=1)
        comparison['best_model'] = comparison.apply(self._determine_best_model, axis=1)
        comparison['best_mae'] = comparison.apply(self._determine_best_mae, axis=1)
        comparison['state'] = state

        return comparison

    def run_comparison(self) -> pd.DataFrame:
        """Run ``process_state`` for every state and stack the results.

        A state that fails is reported on stdout, recorded in
        ``self.failed_states`` and skipped, so one bad sheet does not abort
        the whole run.

        Returns:
            The combined table, also stored as ``self.results_df``.

        Raises:
            ValueError: If no state could be processed at all.
        """
        all_comparisons = []
        self.failed_states = {}
        for state in self.states:
            try:
                state_comparison = self.process_state(state)
                all_comparisons.append(state_comparison)
            except Exception as e:
                self.failed_states[state] = e
                print(f"Error processing {state}: {e}")

        if not all_comparisons:
            # pd.concat([]) would raise an opaque "No objects to concatenate".
            raise ValueError(
                f"No state could be processed from '{self.file_path}' "
                f"(states tried: {', '.join(self.states)}). "
                "See the errors printed above or self.failed_states."
            )

        self.results_df = pd.concat(all_comparisons, ignore_index=True)
        return self.results_df

    def get_summary_statistics(self) -> dict:
        """Summarise ``self.results_df`` (requires ``run_comparison`` first).

        Returns:
            Dict with four keys:
              * ``'overall'``: counts, win rates (percent), the configured
                threshold, mean MAEs and mean/median percentage difference.
              * ``'by_state'``: counts, win rates and mean percentage
                difference per state.
              * ``'by_ml_model'``: ``{model: count}`` over rows won by ML.
              * ``'by_stats_model'``: ``{model: count}`` over rows won by Stats.
        """
        df = self.results_df

        n_total = len(df)
        n_ml = (df['best_approach'] == 'ML').sum()
        n_stats = (df['best_approach'] == 'Stats').sum()
        n_gray = (df['best_approach'] == 'Gray Zone').sum()

        summary = {
            'overall': {
                'total_series': n_total,
                'ml_wins': int(n_ml),
                'stats_wins': int(n_stats),
                'gray_zone': int(n_gray),
                'ml_win_rate': n_ml / n_total * 100,
                'stats_win_rate': n_stats / n_total * 100,
                'gray_zone_rate': n_gray / n_total * 100,
                'gray_zone_pct_threshold': self.gray_zone_pct,
                'avg_ml_mae': df['ml_mae'].mean(),
                'avg_stats_mae': df['stats_mae'].mean(),
                'avg_best_mae': df['best_mae'].mean(),
                'median_pct_difference': df['pct_difference'].median(),
                'avg_pct_difference': df['pct_difference'].mean(),
            },
            'by_state': {},
            'by_ml_model': df[df['best_approach'] == 'ML']['ml_model'].value_counts().to_dict(),
            'by_stats_model': df[df['best_approach'] == 'Stats']['stats_model'].value_counts().to_dict(),
        }

        for state in df['state'].unique():
            state_df = df[df['state'] == state]
            n_state = len(state_df)
            summary['by_state'][state] = {
                'total_series': n_state,
                'ml_wins': int((state_df['best_approach'] == 'ML').sum()),
                'stats_wins': int((state_df['best_approach'] == 'Stats').sum()),
                'gray_zone': int((state_df['best_approach'] == 'Gray Zone').sum()),
                'ml_win_rate': (state_df['best_approach'] == 'ML').mean() * 100,
                'stats_win_rate': (state_df['best_approach'] == 'Stats').mean() * 100,
                'gray_zone_rate': (state_df['best_approach'] == 'Gray Zone').mean() * 100,
                'avg_pct_difference': state_df['pct_difference'].mean(),
            }

        return summary

    def get_model_frequency_analysis(self) -> pd.DataFrame:
        """Rank the winning models over rows with a clear winner.

        Gray-zone rows are excluded.

        Returns:
            One row per winning model with columns ``Model``,
            ``Times_Selected``, ``Avg_MAE``, ``Median_MAE``,
            ``Selection_Rate_%`` and ``Approach``, sorted by
            ``Times_Selected`` descending. The frame is empty when every row
            is in the gray zone.
        """
        df = self.results_df
        clear_winners = df[df['best_approach'] != 'Gray Zone'].copy()

        if len(clear_winners) == 0:
            return pd.DataFrame(columns=[
                'Model', 'Times_Selected', 'Avg_MAE', 'Median_MAE',
                'Selection_Rate_%', 'Approach',
            ])

        clear_winners['single_best_model'] = np.where(
            clear_winners['best_approach'] == 'ML',
            clear_winners['ml_model'],
            clear_winners['stats_model']
        )

        best_model_counts = clear_winners['single_best_model'].value_counts().reset_index()
        best_model_counts.columns = ['Model', 'Times_Selected']

        avg_mae_by_model = clear_winners.groupby('single_best_model')['best_mae'].agg(['mean', 'median']).reset_index()
        avg_mae_by_model.columns = ['Model', 'Avg_MAE', 'Median_MAE']

        model_analysis = pd.merge(best_model_counts, avg_mae_by_model, on='Model')
        model_analysis['Selection_Rate_%'] = (model_analysis['Times_Selected'] / len(clear_winners) * 100).round(1)

        # Take the approach from the rows themselves instead of a hard-coded
        # list of ML model names, which mislabels any ML model not on the list.
        # If a name ever wins under both approaches, its first row decides.
        approach_by_model = (
            clear_winners
            .drop_duplicates('single_best_model')
            .set_index('single_best_model')['best_approach']
        )
        model_analysis['Approach'] = model_analysis['Model'].map(approach_by_model)

        return model_analysis.sort_values('Times_Selected', ascending=False)

    def get_all_model_frequency(self) -> pd.DataFrame:
        """Count how often each model is the winner, tagged by approach.

        ML models are counted over rows won by ML and Stats models over rows
        won by Stats; gray-zone rows are not counted.

        Returns:
            Columns ``Model``, ``Count`` and ``Approach``, sorted by ``Count``
            descending.
        """
        df = self.results_df

        # ML models (count when ML approach wins)
        ml_wins = df[df['best_approach'] == 'ML']
        ml_counts = ml_wins['ml_model'].value_counts().reset_index()
        ml_counts.columns = ['Model', 'Count']
        ml_counts['Approach'] = 'ML'

        # Stats models (count when Stats approach wins)
        stats_wins = df[df['best_approach'] == 'Stats']
        stats_counts = stats_wins['stats_model'].value_counts().reset_index()
        stats_counts.columns = ['Model', 'Count']
        stats_counts['Approach'] = 'Stats'

        all_counts = pd.concat([ml_counts, stats_counts], ignore_index=True)
        return all_counts.sort_values('Count', ascending=False)

def create_output_excel(pipeline, summary, model_analysis, output_path):
    """Write the comparison results to a formatted three-sheet workbook.

    Sheets:
        * ``Summary``: overall metrics (from row 4) and a per-state table
          (header on row 24).
        * ``Model_Analysis``: one row per winning model; the table is skipped
          when ``model_analysis`` is empty.
        * ``Detailed_Results``: one row per series, sorted by state and
          ``unique_id``, with ``best_approach`` colour-coded.

    Args:
        pipeline: Object exposing ``results_df`` with the columns selected for
            the detail sheet below.
        summary: Dict with ``'overall'`` and ``'by_state'`` entries, as
            produced by ``ForecastComparisonPipeline.get_summary_statistics``.
        model_analysis: DataFrame as produced by
            ``ForecastComparisonPipeline.get_model_frequency_analysis``.
        output_path: Destination ``.xlsx`` path; the parent folder must exist.
    """
    # Local import: only the column-width code below needs this helper.
    from openpyxl.utils import get_column_letter

    wb = Workbook()

    # Shared cell styles.
    header_font = Font(bold=True, color='FFFFFF')
    header_fill = PatternFill('solid', fgColor='2F5496')
    thin_border = Border(
        left=Side(style='thin'), right=Side(style='thin'),
        top=Side(style='thin'), bottom=Side(style='thin')
    )
    ml_fill = PatternFill('solid', fgColor='C6EFCE')
    stats_fill = PatternFill('solid', fgColor='FFEB9C')
    gray_fill = PatternFill('solid', fgColor='D9D9D9')

    def write_header_row(ws, row, labels):
        """Write `labels` into `row` of `ws` using the header style."""
        for col_idx, label in enumerate(labels, start=1):
            cell = ws.cell(row=row, column=col_idx, value=label)
            cell.font = header_font
            cell.fill = header_fill
            cell.border = thin_border

    # ------------------------------------------------------------------
    # Sheet 1: Summary
    # ------------------------------------------------------------------
    ws_summary = wb.active
    ws_summary.title = 'Summary'

    overall = summary['overall']

    ws_summary['A1'] = 'COMPARISON SUMMARY WITH PERCENTAGE-BASED GRAY ZONE'
    ws_summary['A1'].font = Font(bold=True, size=14)
    ws_summary['A2'] = f'Gray Zone: |MAE difference| < {summary["overall"]["gray_zone_pct_threshold"]}% of average MAE'
    ws_summary['A2'].font = Font(italic=True, color='666666')

    # First entry is the header row; ['', ''] entries are visual spacers.
    summary_data = [
        ['Metric', 'Value'],
        ['Total Series Compared', overall['total_series']],
        ['ML Wins (Clear)', overall['ml_wins']],
        ['Stats Wins (Clear)', overall['stats_wins']],
        ['Gray Zone (Equivalent)', overall['gray_zone']],
        ['ML Win Rate (%)', round(overall['ml_win_rate'], 1)],
        ['Stats Win Rate (%)', round(overall['stats_win_rate'], 1)],
        ['Gray Zone Rate (%)', round(overall['gray_zone_rate'], 1)],
        ['', ''],
        ['Average ML MAE', f"{overall['avg_ml_mae']:,.2f}"],
        ['Average Stats MAE', f"{overall['avg_stats_mae']:,.2f}"],
        ['Average Best MAE', f"{overall['avg_best_mae']:,.2f}"],
        ['', ''],
        ['Avg % Difference', f"{overall['avg_pct_difference']:.1f}%"],
        ['Median % Difference', f"{overall['median_pct_difference']:.1f}%"],
    ]

    summary_header_row = 4
    for row_idx, row_data in enumerate(summary_data, start=summary_header_row):
        for col_idx, value in enumerate(row_data, start=1):
            cell = ws_summary.cell(row=row_idx, column=col_idx, value=value)
            cell.border = thin_border
            if row_idx == summary_header_row:
                cell.font = header_font
                cell.fill = header_fill

    # State-level table, placed below the overall block (which ends on row 18).
    ws_summary['A22'] = 'BY STATE COMPARISON'
    ws_summary['A22'].font = Font(bold=True, size=14)

    state_headers = ['State', 'Total', 'ML Wins', 'Stats Wins', 'Gray Zone',
                     'ML %', 'Stats %', 'Gray %', 'Avg % Diff']
    write_header_row(ws_summary, 24, state_headers)

    for row_idx, (state, stats) in enumerate(summary['by_state'].items(), start=25):
        values = [state, stats['total_series'], stats['ml_wins'], stats['stats_wins'],
                  stats['gray_zone'], round(stats['ml_win_rate'], 1),
                  round(stats['stats_win_rate'], 1), round(stats['gray_zone_rate'], 1),
                  f"{stats['avg_pct_difference']:.1f}%"]
        for col_idx, value in enumerate(values, start=1):
            cell = ws_summary.cell(row=row_idx, column=col_idx, value=value)
            cell.border = thin_border

    ws_summary.column_dimensions['A'].width = 30
    ws_summary.column_dimensions['B'].width = 15
    for col in ['C', 'D', 'E', 'F', 'G', 'H', 'I']:
        ws_summary.column_dimensions[col].width = 12

    # ------------------------------------------------------------------
    # Sheet 2: Model Analysis
    # ------------------------------------------------------------------
    ws_models = wb.create_sheet('Model_Analysis')
    ws_models['A1'] = 'MODEL PERFORMANCE (Clear Winners Only)'
    ws_models['A1'].font = Font(bold=True, size=14)

    if len(model_analysis) > 0:
        model_cols = ['Model', 'Approach', 'Times_Selected', 'Selection_Rate_%', 'Avg_MAE', 'Median_MAE']
        write_header_row(ws_models, 3, model_cols)

        for row_idx, (_, row) in enumerate(model_analysis.iterrows(), start=4):
            for col_idx, col in enumerate(model_cols, start=1):
                value = row[col] if col in row else ''
                if isinstance(value, float):
                    value = round(value, 2)
                cell = ws_models.cell(row=row_idx, column=col_idx, value=value)
                cell.border = thin_border
                if col == 'Approach':
                    cell.fill = ml_fill if value == 'ML' else stats_fill

    for col in ['A', 'B', 'C', 'D', 'E', 'F']:
        ws_models.column_dimensions[col].width = 16

    # ------------------------------------------------------------------
    # Sheet 3: Detailed Results
    # ------------------------------------------------------------------
    ws_detail = wb.create_sheet('Detailed_Results')

    detail_df = pipeline.results_df[[
        'state', 'unique_id', 'ml_model', 'ml_mae', 'stats_model', 'stats_mae',
        'pct_difference', 'best_approach', 'best_model', 'best_mae'
    ]].copy()
    detail_df = detail_df.sort_values(['state', 'unique_id'])

    headers = list(detail_df.columns)
    write_header_row(ws_detail, 1, headers)

    # Any label other than 'ML' / 'Stats' (i.e. 'Gray Zone') gets the gray fill.
    approach_fills = {'ML': ml_fill, 'Stats': stats_fill}

    for row_idx, (_, row) in enumerate(detail_df.iterrows(), start=2):
        for col_idx, col in enumerate(headers, start=1):
            value = row[col]
            if isinstance(value, float):
                value = round(value, 2)
            cell = ws_detail.cell(row=row_idx, column=col_idx, value=value)
            cell.border = thin_border
            if col == 'best_approach':
                cell.fill = approach_fills.get(value, gray_fill)

    # Set widths by explicit column letter. Iterating
    # `ws_detail.column_dimensions` directly does not work here: entries are
    # created on access, not when cells are written, so that loop body never
    # ran and the columns kept the default width.
    for col_idx in range(1, len(headers) + 1):
        ws_detail.column_dimensions[get_column_letter(col_idx)].width = 14
    # 'best_model' can hold "A ≈ B" labels and needs more room (column I).
    best_model_letter = get_column_letter(headers.index('best_model') + 1)
    ws_detail.column_dimensions[best_model_letter].width = 28

    wb.save(output_path)

In [7]:
def create_visualizations(results_dict, output_path):
    """Render the comparison dashboard and save it as an image.

    One row of three panels is drawn per entry of ``results_dict``:
    a donut chart of overall win rates, a stacked bar chart of win rates by
    state, and a horizontal bar chart of the most frequently winning models.

    Args:
        results_dict: ``{name: data}`` where ``data['summary']`` is the dict
            from ``ForecastComparisonPipeline.get_summary_statistics`` and
            ``data['model_freq']`` is the frame from
            ``get_all_model_frequency`` (columns ``Model``, ``Count``,
            ``Approach``, sorted by ``Count`` descending).
        output_path: Path of the image file to write.

    Raises:
        ValueError: If ``results_dict`` is empty.
    """
    if not results_dict:
        raise ValueError("results_dict is empty: there is nothing to visualize.")

    # One grid row per dataset (6 in. tall each), instead of a fixed 2 rows.
    n_rows = len(results_dict)
    fig = plt.figure(figsize=(20, 6 * n_rows), facecolor=COLORS['background'])
    gs = fig.add_gridspec(n_rows, 3, hspace=0.35, wspace=0.3,
                          left=0.06, right=0.98, top=0.90, bottom=0.08)

    # Distinct gray-zone thresholds seen, in first-seen order, for the footnote.
    thresholds = []

    for idx, (file_name, data) in enumerate(results_dict.items()):
        summary = data['summary']
        model_freq = data['model_freq']
        threshold = summary['overall']['gray_zone_pct_threshold']
        if threshold not in thresholds:
            thresholds.append(threshold)

        # Human-readable panel title derived from the dataset key.
        display_name = file_name.replace('PANEL_COMPARISON_', '').replace('_', ' ')
        if 'UR' in display_name:
            display_name = 'Units Reimbursed'
        elif 'NoP' in display_name:
            display_name = 'Number of Prescriptions'

        # ===== Plot 1: Donut Chart for Overall Win Rates =====
        ax1 = fig.add_subplot(gs[idx, 0])

        sizes = [summary['overall']['ml_win_rate'],
                 summary['overall']['stats_win_rate'],
                 summary['overall']['gray_zone_rate']]
        colors = [COLORS['ml'], COLORS['stats'], COLORS['gray']]
        labels = ['ML', 'Stats', 'Gray Zone']

        wedges, _texts, _autotexts = ax1.pie(
            sizes, colors=colors, autopct='%1.1f%%',
            startangle=90, pctdistance=0.75,
            wedgeprops=dict(width=0.5, edgecolor='white', linewidth=2),
            textprops={'fontsize': 11, 'fontweight': 'bold', 'color': 'white'}
        )

        # Series count in the middle of the donut.
        centre_circle = plt.Circle((0, 0), 0.35, fc=COLORS['background'])
        ax1.add_patch(centre_circle)
        ax1.text(0, 0.05, f'{summary["overall"]["total_series"]}', ha='center', va='center',
                 fontsize=20, fontweight='bold', color=COLORS['text'])
        ax1.text(0, -0.15, 'series', ha='center', va='center',
                 fontsize=10, color=COLORS['text'])

        ax1.set_title(f'{display_name}\nOverall Distribution', pad=15, fontsize=13, fontweight='bold')

        legend_labels = [f'{l} ({s:.1f}%)' for l, s in zip(labels, sizes)]
        ax1.legend(wedges, legend_labels, loc='lower center', bbox_to_anchor=(0.5, -0.12),
                   ncol=3, fontsize=9, frameon=False)

        # ===== Plot 2: Stacked Bar for State Comparison =====
        ax2 = fig.add_subplot(gs[idx, 1])

        states = list(summary['by_state'].keys())
        ml_rates = [summary['by_state'][s]['ml_win_rate'] for s in states]
        stats_rates = [summary['by_state'][s]['stats_win_rate'] for s in states]
        gray_rates = [summary['by_state'][s]['gray_zone_rate'] for s in states]

        x = np.arange(len(states))
        bar_width = 0.6

        ax2.bar(x, ml_rates, bar_width, label='ML', color=COLORS['ml'],
                edgecolor='white', linewidth=1)
        ax2.bar(x, stats_rates, bar_width, bottom=ml_rates, label='Stats',
                color=COLORS['stats'], edgecolor='white', linewidth=1)
        ax2.bar(x, gray_rates, bar_width, bottom=np.array(ml_rates) + np.array(stats_rates),
                label='Gray Zone', color=COLORS['gray'], edgecolor='white', linewidth=1)

        ax2.set_ylabel('Percentage (%)', fontsize=11)
        ax2.set_title(f'{display_name}\nWin Rate by State', pad=15, fontsize=13, fontweight='bold')
        ax2.set_xticks(x)
        ax2.set_xticklabels(states, fontsize=11, fontweight='bold')
        ax2.set_ylim(0, 105)

        # Label a segment only when it is tall enough to hold the text.
        for i, (m, s, g) in enumerate(zip(ml_rates, stats_rates, gray_rates)):
            if m > 10:
                ax2.text(i, m / 2, f'{m:.0f}%', ha='center', va='center',
                         fontsize=9, color='white', fontweight='bold')
            if s > 10:
                ax2.text(i, m + s / 2, f'{s:.0f}%', ha='center', va='center',
                         fontsize=9, color='white', fontweight='bold')
            if g > 8:
                ax2.text(i, m + s + g / 2, f'{g:.0f}%', ha='center', va='center',
                         fontsize=9, color='white', fontweight='bold')

        ax2.set_facecolor(COLORS['background'])

        # ===== Plot 3: Horizontal Bar for Model Frequency =====
        ax3 = fig.add_subplot(gs[idx, 2])

        # Top 8 models, reversed so the most frequent one is drawn at the top.
        top_models = model_freq.head(8).iloc[::-1]

        ax3.set_xlabel('Times Selected as Best', fontsize=11)
        ax3.set_title(f'{display_name}\nModel Selection Frequency', pad=15, fontsize=13, fontweight='bold')
        ax3.set_facecolor(COLORS['background'])

        if top_models.empty:
            # No clear winner at all: max() would be NaN and set_xlim would raise.
            ax3.text(0.5, 0.5, 'No clear winners', transform=ax3.transAxes,
                     ha='center', va='center', fontsize=11, color=COLORS['text'])
        else:
            colors_bars = [COLORS['ml'] if app == 'ML' else COLORS['stats']
                           for app in top_models['Approach']]

            bars = ax3.barh(top_models['Model'], top_models['Count'],
                            color=colors_bars, edgecolor='white', linewidth=1, height=0.7)

            # Count label just right of each bar.
            for bar, count in zip(bars, top_models['Count']):
                ax3.text(bar.get_width() + 2, bar.get_y() + bar.get_height() / 2,
                         f'{count}', ha='left', va='center', fontsize=10, fontweight='bold')

            # Extra room on the right so the labels stay inside the axes.
            ax3.set_xlim(0, top_models['Count'].max() * 1.15)

        ml_patch = mpatches.Patch(color=COLORS['ml'], label='ML Models')
        stats_patch = mpatches.Patch(color=COLORS['stats'], label='Stats Models')
        ax3.legend(handles=[ml_patch, stats_patch], loc='lower right', fontsize=9, framealpha=0.9)

    # Footnote lists every distinct threshold (normally just one).
    threshold_text = ' / '.join(str(t) for t in thresholds)
    fig.text(0.5, 0.02, f'Gray Zone Threshold: < {threshold_text}% difference in MAE between approaches',
             ha='center', fontsize=10, style='italic', color=COLORS['text'], alpha=0.7)

    fig.savefig(output_path, dpi=200, facecolor=COLORS['background'],
                bbox_inches='tight', pad_inches=0.3)
    plt.close(fig)
    print(f"Visualization saved to: {output_path}")

def create_detailed_model_chart(results_dict, output_path):
    """Save a chart of how often each model was selected, one panel per dataset.

    Each panel shows up to ten models with the highest win counts as
    horizontal bars, coloured by approach (ML vs Stats).

    Args:
        results_dict: ``{name: data}`` where ``data['summary']`` contains
            ``'by_ml_model'`` and ``'by_stats_model'`` dicts mapping model
            name to win count.
        output_path: Path of the image file to write.

    Raises:
        ValueError: If ``results_dict`` is empty.
    """
    if not results_dict:
        raise ValueError("results_dict is empty: there is nothing to visualize.")

    # One panel per dataset (8 in. wide each) instead of a fixed pair of axes.
    n_panels = len(results_dict)
    fig, axes = plt.subplots(1, n_panels, figsize=(8 * n_panels, 7),
                             facecolor=COLORS['background'], squeeze=False)
    fig.suptitle('Best Model Selection Breakdown by Approach',
                 fontsize=16, fontweight='bold', color=COLORS['text'], y=1.02)

    # squeeze=False always returns a 2-D array; axes[0] is the single row.
    for ax, (file_name, data) in zip(axes[0], results_dict.items()):
        summary = data['summary']

        # Same naming rule as the dashboard. The previous test
        # `'UR' in file_name` was False for keys such as 'Units_Reimbursed',
        # so every panel ended up titled 'Number of Prescriptions'.
        display_name = file_name.replace('PANEL_COMPARISON_', '').replace('_', ' ')
        if 'UR' in display_name:
            display_name = 'Units Reimbursed'
        elif 'NoP' in display_name:
            display_name = 'Number of Prescriptions'

        # (model, count, approach) records: ML first, then Stats, each by
        # descending count.
        records = []
        for approach, counts in (('ML', summary['by_ml_model']),
                                 ('Stats', summary['by_stats_model'])):
            for model, count in sorted(counts.items(), key=lambda item: -item[1]):
                records.append({'Model': model, 'Count': count, 'Approach': approach})

        # Ascending sort + tail(10): ten largest, biggest bar drawn at the top.
        df_plot = (
            pd.DataFrame(records, columns=['Model', 'Count', 'Approach'])
            .sort_values('Count', ascending=True)
            .tail(10)
        )

        ax.set_xlabel('Times Selected as Best Model', fontsize=11)
        ax.set_title(f'{display_name}', fontsize=14, fontweight='bold', pad=10)
        ax.set_facecolor(COLORS['background'])

        if df_plot.empty:
            # No clear winner at all: max() would be NaN and set_xlim would raise.
            ax.text(0.5, 0.5, 'No clear winners', transform=ax.transAxes,
                    ha='center', va='center', fontsize=11, color=COLORS['text'])
        else:
            colors_bars = [COLORS['ml'] if app == 'ML' else COLORS['stats']
                           for app in df_plot['Approach']]

            bars = ax.barh(df_plot['Model'], df_plot['Count'], color=colors_bars,
                           edgecolor='white', linewidth=1.5, height=0.7)

            # Count label just right of each bar.
            for bar, count in zip(bars, df_plot['Count']):
                ax.text(bar.get_width() + 1, bar.get_y() + bar.get_height() / 2,
                        f'{count}', ha='left', va='center', fontsize=10, fontweight='bold')

            # Extra room on the right so the labels stay inside the axes.
            ax.set_xlim(0, df_plot['Count'].max() * 1.15)

        ml_patch = mpatches.Patch(color=COLORS['ml'], label='ML Models')
        stats_patch = mpatches.Patch(color=COLORS['stats'], label='Stats Models')
        ax.legend(handles=[ml_patch, stats_patch], loc='lower right', fontsize=10)

    fig.tight_layout()
    fig.savefig(output_path, dpi=200, facecolor=COLORS['background'],
                bbox_inches='tight', pad_inches=0.2)
    plt.close(fig)
    print(f"Detailed chart saved to: {output_path}")


In [8]:
import os

def main(base_folder=None, gray_zone_pct=5.0):
    """Run the full comparison for both panels and write all outputs.

    For each input workbook the pipeline is run, a formatted Excel report is
    written, and finally two figures covering all inputs are saved. All
    outputs go to the ``Results_last`` subfolder of ``base_folder``.

    Args:
        base_folder: Folder holding ``PANEL_COMPARISON_UR.xlsx`` and
            ``PANEL_COMPARISON_NoP.xlsx``. Defaults to the OneDrive location
            built from the notebook-level ``user`` variable.
        gray_zone_pct: Gray-zone threshold (percent of the mean MAE) passed to
            ``ForecastComparisonPipeline``.

    Returns:
        Dict keyed by dataset name; each value holds the ``pipeline``,
        ``summary``, ``model_analysis`` and ``model_freq`` objects.

    Raises:
        FileNotFoundError: If an input workbook is missing.
    """
    # 1. Resolve paths
    if base_folder is None:
        # Relies on `user` being defined in the configuration cell.
        base_folder = rf'C:\Users\{user}\OneDrive - purdue.edu\VS code\Data\ATC\COMPARISON'

    results_folder = os.path.join(base_folder, 'Results_last')

    # Input workbooks live directly in the base folder.
    UR_path = os.path.join(base_folder, 'PANEL_COMPARISON_UR.xlsx')
    NoP_path = os.path.join(base_folder, 'PANEL_COMPARISON_NoP.xlsx')

    files_config = [
        {'path': UR_path,  'name': 'Units_Reimbursed'},
        {'path': NoP_path, 'name': 'Num_Prescriptions'}
    ]

    # Fail early with a clear message. Otherwise a missing workbook only shows
    # up as one swallowed error per state inside run_comparison().
    missing = [cfg['path'] for cfg in files_config if not os.path.isfile(cfg['path'])]
    if missing:
        raise FileNotFoundError("Input workbook(s) not found: " + "; ".join(missing))

    # Create the output folder if needed. exist_ok avoids a race between the
    # existence check and the creation.
    folder_existed = os.path.exists(results_folder)
    os.makedirs(results_folder, exist_ok=True)
    if not folder_existed:
        print(f"Created output folder: {results_folder}")

    all_results = {}

    # --- PROCESSING LOOP ---
    for config in files_config:
        file_path = config['path']
        file_name = config['name']

        print(f"\n{'='*60}")
        print(f"Processing: {file_name}")
        print('='*60)

        pipeline = ForecastComparisonPipeline(file_path, gray_zone_pct=gray_zone_pct)
        pipeline.run_comparison()

        summary = pipeline.get_summary_statistics()
        model_analysis = pipeline.get_model_frequency_analysis()
        model_freq = pipeline.get_all_model_frequency()

        all_results[file_name] = {
            'pipeline': pipeline,
            'summary': summary,
            'model_analysis': model_analysis,
            'model_freq': model_freq
        }

        # --- SAVE EXCEL ---
        excel_filename = f"Results_{file_name}.xlsx"
        excel_save_path = os.path.join(results_folder, excel_filename)

        create_output_excel(pipeline, summary, model_analysis, excel_save_path)
        print(f"Excel saved to: {excel_save_path}")

    # --- SAVE VISUALIZATIONS ---
    print("\nGenerating visualizations...")

    viz_path = os.path.join(results_folder, "Comparison_Dashboard.png")
    detailed_viz_path = os.path.join(results_folder, "Comparison_Detailed_Models.png")

    create_visualizations(all_results, viz_path)
    create_detailed_model_chart(all_results, detailed_viz_path)

    print(f"Visualizations saved to folder: {results_folder}")
    return all_results

# Entry point. Inside a notebook kernel __name__ is "__main__", so executing
# this cell runs the whole workflow; `results` keeps the per-dataset outputs
# returned by main() available for inspection in later cells.
if __name__ == "__main__":
    results = main()


Processing: Units_Reimbursed
Excel saved to: C:\Users\Lilian\OneDrive - purdue.edu\VS code\Data\ATC\COMPARISON\Results_last\Results_Units_Reimbursed.xlsx

Processing: Num_Prescriptions
Excel saved to: C:\Users\Lilian\OneDrive - purdue.edu\VS code\Data\ATC\COMPARISON\Results_last\Results_Num_Prescriptions.xlsx

Generating visualizations...
Visualization saved to: C:\Users\Lilian\OneDrive - purdue.edu\VS code\Data\ATC\COMPARISON\Results_last\Comparison_Dashboard.png
Detailed chart saved to: C:\Users\Lilian\OneDrive - purdue.edu\VS code\Data\ATC\COMPARISON\Results_last\Comparison_Detailed_Models.png
Visualizations saved to folder: C:\Users\Lilian\OneDrive - purdue.edu\VS code\Data\ATC\COMPARISON\Results_last
