In [22]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import warnings 
from utilsforecast.plotting import plot_series
from utilsforecast.evaluation import evaluate
from utilsforecast.losses import *
from statsforecast import StatsForecast
from statsforecast.models import (
    Naive,WindowAverage,
    AutoARIMA,SeasonalNaive,HoltWinters,
    CrostonClassic as Croston, HistoricAverage,DynamicOptimizedTheta as DOT,
    SeasonalNaive
)

warnings.filterwarnings("ignore")  # To ignore warnings from pandas/numpy

In [23]:
user="lholguin"

### Statistical Models - Dataset with pop

In [24]:
def load_and_prepare_data(filepath, states=None):

    df = pd.read_csv(filepath)
    
    if states is not None:
        if isinstance(states, str):
            states = [states]
        print(f"Filtering data for states: {states}")
        df=df[df['State'].isin(states)].copy()
        print(f"Filtered to {len(df)} rows across {df['State'].nunique()} state(s).")

        if len(df) == 0:
            raise ValueError("No data available for the specified states.")
    
    df['unique_id'] = df['State'] + '_' + df['ATC2 Class']
    df['ds'] = pd.to_datetime(df['Period'])
    df = df.sort_values(['unique_id','ds']).reset_index(drop=True)

    #Preparing dataframes for statsforecast
    df_units=df[['unique_id','ds','Units Reimbursed']].copy()
    df_units.columns=['unique_id','ds','y']

    df_prescriptions=df[['unique_id','ds','Number of Prescriptions']].copy()
    df_prescriptions.columns=['unique_id','ds','y']

    return df_units, df_prescriptions, df

In [25]:
def get_models():

    models = [
        Naive(), HistoricAverage(), WindowAverage(window_size=4),
        SeasonalNaive(season_length=4), AutoARIMA(seasonal=False,alias="ARIMA"),
        AutoARIMA(seasonal=True,season_length=4,alias="SARIMA"),
        HoltWinters(season_length=4), DOT(season_length=4),
    ]
    return models

def train_and_forecast(df,target_name, h=4):
    
    sf=StatsForecast(models=get_models(),freq='QS', n_jobs=-1,fallback_model=SeasonalNaive(season_length=4))
    
    forecasts_df=sf.forecast(df=df, h=h, level=[95])
    return forecasts_df, sf

def evaluate_train_test(df,target_name,test_size=4,h=4):

    train=df.groupby('unique_id').apply(lambda x: x.iloc[:-test_size]).reset_index(drop=True)
    test=df.groupby('unique_id').apply(lambda x: x.iloc[-test_size:]).reset_index(drop=True)

    sf=StatsForecast(models=get_models(),freq='QS', n_jobs=-1,fallback_model=SeasonalNaive(season_length=4))
    sf.fit(df=train)
    preds=sf.predict(h=h)
    preds_df=pd.merge(test, preds, on=['unique_id','ds'], how='left')

    models=[col for col in preds_df.columns if col not in ['unique_id','ds']]
    eval_df=evaluate(preds_df,metrics=[mse, mae, rmse], models=models)
    eval_df['best_model']=eval_df[models].idxmin(axis=1)
    print(eval_df.groupby('metric')['best_model'].value_counts().unstack(fill_value=0))
    return preds_df, eval_df    

def evaluate_model_cross(df,target_name,n_windows=5, h=4):
    
    sf=StatsForecast(models=get_models(),freq='QS', n_jobs=-1,fallback_model=SeasonalNaive(season_length=4))

    print(f"Running cross-validation for {target_name} with {n_windows} windows and horizon {h}...")
    cv_df=sf.cross_validation(df=df, h=h, n_windows=n_windows, step_size=h)
    
    # Errors metrics calculation
    models=[col for col in cv_df.columns if col not in ['unique_id','ds','y','cutoff']]
    
    eval_df=evaluate(cv_df,metrics=[mse, mae, rmse], models=models)
    mae_df=eval_df[eval_df['metric']=='mae'].copy()
    mae_df['best_model']=mae_df[models].idxmin(axis=1)
    print("\nCross-validation Best Model Summary based on MAE:")
    print(mae_df['best_model'].value_counts())

    return eval_df, cv_df, mae_df
    

    print(f"\nBest Models Summary for {target_name}:")
    print(evaluation_df_sumary['best_model'].value_counts())
    return eval_df, cv_df, mae_df

def get_best_model_forecast(forecasts_df,evaluation_df_sumary):

    best_info=evaluation_df_sumary[['unique_id','best_model']]
    forecasts_with_best=forecasts_df.reset_index().merge(best_info, on='unique_id', how='left')
    result=forecasts_with_best[['unique_id','ds']].copy()
    
    result['best_forecast']=forecasts_with_best.apply(
        lambda row: row[row['best_model']], axis=1)

    #confidence intervals
    for level in [95]:
        lo_col = f'best_forecast-lo-{level}'
        hi_col = f'best_forecast-hi-{level}'
        
        result[lo_col] = forecasts_with_best.apply(
            lambda row: row.get(f"{row['best_model']}-lo-{level}", np.nan), axis=1
        )
        result[hi_col] = forecasts_with_best.apply(
            lambda row: row.get(f"{row['best_model']}-hi-{level}", np.nan), axis=1
        )
    result=result.merge(best_info, on='unique_id')
    return result

def save_results(forecasts_df, best_forecasts_df, evaluation_cv_df, evaluation_traintest_df, target_name, save_path=None):
    
    if save_path is None or save_path == "":
        save_path = os.getcwd()
    
    # Create directory if it doesn't exist
    os.makedirs(save_path, exist_ok=True)
    
    prefix = target_name.lower().replace(' ', '_')
    
    evaluation_traintest_df.to_csv(
        os.path.join(save_path, f"{prefix}_train_test_evaluation.csv"), index=False
    )
    
    forecasts_df.reset_index().to_csv(
        os.path.join(save_path, f"{prefix}_all_model_forecasts.csv"), index=False
    )
    #saving evaluate_train_forecast results
    best_forecasts_df.to_csv(
        os.path.join(save_path, f"{prefix}_best_model_forecasts.csv"), index=False
    )
    evaluation_df.to_csv(
        os.path.join(save_path, f"{prefix}_model_evaluation.csv"), index=False
    )
    
    print(f"âœ“ Results saved to: {save_path}")
    
def plot_sample_forecasts(df,forecasts_df,target_name,n_samples=4):

    unique_ids = df['unique_id'].unique()
    sample_ids = np.random.choice(unique_ids, min(n_samples, len(unique_ids)), replace=False)
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    axes = axes.flatten()
    
    for idx, uid in enumerate(sample_ids):
        ax = axes[idx]
        
        # Historical data
        hist_data = df[df['unique_id'] == uid]
        ax.plot(hist_data['ds'], hist_data['y'], 'o-', label='Historical', linewidth=2)
        
        # Forecast data (just plot one model for clarity - SeasonalNaive)
        forecast_data = forecasts_df.reset_index()
        forecast_data = forecast_data[forecast_data['unique_id'] == uid]
        
        if 'SeasonalNaive' in forecast_data.columns:
            ax.plot(forecast_data['ds'], forecast_data['SeasonalNaive'], 
                   's-', label='SeasonalNaive', linewidth=2)
        
        if 'AutoARIMA' in forecast_data.columns:
            ax.plot(forecast_data['ds'], forecast_data['AutoARIMA'], 
                   '^-', label='AutoARIMA', linewidth=2)
        
        ax.set_title(f'{uid}', fontsize=10, fontweight='bold')
        ax.set_xlabel('Date')
        ax.set_ylabel(target_name)
        ax.legend()
        ax.grid(True, alpha=0.3)
    
    plt.tight_layout()
    #
    # (f'{target_name.lower().replace(" ", "_")}_sample_forecasts.png', dpi=150, bbox_inches='tight')
    print(f"âœ“ Plot saved: {target_name.lower().replace(' ', '_')}_sample_forecasts.png")
    plt.show()

In [26]:
def main(filepath, states=None, h=4, run_cv=True, n_windows=5,save_path=None):
    
    print("\n" + "="*70); print("STATSFORECAST PHARMACEUTICAL FORECASTING PIPELINE"); print("="*70)

    print("\n1. Loading and preparing data...")
    df_units, df_prescriptions, df_original = load_and_prepare_data(filepath, states=states)
    
    # Display summary
    print(f"\nData Summary:")
    print(f"  â€¢ Total unique series: {df_units['unique_id'].nunique()}")
    min_date = df_units['ds'].min()
    max_date = df_units['ds'].max()
    print(f"  â€¢ Date range: {min_date.year}-Q{min_date.quarter} to {max_date.year}-Q{max_date.quarter}")
    if states is not None:
        print(f"  â€¢ Filtered states: {', '.join(states)}")
    

    print("\n" + "="*70); print("UNITS REIMBURSED"); print("="*70)
    
    print("\n2. Training models and generating forecasts...")
    forecasts_units, sf_units = train_and_forecast(df_units, "Units Reimbursed", h=h)

    print("\n3. Evaluating models on training/test split...")
    eval_traintest_units, preds_traintest_units = evaluate_train_test(
        df_units, "Units Reimbursed", test_size=h, h=h
    )
    
    if run_cv:
        print("\n4. Running cross-validation...")
        cv_units, eval_units, eval_summary_units = evaluate_model_cross(
            df_units, "Units Reimbursed", n_windows=n_windows, h=h
        )
        
        print("\n5. Selecting best forecasts...")
        best_forecasts_units = get_best_model_forecast(forecasts_units, eval_summary_units)
        
        print("\n6. Saving results...")
        save_results(forecasts_units, best_forecasts_units, eval_units, eval_traintest_units, "Units Reimbursed",save_path=save_path)
    else:
        eval_units = None
        best_forecasts_units = forecasts_units.reset_index()
        print("\n Skipping cross-validation (run_cv=False)")
    
    # Plot sample forecasts
    print("\nGenerating forecast plots...")
    plot_sample_forecasts(df_units, forecasts_units, "Units Reimbursed", n_samples=4)
    

    print("\n" + "="*70); print("NUMBER OF PRESCRIPTIONS"); print("="*70)
    
    print("\n2. Training models and generating forecasts...")
    forecasts_prescriptions, sf_prescriptions = train_and_forecast(
        df_prescriptions, "Number of Prescriptions", h=h
    )

    print("\n3. Evaluating models on training/test split...")
    eval_traintest_prescriptions, preds_traintest_prescriptions = evaluate_train_test(
        df_prescriptions, "Number of Prescriptions", test_size=h, h=h
    )
    
    if run_cv:
        print("\n4. Running cross-validation...")
        cv_prescriptions, eval_prescriptions, eval_summary_prescriptions = evaluate_model_cross(
            df_prescriptions, "Number of Prescriptions", n_windows=n_windows, h=h
        )
        
        print("\n5. Selecting best forecasts...")
        best_forecasts_prescriptions = get_best_model_forecast(
            forecasts_prescriptions, eval_summary_prescriptions
        )
        
        print("\n6. Saving results...")
        save_results(
            forecasts_prescriptions, best_forecasts_prescriptions, 
            eval_prescriptions, eval_traintest_prescriptions, "Number of Prescriptions",save_path=save_path
        )
    else:
        eval_prescriptions = None
        best_forecasts_prescriptions = forecasts_prescriptions.reset_index()
        print("\n Skipping cross-validation (run_cv=False)")
    
    # Plot sample forecasts
    print("\n Generating forecast plots...")
    plot_sample_forecasts(
        df_prescriptions, 
        forecasts_prescriptions, 
        "Number of Prescriptions", 
        n_samples=4
    )
    
    print("\n" + "="*70); print("âœ… PIPELINE COMPLETE!"); print("="*70)
    
    state_info = f"{', '.join(states)}" if states else "ALL"
    print(f"\n Forecasts generated for state(s): {state_info}")
    print(f"Forecast horizon: {h} quarters ahead")
    
    if run_cv:
        print(f"\n All files generated:")
    
    print(f"\nðŸ“Š Generated Plots:")
    print(f"   â€¢ units_reimbursed_sample_forecasts.png")
    print(f"   â€¢ number_of_prescriptions_sample_forecasts.png")
    print("\n" + "="*70)
    
    # Optional: Return results for programmatic access
    return {
        'units': {
            'forecasts': forecasts_units,
            'best_forecasts': best_forecasts_units if run_cv else None,
            'evaluation': eval_units if run_cv else None,
        },
        'prescriptions': {
            'forecasts': forecasts_prescriptions,
            'best_forecasts': best_forecasts_prescriptions if run_cv else None,
            'evaluation': eval_prescriptions if run_cv else None,
        }
    }

In [27]:
path_save=rf"C:\Users\{user}\OneDrive - purdue.edu\VS code\Data\ATC\Forecast\\"
os.path.isdir(path_save)

True

In [28]:
#path to the pre-built dataset
csv_path=rf"C:\Users\{user}\OneDrive - purdue.edu\VS code\Data\ATC\merged_data\Prebuilt_panels\P1_nopop.csv"
path_save=rf"C:\Users\{user}\OneDrive - purdue.edu\VS code\Data\ATC\Forecast"

if __name__ == "__main__":
    main(filepath=csv_path, states=['IN'], h=4, run_cv=True, n_windows=5,save_path=path_save)
  


STATSFORECAST PHARMACEUTICAL FORECASTING PIPELINE

1. Loading and preparing data...
Filtering data for states: ['IN']
Filtered to 2971 rows across 1 state(s).

Data Summary:
  â€¢ Total unique series: 83
  â€¢ Date range: 2016-Q1 to 2024-Q4
  â€¢ Filtered states: IN

UNITS REIMBURSED

2. Training models and generating forecasts...

3. Evaluating models on training/test split...
best_model   y
metric        
mae         83
mse         83
rmse        83

4. Running cross-validation...
Running cross-validation for Units Reimbursed with 5 windows and horizon 4...

Cross-validation Best Model Summary based on MAE:
best_model
Naive                    106
HoltWinters               69
HistoricAverage           67
ARIMA                     50
WindowAverage             42
SeasonalNaive             32
DynamicOptimizedTheta     30
SARIMA                    19
Name: count, dtype: int64

5. Selecting best forecasts...

6. Saving results...


NameError: name 'evaluation_df' is not defined