# In [None]:
import os
import zipfile
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
from google.colab import files

# Only let warnings and more severe records through the root logger
logging.getLogger().setLevel(logging.WARNING)

# Ask for the dataset through the Colab upload helper
uploaded = files.upload()

# Read the dataset by its fixed file name
df = pd.read_csv("Walmart_Sales.csv")

# Turn the day-first date strings into timestamps and add a string store key
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
df['store'] = df['Store'].astype(str)

# Switch to the ds / y column names used below and order the rows by date
df = df.rename(columns={'Date': 'ds', 'Weekly_Sales': 'y'}).sort_values('ds')

# Calendar features derived from the timestamp column
df = df.assign(
    year=lambda frame: frame['ds'].dt.year,
    month=lambda frame: frame['ds'].dt.month,
    week=lambda frame: frame['ds'].dt.isocalendar().week.astype(int),
    dayofweek=lambda frame: frame['ds'].dt.dayofweek,
)
# Create lag features function
def create_lag_features(store_df, lags=(1, 2, 4, 12)):
    """Return a date-sorted copy of ``store_df`` with lagged ``y`` columns.

    Args:
        store_df: DataFrame with a ``ds`` column to sort by and a ``y``
            column to lag. The input frame is not modified.
        lags: Iterable of row offsets. Each offset ``k`` adds a ``lag_k``
            column holding ``y`` shifted by ``k`` rows, so for a positive
            ``k`` the first ``k`` rows of that column are NaN.

    Returns:
        A new DataFrame sorted by ``ds`` with one ``lag_k`` column per offset.
    """
    # The default is a tuple so no mutable object is shared between calls.
    lagged = store_df.sort_values('ds').copy()
    for lag in lags:
        lagged[f'lag_{lag}'] = lagged['y'].shift(lag)
    return lagged

# Generate lag features per store.
# Each store's rows are handed to create_lag_features explicitly rather than
# through DataFrameGroupBy.apply: apply's handling of the grouping column is
# deprecated (pandas 2.2) and newer versions exclude it, which would drop the
# 'store' column that the rest of the script relies on. Groups are visited in
# sorted key order and the index is renumbered, matching the previous output.
df_lagged = pd.concat(
    [create_lag_features(store_rows) for _, store_rows in df.groupby('store')],
    ignore_index=True,
)
# Drop every row that contains a NaN; this removes the leading rows of each
# store whose lag values are undefined
df_lagged = df_lagged.dropna()

# Define regressors: external variables, calendar features and lagged sales
regressors = ['Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
              'year', 'month', 'week', 'dayofweek'] + [f'lag_{l}' for l in [1, 2, 4, 12]]

# Train/test split dates: the first two years after the earliest remaining
# date are used for training, everything from train_end_date on for testing
start_date = df_lagged['ds'].min()
train_end_date = start_date + pd.DateOffset(years=2)

# Per-store results collected by the forecasting loop
store_metrics = []
store_forecasts = []

# Create directory for plots
output_dir = "xgb_store_forecasts"
os.makedirs(output_dir, exist_ok=True)

future_periods = 12  # Next 12 weeks forecast

# For every store: fit an XGBoost model, score it on the hold-out period,
# roll the forecast forward future_periods weeks, then plot and export.
for store in df_lagged['store'].unique():
    store_df = df_lagged[df_lagged['store'] == store].copy()
    # Chronological split: rows before train_end_date train the model,
    # the remaining rows form the hold-out (backtest) set
    train_df = store_df[store_df['ds'] < train_end_date]
    test_df = store_df[store_df['ds'] >= train_end_date]

    X_train = train_df[regressors]
    y_train = train_df['y']
    X_test = test_df[regressors]
    y_test = test_df['y']

    model = XGBRegressor(n_estimators=100, learning_rate=0.1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    # NOTE: MAPE divides by the actual values, so a zero in y_test would make
    # it infinite or NaN
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

    print(f"Store {store} - MAE: {mae:.2f}, RMSE: {rmse:.2f}, MAPE: {mape:.2f}%")

    # Backtest frame: actuals next to predictions for the hold-out period
    results = test_df[['ds', 'y']].copy()
    results['yhat'] = y_pred
    results['store'] = store
    store_forecasts.append(results)

    store_metrics.append({'store': store, 'mae': mae, 'rmse': rmse, 'mape': mape})

    # === Future prediction with lag feature update ===
    last_known = store_df[store_df['ds'] <= test_df['ds'].max()].copy()
    last_known = last_known.sort_values('ds')

    # Step exactly 7 days from the last observed date so the forecast dates
    # stay on the data's own weekday. freq='W' is anchored on Sundays and
    # would snap every date to a Sunday; the lag lookups below
    # (date - k weeks) would then miss the historical rows whenever the data
    # is not recorded on Sundays and silently fall back to the mean.
    future_dates = pd.date_range(
        start=last_known['ds'].max() + pd.Timedelta(weeks=1),
        periods=future_periods,
        freq='7D',
    )

    future_rows = []

    # History plus the predictions made so far; lags for later steps are
    # looked up here, so predictions feed back into subsequent forecasts
    extended_df = last_known.copy()

    for date in future_dates:
        # Create new row dict
        new_row = {'ds': date, 'store': store}

        # Time features (index 1 of isocalendar() is the ISO week number;
        # indexing also works where the result is a plain tuple)
        new_row['year'] = date.year
        new_row['month'] = date.month
        new_row['week'] = int(date.isocalendar()[1])
        new_row['dayofweek'] = date.dayofweek

        # Regressors assumed constant - take from last known row
        for reg in ['Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment']:
            # If you have a time series or model for these regressors, replace this with predictions
            # Otherwise, just use the last known value
            new_row[reg] = extended_df.iloc[-1][reg]

        # Lag features: the sales value recorded exactly `lag` weeks earlier
        for lag in [1, 2, 4, 12]:
            lag_date = date - pd.Timedelta(weeks=lag)
            lag_val = extended_df[extended_df['ds'] == lag_date]['y']
            if not lag_val.empty:
                new_row[f'lag_{lag}'] = lag_val.values[0]
            else:
                # No row for that date (e.g. a gap in the history): fall back
                # to the mean of all known and predicted sales so far
                new_row[f'lag_{lag}'] = extended_df['y'].mean()

        # Convert new_row to DataFrame to feed model
        new_df = pd.DataFrame([new_row])
        # Predict yhat
        yhat = model.predict(new_df[regressors])[0]
        new_row['y'] = yhat  # Save predicted value

        # Append new_row to extended_df for next iterations lag calculations
        extended_df = pd.concat([extended_df, pd.DataFrame([new_row])], ignore_index=True)

        future_rows.append(new_row)

    future_df = pd.DataFrame(future_rows)

    # Append future_df to store_forecasts (rename 'y' to 'yhat' for consistency)
    store_forecasts.append(future_df[['ds', 'y']].rename(columns={'y': 'yhat'}).assign(store=store))

    # === Plotting ===
    # Actuals come from the full (un-lagged) frame so the whole history shows
    full_store_df = df[df['store'] == store]
    plt.figure(figsize=(12, 6))
    plt.plot(full_store_df['ds'], full_store_df['y'], label='Actual')
    plt.plot(results['ds'], results['yhat'], label='XGBoost Forecast (Test)')
    plt.plot(future_df['ds'], future_df['y'], label='XGBoost Forecast (Future)', linestyle='--')

    plt.title(f'XGBoost Walmart Store {store} Forecast')
    plt.xlabel('Date')
    plt.ylabel('Weekly Sales')
    plt.legend()

    # Add metrics textbox
    metrics_text = (f"MAE: {mae:.2f}\n"
                    f"RMSE: {rmse:.2f}\n"
                    f"MAPE: {mape:.2f}%")
    plt.gca().text(0.02, 0.95, metrics_text, transform=plt.gca().transAxes,
                   fontsize=10, verticalalignment='top',
                   bbox=dict(boxstyle="round,pad=0.5", facecolor="white", alpha=0.7))

    plt.tight_layout()
    plt.savefig(f"{output_dir}/xgb_store_{store}_forecast.png")
    # Close the figure so figures do not accumulate across stores
    plt.close()

    # Prepare backtest export CSV
    backtest_export = results[['ds', 'y', 'yhat']].copy()
    backtest_export['store'] = store
    backtest_export['type'] = 'backtest'

    # Prepare future export CSV
    future_export = future_df[['ds', 'y']].rename(columns={'y': 'yhat'}).copy()
    future_export['y'] = np.nan  # no actual values for future
    future_export['store'] = store
    future_export['type'] = 'future'

    # Combine backtest and future
    export_df = pd.concat([backtest_export, future_export], ignore_index=True)

    # Rename columns for Tableau friendliness
    export_df.rename(columns={
        'ds': 'date',
        'y': 'actual_revenue',
        'yhat': 'forecast_revenue'
    }, inplace=True)

    # Save CSV
    csv_filename = f'{output_dir}/xgb_store_{store}_forecast.csv'
    export_df.to_csv(csv_filename, index=False)

# Combine all forecasts (backtest and future rows of every store)
all_forecasts = pd.concat(store_forecasts)

# Overall metrics: unweighted mean of the per-store metrics
overall_mae = np.mean([m['mae'] for m in store_metrics])
overall_rmse = np.mean([m['rmse'] for m in store_metrics])
overall_mape = np.mean([m['mape'] for m in store_metrics])

print("\nOverall XGBoost Metrics Across Stores:")
print(f"MAE: {overall_mae:.2f}")
print(f"RMSE: {overall_rmse:.2f}")
print(f"MAPE: {overall_mape:.2f}%")

# Zip the output folder. output_dir holds both the plots and the CSV exports,
# so a single walk archives every file exactly once. Writing everything and
# then appending the CSVs a second time stored each CSV twice in the archive
# and triggered zipfile's "Duplicate name" warning.
zip_filename = "xgb_store_forecasts.zip"
with zipfile.ZipFile(zip_filename, 'w') as zipf:
    # The loop variable is 'filenames', not 'files', so the google.colab
    # 'files' import is not shadowed by a plain list
    for root, _dirs, filenames in os.walk(output_dir):
        for filename in filenames:
            # arcname drops the directory part: entries sit at the zip root
            zipf.write(os.path.join(root, filename), arcname=filename)

print(f"Forecast plots saved in '{output_dir}' and zipped as '{zip_filename}'.")
print(f"Forecast CSV files saved alongside plots and zipped as '{zip_filename}'.")