In [6]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor

In [7]:
data = pd.read_csv('/Users/harshitgupta/Desktop/vs /VS-Data-Den/rm_dataset_filtered.csv')

# Convert the 'date_field' column to a datetime format
data['date_field'] = pd.to_datetime(data['date_field'])

# Set the 'date_field' as the index
data = data.set_index('date_field')


In [18]:
regional_masters = data.groupby('regional_master')['sales'].apply(lambda x: (x != 0).sum() >= 250)
regional_masters = regional_masters[regional_masters].index.tolist()


In [31]:
def fit_model(group):
    # Split data into train and test
    train_data = group[:-52]
    test_data = group[-52:]

    # Create features and target
    X_train = train_data.index.astype(int) / 1e9  # Unix timestamps as features
    y_train = train_data.values
    X_train = X_train.values.reshape(-1, 1)  # Convert to a 2D NumPy array

    # Fit XGBoost model
    model = XGBRegressor(n_estimators=50, random_state=20)
    model.fit(X_train, y_train)

    # Make predictions
    X_test = test_data.index.astype(int) / 1e9
    X_test = X_test.values.reshape(-1, 1)  # Convert to a 2D NumPy array
    forecast = model.predict(X_test)

    return forecast

In [32]:
forecasts = []
for rm, group in data.groupby('regional_master')['sales']:
    if rm in regional_masters:
        forecast_data = fit_model(group)
        forecast_df = pd.DataFrame({'regional_master': rm, 'forecast': forecast_data})
        forecasts.append(forecast_df)

In [33]:
forecasts = pd.concat(forecasts, ignore_index=True)


In [34]:
def smape(actual, forecast):
    """
    Calculate the Symmetric Mean Absolute Percentage Error (SMAPE)
    """
    n = len(actual)
    combined = np.abs(actual) + np.abs(forecast)
    smape_values = np.abs(actual - forecast) / combined * 200 / n
    return np.mean(smape_values)

In [35]:
smape_scores = []
for rm, group in forecasts.groupby('regional_master'):
    actual = data.loc[data['regional_master'] == rm, 'sales'].values[-52:]
    forecast_values = group['forecast'].values
    smape_score = smape(actual, forecast_values)
    smape_scores.append(smape_score)

In [36]:
mean_smape = np.mean(smape_scores)
print(f"Mean SMAPE across all regional_masters: {mean_smape:.4f}")

Mean SMAPE across all regional_masters: 0.8738
