In [1]:
# ...in a Jupyter notebook or script...
import pandas as pd
from statsmodels.tsa.holtwinters import ExponentialSmoothing

In [2]:

# Load the raw sales log. `dayfirst=True` makes pandas read ambiguous dates
# (e.g. 03/04/2024) as day/month/year.
df = pd.read_csv(
    'ai_solutions_web_sales_logs.csv',
    parse_dates=['date_of_sale'],
    dayfirst=True,
)
print(df.shape)

# Tag every sale with its calendar month (a pandas Period) so later cells can
# aggregate by month.
sale_dates = df['date_of_sale']
df['month'] = sale_dates.dt.to_period('M')

(500000, 12)


In [3]:
# Prepare monthly sales data.
# groupby() only returns months that contain at least one sale, but both the
# seasonal model below (seasonal_periods=12) and the positional offsets passed
# to predict() assume exactly one row per consecutive calendar month.
# Re-index onto a gap-free month range and record months without sales as 0
# so those assumptions actually hold.
monthly_totals = df.groupby('month')['cost'].sum()
if not monthly_totals.empty:
    all_months = pd.period_range(
        monthly_totals.index.min(), monthly_totals.index.max(), freq='M'
    )
    monthly_totals = (
        monthly_totals.reindex(all_months, fill_value=0).rename_axis('month')
    )
monthly_sales = monthly_totals.reset_index()

if len(monthly_sales) >= 24:  # Require at least 2 years for monthly seasonality
    # Additive trend + additive 12-month seasonality (Holt-Winters).
    monthly_model = ExponentialSmoothing(
        monthly_sales['cost'],
        trend='add',
        seasonal='add',
        seasonal_periods=12
    )
    monthly_fit = monthly_model.fit()
    # Forecast for Jan 2025 to Aug 2025 (8 months)
    monthly_forecast_index = pd.period_range('2025-01', '2025-08', freq='M')
    # The series was fitted on a plain 0..n-1 index, so predict() takes row
    # positions: the number of months between the first observed month and
    # each end of the forecast window.
    first_month = monthly_sales['month'].iloc[0]
    monthly_forecast = monthly_fit.predict(
        start=monthly_forecast_index[0].ordinal - first_month.ordinal,
        end=monthly_forecast_index[-1].ordinal - first_month.ordinal
    )
    monthly_forecast_df = pd.DataFrame({
        'month': monthly_forecast_index,
        'forecast': monthly_forecast.values
    })
    # Store the in-sample fitted values next to the actuals; later cells read
    # this 'forecast' column to compute the training error.
    monthly_sales['forecast'] = monthly_fit.fittedvalues
    monthly_sales.to_csv('monthly_actuals.csv', index=False)
    monthly_forecast_df.to_csv('monthly_forecast.csv', index=False)
else:
    print("Not enough data for seasonal forecasting (need at least 24 months).")




In [4]:
from sklearn.metrics import mean_absolute_percentage_error
 
# Hold out the most recent months as a validation set for the monthly model.
HOLDOUT_MONTHS = 6
train_monthly = monthly_sales.head(-HOLDOUT_MONTHS)
valid_monthly = monthly_sales.tail(HOLDOUT_MONTHS)

# Fit a fresh model (same specification) on the training months only.
monthly_model_val = ExponentialSmoothing(
    train_monthly['cost'], trend='add', seasonal='add', seasonal_periods=12
)
monthly_fit_val = monthly_model_val.fit()

# Project forward across the held-out months.
monthly_forecast_val = monthly_fit_val.forecast(HOLDOUT_MONTHS)

# Out-of-sample error: held-out actuals vs. the forecast for those months.
monthly_val_mape = mean_absolute_percentage_error(
    valid_monthly['cost'], monthly_forecast_val
)
MVAPE = 1 - monthly_val_mape
print(f"Monthly Validation MAPE: {monthly_val_mape:.2%}")
print(f"Monthly Validation Accuracy: {MVAPE:.2%}")

# In-sample error: actuals vs. the 'forecast' column already stored on
# monthly_sales.
monthly_mape = mean_absolute_percentage_error(
    monthly_sales['cost'], monthly_sales['forecast']
)
MTVAPE = 1 - monthly_mape
print(f"Monthly Training Accuracy: {MTVAPE:.2%}")



Monthly Validation MAPE: 9.77%
Monthly Validation Accuracy: 90.23%
Monthly Training Accuracy: 89.64%




In [5]:
# Number of monthly observations (rows) in monthly_sales.
monthly_sales.shape[0]

37

In [6]:
# For each salesperson: fit an additive-trend / additive-12-month-seasonal
# model on that person's monthly sales, then write the actuals (with fitted
# values) and a 6-month forecast to per-person CSV files.
for sp in df['salesperson'].unique():
    sp_totals = df[df['salesperson'] == sp].groupby('month')['cost'].sum()
    if sp_totals.empty:
        continue  # nothing recorded under this key
    # Months in which this person sold nothing are absent from the groupby
    # result. Fill them with 0 so consecutive rows are consecutive months,
    # which seasonal_periods=12 relies on.
    sp_months = pd.period_range(
        sp_totals.index.min(), sp_totals.index.max(), freq='M'
    )
    sp_sales = (
        sp_totals.reindex(sp_months, fill_value=0)
        .rename_axis('month')
        .reset_index()
    )
    # Seasonal initialisation needs two full 12-month cycles; shorter series
    # make the fit raise, so skip them (same threshold as the all-sales model).
    if len(sp_sales) >= 24:
        model = ExponentialSmoothing(sp_sales['cost'], trend='add', seasonal='add', seasonal_periods=12)
        fit = model.fit()
        forecast = fit.forecast(6)
        forecast_df = pd.DataFrame({
            # The forecast starts the month after this person's last observed month.
            'month': pd.period_range(sp_sales['month'].iloc[-1] + 1, periods=6, freq='M'),
            'forecast': forecast.values
        })
        sp_sales['forecast'] = fit.fittedvalues
        sp_sales.to_csv(f'monthly_actuals_{sp}.csv', index=False)
        forecast_df.to_csv(f'monthly_forecast_{sp}.csv', index=False)



In [7]:
import os

# Make sure the folder that receives the weekly CSVs is present.
output_dir = "weekly_outputs"
os.makedirs(output_dir, exist_ok=True)

# Bucket every sale into its week and total the cost per week.
df['week'] = df['date_of_sale'].dt.to_period('W')
weekly_sales = df.groupby('week')['cost'].sum().reset_index()

if len(weekly_sales) < 104:  # Require at least 2 full years for weekly seasonality
    print("Not enough data for seasonal weekly forecasting (need at least 104 weeks).")
else:
    # Additive trend with an additive 52-week seasonal component.
    model = ExponentialSmoothing(
        weekly_sales['cost'],
        trend='add',
        seasonal='add',
        seasonal_periods=52,
    )
    fit = model.fit()
    forecast = fit.forecast(6)

    # Label the 6 forecast steps with the weeks that follow the last observed week.
    next_week = weekly_sales['week'].iloc[-1] + 1
    forecast_df = pd.DataFrame({
        'week': pd.period_range(next_week, periods=6, freq='W'),
        'forecast': forecast
    })

    # Keep the in-sample fitted values alongside the actuals, then persist both tables.
    weekly_sales['forecast'] = fit.fittedvalues
    actuals_path = os.path.join(output_dir, 'weekly_actuals.csv')
    forecast_path = os.path.join(output_dir, 'weekly_forecast.csv')
    weekly_sales.to_csv(actuals_path, index=False)
    forecast_df.to_csv(forecast_path, index=False)



In [8]:
from sklearn.metrics import mean_absolute_percentage_error

# Calculate MAPE for the training period (actuals vs fitted values).
# weekly_sales['forecast'] holds the fitted values of the model trained on
# these same weeks, so this measures in-sample fit, not predictive accuracy.
mape = mean_absolute_percentage_error(weekly_sales['cost'], weekly_sales['forecast'])
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2%}")
accuracy = 1 - mape
# Labelled as training accuracy: no held-out data is involved in this number.
print(f"Training Accuracy: {accuracy:.2%}")

Mean Absolute Percentage Error (MAPE): 2.14%
Validation Accuracy: 97.86%


In [9]:
from sklearn.metrics import mean_absolute_percentage_error

# A very low in-sample MAPE means the model fits the training data closely,
# but it can also be a sign of overfitting (the model following noise).
# An out-of-sample check is needed to tell the two apart.

# Hold out the final 6 weeks as a validation set.
train = weekly_sales.head(-6)
valid = weekly_sales.tail(6)

# Fit a fresh model that never sees the held-out weeks.
model_val = ExponentialSmoothing(
    train['cost'],
    trend='add',
    seasonal='add',
    seasonal_periods=52,
)
fit_val = model_val.fit()

# Forecast the 6 held-out weeks and score the forecast against the actuals.
forecast_val = fit_val.forecast(6)
val_mape = mean_absolute_percentage_error(valid['cost'], forecast_val)
print(f"Validation MAPE: {val_mape:.2%}")

# Reading the result: a validation MAPE much higher than the training MAPE
# suggests the model may be overfitting. Possible mitigations:
# - Try a simpler model (remove trend or seasonal components)
# - Use regularization (if available)
# - Aggregate data to reduce noise
# - Ensure enough data for each seasonal cycle

Validation MAPE: 8.86%




In [10]:
# Express the weekly hold-out error as an accuracy figure (accuracy = 1 - MAPE).
accuracy = 1 - val_mape
print(f"Validation Accuracy: {accuracy:.2%}")

Validation Accuracy: 91.14%
