In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
import time

In [20]:
# Load the forecasting case-study CSV directly from the GitHub raw URL.
forecasting_df = pd.read_csv("https://raw.githubusercontent.com/Impact026/Data-Forecasting-project/refs/heads/main/forecasting_case_study.xlsx%20-%20Sheet1.csv")

# Quick look at the first rows and the column names.
print(forecasting_df.head())
print(forecasting_df.columns)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; passing its result to print() would emit an extra "None" line.
forecasting_df.info()
# Optional summary statistics (left disabled):
# print(forecasting_df.describe())

  Product       date  Sales Price Discount (%)  In-Store Promo  \
0    SKU1   2/5/2017  27750                 0%               0   
1    SKU1  2/12/2017  29023                 0%               1   
2    SKU1  2/19/2017  45630                17%               0   
3    SKU1  2/26/2017  26789                 0%               1   
4    SKU1   3/5/2017  41999                17%               0   

   Catalogue Promo  Store End Promo  Google_Mobility  Covid_Flag  V_DAY  \
0                0                0              0.0           0      0   
1                0                1              0.0           0      1   
2                0                0              0.0           0      0   
3                0                1              0.0           0      0   
4                0                0              0.0           0      0   

   EASTER  CHRISTMAS  
0       0          0  
1       0          0  
2       0          0  
3       0          0  
4       0          0  
Index(['Produc

In [22]:
# Work on the frame loaded earlier.
# NOTE: this binds a second name to the same object (no copy is made), so the
# columns written below are also visible through `forecasting_df`.
df = forecasting_df

# --- Preprocessing -----------------------------------------------------------
# Parse the date strings into datetimes.
df['date'] = pd.to_datetime(df['date'])
# Turn percentage strings such as "17%" into fractions such as 0.17.
discount_pct = df['Price Discount (%)'].str.rstrip('%').astype('float')
df['Discount'] = discount_pct / 100

# --- Model inputs ------------------------------------------------------------
target = 'Sales'
features = [
    'Discount',
    'In-Store Promo', 'Catalogue Promo', 'Store End Promo',
    'Google_Mobility', 'Covid_Flag',
    'V_DAY', 'EASTER', 'CHRISTMAS',
]

y = df[target]
X = df[features]

# --- Train/test split --------------------------------------------------------
# Hold out 20% of the rows for evaluation; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# --- Feature scaling ---------------------------------------------------------
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

def weighted_mape(y_true, y_pred):
    """Return the weighted MAPE: ``sum(|y_true - y_pred|) / sum(|y_true|)``.

    Both inputs are converted to NumPy float arrays before subtracting, so
    values are always paired by position. Without that conversion, two pandas
    objects with different indexes would be aligned by label, producing NaNs
    that pandas' ``sum`` skips and therefore an error of 0.

    Parameters
    ----------
    y_true : array-like
        Actual values.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        Total absolute error divided by the total absolute actual value.
        ``nan`` when the total absolute actual value is zero (this includes
        empty input), because the ratio is undefined there.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # Absolute values keep the denominator a magnitude even when actuals
    # are negative; for non-negative actuals this equals the plain sum.
    total_actual = np.sum(np.abs(actual))
    if total_actual == 0:
        return np.nan

    return np.sum(np.abs(actual - predicted)) / total_actual

def forecast_accuracy(y_true, y_pred):
    """Return forecast accuracy, defined here as one minus the weighted MAPE."""
    error_ratio = weighted_mape(y_true, y_pred)
    return 1 - error_ratio

def train_evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    """Fit ``model``, score it on the test split, and print a one-line report.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train :
        Training features and targets passed to ``model.fit``.
    X_test, y_test :
        Held-out features passed to ``model.predict`` and the actual values
        the predictions are scored against.
    model_name : str
        Label used in the printed report.

    Returns
    -------
    tuple
        ``(accuracy, execution_time)`` where ``accuracy`` is the result of
        ``forecast_accuracy(y_test, y_pred)`` and ``execution_time`` is the
        elapsed time in seconds covering fitting, prediction and scoring.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() is wall-clock time and can jump.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = forecast_accuracy(y_test, y_pred)
    execution_time = time.perf_counter() - start_time
    print(f"{model_name} - Accuracy: {accuracy:.4f}, Execution Time: {execution_time:.2f} seconds")
    return accuracy, execution_time

# Candidate regressors, one per entry in the comparison below.
lr_model = LinearRegression()
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
svr_model = SVR(kernel='rbf')
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)

# Report labels and estimators, kept in the same order.
models = ["Linear Regression", "Random Forest", "SVR", "Gradient Boosting"]
estimators = [lr_model, rf_model, svr_model, gb_model]

# Train and score every candidate on the scaled splits, in order.
accuracies = []
execution_times = []
for label, estimator in zip(models, estimators):
    score, elapsed = train_evaluate_model(
        estimator, X_train_scaled, y_train, X_test_scaled, y_test, label
    )
    accuracies.append(score)
    execution_times.append(elapsed)

# Expose the per-model results under their individual names as well.
lr_accuracy, rf_accuracy, svr_accuracy, gb_accuracy = accuracies
lr_time, rf_time, svr_time, gb_time = execution_times

# The winner is the label at the position of the highest accuracy.
best_model = models[np.argmax(accuracies)]
print(f"\nBest model: {best_model} with accuracy {max(accuracies):.4f}")

# Rank all candidates from most to least accurate and list them.
results = list(zip(models, accuracies, execution_times))
results.sort(key=lambda row: row[1], reverse=True)
print("\nModels sorted by accuracy:")
for model, accuracy, exec_time in results:
    print(f"{model}: Accuracy = {accuracy:.4f}, Execution Time = {exec_time:.2f} seconds")


Linear Regression - Accuracy: 0.3285, Execution Time: 0.01 seconds
Random Forest - Accuracy: 0.5461, Execution Time: 0.33 seconds
SVR - Accuracy: 0.3327, Execution Time: 0.10 seconds
Gradient Boosting - Accuracy: 0.5296, Execution Time: 0.10 seconds

Best model: Random Forest with accuracy 0.5461

Models sorted by accuracy:
Random Forest: Accuracy = 0.5461, Execution Time = 0.33 seconds
Gradient Boosting: Accuracy = 0.5296, Execution Time = 0.10 seconds
SVR: Accuracy = 0.3327, Execution Time = 0.10 seconds
Linear Regression: Accuracy = 0.3285, Execution Time = 0.01 seconds
