In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Read the daily sales table from disk.
data = pd.read_csv("modelling data/daily_sales.csv")

# Model inputs. Every remaining column other than 'date' is treated as a
# regression target (one column per pizza type).
features = ['time_bucket', 'oil_price', 'is_holiday']
non_target_cols = {'date', *features}
target_cols = [col for col in data.columns if col not in non_target_cols]

# Integer-encode the categorical time bucket. `le` stays at notebook scope so
# the prediction cell can encode new inputs with the same mapping.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for targets;
# OrdinalEncoder is the feature-side equivalent -- consider switching.
le = LabelEncoder()
X = data[features].assign(time_bucket=le.fit_transform(data['time_bucket']))
y = data[target_cols]

# Hold out 20% of the rows for evaluation (seeded shuffle split).
# NOTE(review): the file name suggests date-ordered data; a random split may
# put rows from the same date on both sides -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# MultiOutputRegressor fits one independent copy of the base XGBoost model
# per target column.
base_regressor = XGBRegressor(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
)
xgb = MultiOutputRegressor(base_regressor)
xgb.fit(X_train, y_train)

# Score the held-out rows with a single MSE figure over all target columns.
y_pred = xgb.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (all pizza types): {mse:.2f}")

Mean Squared Error (all pizza types): 0.59


In [9]:
# Example prediction for one new data point: the 'Lunch' time bucket with
# oil_price=70.0 and is_holiday=0. Columns are given in the same order as the
# training features.
new_data = pd.DataFrame({
    # Reuse the encoder fitted during training so 'Lunch' maps to the same code.
    'time_bucket': le.transform(['Lunch']),
    'oil_price': [70.0],
    'is_holiday': [0]
})
predicted_sales = xgb.predict(new_data)
# Regression output is unbounded, so an individual target can come out below
# zero. A sales count cannot be negative: round, clamp at 0, then cast to int.
predicted_sales_rounded = predicted_sales.round().clip(min=0).astype(int)
print(f"Predicted sales for new data point: {predicted_sales_rounded}")

Predicted sales for new data point: [[0 1 2 5 2 1 0 0 1 3 0 0 0 0 1 1 0 0 1 0 3 2 2 0 0 0 1 0 1 1 2 0 2 1 0 0
  1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 2 0 0 0 0 1 1 1 1 2 2 0 1 0 0 1 5 0 1 1
  0 0 0 0 0 0 0 0 2 1 1 0 1 1 2 0 1 1 0]]
