In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw dataset; the "date" column is parsed to datetime while reading.
csv_path = "xxxxx.csv"
df = pd.read_csv(csv_path, parse_dates=["date"])
df.head()

In [None]:
# Replace every missing value in the frame with 0.
df = df.fillna(value=0)

In [None]:
# Make sure the "date" column has a datetime dtype, then summarise the frame.
df = df.assign(date=pd.to_datetime(df["date"]))
df.info()

In [None]:
# Cast the "xxxxx" column to integer dtype (repeat for any other column that
# is not yet int/float), then re-check the dtypes.
df = df.astype({"xxxxx": int})
df.info()

In [None]:
# Remove columns that should not reach the model (unrelated or object-dtype).
columns_to_remove = ["xxxxx"]
df = df.drop(columns=columns_to_remove)

In [None]:
# Build lag features from past observations of the Y variable ("target"):
#   lag_1          -> value one row back
#   lag_7          -> value seven rows back
#   rolling_mean_7 -> mean of the seven rows that end one row back
# NOTE(review): shift() works on row order and is not grouped; this presumably
# expects rows sorted by date and a single series per frame - confirm.
target_series = df["target"]
df["lag_1"] = target_series.shift(1)
df["lag_7"] = target_series.shift(7)
df["rolling_mean_7"] = df["lag_1"].rolling(7).mean()

In [None]:
# Drop the rows whose lag features are NaN (the first rows have no history).
# dropna() returns a new frame, so the result has to be assigned back to df.
df = df.dropna()

In [None]:
# Forecast horizon and the multi-step targets for the model.
# Each "target_t+k" column holds the Y variable ("target") k rows ahead.
horizon = 7
targets = [f"target_t+{step}" for step in range(1, horizon + 1)]
for step, target_name in enumerate(targets, start=1):
    df[target_name] = df["target"].shift(-step)

# The last `horizon` rows have no complete future window; remove NaN rows.
df = df.dropna()

In [None]:
# Split the frame into features (x) and the multi-step targets (y).
# Everything except the dropped placeholder column and the target columns is
# used as a feature.
# NOTE(review): if a datetime column such as "date" is still in x, XGBoost
# will presumably reject it - confirm it is covered by the dropped columns.
non_feature_cols = ["xxxx", *targets]
x = df.drop(columns=non_feature_cols)
y = df.loc[:, targets]

In [None]:
# Time-based train/test split: rows dated within the last 30 days (relative to
# the latest date in df) form the test set, everything earlier is training.
cutoff_date = df["date"].max() - pd.Timedelta(days=30)
train_mask = df["date"] < cutoff_date
test_mask = ~train_mask
x_train, y_train = x.loc[train_mask], y.loc[train_mask]
x_test, y_test = x.loc[test_mask], y.loc[test_mask]

In [None]:
# Construct the XGBoost models: one independent regressor per horizon step.
# model_xgb is an unfitted template that holds the shared hyper-parameters.
model_xgb = XGBRegressor(
    n_estimators=500,
    learning_rate=0.03,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

# fit() trains the estimator in place and returns that same object, so
# refitting one instance would leave every entry of `models` pointing at the
# model trained on the last target. A fresh estimator with the template's
# parameters is therefore created for each target.
models = {}
for target in targets:
    step_model = XGBRegressor(**model_xgb.get_params())
    models[target] = step_model.fit(x_train, y_train[target])

In [None]:
# Predict every horizon step on the test features and collect the results in
# one DataFrame whose rows follow x_test's index.
step_predictions = {name: fitted.predict(x_test) for name, fitted in models.items()}
preds = pd.DataFrame(step_predictions, index=x_test.index)

In [None]:
# Calculate RMSE per horizon step.
# Computed with NumPy on positional arrays, so no metric function has to be
# imported (mean_squared_error was never imported in this notebook).
for target in targets:
    errors = y_test[target].to_numpy() - preds[target].to_numpy()
    rmse = np.sqrt(np.mean(errors ** 2))
    print(f"RMSE for {target}: {rmse:.2f}")

In [None]:
# Examine actual and predicted values side by side.
# y_test and preds use the same column names, so the prediction columns are
# prefixed with "pred_" to keep every label in the combined frame unique.
results = pd.concat(
    [
        df.loc[x_test.index, ["date", "productId"]],
        y_test,
        preds.add_prefix("pred_"),
    ],
    axis=1,
)
results.head(14)

In [None]:
# Visualize the predictions: one figure per forecast step (7-day forecast).
# The column list gets its own name so the integer `horizon` defined earlier
# in the notebook is not overwritten by a different kind of object.
forecast_cols = preds.columns  # ['target_t+1', ..., 'target_t+7']

for col in forecast_cols:
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(y_test.index, y_test[col], label=f'Actual {col}', linestyle='--')
    ax.plot(preds.index, preds[col], label=f'Predicted {col}', alpha=0.7)

    # The x-axis shows the DataFrame row index, not calendar dates.
    ax.set_xlabel('Date Index')
    ax.set_ylabel('Demand')
    ax.set_title(f'XGBoost Forecast ({col})')
    ax.legend()
    plt.show()