In [7]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_error
import joblib

In [27]:
df = pd.read_csv("freshcast_dataset.csv")
df = df.drop("quantity", axis=1)
df.head()

Unnamed: 0,date,product,sales,temperature,rainfall_mm,is_holiday,event
0,2025-06-01,Milk (1L),101,35.2,0,1,Local Festival
1,2025-06-01,Bread Loaf,77,35.2,0,1,Local Festival
2,2025-06-01,Bananas (1kg),108,35.2,0,1,Local Festival
3,2025-06-02,Milk (1L),120,35.5,0,1,
4,2025-06-02,Bread Loaf,86,35.5,0,1,


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         180 non-null    object 
 1   product      180 non-null    object 
 2   sales        180 non-null    int64  
 3   temperature  180 non-null    float64
 4   rainfall_mm  180 non-null    int64  
 5   is_holiday   180 non-null    int64  
 6   event        6 non-null      object 
dtypes: float64(1), int64(3), object(3)
memory usage: 10.0+ KB


In [11]:
df['date'] = pd.to_datetime(df['date'])
df.sort_values(['product', 'date'], inplace=True)

In [12]:
df.head()

Unnamed: 0,date,product,sales,temperature,rainfall_mm,is_holiday,event
2,2025-06-01,Bananas (1kg),108,35.2,0,1,Local Festival
5,2025-06-02,Bananas (1kg),89,35.5,0,1,
8,2025-06-03,Bananas (1kg),82,41.1,0,0,
11,2025-06-04,Bananas (1kg),101,27.0,0,0,
14,2025-06-05,Bananas (1kg),97,33.3,0,0,


In [13]:
df['day_of_week'] = df['date'].dt.dayofweek
df['month'] = df['date'].dt.month

In [14]:
le = LabelEncoder()
df['product_id'] = le.fit_transform(df['product'])

In [15]:
for lag in [1, 2, 3]:
    df[f'sales_lag_{lag}'] = df.groupby('product')['sales'].shift(lag)

In [16]:
df.head()

Unnamed: 0,date,product,sales,temperature,rainfall_mm,is_holiday,event,day_of_week,month,product_id,sales_lag_1,sales_lag_2,sales_lag_3
2,2025-06-01,Bananas (1kg),108,35.2,0,1,Local Festival,6,6,0,,,
5,2025-06-02,Bananas (1kg),89,35.5,0,1,,0,6,0,108.0,,
8,2025-06-03,Bananas (1kg),82,41.1,0,0,,1,6,0,89.0,108.0,
11,2025-06-04,Bananas (1kg),101,27.0,0,0,,2,6,0,82.0,89.0,108.0
14,2025-06-05,Bananas (1kg),97,33.3,0,0,,3,6,0,101.0,82.0,89.0


In [17]:
df_model = df.dropna(subset=['sales_lag_1', 'sales_lag_2', 'sales_lag_3'])

In [18]:
df_model.head()

Unnamed: 0,date,product,sales,temperature,rainfall_mm,is_holiday,event,day_of_week,month,product_id,sales_lag_1,sales_lag_2,sales_lag_3
11,2025-06-04,Bananas (1kg),101,27.0,0,0,,2,6,0,82.0,89.0,108.0
14,2025-06-05,Bananas (1kg),97,33.3,0,0,,3,6,0,101.0,82.0,89.0
17,2025-06-06,Bananas (1kg),85,30.7,0,0,,4,6,0,97.0,101.0,82.0
20,2025-06-07,Bananas (1kg),95,31.3,5,0,,5,6,0,85.0,97.0,101.0
23,2025-06-08,Bananas (1kg),101,31.1,0,1,,6,6,0,95.0,85.0,97.0


In [19]:
features = [
    'product_id', 'temperature', 'rainfall_mm', 'is_holiday',
    'day_of_week', 'month', 'sales_lag_1', 'sales_lag_2', 'sales_lag_3'
]
target = 'sales'

In [20]:
X = df_model[features]
y = df_model[target]

In [21]:
print(f"Original data shape: {df.shape}")
print(f"After dropping NaNs: {df_model.shape}")

Original data shape: (180, 13)
After dropping NaNs: (162, 13)


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [23]:
model = XGBRegressor(n_estimators=30, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

In [24]:
y_pred = model.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE: {rmse:.2f}")

RMSE: 18.03


In [25]:
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
print(f"MAPE: {mape:.2f}%")

MAPE: 17.78%


In [26]:
joblib.dump(model, "freshcast_xgb_model.joblib")

joblib.dump(le, "product_label_encoder.joblib")

['product_label_encoder.joblib']