In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import joblib
import warnings
# NOTE(review): this silences every warning raised after this line, including
# deprecation warnings from the libraries imported above; narrow the filter
# if those messages matter.
warnings.filterwarnings('ignore') 

# Read the five raw CSV tables from the 'data/' directory. The names on the
# left line up positionally with the paths listed below.
item_categories, items, sales_train, shops, test = map(
    pd.read_csv,
    (
        'data/item_categories.csv',
        'data/items.csv',
        'data/sales_train.csv',
        'data/shops.csv',
        'data/test.csv',
    ),
)

# Parse the day.month.year strings in 'date' into proper datetime values.
raw_dates = sales_train['date']
sales_train['date'] = pd.to_datetime(raw_dates, format='%d.%m.%Y')

# Roll the daily rows up to one row per (month block, shop, item): the daily
# counts are summed into 'item_cnt_month' and the price is averaged.
monthly_group_keys = ['date_block_num', 'shop_id', 'item_id']
monthly_sales = (
    sales_train
    .groupby(monthly_group_keys, as_index=False)
    .agg(
        item_cnt_month=('item_cnt_day', 'sum'),
        item_price=('item_price', 'mean'),
    )
)

# Enrich with descriptive columns. Each lookup table is left-joined on its
# key, in the order item -> shop -> category (the category join needs the
# 'item_category_id' column that the item join brings in).
enrichment_joins = (
    (items[['item_id', 'item_category_id']], 'item_id'),
    (shops[['shop_id', 'shop_name']], 'shop_id'),
    (item_categories[['item_category_id', 'item_category_name']], 'item_category_id'),
)
for lookup_table, join_key in enrichment_joins:
    monthly_sales = monthly_sales.merge(lookup_table, on=join_key, how='left')

# Creating lag features to improve predictive power.
#
# Each lag is looked up by calendar position, i.e. the sales recorded for
# (date_block_num - lag, shop_id, item_id). A positional
# groupby(...).shift(lag) returns the pair's previous *row* instead, and since
# monthly_sales only has rows for months that appear in the sales data, that
# row can be much older than the lag in the column name claims.
lag_keys = ['date_block_num', 'shop_id', 'item_id']
lag_columns = []

# (month, shop, item) -> item_cnt_month. Built with groupby().first() so the
# index is guaranteed unique, which reindex() below requires.
monthly_lookup = monthly_sales.groupby(lag_keys)['item_cnt_month'].first()

for lag in [1, 2, 3]:
    lag_col_name = f'item_cnt_month_lag_{lag}'
    # Key of the row that sits exactly `lag` months before each current row.
    lagged_keys = pd.MultiIndex.from_arrays([
        monthly_sales['date_block_num'] - lag,
        monthly_sales['shop_id'],
        monthly_sales['item_id'],
    ])
    # reindex() yields NaN where that earlier month has no row; to_numpy()
    # drops the MultiIndex so the values are assigned by position.
    monthly_sales[lag_col_name] = monthly_lookup.reindex(lagged_keys).to_numpy()
    lag_columns.append(lag_col_name)

# Fill missing lag values with 0 (no sales). Only the lag columns are touched
# so that a stray NaN in any other column is not silently rewritten to 0.
monthly_sales[lag_columns] = monthly_sales[lag_columns].fillna(0)

# Separate the regression target from the predictors. The target itself, the
# two free-text name columns and the month index 'date_block_num' are all
# left out of the feature matrix.
non_feature_columns = ['item_cnt_month', 'shop_name', 'item_category_name', 'date_block_num']
y = monthly_sales['item_cnt_month']
X = monthly_sales.drop(columns=non_feature_columns)

# Time series split for cross-validation: each fold trains on a leading run
# of rows and validates on the rows that follow it.
# NOTE(review): this is only a temporal split if the rows of X are in
# chronological order -- confirm nothing upstream reorders them.
tscv = TimeSeriesSplit(n_splits=3)

# Initializing XGBoost with hyperparameters, spelled with the names the
# scikit-learn wrapper documents:
#   learning_rate (was `eta`), random_state (was `seed`), verbosity (was
#   `verbose`). `verbose` is not an XGBRegressor parameter, so the original
#   value was forwarded to the booster as an unknown keyword instead of
#   controlling log output.
xgb_model = xgb.XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    learning_rate=0.3,
    random_state=42,
    verbosity=0,
)

# Custom early stopping, evaluated at fixed tree-count checkpoints.
checkpoints = range(100, 1001, 100)

# Patience, counted in consecutive checkpoints without a new best RMSE.
# NOTE(review): each fold has 10 checkpoints and the first one always beats
# the initial `inf`, so at most 9 misses can accumulate and a patience of 10
# never fires. Lower this value if early stopping is actually wanted.
early_stopping_rounds = 10

# Time-based cross-validation
for train_index, test_index in tscv.split(X):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

    eval_set = [(X_train, y_train), (X_valid, y_valid)]

    # Early-stopping state belongs to one fold. Carrying it across folds
    # would compare this fold's scores against another fold's best.
    best_rmse = float('inf')
    n_rounds_no_improve = 0

    # Fit once with the largest tree count. Trees are added sequentially, so
    # scoring a prefix of the ensemble via `iteration_range` stands in for
    # retraining a smaller model at every checkpoint.
    xgb_model.n_estimators = checkpoints[-1]
    xgb_model.fit(X_train, y_train, eval_set=eval_set, verbose=False)

    for i in checkpoints:
        # Predict on the validation set with the first i trees only.
        y_pred = xgb_model.predict(X_valid, iteration_range=(0, i))
        # RMSE as sqrt(MSE): works on every scikit-learn release, unlike the
        # `squared=False` keyword, which newer releases no longer accept.
        rmse = float(np.sqrt(mean_squared_error(y_valid, y_pred)))

        print(f"Iteration {i}, RMSE: {rmse}")

        # Check for early stopping
        if rmse < best_rmse:
            best_rmse = rmse
            n_rounds_no_improve = 0
        else:
            n_rounds_no_improve += 1

        if n_rounds_no_improve >= early_stopping_rounds:
            print(f"Early stopping at iteration {i}, best RMSE: {best_rmse}")
            break

# After the loop, xgb_model holds the full-size model fitted on the training
# part of the last fold only.

# Save the trained model for deployment.
# The cross-validation loop leaves xgb_model fitted on the training part of
# its last fold, which excludes the final block of rows. Refit on every row
# so the deployed model has seen all of the data, using the hyperparameters
# currently set on the estimator.
xgb_model.fit(X, y)

# NOTE(review): a joblib dump is a pickle of the estimator; it is expected to
# be loaded with the same xgboost version that wrote it.
joblib.dump(xgb_model, 'sales_prediction_model.joblib')
print("Model saved as 'sales_prediction_model.joblib'")


Iteration 100, RMSE: 7.1848291183281585
Iteration 200, RMSE: 7.106073813919705
Iteration 300, RMSE: 7.078943216794538
Iteration 400, RMSE: 7.077612008803284
Iteration 500, RMSE: 7.063316694259713
Iteration 600, RMSE: 7.028641237097448
Iteration 700, RMSE: 7.023183553131436
Iteration 800, RMSE: 7.014067946706754
Iteration 900, RMSE: 6.996755496903188
Iteration 1000, RMSE: 6.987927300791501
Iteration 100, RMSE: 6.6938034328061855
Iteration 200, RMSE: 6.588458966157487
Iteration 300, RMSE: 6.478139475314307
Iteration 400, RMSE: 6.391835488039032
Iteration 500, RMSE: 6.353899438232068
Iteration 600, RMSE: 6.328822181178576
Iteration 700, RMSE: 6.287663648810917
Iteration 800, RMSE: 6.269253826613957
Iteration 900, RMSE: 6.255156840409776
Iteration 1000, RMSE: 6.2341113530857415
Iteration 100, RMSE: 8.703564259880254
Iteration 200, RMSE: 8.601213635657343
Iteration 300, RMSE: 8.569869397619078
Iteration 400, RMSE: 8.55823351715245
Iteration 500, RMSE: 8.535675589407777
Iteration 600, RMSE: 