# Demand Forecasting Model Training Pipeline

This notebook trains demand forecasting models for each item-location combination using the feature store data.

In [1]:
# Import necessary libraries
import hopsworks
import joblib
import pandas as pd
import numpy as np
import os
import json
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from dotenv import load_dotenv

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


## Configuration Parameters

Set the parameters for the training pipeline.

In [2]:
# Load environment variables from a local .env file (HOST, PORT, HOPSWORKS_API_KEY
# and PROJECT are read with os.getenv when logging in to Hopsworks)
load_dotenv()

# Configure training parameters
# project_name: Hopsworks project passed to hopsworks.login (takes precedence over PROJECT)
project_name = 'many_models'
# feature_group_name: feature group to read; also the prefix of the feature view name
feature_group_name = 'demand_features'
# NOTE(review): `version` is not referenced by any of the visible cells - confirm
# whether it is meant for the feature group, the feature view, or the model.
version = 1  # Version can be incremented automatically
# model_name: prefix of every registered model name ("<model_name>_item<id>_loc<id>")
model_name = 'demand_forecaster'
# model_version = 1  # Let Hopsworks handle versioning automatically
# test_size: value forwarded as test_size to feature_view.train_test_split
test_size = 0.2
location_id = None  # Set to specific location ID to filter for a single location

## Connect to Hopsworks

Establish connection to the Hopsworks Feature Store.

In [3]:
print("Connecting to Hopsworks")

# Connect to Hopsworks.
# HOST and HOPSWORKS_API_KEY are forwarded unchanged (None when unset).
# The configured project_name takes precedence; PROJECT is only a fallback
# for when project_name is empty.
login_kwargs = {
    "host": os.getenv("HOST"),
    "api_key_value": os.getenv("HOPSWORKS_API_KEY"),
    "project": project_name or os.getenv("PROJECT"),
}

# Forward the port only when PORT is actually set. os.getenv returns None for a
# missing variable, and passing port=None explicitly would replace the login
# function's own default port instead of falling back to it. Environment values
# are strings, so convert to int (a malformed PORT fails here with a clear error).
port_env = os.getenv("PORT")
if port_env:
    login_kwargs["port"] = int(port_env)

project = hopsworks.login(**login_kwargs)
fs = project.get_feature_store()

Connecting to Hopsworks
2025-05-09 10:21:38,202 INFO: Initializing external client
2025-05-09 10:21:38,203 INFO: Base URL: https://demo.hops.works:443






2025-05-09 10:21:39,219 INFO: Python Engine initialized.

Logged in to project, explore it here https://demo.hops.works:443/p/14455


## Retrieve Feature Group

Get the feature group containing the demand data.

In [4]:
print(f"Retrieving Feature Group: {feature_group_name}")

# Look the feature group up by name only; add a "version" entry here to pin a
# fixed version if needed.
fg_lookup = {"name": feature_group_name}
demand_fg = fs.get_feature_group(**fg_lookup)

print("Feature Group Investigation")
# Preview the first five rows (kept as the last expression so the cell renders it)
demand_fg.show(5)

Retrieving Feature Group: demand_features
Feature Group Investigation
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.93s) 


Unnamed: 0,sp_id,loc_id,time_bucket,repetitive_demand_quantity,datetime
0,9032806,3,202404,21.0,2025-05-09 10:05:45.219201+00:00
1,8636975,3,202111,339.0,2025-05-09 10:05:45.219201+00:00
2,9052071,3,202406,65.0,2025-05-09 10:05:45.219201+00:00
3,9036438,3,202203,1284.0,2025-05-09 10:05:45.219201+00:00
4,8413714,3,202312,21.0,2025-05-09 10:05:45.219201+00:00


## Feature Selection and Query

Select features and prepare the query for training data.

In [5]:
print("Feature Selection")
# Select every feature of the demand feature group. The resulting query object is
# what the feature view is built from and what is read into a DataFrame later on.
query = demand_fg.select_all()

Feature Selection


## Setup Transformation Functions

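Define transformation functions for feature engineering. This step is currently disabled: the code cell below is fully commented out, and the feature view is created without transformation functions.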

In [None]:
# NOTE: this cell is currently disabled - every statement below is commented out,
# so it defines nothing when run. The feature view is likewise created with its
# transformation_functions argument commented out. Re-enable both together.
# print("Setting up transformation functions")
# # Import the built-in transformations
# from hopsworks.hsfs.builtin_transformations import label_encoder

# print("Applying label encoding to location ID")
# transformation_functions = [label_encoder("loc_id")]

# print("Created transformation function for loc_id using label_encoder")

Setting up transformation functions
Applying label encoding to location ID
Created transformation function for loc_id using label_encoder


## Create Feature View

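Get the feature view for the demand data, creating it if it does not exist yet. The label (prediction target) is `repetitive_demand_quantity`.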

In [None]:
print(f"Getting or creating feature view: {feature_group_name}_view")

# Collect the feature view definition first, then hand it to the feature store.
# transformation_functions is left out on purpose: that step is disabled.
feature_view_args = dict(
    name=f"{feature_group_name}_view",
    version=1,
    description="Feature view for demand forecasting",
    labels=["repetitive_demand_quantity"],  # the prediction target
    query=query,
)

# Returns the existing view when one with this name/version is already registered
feature_view = fs.get_or_create_feature_view(**feature_view_args)

print(f"Successfully got or created feature view: {feature_group_name}_view")

Getting or creating feature view: demand_features_view
Feature view created successfully, explore it at 
https://demo.hops.works:443/p/14455/fs/13379/fv/demand_features_view/version/1
Successfully got or created feature view: demand_features_view


## Identify Training Scope

Determine the number of models to train.

In [11]:
# Get model registry (used by the training loop to register each trained model)
mr = project.get_model_registry()

# Read the data once and reuse it
df = query.read()

# Restrict the data to the requested location *before* collecting items. Taking
# items from the unfiltered frame would count - and later try to train - items
# that never occur at the selected location.
if location_id is None:
    scope_df = df
    locations = df['loc_id'].unique()
else:
    scope_df = df[df['loc_id'] == location_id]
    locations = [location_id]
    if scope_df.empty:
        # Most likely a wrong ID or a dtype mismatch with the loc_id column
        print(f"WARNING: no rows found for location_id={location_id!r}; nothing will be trained")

# Get unique items within the selected scope
items = scope_df['sp_id'].unique()

# Number of item-location combinations the training loop will iterate over.
# This is an upper bound: combinations with too little data are skipped there.
total_models = len(items) * len(locations)

print(f"Training {total_models} models (items: {len(items)} × locations: {len(locations)})")

print(f"\nData Overview:")
print(f"Unique items: {len(items)}")
print(f"Unique locations: {len(locations)}")

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.61s) 
Training 200 models (items: 200 × locations: 1)

Data Overview:
Unique items: 200
Unique locations: 1


## Model Training Loop

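Train models for each item-location combination. For every combination, a RandomForest and an XGBoost regressor are fitted, the one with the lower test RMSE is kept, and that model is registered in the Hopsworks model registry. Combinations with too little data are skipped.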

In [None]:
# Imports needed only by this cell, hoisted out of the loop body
import shutil
import traceback

# Dictionary to store metrics for all models, keyed by "item_<id>_loc_<id>"
all_model_metrics = {}

# Counter for progress tracking
model_counter = 0

# Columns that identify a row rather than describe it; removed before fitting
id_columns = ['sp_id', 'loc_id', 'datetime']

# Loop through each item-location combination
for item in items:
    for loc in locations:
        model_counter += 1

        # Report progress for the first combination and every fifth one after it
        show_progress = model_counter % 5 == 0 or model_counter == 1
        if show_progress:
            print(f"Training model {model_counter}/{total_models} (Item: {item}, Location: {loc})")

        try:
            # Combine both conditions with `&`. Python's `and` does not build a
            # combined filter: it tests the truthiness of the left operand and
            # yields just one of the two operands, so one condition is lost.
            filter_cond = (demand_fg.sp_id == item) & (demand_fg.loc_id == loc)

            # Apply train_test_split with extra_filter
            X_train, X_test, y_train, y_test = feature_view.train_test_split(
                test_size=test_size,
                extra_filter=filter_cond,
            )

            # Skip if we don't have enough data for this combination
            if len(X_train) < 10 or len(X_test) < 5:
                continue

            # Remove ID columns (errors='ignore' tolerates columns that are absent)
            X_train = X_train.drop(id_columns, axis=1, errors='ignore')
            X_test = X_test.drop(id_columns, axis=1, errors='ignore')

            # Model name for this item-location
            model_prefix = f"{model_name}_item{item}_loc{loc}"

            # Train RandomForest
            rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
            rf_model.fit(X_train, y_train)

            # Train XGBoost
            xgb_model = XGBRegressor(n_estimators=100, random_state=42)
            xgb_model.fit(X_train, y_train)

            # Candidate models to compare on the held-out split
            models = {
                "RandomForest": rf_model,
                "XGBoost": xgb_model
            }

            # Reset the selection state for every combination so that nothing
            # from a previous iteration can leak into this one
            best_model = None
            best_model_type = None
            best_rmse = float('inf')
            best_metrics = {}

            for model_type, model in models.items():
                # Make predictions
                y_pred = model.predict(X_test)

                # Calculate metrics; cast to plain floats so they serialize cleanly
                mae = float(mean_absolute_error(y_test, y_pred))
                mse = float(mean_squared_error(y_test, y_pred))
                rmse = float(np.sqrt(mse))
                r2 = float(r2_score(y_test, y_pred))

                metrics = {
                    "mae": mae,
                    "rmse": rmse,
                    "r2": r2
                }

                if show_progress:
                    print(f"  {model_type}: RMSE: {rmse:.2f}")

                # Track best model (lowest RMSE wins; a NaN RMSE never wins)
                if rmse < best_rmse:
                    best_rmse = rmse
                    best_model = model
                    best_model_type = model_type
                    best_metrics = metrics

            # No candidate produced a comparable RMSE (e.g. NaN): there is
            # nothing meaningful to save or register for this combination
            if best_model is None:
                print(f"SKIPPED item {item}, location {loc}: no model produced a finite RMSE")
                continue

            # Store metrics for this item-location combination
            all_model_metrics[f"item_{item}_loc_{loc}"] = {
                "model_type": best_model_type,
                "metrics": best_metrics
            }

            # Create model directory
            model_dir = model_prefix
            os.makedirs(model_dir, exist_ok=True)

            try:
                # Save model in the format matching its type
                if best_model_type == "RandomForest":
                    joblib.dump(best_model, os.path.join(model_dir, "model.joblib"))
                else:  # XGBoost
                    best_model.save_model(os.path.join(model_dir, "model.json"))

                # Register model in Hopsworks
                model_api = mr.python.create_model(
                    name=model_prefix,
                    metrics=best_metrics,
                    description=f"Demand forecaster for item {item}, location {loc}",
                    input_example=X_train.iloc[0].to_dict() if not X_train.empty else None,
                    feature_view=feature_view
                )

                # Upload the model and artifacts
                model_api.save(model_dir)
            finally:
                # Clean up the local model directory even when saving,
                # registering or uploading fails, so failed runs leave no debris
                shutil.rmtree(model_dir, ignore_errors=True)

        except Exception as e:
            # One failing combination must not abort the whole run: report it
            # with its traceback and move on to the next combination
            print(f"FAILED for item {item}, location {loc}: {e}")
            traceback.print_exc()
            continue
