In [1]:
import hopsworks
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
load_dotenv()
project_name = os.getenv("HOPSWORKS_PROJECT")
api_key = os.getenv("HOPSWORKS_API_KEY")
test_start_string = os.getenv("TEST_START_DATE")
test_start_date = pd.to_datetime(test_start_string).date()

project = hopsworks.login(project=project_name, api_key_value=api_key)
fs = project.get_feature_store()

2026-01-05 19:36:51,904 INFO: Initializing external client
2026-01-05 19:36:51,904 INFO: Base URL: https://c.app.hopsworks.ai:443
To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'







2026-01-05 19:36:53,466 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1271989


In [3]:
print(test_start_date)
print(type(test_start_date))

2025-12-30
<class 'datetime.date'>


In [4]:
vehicle_fg = fs.get_or_create_feature_group(name="vehicle_trip_agg_fg", version=2)
vehicle_df = vehicle_fg.read()

weather_fg = fs.get_or_create_feature_group(name="weather_hourly_fg", version=1)
weather_df = weather_fg.read()

holiday_fg = fs.get_or_create_feature_group(name="swedish_holidays_fg", version=1)
holiday_df = holiday_fg.read()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (76.02s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.11s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.95s) 


In [5]:
vehicle_df["_date"] = vehicle_df["window_start"].dt.date
print(vehicle_df["_date"].head)

<bound method NDFrame.head of 0          2025-11-24
1          2025-11-24
2          2025-11-24
3          2025-11-24
4          2025-11-24
              ...    
6151596    2025-12-09
6151597    2025-12-09
6151598    2025-12-09
6151599    2025-12-09
6151600    2025-12-09
Name: _date, Length: 6151601, dtype: object>


In [6]:
vehicle_df["date"] = vehicle_df["_date"]
weather_df["date"] = pd.to_datetime(weather_df["date"]).dt.date
holiday_df["date"] = pd.to_datetime(holiday_df["date"]).dt.date

In [7]:
# Features to use for training
VEHICLE_FEATURES = [
    "avg_speed",
    "max_speed",
    "speed_std",
    "n_positions",
    "lat_mean",
    "lon_mean",
    "hour",
    "day_of_week",
]

WEATHER_FEATURES = [
    "temperature_2m",
    "precipitation",
    "cloud_cover",
    "wind_speed_10m",
    "snowfall",
    "rain"
]

HOLIDAY_FEATURES = [
    "is_work_free",
    "is_red_day",
    "is_day_before_holiday",
]

# Target variable
TARGET = "occupancy_mode"

In [8]:
vehicle_df["window_start"] = vehicle_df["window_start"].dt.tz_convert(None)

print(vehicle_df["window_start"].head)

<bound method NDFrame.head of 0         2025-11-24 15:59:00
1         2025-11-24 10:39:00
2         2025-11-24 17:02:00
3         2025-11-24 10:25:00
4         2025-11-24 04:32:00
                  ...        
6151596   2025-12-09 06:59:00
6151597   2025-12-09 04:49:00
6151598   2025-12-09 05:56:00
6151599   2025-12-09 15:24:00
6151600   2025-12-09 13:59:00
Name: window_start, Length: 6151601, dtype: datetime64[us]>


In [9]:
weather_df["date"] = pd.to_datetime(weather_df["date"])
holiday_df["date"] = pd.to_datetime(holiday_df["date"])
# Create merged df
merged_df = (
    vehicle_df[["trip_id", "vehicle_id", "window_start", "occupancy_mode"] + VEHICLE_FEATURES]
    .merge(weather_df[["date"] + WEATHER_FEATURES], left_on="window_start", right_on="date", how="left")
    .merge(holiday_df[["date"] + HOLIDAY_FEATURES], left_on="window_start", right_on="date", how="left")
)

# Sort by vehicle and time for lag creation
merged_df = merged_df.sort_values(by=["vehicle_id", "window_start"])

for col in HOLIDAY_FEATURES:
    if col in merged_df.columns:
        merged_df = merged_df.dropna(subset=HOLIDAY_FEATURES)
        merged_df[col] = merged_df[col].astype(int)

In [10]:
print(merged_df.columns)
print(merged_df["window_start"].isna().sum())
merged_df = merged_df.dropna(subset=["window_start"])

Index(['trip_id', 'vehicle_id', 'window_start', 'occupancy_mode', 'avg_speed',
       'max_speed', 'speed_std', 'n_positions', 'lat_mean', 'lon_mean', 'hour',
       'day_of_week', 'date_x', 'temperature_2m', 'precipitation',
       'cloud_cover', 'wind_speed_10m', 'snowfall', 'rain', 'date_y',
       'is_work_free', 'is_red_day', 'is_day_before_holiday'],
      dtype='object')
0


In [11]:
print(merged_df["window_start"].dtype)
print(merged_df["window_start"].isna().sum())

merged_df["date_x"] = pd.to_datetime(merged_df["date_x"]).dt.tz_localize(None)
merged_df["date_y"] = pd.to_datetime(merged_df["date_y"]).dt.tz_localize(None)

datetime64[us]
0


In [12]:
print(merged_df["date_x"].dtype)
merged_df.drop(['date_x', 'date_y'], axis=1, inplace=True)

print(merged_df.columns)

datetime64[ns]
Index(['trip_id', 'vehicle_id', 'window_start', 'occupancy_mode', 'avg_speed',
       'max_speed', 'speed_std', 'n_positions', 'lat_mean', 'lon_mean', 'hour',
       'day_of_week', 'temperature_2m', 'precipitation', 'cloud_cover',
       'wind_speed_10m', 'snowfall', 'rain', 'is_work_free', 'is_red_day',
       'is_day_before_holiday'],
      dtype='object')


In [13]:
merged_df["window_start"] = pd.to_datetime(merged_df["window_start"], errors="coerce")
merged_df = merged_df.dropna(subset=["window_start"])

In [14]:
LAGS = [1, 2, 3]

# Lag features for target variable
for lag in LAGS:
    merged_df[f"{TARGET}_lag_{lag}"] = merged_df.groupby("vehicle_id")[TARGET].shift(lag)
    
lag_cols = [f"{TARGET}_lag_{lag}" for lag in LAGS]
merged_df = merged_df.dropna(subset=lag_cols)

merged_df = merged_df.dropna(subset=["trip_id"])

lagged_fg = fs.get_or_create_feature_group(
    name="occupancy_lagged_fg",
    description="Vehicle, weather, holiday, traffic features with lag occupancy features",
    version=1,
    primary_key=["trip_id"],
    event_time="window_start"
)

lagged_fg.insert(merged_df)

Uploading Dataframe: 100.00% |██████████| Rows 17066/17066 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: occupancy_lagged_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1271989/jobs/named/occupancy_lagged_fg_1_offline_fg_materialization/executions


(Job('occupancy_lagged_fg_1_offline_fg_materialization', 'SPARK'), None)

In [15]:
lag_features = [f"{TARGET}_lag_{lag}" for lag in LAGS]

selected_features = lagged_fg.select(
    ["trip_id", "vehicle_id", "window_start", "occupancy_mode"] + VEHICLE_FEATURES + WEATHER_FEATURES + HOLIDAY_FEATURES + lag_features
)

feature_view_name = "occupancy_lagged_fv"
feature_view_version = 1

feature_view = fs.get_or_create_feature_view(
    name=feature_view_name,
    description="Vehicle, weather, holiday features with lagged occupancy target",
    version=feature_view_version,
    labels=[TARGET],
    query=selected_features
)

In [16]:
X_train, X_test, y_train, y_test = feature_view.train_test_split(
    test_start=test_start_date
)

print(f"Train samples: {len(X_train)}, Test samples: {len(X_test)}")


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.43s) 

Train samples: 695, Test samples: 91


In [17]:
print(X_train.columns)
print(X_test.columns)

Index(['trip_id', 'vehicle_id', 'window_start', 'avg_speed', 'max_speed',
       'speed_std', 'n_positions', 'lat_mean', 'lon_mean', 'hour',
       'day_of_week', 'temperature_2m', 'precipitation', 'cloud_cover',
       'wind_speed_10m', 'snowfall', 'rain', 'is_work_free', 'is_red_day',
       'is_day_before_holiday', 'occupancy_mode_lag_1', 'occupancy_mode_lag_2',
       'occupancy_mode_lag_3'],
      dtype='object')
Index(['trip_id', 'vehicle_id', 'window_start', 'avg_speed', 'max_speed',
       'speed_std', 'n_positions', 'lat_mean', 'lon_mean', 'hour',
       'day_of_week', 'temperature_2m', 'precipitation', 'cloud_cover',
       'wind_speed_10m', 'snowfall', 'rain', 'is_work_free', 'is_red_day',
       'is_day_before_holiday', 'occupancy_mode_lag_1', 'occupancy_mode_lag_2',
       'occupancy_mode_lag_3'],
      dtype='object')


In [18]:
X_train.describe()

Unnamed: 0,avg_speed,max_speed,speed_std,n_positions,lat_mean,lon_mean,hour,day_of_week,temperature_2m,precipitation,cloud_cover,wind_speed_10m,snowfall,rain,is_work_free,is_red_day,is_day_before_holiday,occupancy_mode_lag_1,occupancy_mode_lag_2,occupancy_mode_lag_3
count,695.0,695.0,693.0,695.0,695.0,695.0,695.0,695.0,695.0,695.0,695.0,695.0,695.0,695.0,695.0,695.0,695.0,695.0,695.0,695.0
mean,4.717767,7.701871,1.302051,943.048921,58.481095,15.837324,0.0,4.064748,3.578417,0.085755,69.889209,15.263165,0.0,0.085755,0.453237,0.32518,0.014388,0.322302,0.322302,0.322302
std,6.290149,8.505903,1.557097,3884.212934,0.110002,0.321994,0.0,1.907095,2.814172,0.269456,43.854726,5.215502,0.0,0.269456,0.498167,0.468779,0.119172,0.500438,0.500438,0.500438
min,0.0,0.0,0.0,1.0,58.0005,15.04625,0.0,0.0,-3.7,0.0,0.0,2.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,52.0,58.41736,15.642832,0.0,3.0,2.2,0.0,9.0,11.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.863333,7.2,0.921065,56.0,58.427057,15.67288,0.0,5.0,4.7,0.0,100.0,15.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8.025169,11.7,2.125375,59.0,58.58481,16.189632,0.0,6.0,5.2,0.0,100.0,20.1,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
max,43.940386,44.400002,8.337641,19881.0,58.710456,16.558377,0.0,6.0,9.1,1.3,100.0,24.1,0.0,1.3,1.0,1.0,1.0,3.0,3.0,3.0


In [19]:
X_test.describe()

Unnamed: 0,avg_speed,max_speed,speed_std,n_positions,lat_mean,lon_mean,hour,day_of_week,temperature_2m,precipitation,cloud_cover,wind_speed_10m,snowfall,rain,is_work_free,is_red_day,is_day_before_holiday,occupancy_mode_lag_1,occupancy_mode_lag_2,occupancy_mode_lag_3
count,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,41.0,41.0,41.0,41.0,41.0,41.0,91.0,91.0,91.0,91.0,91.0,91.0
mean,6.359806,8.943956,1.562693,433.241758,58.481134,15.808218,0.0,3.846154,-3.4,0.3,100.0,13.3,0.21,0.0,0.615385,0.56044,0.0,0.274725,0.274725,0.274725
std,7.696572,8.605414,1.434853,2525.156322,0.107027,0.34173,0.0,1.124703,4.496061e-16,5.620076e-17,0.0,1.798424e-15,2.810038e-17,0.0,0.4892,0.499083,0.0,0.495893,0.495893,0.495893
min,0.0,0.0,0.0,4.0,58.201298,15.046633,0.0,2.0,-3.4,0.3,100.0,13.3,0.21,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,57.0,58.416902,15.622466,0.0,3.0,-3.4,0.3,100.0,13.3,0.21,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.598246,8.6,1.420313,57.0,58.426935,15.656965,0.0,4.0,-3.4,0.3,100.0,13.3,0.21,0.0,1.0,1.0,0.0,0.0,0.0,0.0
75%,8.44386,11.25,2.396021,60.0,58.584856,16.181344,0.0,5.0,-3.4,0.3,100.0,13.3,0.21,0.0,1.0,1.0,0.0,0.5,0.5,0.5
max,43.465001,44.700001,6.22195,17676.0,58.70597,16.56046,0.0,5.0,-3.4,0.3,100.0,13.3,0.21,0.0,1.0,1.0,0.0,2.0,2.0,2.0


In [39]:
# Dropping features that had less than 0.02 in feature importance in a test run, or has no predictive power
X_features = X_train.drop(columns=['speed_std', 'avg_speed', 'trip_id', 'vehicle_id', 'window_start'])            
X_test_features = X_test.drop(columns=['speed_std', 'avg_speed', 'trip_id', 'vehicle_id', 'window_start']) 

In [40]:
X_features

Unnamed: 0,max_speed,n_positions,lat_mean,lon_mean,hour,day_of_week,temperature_2m,precipitation,cloud_cover,wind_speed_10m,snowfall,rain,is_work_free,is_red_day,is_day_before_holiday,occupancy_mode_lag_1,occupancy_mode_lag_2,occupancy_mode_lag_3
1,1.700000,58,58.585379,16.188861,0,6,4.8,0.0,100.0,7.5,0.0,0.0,0,0,0,0.0,0.0,0.0
2,26.100000,19155,58.481030,15.767578,0,0,5.7,0.0,91.0,24.1,0.0,0.0,0,0,0,0.0,0.0,0.0
3,10.800000,49,58.537755,15.046897,0,6,0.4,0.0,100.0,14.5,0.0,0.0,0,0,0,0.0,0.0,0.0
4,7.200000,49,58.401383,15.622221,0,6,0.4,0.0,100.0,14.5,0.0,0.0,0,0,0,1.0,1.0,1.0
6,10.000000,48,58.417838,15.669256,0,4,0.9,0.0,0.0,11.6,0.0,0.0,1,0,0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
780,0.000000,58,58.584248,16.210788,0,0,5.7,0.0,91.0,24.1,0.0,0.0,0,0,0,0.0,0.0,0.0
781,25.000000,56,58.228482,15.648653,0,5,7.3,0.0,100.0,22.0,0.0,0.0,1,1,0,0.0,0.0,0.0
782,5.000000,56,58.322589,15.132894,0,5,7.3,0.0,100.0,22.0,0.0,0.0,1,1,0,0.0,0.0,0.0
783,25.799999,18744,58.478619,15.755870,0,2,-1.2,0.0,100.0,4.5,0.0,0.0,0,0,0,0.0,0.0,0.0


In [41]:
y_train

Unnamed: 0,occupancy_mode
1,0
2,0
3,0
4,1
6,0
...,...
780,0
781,0
782,0
783,0


In [42]:
from xgboost import XGBClassifier
from scipy.stats import randint, uniform

XGBOOST_PARAMS = {
    "tree_method": "hist",
    "enable_categorical": True,
    "max_depth": 8,
    "learning_rate": 0.05,
    "n_estimators": 200,
    "subsample": 0.7,
    "colsample_bytree": 0.8,
    "min_child_weight": 1,
    "gamma": 0.1,
    "objective": "multi:softprob",
    "num_class": 7,  # GTFS-RT has 7 occupancy classes (0-6)
    "random_state": 42,
}

CLASS_WEIGHT_MULTIPLIER = {
    0: 1.0,   # EMPTY (72%) - baseline
    1: 2.0,   # MANY_SEATS (26%) - slight boost
    2: 10.0,  # FEW_SEATS (1%) - significant boost
    3: 20.0,  # STANDING (0.4%) - heavy boost
    4: 25.0,  # CRUSHED_STANDING - not observed yet
    5: 30.0,  # FULL - not observed yet
    6: 1.0,   # NOT_ACCEPTING_PASSENGERS - not observed yet
}


# For hyperparameter tuning
param_dist = {
    "max_depth": randint(3, 10),
    "learning_rate": uniform(0.01, 0.3),
    "n_estimators": randint(100, 300),
    "subsample": uniform(0.6, 0.4),
    "colsample_bytree": uniform(0.6, 0.4),
    "min_child_weight": randint(1, 10),
    "gamma": uniform(0, 5),
}

In [43]:
classes = np.unique(y_train)
print(len(classes))
print(classes)
print(classes[0].dtype)

4
[0 1 2 3]
int64


In [44]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
)
from sklearn.model_selection import RandomizedSearchCV

MAX_WEIGHT = 50.0
ALL_POSSIBLE_CLASSES = np.array([0, 1, 2, 3, 4, 5, 6])

def compute_sample_weights(y_train):
    """Compute sample weights to handle class imbalance with custom multipliers."""
    from sklearn.utils.class_weight import compute_class_weight

    present_classes = np.unique(y_train)
    base_weights = compute_class_weight('balanced', classes=present_classes, y=y_train)
    base_weight_dict = dict(zip(present_classes, base_weights))

    # Apply additional multipliers for severe imbalance
    weight_dict = {}
    for cls in ALL_POSSIBLE_CLASSES:
        multiplier = CLASS_WEIGHT_MULTIPLIER.get(cls, 1.0)
        weight_dict[cls] = min(base_weight_dict.get(cls, 0) * multiplier, MAX_WEIGHT)  # 0 if cls not in y_train

    print(f"  Base class weights (present in y_train): {base_weight_dict}")
    print(f"  Adjusted class weights (all possible classes): {weight_dict}")

    # Assign sample weights only for rows actually in y_train
    sample_weights = np.array([weight_dict[y] for y in y_train])
    return sample_weights

def ordinal_mae(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))

def train_model(X_train, y_train, use_class_weights=True):
    """Train XGBoost Classifier with optional class weighting."""
    print("\nTraining XGBoost Classifier...")
    print(f"  Parameters: {XGBOOST_PARAMS}")  

    model = XGBClassifier(**XGBOOST_PARAMS)

    if use_class_weights:
        sample_weights = compute_sample_weights(y_train)
        model.fit(X_train, y_train, sample_weight=sample_weights)
    else:
        model.fit(X_train, y_train)

    print("  Training complete!")
    return model


def train_model_tuned(X_train, y_train, use_class_weights=True, n_iter=20, cv=2):
    """Train XGBoost Classifier with optional class weighting and hyperparameter tuning."""
    print("\nStarting hyperparameter tuning...")
    
    X_train = X_train.astype('float32')

    base_model = XGBClassifier(**XGBOOST_PARAMS, use_label_encoder=False, n_jobs=-1)

    if use_class_weights:
        sample_weights = compute_sample_weights(y_train)
    else:
        sample_weights = None

    random_search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_dist,
        n_iter=n_iter,
        scoring='recall_macro',  
        cv=cv,
        verbose=2,
        random_state=42,
        n_jobs=-1
    )

    random_search.fit(X_train, y_train, sample_weight=sample_weights)

    print("\nBest hyperparameters found:", random_search.best_params_)
    best_model = random_search.best_estimator_

    return best_model

def predict_ordinal(models, X):
    p_ge_1 = models["ge_1"].predict_proba(X)[:, 1]
    p_ge_2 = models["ge_2"].predict_proba(X)[:, 1]
    p_ge_3 = models["ge_3"].predict_proba(X)[:, 1]

    preds = np.zeros(len(X), dtype=int)
    preds[p_ge_1 > 0.5] = 1
    preds[p_ge_2 > 0.5] = 2
    preds[p_ge_3 > 0.5] = 3

    return preds


def evaluate_model(model, X_test, y_test):
    """Evaluate model and return metrics."""
    print("\nEvaluating model...")

    # Get probabilities (since we use softprob objective)
    y_proba = model.predict_proba(X_test)
    y_pred = np.argmax(y_proba, axis=1)

    # Calculate metrics (weighted for class imbalance)
    accuracy = accuracy_score(y_test, y_pred)
    ordinal_mae_val = ordinal_mae(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
    recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

    # Also calculate per-class recall (important for rare classes)
    per_class_recall = recall_score(y_test, y_pred, average=None, zero_division=0)

    metrics = {
        "accuracy": float(accuracy),
        "ordinal_mae": float(ordinal_mae_val),
        "precision_weighted": float(precision),
        "recall_weighted": float(recall),
        "f1_weighted": float(f1),
        "recall_class_0": float(per_class_recall[0]) if len(per_class_recall) > 0 else 0,
        "recall_class_1": float(per_class_recall[1]) if len(per_class_recall) > 1 else 0,
        "recall_class_2": float(per_class_recall[2]) if len(per_class_recall) > 2 else 0,
        "recall_class_3": float(per_class_recall[3]) if len(per_class_recall) > 3 else 0,
    }

    print(f"\n  Results:")
    print(f"    Accuracy:  {accuracy:.4f}")
    print(f"    Precision: {precision:.4f} (weighted)")
    print(f"    Recall:    {recall:.4f} (weighted)")
    print(f"    F1 Score:  {f1:.4f} (weighted)")
    print(f"\n  Per-class Recall (critical for rare classes):")
    class_names = ["EMPTY", "MANY_SEATS", "FEW_SEATS", "STANDING"]
    for i, name in enumerate(class_names):
        if i < len(per_class_recall):
            print(f"    Class {i} ({name}): {per_class_recall[i]:.4f}")

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(f"\n  Confusion Matrix:")
    print(cm)

    # Classification report
    print(f"\n  Classification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))

    return metrics, y_pred


def plot_feature_importance(model, feature_names, save_path=None):
    """Plot and optionally save feature importance."""
    importance = model.feature_importances_

    # Sort by importance
    indices = np.argsort(importance)[::-1]
    sorted_features = [feature_names[i] for i in indices]
    sorted_importance = importance[indices]

    print("\n  Feature Importance (gain):")
    for feat, imp in zip(sorted_features, sorted_importance):
        print(f"    {feat}: {imp:.4f}")

    # Plot
    plt.figure(figsize=(10, 6))
    plt.barh(range(len(sorted_features)), sorted_importance[::-1])
    plt.yticks(range(len(sorted_features)), sorted_features[::-1])
    plt.xlabel("Feature Importance (Gain)")
    plt.title("XGBoost Feature Importance - Occupancy Prediction")
    plt.tight_layout()

    if save_path:
        plt.savefig(save_path)
        print(f"  Saved feature importance plot to {save_path}")

    plt.close()


def save_model_local(model, model_dir):
    """Save model to local directory."""
    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, "model.json")
    model.save_model(model_path)
    print(f"  Model saved to {model_path}")
    return model_path

In [45]:
print(y_train.columns)
print(X_features.columns)

print(y_test.columns)

Index(['occupancy_mode'], dtype='object')
Index(['max_speed', 'n_positions', 'lat_mean', 'lon_mean', 'hour',
       'day_of_week', 'temperature_2m', 'precipitation', 'cloud_cover',
       'wind_speed_10m', 'snowfall', 'rain', 'is_work_free', 'is_red_day',
       'is_day_before_holiday', 'occupancy_mode_lag_1', 'occupancy_mode_lag_2',
       'occupancy_mode_lag_3'],
      dtype='object')
Index(['occupancy_mode'], dtype='object')


In [58]:
print(X_features.columns)

Index(['avg_speed', 'max_speed', 'speed_std', 'n_positions', 'lat_mean',
       'lon_mean', 'hour', 'day_of_week', 'temperature_2m', 'precipitation',
       'cloud_cover', 'wind_speed_10m', 'snowfall', 'rain', 'is_work_free',
       'is_red_day', 'is_day_before_holiday'],
      dtype='object')


In [46]:
y_train_series = y_train['occupancy_mode']
y_train_series = y_train_series.astype(int)

print(y_train_series)

1      0
2      0
3      0
4      1
6      0
      ..
780    0
781    0
782    0
783    0
784    0
Name: occupancy_mode, Length: 695, dtype: int64


In [59]:
y_test_series = y_test['occupancy_mode']
y_test_series = y_test_series.astype(int)

print(X_test_features.shape)
print(y_test_series.shape)

(91, 18)
(91,)


In [82]:
# X_features = X_features.drop(columns=['occupancy_mode_lag_1', 'occupancy_mode_lag_2', 'occupancy_mode_lag_3'], axis=1) 
# X_test_features = X_test_features.drop(columns=['occupancy_mode_lag_1', 'occupancy_mode_lag_2', 'occupancy_mode_lag_3', 'avg_speed', 'speed_std'], axis=1) 
X_features = X_features.drop(columns=['avg_speed', 'speed_std'], axis=1)

In [83]:
print(X_test_features.columns)
print(X_features.columns)

Index(['max_speed', 'n_positions', 'lat_mean', 'lon_mean', 'hour',
       'day_of_week', 'temperature_2m', 'precipitation', 'cloud_cover',
       'wind_speed_10m', 'snowfall', 'rain', 'is_work_free', 'is_red_day',
       'is_day_before_holiday'],
      dtype='object')
Index(['max_speed', 'n_positions', 'lat_mean', 'lon_mean', 'hour',
       'day_of_week', 'temperature_2m', 'precipitation', 'cloud_cover',
       'wind_speed_10m', 'snowfall', 'rain', 'is_work_free', 'is_red_day',
       'is_day_before_holiday'],
      dtype='object')


In [84]:
# Fill missing values
# X_train = X_train.fillna(X_train.median())
# X_test = X_test.fillna(X_test.median()) 

model_dir = "./model_plots"
    
model = train_model(X_features, y_train_series)

# Evaluate
metrics, y_pred = evaluate_model(model, X_test_features, y_test_series)

plot_feature_importance(model, X_test_features.columns.tolist(),
                                       os.path.join(model_dir, "feature_importance.png"))


Training XGBoost Classifier...
  Parameters: {'tree_method': 'hist', 'enable_categorical': True, 'max_depth': 8, 'learning_rate': 0.05, 'n_estimators': 200, 'subsample': 0.7, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'gamma': 0.1, 'objective': 'multi:softprob', 'num_class': 7, 'random_state': 42}
  Base class weights (present in y_train): {0: 0.3619791666666667, 1: 0.8353365384615384, 2: 34.75, 3: 86.875}
  Adjusted class weights (all possible classes): {0: 0.3619791666666667, 1: 1.6706730769230769, 2: 50.0, 3: 50.0, 4: 0.0, 5: 0.0, 6: 0.0}
  Training complete!

Evaluating model...

  Results:
    Accuracy:  0.7363
    Precision: 0.7388 (weighted)
    Recall:    0.7363 (weighted)
    F1 Score:  0.7359 (weighted)

  Per-class Recall (critical for rare classes):
    Class 0 (EMPTY): 0.8088
    Class 1 (MANY_SEATS): 0.5714
    Class 2 (FEW_SEATS): 0.0000

  Confusion Matrix:
[[55 13  0]
 [ 9 12  0]
 [ 1  1  0]]

  Classification Report:
              precision    recall  f1-score  

In [85]:
def save_model_local(model, model_dir):
    """Save model to local directory."""
    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, "model.json")
    model.save_model(model_path)
    print(f"  Model saved to {model_path}")
    return model_path

save_model_local(model, model_dir)

  Model saved to ./model_plots/model.json


'./model_plots/model.json'

In [None]:
mr = project.get_model_registry()
MODEL_NAME = "occupancy_xgboost_model"
hopsworks_model = mr.get_model(MODEL_NAME)

# Upload model directory
# hopsworks_model.feature_view = feature_view
# hopsworks_model.save(model_dir)
# Log a new version of the model directory

 # Create model in registry
hopsworks_model = mr.python.create_model(
    name="occupancy_xgboost_model_with_lag",
    metrics=metrics,
    feature_view=feature_view,
    description="XGBoost Classifier for bus occupancy prediction (GTFS-RT classes 0-6)",
    input_example=X_test.iloc[:1].values,
)

# Upload model directory
hopsworks_model.save(model_dir)

print(f"  Model version: {hopsworks_model.version}")




  0%|          | 0/6 [00:00<?, ?it/s]

Uploading /Users/kajsalidin/Desktop/HappySardines/./model_plots/model.json: 0.000%|          | 0/1758324 elaps…

Uploading /Users/kajsalidin/Desktop/HappySardines/./model_plots/feature_importance.png: 0.000%|          | 0/3…

Uploading /Users/kajsalidin/Desktop/HappySardines/input_example.json: 0.000%|          | 0/196 elapsed<00:00 r…

Uploading /Users/kajsalidin/Desktop/HappySardines/model_schema.json: 0.000%|          | 0/1877 elapsed<00:00 r…

Model created, explore it at https://c.app.hopsworks.ai:443/p/1271989/models/occupancy_xgboost_model_with_lag/1
  Model registered as 'occupancy_xgboost_model'
  Model version: 1
