In [6]:
import sys
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", module="IPython")

def is_google_colab() -> bool:
    if "google.colab" in str(get_ipython()):
        return True
    return False

def clone_repository() -> None:
    !git clone https://github.com/featurestorebook/mlfs-book.git
    %cd mlfs-book

def install_dependencies() -> None:
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

if is_google_colab():
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")
else:
    root_dir = Path().absolute()
    # Strip ~/notebooks/ccfraud from PYTHON_PATH if notebook started in one of these subdirectories
    if root_dir.parts[-1:] == ('aurora',):
        root_dir = Path(*root_dir.parts[:-1])
    if root_dir.parts[-1:] == ('notebooks',):
        root_dir = Path(*root_dir.parts[:-1])
    root_dir = str(root_dir) 
    print("Local environment")

print(f"Root dir: {root_dir}")

# Add the root directory to the `PYTHONPATH` 
if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

# Set the environment variables from the file <root_dir>/.env
from mlfs import config
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

Local environment
Root dir: C:\Users\lppap\Documents\master\scalable_ML\id2223-project
HopsworksSettings initialized!


## Imports

In [48]:
import os
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix
)
import hopsworks
from mlfs.aurora import util
import json

import warnings
warnings.filterwarnings("ignore")

FEATURE_VIEW_NAME = "aurora_fv"
FEATURE_VIEW_VERSION = 2   # change to 2 for solar wind, 1 no solar wind
MODEL_NAME = "aurora_xgboost"
MODEL_VERSION = FEATURE_VIEW_VERSION

HORIZONS = [1, 2, 3, 4, 5]

RANDOM_STATE = 42
TEST_RATIO = 0.2

## Hopsworks login & Connect

In [27]:
project = hopsworks.login(engine="python")
fs = project.get_feature_store() 

2026-01-04 17:23:16,591 INFO: Closing external client and cleaning up certificates.
Connection closed.
2026-01-04 17:23:16,600 INFO: Initializing external client
2026-01-04 17:23:16,602 INFO: Base URL: https://c.app.hopsworks.ai:443






2026-01-04 17:23:18,086 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279154


In [4]:
# Retrieve feature groups
geomagnetic_fg = fs.get_feature_group( name="geomagnetic_daily", version=1)
weather_fg = fs.get_feature_group(name="sweden_weather_daily", version=1)
solar_fg = fs.get_feature_group(name="nasa_omni_daily", version=1)

## Feature View creation

In [67]:
# Select and join features for training
OldQuery = (
    geomagnetic_fg.select(
        ["date", "kp1", "kp2", "kp3", "kp4", "kp5", "kp6", "kp7", "kp8", "ap1", "ap2", "ap3", "ap4", "ap5", "ap6", "ap7", "ap8", "ap"]
    ).join(
        weather_fg.select([
            "cloud_cover_mean",
            "precipitation_sum",
            "sunshine_duration"
        ]),
        on=["date"]
    ))

NewQuery = (
    geomagnetic_fg.select(
        ["date", "kp1", "kp2", "kp3", "kp4", "kp5", "kp6", "kp7", "kp8", "ap1", "ap2", "ap3", "ap4", "ap5", "ap6", "ap7", "ap8", "ap"]
    ).join(
        weather_fg.select([
            "cloud_cover_mean",
            "precipitation_sum",
            "sunshine_duration"
        ]),
        on=["date"]
    ).join(
        solar_fg.select([
            "vsw_lag1", "vsw_lag2",
            "bz_lag1", "bz_lag2",
            "pressure_lag1",
            "bz_3d_mean", "bz_7d_min",
            "vsw_3d_mean", "pressure_3d_max",
            "vbz_neg"
        ]),
        on=["date"]
    )
)


## Create the feature view

In [72]:
feature_view_old = fs.get_or_create_feature_view(
    name="aurora_fv",
    description="Geomagnetic and weather features for aurora visibility prediction",
    version= 1,
    labels=["ap"],
    query=OldQuery,
)

feature_view_new = fs.get_or_create_feature_view(
    name="aurora_fv",
    description="Geomagnetic, weather and solar features for aurora visibility prediction",
    version= 2,
    labels=["ap"],
    query=NewQuery,
)


In [49]:
feature_view = fs.get_feature_view(
    name=FEATURE_VIEW_NAME,
    version=FEATURE_VIEW_VERSION
)

In [50]:
X_base, y_base = feature_view.training_data(
    description="Training data for multi-horizon aurora models",
    training_dataset_version=1
)

X_base["date"] = pd.to_datetime(X_base["date"], utc=True)

print("Date range in training data:")
print(X_base["date"].min(), "→", X_base["date"].max())

print(X_base.shape)
print(X_base.columns)
X_base.head()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.39s) 
Date range in training data:
2020-01-01 00:00:00+00:00 → 2026-01-01 00:00:00+00:00
(2192, 30)
Index(['date', 'kp1', 'kp2', 'kp3', 'kp4', 'kp5', 'kp6', 'kp7', 'kp8', 'ap1',
       'ap2', 'ap3', 'ap4', 'ap5', 'ap6', 'ap7', 'ap8', 'cloud_cover_mean',
       'precipitation_sum', 'sunshine_duration', 'vsw_lag1', 'vsw_lag2',
       'bz_lag1', 'bz_lag2', 'pressure_lag1', 'bz_3d_mean', 'bz_7d_min',
       'vsw_3d_mean', 'pressure_3d_max', 'vbz_neg'],
      dtype='object')


Unnamed: 0,date,kp1,kp2,kp3,kp4,kp5,kp6,kp7,kp8,ap1,...,vsw_lag1,vsw_lag2,bz_lag1,bz_lag2,pressure_lag1,bz_3d_mean,bz_7d_min,vsw_3d_mean,pressure_3d_max,vbz_neg
0,2023-01-13 00:00:00+00:00,2.333,1.667,2.0,2.333,2.333,1.667,3.333,4.0,9.0,...,405.166656,387.625,0.220833,-0.5875,2.682917,-1.108333,-2.958333,397.569458,2.682917,-1183.086792
1,2023-01-16 00:00:00+00:00,4.0,1.667,3.0,3.0,3.333,3.333,1.667,0.667,27.0,...,448.458344,467.458344,-6.379167,0.516667,1.949583,-1.658333,-6.379167,463.638885,4.089167,0.0
2,2023-02-08 00:00:00+00:00,3.0,3.333,3.0,4.0,3.333,2.667,3.333,3.0,15.0,...,498.708344,416.041656,-1.891667,-1.5375,38.700001,-1.488889,-1.891667,490.375,38.700001,-577.239075
3,2023-03-01 00:00:00+00:00,3.0,1.0,1.333,2.667,0.667,1.333,2.0,2.0,15.0,...,645.375,737.875,0.095833,-4.841667,1.50125,-0.831944,-4.841667,664.263916,42.334164,0.0
4,2023-04-23 00:00:00+00:00,2.333,1.667,1.333,3.333,4.667,5.333,8.333,7.333,9.0,...,357.291656,379.708344,-0.983333,-1.445833,0.867917,-2.584722,-5.325,385.777771,3.537083,-2238.275146


## Feature engineering - Lagged geomagnetic features

In [51]:
# Ensure data is sorted by time
order = X_base["date"].sort_values().index
X_base = X_base.loc[order].reset_index(drop=True)
y_base = y_base.loc[order].reset_index(drop=True)

ap = y_base["ap"]

# Lagged Ap features
for lag in [1, 2, 3]:
    X_base[f"ap_lag_{lag}"] = ap.shift(lag)
    

# Lagged Kp features (daily mean + max are most informative)
kp_cols = [f"kp{i}" for i in range(1, 9)]
X_base["kp_mean"] = X_base[kp_cols].mean(axis=1)
X_base["kp_max"] = X_base[kp_cols].max(axis=1)

for lag in [1, 2, 3]:
    X_base[f"kp_mean_lag_{lag}"] = X_base["kp_mean"].shift(lag)
    X_base[f"kp_max_lag_{lag}"] = X_base["kp_max"].shift(lag)

In [52]:
# Build binary targets for all horizons
AP_THRESHOLD = 15
MAX_HORIZON = 5

y_targets = {
    h: (ap.shift(-h) >= AP_THRESHOLD).astype("int32")
    for h in range(1, MAX_HORIZON + 1)
}

X = X_base.drop(columns=["date"])

### Training loop — one model per horizon

In [53]:
# Train / test split (time-aware,
split_idx = int((1 - TEST_RATIO) * len(X))

X_train = X.iloc[:split_idx]
X_test  = X.iloc[split_idx:]

train_mask = X_train.index
test_mask  = X_test.index

## 5 days target models

Design rules:

- One horizon = one model
- Same features for all horizons
- Only the target shifts
- Same train/test split for all horizons
- One model name per horizon 

In [54]:
import pandas as pd
import seaborn as sns

def feature_importance_plot(model, model_dir):
    # Get feature importance by gain
    importance = model.get_booster().get_score(importance_type="gain")
    
    # Convert to DataFrame
    imp_df = (
        pd.DataFrame(importance.items(), columns=["feature", "gain"])
          .sort_values("gain", ascending=False)
          .head(20)
    )
    
    # Plot
    plt.figure(figsize=(6, 4))
    sns.barplot(
        data=imp_df,
        x="gain",
        y="feature",
        color="steelblue"
    )
    plt.title(f"Top 20 Feature Importances (t+{h})")
    plt.xlabel("Gain")
    plt.ylabel("Feature")
    
    plt.tight_layout()
    plt.savefig(f"{model_dir}/feature_importance_gain_tplus{h}.png")
    plt.close()

In [55]:
from xgboost import XGBClassifier
from sklearn.metrics import (
    roc_auc_score, accuracy_score,
    precision_score, recall_score,
    f1_score, confusion_matrix
)
from hsml.model_schema import ModelSchema
from hsml.schema import Schema
import pandas as pd
import os
import joblib
import seaborn as sns

MODEL_VERSION = 1
mr = project.get_model_registry()


def _dtype_to_hsml(dtype) -> str:
    if pd.api.types.is_integer_dtype(dtype):
        return "int32"
    if pd.api.types.is_float_dtype(dtype):
        return "float32"
    if pd.api.types.is_bool_dtype(dtype):
        return "boolean"
    return "string"


inputs = [
    {
        "name": col,
        "type": _dtype_to_hsml(X_train[col].dtype),
        "description": col,
    }
    for col in X_train.columns
]
input_schema = Schema(inputs)

outputs = [
    {
        "type": "float32",
        "shape": [1],
        "description": "Probability of Ap >= threshold",
    }
]
output_schema = Schema(outputs)

model_schema = ModelSchema(
    input_schema=input_schema,
    output_schema=output_schema,
)

for h in range(1, MAX_HORIZON + 1):
    print(f"\ndYs? Training model for t+{h}")

    y_train = y_targets[h].iloc[train_mask]
    y_test  = y_targets[h].iloc[test_mask]

    # Safety checks
    assert len(X_train) == len(y_train)
    assert len(X_test) == len(y_test)
    assert X_train.index.max() < X_test.index.min()

    pos = y_train.sum()
    neg = len(y_train) - pos
    scale_pos_weight = (neg / max(pos, 1))

    model = XGBClassifier(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="auc",
        random_state=42,
        scale_pos_weight=scale_pos_weight
    )

    model.fit(X_train, y_train)

    y_proba = model.predict_proba(X_test)[:, 1]
    y_pred  = (y_proba >= 0.5).astype(int)

    metrics = {
        "roc_auc": float(roc_auc_score(y_test, y_proba)),
        "accuracy": float(accuracy_score(y_test, y_pred)),
        "precision": float(precision_score(y_test, y_pred, zero_division=0)),
        "recall": float(recall_score(y_test, y_pred, zero_division=0)),
        "f1": float(f1_score(y_test, y_pred, zero_division=0)),
        "horizon_days": h,
        "ap_threshold": AP_THRESHOLD,
        "feature_view_version": FEATURE_VIEW_VERSION,
        "train_pos_class" : int(pos),
        "train_neg_class" : int(neg),
        "train_pos_ratio" : float(pos / (pos + neg))
    }

    cm = confusion_matrix(y_test, y_pred)
    print("Confusion matrix:")
    print(cm)
    print("Metrics:", metrics)

    # Save Model
    model_dir = f"aurora_model_h{h}"
    model_img_dir = model_dir + "/images"
    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(model_img_dir, exist_ok=True)

    # Creating confusion matrix
    plt.figure(figsize=(4, 3))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        cbar=False
    )
    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    plt.title(f"Confusion Matrix (t+{h})")

    plt.tight_layout()
    plt.savefig(f"{model_img_dir}/confusion_matrix_tplus{h}.png")
    plt.close()

    feature_importance_plot(model, model_img_dir)

    model_path = os.path.join(model_dir, "model.pkl")
    joblib.dump(model, model_path)
    

    model_name = f"{MODEL_NAME}_h{h}"
    model_description = (
        f"Binary aurora classifier (Ap >= 15) predicting t+{h} days ahead "
    )

    registered_model = mr.python.create_model(
        name= model_name,
        description= model_description,
        metrics=metrics,
        model_schema=model_schema,
        feature_view=feature_view,
        training_dataset_version=1
    )

    registered_model.save(model_dir)

    print(f"?. Model {model_name} saved and registered")



dYs? Training model for t+1
Confusion matrix:
[[247  48]
 [ 67  77]]
Metrics: {'roc_auc': 0.7786723163841808, 'accuracy': 0.7380410022779044, 'precision': 0.616, 'recall': 0.5347222222222222, 'f1': 0.5724907063197027, 'horizon_days': 1, 'ap_threshold': 15, 'feature_view_version': 2, 'train_pos_class': 250, 'train_neg_class': 1503, 'train_pos_ratio': 0.1426126640045636}


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h1/feature_…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h1/model.pk…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h1\images/c…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h1\images/f…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h1\images/m…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\model_schema.json: 0.000…

Model created, explore it at https://c.app.hopsworks.ai:443/p/1279154/models/aurora_xgboost_h1/2
?. Model aurora_xgboost_h1 saved and registered

dYs? Training model for t+2
Confusion matrix:
[[255  40]
 [116  28]]
Metrics: {'roc_auc': 0.599882297551789, 'accuracy': 0.6446469248291572, 'precision': 0.4117647058823529, 'recall': 0.19444444444444445, 'f1': 0.2641509433962264, 'horizon_days': 2, 'ap_threshold': 15, 'feature_view_version': 2, 'train_pos_class': 250, 'train_neg_class': 1503, 'train_pos_ratio': 0.1426126640045636}


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h2/feature_…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h2/model.pk…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h2\images/c…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h2\images/f…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h2\images/m…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\model_schema.json: 0.000…

Model created, explore it at https://c.app.hopsworks.ai:443/p/1279154/models/aurora_xgboost_h2/2
?. Model aurora_xgboost_h2 saved and registered

dYs? Training model for t+3
Confusion matrix:
[[256  39]
 [132  12]]
Metrics: {'roc_auc': 0.4741760828625235, 'accuracy': 0.6104783599088838, 'precision': 0.23529411764705882, 'recall': 0.08333333333333333, 'f1': 0.12307692307692308, 'horizon_days': 3, 'ap_threshold': 15, 'feature_view_version': 2, 'train_pos_class': 250, 'train_neg_class': 1503, 'train_pos_ratio': 0.1426126640045636}


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h3/feature_…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h3/model.pk…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h3\images/c…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h3\images/f…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h3\images/m…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\model_schema.json: 0.000…

Model created, explore it at https://c.app.hopsworks.ai:443/p/1279154/models/aurora_xgboost_h3/2
?. Model aurora_xgboost_h3 saved and registered

dYs? Training model for t+4
Confusion matrix:
[[265  30]
 [129  15]]
Metrics: {'roc_auc': 0.5279778719397363, 'accuracy': 0.6378132118451025, 'precision': 0.3333333333333333, 'recall': 0.10416666666666667, 'f1': 0.15873015873015875, 'horizon_days': 4, 'ap_threshold': 15, 'feature_view_version': 2, 'train_pos_class': 250, 'train_neg_class': 1503, 'train_pos_ratio': 0.1426126640045636}


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h4/feature_…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h4/model.pk…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h4\images/c…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h4\images/f…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h4\images/m…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\model_schema.json: 0.000…

Model created, explore it at https://c.app.hopsworks.ai:443/p/1279154/models/aurora_xgboost_h4/2
?. Model aurora_xgboost_h4 saved and registered

dYs? Training model for t+5
Confusion matrix:
[[256  39]
 [126  18]]
Metrics: {'roc_auc': 0.47991996233521655, 'accuracy': 0.6241457858769932, 'precision': 0.3157894736842105, 'recall': 0.125, 'f1': 0.1791044776119403, 'horizon_days': 5, 'ap_threshold': 15, 'feature_view_version': 2, 'train_pos_class': 250, 'train_neg_class': 1503, 'train_pos_ratio': 0.1426126640045636}


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h5/feature_…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h5/model.pk…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h5\images/c…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h5\images/f…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\aurora_model_h5\images/m…

Uploading C:\Users\lppap\Documents\master\scalable_ML\id2223-project\notebooks\aurora\model_schema.json: 0.000…

Model created, explore it at https://c.app.hopsworks.ai:443/p/1279154/models/aurora_xgboost_h5/2
?. Model aurora_xgboost_h5 saved and registered
