In [1]:
import sys
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", module="IPython")

def is_google_colab() -> bool:
    """Tell whether this notebook is running inside Google Colab.

    Returns:
        True when the string form of the object returned by ``get_ipython()``
        contains ``"google.colab"``, False otherwise.
    """
    shell_repr = str(get_ipython())
    return "google.colab" in shell_repr

def clone_repository() -> None:
    """Clone the mlfs-book repository and change into the cloned directory.

    Implemented with IPython shell (``!``) and line (``%``) magics, so it only
    works when executed by an IPython/Jupyter kernel.
    """
    # NOTE(review): there is no guard against re-running; a second call is
    # presumably issued from inside mlfs-book and would clone a nested copy — confirm.
    !git clone https://github.com/featurestorebook/mlfs-book.git
    %cd mlfs-book

def install_dependencies() -> None:
    """Install/upgrade ``uv`` and use it to install the project's dependencies.

    Dependencies are read from ``pyproject.toml`` in the current working
    directory (all extras included) and installed with ``--system``.
    Implemented with IPython shell magics, so it only works inside a kernel.
    """
    # NOTE(review): `!pip` runs whichever pip is first on PATH; `%pip` would
    # target the kernel's own environment — confirm which is intended.
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

running_in_colab = is_google_colab()

if running_in_colab:
    # On Colab the repository is cloned first; the working directory after the
    # clone is used as the project root.
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")
else:
    root_path = Path().absolute()
    # If the notebook was started from <root>/notebooks/aurora or <root>/notebooks,
    # walk back up so that the project root (not a notebook subdirectory) is used.
    for nested_dir in ("aurora", "notebooks"):
        if root_path.name == nested_dir:
            root_path = root_path.parent
    root_dir = str(root_path)
    print("Local environment")

print(f"Root dir: {root_dir}")

# Make the project root importable (needed for the `mlfs` package below).
already_on_path = root_dir in sys.path
if not already_on_path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

# Load the Hopsworks settings from <root_dir>/.env
from mlfs import config
env_file = f"{root_dir}/.env"
settings = config.HopsworksSettings(_env_file=env_file)

Local environment
Root dir: /Users/appbites/Desktop/id2223-project
Added the following directory to the PYTHONPATH: /Users/appbites/Desktop/id2223-project
HopsworksSettings initialized!


## Imports

In [2]:
import os
from datetime import datetime, timedelta
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.metrics import mean_squared_error, r2_score
import hopsworks
from mlfs.aurora import util
import json

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Log in to Hopsworks with the Python engine and get a handle to the
# project's feature store.
# NOTE(review): credentials are presumably picked up from the settings/.env
# loaded in the first cell — confirm.
project = hopsworks.login(engine="python")
fs = project.get_feature_store() 

2025-12-30 23:01:55,652 INFO: Initializing external client
2025-12-30 23:01:55,652 INFO: Base URL: https://c.app.hopsworks.ai:443






2025-12-30 23:01:57,606 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1289364


In [4]:
# Look up the two source feature groups, both pinned to version 1:
# geomagnetic indices and daily Swedish weather.
geomagnetic_fg = fs.get_feature_group(name="geomagnetic_daily", version=1)
weather_fg = fs.get_feature_group(name="sweden_weather_daily", version=1)


## Feature selection (query for the feature view)

In [5]:
# Build the training query: date + kp1..kp8 + ap1..ap8 + ap from the
# geomagnetic feature group, joined on "date" with the weather features.
kp_features = [f"kp{i}" for i in range(1, 9)]
ap_features = [f"ap{i}" for i in range(1, 9)]
geomagnetic_columns = ["date", *kp_features, *ap_features, "ap"]

geomagnetic_query = geomagnetic_fg.select(geomagnetic_columns)
weather_query = weather_fg.select_features()
selected_features = geomagnetic_query.join(weather_query, on=["date"])


2025-12-30 23:02:11,739 INFO: Using ['cloud_cover_mean', 'precipitation_sum', 'sunshine_duration'] from feature group `sweden_weather_daily` as features for the query. To include primary key and event time use `select_all`.


## Create the feature view

In [6]:
# Fetch (or create) version 1 of the "aurora_fv" feature view backed by the
# joined geomagnetic + weather query. No label columns are declared here.
# NOTE(review): if the view already exists, `query` is presumably ignored and the
# stored definition is returned — confirm when changing `selected_features`.
feature_view = fs.get_or_create_feature_view(
    name="aurora_fv",
    description="Geomagnetic and weather features for aurora visibility prediction",
    version=1,
    query=selected_features,
)


In [7]:
# Read the full feature view into memory. The second element of the returned
# pair is discarded (presumably the labels — the feature view declares none).
X, _ = feature_view.training_data(
    description="Training data for aurora model"
)

# Convert "date" to timezone-aware UTC datetimes so it can later be compared
# against the tz-aware train/test split timestamp.
X["date"] = pd.to_datetime(X["date"], utc=True)


# Quick sanity check of what was loaded: dimensions, column names, first rows.
print(X.shape)
print(X.columns)
X.head()


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.67s) 
(2190, 21)
Index(['date', 'kp1', 'kp2', 'kp3', 'kp4', 'kp5', 'kp6', 'kp7', 'kp8', 'ap1',
       'ap2', 'ap3', 'ap4', 'ap5', 'ap6', 'ap7', 'ap8', 'ap',
       'cloud_cover_mean', 'precipitation_sum', 'sunshine_duration'],
      dtype='object')


Unnamed: 0,date,kp1,kp2,kp3,kp4,kp5,kp6,kp7,kp8,ap1,...,ap3,ap4,ap5,ap6,ap7,ap8,ap,cloud_cover_mean,precipitation_sum,sunshine_duration
0,2020-02-17 00:00:00+00:00,1.333,0.333,1.333,0.667,1.0,1.333,3.0,3.0,5.0,...,5.0,3.0,4.0,5.0,15.0,15.0,7.0,80.333336,1.1,14275.975586
1,2020-03-15 00:00:00+00:00,0.0,0.333,0.333,1.667,1.333,1.0,1.667,1.667,0.0,...,2.0,6.0,5.0,4.0,6.0,6.0,4.0,88.583336,1.7,469.042084
2,2020-04-06 00:00:00+00:00,1.0,1.0,0.333,0.0,1.0,0.333,0.0,0.0,4.0,...,2.0,0.0,4.0,2.0,0.0,0.0,2.0,67.708336,0.0,39656.863281
3,2020-04-10 00:00:00+00:00,0.0,0.667,1.0,1.333,1.0,2.333,1.0,0.333,0.0,...,4.0,5.0,4.0,9.0,4.0,2.0,4.0,67.166664,0.6,44831.945312
4,2020-04-15 00:00:00+00:00,3.0,2.667,1.667,1.0,1.0,0.667,1.0,1.333,15.0,...,6.0,4.0,4.0,3.0,4.0,5.0,7.0,75.25,0.8,19225.4375


## Feature engineering - Lagged geomagnetic features

In [8]:
# Lags are computed by shifting rows, so the frame has to be in chronological
# order with a clean 0..n-1 index first.
# NOTE(review): a row shift equals a day shift only if there is exactly one row
# per consecutive day — confirm there are no gaps in "date".
X = X.sort_values("date").reset_index(drop=True)

LAG_STEPS = [1, 2, 3]

# Ap value from 1, 2 and 3 rows earlier
for lag in LAG_STEPS:
    X[f"ap_lag_{lag}"] = X["ap"].shift(lag)

# Per-row summary of the eight Kp columns (kp1..kp8): mean and max
kp_cols = [f"kp{i}" for i in range(1, 9)]
kp_values = X[kp_cols]

X["kp_mean"] = kp_values.mean(axis=1)
X["kp_max"] = kp_values.max(axis=1)

# Lagged versions of the two Kp summaries (mean first, then max, per lag)
for lag in LAG_STEPS:
    for kp_stat in ("kp_mean", "kp_max"):
        X[f"{kp_stat}_lag_{lag}"] = X[kp_stat].shift(lag)


## Targets for the next 5 days (d1–d5)

In [None]:
# Binary targets for future aurora conditions, one per forecast day.
# y_dK is 1 when the Ap index K rows ahead is at least 15 (aurora-favorable),
# else 0. Rows whose future Ap is missing (the last K rows) compare as False
# and therefore also get 0.

def _ap_favorable_in(days_ahead):
    """Return the int32 0/1 flag series for Ap >= 15 `days_ahead` rows ahead."""
    future_ap = X["ap"].shift(-days_ahead)
    return (future_ap >= 15).astype("int32")

y_d1, y_d2, y_d3, y_d4, y_d5 = [_ap_favorable_in(k) for k in range(1, 6)]


In [None]:
# Trim the tail of the dataset so features and labels stay aligned.
# The final 5 rows have no real Ap value at t+1..t+5 (shift(-k) ran off the
# end of the frame), so their labels are not trustworthy and are discarded.
# NOTE: this cell trims 5 more rows each time it is re-run.

valid_rows = X.index[:-5]

def _keep_valid(data):
    """Restrict `data` to `valid_rows` and renumber its index from 0."""
    return data.loc[valid_rows].reset_index(drop=True)

X = _keep_valid(X)
y_d1, y_d2, y_d3, y_d4, y_d5 = [
    _keep_valid(target) for target in (y_d1, y_d2, y_d3, y_d4, y_d5)
]


In [11]:
# Remove the raw `ap` column from the feature matrix; the ap_lag_* features and
# the targets were already derived from it in the cells above.
# errors="ignore" keeps this cell re-runnable: on a second execution `ap` is
# already gone and a plain drop would raise KeyError.
X = X.drop(columns=["ap"], errors="ignore")

In [12]:
# Chronological split: rows dated before 2025-05-01 (UTC) are used for
# training, rows on or after that date for testing.
test_start = pd.Timestamp("2025-05-01", tz="UTC")

row_dates = X["date"]
train_mask = row_dates.lt(test_start)
test_mask = row_dates.ge(test_start)

# "date" is only needed for splitting; it is not a model input.
model_inputs = X.drop(columns=["date"])
X_train = model_inputs.loc[train_mask]
X_test = model_inputs.loc[test_mask]


In [13]:
# Split each horizon's labels with the same masks used for the features, so
# y_train[h] / y_test[h] line up row-for-row with X_train / X_test.
labels_by_horizon = {
    "d1": y_d1,
    "d2": y_d2,
    "d3": y_d3,
    "d4": y_d4,
    "d5": y_d5,
}

y_train = {key: labels.loc[train_mask] for key, labels in labels_by_horizon.items()}
y_test = {key: labels.loc[test_mask] for key, labels in labels_by_horizon.items()}


In [14]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score

# One classifier per forecast horizon, keyed by "d1".."d5".
models = {}

# Hyperparameters shared by every horizon; only the class weight differs.
shared_params = dict(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric="logloss",
)

for horizon in ["d1", "d2", "d3", "d4", "d5"]:
    horizon_labels = y_train[horizon]

    # Weight positives by the negative/positive ratio of this horizon's
    # training labels to compensate for class imbalance.
    n_negative = (horizon_labels == 0).sum()
    n_positive = (horizon_labels == 1).sum()

    model = XGBClassifier(
        scale_pos_weight=n_negative / n_positive,
        **shared_params,
    )
    model.fit(X_train, horizon_labels)
    models[horizon] = model



In [15]:
from sklearn.metrics import roc_auc_score

# Report the test-set ROC AUC of each horizon's classifier, scored on the
# predicted probability of the positive class (column 1 of predict_proba).
for h, fitted_model in models.items():
    positive_proba = fitted_model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test[h], positive_proba)
    print(f"{h} ROC AUC:", auc)


d1 ROC AUC: 0.7587878787878787
d2 ROC AUC: 0.5328461597016062
d3 ROC AUC: 0.4941860465116279
d4 ROC AUC: 0.49796232218377545
d5 ROC AUC: 0.49033704390847244


In [None]:
# Handle to the project's model registry, used below to register the trained model.
mr = project.get_model_registry()

In [None]:
# Name and description under which the model is registered.
# NOTE(review): the name says "3day" while the description and the targets built
# above cover 5 days. The name is a registry key, so renaming it would affect any
# consumer that looks the model up by name — confirm before changing.
# NOTE(review): each trained classifier predicts one specific future day
# (d1..d5), not "within the next 5 days" — confirm the description matches the
# artifact that is actually saved.
model_name = "aurora_xgboost_3day"
model_description = (
    "XGBoost classifier predicting probability of aurora-favorable "
    "geomagnetic conditions occurring within the next 5 days, "
    "based on geomagnetic indices (Kp, Ap) and weather features."
)

In [47]:
import os
import joblib

# Local directory whose contents are uploaded to the model registry.
model_dir = "aurora_model"
os.makedirs(model_dir, exist_ok=True)

# Persist every horizon's classifier (model_d1.pkl .. model_d5.pkl) so that all
# trained models reach the registry, not just one of them.
for horizon_key, horizon_model in models.items():
    joblib.dump(horizon_model, os.path.join(model_dir, f"model_{horizon_key}.pkl"))

# `model.pkl` keeps the artifact name that existing consumers load. It holds the
# d5 classifier — the object the training loop's leftover `model` variable points
# to after the loop — now referenced explicitly through `models`, so this cell no
# longer depends on a leaked loop variable or on execution order.
# NOTE(review): d5 had the lowest test ROC AUC (~0.49, d1 was ~0.76); consider
# pointing `model.pkl` at a different horizon once consumers are updated.
model_path = os.path.join(model_dir, "model.pkl")
joblib.dump(models["d5"], model_path)


['aurora_model/model.pkl']

In [48]:
# Create the registry entry for the Python model, then upload everything in
# `model_dir` as its artifacts.
registry_entry = {"name": model_name, "description": model_description}
registered_model = mr.python.create_model(**registry_entry)

registered_model.save(model_dir)


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading /Users/appbites/Desktop/id2223-project/notebooks/aurora/aurora_model/model.pkl: 0.000%|          | 0…

Model created, explore it at https://c.app.hopsworks.ai:443/p/1289364/models/aurora_xgboost_3day/1


Model(name: 'aurora_xgboost_3day', version: 1)