In [1]:
import sys
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", module="IPython")

def is_google_colab() -> bool:
    if "google.colab" in str(get_ipython()):
        return True
    return False

def clone_repository() -> None:
    !git clone https://github.com/featurestorebook/mlfs-book.git
    %cd mlfs-book

def install_dependencies() -> None:
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

if is_google_colab():
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")
else:
    root_dir = Path().absolute()
    # Strip ~/notebooks/ccfraud from PYTHON_PATH if notebook started in one of these subdirectories
    #if root_dir.parts[-1:] == ('aurora',):
        root_dir = Path(*root_dir.parts[:-1])
    if root_dir.parts[-1:] == ('notebooks',):
        root_dir = Path(*root_dir.parts[:-1])
    root_dir = str(root_dir) 
    print("Local environment")

print(f"Root dir: {root_dir}")

# Add the root directory to the `PYTHONPATH` 
if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

# Set the environment variables from the file <root_dir>/.env
from mlfs import config
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

Local environment
Root dir: C:\Users\lppap\Documents\master\scalable_ML\id2223-project
Added the following directory to the PYTHONPATH: C:\Users\lppap\Documents\master\scalable_ML\id2223-project
HopsworksSettings initialized!


## Imports & Constants

In [2]:
import datetime as datetime
import pandas as pd
from xgboost import XGBClassifier
import hopsworks
import json
from mlfs.aurora import util
import os
import joblib

In [16]:
FEATURE_VIEW_NAME = "aurora_fv"
FEATURE_VIEW_VERSION = 2

KP_FG = dict(name="geomagnetic_daily", version=1)
WEATHER_FG = dict(name="sweden_weather_daily", version=1)
SOLAR_FG = dict(name="nasa_omni_daily", version=2)  # OJO: aquÃ­ insertaremos el solar_wind realtime normalizado

PRED_FG = ("aurora_predictions", 1)

MODEL_NAME = "aurora_model"   # el prefijo que usas en training: f"{MODEL_NAME}_h{h}"
MAX_HORIZON = 5               # o el que uses
AP_THRESHOLD = 15             # tu umbral del evento

RUN_DATE = datetime.datetime.utcnow().date()- datetime.timedelta(days=1)

## Hopsworks Login

In [4]:
project = hopsworks.login(engine="python")
fs = project.get_feature_store()    # Feature Store
mr = project.get_model_registry()  # Model Registry

2026-01-04 21:48:14,609 INFO: Initializing external client
2026-01-04 21:48:14,610 INFO: Base URL: https://c.app.hopsworks.ai:443






To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'

2026-01-04 21:48:16,424 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279154


## 1. Fetch the Inference Data and Add it to the Feature Store

In [30]:
def fetch_kp():
    # Obtain data
    df = util.get_latest_complete_kp_from_nowcast()

    # Insert into feature store
    #kp_fg = fs.get_feature_group(**KP_FG)
    #kp_fg.insert(df, wait=True)

    print(df)
    
    return df

In [29]:
def fetch_weather(run_date):

    LATITUDE = 62.0
    LONGITUDE = 15.0

    run_date = pd.to_datetime(run_date).date()
    date_str = run_date.strftime("%Y-%m-%d")

    df = util.get_historical_weather_sweden(
        start_date=date_str,
        end_date=date_str,
        latitude=LATITUDE,
        longitude=LONGITUDE,
    )

    if len(df) != 1:
        raise RuntimeError("fetch_weather must return exactly one row")

    print(df)

    #weather_fg = fs.get_feature_group(**WEATHER_FG)
    #weather_fg.insert(df, wait=True)
    
    return df

In [28]:
import requests
import importlib
from mlfs.aurora import util

importlib.reload(util)


def fetch_solar(run_date):

    run_date = pd.to_datetime(run_date).date()
    # 1. Obtain Newest Solar Data
    df = util.fetch_newest_solar_data(run_date)
    
    # 2. Do calculations
    daily_df = df.resample('D', on='date').mean().reset_index()
    df_engi = util.solar_feature_engineering(daily_df)

    df_final = df_engi[df_engi["date"].dt.date == run_date]

    # 3. Add to Hopsworks FG
    print(df_final.columns)
    if len(df_final) != 1:
        raise RuntimeError("fetch_weather must return exactly one row")

    #solar_fg = fs.get_feature_group(**SOLAR_FG)
    #solar_fg.insert(df_final, wait=True)

    print(df_final)
    
    return df_final

In [8]:
def build_features(raw: dict) -> pd.DataFrame:
    df = build_feature_dataframe(
        solar_wind=raw["solar_wind"],
        kp=raw["kp"],
        weather=raw["weather"],
    )

    # Ãºltimo timestamp disponible
    df = df.sort_index().iloc[-1:]
    return df

In [9]:
def load_model(model_name: str):
    mr = project.get_model_registry()
    model = mr.get_model(
        name=model_name,
        alias="champion"
    )
    return model.download()

### Steps of the main Loop

In [21]:
# Step 1
def fetch_latest_raw_data():
    kp_df = fetch_kp()

    date = pd.to_datetime(kp_df["date"].iloc[0]).normalize()

    print("last data from kp: ", date)
    
    fetch_weather(date)
    fetch_solar(date)
    
    return date

In [33]:
# Step 2

def obtain_data_fv(date):
    print ("Date trying to obtain from feature View", date)

    date = pd.to_datetime(date, utc=True).normalize()
    
    fv = fs.get_feature_view(
        name=FEATURE_VIEW_NAME,
        version=FEATURE_VIEW_VERSION
    )

    #######
    # Trae todo y quÃ©date con la mÃ¡s reciente
    X = fv.get_batch_data()

    if len(X) == 0:
        raise RuntimeError("Feature View is empty")

    # Ordenar por event time y quedarnos con la Ãºltima fila
    X = X.sort_values("date").iloc[-1:]
    print("Feature View Date: ", X)
    #########

    '''
    X = fv.get_batch_data(
        start_time=date,
        end_time=date + pd.Timedelta(days=1)
    )

    if len(X) != 1:
        raise RuntimeError(f"Expected 1 row from Feature View, got {len(X)}")

    X = X.drop(columns=["date"], errors="ignore")

    print(X)
    '''

    return X
    

date_raw = pd.to_datetime("2026-01-03 00:00:00")
date = pd.Timestamp("2026-01-03 00:00:00+00:00")
X = obtain_data_fv(date_raw)
X
    

Date trying to obtain from feature View 2026-01-03 00:00:00
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.08s) 
Feature View Date:                            date  kp1  kp2  kp3  kp4  kp5    kp6    kp7  kp8  \
1101 2026-01-03 00:00:00+00:00  3.0  3.0  2.0  2.0  3.0  2.667  2.333  1.0   

       ap1  ...  vsw_lag1  vsw_lag2  bz_lag1  bz_lag2  pressure_lag1  \
1101  15.0  ...       NaN       NaN      NaN      NaN            NaN   

      bz_3d_mean  bz_7d_min  vsw_3d_mean  pressure_3d_max  vbz_neg  
1101         NaN        NaN          NaN              NaN      NaN  

[1 rows x 30 columns]


Unnamed: 0,date,kp1,kp2,kp3,kp4,kp5,kp6,kp7,kp8,ap1,...,vsw_lag1,vsw_lag2,bz_lag1,bz_lag2,pressure_lag1,bz_3d_mean,bz_7d_min,vsw_3d_mean,pressure_3d_max,vbz_neg
1101,2026-01-03 00:00:00+00:00,3.0,3.0,2.0,2.0,3.0,2.667,2.333,1.0,15.0,...,,,,,,,,,,


In [22]:
# Step 3
def run_models(X: pd.DataFrame) -> pd.DataFrame:
    mr = project.get_model_registry()
    
    results = []

    for h in range(1, MAX_HORIZON + 1):
        model_name = f"{MODEL_NAME}_h{h}"

         model = mr.get_model(
            name=model_name,
            version=MODEL_VERSION,
        )

        model_dir = model.download()

        clf = joblib.load(f"{model_dir}/model.pkl")
        feature_names = joblib.load(f"{model_dir}/feature_names.pkl")

        X_inf = X[feature_names]

        y_proba = clf.predict_proba(X_inf)[:, 1]
        threshold = model.metadata.get("threshold", 0.5)

        y_pred = (y_proba >= threshold).astype(int)

        results.append({
            "timestamp": X.index[0],
            "horizon_days": h,
            "ap_threshold": AP_THRESHOLD,
            "probability": float(y_proba[0]),
            "prediction": int(y_pred[0]),
        })

    return pd.DataFrame(results)


IndentationError: unexpected indent (2927939997.py, line 10)

In [13]:
# Step 4
def save_predictions(df: pd.DataFrame):
    fg = fs.get_or_create_feature_group(
        name="aurora_predictions",
        version=1,
        primary_key=["timestamp", "horizon_days"],
        description="Daily batch aurora predictions",
    )

    fg.insert(df)

## Main Loop

In [31]:
def run_daily_inference():
    date = fetch_latest_raw_data()

    print("Date used one the fetch data raw", date)
    X = obtain_data_fv(date)

    print(X)
    #predictions = run_models(features)
    #save_predictions(predictions)

run_daily_inference()



        date  kp1  kp2  kp3  kp4  kp5    kp6    kp7  kp8   ap1   ap2  ap3  \
0 2026-01-03  3.0  3.0  2.0  2.0  3.0  2.667  2.333  1.0  15.0  15.0  7.0   

   ap4   ap5   ap6  ap7  ap8    ap  
0  7.0  15.0  12.0  9.0  4.0  10.0  
last data from kp:  2026-01-03 00:00:00
        date  cloud_cover_mean  precipitation_sum  sunshine_duration
0 2026-01-03             100.0                1.6                0.0
ðŸ§¹ Dropped 6 rows due to NaNs
ðŸ“Š Remaining rows: 2
Index(['date', 'density', 'vsw', 'bz', 'pressure', 'vsw_lag1', 'bz_lag1',
       'pressure_lag1', 'vsw_lag2', 'bz_lag2', 'pressure_lag2', 'vsw_lag3',
       'bz_lag3', 'pressure_lag3', 'bz_3d_mean', 'bz_7d_min', 'vsw_3d_mean',
       'pressure_3d_max', 'vbz', 'vbz_neg'],
      dtype='object')
        date   density         vsw      bz  pressure    vsw_lag1   bz_lag1  \
0 2026-01-03  1.079533  510.620361 -0.9907  0.461538  563.555054 -1.014037   

   pressure_lag1    vsw_lag2   bz_lag2  pressure_lag2    vsw_lag3  bz_lag3  \
0       