In [1]:
import sys
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", module="IPython")

def is_google_colab() -> bool:
    if "google.colab" in str(get_ipython()):
        return True
    return False

def clone_repository() -> None:
    !git clone https://github.com/featurestorebook/mlfs-book.git
    %cd mlfs-book

def install_dependencies() -> None:
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

if is_google_colab():
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")
else:
    root_dir = Path().absolute()
    # Strip ~/notebooks/ccfraud from PYTHON_PATH if notebook started in one of these subdirectories
    if root_dir.parts[-1:] == ('aurora',):
        root_dir = Path(*root_dir.parts[:-1])
    if root_dir.parts[-1:] == ('notebooks',):
        root_dir = Path(*root_dir.parts[:-1])
    root_dir = str(root_dir) 
    print("Local environment")

print(f"Root dir: {root_dir}")

# Add the root directory to the `PYTHONPATH` 
if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

# Set the environment variables from the file <root_dir>/.env
from mlfs import config
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

Local environment
Root dir: /Users/appbites/Desktop/id2223-project
Added the following directory to the PYTHONPATH: /Users/appbites/Desktop/id2223-project
HopsworksSettings initialized!


## Imports

In [2]:
import datetime
import requests
import pandas as pd
import hopsworks
from mlfs.aurora import util
import datetime
from pathlib import Path
import json
import re
import os
import warnings
warnings.filterwarnings("ignore")

## Hopsworks login

In [3]:
project = hopsworks.login(engine="python")

2025-12-30 21:48:44,449 INFO: Initializing external client
2025-12-30 21:48:44,450 INFO: Base URL: https://c.app.hopsworks.ai:443






2025-12-30 21:48:46,426 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1289364


In [4]:
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)

csv_file = f"{root_dir}/data/kpdata.csv"
util.check_file_path(csv_file)

kp_yesterday_df = util.get_kp(csv_file, yesterday)
kp_yesterday_df.head()


File successfully found at the path: /Users/appbites/Desktop/id2223-project/data/kpdata.csv


Unnamed: 0,date,Kp1,Kp2,Kp3,Kp4,Kp5,Kp6,Kp7,Kp8,ap1,ap2,ap3,ap4,ap5,ap6,ap7,ap8,Ap
2189,2025-12-29,2.667,2.667,2.333,2.667,1.667,1.0,0.333,1.667,12.0,12.0,9.0,12.0,6.0,4.0,2.0,6.0,8.0


## Read the csv 

In [5]:
df = pd.read_csv(csv_file, skipinitialspace=True)

# Construct date from YYYY, MM, DD
df["date"] = pd.to_datetime(
    dict(year=df.YYYY, month=df.MM, day=df.DD)
)

df


Unnamed: 0,YYYY,MM,DD,days,days_m,BSR,dB,Kp1,Kp2,Kp3,...,ap5,ap6,ap7,ap8,Ap,SN,F10.7obs,F10.7adj,D,date
0,2020,1,1,32142,32142.5,2542,22,0.333,0.000,0.000,...,3,5,4,3,2,6,71.8,69.4,2,2020-01-01
1,2020,1,2,32143,32143.5,2542,23,0.000,0.000,0.333,...,2,3,4,4,2,12,71.9,69.5,2,2020-01-02
2,2020,1,3,32144,32144.5,2542,24,0.667,1.000,2.667,...,4,3,2,5,5,13,71.2,68.9,2,2020-01-03
3,2020,1,4,32145,32145.5,2542,25,1.667,0.667,0.667,...,4,7,9,7,5,12,72.2,69.8,2,2020-01-04
4,2020,1,5,32146,32146.5,2542,26,1.667,1.000,1.667,...,7,7,18,12,8,14,71.8,69.4,2,2020-01-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185,2025,12,25,34327,34327.5,2623,20,3.000,3.333,3.000,...,9,9,4,6,11,123,162.5,157.2,0,2025-12-25
2186,2025,12,26,34328,34328.5,2623,21,1.333,2.333,2.000,...,7,18,3,3,7,120,166.4,160.9,0,2025-12-26
2187,2025,12,27,34329,34329.5,2623,22,1.333,3.000,2.000,...,4,4,7,6,7,140,177.7,171.8,0,2025-12-27
2188,2025,12,28,34330,34330.5,2623,23,2.333,0.667,1.000,...,6,15,12,9,8,152,187.7,181.6,0,2025-12-28


## Check data types

In [6]:
df_kp = df[
    [
        "date",
        "Kp1", "Kp2", "Kp3", "Kp4", "Kp5", "Kp6", "Kp7", "Kp8",
        "ap1", "ap2", "ap3", "ap4", "ap5", "ap6", "ap7", "ap8",
        "Ap"
    ]
].copy()

# Ensure numeric types
feature_cols = [c for c in df_kp.columns if c != "date"]
df_kp[feature_cols] = df_kp[feature_cols].astype("float32")

df_kp


Unnamed: 0,date,Kp1,Kp2,Kp3,Kp4,Kp5,Kp6,Kp7,Kp8,ap1,ap2,ap3,ap4,ap5,ap6,ap7,ap8,Ap
0,2020-01-01,0.333,0.000,0.000,0.667,0.667,1.333,1.000,0.667,2.0,0.0,0.0,3.0,3.0,5.0,4.0,3.0,2.0
1,2020-01-02,0.000,0.000,0.333,0.333,0.333,0.667,1.000,1.000,0.0,0.0,2.0,2.0,2.0,3.0,4.0,4.0,2.0
2,2020-01-03,0.667,1.000,2.667,2.000,1.000,0.667,0.333,1.333,3.0,4.0,12.0,7.0,4.0,3.0,2.0,5.0,5.0
3,2020-01-04,1.667,0.667,0.667,0.667,1.000,2.000,2.333,2.000,6.0,3.0,3.0,3.0,4.0,7.0,9.0,7.0,5.0
4,2020-01-05,1.667,1.000,1.667,1.333,2.000,2.000,3.333,2.667,6.0,4.0,6.0,5.0,7.0,7.0,18.0,12.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185,2025-12-25,3.000,3.333,3.000,2.333,2.333,2.333,1.000,1.667,15.0,18.0,15.0,9.0,9.0,9.0,4.0,6.0,11.0
2186,2025-12-26,1.333,2.333,2.000,2.000,2.000,3.333,0.667,0.667,5.0,9.0,7.0,7.0,7.0,18.0,3.0,3.0,7.0
2187,2025-12-27,1.333,3.000,2.000,1.667,1.000,1.000,2.000,1.667,5.0,15.0,7.0,6.0,4.0,4.0,7.0,6.0,7.0
2188,2025-12-28,2.333,0.667,1.000,2.000,1.667,3.000,2.667,2.333,9.0,3.0,4.0,7.0,6.0,15.0,12.0,9.0,8.0


In [7]:
df_kp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2190 entries, 0 to 2189
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    2190 non-null   datetime64[ns]
 1   Kp1     2190 non-null   float32       
 2   Kp2     2190 non-null   float32       
 3   Kp3     2190 non-null   float32       
 4   Kp4     2190 non-null   float32       
 5   Kp5     2190 non-null   float32       
 6   Kp6     2190 non-null   float32       
 7   Kp7     2190 non-null   float32       
 8   Kp8     2190 non-null   float32       
 9   ap1     2190 non-null   float32       
 10  ap2     2190 non-null   float32       
 11  ap3     2190 non-null   float32       
 12  ap4     2190 non-null   float32       
 13  ap5     2190 non-null   float32       
 14  ap6     2190 non-null   float32       
 15  ap7     2190 non-null   float32       
 16  ap8     2190 non-null   float32       
 17  Ap      2190 non-null   float32       
dtypes: datet

In [8]:
df_kp.dropna(inplace=True)
df_kp

Unnamed: 0,date,Kp1,Kp2,Kp3,Kp4,Kp5,Kp6,Kp7,Kp8,ap1,ap2,ap3,ap4,ap5,ap6,ap7,ap8,Ap
0,2020-01-01,0.333,0.000,0.000,0.667,0.667,1.333,1.000,0.667,2.0,0.0,0.0,3.0,3.0,5.0,4.0,3.0,2.0
1,2020-01-02,0.000,0.000,0.333,0.333,0.333,0.667,1.000,1.000,0.0,0.0,2.0,2.0,2.0,3.0,4.0,4.0,2.0
2,2020-01-03,0.667,1.000,2.667,2.000,1.000,0.667,0.333,1.333,3.0,4.0,12.0,7.0,4.0,3.0,2.0,5.0,5.0
3,2020-01-04,1.667,0.667,0.667,0.667,1.000,2.000,2.333,2.000,6.0,3.0,3.0,3.0,4.0,7.0,9.0,7.0,5.0
4,2020-01-05,1.667,1.000,1.667,1.333,2.000,2.000,3.333,2.667,6.0,4.0,6.0,5.0,7.0,7.0,18.0,12.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185,2025-12-25,3.000,3.333,3.000,2.333,2.333,2.333,1.000,1.667,15.0,18.0,15.0,9.0,9.0,9.0,4.0,6.0,11.0
2186,2025-12-26,1.333,2.333,2.000,2.000,2.000,3.333,0.667,0.667,5.0,9.0,7.0,7.0,7.0,18.0,3.0,3.0,7.0
2187,2025-12-27,1.333,3.000,2.000,1.667,1.000,1.000,2.000,1.667,5.0,15.0,7.0,6.0,4.0,4.0,7.0,6.0,7.0
2188,2025-12-28,2.333,0.667,1.000,2.000,1.667,3.000,2.667,2.333,9.0,3.0,4.0,7.0,6.0,15.0,12.0,9.0,8.0


In [9]:
earliest_kp_date = df_kp["date"].min().strftime("%Y-%m-%d")
end_date = (datetime.date.today() - datetime.timedelta(days=1)).strftime("%Y-%m-%d")

latitude = 62.0
longitude = 15.0

weather_df = util.get_historical_weather_sweden(
    earliest_kp_date,
    end_date,
    latitude,
    longitude
)

weather_df.head()


Unnamed: 0,date,cloud_cover_mean,precipitation_sum,sunshine_duration
0,2020-01-01,92.333336,0.0,3612.911377
1,2020-01-02,96.625,0.0,0.0
2,2020-01-03,52.958332,0.0,7516.577148
3,2020-01-04,19.5,0.0,8072.493164
4,2020-01-05,77.0,1.0,0.0


In [10]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2190 entries, 0 to 2189
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               2190 non-null   datetime64[ns]
 1   cloud_cover_mean   2190 non-null   float32       
 2   precipitation_sum  2190 non-null   float32       
 3   sunshine_duration  2190 non-null   float32       
dtypes: datetime64[ns](1), float32(3)
memory usage: 42.9 KB


## Aurora-specific validation suite for geomagnetic features

In [12]:
import great_expectations as ge

kp_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="geomagnetic_expectation_suite"
)

for kp_col in ["kp1", "kp2", "kp3", "kp4", "kp5", "kp6", "kp7", "kp8"]:
    kp_expectation_suite.add_expectation(
        ge.core.ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_between",
            kwargs={
                "column": kp_col,
                "min_value": 0.0,
                "max_value": 9.0
            }
        )
    )

for ap_col in ["ap1", "ap2", "ap3", "ap4", "ap5", "ap6", "ap7", "ap8", "ap"]:
    kp_expectation_suite.add_expectation(
        ge.core.ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_between",
            kwargs={
                "column": ap_col,
                "min_value": 0.0,
                "max_value": 400.0
            }
        )
    )


## Weather validation suite

In [13]:
weather_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="sweden_weather_expectation_suite"
)

weather_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "cloud_cover_mean",
            "min_value": 0.0,
            "max_value": 100.0
        }
    )
)

weather_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "precipitation_sum",
            "min_value": 0.0
        }
    )
)

weather_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "sunshine_duration",
            "min_value": 0.0
        }
    )
)


{"expectation_type": "expect_column_values_to_be_between", "kwargs": {"column": "sunshine_duration", "min_value": 0.0}, "meta": {}}

## Connect to Hopsworks

In [14]:
fs = project.get_feature_store() 

### Define the feature group

In [15]:
df_kp.columns = df_kp.columns.str.lower()

geomagnetic_fg = fs.get_or_create_feature_group(
    name="geomagnetic_daily",
    description="Daily global geomagnetic activity indices (Kp, ap, Ap)",
    version=1,
    primary_key=["date"],
    event_time="date",
    expectation_suite=kp_expectation_suite
)


## Insert the DataFrame into the feature group

In [None]:
geomagnetic_fg.insert(df_kp, wait=True)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1289364/fs/1278019/fg/1876508
2025-12-30 21:49:52,280 INFO: 	17 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1289364/fs/1278019/fg/1876508


Uploading Dataframe: 100.00% |██████████| Rows 2190/2190 | Elapsed Time: 00:11 | Remaining Time: 00:00


Launching job: geomagnetic_daily_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1289364/jobs/named/geomagnetic_daily_1_offline_fg_materialization/executions
2025-12-30 21:50:23,543 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-12-30 21:50:29,986 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-12-30 21:52:25,854 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-12-30 21:52:26,070 INFO: Waiting for log aggregation to finish.
2025-12-30 21:52:34,760 INFO: Execution finished successfully.


(Job('geomagnetic_daily_1_offline_fg_materialization', 'SPARK'),
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_values_to_be_between",
         "kwargs": {
           "column": "kp1",
           "min_value": 0.0,
           "max_value": 9.0
         },
         "meta": {
           "expectationId": 799786
         }
       },
       "result": {
         "element_count": 2190,
         "missing_count": 0,
         "missing_percent": 0.0,
         "unexpected_count": 0,
         "unexpected_percent": 0.0,
         "unexpected_percent_total": 0.0,
         "unexpected_percent_nonmissing": 0.0,
         "partial_unexpected_list": []
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2025-12-30T08:49:52.000279Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": 

## Add descriptions for features

In [17]:
geomagnetic_fg.update_feature_description(
    "date",
    "Date of global geomagnetic observation (daily resolution)"
)

geomagnetic_fg.update_feature_description(
    "kp1",
    "Geomagnetic Kp index for 00:00–03:00 UTC"
)
geomagnetic_fg.update_feature_description(
    "kp2",
    "Geomagnetic Kp index for 03:00–06:00 UTC"
)
geomagnetic_fg.update_feature_description(
    "kp3",
    "Geomagnetic Kp index for 06:00–09:00 UTC"
)
geomagnetic_fg.update_feature_description(
    "kp4",
    "Geomagnetic Kp index for 09:00–12:00 UTC"
)
geomagnetic_fg.update_feature_description(
    "kp5",
    "Geomagnetic Kp index for 12:00–15:00 UTC"
)
geomagnetic_fg.update_feature_description(
    "kp6",
    "Geomagnetic Kp index for 15:00–18:00 UTC"
)
geomagnetic_fg.update_feature_description(
    "kp7",
    "Geomagnetic Kp index for 18:00–21:00 UTC"
)
geomagnetic_fg.update_feature_description(
    "kp8",
    "Geomagnetic Kp index for 21:00–24:00 UTC"
)

geomagnetic_fg.update_feature_description(
    "ap1",
    "Linear ap geomagnetic index corresponding to kp1"
)
geomagnetic_fg.update_feature_description(
    "ap2",
    "Linear ap geomagnetic index corresponding to kp2"
)
geomagnetic_fg.update_feature_description(
    "ap3",
    "Linear ap geomagnetic index corresponding to kp3"
)
geomagnetic_fg.update_feature_description(
    "ap4",
    "Linear ap geomagnetic index corresponding to kp4"
)
geomagnetic_fg.update_feature_description(
    "ap5",
    "Linear ap geomagnetic index corresponding to kp5"
)
geomagnetic_fg.update_feature_description(
    "ap6",
    "Linear ap geomagnetic index corresponding to kp6"
)
geomagnetic_fg.update_feature_description(
    "ap7",
    "Linear ap geomagnetic index corresponding to kp7"
)
geomagnetic_fg.update_feature_description(
    "ap8",
    "Linear ap geomagnetic index corresponding to kp8"
)

geomagnetic_fg.update_feature_description(
    "ap",
    "Daily average linear geomagnetic ap index"
)


<hsfs.feature_group.FeatureGroup at 0x15517c370>

### Weather data feature group

In [18]:
weather_fg = fs.get_or_create_feature_group(
    name="sweden_weather_daily",
    description="Daily weather conditions in Sweden relevant for aurora observability",
    version=1,
    primary_key=["date"],
    event_time="date",
    expectation_suite=weather_expectation_suite
)


## Insert the DataFrame into the Feature Group

In [19]:
weather_fg.insert(weather_df, wait=True)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1289364/fs/1278019/fg/1876511
2025-12-30 21:56:43,929 INFO: 	3 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1289364/fs/1278019/fg/1876511


Uploading Dataframe: 100.00% |██████████| Rows 2190/2190 | Elapsed Time: 00:05 | Remaining Time: 00:00


Launching job: sweden_weather_daily_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1289364/jobs/named/sweden_weather_daily_1_offline_fg_materialization/executions
2025-12-30 21:57:13,084 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-12-30 21:57:19,477 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-12-30 21:58:55,577 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-12-30 21:58:55,755 INFO: Waiting for log aggregation to finish.
2025-12-30 21:59:04,830 INFO: Execution finished successfully.


(Job('sweden_weather_daily_1_offline_fg_materialization', 'SPARK'),
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_values_to_be_between",
         "kwargs": {
           "column": "sunshine_duration",
           "min_value": 0.0
         },
         "meta": {
           "expectationId": 799792
         }
       },
       "result": {
         "element_count": 2190,
         "missing_count": 0,
         "missing_percent": 0.0,
         "unexpected_count": 0,
         "unexpected_percent": 0.0,
         "unexpected_percent_total": 0.0,
         "unexpected_percent_nonmissing": 0.0,
         "partial_unexpected_list": []
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2025-12-30T08:56:43.000928Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       

## Add feature descriptions

In [20]:
weather_fg.update_feature_description(
    "date",
    "Date of daily weather observation for Sweden"
)

weather_fg.update_feature_description(
    "cloud_cover_mean",
    "Mean daily cloud cover in percent (0–100), affecting aurora visibility"
)

weather_fg.update_feature_description(
    "precipitation_sum",
    "Total daily precipitation in millimeters (rain or snow)"
)

weather_fg.update_feature_description(
    "sunshine_duration",
    "Total duration of sunshine during the day in seconds, proxy for sky clarity"
)


<hsfs.feature_group.FeatureGroup at 0x1550f2620>