# Notebook 3 – First baseline model

Purpose:
- Load ground-truth data from the Hopsworks feature store.
- Define feature set and train the first baseline model.
- Run predictions on a holdout set and evaluate (e.g., MAE/MSE).
- Log model version and metrics (Hopsworks/MLflow/file).
- Persist the trained model so Notebook 4 can load it.

Notes:
- Document chosen model type (e.g., XGBoost, RandomForest, LSTM) and tested hyperparameters.
- Record performance and next experiments to try.


In [1]:
from pathlib import Path
import os
import sys
import datetime
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.metrics import mean_squared_error, r2_score
import json
import warnings
warnings.filterwarnings("ignore")

from dotenv import load_dotenv
import hopsworks

# --- Project bootstrap ---
# The notebook lives in notebooks/, so the project root is one level up
# from the current working directory.
root_dir = Path("..").resolve()

# Make the project root importable so the `src` package can be resolved.
root_dir_str = str(root_dir)
if root_dir_str not in sys.path:
    sys.path.append(root_dir_str)

# Load the .env file that sits at the project root.
env_path = root_dir / ".env"
load_dotenv(env_path)

# These imports only work after the sys.path adjustment above.
from src.config import ElectricitySettings
from src import util

settings = ElectricitySettings()

# Log in to Hopsworks and obtain a handle to the project's feature store.
project = hopsworks.login(engine="python")
fs = project.get_feature_store()

print("Successfully logged in to Hopsworks project:", settings.HOPSWORKS_PROJECT)


ElectricitySettings initialized
2025-12-12 19:58:15,285 INFO: Initializing external client
2025-12-12 19:58:15,285 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2025-12-12 19:58:16,118 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/127
Successfully logged in to Hopsworks project: ScalableProject


In [2]:
# Read the location configuration, stored as a JSON string in a Hopsworks secret.
secrets = hopsworks.get_secrets_api()
area = json.loads(secrets.get_secret("ELECTRICITY_LOCATION_JSON").value)

# Expose the individual location fields as notebook-level constants.
PRICE_AREA, CITY = area['price_area'], area['city']
LATITUDE, LONGITUDE = area['latitude'], area['longitude']

In [3]:
# Handles to the two source feature groups (version 1 of each).
weather_hourly_fg = fs.get_feature_group('weather_hourly', version=1)
electricity_prices_fg = fs.get_feature_group('electricity_prices', version=1)

In [4]:
# Build the training query: hourly weather features joined with the price
# column on the key (price_area, unix_time), restricted to one price area.
area_key = PRICE_AREA.lower()

weather_columns = [
    "price_area",
    "unix_time",
    "date",
    "hour",
    "temperature_2m", "apparent_temperature",
    "precipitation", "rain", "snowfall",
    "cloud_cover",
    "wind_speed_10m", "wind_speed_100m",
    "wind_direction_10m", "wind_direction_100m",
    "wind_gusts_10m",
    "surface_pressure",
]
price_columns = ["price_area", "unix_time", "price_sek"]

# Select the columns of interest and keep only rows for the target price area.
price_feats = electricity_prices_fg.select(price_columns).filter(
    electricity_prices_fg["price_area"] == area_key
)
weather_feats = weather_hourly_fg.select(weather_columns).filter(
    weather_hourly_fg["price_area"] == area_key
)

# Weather is the left side of the join; prices are joined in on the key columns.
features = weather_feats.join(price_feats, on=["price_area", "unix_time"])





In [5]:
# Fetch version 1 of the feature view for this price area, or create it from
# the joined query if it does not exist yet. `price_sek` is declared as the label.
feature_view = fs.get_or_create_feature_view(
    query=features,
    labels=["price_sek"],
    name=f"electricity_prices_fv_{PRICE_AREA.lower()}",
    version=1,
    description=f"weather + electricity prices features for {PRICE_AREA}",
)



Feature view created successfully, explore it at 
https://eu-west.cloud.hopsworks.ai:443/p/127/fs/74/fv/electricity_prices_fv_se3/version/1


In [6]:
# 1) Pull the batch data, ordered by date, and find the covered date range.
df = feature_view.get_batch_data().sort_values("date")
t_min = df["date"].min()
t_max = df["date"].max()

# The split point sits 80 % of the way through the observed date range:
# the first part becomes the training set, the remainder the test set.
date_span = t_max - t_min
test_start = t_min + date_span * 0.8

# 2) Let the feature view perform the split.
X_train, X_test, y_train, y_test = feature_view.train_test_split(
    test_start=test_start
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.13s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.62s) 
2025-12-12 19:58:48,832 INFO: Computing insert statistics
2025-12-12 19:58:48,845 INFO: Computing insert statistics



In [7]:
# Inspect the feature frame returned for the training split.
X_train

Unnamed: 0,price_area,unix_time,date,hour,temperature_2m,apparent_temperature,precipitation,rain,snowfall,cloud_cover,wind_speed_10m,wind_speed_100m,wind_direction_10m,wind_direction_100m,wind_gusts_10m,surface_pressure,electricity_prices_price_area,electricity_prices_unix_time
0,se3,1765321200000,2025-12-09 23:00:00,23,4.4715,2.268217,0.0,0.0,0.0,100.0,5.315336,9.553576,208.300659,222.709366,11.159999,1006.049255,se3,1765321200000
1,se3,1765382400000,2025-12-10 16:00:00,16,8.1715,3.818823,0.0,0.0,0.0,19.0,22.796841,40.188805,248.702576,250.915741,47.16,999.106323,se3,1765382400000
2,se3,1765483200000,2025-12-11 20:00:00,20,5.0715,2.535372,0.0,0.0,0.0,100.0,9.0,18.080818,360.0,9.743584,16.559999,1005.257812,se3,1765483200000
4,se3,1765335600000,2025-12-10 03:00:00,3,3.4215,0.702617,0.0,0.0,0.0,100.0,6.842368,11.409785,181.507401,186.340103,19.08,1004.841919,se3,1765335600000
5,se3,1765342800000,2025-12-10 05:00:00,5,4.2215,1.07255,0.1,0.1,0.0,100.0,10.934166,20.969805,159.775055,168.111359,20.16,1003.454102,se3,1765342800000
6,se3,1765353600000,2025-12-10 08:00:00,8,5.8715,2.037219,0.4,0.4,0.0,100.0,19.011953,33.706169,161.221878,165.147369,36.0,1000.479431,se3,1765353600000
7,se3,1765422000000,2025-12-11 03:00:00,3,6.3715,2.88308,0.0,0.0,0.0,100.0,15.676542,30.007679,235.762482,239.743652,30.599998,998.889099,se3,1765422000000
8,se3,1765479600000,2025-12-11 19:00:00,19,5.3215,2.988918,0.0,0.0,0.0,80.0,7.895416,16.557064,335.772278,348.079285,14.04,1003.565186,se3,1765479600000
9,se3,1765328400000,2025-12-10 01:00:00,1,3.3715,0.321448,0.0,0.0,0.0,100.0,9.42382,14.602204,208.523026,210.358032,20.519999,1005.738708,se3,1765328400000
10,se3,1765360800000,2025-12-10 10:00:00,10,7.0215,3.811974,0.1,0.1,0.0,100.0,16.451052,30.04706,197.840302,204.415543,34.919998,998.296875,se3,1765360800000


In [12]:
# Columns that are not fed to the model: the `date` column plus every column
# whose name contains 'price_area'.
# NOTE(review): both unix_time columns stay in the model inputs - confirm this is intended.
cat_cols = [col for col in X_train.columns if 'price_area' in col]
non_model_cols = ['date'] + cat_cols

# Apply the same column removal to both splits.
X_features = X_train.drop(columns=non_model_cols)
X_test_features = X_test.drop(columns=non_model_cols)

In [13]:
# Inspect the label frame returned for the training split.
y_train

Unnamed: 0,electricity_prices_price_sek
0,0.43914
1,0.44037
2,0.56815
4,0.29683
5,0.43282
6,0.52634
7,0.28789
8,0.65474
9,0.30064
10,0.45554


In [14]:
# Baseline model: an XGBoost regressor constructed with no explicit hyperparameters.
xgb_regressor = XGBRegressor()

# Train it on the prepared training features and labels.
xgb_regressor.fit(X=X_features, y=y_train)

In [15]:
# Score the fitted model on the test split.
y_pred = xgb_regressor.predict(X_test_features)
y_true = y_test.iloc[:, 0]  # first column of the label frame

# Mean squared error (sklearn)
mse = mean_squared_error(y_true, y_pred)
print("MSE:", mse)

# R squared (sklearn)
r2 = r2_score(y_true, y_pred)
print("R squared:", r2)



MSE: 0.055522442


R squared: -4.584106805186575
