In [1]:
# Project helpers. NOTE(review): wildcard import — the helpers called later in this
# notebook without a visible definition (historical, keep_until_yesterday,
# to_hopsworks_df) presumably come from here; confirm and consider importing them
# explicitly. It is kept first because later imports may rebind names it exports.
from Features.df_functions import *
from hsfs.feature import Feature
import datetime
import requests
import pandas as pd
import hopsworks
import json
import re
import os
import warnings
# Suppress every warning for the rest of the session; note that this also hides
# deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")
from dotenv import load_dotenv
# NOTE(review): presumably loads variables from a local .env file into the process
# environment — confirm where that file is expected to live.
load_dotenv()

# Hopsworks API key taken from the environment; os.getenv returns None when the
# variable is unset.
HOPSWORKS_API_KEY = os.getenv("HOPSWORKS_API_KEY")

In [2]:
# Authenticate against Hopsworks and keep the returned project handle.
# The key read from the environment is passed explicitly so the credential source
# is visible in the notebook; when it is None, login() behaves as if the argument
# had been omitted and falls back to its own default lookup.
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)

2026-01-06 12:57:13,099 INFO: Initializing external client
2026-01-06 12:57:13,099 INFO: Base URL: https://c.app.hopsworks.ai:443






2026-01-06 12:57:15,429 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1327256


In [3]:
import great_expectations as ge
from great_expectations.core import ExpectationSuite, ExpectationConfiguration

# Data-quality rules for the birding data. The suite is handed to the feature
# group as its `expectation_suite` further down in the notebook.
birding_suite = ExpectationSuite(expectation_suite_name="birding_suite")

# One (expectation type, kwargs) pair per rule, registered in this order.
expectation_specs = [
    # RAIN: values >= 0
    ("expect_column_values_to_be_between", {"column": "rain", "min_value": 0}),
    # WIND: values >= 0
    ("expect_column_values_to_be_between", {"column": "wind", "min_value": 0}),
    # TEMPERATURE: reasonable physical range
    (
        "expect_column_values_to_be_between",
        {"column": "temperature", "min_value": -50, "max_value": 50},
    ),
    # WEATHERCODE: must be integer
    ("expect_column_values_to_be_of_type", {"column": "weathercode", "type_": "int"}),
]

for spec_type, spec_kwargs in expectation_specs:
    last_added_expectation = birding_suite.add_expectation(
        ExpectationConfiguration(expectation_type=spec_type, kwargs=spec_kwargs)
    )

# Echo the most recently registered expectation as the cell output.
last_added_expectation


{"expectation_type": "expect_column_values_to_be_of_type", "kwargs": {"column": "weathercode", "type_": "int"}, "meta": {}}

In [4]:
# Handle to the project's feature store.
fs = project.get_feature_store()

# Feature group that receives the daily birding rows. It is named `birding_clean`
# (a clean reset that replaces the earlier `birding` group) and is keyed on
# species, region and observation date.
# `ingested_at` is declared as the Hudi precombine key, so every DataFrame inserted
# into this group must carry that column.
# NOTE(review): presumably the precombine key decides which row wins when two rows
# share a primary key — confirm against the hsfs documentation.
birding_fg = fs.get_or_create_feature_group(
    name='birding_clean',
    description='Birding characteristics of each day (clean reset)',
    version=1,
    primary_key=['bird_type', 'region', 'observation_date'],
    event_time='observation_date',
    hudi_precombine_key='ingested_at',
    expectation_suite=birding_suite
)


In [5]:
# Build the historical dataset and upload it to the feature group.
# NOTE(review): historical(), keep_until_yesterday() and to_hopsworks_df() are not
# defined in the visible cells (presumably provided by Features.df_functions); their
# behaviour is inferred from their names — confirm against that module.
history_df = historical()
hist = keep_until_yesterday(history_df)

df = to_hopsworks_df(hist)
# Stamp the whole batch with a single timezone-aware UTC ingestion time. The
# feature group declares `ingested_at` as its hudi_precombine_key, so the column
# must exist before insert. Timestamp.now(tz="UTC") is the documented replacement
# for the deprecated Timestamp.utcnow().
df["ingested_at"] = pd.Timestamp.now(tz="UTC")
birding_fg.insert(df)

# Human-readable descriptions, keyed by feature name. Dict insertion order is the
# order in which the descriptions are pushed to the feature group.
feature_descriptions = {
    "region": "Swedish administrative region (landskap) where the bird observation was recorded",
    "observation_date": "Calendar date on which the bird observation was made",
    "wind": "Average daily wind speed (km/h) for the region on the observation date",
    "rain": "Total daily precipitation (mm) for the region on the observation date",
    "weathercode": "Categorical weather condition code describing the dominant daily weather pattern",
    "temperature": "Average daily air temperature (°C) for the region on the observation date",
    "observation_count": "Number of individual birds observed for the given species, region, and date",
    "time_observations_started": "Time of day when the bird observation effort began (HH:MM:SS)",
    "bird_type": "Species identifier for the observed bird (e.g., whteag, goleag)",
    "year": "Year index normalized relative to the start year of the dataset",
}

# One-hot encoded month indicators: month_1 .. month_12.
month_names = (
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
)
for month_number, month_name in enumerate(month_names, start=1):
    feature_descriptions[f"month_{month_number}"] = (
        f"Indicator variable equal to 1 if the observation occurred in {month_name}"
    )

# A loop (rather than a trailing bare call) also keeps the cell from echoing the
# FeatureGroup repr as its output.
for feature_name, feature_description in feature_descriptions.items():
    birding_fg.update_feature_description(feature_name, feature_description)


Loaded cached weather for region: Skåne
Loaded cached weather for region: Blekinge
Loaded cached weather for region: Öland
Loaded cached weather for region: Halland
Loaded cached weather for region: Småland
Loaded cached weather for region: Gotland
Loaded cached weather for region: Västergötland
Loaded cached weather for region: Östergötland
Loaded cached weather for region: Bohuslän
Loaded cached weather for region: Dalsland
Loaded cached weather for region: Närke
Loaded cached weather for region: Södermanland
Loaded cached weather for region: Värmland
Loaded cached weather for region: Västmanland
Loaded cached weather for region: Uppland
Loaded cached weather for region: Gästrikland
Loaded cached weather for region: Dalarna
Loaded cached weather for region: Hälsingland
Loaded cached weather for region: Härjedalen
Loaded cached weather for region: Medelpad
Loaded cached weather for region: Ångermanland
Loaded cached weather for region: Jämtland
Loaded cached weather for region: Väster

Uploading Dataframe: 95.86% |█████████▌| Rows 262033/273358 | Elapsed Time: 03:21 | Remaining Time: 00:09%6|1767700860.703|FAIL|rdkafka#producer-1| [thrd:ssl://51.161.81.208:9093/bootstrap]: ssl://51.161.81.208:9093/2: Disconnected (after 201362ms in state UP)
Uploading Dataframe: 100.00% |██████████| Rows 273358/273358 | Elapsed Time: 03:24 | Remaining Time: 00:00


Launching job: birding_clean_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1327256/jobs/named/birding_clean_1_offline_fg_materialization/executions


<hsfs.feature_group.FeatureGroup at 0x3022485c0>

In [None]:
# Sanity check: read the `region` column back from the feature store and show the
# first rows.
# The group is looked up under the name the data was written to (`birding_clean`,
# version 1) so that `birding_fg` keeps pointing at the same group as the rest of
# the notebook instead of being rebound to the retired `birding` group.
# get_feature_group() only fetches an existing group, so a missing group surfaces
# as an error here rather than as a new, empty definition.
birding_fg = fs.get_feature_group(name='birding_clean', version=1)

print(birding_fg.select(["region"]).read().head())

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (3.85s) 
         region
0      Medelpad
1      Medelpad
2         Närke
3      Värmland
4  Östergötland
