In [37]:
import os
import requests
import json
import pandas as pd
from datetime import date, datetime, timedelta
import sys
import hopsworks
import great_expectations as ge

In [38]:
with open("../city_config/gothenburg_femman.json") as f:
    city_config = json.load(f)

CITY_NAME = city_config["city_name"]
LAT = city_config["city_lat"]
LON = city_config["city_lon"]
SENSOR = city_config["sensors"][0]  # only one station
FG_VERSIONS = city_config["fg_versions"]

YESTERDAY = date.today() - timedelta(days=1)
START_DATE = END_DATE = YESTERDAY.isoformat()

In [39]:
yesterday_quality_url = (
    f"https://air-quality-api.open-meteo.com/v1/air-quality?"
    f"latitude={LAT}&longitude={LON}"
    f"&past_days=1"
    f"&hourly=pm2_5"     # problem, only 'hourly' measures exist
    "&timezone=Europe%2FBerlin"
)

yesterday_weather_url = (
    f"https://archive-api.open-meteo.com/v1/archive?"
    f"latitude={LAT}&longitude={LON}"
    f"&start_date={START_DATE}"
    f"&end_date={END_DATE}"
    f"&daily=wind_speed_10m_max,wind_gusts_10m_max,wind_direction_10m_dominant,temperature_2m_max"
    "&timezone=Europe%2FBerlin"    # same timezone as Sweden     
)

weather_prediction_url = (
    f"https://api.open-meteo.com/v1/forecast?"
    f"latitude={LAT}&longitude={LON}"
    # f"&hourly=temperature_2m,wind_speed_10m,wind_direction_10m,wind_gusts_10m"   # problem, our forecast API call receives hourly forecasts
    f"&daily=wind_speed_10m_max,wind_gusts_10m_max,wind_direction_10m_dominant,temperature_2m_max"
    "&timezone=Europe%2FBerlin"
)

In [40]:
print("Fetching yesterday's air quality data...")
aq_response = requests.get(yesterday_quality_url)
aq_response.raise_for_status()
aq_data = aq_response.json()

air_df = pd.DataFrame({
    "date": aq_data["hourly"]["time"],     # hourly values
    "pm2_5": aq_data["hourly"]["pm2_5"]
})
air_df["date"] = pd.to_datetime(air_df["date"], yearfirst=True)

print(air_df.columns)

# Select only the value at 12:00 to get a daily representative value
air_df = air_df[air_df["date"].dt.hour == 12].copy()

# Keep only date (remove hours)
air_df["date"] = air_df["date"].dt.floor("D")

print(air_df.columns)

# Convert back to datetime
air_df["date"] = pd.to_datetime(air_df["date"])

air_df["country"] = os.getenv("AQICN_COUNTRY")
air_df["city"] = os.getenv("AQICN_CITY")
air_df["street"] = os.getenv("AQICN_STREET") 

print(air_df.head())
print(f"Air quality data shape: {air_df.shape}")

Fetching yesterday's air quality data...
Index(['date', 'pm2_5'], dtype='object')
Index(['date', 'pm2_5'], dtype='object')
          date  pm2_5 country      city  street
12  2025-11-12    7.0  Sweden  Göteborg  Femman
36  2025-11-13    2.1  Sweden  Göteborg  Femman
60  2025-11-14    5.5  Sweden  Göteborg  Femman
84  2025-11-15    3.9  Sweden  Göteborg  Femman
108 2025-11-16    1.8  Sweden  Göteborg  Femman
Air quality data shape: (6, 5)


In [41]:
air_df.columns

Index(['date', 'pm2_5', 'country', 'city', 'street'], dtype='object')

In [42]:
print("Downloading yesterday's weather data...")
w_response = requests.get(yesterday_weather_url)
w_response.raise_for_status()
w_data = w_response.json()

weather_df = pd.DataFrame(w_data["daily"])
weather_df["date"] = pd.to_datetime(weather_df["time"])
weather_df.drop(columns=["time"], inplace=True)
weather_df["country"] = os.getenv("AQICN_COUNTRY")
weather_df["city"] = os.getenv("AQICN_CITY")
weather_df["street"] = os.getenv("AQICN_STREET")
print(f"Weather data shape: {weather_df.shape}")

Downloading yesterday's weather data...
Weather data shape: (1, 8)


In [43]:
print("Downloading 7 day weather forecast...")
f_response = requests.get(weather_prediction_url)
f_response.raise_for_status()
f_data = f_response.json()

forecast_df = pd.DataFrame(f_data["daily"])

# Standardize timestamp column name
forecast_df["date"] = pd.to_datetime(forecast_df["time"])
forecast_df.drop(columns=["time"], inplace=True)
print(f"Forecast data shape: {forecast_df.shape}")

Downloading 7 day weather forecast...
Forecast data shape: (7, 5)


In [44]:
# Data validation
aq_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="aq_expectation_suite"
)

# Check for null values
aq_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": "pm2_5"}
    )
)

# Check for reasonable PM2.5 range
aq_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "pm2_5",
            "min_value": 0.0,
            "max_value": 500.0,
            "strict_min": True
        }
    )
)     

weather_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="weather_expectation_suite"
)

# Check for null values
weather_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": "temperature_2m_max"}
    )
)

# Check for reasonable range
weather_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "temperature_2m_max",
            "min_value": -60.0,
            "max_value": 60.0
        }
    )
)
    

{"expectation_type": "expect_column_values_to_be_between", "kwargs": {"column": "temperature_2m_max", "min_value": -60.0, "max_value": 60.0}, "meta": {}}

In [45]:
print("Connecting to Hopsworks...")
hopsworks_key = os.getenv("HOPSWORKS_API_KEY") 
hopsworks_project_name = os.getenv("HOPSWORKS_PROJECT")
project = hopsworks.login(project=hopsworks_project_name, api_key_value=hopsworks_key) 
fs = project.get_feature_store()

Connecting to Hopsworks...
2025-11-13 09:43:39,778 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-11-13 09:43:39,780 INFO: Initializing external client
2025-11-13 09:43:39,781 INFO: Base URL: https://c.app.hopsworks.ai:443
To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'







2025-11-13 09:43:40,996 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1271989


In [46]:
print("Updating feature groups...")

# Hourly PM2.5 air quality data
aqi_fg = fs.get_or_create_feature_group(
        name="air_quality",
        description=f"Air Quality characteristics of each day for {CITY_NAME} ({SENSOR['display_name']})",
        version=FG_VERSIONS["air_quality"],
        primary_key=["city", 'date', 'street'],   
        expectation_suite = aq_expectation_suite,   
        event_time="date"
    )

aqi_fg.insert(air_df, write_options={"wait_for_job": False})

# Weather data, containing wind_speed_100m, wind_direction_100m, wind_gusts_10m, wind_direction_10m, wind_speed_10m, temperature_2m
weather_fg = fs.get_or_create_feature_group(
        name="weather",
        description=f"Historical weather data for {CITY_NAME}",
        version=FG_VERSIONS["weather"],
        primary_key=["city", 'date', 'street'],
        expectation_suite = weather_expectation_suite,
        event_time="date"
    )

weather_fg.insert(weather_df, write_options={"wait_for_job": False})

# Weather prediction for the next 7 days (default), containing temperature_2m, wind_speed_10m, wind_direction_10m, wind_gusts_10m
forecast_fg = fs.get_or_create_feature_group(
    name="weather_forecast_features",
    description=f"7-day weather forecast for {CITY_NAME}",
    version=FG_VERSIONS["weather_forecast_features"],
    primary_key=['date'],  
    expectation_suite = weather_expectation_suite
)
forecast_fg.insert(forecast_df, write_options={"wait_for_job": False})

print("All feature groups updated successfully!")

Updating feature groups...
2025-11-13 09:43:42,765 INFO: 	2 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1271989/fs/1258587/fg/1703345


Uploading Dataframe: 100.00% |██████████| Rows 6/6 | Elapsed Time: 00:00 | Remaining Time: 00:00


Use fg.materialization_job.run(args=-op offline_fg_materialization -path hdfs:///Projects/ID2223_Lab_1_Axel_Kajsa/Resources/jobs/air_quality_1_offline_fg_materialization/config_1763023340436) to trigger the materialization job again.

2025-11-13 09:43:51,141 INFO: 	2 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1271989/fs/1258587/fg/1703344


Uploading Dataframe: 100.00% |██████████| Rows 1/1 | Elapsed Time: 00:00 | Remaining Time: 00:00


Use fg.materialization_job.run(args=-op offline_fg_materialization -path hdfs:///Projects/ID2223_Lab_1_Axel_Kajsa/Resources/jobs/weather_1_offline_fg_materialization/config_1763023324385) to trigger the materialization job again.

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1271989/fs/1258587/fg/1703346
2025-11-13 09:44:00,753 INFO: 	2 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1271989/fs/1258587/fg/1703346


Uploading Dataframe: 100.00% |██████████| Rows 7/7 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: weather_forecast_features_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1271989/jobs/named/weather_forecast_features_1_offline_fg_materialization/executions
All feature groups updated successfully!
