In [None]:
import json
import os
import sys
from datetime import date, datetime, timedelta

import great_expectations as ge
import hopsworks
import pandas as pd
import requests

In [None]:
# Read the per-city settings file; everything below is derived from it.
with open("../city_config/gothenburg_femman.json") as config_file:
    city_config = json.load(config_file)

CITY_NAME = city_config["city_name"]
LAT, LON = city_config["city_lat"], city_config["city_lon"]
SENSOR = city_config["sensors"][0]  # first (and, per the config, only) station
FG_VERSIONS = city_config["fg_versions"]

# Start and end bounds are the same ISO date, so date-ranged requests cover
# exactly one day: yesterday, relative to this machine's clock.
YESTERDAY = date.today() - timedelta(days=1)
START_DATE = YESTERDAY.isoformat()
END_DATE = START_DATE

In [None]:
# Yesterday's air quality. The endpoint only offers PM2.5 as an hourly series,
# so the daily value is aggregated after download. The request is pinned to
# yesterday with explicit start/end dates: `past_days=1` on its own also
# returns today plus the API's default forecast days, and those rows would be
# averaged and stored as if they were observations.
yesterday_quality_url = (
    "https://air-quality-api.open-meteo.com/v1/air-quality?"
    f"latitude={LAT}&longitude={LON}"
    f"&start_date={START_DATE}"
    f"&end_date={END_DATE}"
    "&hourly=pm2_5"
    "&timezone=Europe%2FBerlin"
)

# Yesterday's daily weather aggregates from the historical archive.
# NOTE(review): the archive may lag behind real time; confirm that yesterday's
# values are populated rather than null.
yesterday_weather_url = (
    "https://archive-api.open-meteo.com/v1/archive?"
    f"latitude={LAT}&longitude={LON}"
    f"&start_date={START_DATE}"
    f"&end_date={END_DATE}"
    "&daily=wind_speed_10m_max,wind_gusts_10m_max,wind_direction_10m_dominant,temperature_2m_max"
    "&timezone=Europe%2FBerlin"    # same timezone as Sweden
)

# Weather forecast. Daily aggregates are requested (not hourly values) so the
# forecast carries the same columns as the historical weather request above.
weather_prediction_url = (
    "https://api.open-meteo.com/v1/forecast?"
    f"latitude={LAT}&longitude={LON}"
    "&daily=wind_speed_10m_max,wind_gusts_10m_max,wind_direction_10m_dominant,temperature_2m_max"
    "&timezone=Europe%2FBerlin"
)

In [None]:
print("Fetching yesterday's air quality data...")
# Bounded wait (seconds): without a timeout, requests can block forever on a
# stalled connection.
aq_response = requests.get(yesterday_quality_url, timeout=30)
aq_response.raise_for_status()
aq_data = aq_response.json()

# The API delivers hourly PM2.5 readings; reduce each timestamp to its calendar day.
air_hourly_df = pd.DataFrame({
    "date": aq_data["hourly"]["time"],
    "pm2_5": aq_data["hourly"]["pm2_5"],
})
air_hourly_df["date"] = pd.to_datetime(air_hourly_df["date"], yearfirst=True).dt.date

# Keep yesterday only. The response can span more days than requested
# (e.g. today and forecast days), and those must not be averaged and stored
# as if they were yesterday's observations.
air_hourly_df = air_hourly_df[air_hourly_df["date"] == YESTERDAY]

# Daily mean PM2.5; one row per remaining date.
air_df = air_hourly_df.groupby("date", as_index=False)["pm2_5"].mean()
air_df["country"] = os.getenv("AQICN_COUNTRY")
air_df["city"] = os.getenv("AQICN_CITY")
air_df["street"] = os.getenv("AQICN_STREET")
print(f"Air quality data shape: {air_df.shape}")

In [None]:
print("Downloading yesterday's weather data...")
# Bounded wait (seconds): without a timeout, requests can block forever on a
# stalled connection.
w_response = requests.get(yesterday_weather_url, timeout=30)
w_response.raise_for_status()
w_data = w_response.json()

# The "daily" payload is column-oriented. Expose its "time" column as a parsed
# "date" column and drop the original, without mutating a frame in place.
weather_df = (
    pd.DataFrame(w_data["daily"])
    .assign(date=lambda daily: pd.to_datetime(daily["time"]))
    .drop(columns=["time"])
)
# Location labels come from the environment.
weather_df["country"] = os.getenv("AQICN_COUNTRY")
weather_df["city"] = os.getenv("AQICN_CITY")
weather_df["street"] = os.getenv("AQICN_STREET")
print(f"Weather data shape: {weather_df.shape}")

In [None]:
print("Downloading 7 day weather forecast...")
# Bounded wait (seconds): without a timeout, requests can block forever on a
# stalled connection.
f_response = requests.get(weather_prediction_url, timeout=30)
f_response.raise_for_status()
f_data = f_response.json()

# Standardize the timestamp column name: the API's "time" column becomes a
# parsed "date" column, matching the other frames in this notebook.
forecast_df = (
    pd.DataFrame(f_data["daily"])
    .assign(date=lambda daily: pd.to_datetime(daily["time"]))
    .drop(columns=["time"])
)
print(f"Forecast data shape: {forecast_df.shape}")

In [None]:
# Data validation
# Requires `import great_expectations as ge` in the imports cell; without it
# this cell raises NameError on a fresh kernel.


def _build_range_suite(suite_name, column, **range_kwargs):
    """Build a suite that checks one column for nulls and for a value range.

    Args:
        suite_name: Name given to the expectation suite.
        column: Column that both expectations apply to.
        **range_kwargs: Extra kwargs for `expect_column_values_to_be_between`
            (e.g. `min_value`, `max_value`, `strict_min`).

    Returns:
        The populated `ge.core.ExpectationSuite`.
    """
    suite = ge.core.ExpectationSuite(expectation_suite_name=suite_name)

    # Check for null values
    suite.add_expectation(
        ge.core.ExpectationConfiguration(
            expectation_type="expect_column_values_to_not_be_null",
            kwargs={"column": column},
        )
    )

    # Check for a reasonable value range
    suite.add_expectation(
        ge.core.ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_between",
            kwargs={"column": column, **range_kwargs},
        )
    )
    return suite


# PM2.5 must be present and within 0..500; strict_min makes the lower bound
# exclusive, so a value of exactly 0.0 does not pass.
aq_expectation_suite = _build_range_suite(
    "aq_expectation_suite",
    "pm2_5",
    min_value=0.0,
    max_value=500.0,
    strict_min=True,
)

# Daily maximum temperature must be present and within -60..60.
weather_expectation_suite = _build_range_suite(
    "weather_expectation_suite",
    "temperature_2m_max",
    min_value=-60.0,
    max_value=60.0,
)
    

In [None]:
print("Connecting to Hopsworks...")
# The project name and API key are read from the environment, so no
# credentials live in the notebook. Either lookup yields None when unset.
hopsworks_project_name = os.environ.get("HOPSWORKS_PROJECT")
hopsworks_key = os.environ.get("HOPSWORKS_API_KEY")
project = hopsworks.login(
    api_key_value=hopsworks_key,
    project=hopsworks_project_name,
)
# Feature store handle used by the feature-group cell.
fs = project.get_feature_store()

In [None]:
print("Updating feature groups...")

# --- Air quality: one row per day holding the mean of the hourly PM2.5 values.
aq_description = f"Air Quality characteristics of each day for {CITY_NAME} ({SENSOR['display_name']})"
aqi_fg = fs.get_or_create_feature_group(
    name="air_quality",
    description=aq_description,
    version=FG_VERSIONS["air_quality"],
    primary_key=["city"],
    expectation_suite=aq_expectation_suite,
    event_time="date",
)
# wait_for_job=False asks the client not to block on the ingestion job.
aqi_fg.insert(air_df, write_options={"wait_for_job": False})

# --- Historical weather: daily aggregates (max wind speed, max wind gusts,
# dominant wind direction, max temperature) for the requested day.
weather_description = f"Historical weather data for {CITY_NAME}"
weather_fg = fs.get_or_create_feature_group(
    name="weather",
    description=weather_description,
    version=FG_VERSIONS["weather"],
    primary_key=["city"],
    expectation_suite=weather_expectation_suite,
    event_time="date",
)
weather_fg.insert(weather_df, write_options={"wait_for_job": False})

# --- Weather forecast: same daily columns as the historical weather frame.
# Keyed by date, no event_time, and the version is pinned to 1 instead of
# being read from FG_VERSIONS.
forecast_description = f"7-day weather forecast for {CITY_NAME}"
forecast_fg = fs.get_or_create_feature_group(
    name="weather_forecast_features",
    description=forecast_description,
    version=1,
    primary_key=["date"],
    expectation_suite=weather_expectation_suite,
)
forecast_fg.insert(forecast_df, write_options={"wait_for_job": False})

print("All feature groups updated successfully!")