# 1. Backfill

## 1.1. Setup

In [1]:
# Standard imports
import os
import sys
import json
import time
from datetime import datetime, timezone, date
import warnings
from pathlib import Path
warnings.filterwarnings("ignore", module="IPython")

#  Establish project root directory
def find_project_root(start: Path):
    """Locate the project root by walking upwards from ``start``.

    The root is the nearest directory (``start`` itself or one of its
    ancestors) that contains a ``pyproject.toml`` file.

    Args:
        start: Directory from which the upward search begins.

    Returns:
        The first directory containing ``pyproject.toml``, or ``start``
        itself when no such directory is found.
    """
    candidates = (start, *start.parents)
    roots = (
        directory
        for directory in candidates
        if (directory / "pyproject.toml").exists()
    )
    # Fall back to the starting directory when no marker file is found.
    return next(roots, start)

# Resolve the project root from the current working directory and make sure
# it is importable, so the project's own modules can be imported below.
root_dir = find_project_root(Path().absolute())
print("Project root dir:", root_dir)

root_dir_on_path = str(root_dir) in sys.path
if not root_dir_on_path:
    sys.path.append(str(root_dir))

# Third-party imports
import requests
import pandas as pd
import great_expectations as gx
import hopsworks

#  Project imports
from utils import cleaning, config, feature_engineering, fetchers, hopsworks_admin, incremental, metadata

# Load project settings from the .env file located in the project root
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")
# Unwrap the stored secret values into plain strings for the calls below
HOPSWORKS_API_KEY = settings.HOPSWORKS_API_KEY.get_secret_value()
GITHUB_USERNAME = settings.GH_USERNAME.get_secret_value()

# Log in to Hopsworks and obtain a handle to the project's feature store
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

Project root dir: c:\Users\krist\Documents\GitHub\pm25-forecast-openmeteo-aqicn
HopsworksSettings initialized!
2026-01-05 16:35:11,836 INFO: Initializing external client
2026-01-05 16:35:11,837 INFO: Base URL: https://c.app.hopsworks.ai:443
To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'







2026-01-05 16:35:13,635 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279184


Repository management

In [2]:
def clone_or_update_repo(username: str):
    """Return a path to the project repository, cloning or updating it if needed.

    The lookup proceeds in three steps:

    1. If the current working directory (or one of its ancestors) is a git
       checkout named after the repository, that directory is returned.
    2. If a directory with the repository name exists in the current working
       directory, ``git pull`` is run inside it and its relative path is
       returned. A failing pull is reported but does not abort.
    3. Otherwise the repository is cloned from the given GitHub account.

    Git is invoked with an argument list (no shell), so ``username`` is never
    interpreted by a shell.

    Args:
        username: GitHub account that owns the repository.

    Returns:
        Path to the repository directory (absolute in case 1, relative to the
        current working directory in cases 2 and 3).

    Raises:
        RuntimeError: If ``git clone`` fails or git cannot be executed.
    """
    # Local import keeps this cell self-contained; `subprocess` is not part of
    # the notebook's top-level import cell.
    import subprocess

    repo_name = "pm25-forecast-openmeteo-aqicn"

    # 1. Detect if already inside the repo
    cwd = Path().absolute()
    for parent in [cwd] + list(cwd.parents):
        if (parent / ".git").exists() and parent.name == repo_name:
            print(f"Already inside repo at {parent}")
            return parent

    # 2. Detect if the repo exists in the current directory
    repo_dir = Path(repo_name)
    if repo_dir.exists():
        print(f"Repository exists at {repo_dir.absolute()}")
        try:
            pull = subprocess.run(["git", "-C", str(repo_dir), "pull"], check=False)
            pull_failure = None if pull.returncode == 0 else f"exit code {pull.returncode}"
        except OSError as exc:  # e.g. git executable not found
            pull_failure = str(exc)
        if pull_failure is not None:
            # Updating is best-effort: keep working with the existing checkout.
            print(f"[WARN] git pull failed ({pull_failure}); using existing checkout.")
        return repo_dir

    # 3. Otherwise clone it
    print("Cloning repository...")
    url = f"https://github.com/{username}/{repo_name}.git"
    try:
        clone = subprocess.run(["git", "clone", url], check=False)
    except OSError as exc:  # e.g. git executable not found
        raise RuntimeError("Git clone failed.") from exc

    if clone.returncode != 0:
        raise RuntimeError("Git clone failed.")

    print("Clone successful.")
    return repo_dir

# Resolve the repository checkout and make it the working directory
repo_dir = clone_or_update_repo(GITHUB_USERNAME)
os.chdir(repo_dir)


Already inside repo at c:\Users\krist\Documents\GitHub\pm25-forecast-openmeteo-aqicn


In [3]:
# Reference date for the backfill (passed to the historical weather fetcher below).
today = date.today()

# The AQICN key is required by the rest of the notebook; stop early without it.
if settings.AQICN_API_KEY is None:
    print("AQICN_API_KEY missing.")
    sys.exit(1)

AQICN_API_KEY = settings.AQICN_API_KEY.get_secret_value()

# Replace any previously stored secret so Hopsworks holds the current key.
secrets = hopsworks.get_secrets_api()
try:
    secret = secrets.get_secret("AQICN_API_KEY")
    if secret is not None:
        secret.delete()
except Exception as exc:
    # Best-effort cleanup: report the reason instead of silently ignoring it,
    # then continue to (re)create the secret.
    print(f"Could not remove existing AQICN_API_KEY secret (continuing): {exc}")

secrets.create_secret("AQICN_API_KEY", AQICN_API_KEY)

Secret created successfully, explore it at https://c.app.hopsworks.ai:443/account/secrets


Secret('AQICN_API_KEY', 'PRIVATE')

## 1.2. Create Feature Groups

In [4]:
def create_feature_groups(fs):
    """
    Get or create the three feature groups used by the project.

    Args:
        fs: Feature store handle exposing ``get_or_create_feature_group``.

    Returns:
        Tuple ``(air_quality_fg, sensor_metadata_fg, weather_fg)``.
    """

    def _per_sensor_timeseries_group(name, description):
        # Groups keyed by sensor and timestamp, with "datetime" as event time.
        return fs.get_or_create_feature_group(
            name=name,
            description=description,
            version=1,
            primary_key=["sensor_id", "datetime"],
            event_time="datetime",
            expectation_suite=None,
        )

    aq_group = _per_sensor_timeseries_group(
        "air_quality",
        "Air Quality characteristics of each day for all sensors",
    )

    # Sensor metadata is keyed by sensor only and has no event time.
    metadata_group = fs.get_or_create_feature_group(
        name="sensor_metadata",
        description="Metadata for each air quality sensor",
        version=1,
        primary_key=["sensor_id"],
        expectation_suite=None,
    )

    weather_group = _per_sensor_timeseries_group(
        "weather",
        "Weather characteristics of each day for all sensors",
    )

    return aq_group, metadata_group, weather_group


# Get or create the feature groups; the handles are reused by the cells below
air_quality_fg, sensor_metadata_fg, weather_fg = create_feature_groups(fs)

## 1.3. Check and Backfill

In [5]:
# Check if data exists. A failed read is treated as a first run.
try:
    aq_data = air_quality_fg.read()
    is_first_run = len(aq_data) == 0
except Exception as exc:
    # Catch `Exception` rather than using a bare `except`, so that
    # KeyboardInterrupt / SystemExit still propagate; report why we fall back.
    print(f"Could not read air_quality feature group, treating as first run: {exc}")
    is_first_run = True

# Process and insert data if first run
if is_first_run:
    all_aq_dfs = []
    all_weather_dfs = []
    locations = {}

    # Process CSV files in data directory
    data_dir = os.path.join(root_dir, "data")
    dir_list = os.listdir(data_dir)
    for file in dir_list:
        if file.endswith(".csv"):
            file_path = os.path.join(data_dir, file)
            aq_df_raw, street, city, country, feed_url, sensor_id = metadata.read_sensor_data(file_path, AQICN_API_KEY)

            # Clean and process
            aq_df = cleaning.clean_and_append_data(aq_df_raw, street, city, country, feed_url, sensor_id)
            # Drop timezone information from the timestamps
            aq_df["datetime"] = aq_df["datetime"].dt.tz_localize(None)

            # Fetch historical weather
            weather_df, latitude, longitude = fetchers.get_historical_weather(
                city, aq_df, today, feed_url, sensor_id, AQICN_API_KEY
            )

            # Sensors without weather data are left out of all three feature groups
            if weather_df is None or len(weather_df) == 0:
                print(f"‚ö†Ô∏è No historical weather for sensor {sensor_id}, skipping.")
                continue

            weather_df["datetime"] = weather_df["datetime"].dt.tz_localize(None)

            all_aq_dfs.append(aq_df)
            all_weather_dfs.append(weather_df)
            locations[sensor_id] = {
                "country": country,
                "city": city,
                "street": street,
                "aqicn_url": feed_url,
                "latitude": latitude,
                "longitude": longitude,
            }

    if all_aq_dfs:
        # Combine and engineer features
        aq_df_all = pd.concat(all_aq_dfs, ignore_index=True)
        weather_df_all = pd.concat(all_weather_dfs, ignore_index=True)

        aq_df_all = feature_engineering.add_rolling_window_feature(aq_df_all, window_days=3, column="pm25", new_column="pm25_rolling_3d")
        aq_df_all = feature_engineering.add_lagged_features(aq_df_all, column="pm25", lags=[1, 2, 3])
        aq_df_all = feature_engineering.add_nearby_sensor_feature(aq_df_all, locations, column="pm25_lag_1d", n_closest=3)

        # Rename and insert. Any column still called "date" becomes "datetime",
        # the name the feature groups use for their key / event time.
        aq_df_all = aq_df_all.rename(columns={"date": "datetime"})
        weather_df_all = weather_df_all.rename(columns={"date": "datetime"})

        air_quality_fg.insert(aq_df_all)
        weather_fg.insert(weather_df_all)

        # Insert sensor metadata (one record per processed sensor)
        metadata_records = []
        for sensor_id, loc in locations.items():
            metadata_records.append({
                "sensor_id": sensor_id,
                "country": loc["country"],
                "city": loc["city"],
                "street": loc["street"],
                "aqicn_url": loc["aqicn_url"],
                "latitude": loc["latitude"],
                "longitude": loc["longitude"],
            })
        sensor_metadata_fg.insert(pd.DataFrame(metadata_records))

        print(f"‚úÖ Inserted {len(aq_df_all)} air quality records")
        print(f"‚úÖ Inserted {len(weather_df_all)} weather records")
        print(f"‚úÖ Inserted {len(metadata_records)} sensor metadata records")
    else:
        print("‚ö†Ô∏è No CSV files processed")

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (5.00s) 


## 1.4. Update Descriptions

In [6]:
def update_air_quality_description(air_quality_fg):
    """Set the feature descriptions of the air quality feature group.

    Args:
        air_quality_fg: Feature group exposing
            ``update_feature_description(feature_name, description)``.
    """
    feature_descriptions = (
        ("datetime", "Date and time of measurement of air quality"),
        ("sensor_id", "AQICN sensor identifier (e.g., 59893)"),
        (
            "pm25",
            "Particles less than 2.5 micrometers in diameter (fine particles) pose health risk",
        ),
        (
            "pm25_rolling_3d",
            "3-day rolling mean of PM2.5 from previous days (lagged by 1 day for point-in-time correctness).",
        ),
        ("pm25_lag_1d", "PM2.5 value from 1 day ago."),
        ("pm25_lag_2d", "PM2.5 value from 2 days ago."),
        ("pm25_lag_3d", "PM2.5 value from 3 days ago."),
    )
    # Descriptions are applied one by one, in the order listed above.
    for feature_name, text in feature_descriptions:
        air_quality_fg.update_feature_description(feature_name, text)


def update_sensor_metadata_description(sensor_metadata_fg):
    """Set the feature descriptions of the sensor metadata feature group.

    Args:
        sensor_metadata_fg: Feature group exposing
            ``update_feature_description(feature_name, description)``.
    """
    feature_descriptions = (
        ("sensor_id", "AQICN sensor identifier (e.g., 59893)"),
        ("city", "City where the air quality was measured"),
        ("street", "Street in the city where the air quality was measured"),
        (
            "country",
            "Country where the air quality was measured (sometimes a city in aqicn.org)",
        ),
        ("aqicn_url", "URL to the AQICN feed for this sensor"),
        ("latitude", "Latitude of the sensor location"),
        ("longitude", "Longitude of the sensor location"),
    )
    # Descriptions are applied one by one, in the order listed above.
    for feature_name, text in feature_descriptions:
        sensor_metadata_fg.update_feature_description(feature_name, text)


def update_weather_description(weather_fg):
    """Set the feature descriptions of the weather feature group.

    Args:
        weather_fg: Feature group exposing
            ``update_feature_description(feature_name, description)``.
    """
    feature_descriptions = (
        ("datetime", "Date and time of measurement of weather"),
        ("sensor_id", "AQICN sensor identifier (e.g., 59893)"),
        ("city", "City where weather is measured/forecast for"),
        ("temperature_2m_mean", "Temperature in Celsius"),
        ("precipitation_sum", "Precipitation (rain/snow) in mm"),
        ("wind_speed_10m_max", "Wind speed at 10m above ground"),
        ("wind_direction_10m_dominant", "Dominant wind direction over the day"),
        ("latitude", "Latitude of sensor location used for weather retrieval"),
        ("longitude", "Longitude of sensor location used for weather retrieval"),
    )
    # Descriptions are applied one by one, in the order listed above.
    for feature_name, text in feature_descriptions:
        weather_fg.update_feature_description(feature_name, text)


# Apply the feature descriptions to each of the three feature groups
update_air_quality_description(air_quality_fg)
update_sensor_metadata_description(sensor_metadata_fg)
update_weather_description(weather_fg)

## 1.5. Validation Setup
Creates Great Expectations validation suites for air quality and weather data with column value constraints.

In [8]:
# Expectation suite attached to the air quality feature group.
aq_expectation_suite = gx.core.ExpectationSuite(
    expectation_suite_name="aq_expectation_suite"
)

aq_expectations = [
    # pm25 should be >= 0 (column minimum strictly above -0.1, no upper bound)
    gx.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column": "pm25",
            "min_value": -0.1,
            "max_value": None,
            "strict_min": True,
        },
    ),
    # datetime values must be parseable as dates
    gx.core.ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_dateutil_parseable",
        kwargs={"column": "datetime"},
    ),
    # sensor_id + datetime should be unique (PK)
    gx.core.ExpectationConfiguration(
        expectation_type="expect_compound_columns_to_be_unique",
        kwargs={"column_list": ["sensor_id", "datetime"]},
    ),
]

# rolling + lag features should be numeric (float or int)
for col in ["pm25_rolling_3d", "pm25_lag_1d", "pm25_lag_2d", "pm25_lag_3d"]:
    aq_expectations.append(
        gx.core.ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_type_list",
            kwargs={"column": col, "type_list": ["float", "int"]},
        )
    )

# Register the expectations in the order defined above, then save the suite.
for aq_expectation in aq_expectations:
    aq_expectation_suite.add_expectation(aq_expectation)

hopsworks_admin.save_or_replace_expectation_suite(air_quality_fg, aq_expectation_suite)


# Expectation suite attached to the weather feature group.
weather_expectation_suite = gx.core.ExpectationSuite(
    expectation_suite_name="weather_expectation_suite"
)

weather_expectations = [
    # datetime values must be parseable as dates
    gx.core.ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_dateutil_parseable",
        kwargs={"column": "datetime"},
    ),
]

# temperature should be within physical range; latitude/longitude must be valid
weather_value_bounds = [
    ("temperature_2m_mean", -80, 60),
    ("latitude", -90, 90),
    ("longitude", -180, 180),
]
for bounded_column, lower_bound, upper_bound in weather_value_bounds:
    weather_expectations.append(
        gx.core.ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_between",
            kwargs={"column": bounded_column, "min_value": lower_bound, "max_value": upper_bound},
        )
    )

# precipitation and wind speed should be >= 0 (but allow nulls)
for col in ["precipitation_sum", "wind_speed_10m_max"]:
    weather_expectations.append(
        gx.core.ExpectationConfiguration(
            expectation_type="expect_column_min_to_be_between",
            kwargs={
                "column": col,
                "min_value": -0.1,
                "max_value": None,
                "strict_min": True,
            },
        )
    )

# Register the expectations in the order defined above, then save the suite.
for weather_expectation in weather_expectations:
    weather_expectation_suite.add_expectation(weather_expectation)

hopsworks_admin.save_or_replace_expectation_suite(weather_fg, weather_expectation_suite)

Deleted existing expectation suite for FG 'air_quality'.
Attached expectation suite to Feature Group, edit it at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1908050
Saved expectation suite for FG 'air_quality'.
Deleted existing expectation suite for FG 'weather'.
Attached expectation suite to Feature Group, edit it at https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fg/1893800
Saved expectation suite for FG 'weather'.


## 1.6. Create Feature View

In [10]:
# Air quality joined with weather (per sensor and timestamp) and with the
# sensor metadata (per sensor).
air_quality_fv_query = (
    air_quality_fg.select_all()
    .join(weather_fg.select_all(), on=["sensor_id", "datetime"])
    .join(sensor_metadata_fg.select_all(), on="sensor_id")
)

# pm25 is declared as the label of the feature view.
air_quality_fv = fs.get_or_create_feature_view(
    name="air_quality_complete_fv",
    version=1,
    query=air_quality_fv_query,
    labels=["pm25"],
)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1279184/fs/1265800/fv/air_quality_complete_fv/version/1


## 1.7. Load Historical Data

In [None]:
# Load the sensor metadata, indexed by sensor_id when any rows exist.
# On a read error an empty frame is used instead.
try:
    metadata_df = sensor_metadata_fg.read()
    if len(metadata_df) != 0:
        metadata_df = metadata_df.set_index("sensor_id")
        print(f"üìç Loaded metadata for {len(metadata_df)} sensors")
    else:
        print("‚ö†Ô∏è No sensor metadata found. Run first-time CSV processing first.")
except Exception as read_error:
    print(f"‚ùå Error reading sensor metadata: {read_error}")
    metadata_df = pd.DataFrame()

Reading data from Hopsworks, using Hopsworks Feature Query Service..    

In [None]:
# Read the feature view's batch data; used below to find the latest timestamp per sensor
historical_df = air_quality_fv.get_batch_data()

## 1.8. Incremental Updates

Detect latest timestamp per sensor

In [None]:
# Latest observation timestamp per sensor, as {sensor_id: timestamp}.
# The feature groups name their key / event-time column "datetime" (there is
# no "date" column), so that is the column to aggregate.
latest_per_sensor = (
    historical_df.groupby("sensor_id")["datetime"]
    .max()
    .to_dict()
)

Incremental air quality fetcher

In [None]:
def fetch_latest_aq_data(sensor_id, feed_url, since):
    """Fetch the most recent PM2.5 reading from an AQICN feed.

    Args:
        sensor_id: Identifier stored in the ``sensor_id`` column of the result.
        feed_url: URL of the AQICN JSON feed to query.
        since: Timestamp of the latest reading already stored, or ``None``.
            Anything accepted by ``pd.Timestamp`` works (``pd.Timestamp``,
            ``datetime``, ISO string). Timezone information is dropped before
            comparing. ``NaT`` is treated like ``None``.

    Returns:
        A one-row DataFrame with columns ``sensor_id``, ``date``, ``pm25`` and
        ``aqicn_url``. An empty DataFrame is returned when the feed reports an
        error, carries no usable timestamp, or has no reading newer than
        ``since``.

    Raises:
        requests.HTTPError: If the feed responds with an HTTP error status.
    """
    # A timeout keeps an unresponsive feed from hanging the notebook.
    response = requests.get(feed_url, timeout=30)
    response.raise_for_status()
    data = response.json()

    if data.get("status") != "ok":
        print(f"[WARN] AQICN returned error for {sensor_id}: {data.get('data')}")
        return pd.DataFrame()

    time_info = data["data"].get("time")

    # The timestamp is read either from a dict (key "s") or from a plain string.
    if isinstance(time_info, dict):
        ts_str = time_info.get("s")
    elif isinstance(time_info, str):
        ts_str = time_info
    else:
        print(f"[WARN] Unexpected time format for {sensor_id}: {time_info}")
        return pd.DataFrame()

    # A missing or unparseable timestamp must not produce a row with a null date.
    ts = pd.to_datetime(ts_str, errors="coerce") if ts_str else pd.NaT
    if pd.isna(ts):
        print(f"[WARN] Missing or unparseable timestamp for {sensor_id}: {ts_str!r}")
        return pd.DataFrame()
    ts = ts.tz_localize(None)

    if since is not None:
        # Accept plain datetimes / strings as well as pandas Timestamps.
        since = pd.Timestamp(since)
        since = None if pd.isna(since) else since.tz_localize(None)

    if since is not None and ts <= since:
        return pd.DataFrame()

    pm25 = (
        data["data"]
        .get("iaqi", {})
        .get("pm25", {})
        .get("v", None)
    )

    return pd.DataFrame([{
        "sensor_id": sensor_id,
        "date": ts,
        "pm25": pm25,
        "aqicn_url": feed_url
    }])

Incremental weather fetcher

In [None]:
def get_latest_weather(latitude, longitude, since):
    """Fetch hourly weather from Open-Meteo for timestamps later than ``since``.

    Args:
        latitude: Latitude passed to the Open-Meteo API.
        longitude: Longitude passed to the Open-Meteo API.
        since: Lower bound (exclusive) for returned rows. ``None`` or ``NaT``
            means "the last 7 days". Anything accepted by ``pd.Timestamp``
            works. Timezone-aware values are converted to UTC before the
            timezone is dropped, because the API is queried with
            ``timezone=UTC``.

    Returns:
        DataFrame built from the API's ``hourly`` payload with an added
        timezone-naive ``date`` column, restricted to rows where
        ``date > since``. Empty when the response has no hourly data.

    Raises:
        requests.HTTPError: If the API responds with an HTTP error status.
    """
    if since is None or pd.isna(since):
        since = datetime.now(timezone.utc) - pd.Timedelta(days=7)

    # Normalise to a timezone-naive UTC timestamp so it is comparable with the
    # (UTC) timestamps returned by the API.
    since = pd.Timestamp(since)
    if since.tzinfo is not None:
        since = since.tz_convert("UTC").tz_localize(None)

    url = "https://api.open-meteo.com/v1/forecast"

    params = {
        "latitude": latitude,
        "longitude": longitude,
        "hourly": "temperature_2m,relative_humidity_2m,wind_speed_10m",
        "start_date": since.strftime("%Y-%m-%d"),
        "end_date": datetime.now(timezone.utc).strftime("%Y-%m-%d"),
        "timezone": "UTC"
    }

    # A timeout keeps an unresponsive API from hanging the notebook.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()

    if "hourly" not in data or "time" not in data["hourly"]:
        print(f"[WARN] No weather data returned for lat={latitude}, lon={longitude}")
        return pd.DataFrame()

    df = pd.DataFrame(data["hourly"])

    df["date"] = pd.to_datetime(df["time"]).dt.tz_localize(None)

    # The request is day-granular; trim to rows strictly after `since`.
    df = df[df["date"] > since]

    return df

Incremental ingestion loop

In [None]:
# Only a cell's last expression is rendered, so return both schemas together
# instead of silently discarding the first one.
{"air_quality": air_quality_fg.schema, "weather": weather_fg.schema}

In [None]:
# Run the incremental update using the project's `incremental` module, passing
# the three feature groups and the latest stored timestamp per sensor
incremental.run_incremental_update(
    sensor_metadata_fg,
    air_quality_fg,
    weather_fg,
    latest_per_sensor
)

In [None]:
# metadata_df = sensor_metadata_fg.read().set_index("sensor_id")

# if len(metadata_df) > 0:
#     for sensor_id, meta in metadata_df.iterrows():

#         last_ts = latest_per_sensor.get(sensor_id)

#         aq_new = fetch_latest_aq_data(
#             sensor_id=sensor_id,
#             feed_url=meta["aqicn_url"],
#             since=last_ts
#         )

#         if not aq_new.empty:
#             aq_new["date"] = aq_new["date"].dt.tz_localize(None)

#             # Feature engineering
#             aq_new = feature_engineering.add_rolling_window_feature(aq_new, window_days=3, column="pm25", new_column="pm25_rolling_3d")
#             aq_new = feature_engineering.add_lagged_features(aq_new, column="pm25", lags=[1, 2, 3])

#             # Clean schema
#             aq_new = aq_new.drop(columns=["aqicn_url"], errors="ignore")
#             aq_new["sensor_id"] = aq_new["sensor_id"].astype("int64")
#             aq_new["pm25"] = aq_new["pm25"].astype(float)

#             for col in ["pm25_rolling_3d", "pm25_lag_1d", "pm25_lag_2d", "pm25_lag_3d"]:
#                 if col in aq_new.columns:
#                     aq_new[col] = aq_new[col].astype(float)
            
#             air_quality_fg.insert(aq_new)

#         weather_new = get_latest_weather(
#             latitude=meta["latitude"],
#             longitude=meta["longitude"],
#             since=last_ts
#         )

#         weather_new = weather_new.rename(columns={
#         "time": "date",
#         "temperature_2m": "temperature_2m_mean",
#         "wind_speed_10m": "wind_speed_10m_max",
#         "wind_direction_10m": "wind_direction_10m_dominant",
#     })
#     weather_new["date"] = pd.to_datetime(weather_new["date"], errors="coerce")

#     bad_dates = weather_new["date"].isna().sum()
#     if bad_dates > 0:
#         print(f"‚ö†Ô∏è Warning: {bad_dates} rows had invalid dates and were dropped.")
        
#     weather_new = weather_new.dropna(subset=["date"])
# else:
#     print("‚è≠Ô∏è Skipping incremental updates - no sensors configured yet")

# # Ensure required columns exist
# if "precipitation_sum" not in weather_new.columns:
#     weather_new["precipitation_sum"] = 0.0

# if "wind_direction_10m_dominant" not in weather_new.columns:
#     weather_new["wind_direction_10m_dominant"] = weather_new.get(
#         "wind_direction_10m_dominant", 0.0
# )

#     # weather_new["wind_direction_10m_dominant"] = np.nan

# # Add metadata
# weather_new["city"] = meta["city"]
# weather_new["latitude"] = meta["latitude"]
# weather_new["longitude"] = meta["longitude"]
# weather_new["sensor_id"] = sensor_id

# # Cast types
# weather_new["sensor_id"] = weather_new["sensor_id"].astype("int64")
# weather_new["latitude"] = weather_new["latitude"].astype("float64")
# weather_new["longitude"] = weather_new["longitude"].astype("float64")
# weather_new["temperature_2m_mean"] = weather_new["temperature_2m_mean"].astype("float64")
# weather_new["precipitation_sum"] = weather_new["precipitation_sum"].astype("float64")
# weather_new["wind_speed_10m_max"] = weather_new["wind_speed_10m_max"].astype("float64")
# weather_new["wind_direction_10m_dominant"] = weather_new["wind_direction_10m_dominant"].astype("float64")

# # Final schema selection
# weather_new = weather_new[[
#     "sensor_id",
#     "date",
#     "temperature_2m_mean",
#     "precipitation_sum",
#     "wind_speed_10m_max",
#     "wind_direction_10m_dominant",
#     "city",
#     "latitude",
#     "longitude",
# ]]

# if not weather_new.empty:
#     weather_new["date"] = weather_new["date"].dt.tz_localize(None)
#     weather_fg.insert(weather_new)

**TODO:** Rebuild the feature view (not yet implemented).

## 1.9. Exploration

In [None]:
# NOTE: aq_df_all is created by the backfill cell only when its first-run
# branch executes. Its timestamp column is "datetime" (any "date" column is
# renamed there before inserting), so "datetime" is used here.
print("üîç AIR QUALITY DATA EXPLORATION")
print("="*40)
print(f"Shape: {aq_df_all.shape}")
print(f"Date range: {aq_df_all['datetime'].min().date()} to {aq_df_all['datetime'].max().date()}")
print(f"Number of unique sensors: {aq_df_all['sensor_id'].nunique()}")
print(f"Countries: {aq_df_all['country'].unique()}")
print(f"Cities: {aq_df_all['city'].nunique()} unique cities")

print("\nüìä PM2.5 Statistics:")
print(aq_df_all['pm25'].describe())
print(f"Missing values: {aq_df_all['pm25'].isna().sum()}")

# Report missing values only for engineered columns that are actually present
print("\nüìà Engineered Features Statistics:")
for col in ['pm25_rolling_3d', 'pm25_lag_1d', 'pm25_lag_2d', 'pm25_lag_3d', 'pm25_nearby_avg']:
    if col in aq_df_all.columns:
        missing = aq_df_all[col].isna().sum()
        print(f"{col}: {missing} missing values ({missing/len(aq_df_all)*100:.1f}%)")

In [None]:
# NOTE: weather_df_all is created by the backfill cell only when its first-run
# branch executes. Its timestamp column is "datetime" (any "date" column is
# renamed there before inserting), so "datetime" is used here.
print("üå§Ô∏è WEATHER DATA EXPLORATION")
print("="*40)
print(f"Shape: {weather_df_all.shape}")
print(f"Date range: {weather_df_all['datetime'].min().date()} to {weather_df_all['datetime'].max().date()}")
print(f"Number of unique sensors: {weather_df_all['sensor_id'].nunique()}")

# Summarise only the weather columns that are actually present
print("\nüå°Ô∏è Weather Statistics:")
for col in ['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant']:
    if col in weather_df_all.columns:
        print(f"{col}:")
        print(f"  Range: {weather_df_all[col].min():.2f} to {weather_df_all[col].max():.2f}, Mean: {weather_df_all[col].mean():.2f}, Missing: {weather_df_all[col].isna().sum()}")

print("\nüìç Geographic Coverage:")
print(f"Latitude range: {weather_df_all['latitude'].min():.3f} to {weather_df_all['latitude'].max():.3f}, Longitude range: {weather_df_all['longitude'].min():.3f} to {weather_df_all['longitude'].max():.3f}")

In [None]:
# NOTE: aq_df_all is created by the backfill cell only when its first-run
# branch executes. Its timestamp column is "datetime" (any "date" column is
# renamed there before inserting), so "datetime" is used here.
print("üîó DATA QUALITY & RELATIONSHIPS")
print("="*40)

# Overall data completeness
sensor_day_counts = aq_df_all.groupby('sensor_id')['datetime'].count()
total_records = len(aq_df_all)
data_completeness = (1 - aq_df_all['pm25'].isna().sum() / total_records) * 100

print(f"üìä Overall Data Quality:")
print(f"Total records: {total_records:,}")
print(f"Data completeness: {data_completeness:.1f}%")
print(f"Days per sensor - Min: {sensor_day_counts.min()}, Median: {sensor_day_counts.median():.0f}, Max: {sensor_day_counts.max()}")
print(f"Sensors with <30 days: {(sensor_day_counts < 30).sum()}, >365 days: {(sensor_day_counts > 365).sum()}")

# Extreme values summary
extreme_count = (aq_df_all['pm25'] > 100).sum()
very_high_count = (aq_df_all['pm25'] > 50).sum()
print(f"\n‚ö†Ô∏è Air Quality Levels:")
print(f"Extreme readings (>100 Œºg/m¬≥): {extreme_count} ({extreme_count/total_records*100:.1f}%)")
print(f"Very high readings (>50 Œºg/m¬≥): {very_high_count} ({very_high_count/total_records*100:.1f}%)")

# Seasonal patterns
if len(aq_df_all) > 0:
    # Create temporary month column without modifying original DataFrame
    temp_months = pd.to_datetime(aq_df_all['datetime']).dt.month
    monthly_pm25 = aq_df_all.groupby(temp_months)['pm25'].mean()
    print(f"\nüóìÔ∏è Seasonal Patterns (PM2.5 Œºg/m¬≥):")
    # Season value = mean of the monthly means of the months in that season
    seasons = {(12,1,2): "Winter", (3,4,5): "Spring", (6,7,8): "Summer", (9,10,11): "Autumn"}
    for months, season in seasons.items():
        season_avg = monthly_pm25[monthly_pm25.index.isin(months)].mean()
        print(f"  {season}: {season_avg:.1f}")