# Urban Air Quality Forecaster
## Feature Expansion & Merging

This notebook integrates:
- Air Quality
- Weather
- Traffic data
for spatiotemporal modeling.

In [30]:
import pandas as pd
import numpy as np
from datetime import datetime

load grid AQI data

In [31]:
# Read the gridded air-quality table and parse its "hour" column into datetimes.
aq_path = "C:/Users/Navyashree/Documents/urban-air-quality-forecaster/data/processed/grid_air_quality.csv"
aq_df = pd.read_csv(aq_path).assign(
    hour=lambda frame: pd.to_datetime(frame["hour"])
)
aq_df.head()

Unnamed: 0,grid_lat,grid_lon,hour,pm25,no2,o3
0,12.9,77.63,2026-01-19 10:00:00,96.479902,40.746327,33.020855
1,12.91,77.68,2026-01-18 22:00:00,128.436367,28.942489,101.960714
2,12.92,77.56,2026-01-18 19:00:00,146.515163,28.464076,93.622592
3,12.92,77.63,2026-01-19 15:00:00,105.413565,69.038529,44.56536
4,12.93,77.54,2026-01-19 13:00:00,13.666896,43.048675,52.656615


Simulate Weather data

In [32]:
# Simulated weather: one row per (grid_lat, grid_lon, hour) row of aq_df.
# A dedicated, seeded generator makes the simulated columns identical on
# every "Restart & Run All"; the seed value itself is arbitrary.
weather_rng = np.random.default_rng(42)

weather_df = aq_df[["grid_lat","grid_lon","hour"]].copy()
n_weather_rows = len(weather_df)

# Uniform draws within fixed bounds (units are not specified in this notebook).
weather_df["temperature"] = weather_rng.uniform(20, 35, n_weather_rows)
weather_df["wind_speed"] = weather_rng.uniform(0.5, 6.0, n_weather_rows)

weather_df.head()

Unnamed: 0,grid_lat,grid_lon,hour,temperature,wind_speed
0,12.9,77.63,2026-01-19 10:00:00,29.450234,2.058798
1,12.91,77.68,2026-01-18 22:00:00,34.094741,3.603606
2,12.92,77.56,2026-01-18 19:00:00,33.795365,0.738026
3,12.92,77.63,2026-01-19 15:00:00,31.26907,2.668005
4,12.93,77.54,2026-01-19 13:00:00,33.261975,1.500816


Simulate Traffic data

In [33]:
# Simulated traffic: one row per (grid_lat, grid_lon, hour) row of aq_df.
# traffic_index scale:
#   0   -> no traffic
#   100 -> heavy congestion
# The lower bound is 0 so the simulated values cover the whole documented
# scale, and a dedicated seeded generator keeps the column reproducible
# across runs (the seed value itself is arbitrary).
traffic_rng = np.random.default_rng(43)

traffic_df = aq_df[["grid_lat", "grid_lon", "hour"]].copy()

traffic_df["traffic_index"] = traffic_rng.uniform(0, 100, len(traffic_df))

traffic_df.head()

Unnamed: 0,grid_lat,grid_lon,hour,traffic_index
0,12.9,77.63,2026-01-19 10:00:00,49.188388
1,12.91,77.68,2026-01-18 22:00:00,9.299782
2,12.92,77.56,2026-01-18 19:00:00,65.37523
3,12.92,77.63,2026-01-19 15:00:00,67.062964
4,12.93,77.54,2026-01-19 13:00:00,46.237521


Merge All features

In [34]:
# Join the weather and traffic columns onto the air-quality rows.
# All three frames share the same (grid_lat, grid_lon, hour) keys, so each
# join must be strictly one-to-one. validate="one_to_one" makes pandas raise
# a MergeError if a key is duplicated, instead of silently multiplying rows.
merge_keys = ["grid_lat", "grid_lon", "hour"]

merged_df = aq_df.merge(
    weather_df,
    on = merge_keys,
    how = "left",
    validate = "one_to_one"
).merge(
    traffic_df,
    on = merge_keys,
    how = "left",
    validate = "one_to_one"
)

# feature table ready
merged_df.head()

Unnamed: 0,grid_lat,grid_lon,hour,pm25,no2,o3,temperature,wind_speed,traffic_index
0,12.9,77.63,2026-01-19 10:00:00,96.479902,40.746327,33.020855,29.450234,2.058798,49.188388
1,12.91,77.68,2026-01-18 22:00:00,128.436367,28.942489,101.960714,34.094741,3.603606,9.299782
2,12.92,77.56,2026-01-18 19:00:00,146.515163,28.464076,93.622592,33.795365,0.738026,65.37523
3,12.92,77.63,2026-01-19 15:00:00,105.413565,69.038529,44.56536,31.26907,2.668005,67.062964
4,12.93,77.54,2026-01-19 13:00:00,13.666896,43.048675,52.656615,33.261975,1.500816,46.237521


In [35]:
# Persist the merged feature table as CSV (index column omitted).
features_path = "C:/Users/Navyashree/Documents/urban-air-quality-forecaster/data/processed/model_features.csv"
merged_df.to_csv(features_path, index=False)
print("Model-ready features saved")

Model-ready features saved


load model-ready data

In [36]:
import pandas as pd
import numpy as np

# Reload the merged feature table from disk ("hour" stays a plain string here).
features_path = "C:/Users/Navyashree/Documents/urban-air-quality-forecaster/data/processed/model_features.csv"
df = pd.read_csv(features_path)
df.head()

Unnamed: 0,grid_lat,grid_lon,hour,pm25,no2,o3,temperature,wind_speed,traffic_index
0,12.9,77.63,2026-01-19 10:00:00,96.479902,40.746327,33.020855,29.450234,2.058798,49.188388
1,12.91,77.68,2026-01-18 22:00:00,128.436367,28.942489,101.960714,34.094741,3.603606,9.299782
2,12.92,77.56,2026-01-18 19:00:00,146.515163,28.464076,93.622592,33.795365,0.738026,65.37523
3,12.92,77.63,2026-01-19 15:00:00,105.413565,69.038529,44.56536,31.26907,2.668005,67.062964
4,12.93,77.54,2026-01-19 13:00:00,13.666896,43.048675,52.656615,33.261975,1.500816,46.237521


Identify unique nodes (grid cells)

In [37]:
# One row per distinct (grid_lat, grid_lon) pair, kept in first-appearance order;
# node_id is simply that row's position after the index is reset.
unique_cells = df[["grid_lat","grid_lon"]].drop_duplicates()
nodes = unique_cells.reset_index(drop=True)
nodes = nodes.assign(node_id=nodes.index)
nodes.head()

Unnamed: 0,grid_lat,grid_lon,node_id
0,12.9,77.63,0
1,12.91,77.68,1
2,12.92,77.56,2
3,12.92,77.63,3
4,12.93,77.54,4


Distance-based adjacency logic
- connecting grids that are spatially close

In [38]:
def is_neighbor(row1, row2, threshold = 0.02, tol = 1e-9):
    """Return True when two grid cells are within `threshold` on both axes.

    Two cells count as neighbours when the absolute latitude difference AND
    the absolute longitude difference are each at most `threshold` (a square
    window around the cell, not a circular distance).

    Parameters
    ----------
    row1, row2 : mapping-like (e.g. pandas Series or dict)
        Must provide "grid_lat" and "grid_lon" values.
    threshold : float, default 0.02
        Maximum allowed absolute difference per axis.
    tol : float, default 1e-9
        Slack added to `threshold` to absorb floating-point rounding.
        Without it, cells that are exactly `threshold` apart can be rejected
        because e.g. ``1.02 - 1.0`` evaluates to ``0.020000000000000018``.
        Pass ``tol=0`` for the strict comparison.

    Returns
    -------
    bool
        True if both coordinate differences are within ``threshold + tol``.
    """
    limit = threshold + tol
    return(
        abs(row1["grid_lat"] - row2["grid_lat"]) <= limit and
        abs(row1["grid_lon"] - row2["grid_lon"]) <= limit
    )

Building adjacency matrix

In [39]:
# Square 0/1 matrix with one row and one column per grid-cell node.
num_nodes = len(nodes)
adj_matrix = np.zeros((num_nodes, num_nodes))

# Visit every ordered pair of nodes; the diagonal is skipped so there are no self-loops.
for src, dst in np.ndindex(num_nodes, num_nodes):
    if src == dst:
        continue
    if is_neighbor(nodes.loc[src], nodes.loc[dst]):
        adj_matrix[src, dst] = 1

In [40]:
# Sanity check: the matrix is square, one row/column per grid-cell node.
adj_matrix.shape

(23, 23)

In [41]:
# Peek at the top-left 5x5 corner of the adjacency matrix.
adj_matrix[:5,:5]

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0.]])

In [42]:
# Persist the adjacency matrix in NumPy's .npy format.
adjacency_path = "C:/Users/Navyashree/Documents/urban-air-quality-forecaster/data/processed/adjacency_matrix.npy"
np.save(adjacency_path, adj_matrix)
print("Adjacency matrix saved")

Adjacency matrix saved


Preparing STGNN Input Tensors

- An STGNN needs X (feature tensor) and A (adjacency matrix)

In [9]:
# load everything

import numpy as np
import pandas as pd

# Reload the feature table and the adjacency matrix from disk.
features_path = "C:/Users/Navyashree/Documents/urban-air-quality-forecaster/data/processed/model_features.csv"
adjacency_path = "C:/Users/Navyashree/Documents/urban-air-quality-forecaster/data/processed/adjacency_matrix.npy"

df = pd.read_csv(features_path)
adj_matrix = np.load(adjacency_path)

# Parse timestamps, then truncate each one down to the start of its hour.
hour_stamps = pd.to_datetime(df["hour"])
df["hour"] = hour_stamps.dt.floor("h")
df.head()

Unnamed: 0,grid_lat,grid_lon,hour,pm25,no2,o3,temperature,wind_speed,traffic_index
0,12.9,77.63,2026-01-19 10:00:00,96.479902,40.746327,33.020855,29.450234,2.058798,49.188388
1,12.91,77.68,2026-01-18 22:00:00,128.436367,28.942489,101.960714,34.094741,3.603606,9.299782
2,12.92,77.56,2026-01-18 19:00:00,146.515163,28.464076,93.622592,33.795365,0.738026,65.37523
3,12.92,77.63,2026-01-19 15:00:00,105.413565,69.038529,44.56536,31.26907,2.668005,67.062964
4,12.93,77.54,2026-01-19 13:00:00,13.666896,43.048675,52.656615,33.261975,1.500816,46.237521


In [10]:
# Graph models require consistent node ordering:
# node_id is the position of each distinct (grid_lat, grid_lon) pair,
# taken in order of first appearance in df.
grid_cells = df[["grid_lat","grid_lon"]].drop_duplicates()
nodes = grid_cells.reset_index(drop=True)

nodes = nodes.assign(node_id=nodes.index)
nodes.head()

Unnamed: 0,grid_lat,grid_lon,node_id
0,12.9,77.63,0
1,12.91,77.68,1
2,12.92,77.56,2
3,12.92,77.63,3
4,12.93,77.54,4


In [11]:
# Attach node_id to every row by matching on its grid coordinates.
coord_keys = ["grid_lat","grid_lon"]
df = pd.merge(df, nodes, how="left", on=coord_keys)

df.head()

Unnamed: 0,grid_lat,grid_lon,hour,pm25,no2,o3,temperature,wind_speed,traffic_index,node_id
0,12.9,77.63,2026-01-19 10:00:00,96.479902,40.746327,33.020855,29.450234,2.058798,49.188388,0
1,12.91,77.68,2026-01-18 22:00:00,128.436367,28.942489,101.960714,34.094741,3.603606,9.299782,1
2,12.92,77.56,2026-01-18 19:00:00,146.515163,28.464076,93.622592,33.795365,0.738026,65.37523,2
3,12.92,77.63,2026-01-19 15:00:00,105.413565,69.038529,44.56536,31.26907,2.668005,67.062964,3
4,12.93,77.54,2026-01-19 13:00:00,13.666896,43.048675,52.656615,33.261975,1.500816,46.237521,4


In [12]:
# Columns fed to the model; location and time columns are deliberately left out.
pollutant_cols = ["pm25", "no2", "o3"]
context_cols = ["temperature", "wind_speed", "traffic_index"]

feature_cols = pollutant_cols + context_cols

In [13]:
# Order rows chronologically, then by node within each hour.
sort_keys = ["hour", "node_id"]
df = df.sort_values(sort_keys)

In [17]:
# Create the FULL hour × node grid.
# Both axes are sorted explicitly. Series.unique() returns values in order of
# first appearance, so a node that is missing from the earliest hour would
# otherwise land out of position; the node axis of any tensor reshaped from
# this grid would then no longer follow ascending node_id, which is the
# ordering the adjacency matrix rows/columns use.
all_hours = pd.Index(df["hour"].unique()).sort_values()
all_nodes = pd.Index(df["node_id"].unique()).sort_values()

full_index = pd.MultiIndex.from_product(
    [all_hours, all_nodes],
    names=["hour", "node_id"]
)


In [18]:
# Reindex onto the full (hour, node_id) grid: combinations absent from the
# data become new rows whose other columns are NaN.
by_hour_node = df.set_index(["hour", "node_id"])
df = by_hour_node.reindex(full_index).reset_index()

In [19]:
# Fill missing feature values with 0 (only the model feature columns are touched).
# NOTE(review): a filled 0 is indistinguishable from a genuine zero reading —
# confirm this is acceptable for the model.
zero_fill = dict.fromkeys(feature_cols, 0)
df = df.fillna(value=zero_fill)

In [20]:
# Tensor dimensions: number of feature columns, distinct hours, distinct nodes.
num_features = len(feature_cols)
time_steps, num_nodes = (df[col].nunique() for col in ("hour", "node_id"))

Build STGNN tensor 

In [21]:
# Build X with shape (time_steps, num_nodes, num_features).
# reshape() only regroups rows in their current order, so the frame is sorted
# hour-major / node-minor first. That guarantees X[t, n] belongs to the n-th
# node_id in ascending order, regardless of how earlier cells ordered the rows.
assert len(df) == time_steps * num_nodes, "expected exactly one row per (hour, node_id) pair"

X = (
    df.sort_values(by=["hour", "node_id"])[feature_cols]
    .to_numpy()
    .reshape(
        time_steps,
        num_nodes,
        num_features
    )
)

X.shape

(24, 23, 6)

In [22]:
# verifying: the row count must equal time_steps * num_nodes for the reshape to be valid
actual_rows = len(df)
expected_rows = time_steps * num_nodes

print(actual_rows)
print(expected_rows)

552
552


Y (Target Tensor)

In [23]:
# Sort hour-major, node-minor so that each node's rows run in chronological order.
target_sort_keys = ["hour", "node_id"]
df = df.sort_values(target_sort_keys)

In [24]:
# Creating next-hour PM2.5 target: within each node, take the pm25 value of
# the following row (the last row of every node gets NaN).
pm25_by_node = df.groupby("node_id")["pm25"]
df["pm25_target"] = pm25_by_node.shift(-1)

In [None]:
# Drop last hour: keep only rows that have a target value.
# This removes the final timestamp for each node.
has_target = df["pm25_target"].notna()
df = df[has_target]

In [27]:
# Recalculate dimensions from the rows that remain
num_nodes = df["node_id"].nunique()
time_steps = df["hour"].nunique()

Build Y tensor

In [28]:
# Y[t, n] holds pm25_target for hour t and node n; this relies on df being
# sorted by (hour, node_id).
# NOTE(review): Y has one fewer time step than X (X was built before the final
# hour was dropped) — align them, e.g. pair Y with X[:-1], before training.
target_values = df["pm25_target"].to_numpy()
Y = np.reshape(target_values, (time_steps, num_nodes))

In [29]:
# verify: Y was reshaped to (time_steps, num_nodes)
print("Y shape:", Y.shape)


Y shape: (23, 23)
