In [201]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [202]:
# Station measurements, one CSV per year (2022-2024), read in year order.
station2022, station2023, station2024 = map(
    pd.read_csv,
    (
        './data/station/2022_MM_station.csv',
        './data/station/2023_MM_station.csv',
        './data/station/2024_MM_station.csv',
    ),
)

# ERA5 tables for the same three years, read in year order.
era5_2022, era5_2023, era5_2024 = map(
    pd.read_csv,
    (
        './data/satellite/ERA5/ERA5_2022_MM.csv',
        './data/satellite/ERA5/ERA5_2023_MM.csv',
        './data/satellite/ERA5/ERA5_2024_MM.csv',
    ),
)

In [203]:
# Training period: 2022 followed by 2023 (2024 is kept apart as the test year).
#
# ignore_index=True gives each stacked frame a fresh 0..N-1 index. Without it
# the second year's rows reuse the first year's row labels, so every label
# appears twice; any later label-based operation (the column-wise join of the
# station and ERA5 frames in particular) then depends on both frames happening
# to carry exactly the same duplicated index.
train = pd.concat([station2022, station2023], ignore_index=True)
era5 = pd.concat([era5_2022, era5_2023], ignore_index=True)


In [204]:
# Strip the three non-feature columns from both ERA5 frames (train years and 2024).
# NOTE(review): 'system:index' / '.geo' look like export metadata columns — confirm.
# Dropping 'time' means the ERA5 rows can only be matched to station rows by position.
era5, era5_2024 = (
    frame.drop(columns=['system:index', 'time', '.geo'])
    for frame in (era5, era5_2024)
)


In [205]:
# Join station and ERA5 columns side by side.
#
# The ERA5 'time' column has already been dropped, so the only thing tying an
# ERA5 row to a station row is its position. pd.concat(axis=1) aligns on index
# *labels*, not positions, so reset both sides to a clean 0..N-1 index first;
# this keeps the join positional even if an upstream frame carries duplicated
# or shifted labels.
if len(train) != len(era5):
    raise ValueError(
        f"train/ERA5 row count mismatch: {len(train)} station rows vs {len(era5)} ERA5 rows"
    )
if len(station2024) != len(era5_2024):
    raise ValueError(
        f"test/ERA5 row count mismatch: {len(station2024)} station rows vs {len(era5_2024)} ERA5 rows"
    )

train_csv = pd.concat(
    [train.reset_index(drop=True), era5.reset_index(drop=True)],
    axis=1,
)
test_csv = pd.concat(
    [station2024.reset_index(drop=True), era5_2024.reset_index(drop=True)],
    axis=1,
)

# Remove columns that contain no data at all.
# NOTE(review): the two frames are filtered independently; a column that is
# entirely empty in only one of them would leave train and test with different
# schemas — the column listings printed in the next cell are the check for that.
train_csv = train_csv.dropna(axis=1, how='all')
test_csv = test_csv.dropna(axis=1, how='all')

In [206]:
# Show the train and test schemas one after the other so they can be compared by eye.
print(train_csv.columns, test_csv.columns, sep='\n')


Index(['Timestamp', 'PM2.5 (µg/m³)', 'PM10 (µg/m³)', 'NO (µg/m³)',
       'NO2 (µg/m³)', 'NOx (ppb)', 'NH3 (µg/m³)', 'SO2 (µg/m³)', 'CO (mg/m³)',
       'Ozone (µg/m³)', 'Benzene (µg/m³)', 'Toluene (µg/m³)', 'Xylene (µg/m³)',
       'AT (°C)', 'RH (%)', 'WS (m/s)', 'WD (deg)', 'TOT-RF (mm)',
       'SR (W/mt2)', 'BP (mmHg)', 'VWS (m/s)', 'total_cloud_cover',
       'total_precipitation'],
      dtype='object')
Index(['Timestamp', 'PM2.5 (µg/m³)', 'PM10 (µg/m³)', 'NO (µg/m³)',
       'NO2 (µg/m³)', 'NOx (ppb)', 'NH3 (µg/m³)', 'SO2 (µg/m³)', 'CO (mg/m³)',
       'Ozone (µg/m³)', 'Benzene (µg/m³)', 'Toluene (µg/m³)', 'Xylene (µg/m³)',
       'AT (°C)', 'RH (%)', 'WS (m/s)', 'WD (deg)', 'TOT-RF (mm)',
       'SR (W/mt2)', 'BP (mmHg)', 'VWS (m/s)', 'total_cloud_cover',
       'total_precipitation'],
      dtype='object')


In [207]:
# Derived features for the training frame.
#
# NOTE(review): pd.to_datetime is called without an explicit format; confirm the
# Timestamp strings parse unambiguously (day-first vs month-first).
df_time = pd.to_datetime(train_csv["Timestamp"])

train_csv = train_csv.assign(**{
    # Wind speed/direction decomposed into two orthogonal components.
    # NOTE(review): "u" uses cos and "v" uses sin with no sign flip; the usual
    # meteorological convention for a direction-from angle is u = -ws*sin(wd),
    # v = -ws*cos(wd) — confirm the naming is intentional.
    "u_comp_wind": lambda d: d["WS (m/s)"] * np.cos(np.radians(d["WD (deg)"])),
    "v_comp_wind": lambda d: d["WS (m/s)"] * np.sin(np.radians(d["WD (deg)"])),
    # Negative precipitation values are floored at zero (NaN stays NaN).
    "total_precipitation": lambda d: np.maximum(0, d["total_precipitation"]),
    # Calendar features taken from the parsed timestamp.
    "hour": df_time.dt.hour,
    "day-of-week": df_time.dt.dayofweek,
    "day-of-year": df_time.dt.dayofyear,
    "month": df_time.dt.month,
    "quarter": df_time.dt.quarter,
    # pandas numbers weekdays Monday=0, so 5 and 6 are Saturday and Sunday.
    "is_weekend": lambda d: d["day-of-week"].isin([5, 6]).astype(int),
    # Linear dew-point estimate: temperature minus (100 - RH) / 5.
    "dewpoint": lambda d: d["AT (°C)"] - ((100 - d["RH (%)"]) / 5),
})


In [208]:
# Derived features for the 2024 test frame — same recipe as the training cell.
#
# NOTE(review): pd.to_datetime is called without an explicit format; confirm the
# Timestamp strings parse unambiguously (day-first vs month-first).
df_time = pd.to_datetime(test_csv["Timestamp"])

test_csv = test_csv.assign(**{
    # Wind speed/direction decomposed into two orthogonal components.
    # NOTE(review): "u" uses cos and "v" uses sin with no sign flip; the usual
    # meteorological convention for a direction-from angle is u = -ws*sin(wd),
    # v = -ws*cos(wd) — confirm the naming is intentional.
    "u_comp_wind": lambda d: d["WS (m/s)"] * np.cos(np.radians(d["WD (deg)"])),
    "v_comp_wind": lambda d: d["WS (m/s)"] * np.sin(np.radians(d["WD (deg)"])),
    # Negative precipitation values are floored at zero (NaN stays NaN).
    "total_precipitation": lambda d: np.maximum(0, d["total_precipitation"]),
    # Calendar features taken from the parsed timestamp.
    "hour": df_time.dt.hour,
    "day-of-week": df_time.dt.dayofweek,
    "day-of-year": df_time.dt.dayofyear,
    "month": df_time.dt.month,
    "quarter": df_time.dt.quarter,
    # pandas numbers weekdays Monday=0, so 5 and 6 are Saturday and Sunday.
    "is_weekend": lambda d: d["day-of-week"].isin([5, 6]).astype(int),
    # Linear dew-point estimate: temperature minus (100 - RH) / 5.
    "dewpoint": lambda d: d["AT (°C)"] - ((100 - d["RH (%)"]) / 5),
})

In [209]:
# Timestamp, RH, WS and WD have been turned into derived features above, so the
# raw columns are removed. Xylene and TOT-RF are removed as well; the notebook
# does not state why.
train_csv = train_csv.drop(
    [
        "Timestamp",
        "RH (%)",
        "WS (m/s)",
        "WD (deg)",
        "Xylene (µg/m³)",
        "TOT-RF (mm)",
    ],
    axis=1,
)
print(train_csv.shape)

# Remember the column order: the imputer returns a bare array and the frame is
# rebuilt from these names afterwards.
columns = train_csv.columns

(17520, 26)


In [210]:
# Same column removal as for the training frame, applied to the 2024 test frame.
test_csv = test_csv.drop(
    [
        "Timestamp",
        "RH (%)",
        "WS (m/s)",
        "WD (deg)",
        "Xylene (µg/m³)",
        "TOT-RF (mm)",
    ],
    axis=1,
)
print(test_csv.shape)

# Column order kept for rebuilding the frame after imputation.
test_columns = test_csv.columns

(8784, 26)


In [211]:
from sklearn.impute import KNNImputer

# Fill missing values from the 4 nearest rows, weighted by inverse distance;
# "nan_euclidean" measures distance over the coordinates both rows have.
# NOTE(review): the imputer sees unscaled columns, so wide-ranged features will
# dominate the neighbour search, and the target columns (NO2, Ozone) are imputed
# along with the predictors — confirm both are intended.
imputer = KNNImputer(
    missing_values=np.nan,
    n_neighbors=4,
    weights="distance",
    metric="nan_euclidean",
)

# fit_transform returns a plain array; wrap it back into a frame with the saved
# column names (the row index becomes a fresh 0..N-1 range).
train_csv = pd.DataFrame(imputer.fit_transform(train_csv), columns=columns)

In [212]:
# Impute the test frame with the imputer fitted on the training years only
# (transform, not fit_transform), then restore the column names.
test_csv = pd.DataFrame(imputer.transform(test_csv), columns=test_columns)

In [213]:
print(train_csv.columns)

# Targets: NO2 and Ozone, copied out of the (already imputed) training frame.
# The selection does not remove them from train_csv — the second print below
# shows the same column list as the first.
labels = train_csv.loc[:, ["NO2 (µg/m³)", "Ozone (µg/m³)"]]

print(train_csv.columns)

Index(['PM2.5 (µg/m³)', 'PM10 (µg/m³)', 'NO (µg/m³)', 'NO2 (µg/m³)',
       'NOx (ppb)', 'NH3 (µg/m³)', 'SO2 (µg/m³)', 'CO (mg/m³)',
       'Ozone (µg/m³)', 'Benzene (µg/m³)', 'Toluene (µg/m³)', 'AT (°C)',
       'SR (W/mt2)', 'BP (mmHg)', 'VWS (m/s)', 'total_cloud_cover',
       'total_precipitation', 'u_comp_wind', 'v_comp_wind', 'hour',
       'day-of-week', 'day-of-year', 'month', 'quarter', 'is_weekend',
       'dewpoint'],
      dtype='object')
Index(['PM2.5 (µg/m³)', 'PM10 (µg/m³)', 'NO (µg/m³)', 'NO2 (µg/m³)',
       'NOx (ppb)', 'NH3 (µg/m³)', 'SO2 (µg/m³)', 'CO (mg/m³)',
       'Ozone (µg/m³)', 'Benzene (µg/m³)', 'Toluene (µg/m³)', 'AT (°C)',
       'SR (W/mt2)', 'BP (mmHg)', 'VWS (m/s)', 'total_cloud_cover',
       'total_precipitation', 'u_comp_wind', 'v_comp_wind', 'hour',
       'day-of-week', 'day-of-year', 'month', 'quarter', 'is_weekend',
       'dewpoint'],
      dtype='object')


In [214]:

# Same two target columns, taken from the imputed 2024 test frame.
test_labels = test_csv.loc[:, ["NO2 (µg/m³)", "Ozone (µg/m³)"]]


In [215]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardise every column using statistics from the training years only; the
# fitted scaler is reused unchanged on the test frame.
# NOTE(review): train_csv still contains the two target columns
# ("NO2 (µg/m³)", "Ozone (µg/m³)"), so they flow into the scaled/PCA features
# that are saved next to the labels. If the downstream model predicts the
# same-row targets this is target leakage — confirm it is intended (e.g. the
# features are used as lagged inputs).
scaler = StandardScaler()
scaler.fit(train_csv)
X_scaled = scaler.transform(train_csv)

# Project the 26 standardised columns onto 18 principal components.
# random_state is pinned because, depending on the scikit-learn version,
# svd_solver="auto" can choose the randomized solver for this shape; without a
# seed the saved arrays would then differ slightly from run to run. The seed
# is ignored by the deterministic solvers.
# NOTE(review): 18 is unexplained here — consider recording the retained
# explained variance next to it.
pca = PCA(n_components=18, random_state=42)
pca.fit(X_scaled)
X_reduced = pca.transform(X_scaled)
print(X_reduced.shape)


(17520, 18)


In [216]:

# Apply the train-fitted scaler and PCA to the test frame; nothing is refitted
# here, so no test-set statistics enter the transformation.
test_scaled = scaler.transform(test_csv)
test_reduced = pca.transform(test_scaled)

In [217]:
# Sanity check: (rows, components) of the projected test set.
np.shape(test_reduced)

(8784, 18)

In [218]:
from pathlib import Path

# Convert the label frames to plain arrays. np.asarray is used instead of
# .to_numpy() so that re-running this cell is safe: after the first run the
# names are already ndarrays, which have no .to_numpy() method.
labels = np.asarray(labels)
test_labels = np.asarray(test_labels)

# np.save does not create missing parent folders, so make sure both output
# directories exist before writing.
for out_dir in ('./processed_data/station', './processed_data/labels'):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

# PCA-reduced feature matrices and their matching target arrays.
np.save('./processed_data/station/train.npy', X_reduced)
np.save('./processed_data/labels/train_labels.npy', labels)
np.save('./processed_data/station/test.npy', test_reduced)
np.save('./processed_data/labels/test_labels.npy', test_labels)