In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("./aqi-2022-2024.csv")
df.rename(columns={
    'polluant': 'pollutant',
    'valeur': 'value',
    'date': 'date',
    'heure': 'hour'
}, inplace=True)
df.head()

Unnamed: 0,stationId,pollutant,value,date,hour
0,103,O3,15,2022-01-15,3
1,103,NO2,2,2022-01-15,3
2,103,PM,12,2022-01-15,3
3,17,CO,1,2022-02-04,21
4,17,O3,17,2022-02-04,21


In [3]:
# Creating a datetime column for easier manipulation later on
df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['hour'].astype(str) + ':00:00',
                                format = '%Y-%m-%d %H:%M:%S',
                                errors = 'coerce')
# Dropping columns we won't need anymore
df.drop(["hour", "pollutant", "date"], axis=1, inplace=True)

# Indexing by station and datetime and grab the maximum value for each index (related to our assumption earlier)
df = df.groupby(['stationId', 'datetime']).max("value")

df

Unnamed: 0_level_0,Unnamed: 1_level_0,value
stationId,datetime,Unnamed: 2_level_1
3,2022-01-01 00:00:00,57
3,2022-01-01 01:00:00,58
3,2022-01-01 02:00:00,60
3,2022-01-01 03:00:00,62
3,2022-01-01 04:00:00,68
...,...,...
103,2024-06-15 19:00:00,21
103,2024-06-15 20:00:00,19
103,2024-06-15 21:00:00,19
103,2024-06-15 22:00:00,15


In [4]:
df = df.groupby("datetime").mean("value").reset_index()
# Sorting the dataframe by datetime for better visualization
df.sort_values("datetime", inplace=True, ascending=False)
df

Unnamed: 0,datetime,value
21527,2024-06-15 23:00:00,12.545455
21526,2024-06-15 22:00:00,14.727273
21525,2024-06-15 21:00:00,17.090909
21524,2024-06-15 20:00:00,18.363636
21523,2024-06-15 19:00:00,19.636364
...,...,...
4,2022-01-01 04:00:00,65.363636
3,2022-01-01 03:00:00,65.454545
2,2022-01-01 02:00:00,60.272727
1,2022-01-01 01:00:00,53.727273


In [5]:
# This station is located close to the airport and is the one I found holds the most interesting data
station_id="30165"
year="2024"
month="6"
# Beginning of the month
day="1"
# Hourly data
timeframe="1"
weather_df = pd.read_csv(f"https://climate.weather.gc.ca/climate_data/bulk_data_e.html?format=csv&stationID={station_id}&Year={year}&Month={month}&Day={day}&timeframe={timeframe}&submit=Download+Data")
weather_df.rename(columns={'Date/Time (LST)': 'datetime', "Temp (°C)": "temp", "Precip. Amount (mm)": "precip", "Rel Hum (%)": "rel_humid"}, inplace=True)
weather_df['datetime'] = pd.to_datetime(weather_df['datetime'])
weather_df = weather_df[['datetime', "temp", "rel_humid", "precip"]]
weather_df.head()

Unnamed: 0,datetime,temp,rel_humid,precip
0,2024-06-01 00:00:00,17.2,58.0,0.0
1,2024-06-01 01:00:00,16.6,61.0,0.0
2,2024-06-01 02:00:00,15.1,66.0,0.0
3,2024-06-01 03:00:00,14.5,67.0,0.0
4,2024-06-01 04:00:00,14.1,67.0,0.0


In [12]:
merged_df = pd.merge(df, weather_df, on="datetime", how="left")
merged_df.head()

Unnamed: 0,datetime,value,temp,rel_humid,precip
0,2024-06-15 23:00:00,12.545455,13.6,61.0,0.0
1,2024-06-15 22:00:00,14.727273,13.4,59.0,0.0
2,2024-06-15 21:00:00,17.090909,15.1,51.0,0.0
3,2024-06-15 20:00:00,18.363636,16.3,47.0,0.0
4,2024-06-15 19:00:00,19.636364,18.2,40.0,0.0


In [13]:
from sklearn.preprocessing import MinMaxScaler

merged_df['year'] = merged_df['datetime'].dt.year
# Year has a bigger range than the rest so we divide it up by its maximum to scale it down.
merged_df['year'] = merged_df['year'] / merged_df['year'].max()
merged_df['month'] = merged_df['datetime'].dt.month
merged_df['day'] = merged_df['datetime'].dt.day
merged_df['hour'] = merged_df['datetime'].dt.hour
merged_df['weekday'] = merged_df['datetime'].dt.weekday

# Values above 100 are extreme outliers (and very rare for Montreal). Clamping helps the model not being influenced too much from these rare events.
merged_df['value'] = merged_df['value'].clip(upper=100)

# We will need to retain a reference to this scaler so remember this!
scaler = MinMaxScaler()

merged_df['value'] = scaler.fit_transform(merged_df[['value']])
merged_df.head()

Unnamed: 0,datetime,value,temp,rel_humid,precip,year,month,day,hour,weekday
0,2024-06-15 23:00:00,0.089015,13.6,61.0,0.0,1.0,6,15,23,5
1,2024-06-15 22:00:00,0.111742,13.4,59.0,0.0,1.0,6,15,22,5
2,2024-06-15 21:00:00,0.136364,15.1,51.0,0.0,1.0,6,15,21,5
3,2024-06-15 20:00:00,0.149621,16.3,47.0,0.0,1.0,6,15,20,5
4,2024-06-15 19:00:00,0.162879,18.2,40.0,0.0,1.0,6,15,19,5
