In [1]:
import pandas as pd
from fastai.tabular.all import *

# Experiment

In [None]:
df1 = pd.read_csv("./rsqa-indice-qualite-air-2022-2024.csv")
df2 = pd.read_csv("./rsqa-indice-qualite-air-2019-2021.csv")
df = pd.concat([df1, df2], ignore_index=True)

In [None]:
df.isna().sum()

stationId    0
valeur       0
dtype: int64

In [None]:
df['date'] = pd.to_datetime(df['date'])
df['datetime'] = df['date'] + pd.to_timedelta(df['heure'], unit='h')
df.head()

Unnamed: 0,stationId,polluant,valeur,date,heure,datetime
0,103,O3,15,2022-01-15,3,2022-01-15 03:00:00
1,103,NO2,2,2022-01-15,3,2022-01-15 03:00:00
2,103,PM,12,2022-01-15,3,2022-01-15 03:00:00
3,17,CO,1,2022-02-04,21,2022-02-04 21:00:00
4,17,O3,17,2022-02-04,21,2022-02-04 21:00:00


In [None]:
df = df.sort_values(by='datetime')
df.set_index('datetime', inplace=True)

In [None]:
df.drop(['date', 'heure', 'polluant'], axis=1, inplace=True)
df

Unnamed: 0_level_0,stationId,valeur
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01 00:00:00,50,36
2019-01-01 00:00:00,55,37
2019-01-01 00:00:00,28,7
2019-01-01 00:00:00,7,3
2019-01-01 00:00:00,80,1
...,...,...
2024-04-09 23:00:00,3,7
2024-04-09 23:00:00,3,1
2024-04-09 23:00:00,3,2
2024-04-09 23:00:00,50,10


In [None]:
# Feature engineering
df['year'] = df.index.year
df['month'] = df.index.month
df['day'] = df.index.day
df['weekday'] = df.index.weekday
df['hour'] = df.index.hour

Index([2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019,
       ...
       2024, 2024, 2024, 2024, 2024, 2024, 2024, 2024, 2024, 2024],
      dtype='int32', name='datetime', length=1789045)

# Run

In [5]:
def process_csv(x):
  x['date'] = pd.to_datetime(df['date'])
  x['datetime'] = df['date'] + pd.to_timedelta(df['heure'], unit='h')
  x.sort_values(by='datetime', inplace=True)
  x.set_index('datetime', inplace=True)
  x.drop(['date', 'heure', 'polluant', 'stationId'], axis=1, inplace=True)
  x['year'] = df.index.year
  x['month'] = df.index.month
  x['day'] = df.index.day
  x['weekday'] = df.index.weekday
  x['hour'] = df.index.hour

def split(x):
  x_train = x[x.index.year < 2024]
  x_test = x[x.index.year >= 2024]
  return x_train, x_test

# https://donnees.montreal.ca/dataset/rsqa-iqa-historique
df = pd.read_csv("https://donnees.montreal.ca/dataset/547b8052-1710-4d69-8760-beaa3aa35ec6/resource/0c325562-e742-4e8e-8c36-971f3c9e58cd/download/rsqa-indice-qualite-air-2022-2024.csv")
# df2 = pd.read_csv("https://donnees.montreal.ca/dataset/547b8052-1710-4d69-8760-beaa3aa35ec6/resource/e43dc1d6-fbdd-49c3-a79f-83f63404c281/download/rsqa-indice-qualite-air-2019-2021.csv")
# df = pd.concat([df1, df2], ignore_index=True)

process_csv(df)
df_train, df_test = split(df)


In [6]:
df_train

Unnamed: 0_level_0,valeur,year,month,day,weekday,hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-01-01 00:00:00,45,2022,1,1,5,0
2022-01-01 00:00:00,48,2022,1,1,5,0
2022-01-01 00:00:00,47,2022,1,1,5,0
2022-01-01 00:00:00,1,2022,1,1,5,0
2022-01-01 00:00:00,4,2022,1,1,5,0
...,...,...,...,...,...,...
2023-12-31 23:00:00,1,2023,12,31,6,23
2023-12-31 23:00:00,7,2023,12,31,6,23
2023-12-31 23:00:00,1,2023,12,31,6,23
2023-12-31 23:00:00,15,2023,12,31,6,23


In [7]:
dls = to = TabularPandas(
  df_train,
  procs=[Normalize],
  cat_names=[],
  cont_names=['year', 'month', 'day', 'weekday', 'hour'],
  y_names=['valeur'],
  splits=RandomSplitter(valid_pct=0.2, seed=42)(range_of(df_train))
).dataloaders()

dls.show_batch()

Unnamed: 0,year,month,day,weekday,hour,valeur
0,2023.0,10.0,5.0,3.0,10.0,2.0
1,2023.0,4.0,17.0,9.52896e-08,4.0,5.0
2,2022.0,1.0,23.0,6.0,23.0,15.0
3,2022.0,5.0,24.0,0.9999999,4.0,0.0
4,2023.0,8.0,22.0,0.9999999,14.0,1.0
5,2022.0,2.0,19.0,5.0,18.0,19.0
6,2023.0,8.0,11.0,4.0,17.0,1.0
7,2023.0,6.0,12.0,9.52896e-08,4.0,19.0
8,2023.0,6.0,13.0,0.9999999,5.0,1.0
9,2023.0,10.0,14.0,5.0,18.0,0.0


In [8]:
learn = tabular_learner(dls, metrics=accuracy)
learn.fit_one_cycle(5)

epoch,train_loss,valid_loss,accuracy,time
0,92.591698,88.888367,0.016779,00:49
1,95.413315,88.127533,0.016779,00:50
2,92.883026,87.151207,0.016779,00:51
3,86.662842,86.635536,0.016779,00:50
4,91.103134,86.342896,0.016779,00:49
