In [81]:
import pandas as pd
import numpy as np
import os

We define labels for the columns we need from the CSVs.

We also load the `.csv` data sets from the `data/raw/` folder of the repository.

Finally, we prepare each data set to use its timestamp as the index of the dataframe.

In [82]:
# Names of the columns we need from the CSVs.
WATER_LEVEL_COL = "Water Level (meters) - Niveau d'eau (metres)"
DATE_COL = "Date - Date"
OUTFLOW_COL = "m^3/s"


def _load_indexed_by_date(csv_path):
    """Read one CSV and return it indexed by its parsed ``DATE_COL`` values.

    Parameters
    ----------
    csv_path : str
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file's contents with ``DATE_COL`` converted via ``pd.to_datetime``
        and moved from the columns into the index.
    """
    frame = pd.read_csv(csv_path)
    parsed_dates = pd.to_datetime(frame[DATE_COL])
    # Build a new frame instead of mutating in place so re-running the cell
    # always starts again from the file contents.
    return frame.assign(**{DATE_COL: parsed_dates}).set_index(DATE_COL)


# One datetime-indexed frame per raw file.
cornwall_water_lvl = _load_indexed_by_date("./data/raw/Cornwall_WaterLevelChanges_2024_CLEAN.csv")
longsault_water_lvl = _load_indexed_by_date("./data/raw/LongSaultDam_WaterLevelChanges_2024_CLEAN.csv")
lakeontario_outflow_changes = _load_indexed_by_date("./data/raw/LakeOntarioOutflowChanges_2024_CLEAN.csv")

In [83]:
# Expand the outflow records onto a regular hourly grid, carrying the previous
# known value forward into every hour that has no record of its own.

# Rows without a timestamp cannot be placed on the grid; drop them, then sort:
# the grid starts at the earliest record and `method="ffill"` below requires a
# monotonically increasing index.
lakeontario_outflow_changes = lakeontario_outflow_changes[
    lakeontario_outflow_changes.index.notna()
].sort_index()

start = lakeontario_outflow_changes.index[0]
# end is set to the last timestamp in 2024 plus one minute since datasets are recorded hourly plus one minute
end = pd.Timestamp("2024-12-31 23:01:00")

# Carry the original index name over to the new grid. Reindexing onto an
# unnamed DatetimeIndex would drop it, and a CSV written from the result would
# have an empty header for the date column.
hourly_index = pd.date_range(
    start=start,
    end=end,
    freq="h",
    name=lakeontario_outflow_changes.index.name,
)

# `method="ffill"` takes, for each hourly stamp, the latest record at or before
# it, so records that fall between grid points are used rather than discarded.
lakeontario_outflow_changes = lakeontario_outflow_changes.reindex(hourly_index, method="ffill")
# Second pass: fill NaN values that were already present in the source rows.
lakeontario_outflow_changes = lakeontario_outflow_changes.ffill()

In [84]:
# Write the outflow frame to the processed-data folder, creating the folder
# first if it does not exist yet.
processed_dir = "./data/processed/"
hourly_csv_path = "./data/processed/LakeOntarioOutflowChanges_2024_HOURLY.csv"

os.makedirs(processed_dir, exist_ok=True)
lakeontario_outflow_changes.to_csv(hourly_csv_path)