In [5]:
import pandas as pd
from pytorch_tcn import TCN
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler

model = TCN(
    num_inputs = 2,
    num_channels=[16, 32],
    kernel_size = 4,
    # dilation_reset: Optional[ int ] = None,
    dropout = 0.1,
    causal = True,
    use_norm = 'weight_norm',
    activation = 'relu',
    kernel_initializer = 'xavier_uniform',
    use_skip_connections = False,
    # input_shape: str = 'NCL',
    # embedding_shapes: Optional[ ArrayLike ] = None,
    # embedding_mode: str = 'add',
    # use_gate: bool = False,
    output_projection = 1,
    output_activation = 'relu',
)
df = pd.read_csv("dataset.csv")


In [6]:

pd.options.display.max_rows = 999
count_by_sec_and_mrc = df.groupby(["sector", "mrc"]).count().unstack().tavg.T.reset_index()

for i, row in count_by_sec_and_mrc.iterrows():
    mrc = row["mrc"]
    row = row.drop("mrc")
    if (row == 96).all():
        pass
    else:
        print(mrc)
        print((row == 96))


def remove_incomplete_mrc_sectors(df):
    unwanted_mrc_sectors = [
        ("Administration régionale Kativik", "AGRICOLE"),
        ("Administration régionale Kativik", "INDUSTRIEL"),
        ("Caniapiscau", "AGRICOLE"),
        ("Le Golfe-du-Saint-Laurent", "AGRICOLE")
    ]

    for mrc, sector in unwanted_mrc_sectors:
        df = df[~((df["mrc"] == mrc) & (df["sector"] == sector))]


    return df

df = remove_incomplete_mrc_sectors(df)
df["sector_mrc"] = df["sector"] + df["mrc"]
sector_mrcs = df["sector_mrc"].unique()

Administration régionale Kativik
sector
AGRICOLE          False
COMMERCIAL         True
INDUSTRIEL        False
INSTITUTIONNEL     True
RÉSIDENTIEL        True
Name: 3, dtype: bool
Caniapiscau
sector
AGRICOLE          False
COMMERCIAL         True
INDUSTRIEL         True
INSTITUTIONNEL     True
RÉSIDENTIEL        True
Name: 15, dtype: bool
Le Golfe-du-Saint-Laurent
sector
AGRICOLE          False
COMMERCIAL         True
INDUSTRIEL         True
INSTITUTIONNEL     True
RÉSIDENTIEL        True
Name: 45, dtype: bool


In [7]:


def preprocess(df):
    new_df = pd.DataFrame()
    for sector_mrc in sector_mrcs:
        sector_mrc_df = df[df["sector_mrc"] == sector_mrc].sort_index()
        sector_mrc_df["total_kwh"] = sector_mrc_df["total_kwh"].interpolate(method="index")

        new_df = pd.concat([new_df, sector_mrc_df])

    return new_df
df = preprocess(df)

df = df.set_index(pd.to_datetime(df.date, format="%Y-%m-%d")).sort_index()


train_df = df["2016":"2022"]
test_df = df["2023":]
df = df.drop(columns=["date"])
df = df.sort_values(["mrc", "sector", "date"])

feature_df = df[["total_kwh", "tavg"]]

# scaler = StandardScaler()
# scaled_df = scaler.fit_transform(df)
# scaled_df = torch.tensor(scaled_df).float()


In [8]:
import random


def get_sequence(df: pd.DataFrame):
    mrc = random.choice(sector_mrcs)
    # print(mrc)
    sequence = df[df.sector_mrc == mrc]
    sequence = sequence[["total_kwh", "tavg"]]

    # Label is the next total_kwh
    sequence["label"] = sequence.total_kwh.shift(-1)
    # Drop the last row
    sequence = sequence.dropna()

    sequence = torch.tensor(sequence.values).float()

    # print(sequence.shape)

    return sequence

def get_n_sequences(n):
    stacked = torch.stack([get_sequence(train_df) for _ in range(n)])
    return stacked


def get_windowed_sequence(sequence, window_size=10):
    '''
    Args:
        - Sequence: first dimension is the batch size, second dimension is the sequence length, third dimension is the number of features
    Returns:
        - Windowed sequence: first dimension is the batch size, second dimension is the sequence length, third dimension is the number of features * window_size
    '''
    batch_size, sequence_length, num_features = sequence.shape
    windowed_sequence = torch.zeros((batch_size, sequence_length, num_features * (1 + window_size)))
    windowed_sequence[:, :, :num_features] = sequence
    for i in range(1, window_size + 1):
        windowed_sequence[:, i:, num_features * i:num_features * (i + 1)] = sequence[:, :-i, :]

    # Drop rows for which the window is incomplete
    windowed_sequence = windowed_sequence[:, window_size:, :]

    # print(windowed_sequence.shape)

    return windowed_sequence


In [9]:
EPOCHS = 20
BATCH_SIZE = 2
WINDOW_SIZE = 11
STRIDE = 1


INPUT_FEATURES = 2 * (1 + WINDOW_SIZE)

optim = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.MSELoss()

for epoch in range(EPOCHS):
    for i in range(0, len(train_df), BATCH_SIZE):
        batch = get_n_sequences(BATCH_SIZE)
        # windowed_batch = get_windowed_sequence(batch, WINDOW_SIZE)
        batch = batch.permute(0, 2, 1)
        print(batch.shape)
        optim.zero_grad()
        output = model(batch)
        loss = criterion(output, batch[:, -1, -1])
        loss.backward()
        optim.step()
        print(f"Epoch {epoch} Batch {i} Loss {loss.item()}")

torch.Size([2, 3, 83])


RuntimeError: Given groups=1, weight of size [16, 2, 4], expected input[2, 3, 86] to have 2 channels, but got 3 channels instead

In [None]:
windowed_batch.shape

In [None]:
# Reverse dimensions to go from (batch_size, sequence_length, num_features) to (batch_size, num_features, sequence_length)
# windowed_batch = windowed_batch.permute(0, 2, 1)
windowed_batch.shape