In [63]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [64]:
import pandas as pd
import torch
from torch import nn
from scipy import stats
import numpy as np
from sklearn import preprocessing
from datetime import datetime

In [65]:
# Read the training CSV and append `y_prev`, the `y` value of the previous row
# (the first row has no predecessor, so its `y_prev` is NaN).
df = pd.read_csv("./data/no1_train.csv").assign(
    y_prev=lambda frame: frame["y"].shift(1)
)
df.describe()

Unnamed: 0,hydro,micro,thermal,wind,river,total,y,sys_reg,flow,y_prev
count,225088.0,225088.0,225088.0,225088.0,225088.0,225088.0,225088.0,225088.0,225088.0,225087.0
mean,1888.334127,224.189826,21.803497,55.703266,0.0,2190.031963,8.907921,-8.284305,-1999.055081,8.907633
std,401.000305,58.559855,3.579229,47.053033,0.0,444.07773,320.262539,43.748315,1311.682286,320.263221
min,683.438,80.371,0.0,0.0,0.0,849.732,-1579.680903,-828.0,-5541.2,-1579.680903
25%,1625.029,185.017,22.1,15.704,0.0,1916.523,-142.2267,0.0,-2996.7,-142.22696
50%,1933.916,225.938,22.1,41.848,0.0,2248.874,-11.282351,0.0,-1953.7,-11.283806
75%,2175.595,269.558,24.1,88.723,0.0,2497.792,127.454943,0.0,-906.9,127.455918
max,2995.524,349.271,25.7,176.0,0.0,3351.974,2956.333317,474.0,723.4,2956.333317


In [66]:
# Columns that the preprocessing loop filters by quantile and then rescales.
preprocess_columns = [
    "hydro",
    "micro",
    "thermal",
    "wind",
    "total",
    "sys_reg",
    "flow",
    "y_prev",
    "y",
]

# Data preprocessing

### Filter outliers, then normalize

In [62]:
def filter_column_based_on_quantile(df, q, col):
    """Drop the rows whose ``col`` value lies in either ``q`` tail.

    Rows are kept only when ``q_low < df[col] < q_hi``, where ``q_low`` and
    ``q_hi`` are the ``q`` and ``1 - q`` quantiles of ``df[col]``. Both
    comparisons are strict, so rows equal to a bound are dropped, and so are
    rows where ``col`` is NaN (comparisons against NaN are false).

    Args:
        df: DataFrame to filter; it is not modified.
        q: Fraction to cut from each end of the distribution, e.g. ``0.01``.
        col: Name of the column the quantiles are computed on.

    Returns:
        A new, independent DataFrame with the surviving rows, keeping their
        original index labels.
    """
    q_low = df[col].quantile(q)
    q_hi = df[col].quantile(1 - q)
    keep = (df[col] < q_hi) & (df[col] > q_low)
    # Return an explicit copy so callers can assign columns on the result
    # without pandas emitting SettingWithCopyWarning.
    return df.loc[keep].copy()

# For each column in turn: drop the rows in its 1% tails, then min-max scale
# the surviving values. Filtering is cumulative, so later columns compute
# their quantiles on the rows that survived the earlier columns.
for col in preprocess_columns:
    df = filter_column_based_on_quantile(df, 0.01, col)
    col_min = df[col].min()
    col_max = df[col].max()
    df[col] = (df[col] - col_min) / (col_max - col_min)

df.describe()

0

In [67]:
# Display the DataFrame for a visual check.
df

Unnamed: 0,start_time,hydro,micro,thermal,wind,river,total,y,sys_reg,flow,y_prev
0,2019-01-09 14:10:00,1591.839,141.928,11.5,74.428,0.0,1819.695,341.330021,10.0,-4417.4,
1,2019-01-09 14:15:00,1591.839,141.928,11.5,74.428,0.0,1819.695,330.114330,10.0,-4417.4,341.330021
2,2019-01-09 14:20:00,1591.839,141.928,11.5,74.428,0.0,1819.695,323.877221,10.0,-4417.4,330.114330
3,2019-01-09 14:25:00,1591.839,141.928,11.5,74.428,0.0,1819.695,296.438463,10.0,-4417.4,323.877221
4,2019-01-09 14:30:00,1591.839,141.928,11.5,74.428,0.0,1819.695,269.097132,10.0,-4417.4,296.438463
...,...,...,...,...,...,...,...,...,...,...,...
225083,2021-03-01 03:05:00,1942.486,191.881,20.1,106.431,0.0,2266.098,141.618915,0.0,-2252.9,141.026229
225084,2021-03-01 03:10:00,1942.486,191.881,20.1,106.431,0.0,2266.098,129.043820,0.0,-2252.9,141.618915
225085,2021-03-01 03:15:00,1942.486,191.881,20.1,106.431,0.0,2266.098,94.907548,0.0,-2252.9,129.043820
225086,2021-03-01 03:20:00,1942.486,191.881,20.1,106.431,0.0,2266.098,102.589040,0.0,-2252.9,94.907548


### Add time features

In [79]:
# Inspect the start_time column as loaded from the CSV.
df["start_time"]

0         2019-01-09 14:10:00
1         2019-01-09 14:15:00
2         2019-01-09 14:20:00
3         2019-01-09 14:25:00
4         2019-01-09 14:30:00
                 ...         
225083    2021-03-01 03:05:00
225084    2021-03-01 03:10:00
225085    2021-03-01 03:15:00
225086    2021-03-01 03:20:00
225087    2021-03-01 03:25:00
Name: start_time, Length: 225088, dtype: object

In [89]:
# Convert the timestamp column to datetimes using an explicit format string,
# which makes the `.dt` accessor available on it.
df["start_time"] = pd.to_datetime(
    df["start_time"],
    format="%Y-%m-%d %H:%M:%S",
)

In [96]:
# Series.dt.dayofweek numbers the days Monday=0 through Sunday=6.
print(df["start_time"].dt.dayofweek)

0         2
1         2
2         2
3         2
4         2
         ..
225083    0
225084    0
225085    0
225086    0
225087    0
Name: start_time, Length: 225088, dtype: int64


In [98]:
# Derive calendar features from the parsed timestamp:
#   time_of_day  -> hour of the day
#   time_of_week -> day of the week (Monday=0)
#   time_of_year -> month number
df = df.assign(
    time_of_day=df["start_time"].dt.hour,
    time_of_week=df["start_time"].dt.dayofweek,
    time_of_year=df["start_time"].dt.month,
)

In [99]:
# Display the DataFrame again to check the added time_* columns.
df

Unnamed: 0,start_time,hydro,micro,thermal,wind,river,total,y,sys_reg,flow,y_prev,time_of_day,time_of_week,time_of_year
0,2019-01-09 14:10:00,1591.839,141.928,11.5,74.428,0.0,1819.695,341.330021,10.0,-4417.4,,14,2,1
1,2019-01-09 14:15:00,1591.839,141.928,11.5,74.428,0.0,1819.695,330.114330,10.0,-4417.4,341.330021,14,2,1
2,2019-01-09 14:20:00,1591.839,141.928,11.5,74.428,0.0,1819.695,323.877221,10.0,-4417.4,330.114330,14,2,1
3,2019-01-09 14:25:00,1591.839,141.928,11.5,74.428,0.0,1819.695,296.438463,10.0,-4417.4,323.877221,14,2,1
4,2019-01-09 14:30:00,1591.839,141.928,11.5,74.428,0.0,1819.695,269.097132,10.0,-4417.4,296.438463,14,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225083,2021-03-01 03:05:00,1942.486,191.881,20.1,106.431,0.0,2266.098,141.618915,0.0,-2252.9,141.026229,3,0,3
225084,2021-03-01 03:10:00,1942.486,191.881,20.1,106.431,0.0,2266.098,129.043820,0.0,-2252.9,141.618915,3,0,3
225085,2021-03-01 03:15:00,1942.486,191.881,20.1,106.431,0.0,2266.098,94.907548,0.0,-2252.9,129.043820,3,0,3
225086,2021-03-01 03:20:00,1942.486,191.881,20.1,106.431,0.0,2266.098,102.589040,0.0,-2252.9,94.907548,3,0,3


False

In [None]:
class LSTM(nn.Module):
    """Two stacked ``LSTMCell`` layers followed by a linear read-out.

    The network consumes a batch of univariate sequences one time step at a
    time and emits one scalar prediction per step. It can optionally keep
    rolling forward by feeding its own predictions back in as inputs.

    Args:
        hidden_layers: Size of the hidden/cell state of each LSTM cell
            (despite the name, this is a width, not a layer count).
    """

    def __init__(self, hidden_layers=64):
        super().__init__()
        self.hidden_layers = hidden_layers
        # lstm1, lstm2, linear are all layers in the network
        self.lstm1 = nn.LSTMCell(1, self.hidden_layers)
        self.lstm2 = nn.LSTMCell(self.hidden_layers, self.hidden_layers)
        self.linear = nn.Linear(self.hidden_layers, 1)

    def forward(self, y, future_preds=0):
        """Run the network over ``y`` and optionally extrapolate.

        Args:
            y: Tensor of shape ``(n_samples, n_steps)``; each column is one
                time step of every sequence in the batch.
            future_preds: Number of extra steps to generate after the end of
                ``y``, using the previous prediction as the next input.

        Returns:
            Tensor of shape ``(n_samples, n_steps + future_preds)`` with one
            prediction per processed step.
        """
        outputs, n_samples = [], y.size(0)
        # Create the initial states with the input's dtype and device so the
        # model also works after `.to(device)` or `.double()`.
        state_kwargs = {"dtype": y.dtype, "device": y.device}
        h_t = torch.zeros(n_samples, self.hidden_layers, **state_kwargs)
        c_t = torch.zeros(n_samples, self.hidden_layers, **state_kwargs)
        h_t2 = torch.zeros(n_samples, self.hidden_layers, **state_kwargs)
        c_t2 = torch.zeros(n_samples, self.hidden_layers, **state_kwargs)

        for time_step in y.split(1, dim=1):
            # time_step has shape (N, 1): one value per sequence in the batch
            h_t, c_t = self.lstm1(time_step, (h_t, c_t))  # first LSTM layer
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))  # second LSTM layer
            output = self.linear(h_t2)  # output from the last FC layer
            outputs.append(output)

        for _ in range(future_preds):
            # this only generates future predictions if we pass in future_preds>0
            # mirrors the code above, using last output/prediction as input
            h_t, c_t = self.lstm1(output, (h_t, c_t))
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
            output = self.linear(h_t2)
            outputs.append(output)

        # transform list of (N, 1) tensors into a single (N, steps) tensor
        outputs = torch.cat(outputs, dim=1)
        return outputs