Imports

In [22]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.offline import iplot
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

# Compute device name; hard-coded to the CPU.
device = "cpu"

Load the data into a dataframe

In [23]:
# Read the CSV and keep only its first two columns, labelled Date / Value.
df = pd.read_csv("dataset.csv").iloc[:, :2]
df.columns = ["Date", "Value"]
# Index the frame by the Date column, parsed into datetimes.
df = df.set_index('Date')
df.index = pd.to_datetime(df.index)
# Sort only when the index is not already in increasing order.
if not df.index.is_monotonic_increasing:
    df = df.sort_index()

print(df)

              Value
Date               
2008-01-11  24.4796
2008-01-12  24.3671
2008-01-15  24.2913
2008-01-16  24.2858
2008-01-17  24.3367
...             ...
2023-11-28  88.7045
2023-11-29  88.6102
2023-11-30  88.8841
2023-12-01  88.5819
2023-12-02  89.7619

[3936 rows x 1 columns]


Generate lag features

In [24]:
def time_lags(df: pd.DataFrame, n_lags: int) -> pd.DataFrame:
    """Append lagged copies of the ``Value`` column as extra columns.

    For every ``n`` in ``1..n_lags`` a column ``lag{n}`` holding
    ``df["Value"].shift(n)`` is added. The first ``n_lags`` rows are then
    dropped, because their lag columns are not fully populated.

    Args:
        df (pd.DataFrame): frame containing a ``Value`` column; not modified
        n_lags (int): number of lag columns to create (must be >= 0)

    Returns:
        pd.DataFrame: new frame with the original columns followed by
            ``lag1 .. lag{n_lags}``, without the first ``n_lags`` rows

    Raises:
        ValueError: if ``n_lags`` is negative
    """
    if n_lags < 0:
        raise ValueError(f"n_lags must be non-negative, got {n_lags}")

    lag_columns = [
        df["Value"].shift(n).rename(f"lag{n}") for n in range(1, n_lags + 1)
    ]

    # Columns that already carry one of the lag names are replaced rather than
    # duplicated, so the resulting column labels stay unique.
    stale = [col.name for col in lag_columns if col.name in df.columns]
    base = df.drop(columns=stale)

    # One concat instead of inserting column by column: inserting ~100 columns
    # one at a time fragments the frame and triggers pandas' PerformanceWarning.
    df_n = pd.concat([base, *lag_columns], axis=1)
    return df_n.iloc[n_lags:]

# Number of lag columns generated for every row.
input_dim = 100

# Build the lagged frame (Value plus lag1..lag{input_dim}) and show it.
df_timelags = time_lags(df, input_dim)
print(df_timelags)

              Value     lag1     lag2     lag3     lag4     lag5     lag6  \
Date                                                                        
2008-06-05  23.8019  23.6968  23.7473  23.7384  23.6659  23.5847  23.5513   
2008-06-06  23.8116  23.8019  23.6968  23.7473  23.7384  23.6659  23.5847   
2008-06-07  23.6809  23.8116  23.8019  23.6968  23.7473  23.7384  23.6659   
2008-06-08  23.5651  23.6809  23.8116  23.8019  23.6968  23.7473  23.7384   
2008-06-10  23.5210  23.5651  23.6809  23.8116  23.8019  23.6968  23.7473   
...             ...      ...      ...      ...      ...      ...      ...   
2023-11-28  88.7045  88.8133  88.1206  88.1648  87.8701  88.4954  89.1237   
2023-11-29  88.6102  88.7045  88.8133  88.1206  88.1648  87.8701  88.4954   
2023-11-30  88.8841  88.6102  88.7045  88.8133  88.1206  88.1648  87.8701   
2023-12-01  88.5819  88.8841  88.6102  88.7045  88.8133  88.1206  88.1648   
2023-12-02  89.7619  88.5819  88.8841  88.6102  88.7045  88.8133  88.1206   


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



Split the data into train, validation and test sets

In [25]:
def feature_label_split(df: pd.DataFrame, target_col: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Separate a frame into its feature columns and its target column.

    Args:
        df (pd.DataFrame): frame holding both features and target; not modified
        target_col (str): name of the target column

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: ``(X, y)`` where ``X`` is ``df``
            without ``target_col`` and ``y`` is a single-column frame
            containing only ``target_col``

    Raises:
        KeyError: if ``target_col`` is not a column of ``df``
    """
    # Double brackets keep the target two-dimensional (a DataFrame, not a Series).
    y = df[[target_col]]
    X = df.drop(columns=[target_col])
    return X, y

def train_val_test_split(
    df: pd.DataFrame, target_col: str, test_ratio: float
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split a frame, in row order, into train, validation and test parts.

    The test part takes ``test_ratio`` of all rows. The validation part takes
    ``test_ratio / (1 - test_ratio)`` of the remaining rows, i.e. roughly the
    same number of rows as the test part. Rows are never shuffled.

    Args:
        df (pd.DataFrame): frame holding features and target
        target_col (str): name of the target column
        test_ratio (float): fraction of all rows used for the test part (and,
            approximately, for the validation part); must lie in ``(0, 0.5)``

    Returns:
        tuple: ``(X_train, X_val, X_test, y_train, y_val, y_test)``, six frames

    Raises:
        ValueError: if ``test_ratio`` is not strictly between 0 and 0.5
    """
    # Outside (0, 0.5) the derived validation fraction is not a valid
    # proportion (>= 1, negative, or a division by zero at 1.0), so fail early
    # with a clear message instead of deep inside the second split.
    if not 0 < test_ratio < 0.5:
        raise ValueError(
            f"test_ratio must be strictly between 0 and 0.5, got {test_ratio}"
        )

    # Rescale so the validation part is a fraction of the rows left after the
    # test part has been removed.
    val_ratio = test_ratio / (1 - test_ratio)
    X, y = feature_label_split(df, target_col)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, shuffle=False)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_ratio, shuffle=False)
    return X_train, X_val, X_test, y_train, y_val, y_test

# Hold out 10% of the rows for testing; train_val_test_split derives the
# validation fraction from the same ratio.
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(df_timelags, 'Value', 0.1)
# Row counts, printed in the order train, test, validation.
print(len(X_train))
print(len(X_test))
print(len(X_val))
print(X_train)

3068
384
384
               lag1     lag2     lag3     lag4     lag5     lag6     lag7  \
Date                                                                        
2008-06-05  23.6968  23.7473  23.7384  23.6659  23.5847  23.5513  23.5483   
2008-06-06  23.8019  23.6968  23.7473  23.7384  23.6659  23.5847  23.5513   
2008-06-07  23.8116  23.8019  23.6968  23.7473  23.7384  23.6659  23.5847   
2008-06-08  23.6809  23.8116  23.8019  23.6968  23.7473  23.7384  23.6659   
2008-06-10  23.5651  23.6809  23.8116  23.8019  23.6968  23.7473  23.7384   
...             ...      ...      ...      ...      ...      ...      ...   
2020-10-21  77.9241  77.9644  77.9461  77.2759  77.2855  77.0239  77.0284   
2020-10-22  77.7780  77.9241  77.9644  77.9461  77.2759  77.2855  77.0239   
2020-10-23  77.0322  77.7780  77.9241  77.9644  77.9461  77.2759  77.2855   
2020-10-24  77.0809  77.0322  77.7780  77.9241  77.9644  77.9461  77.2759   
2020-10-27  76.4667  77.0809  77.0322  77.7780  77.9241  77.964

Load and scale data

In [26]:
def get_scaler(scaler: str) -> 'sklearn.base.TransformerMixin':
    """Create a new, unfitted scaler selected by name.

    Args:
        scaler (str): one of ``"minmax"``, ``"standard"``, ``"maxabs"`` or
            ``"robust"`` (case-insensitive)

    Returns:
        sklearn.base.TransformerMixin: a fresh instance of the matching
            scaler class (MinMaxScaler, StandardScaler, MaxAbsScaler or
            RobustScaler)

    Raises:
        ValueError: if ``scaler`` does not name a known scaler
    """
    scalers = {
        "minmax": MinMaxScaler,
        "standard": StandardScaler,
        "maxabs": MaxAbsScaler,
        "robust": RobustScaler,
    }
    key = scaler.lower()
    # An unknown key used to surface as "'NoneType' object is not callable";
    # report the actual problem and the accepted names instead.
    if key not in scalers:
        raise ValueError(
            f"Unknown scaler {scaler!r}; expected one of {sorted(scalers)}"
        )
    return scalers[key]()

# Features and target each get their own scaler instance, so fitting the
# target scaler cannot overwrite the statistics learned from the features.
# Both are fitted on the training split only and then applied to val/test.
feature_scaler = get_scaler('minmax')
X_train_arr = feature_scaler.fit_transform(X_train)
X_val_arr = feature_scaler.transform(X_val)
X_test_arr = feature_scaler.transform(X_test)

# `scaler` ends this cell fitted on the target column, exactly as before.
# NOTE(review): later cells presumably use `scaler` to inverse-transform
# predictions, which is why the name is kept for the target scaler - confirm.
scaler = get_scaler('minmax')
y_train_arr = scaler.fit_transform(y_train)
y_val_arr = scaler.transform(y_val)
y_test_arr = scaler.transform(y_test)

batch_size = 64

# torch.Tensor(...) converts the scaled arrays to float32 tensors.
train_features = torch.Tensor(X_train_arr)
train_targets = torch.Tensor(y_train_arr)
val_features = torch.Tensor(X_val_arr)
val_targets = torch.Tensor(y_val_arr)
test_features = torch.Tensor(X_test_arr)
test_targets = torch.Tensor(y_test_arr)

train = TensorDataset(train_features, train_targets)
val = TensorDataset(val_features, val_targets)
test = TensorDataset(test_features, test_targets)

# shuffle=False keeps the rows in their original (chronological) order;
# drop_last=True discards a final batch smaller than batch_size.
train_loader = DataLoader(train, batch_size=batch_size, shuffle=False, drop_last=True)
val_loader = DataLoader(val, batch_size=batch_size, shuffle=False, drop_last=True)
test_loader = DataLoader(test, batch_size=batch_size, shuffle=False, drop_last=True)
# Same test data, served one sample at a time.
test_loader_one = DataLoader(test, batch_size=1, shuffle=False, drop_last=True)

RNN model

In [27]:
class RNNModel(nn.Module):
    """Stacked ``nn.RNN`` followed by a linear layer on the last time step."""

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, dropout_prob):
        """Build the recurrent layers and the output layer.

        Args:
            input_dim (int): The number of features per time step (input layer)
            hidden_dim (int): The number of nodes in each recurrent layer (hidden layer)
            layer_dim (int): The number of stacked recurrent layers
            output_dim (int): The number of nodes in the output layer
            dropout_prob (float): The probability of dropping a node between
                stacked recurrent layers

        """
        super().__init__()

        # Kept as attributes because forward() needs them to size the initial
        # hidden state.
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim

        # RNN layers. nn.RNN applies dropout only between stacked layers, so
        # with a single layer it has no effect and PyTorch emits a warning for
        # a non-zero value; pass 0.0 in that case.
        self.rnn = nn.RNN(
            input_dim,
            hidden_dim,
            layer_dim,
            batch_first=True,
            dropout=dropout_prob if layer_dim > 1 else 0.0,
        )
        # Fully connected layer mapping the hidden state to the output
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Run the sequence through the RNN and project the last time step.

        Args:
            x (torch.Tensor): The input tensor of the shape (batch size, sequence length, input_dim)

        Returns:
            torch.Tensor: The output tensor of the shape (batch size, output_dim)

        """
        # Zero initial hidden state, created on the same device and with the
        # same dtype as the input so the model also works off-CPU or in
        # float64. A freshly created tensor is not part of any graph, so no
        # requires_grad_/detach round-trip is needed.
        h0 = torch.zeros(
            self.layer_dim, x.size(0), self.hidden_dim, device=x.device, dtype=x.dtype
        )

        # out has shape (batch size, sequence length, hidden_dim)
        out, _ = self.rnn(x, h0)

        # Keep only the hidden output of the last time step: (batch size, hidden_dim)
        out = out[:, -1, :]

        # Project to the desired output shape (batch size, output_dim)
        return self.fc(out)