In [34]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import pandas as pd
import numpy as np
import plotly.graph_objects as go

from sklearn.preprocessing import StandardScaler
import pypfopt
#
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [35]:
# Monthly return series; individual series are selected by column name (e.g. 'NVDA'),
# and the CSV's 'Date' column becomes the row index
df_complete = pd.read_csv(
    '../../data/df_monthly_returns_complete.csv',
    index_col='Date',
)

# Company overview table (name, ticker symbol, score, ...); the first CSV column is the index
df_overview = pd.read_csv(
    '../../data/df_overview.csv',
    index_col=0,
)

In [36]:
# Rank companies by score, best first; rows without a score end up at the bottom
overview_cols = ['company_name', 'stock_ticker_symbol', 'score']
df_overview[overview_cols].sort_values(by='score', ascending=False)

Unnamed: 0,company_name,stock_ticker_symbol,score
734,Sony Group Corp.,6758.T,0.929032
456,KEYENCE Corp.,6861.T,0.882153
486,"FAST RETAILING CO., LTD.",9983.T,0.788997
1627,"Daiichi Sankyo Co., Ltd.",4568.T,0.762761
1631,KDDI Corp.,9433.T,0.707565
...,...,...,...
1037,Triton International Ltd.,TRTN.PRE,
1135,National Rural Utilities Cooperative Finance Corp.,NRUC,
1200,Kontron AG,KTN.DE,
1236,BT Group Plc,BT.A.L,


In [90]:
# NVDA column as a float32 column vector of shape (n_obs, 1), rows with NaN removed
nvda_returns = df_complete[["NVDA"]].dropna()
timeseries = nvda_returns.values.astype('float32')

# Chronological train-test split (no shuffling): the first 67% of the observations
# are used for training, the remaining ones are held out for testing
n_obs = len(timeseries)
train_size = int(n_obs * 0.67)
test_size = n_obs - train_size
train = timeseries[:train_size]
test = timeseries[train_size:]

def create_dataset(dataset, lookback):
    """Transform a time series into a prediction dataset

    Every sample pairs a window of `lookback` consecutive time steps with the
    same window shifted one step into the future, so the target for each
    position of the window is the observation that follows it.

    Args:
        dataset: A numpy array of time series, first dimension is the time steps
        lookback: Size of window for prediction

    Returns:
        Tuple ``(X, y)`` of tensors, both of shape
        ``(max(len(dataset) - lookback, 0), lookback, *dataset.shape[1:])`` and
        with the dtype of `dataset`. A series that is too short for a single
        window yields correctly shaped empty tensors.

    Raises:
        ValueError: If `lookback` is smaller than 1.
    """
    if lookback < 1:
        raise ValueError("lookback must be a positive integer")

    dataset = np.asarray(dataset)
    n_windows = max(len(dataset) - lookback, 0)

    # Fill pre-allocated arrays instead of calling torch.tensor() on a Python
    # list of ndarrays, which is a slow conversion path that PyTorch warns about.
    window_shape = (n_windows, lookback) + dataset.shape[1:]
    X = np.empty(window_shape, dtype=dataset.dtype)
    y = np.empty(window_shape, dtype=dataset.dtype)
    for i in range(n_windows):
        X[i] = dataset[i:i + lookback]
        y[i] = dataset[i + 1:i + lookback + 1]
    return torch.from_numpy(X), torch.from_numpy(y)

# Window length: every sample spans 4 consecutive observations
lookback = 4

# Windows are built separately per split, so no window straddles the train/test boundary
X_train, y_train = create_dataset(train, lookback)
X_test, y_test = create_dataset(test, lookback)

class LSTMBasicModel(nn.Module):
    """LSTM followed by a linear read-out that is applied at every time step.

    Args:
        input_size: Number of features per time step (default 1).
        hidden_size: Size of the LSTM hidden state, which is also the input
            width of the linear layer (default 50).
        num_layers: Number of stacked LSTM layers (default 1).
    """
    def __init__(self, input_size=1, hidden_size=50, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map a batch of sequences to one scalar output per time step.

        Args:
            x: Tensor of shape (batch, seq_len, input_size); the batch
                dimension comes first because the LSTM uses batch_first=True.

        Returns:
            Tensor of shape (batch, seq_len, 1).
        """
        # Keep the per-step hidden states; the final (h_n, c_n) pair is not needed
        x, _ = self.lstm(x)
        x = self.linear(x)
        return x

# Seed the global RNG so that weight initialisation and batch shuffling are
# repeatable from one run of this cell to the next
torch.manual_seed(0)

model = LSTMBasicModel()
optimizer = optim.Adam(model.parameters())  # Adam with its default hyper-parameters
loss_fn = nn.MSELoss()
loader = data.DataLoader(data.TensorDataset(X_train, y_train), shuffle=True, batch_size=8)

n_epochs = 2000
for epoch in range(n_epochs):
    model.train()
    for X_batch, y_batch in loader:
        y_pred = model(X_batch)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Validation: only every 100th epoch, to keep the log short
    if epoch % 100 != 0:
        continue
    model.eval()
    with torch.no_grad():
        # RMSE over all windows and all time steps of each window. torch.sqrt(...).item()
        # yields a plain Python float instead of pushing a torch tensor through a NumPy ufunc.
        y_pred = model(X_train)
        train_rmse = torch.sqrt(loss_fn(y_pred, y_train)).item()
        y_pred = model(X_test)
        test_rmse = torch.sqrt(loss_fn(y_pred, y_test)).item()
    print("Epoch %d: train RMSE %.4f, test RMSE %.4f" % (epoch, train_rmse, test_rmse))

# Build NaN-padded arrays, aligned with `timeseries`, that hold the one-step-ahead
# predictions. Only the last time step of every window is kept: it is the forecast
# for the observation that directly follows the window.
model.eval()  # the training loop may leave the model in train mode
with torch.no_grad():
    # shift train predictions for plotting: window i forecasts index i + lookback
    y_pred = model(X_train)[:, -1, :]
    train_plot = np.full_like(timeseries, np.nan)
    train_plot[lookback:train_size] = y_pred.numpy()
    # shift test predictions for plotting: the test windows start at train_size
    test_plot = np.full_like(timeseries, np.nan)
    test_plot[train_size+lookback:len(timeseries)] = model(X_test)[:, -1, :].numpy()

Epoch 0: train RMSE 0.8477, test RMSE 0.8629
Epoch 100: train RMSE 0.1846, test RMSE 0.1468
Epoch 200: train RMSE 0.1858, test RMSE 0.1431
Epoch 300: train RMSE 0.1854, test RMSE 0.1431
Epoch 400: train RMSE 0.1835, test RMSE 0.1467
Epoch 500: train RMSE 0.1834, test RMSE 0.1446
Epoch 600: train RMSE 0.1828, test RMSE 0.1457
Epoch 700: train RMSE 0.1817, test RMSE 0.1455
Epoch 800: train RMSE 0.1797, test RMSE 0.1458
Epoch 900: train RMSE 0.1778, test RMSE 0.1486
Epoch 1000: train RMSE 0.1768, test RMSE 0.1472
Epoch 1100: train RMSE 0.1760, test RMSE 0.1503
Epoch 1200: train RMSE 0.1754, test RMSE 0.1516
Epoch 1300: train RMSE 0.1746, test RMSE 0.1501
Epoch 1400: train RMSE 0.1738, test RMSE 0.1503
Epoch 1500: train RMSE 0.1729, test RMSE 0.1510
Epoch 1600: train RMSE 0.1722, test RMSE 0.1506
Epoch 1700: train RMSE 0.1707, test RMSE 0.1545
Epoch 1800: train RMSE 0.1692, test RMSE 0.1562
Epoch 1900: train RMSE 0.1682, test RMSE 0.1606


In [91]:
# Overlay the actual series with the train and test predictions in one figure
fig = go.Figure()

# (values, legend label, line colour) for each trace, drawn in this order
traces = [
    (timeseries, 'Timeseries', '#5c839f'),
    (train_plot, 'Train Plot', 'red'),
    (test_plot, 'Test Plot', 'green'),
]
for series, label, colour in traces:
    # Each array is a column vector; flatten it to 1-D for plotly
    fig.add_trace(
        go.Scatter(
            y=series.ravel(),
            mode='lines',
            name=label,
            line=dict(color=colour, width=2),
        )
    )

# Title, axis labels and legend heading
fig.update_layout(
    title='Timeseries Plot with Train and Test',
    xaxis_title='Date',
    yaxis_title='Values',
    legend=dict(title="Legend"),
)

# Render the figure
fig.show()