In [25]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

In [26]:
def null_analysis(df):
    col_names = df[df.isnull().any(axis=1)]['Name'].unique()
    for name in col_names:
        comp_data = df[df['Name'] == name]
        print(f'\n---------{name}---------')
        print(comp_data.isnull().sum() *100 / len(comp_data))

def modified_null_analysis(df, col, threshold):
    col_names = df[df.isnull().any(axis=1)]['Name'].unique()
    comp_list = []
    for name in col_names:
        comp_data = df[df['Name'] == name]
        if comp_data.isnull().sum()[col] *100 / len(comp_data) > threshold:
            comp_list.append(name)
    return comp_list

In [27]:
working_dir = './'
df = pd.read_csv(working_dir+'preprocessed_CAC40.csv', parse_dates=['Date'])
df.drop('Unnamed: 0', axis=1, inplace=True)
drop_list = modified_null_analysis(df, 'Volume', 50)
df = df.query(f'Name != {drop_list}')

In [28]:
df2 = df.copy() # Saving our work till here
df2['Volume'] = df2['Volume'].str.replace(',', '')
df2['Volume'] = df2['Volume'].astype(float)
trial = IterativeImputer(random_state=999, estimator=BayesianRidge())

In [29]:
df3 = pd.DataFrame(trial.fit_transform(df2.iloc[:, 2:]))
df3.columns = df2.iloc[:, 2:].columns
df4 = df3.copy()



In [30]:
for i in range(1, 5 + 1):
    df4[f'Open(t-{i})'] = df4['Open'].shift(i)
    df4[f'Closing_Price(t-{i})'] = df4['Closing_Price'].shift(i)
    df4[f'Daily_High(t-{i})'] = df4['Daily_High'].shift(i)
    df4[f'Daily_Low(t-{i})'] = df4['Daily_Low'].shift(i)
    df4[f'Volume(t-{i})'] = df4['Volume'].shift(i)

df4.dropna(inplace=True)
df4.drop(['Open', 'Daily_High', 'Daily_Low', 'Volume'], axis=1, inplace=True)
df4 = df4.reset_index(drop=True)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(df4.drop('Closing_Price', axis=1).values, df4['Closing_Price'].values, test_size=0.2, random_state=999)
scaler = MinMaxScaler()

scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_train, X_test = X_train.reshape(-1, 5, 5), X_test.reshape(-1, 5, 5)
X_train, X_test, y_train, y_test = torch.tensor(X_train, dtype=torch.float), torch.tensor(X_test, dtype=torch.float), torch.tensor(y_train, dtype=torch.float), torch.tensor(y_test, dtype=torch.float)
train_set = TensorDataset(X_train, y_train)
test_set = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_set, batch_size=128, shuffle=True)
test_loader = DataLoader(test_set, batch_size=64, shuffle=False)

In [33]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [34]:
best_params = {'optimizer':'Adam','n_layerlstm': 2, 'hidden_state': 82, 'n_layersfc': 2, 'n_units_l0': 91, 'dropout_l0': 0.14192353580674466, 'n_units_l1': 57, 'dropout_l1': 0.12740415994127938, 'lr': 0.0071326214304045015}

In [35]:
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(5, best_params['hidden_state'], batch_first=True, num_layers=best_params['n_layerlstm'])
        self.fc = nn.Sequential(
            nn.Linear(best_params['hidden_state'], best_params['n_units_l0']),
            nn.ReLU(),
            nn.Dropout(best_params['dropout_l0']),
            nn.Linear(best_params['n_units_l0'], best_params['n_units_l1']),
            nn.ReLU(),
            nn.Dropout(best_params['dropout_l1']),
            nn.Linear(best_params['n_units_l1'], 1)
        )

    def forward(self, x):
        output, (hn, cn) = self.lstm(x)
        return self.fc(hn[-1])

model = MyModel()
model = model.to(device)

In [37]:
optimizer = getattr(torch.optim, best_params['optimizer'])(model.parameters(), best_params['lr'])
loss_fn = nn.MSELoss()
TRAIN_BATCH_SIZE = 128
TEST_BATCH_SIZE = 64
EPOCHS = 40

In [38]:
print('Epoch | Train Loss | Test Loss')
for epoch in range(EPOCHS):
  model.train()
  train_loss = 0
  for X, y in train_loader:
    X, y = X.to(device), y.to(device)
    out = model(X).squeeze(1)
    loss = loss_fn(out, y)

    train_loss += (loss/TRAIN_BATCH_SIZE).item()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


  model.eval()
  with torch.inference_mode():
    test_loss = 0
    for X, y in test_loader:
      X, y = X.to(device), y.to(device)
      out = model(X).squeeze(1)
      loss = loss_fn(out, y)
      test_loss += (loss/TEST_BATCH_SIZE).item()

  print(f'{epoch} {train_loss/len(train_loader)} {test_loss/len(test_loader)} ')

Epoch | Train Loss | Test Loss
0 56.07621495767695 3.654884304668083 
1 1.3144201401804314 0.26087923497572296 
2 1.1316780632392305 0.3519186398289243 
3 1.0421631915769616 0.47276472507930195 
4 1.1063320232463665 1.0943430645055459 
5 0.9888653786944561 0.11760301567369798 
6 1.0056542081788915 0.17126675623423251 
7 1.014162886582437 0.9576932580256071 
8 1.0302165090182767 0.6066819783910865 
9 0.9786084799981508 0.23727981092744185 
10 0.9904279021821061 0.3258759190679574 
11 0.9742587963271825 0.3337278529757359 
12 0.9832355831368048 0.35118687458214215 
13 0.9322684235015853 0.3184953963719919 
14 0.93751780749833 0.3193758849787419 
15 0.9480630930696354 0.181317804320181 
16 0.9447283491370131 0.3709682552601959 
17 0.9516753399225532 0.20284845438770582 
18 0.8948523618097677 0.30771615716521855 
19 0.8788467691753243 0.14707482997022692 
20 0.8609523177757615 0.7204474826938794 
21 0.922529155296869 0.5980521323250942 
22 0.8187487834423292 0.13857758258942698 
23 0.89300

In [39]:
torch.save(model.state_dict(), './lstmModel.pt')

In [40]:
import joblib
joblib.dump(scaler, 'scaler.gz')

['scaler.gz']