# Setup and Explore Data

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.3-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.2/233.2 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

import optuna
from optuna.visualization import plot_optimization_history, plot_parallel_coordinate, plot_slice, plot_param_importances

In [None]:
working_dir = './'
df = pd.read_csv(working_dir+'preprocessed_CAC40.csv', parse_dates=['Date'])
df.head()

Unnamed: 0.1,Unnamed: 0,Name,Date,Open,Closing_Price,Daily_High,Daily_Low,Volume
0,0,Accor,2020-04-03,22.99,23.4,23.4,22.99,67
1,1,Accor,2020-04-02,23.91,22.99,23.91,22.99,250
2,2,Accor,2020-04-01,24.1,23.83,24.1,23.83,37
3,3,Accor,2020-03-31,25.04,25.0,25.24,24.99,336
4,4,Accor,2020-03-30,26.5,25.02,26.5,24.99,415


In [None]:
df.drop('Unnamed: 0', axis=1, inplace=True)
df.head()

Unnamed: 0,Name,Date,Open,Closing_Price,Daily_High,Daily_Low,Volume
0,Accor,2020-04-03,22.99,23.4,23.4,22.99,67
1,Accor,2020-04-02,23.91,22.99,23.91,22.99,250
2,Accor,2020-04-01,24.1,23.83,24.1,23.83,37
3,Accor,2020-03-31,25.04,25.0,25.24,24.99,336
4,Accor,2020-03-30,26.5,25.02,26.5,24.99,415


# EDA

In [None]:
print(f'There are total {df.shape[0]} rows and {df.shape[1]} features\n')
print(f'There are total {len(df["Name"].unique())} Companies Data\n')
print('------NULL Values-------')
print(df.isnull().sum())

There are total 97648 rows and 7 features

There are total 38 Companies Data

------NULL Values-------
Name                 0
Date                 0
Open               230
Closing_Price        2
Daily_High         204
Daily_Low          204
Volume           20453
dtype: int64


In [None]:
def null_analysis(df):
    col_names = df[df.isnull().any(axis=1)]['Name'].unique()
    for name in col_names:
        comp_data = df[df['Name'] == name]
        print(f'\n---------{name}---------')
        print(comp_data.isnull().sum() *100 / len(comp_data))


In [None]:
null_analysis(df)


---------Accor---------
Name              0.000000
Date              0.000000
Open              0.076953
Closing_Price     0.000000
Daily_High        0.038476
Daily_Low         0.038476
Volume           53.943825
dtype: float64

---------Air Liquide---------
Name             0.000000
Date             0.000000
Open             0.038462
Closing_Price    0.000000
Daily_High       0.000000
Daily_Low        0.000000
Volume           5.153846
dtype: float64

---------Airbus ---------
Name             0.000000
Date             0.000000
Open             0.038462
Closing_Price    0.000000
Daily_High       0.000000
Daily_Low        0.000000
Volume           0.000000
dtype: float64

---------ArcelorMittal---------
Name             0.000000
Date             0.000000
Open             0.000000
Closing_Price    0.000000
Daily_High       0.000000
Daily_Low        0.000000
Volume           0.422914
dtype: float64

---------Atos---------
Name              0.000000
Date              0.000000
Open       

In [None]:
# There are companies which has many nan values of Volumne, modifying above null_analysis
def modified_null_analysis(df, col, threshold):
    col_names = df[df.isnull().any(axis=1)]['Name'].unique()
    comp_list = []
    for name in col_names:
        comp_data = df[df['Name'] == name]
        if comp_data.isnull().sum()[col] *100 / len(comp_data) > threshold:
            comp_list.append(name)
    return comp_list

In [None]:
#  Dropped companies having values more then 50% of as nan
drop_list = modified_null_analysis(df, 'Volume', 50)
df = df.query(f'Name != {drop_list}')

In [None]:
df2 = df.copy() # Saving our work till here
df2['Volume'] = df2['Volume'].str.replace(',', '')
df2['Volume'] = df2['Volume'].astype(float)
df2.shape

(77994, 7)

In [None]:
trial = IterativeImputer(random_state=999, estimator=BayesianRidge())

In [None]:
df3 = pd.DataFrame(trial.fit_transform(df2.iloc[:, 2:]))
df3.columns = df2.iloc[:, 2:].columns
df3.head()



Unnamed: 0,Open,Closing_Price,Daily_High,Daily_Low,Volume
0,116.0,112.95,116.0,110.0,331.0
1,115.95,114.05,115.95,112.3,480.0
2,113.9,114.6,115.35,112.95,784.0
3,110.25,112.15,112.7,106.9,485.0
4,110.2,107.0,110.2,105.8,525.0


In [None]:
df4 = df3.copy()

In [None]:
for i in range(1, 10 + 1):
    df4[f'Open(t-{i})'] = df4['Open'].shift(i)
    df4[f'Closing_Price(t-{i})'] = df4['Closing_Price'].shift(i)
    df4[f'Daily_High(t-{i})'] = df4['Daily_High'].shift(i)
    df4[f'Daily_Low(t-{i})'] = df4['Daily_Low'].shift(i)
    df4[f'Volume(t-{i})'] = df4['Volume'].shift(i)

df4.dropna(inplace=True)
df4.drop(['Open', 'Daily_High', 'Daily_Low', 'Volume'], axis=1, inplace=True)
df4 = df4.reset_index(drop=True)

In [None]:
df4.head()

Unnamed: 0,Closing_Price,Open(t-1),Closing_Price(t-1),Daily_High(t-1),Daily_Low(t-1),Volume(t-1),Open(t-2),Closing_Price(t-2),Daily_High(t-2),Daily_Low(t-2),...,Open(t-9),Closing_Price(t-9),Daily_High(t-9),Daily_Low(t-9),Volume(t-9),Open(t-10),Closing_Price(t-10),Daily_High(t-10),Daily_Low(t-10),Volume(t-10)
0,104.55,108.4,103.0,110.75,101.85,699.0,103.0,106.95,106.95,100.95,...,115.95,114.05,115.95,112.3,480.0,116.0,112.95,116.0,110.0,331.0
1,98.5,100.0,104.55,104.55,99.66,266.0,108.4,103.0,110.75,101.85,...,113.9,114.6,115.35,112.95,784.0,115.95,114.05,115.95,112.3,480.0
2,105.8,102.85,98.5,103.05,98.0,432.0,100.0,104.55,104.55,99.66,...,110.25,112.15,112.7,106.9,485.0,113.9,114.6,115.35,112.95,784.0
3,98.86,100.0,105.8,106.3,99.68,944.0,102.85,98.5,103.05,98.0,...,110.2,107.0,110.2,105.8,525.0,110.25,112.15,112.7,106.9,485.0
4,105.0,105.0,98.86,105.0,95.0,1666.0,100.0,105.8,106.3,99.68,...,108.55,111.7,111.7,106.5,653.0,110.2,107.0,110.2,105.8,525.0


# Tuning Hyperparameter

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df4.drop('Closing_Price', axis=1).values, df4['Closing_Price'].values, test_size=0.2, random_state=999)

In [None]:
scaler = MinMaxScaler()

scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
X_train, X_test = X_train.reshape(-1, 10, 5), X_test.reshape(-1, 10, 5)
X_train, X_test, y_train, y_test = torch.tensor(X_train, dtype=torch.float), torch.tensor(X_test, dtype=torch.float), torch.tensor(y_train, dtype=torch.float), torch.tensor(y_test, dtype=torch.float)

In [None]:
train_set = TensorDataset(X_train, y_train)
test_set = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_set, batch_size=128, shuffle=True)
test_loader = DataLoader(test_set, batch_size=64, shuffle=False)

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
EPOCHS = 20

## Optuna 1

In [None]:
class LSTMModel(nn.Module):
    def __init__(self, trial):
        super(LSTMModel, self).__init__()

        input_seq = 5  # Input sequence length (can be adjusted based on your data)

        # LSTM layer parameters from Optuna
        n_layerlstm = trial.suggest_int("n_layerlstm", 1, 3)
        hidden_state = trial.suggest_int("hidden_state", 50, 150)

        # LSTM layer
        self.lstm = nn.LSTM(input_seq, hidden_state, num_layers=n_layerlstm, batch_first=True)

        # Fully connected layer parameters
        n_layersfc = trial.suggest_int("n_layersfc", 2, 4)

        layers = []
        input_feature = hidden_state  # Starting size is the hidden state output from LSTM

        for i in range(n_layersfc):
            out_features = trial.suggest_int(f"n_units_l{i}", 32, 256)
            layers.append(nn.Linear(input_feature, out_features))
            layers.append(nn.ReLU())
            p = trial.suggest_float(f"dropout_l{i}", 0.1, 0.3)
            layers.append(nn.Dropout(p))
            input_feature = out_features

        # Final output layer (for regression)
        layers.append(nn.Linear(input_feature, 1))

        # Define fully connected layers
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        output, (hn, cn) = self.lstm(x)
        return self.fc(hn[-1])


In [None]:
def objective(trial):
    # Generate the model.
    model = LSTMModel(trial).to(device)

    lr = trial.suggest_float("lr", 1e-3, 1e-1, log=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()
    for _ in range(10):
        model.train()
        for X, y in train_loader:
            X, y = X.to(device), y.to(device)
            out = model(X).squeeze(1)
            loss = loss_fn(out, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


        model.eval()
        with torch.inference_mode():
            test_loss = 0
            for X, y in test_loader:
                X, y = X.to(device), y.to(device)
                out = model(X).squeeze(1)
                loss = loss_fn(out, y)
                test_loss += (loss/64).item()
    return test_loss/len(test_loader)

In [None]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

[I 2024-10-09 07:40:33,336] A new study created in memory with name: no-name-a227d81d-18a7-4311-a72b-14f01808e5f2
[I 2024-10-09 07:41:12,787] Trial 0 finished with value: 117.69762060290478 and parameters: {'n_layerlstm': 3, 'hidden_state': 132, 'n_layersfc': 2, 'n_units_l0': 100, 'dropout_l0': 0.2696253420796084, 'n_units_l1': 185, 'dropout_l1': 0.24636810679594015, 'lr': 0.012607959312966112}. Best is trial 0 with value: 117.69762060290478.
[I 2024-10-09 07:41:33,406] Trial 1 finished with value: 3.2047477983793273 and parameters: {'n_layerlstm': 1, 'hidden_state': 50, 'n_layersfc': 3, 'n_units_l0': 53, 'dropout_l0': 0.19166833466199001, 'n_units_l1': 117, 'dropout_l1': 0.10591755876175662, 'n_units_l2': 119, 'dropout_l2': 0.24519467508099382, 'lr': 0.0024537498774849856}. Best is trial 1 with value: 3.2047477983793273.
[I 2024-10-09 07:41:58,537] Trial 2 finished with value: 0.6196888112997423 and parameters: {'n_layerlstm': 3, 'hidden_state': 114, 'n_layersfc': 2, 'n_units_l0': 119

In [None]:
print(f'Best Trial Acc: {study.best_trial.values[0]}')
print(f'Best Hyperparameter: {study.best_trial.params}')

Best Trial Acc: 0.5228803125286444
Best Hyperparameter: {'n_layerlstm': 2, 'hidden_state': 82, 'n_layersfc': 2, 'n_units_l0': 91, 'dropout_l0': 0.14192353580674466, 'n_units_l1': 57, 'dropout_l1': 0.12740415994127938, 'lr': 0.0071326214304045015}


In [None]:
plot_optimization_history(study).show()

In [None]:
plot_parallel_coordinate(study).show()

In [None]:
# There are too many combination possible we should increase trials
plot_slice(study).show()

In [None]:
plot_param_importances(study).show()

## Optuna 2

In [None]:
# Changes from above graphs
# 1. Fix number of n_layerfc, dropout layers and nunits
# 2. Try more optimizer for next study

In [None]:
best = study.best_trial.params
class LSTMModel2(nn.Module):
    def __init__(self, trial):
        super(LSTMModel2, self).__init__()

        input_seq = 5  # Input sequence length (can be adjusted based on your data)

        # LSTM layer parameters from Optuna
        n_layerlstm = trial.suggest_int("n_layerlstm", 1, 3)
        hidden_state = trial.suggest_int("hidden_state", 50, 150)

        # LSTM layer
        self.lstm = nn.LSTM(input_seq, hidden_state, num_layers=n_layerlstm, batch_first=True)

        # Fully connected layer parameters
        n_layersfc = 2 # trial.suggest_int("n_layersfc", 2, 4)

        layers = []
        input_feature = hidden_state  # Starting size is the hidden state output from LSTM

        for i in range(n_layersfc):
            out_features = best[f'n_units_l{i}'] #trial.suggest_int(f"n_units_l{i}", 32, 256)
            layers.append(nn.Linear(input_feature, out_features))
            layers.append(nn.ReLU())
            if i==0:
              p = best[f'dropout_l{i}']#trial.suggest_float(f"dropout_l{i}", 0.1, 0.3)
              layers.append(nn.Dropout(p))
            input_feature = out_features

        # Final output layer (for regression)
        layers.append(nn.Linear(input_feature, 1))

        # Define fully connected layers
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        output, (hn, cn) = self.lstm(x)
        return self.fc(hn[-1])

In [None]:
def objective2(trial):
    # Generate the model.
    model = LSTMModel2(trial).to(device)

    optimizer_name = trial.suggest_categorical("optimizer", ["Adadelta", "RMSprop", "Adam"])

    lr = trial.suggest_float("lr", 1e-3, 1e-1, log=True)
    optimizer = getattr(torch.optim, optimizer_name)(model.parameters(), lr)
    loss_fn = nn.MSELoss()
    for _ in range(EPOCHS):
        model.train()
        for X, y in train_loader:
            X, y = X.to(device), y.to(device)
            out = model(X).squeeze(1)
            loss = loss_fn(out, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


        model.eval()
        with torch.inference_mode():
            test_loss = 0
            for X, y in test_loader:
                X, y = X.to(device), y.to(device)
                out = model(X).squeeze(1)
                loss = loss_fn(out, y)
                test_loss += (loss/64).item()
    return test_loss/len(test_loader)

In [None]:
study2 = optuna.create_study(direction="minimize")
study2.optimize(objective2, n_trials=50)

[I 2024-10-09 08:17:57,877] A new study created in memory with name: no-name-b7043209-7a23-4cb4-a65d-933f06032f46
[I 2024-10-09 08:18:39,460] Trial 0 finished with value: 0.9495221446039247 and parameters: {'n_layerlstm': 1, 'hidden_state': 119, 'optimizer': 'Adam', 'lr': 0.0035923321664155065}. Best is trial 0 with value: 0.9495221446039247.
[I 2024-10-09 08:19:32,518] Trial 1 finished with value: 118.35251987957564 and parameters: {'n_layerlstm': 3, 'hidden_state': 68, 'optimizer': 'Adam', 'lr': 0.013806659250702548}. Best is trial 0 with value: 0.9495221446039247.
[I 2024-10-09 08:20:18,253] Trial 2 finished with value: 3.0631155657475113 and parameters: {'n_layerlstm': 2, 'hidden_state': 98, 'optimizer': 'RMSprop', 'lr': 0.009965568586076966}. Best is trial 0 with value: 0.9495221446039247.
[I 2024-10-09 08:21:12,510] Trial 3 finished with value: 1.5742807273493438 and parameters: {'n_layerlstm': 3, 'hidden_state': 78, 'optimizer': 'Adadelta', 'lr': 0.0023007006514631193}. Best is 

In [None]:
print(f'Best Trial Acc: {study2.best_trial.values[0]}')
print(f'Best Hyperparameter: {study2.best_trial.params}')

Best Trial Acc: 0.6017204042615705
Best Hyperparameter: {'n_layerlstm': 2, 'hidden_state': 139, 'optimizer': 'Adam', 'lr': 0.003357862963698128}


In [None]:
plot_optimization_history(study2).show()

In [None]:
plot_parallel_coordinate(study2).show()

In [None]:
plot_slice(study2).show()

In [None]:
plot_param_importances(study2).show()

# Training Final Model

In [None]:
# Best Hyperparameter: {'hidden_state': 109, 'n_units_l0': 201, 'dropout_l0': 0.1315249549881753, 'n_units_l1': 222, 'optimizer': 'Adadelta', 'lr': 0.06658907185863887}
best_params = study2.best_trial.params
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(5, best_params['hidden_state'], batch_first=True, num_layers=3)
        self.fc = nn.Sequential(
            nn.Linear(best_params['hidden_state'], best_params['n_units_l0']),
            nn.ReLU(),
            nn.Dropout(best_params['dropout_l0']),
            nn.Linear(best_params['n_units_l0'], best_params['n_units_l1']),
            nn.ReLU(),
            nn.Linear(best_params['n_units_l1'], 1)
        )

    def forward(self, x):
        output, (hn, cn) = self.lstm(x)
        return self.fc(hn[-1])

model = MyModel()
model = model.to(device)

In [None]:
loss_fn = nn.MSELoss()
optimizer = getattr(torch.optim, best_params['optimizer'])(model.parameters(), lr=best_params['lr'])
EPOCHS = 50

print('Epoch | Train Loss | Test Loss')
for epoch in range(EPOCHS):
  model.train()
  train_loss = 0
  for X, y in train_loader:
    X, y = X.to(device), y.to(device)
    out = model(X).squeeze(1)
    loss = loss_fn(out, y)

    train_loss += (loss/128).item()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


  model.eval()
  with torch.inference_mode():
    test_loss = 0
    for X, y in test_loader:
      X, y = X.to(device), y.to(device)
      out = model(X).squeeze(1)
      loss = loss_fn(out, y)
      test_loss += (loss/64).item()

  print(f'{epoch} {train_loss/len(train_loader)} {test_loss/len(test_loader)} ')


Epoch | Train Loss | Test Loss
0 10.198093950840049 1.4278715759760043 
1 0.7022804665363959 1.9080641775346192 
2 0.5946076524550797 0.9590508222396745 
3 0.5623158547477644 1.7109805594702236 
4 0.5128935735642177 1.0240056881161987 
5 0.5113909575843909 1.1118057139950697 
6 0.47346495389633003 1.1628876163120396 
7 0.46355991109014777 0.9088400503254084 
8 0.43510481821312036 0.9831353734995498 
9 0.4323548810006898 1.5543293970041587 
10 0.4268712861860385 1.003584508097074 
11 0.4202838986654018 0.7510220113011902 
12 0.40500555121049775 0.8902543195996617 
13 0.38732638162728705 0.8127961987171506 
14 0.38332136905920067 0.7795039639029591 
15 0.36203239546691784 0.677042146129381 
16 0.37215120904911003 0.7430896969878527 
17 0.35042244927255345 1.4920203941034489 
18 0.34585901256650686 1.535438133983827 
19 0.33180557205112743 0.8557451079736967 
20 0.33061164923653497 0.6022832721899278 
21 0.3349269623333802 0.8447721178231181 
22 0.32403188762178675 0.9782419728084666 
23 