In [1]:
import torch
from torch import nn

In [2]:
class Transformer_LSTM(nn.Module):
    def __init__(self, dimention, factor_num, sequence, fully_connect_layer_neural, layer_num=2,transformer = False):
        super(Transformer_LSTM, self).__init__()
        self.factor_num = factor_num  # 108
        self.sequence = sequence  #
        self.dimention = dimention  #
        self.fc2_neuron = fully_connect_layer_neural  # 32
        self.transformer = transformer

        # Layer
        self.bn1 = torch.nn.BatchNorm1d(self.dimention * self.factor_num * self.sequence)
        self.bn2 = torch.nn.BatchNorm1d(self.fc2_neuron * 2 * self.sequence)
        self.bn3 = torch.nn.BatchNorm1d(self.fc2_neuron * 2)
        if self.transformer == True:
            self.q_metrix = nn.Linear(self.factor_num, self.factor_num)
            self.k_metrix = nn.Linear(self.factor_num, self.factor_num)
            self.v_metrix = nn.Linear(self.factor_num, self.factor_num)
            self.MultiheadAttention = nn.MultiheadAttention(self.factor_num, layer_num, batch_first=True)
        
        self.lstm = nn.LSTM(self.factor_num, self.fc2_neuron, layer_num, batch_first=True, bidirectional=True,
                            dropout=0.2)
        self.dropout = nn.Dropout(0.2)
        self.LeakyReLU = nn.LeakyReLU()
        self.out = nn.Linear(self.fc2_neuron * 2, 1)

    def forward(self, x):
        x = self.norm_1(x)
        x = torch.transpose(x, 0, 1)  # x.shape: torch.Size([3, 512, 20, 108])

        final, (hn, cn) = self.transformer_lstm(x[0], 2)  # torch.Size([512, 20, 128])
        for i in range(1, x.shape[0]):
            add, _ = self.transformer_lstm(x[i], 2)
            final = self.skip_connection(final, add)
        # start = torch.Size([512, 20, 128])

        x, _ = self.attention_net(final, hn)
        x = self.bn3(x)
        x = self.LeakyReLU(x)
        x = self.dropout(x)
        y_pred = self.out(x)
        return y_pred

    def norm_1(self, x):
        batch_num, original_shape = x.shape[0], x.shape
        x = x.reshape(batch_num, -1)
        x = self.bn1(x)
        x = x.reshape(original_shape)
        return x

    def norm_2(self, x):
        batch_num, original_shape = x.shape[0], x.shape
        x = x.reshape(batch_num, -1)
        x = self.bn2(x)
        x = x.reshape(original_shape)
        return x

    def transformer_lstm(self, x, layer_num):
        if self.transformer == True:
            q = self.q_metrix(x)
            k = self.k_metrix(x)
            v = self.v_metrix(x)
            x, x_weight = self.MultiheadAttention(q, k, v)  # attn_output = torch.Size([512, 20, 128])
        else:
            continue
        # out = torch.Size([512, 20, 128])
        out, (hn, cn) = self.lstm(x)
        return out, (hn, cn)

    def skip_connection(self, origin, add):
        return self.norm_2(origin + add)

    def attention_net(self, lstm_output, final_state):
        # lstm_output : [batch_size, n_step, n_hidden * num_directions(=2)], F matrix
        # final_state : [num_layers(=1) * num_directions(=2), batch_size, n_hidden]

        hidden = torch.cat((final_state[0], final_state[1]), dim=1).unsqueeze(
            2)  # hidden : [batch_size, n_hidden * num_directions(=2), n_layer(=1)]
        attn_weights = torch.bmm(lstm_output, hidden).squeeze(2)  # [batch_size,sequence]
        attn_weights = torch.nn.functional.softmax(attn_weights, 1)  # [batch_size,sequence]   # torch.Size([512, 20])
        # context: [batch_size, n_hidden * num_directions(=2)]
        output = torch.bmm(lstm_output.transpose(1, 2), attn_weights.unsqueeze(2)).squeeze(
            2)  # [batch_size, n_hidden * num_directions(=2)]
        return output, attn_weights

In [10]:
class AlphaNet_LSTM_V1(nn.Module):
    def __init__(self, factor_num, sequence, fully_connect_layer_neural, attention=False,transformer = False):
        super(AlphaNet_LSTM_V1, self).__init__()
        self.factor_num = factor_num  # 108
        self.sequence = sequence
        self.fc2_neuron = fully_connect_layer_neural  # 32
        self.attention = attention
        self.transformer = transformer
        # Layer
        self.batch = torch.nn.BatchNorm1d(self.sequence * self.factor_num)
        self.lstm = nn.LSTM(self.factor_num, self.fc2_neuron, 3, batch_first=True, bidirectional=True, dropout=0.2)
        self.lstm2 = nn.LSTM(int(self.fc2_neuron * 2), int(self.fc2_neuron / 2), 3, batch_first=True,
                             bidirectional=True, dropout=0.2)
        self.batch2 = torch.nn.BatchNorm1d(int(self.fc2_neuron * 2))
        self.batch3 = torch.nn.BatchNorm1d(self.fc2_neuron)
        self.dropout = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        self.LeakyReLU = nn.LeakyReLU()
        self.out = nn.Linear(self.fc2_neuron, 1)
        if self.transformer == True:
            self.q_metrix = nn.Linear(self.fc2_neuron * 2, self.fc2_neuron * 2)
            self.k_metrix = nn.Linear(self.fc2_neuron * 2, self.fc2_neuron * 2)
            self.v_metrix = nn.Linear(self.fc2_neuron * 2, self.fc2_neuron * 2)
            self.MultiheadAttention = nn.MultiheadAttention(self.fc2_neuron * 2, 2, batch_first=True)
    def forward(self, x):
        x = x.reshape(x.shape[0], -1).float()
        x = self.batch(x)
        x = x.reshape(x.shape[0], self.sequence, self.factor_num)
        
        
        x, _ = self.lstm(x)  # x.shape: torch.Size([6182, 10, 128])
        if self.transformer == True:
            q = self.q_metrix(x)
            k = self.k_metrix(x)
            v = self.v_metrix(x)
            x, _ = self.MultiheadAttention(q, k, v)  # attn_output = torch.Size([512, 20, 128])
        else:
            x = self.LeakyReLU(x)
        
        x = torch.transpose(x, 1, 2)  # x.shape: torch.Size([6182, 128, 10])
        x = self.batch2(x)
        x = torch.transpose(x, 1, 2)

        x, (hn, cn) = self.lstm2(x)  # torch.Size([6182, 10, 64])
        if self.attention == True:
            x, _ = self.attention_net(x, hn)
        else:
            x = x[:, -1]  # torch.Size([6182, 64])
        x = self.batch3(x)  # torch.Size([6182, 64])
        x = self.relu(x)
        x = self.dropout(x)
        y_pred = self.out(x)
        return y_pred

    def attention_net(self, lstm_output, final_state):
        # lstm_output : [batch_size, n_step, n_hidden * num_directions(=2)], F matrix
        # final_state : [num_layers(=1) * num_directions(=2), batch_size, n_hidden]

        hidden = torch.cat((final_state[0], final_state[1]), dim=1).unsqueeze(
            2)  # hidden : [batch_size, n_hidden * num_directions(=2), n_layer(=1)]
        attn_weights = torch.bmm(lstm_output, hidden).squeeze(2)  # [batch_size,sequence]
        attn_weights = torch.nn.functional.softmax(attn_weights, 1)  # [batch_size,sequence]   # torch.Size([512, 20])
        # context: [batch_size, n_hidden * num_directions(=2)]
        output = torch.bmm(lstm_output.transpose(1, 2), attn_weights.unsqueeze(2)).squeeze(
            2)  # [batch_size, n_hidden * num_directions(=2)]
        return output, attn_weights

In [4]:
import os
from os import walk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.utils.data as Data
import torch.optim as optim
from torch.autograd import Variable
from progressbar import ProgressBar
from tqdm import tqdm
import time
import multiprocessing as mp
# trainx = torch.randn(10000,3,20,108)
trainx = torch.randn(10000,20,108)
trainy = torch.randn(10000,1)
print("trainx.shape: " , trainx.shape)

trainx.shape:  torch.Size([10000, 20, 108])


In [5]:
train_dataset = Data.TensorDataset(trainx, trainy)
batch_size = 1024
train_loader = Data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=16,
    pin_memory=True
)

In [6]:
alphanet = AlphaNet_LSTM_V1(108,20,64,attention=True)
alphanet = alphanet.cuda()
# alphanet = torch.nn.parallel.DataParallel(alphanet)
print(alphanet)
total_length = trainx.shape[0]
LR = 0.01
loss_function = nn.MSELoss().cuda()
optimizer = optim.Adam(alphanet.parameters(), lr=LR)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer,step_size=2,gamma = 0.5)
epoch_num = 50
loss_list = []

min_loss = float("inf")
for epoch in tqdm(range(epoch_num)):
    total_loss = 0
    for _, (inputs, outputs) in enumerate(train_loader):
        inputs = Variable(inputs).float().cuda()
        outputs = Variable(outputs).float().cuda()
        optimizer.zero_grad() # noticed:  the grad return to zero before starting the loop
        
        # forward + backward +update
        pred = alphanet(inputs.cuda())
        pred = pred.cuda()
        loss = loss_function(pred, outputs)
        loss.backward()
        optimizer.step()
        
#         lr_list.append(optimizer.state_dict()['param_groups'][0]['lr'])
        total_loss += loss.item()
    total_loss = total_loss * batch_size / total_length
    print('Epoch: ', epoch + 1, ' loss: ', total_loss)
    loss_list.append(total_loss)

  0%|          | 0/50 [00:00<?, ?it/s]

AlphaNet_LSTM_V1(
  (batch): BatchNorm1d(2160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(108, 64, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  (lstm2): LSTM(128, 32, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  (batch2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (relu): ReLU()
  (LeakyReLU): LeakyReLU(negative_slope=0.01)
  (out): Linear(in_features=64, out_features=1, bias=True)
)


  2%|▏         | 1/50 [00:04<04:03,  4.98s/it]

Epoch:  1  loss:  1.1018763671875


  4%|▍         | 2/50 [00:08<03:23,  4.24s/it]

Epoch:  2  loss:  1.0418413024902344


  6%|▌         | 3/50 [00:12<03:08,  4.01s/it]

Epoch:  3  loss:  1.033728106689453


  8%|▊         | 4/50 [00:15<02:55,  3.81s/it]

Epoch:  4  loss:  1.016056024169922


 10%|█         | 5/50 [00:22<03:33,  4.74s/it]

Epoch:  5  loss:  0.9731095764160156


 12%|█▏        | 6/50 [00:26<03:17,  4.49s/it]

Epoch:  6  loss:  0.8978715881347656


 14%|█▍        | 7/50 [00:31<03:16,  4.57s/it]

Epoch:  7  loss:  0.7369903503417968


 16%|█▌        | 8/50 [00:34<03:01,  4.33s/it]

Epoch:  8  loss:  0.57525751953125


 18%|█▊        | 9/50 [00:39<02:56,  4.30s/it]

Epoch:  9  loss:  0.41289476013183596


 20%|██        | 10/50 [00:45<03:13,  4.85s/it]

Epoch:  10  loss:  0.28321459655761716


 22%|██▏       | 11/50 [00:50<03:16,  5.04s/it]

Epoch:  11  loss:  0.1937798370361328


 24%|██▍       | 12/50 [00:54<03:01,  4.78s/it]

Epoch:  12  loss:  0.14945919189453125


 26%|██▌       | 13/50 [00:58<02:43,  4.42s/it]

Epoch:  13  loss:  0.11800237731933594


 28%|██▊       | 14/50 [01:03<02:48,  4.69s/it]

Epoch:  14  loss:  0.1004978385925293


 30%|███       | 15/50 [01:07<02:38,  4.54s/it]

Epoch:  15  loss:  0.08560333938598633


 32%|███▏      | 16/50 [01:11<02:28,  4.38s/it]

Epoch:  16  loss:  0.07583615112304687


 34%|███▍      | 17/50 [01:15<02:17,  4.16s/it]

Epoch:  17  loss:  0.06886293754577637


 36%|███▌      | 18/50 [01:18<02:01,  3.79s/it]

Epoch:  18  loss:  0.061893176651000976


 38%|███▊      | 19/50 [01:21<01:49,  3.53s/it]

Epoch:  19  loss:  0.06322058448791504


 40%|████      | 20/50 [01:23<01:29,  2.99s/it]

Epoch:  20  loss:  0.05671741981506347


 40%|████      | 20/50 [01:24<02:06,  4.22s/it]


KeyboardInterrupt: 

In [11]:
alphanet = AlphaNet_LSTM_V1(108,20,64,attention=True,transformer = True)
alphanet = alphanet.cuda()
# alphanet = torch.nn.parallel.DataParallel(alphanet)
print(alphanet)
total_length = trainx.shape[0]
LR = 0.01
loss_function = nn.MSELoss().cuda()
optimizer = optim.Adam(alphanet.parameters(), lr=LR)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer,step_size=2,gamma = 0.5)
epoch_num = 20
loss_list = []

min_loss = float("inf")
for epoch in tqdm(range(epoch_num)):
    total_loss = 0
    for _, (inputs, outputs) in enumerate(train_loader):
        inputs = Variable(inputs).float().cuda()
        outputs = Variable(outputs).float().cuda()
        optimizer.zero_grad() # noticed:  the grad return to zero before starting the loop
        
        # forward + backward +update
        pred = alphanet(inputs.cuda())
        pred = pred.cuda()
        loss = loss_function(pred, outputs)
        loss.backward()
        optimizer.step()
        
#         lr_list.append(optimizer.state_dict()['param_groups'][0]['lr'])
        total_loss += loss.item()
    total_loss = total_loss * batch_size / total_length
    print('Epoch: ', epoch + 1, ' loss: ', total_loss)
    loss_list.append(total_loss)

  0%|          | 0/20 [00:00<?, ?it/s]

AlphaNet_LSTM_V1(
  (batch): BatchNorm1d(2160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(108, 64, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  (lstm2): LSTM(128, 32, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  (batch2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (relu): ReLU()
  (LeakyReLU): LeakyReLU(negative_slope=0.01)
  (out): Linear(in_features=64, out_features=1, bias=True)
  (q_metrix): Linear(in_features=128, out_features=128, bias=True)
  (k_metrix): Linear(in_features=128, out_features=128, bias=True)
  (v_metrix): Linear(in_features=128, out_features=128, bias=True)
  (MultiheadAttention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
  )
)


  5%|▌         | 1/20 [00:02<00:41,  2.17s/it]

Epoch:  1  loss:  1.1139741821289062


 10%|█         | 2/20 [00:04<00:37,  2.09s/it]

Epoch:  2  loss:  1.0530594177246093


 15%|█▌        | 3/20 [00:06<00:33,  1.98s/it]

Epoch:  3  loss:  1.0436497802734375


 20%|██        | 4/20 [00:07<00:30,  1.90s/it]

Epoch:  4  loss:  1.0428310607910156


 25%|██▌       | 5/20 [00:09<00:28,  1.89s/it]

Epoch:  5  loss:  1.0397243041992188


 30%|███       | 6/20 [00:11<00:26,  1.88s/it]

Epoch:  6  loss:  1.0409728332519532


 35%|███▌      | 7/20 [00:13<00:24,  1.90s/it]

Epoch:  7  loss:  1.0375355712890626


 40%|████      | 8/20 [00:15<00:22,  1.87s/it]

Epoch:  8  loss:  1.0361664855957031


 45%|████▌     | 9/20 [00:17<00:20,  1.89s/it]

Epoch:  9  loss:  1.039566162109375


 50%|█████     | 10/20 [00:19<00:18,  1.85s/it]

Epoch:  10  loss:  1.0366649475097656


 55%|█████▌    | 11/20 [00:21<00:17,  1.95s/it]

Epoch:  11  loss:  1.0397498291015626


 60%|██████    | 12/20 [00:23<00:15,  1.92s/it]

Epoch:  12  loss:  1.0372514404296875


 65%|██████▌   | 13/20 [00:25<00:14,  2.00s/it]

Epoch:  13  loss:  1.0388130310058594


 70%|███████   | 14/20 [00:27<00:11,  1.97s/it]

Epoch:  14  loss:  1.0365106384277343


 75%|███████▌  | 15/20 [00:29<00:09,  1.95s/it]

Epoch:  15  loss:  1.0349675048828124


 80%|████████  | 16/20 [00:30<00:07,  1.93s/it]

Epoch:  16  loss:  1.0379776245117187


 85%|████████▌ | 17/20 [00:32<00:05,  1.91s/it]

Epoch:  17  loss:  1.0362622131347656


 90%|█████████ | 18/20 [00:34<00:03,  1.99s/it]

Epoch:  18  loss:  1.0362174621582032


 95%|█████████▌| 19/20 [00:36<00:01,  1.94s/it]

Epoch:  19  loss:  1.0369715270996094


100%|██████████| 20/20 [00:38<00:00,  1.94s/it]

Epoch:  20  loss:  1.0387520568847657



