In [1]:
import torch
from torch import nn

In [16]:
class Res_LSTM(nn.Module):
    def __init__(self, dimention, factor_num, sequence, fully_connect_layer_neural, layer_num=2,transformer = False):
        super(Res_LSTM, self).__init__()
        self.factor_num = factor_num  # 108
        self.sequence = sequence  #
        self.dimention = dimention  #
        self.fc2_neuron = fully_connect_layer_neural  # 32
        self.transformer = transformer

        # Layer
        self.bn1 = torch.nn.BatchNorm1d(self.dimention * self.factor_num * self.sequence)
        self.bn2 = torch.nn.BatchNorm1d(self.fc2_neuron * 2 * self.sequence)
        self.bn3 = torch.nn.BatchNorm1d(self.fc2_neuron * 2)
        if self.transformer == True:
            self.q_metrix = nn.Linear(self.factor_num, self.factor_num)
            self.k_metrix = nn.Linear(self.factor_num, self.factor_num)
            self.v_metrix = nn.Linear(self.factor_num, self.factor_num)
            self.MultiheadAttention = nn.MultiheadAttention(self.factor_num, layer_num, batch_first=True)
        
        self.lstm = nn.LSTM(self.factor_num, self.fc2_neuron, layer_num, batch_first=True, bidirectional=True,
                            dropout=0.2)
        self.dropout = nn.Dropout(0.2)
        self.LeakyReLU = nn.LeakyReLU()
        self.out = nn.Linear(self.fc2_neuron * 2, 1)

    def forward(self, x):
        x = self.norm_1(x)
        x = torch.transpose(x, 0, 1)  # x.shape: torch.Size([3, 512, 20, 108])

        final, (hn, cn) = self.lstm_layer(x[0], 2)  # torch.Size([512, 20, 128])
        for i in range(1, x.shape[0]):
            add, _ = self.lstm_layer(x[i], 2)
            final = self.skip_connection(final, add)
        # start = torch.Size([512, 20, 128])

        x, _ = self.attention_net(final, hn)
        x = self.bn3(x)
        x = self.LeakyReLU(x)
        x = self.dropout(x)
        y_pred = self.out(x)
        return y_pred

    def norm_1(self, x):
        batch_num, original_shape = x.shape[0], x.shape
        x = x.reshape(batch_num, -1)
        x = self.bn1(x)
        x = x.reshape(original_shape)
        return x

    def norm_2(self, x):
        batch_num, original_shape = x.shape[0], x.shape
        x = x.reshape(batch_num, -1)
        x = self.bn2(x)
        x = x.reshape(original_shape)
        return x

    def lstm_layer(self, x, layer_num):
        if self.transformer == True:
            q = self.q_metrix(x)
            k = self.k_metrix(x)
            v = self.v_metrix(x)
            x, x_weight = self.MultiheadAttention(q, k, v)  # attn_output = torch.Size([512, 20, 128])
        else:
            pass
        # out = torch.Size([512, 20, 128])
        out, (hn, cn) = self.lstm(x)
        return out,(hn, cn)

    def skip_connection(self, origin, add):
        return self.norm_2(origin + add)

    def attention_net(self, lstm_output, final_state):
        # lstm_output : [batch_size, n_step, n_hidden * num_directions(=2)], F matrix
        # final_state : [num_layers(=1) * num_directions(=2), batch_size, n_hidden]

        hidden = torch.cat((final_state[0], final_state[1]), dim=1).unsqueeze(
            2)  # hidden : [batch_size, n_hidden * num_directions(=2), n_layer(=1)]
        attn_weights = torch.bmm(lstm_output, hidden).squeeze(2)  # [batch_size,sequence]
        attn_weights = torch.nn.functional.softmax(attn_weights, 1)  # [batch_size,sequence]   # torch.Size([512, 20])
        # context: [batch_size, n_hidden * num_directions(=2)]
        output = torch.bmm(lstm_output.transpose(1, 2), attn_weights.unsqueeze(2)).squeeze(
            2)  # [batch_size, n_hidden * num_directions(=2)]
        return output, attn_weights

In [30]:
class AlphaNet_LSTM_V1(nn.Module):
    def __init__(self, factor_num, sequence, fully_connect_layer_neural, attention=False,transformer = False):
        super(AlphaNet_LSTM_V1, self).__init__()
        self.factor_num = factor_num  # 108
        self.sequence = sequence
        self.fc2_neuron = fully_connect_layer_neural  # 32
        self.attention = attention
        self.transformer = transformer
        # Layer
        self.batch = torch.nn.BatchNorm1d(self.sequence * self.factor_num)
        self.lstm = nn.LSTM(self.factor_num, self.fc2_neuron, 3, batch_first=True, bidirectional=True, dropout=0.2)
        self.lstm2 = nn.LSTM(int(self.fc2_neuron * 2), int(self.fc2_neuron / 2), 3, batch_first=True,
                             bidirectional=True, dropout=0.2)
        self.batch2 = torch.nn.BatchNorm1d(int(self.fc2_neuron * 2))
        self.batch3 = torch.nn.BatchNorm1d(self.fc2_neuron)
        self.dropout = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        self.LeakyReLU = nn.LeakyReLU()
        self.out = nn.Linear(self.fc2_neuron, 1)
        if self.transformer == True:
            self.TransformerLayer = nn.TransformerEncoderLayer(d_model=self.fc2_neuron * 2, nhead=2,batch_first=True)
    def forward(self, x):
        x = x.reshape(x.shape[0], -1).float()
        x = self.batch(x)
        x = x.reshape(x.shape[0], self.sequence, self.factor_num)
        
        
        x, _ = self.lstm(x)  # x.shape: torch.Size([6182, 10, 128])
        if self.transformer == True:
            x = self.TransformerLayer(x)  # attn_output = torch.Size([512, 20, 128])
        else:
            x = self.LeakyReLU(x)
        
        x = torch.transpose(x, 1, 2)  # x.shape: torch.Size([6182, 128, 10])
        x = self.batch2(x)
        x = torch.transpose(x, 1, 2)

        x, (hn, cn) = self.lstm2(x)  # torch.Size([6182, 10, 64])
        if self.attention == True:
            x, _ = self.attention_net(x, hn)
        else:
            x = x[:, -1]  # torch.Size([6182, 64])
        x = self.batch3(x)  # torch.Size([6182, 64])
        x = self.relu(x)
        x = self.dropout(x)
        y_pred = self.out(x)
        return y_pred

    def attention_net(self, lstm_output, final_state):
        # lstm_output : [batch_size, n_step, n_hidden * num_directions(=2)], F matrix
        # final_state : [num_layers(=1) * num_directions(=2), batch_size, n_hidden]

        hidden = torch.cat((final_state[0], final_state[1]), dim=1).unsqueeze(
            2)  # hidden : [batch_size, n_hidden * num_directions(=2), n_layer(=1)]
        attn_weights = torch.bmm(lstm_output, hidden).squeeze(2)  # [batch_size,sequence]
        attn_weights = torch.nn.functional.softmax(attn_weights, 1)  # [batch_size,sequence]   # torch.Size([512, 20])
        # context: [batch_size, n_hidden * num_directions(=2)]
        output = torch.bmm(lstm_output.transpose(1, 2), attn_weights.unsqueeze(2)).squeeze(
            2)  # [batch_size, n_hidden * num_directions(=2)]
        return output, attn_weights

In [25]:
import os
from os import walk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.utils.data as Data
import torch.optim as optim
from torch.autograd import Variable
from progressbar import ProgressBar
from tqdm import tqdm
import time
import multiprocessing as mp
# trainx = torch.randn(10000,3,20,108)
trainx = torch.randn(10000,20,108)
trainy = torch.randn(10000,1)
print("trainx.shape: " , trainx.shape)

trainx.shape:  torch.Size([10000, 20, 108])


In [26]:
train_dataset = Data.TensorDataset(trainx, trainy)
batch_size = 1024
train_loader = Data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=16,
    pin_memory=True
)

In [32]:
alphanet = AlphaNet_LSTM_V1(108,20,64,attention=True)
# alphanet = Transformer_LSTM(3,108,20,64)
alphanet = alphanet.cuda()
# alphanet = torch.nn.parallel.DataParallel(alphanet)
print(alphanet)
total_length = trainx.shape[0]
LR = 0.01
loss_function = nn.MSELoss().cuda()
optimizer = optim.Adam(alphanet.parameters(), lr=LR)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer,step_size=2,gamma = 0.5)
epoch_num = 50
loss_list = []

min_loss = float("inf")
for epoch in tqdm(range(epoch_num)):
    total_loss = 0
    for _, (inputs, outputs) in enumerate(train_loader):
        inputs = Variable(inputs).float().cuda()
        outputs = Variable(outputs).float().cuda()
        optimizer.zero_grad() # noticed:  the grad return to zero before starting the loop
        
        # forward + backward +update
        pred = alphanet(inputs.cuda())
        pred = pred.cuda()
        loss = loss_function(pred, outputs)
        loss.backward()
        optimizer.step()
        
#         lr_list.append(optimizer.state_dict()['param_groups'][0]['lr'])
        total_loss += loss.item()
    total_loss = total_loss * batch_size / total_length
    print('Epoch: ', epoch + 1, ' loss: ', total_loss)
    loss_list.append(total_loss)

  0%|          | 0/50 [00:00<?, ?it/s]

AlphaNet_LSTM_V1(
  (batch): BatchNorm1d(2160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(108, 64, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  (lstm2): LSTM(128, 32, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  (batch2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (relu): ReLU()
  (LeakyReLU): LeakyReLU(negative_slope=0.01)
  (out): Linear(in_features=64, out_features=1, bias=True)
)


  2%|▏         | 1/50 [00:02<01:42,  2.09s/it]

Epoch:  1  loss:  1.1039344848632813


  4%|▍         | 2/50 [00:04<01:37,  2.03s/it]

Epoch:  2  loss:  1.0544122314453126


  6%|▌         | 3/50 [00:06<01:41,  2.17s/it]

Epoch:  3  loss:  1.0416991271972655


  8%|▊         | 4/50 [00:08<01:40,  2.18s/it]

Epoch:  4  loss:  1.0212333374023437


 10%|█         | 5/50 [00:10<01:40,  2.24s/it]

Epoch:  5  loss:  0.9709564697265625


 12%|█▏        | 6/50 [00:13<01:39,  2.26s/it]

Epoch:  6  loss:  0.8870690368652344


 14%|█▍        | 7/50 [00:15<01:33,  2.18s/it]

Epoch:  7  loss:  0.7571882995605469


 16%|█▌        | 8/50 [00:17<01:29,  2.13s/it]

Epoch:  8  loss:  0.5748836364746094


 18%|█▊        | 9/50 [00:19<01:28,  2.16s/it]

Epoch:  9  loss:  0.41707685546875


 20%|██        | 10/50 [00:21<01:24,  2.12s/it]

Epoch:  10  loss:  0.2955154968261719


 22%|██▏       | 11/50 [00:23<01:24,  2.17s/it]

Epoch:  11  loss:  0.21298082733154297


 24%|██▍       | 12/50 [00:26<01:23,  2.20s/it]

Epoch:  12  loss:  0.15649063415527345


 26%|██▌       | 13/50 [00:28<01:21,  2.20s/it]

Epoch:  13  loss:  0.11655426025390625


 28%|██▊       | 14/50 [00:30<01:20,  2.24s/it]

Epoch:  14  loss:  0.0969146369934082


 30%|███       | 15/50 [00:33<01:21,  2.32s/it]

Epoch:  15  loss:  0.08604855651855468


 32%|███▏      | 16/50 [00:35<01:18,  2.29s/it]

Epoch:  16  loss:  0.07651798629760742


 34%|███▍      | 17/50 [00:37<01:12,  2.21s/it]

Epoch:  17  loss:  0.0717460578918457


 36%|███▌      | 18/50 [00:39<01:08,  2.14s/it]

Epoch:  18  loss:  0.06568725090026856


 38%|███▊      | 19/50 [00:41<01:05,  2.11s/it]

Epoch:  19  loss:  0.059992130661010744


 40%|████      | 20/50 [00:43<01:02,  2.08s/it]

Epoch:  20  loss:  0.05891641540527344


 42%|████▏     | 21/50 [00:45<01:00,  2.09s/it]

Epoch:  21  loss:  0.0524703914642334


 44%|████▍     | 22/50 [00:47<00:58,  2.11s/it]

Epoch:  22  loss:  0.05455800132751465


 46%|████▌     | 23/50 [00:49<00:55,  2.07s/it]

Epoch:  23  loss:  0.05162196006774902


 48%|████▊     | 24/50 [00:51<00:55,  2.12s/it]

Epoch:  24  loss:  0.05290520973205566


 50%|█████     | 25/50 [00:54<00:55,  2.21s/it]

Epoch:  25  loss:  0.05326764640808106


 52%|█████▏    | 26/50 [00:56<00:52,  2.17s/it]

Epoch:  26  loss:  0.046090283203125


 54%|█████▍    | 27/50 [00:58<00:50,  2.18s/it]

Epoch:  27  loss:  0.045197893142700195


 56%|█████▌    | 28/50 [01:00<00:47,  2.16s/it]

Epoch:  28  loss:  0.04462129936218262


 58%|█████▊    | 29/50 [01:02<00:44,  2.12s/it]

Epoch:  29  loss:  0.04852020263671875


 60%|██████    | 30/50 [01:04<00:41,  2.09s/it]

Epoch:  30  loss:  0.048447804260253906


 62%|██████▏   | 31/50 [01:06<00:39,  2.06s/it]

Epoch:  31  loss:  0.046422080993652345


 64%|██████▍   | 32/50 [01:08<00:36,  2.04s/it]

Epoch:  32  loss:  0.0466089111328125


 66%|██████▌   | 33/50 [01:10<00:34,  2.03s/it]

Epoch:  33  loss:  0.04538141403198242


 68%|██████▊   | 34/50 [01:13<00:34,  2.17s/it]

Epoch:  34  loss:  0.04222370376586914


 70%|███████   | 35/50 [01:15<00:32,  2.17s/it]

Epoch:  35  loss:  0.040612707138061525


 72%|███████▏  | 36/50 [01:17<00:30,  2.15s/it]

Epoch:  36  loss:  0.03980071563720703


 74%|███████▍  | 37/50 [01:19<00:27,  2.10s/it]

Epoch:  37  loss:  0.039920468902587894


 76%|███████▌  | 38/50 [01:21<00:26,  2.18s/it]

Epoch:  38  loss:  0.04115127639770508


 78%|███████▊  | 39/50 [01:23<00:23,  2.11s/it]

Epoch:  39  loss:  0.04193022804260254


 80%|████████  | 40/50 [01:26<00:21,  2.18s/it]

Epoch:  40  loss:  0.038758163452148435


 82%|████████▏ | 41/50 [01:28<00:19,  2.17s/it]

Epoch:  41  loss:  0.037868044662475585


 84%|████████▍ | 42/50 [01:30<00:18,  2.25s/it]

Epoch:  42  loss:  0.039712248611450195


 86%|████████▌ | 43/50 [01:32<00:15,  2.20s/it]

Epoch:  43  loss:  0.0378000394821167


 88%|████████▊ | 44/50 [01:35<00:13,  2.25s/it]

Epoch:  44  loss:  0.04109605140686035


 90%|█████████ | 45/50 [01:37<00:11,  2.27s/it]

Epoch:  45  loss:  0.043775876235961915


 92%|█████████▏| 46/50 [01:39<00:08,  2.22s/it]

Epoch:  46  loss:  0.043284220504760745


 94%|█████████▍| 47/50 [01:41<00:06,  2.14s/it]

Epoch:  47  loss:  0.04088177452087402


 96%|█████████▌| 48/50 [01:43<00:04,  2.11s/it]

Epoch:  48  loss:  0.03866119270324707


 98%|█████████▊| 49/50 [01:45<00:02,  2.09s/it]

Epoch:  49  loss:  0.039142678833007816


100%|██████████| 50/50 [01:47<00:00,  2.15s/it]

Epoch:  50  loss:  0.035422092819213866





In [33]:
alphanet = AlphaNet_LSTM_V1(108,20,64,attention=True,transformer = True)
# alphanet = Transformer_LSTM(3,108,20,64)
alphanet = alphanet.cuda()
# alphanet = torch.nn.parallel.DataParallel(alphanet)
print(alphanet)
total_length = trainx.shape[0]
LR = 0.01
loss_function = nn.MSELoss().cuda()
optimizer = optim.Adam(alphanet.parameters(), lr=LR)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer,step_size=2,gamma = 0.5)
epoch_num = 50
loss_list = []

min_loss = float("inf")
for epoch in tqdm(range(epoch_num)):
    total_loss = 0
    for _, (inputs, outputs) in enumerate(train_loader):
        inputs = Variable(inputs).float().cuda()
        outputs = Variable(outputs).float().cuda()
        optimizer.zero_grad() # noticed:  the grad return to zero before starting the loop
        
        # forward + backward +update
        pred = alphanet(inputs.cuda())
        pred = pred.cuda()
        loss = loss_function(pred, outputs)
        loss.backward()
        optimizer.step()
        
#         lr_list.append(optimizer.state_dict()['param_groups'][0]['lr'])
        total_loss += loss.item()
    total_loss = total_loss * batch_size / total_length
    print('Epoch: ', epoch + 1, ' loss: ', total_loss)
    loss_list.append(total_loss)

  0%|          | 0/50 [00:00<?, ?it/s]

AlphaNet_LSTM_V1(
  (batch): BatchNorm1d(2160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(108, 64, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  (lstm2): LSTM(128, 32, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  (batch2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (relu): ReLU()
  (LeakyReLU): LeakyReLU(negative_slope=0.01)
  (out): Linear(in_features=64, out_features=1, bias=True)
  (TransformerLayer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
    )
    (linear1): Linear(in_features=128, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=128, bi

  2%|▏         | 1/50 [00:02<01:51,  2.28s/it]

Epoch:  1  loss:  1.1155769775390625


  4%|▍         | 2/50 [00:04<01:48,  2.26s/it]

Epoch:  2  loss:  1.058502471923828


  6%|▌         | 3/50 [00:06<01:46,  2.27s/it]

Epoch:  3  loss:  1.0536162109375


  8%|▊         | 4/50 [00:09<01:44,  2.27s/it]

Epoch:  4  loss:  1.0476709533691406


 10%|█         | 5/50 [00:11<01:42,  2.27s/it]

Epoch:  5  loss:  1.04472685546875


 12%|█▏        | 6/50 [00:13<01:42,  2.32s/it]

Epoch:  6  loss:  1.0482998413085938


 14%|█▍        | 7/50 [00:16<01:44,  2.44s/it]

Epoch:  7  loss:  1.0453374633789063


 16%|█▌        | 8/50 [00:18<01:40,  2.40s/it]

Epoch:  8  loss:  1.045341827392578


 18%|█▊        | 9/50 [00:21<01:36,  2.36s/it]

Epoch:  9  loss:  1.0449890502929688


 20%|██        | 10/50 [00:23<01:33,  2.33s/it]

Epoch:  10  loss:  1.0437402954101562


 22%|██▏       | 11/50 [00:25<01:30,  2.31s/it]

Epoch:  11  loss:  1.0445158203125


 24%|██▍       | 12/50 [00:27<01:28,  2.33s/it]

Epoch:  12  loss:  1.0488438049316406


 26%|██▌       | 13/50 [00:30<01:28,  2.40s/it]

Epoch:  13  loss:  1.0471238525390625


 28%|██▊       | 14/50 [00:32<01:25,  2.39s/it]

Epoch:  14  loss:  1.0472048645019532


 30%|███       | 15/50 [00:35<01:25,  2.45s/it]

Epoch:  15  loss:  1.0447251220703124


 32%|███▏      | 16/50 [00:38<01:25,  2.52s/it]

Epoch:  16  loss:  1.0446816040039062


 34%|███▍      | 17/50 [00:40<01:25,  2.60s/it]

Epoch:  17  loss:  1.04547255859375


 36%|███▌      | 18/50 [00:43<01:21,  2.55s/it]

Epoch:  18  loss:  1.040444061279297


 38%|███▊      | 19/50 [00:45<01:19,  2.56s/it]

Epoch:  19  loss:  1.0462697692871095


 40%|████      | 20/50 [00:48<01:14,  2.47s/it]

Epoch:  20  loss:  1.0448757934570312


 42%|████▏     | 21/50 [00:50<01:10,  2.43s/it]

Epoch:  21  loss:  1.0438264709472656


 44%|████▍     | 22/50 [00:52<01:06,  2.39s/it]

Epoch:  22  loss:  1.0453653381347656


 46%|████▌     | 23/50 [00:55<01:03,  2.37s/it]

Epoch:  23  loss:  1.044213916015625


 48%|████▊     | 24/50 [00:57<01:01,  2.36s/it]

Epoch:  24  loss:  1.0454440551757813


 50%|█████     | 25/50 [01:00<01:01,  2.48s/it]

Epoch:  25  loss:  1.0467505310058594


 52%|█████▏    | 26/50 [01:02<00:59,  2.47s/it]

Epoch:  26  loss:  1.0449428955078126


 54%|█████▍    | 27/50 [01:04<00:55,  2.41s/it]

Epoch:  27  loss:  1.0436807861328126


 56%|█████▌    | 28/50 [01:07<00:52,  2.40s/it]

Epoch:  28  loss:  1.0440911376953126


 58%|█████▊    | 29/50 [01:10<00:52,  2.51s/it]

Epoch:  29  loss:  1.0461356628417968


 60%|██████    | 30/50 [01:12<00:49,  2.48s/it]

Epoch:  30  loss:  1.0459106262207032


 62%|██████▏   | 31/50 [01:14<00:46,  2.45s/it]

Epoch:  31  loss:  1.0438552978515625


 64%|██████▍   | 32/50 [01:17<00:43,  2.41s/it]

Epoch:  32  loss:  1.0421221313476563


 66%|██████▌   | 33/50 [01:19<00:40,  2.37s/it]

Epoch:  33  loss:  1.0448488647460938


 68%|██████▊   | 34/50 [01:22<00:38,  2.42s/it]

Epoch:  34  loss:  1.044237744140625


 70%|███████   | 35/50 [01:24<00:38,  2.55s/it]

Epoch:  35  loss:  1.0413337890625


 72%|███████▏  | 36/50 [01:27<00:35,  2.55s/it]

Epoch:  36  loss:  1.0417837280273436


 74%|███████▍  | 37/50 [01:29<00:32,  2.53s/it]

Epoch:  37  loss:  1.0438722412109376


 76%|███████▌  | 38/50 [01:32<00:29,  2.47s/it]

Epoch:  38  loss:  1.0460872253417968


 78%|███████▊  | 39/50 [01:34<00:26,  2.42s/it]

Epoch:  39  loss:  1.0436357543945312


 80%|████████  | 40/50 [01:36<00:24,  2.42s/it]

Epoch:  40  loss:  1.044443603515625


 82%|████████▏ | 41/50 [01:39<00:21,  2.37s/it]

Epoch:  41  loss:  1.0414888244628906


 84%|████████▍ | 42/50 [01:41<00:18,  2.35s/it]

Epoch:  42  loss:  1.0440006652832032


 86%|████████▌ | 43/50 [01:43<00:16,  2.31s/it]

Epoch:  43  loss:  1.0432277465820312


 88%|████████▊ | 44/50 [01:46<00:14,  2.42s/it]

Epoch:  44  loss:  1.0439085876464844


 90%|█████████ | 45/50 [01:48<00:11,  2.37s/it]

Epoch:  45  loss:  1.04424599609375


 92%|█████████▏| 46/50 [01:51<00:09,  2.38s/it]

Epoch:  46  loss:  1.040772735595703


 94%|█████████▍| 47/50 [01:53<00:07,  2.45s/it]

Epoch:  47  loss:  1.0472406005859376


 96%|█████████▌| 48/50 [01:56<00:04,  2.49s/it]

Epoch:  48  loss:  1.0418911193847655


 98%|█████████▊| 49/50 [01:58<00:02,  2.43s/it]

Epoch:  49  loss:  1.0430485595703125


100%|██████████| 50/50 [02:01<00:00,  2.42s/it]

Epoch:  50  loss:  1.0408672729492188





In [20]:
encoder_layer = nn.TransformerEncoderLayer(d_model=108, nhead=2,batch_first=True)
src = torch.rand(512, 20, 108)
out = encoder_layer(src)

In [21]:
out.shape

torch.Size([512, 20, 108])