In [None]:
!rm -r ./*
!git clone https://github.com/Mamiglia/BNN_Human_motion
!mv BNN_Human_motion/* .

In [None]:
!pip install bayesian-torch lightning

In [3]:
from funcs.dataloader import load_dataset
from funcs.pos_embed_p import Pos_Embed
from funcs.loss import *
from funcs.utils import h36motion3d as datasets
from funcs.utils.data_utils import *

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt
import time
from torch.utils.data import Dataset
import torch
from torch.utils.data import DataLoader

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device,  '- Type:', torch.cuda.get_device_name(0))

Using device: cuda - Type: Tesla T4


In [6]:
dataset, vald_dataset = load_dataset(output_n=15, batch_size=32)

Loading Train Dataset...
Loading Validation Dataset...
>>> Training dataset length: 181577
>>> Validation dataset length: 28410


# Costum Model from HW3
- Encoder: Transfomer + CNN
- Decoder: LSTM

## Definition

### Encoder

In [6]:
class Attention(nn.Module):
    # CLASSICAL ATTENTION MECHANISM
    # IT'S THE SAME OF THE TEORICAL PART

    def __init__(self, attn_dropout):
        super().__init__()
        self.dropout = nn.Dropout(attn_dropout)
    def forward(self, query, key, value, mask=None):
        attn = torch.matmul(query, key.transpose(-2, -1))
        d_k = query.size(-1)
        attn = attn / (d_k ** 0.5)
        if mask is not None:
            attn = attn.masked_fill(mask == 0, -1e9)
        attn = self.dropout(F.softmax(attn,-1))
        output = torch.matmul(attn, value)
        return output, attn

In [7]:
class MultiHeadAttention(nn.Module):
    # ALSO THE MULTIHEAD ATTENTION MECHANISM IS THE SAME OF TEORICAL PART

    def __init__(self, num_heads, d_model, dropout):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0
        #  We assume d_v always equals d_k
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        self.query_ff = nn.Linear(d_model, d_model)
        self.key_ff = nn.Linear(d_model, d_model)
        self.value_ff = nn.Linear(d_model, d_model)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
        self.attention = Attention(attn_dropout=dropout)
    def forward(self, query, key, value, mask=None, return_attention=False):
        if mask is not None:
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)
        query = self.query_ff(query).view(nbatches, -1, self.num_heads, self.d_k).transpose(1, 2)
        key = self.key_ff(key).view(nbatches, -1, self.num_heads, self.d_k).transpose(1, 2)
        value = self.value_ff(value).view(nbatches, -1, self.num_heads, self.d_k).transpose(1, 2)
        x, self.attn = self.attention(query, key, value, mask)
        x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.num_heads * self.d_k)
        if return_attention:
            return self.attn_ff(x), self.attn
        return x

In [8]:
class EncoderBlock(nn.Module):
    # THE ENCODER BLOCK IS A TIPICAL TRANSFORMER ENCODER,
    # BUT WE HAVE ADDED A CONVOLUTIONAL LAYER AT THE END:
    # self.conv IS USED TO SHRINK THE INFORMATION ALONG THE ENCODER BLOCKS
    # THIS IS A METHOD TO DISTILL INFORMATION SHOULD BE PASSED TO THE DECODER
    # AND MOREOVER TO SAVE A LOT OF TIME

    def __init__(self,num_heads,d_model,time_in,time_out,num_joints,dropout):
        super().__init__()

        self.num_joints = num_joints
        self.d_model = d_model

        self.self_attn = MultiHeadAttention(num_heads, d_model, dropout)

        # LAYERNORM LAYERS AND DROPOUT
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

        # MLP
        self.lin_net = nn.Sequential(
            nn.Linear(d_model,2*d_model),
            nn.ReLU(),
            nn.Linear(2*d_model,d_model),
            nn.ReLU()
        )

        # A CNN SHRINKS THE FRAMES FROM time_it TO time_out
        # AND THE d_model IS HALVED AFTER EACH BLOCK
        self.conv = nn.Conv1d(time_in,time_out,3,padding=1)

    def forward(self, xs, mask=None):
        x = xs
        att = self.self_attn(x, x, x, mask)
        x = self.relu(x + att)
        x = self.norm1(x)

        lin_output = self.lin_net(x)
        x = self.relu(x + self.dropout(lin_output))

        # WE ADD A RESIDUAL CONNECTION ALSO HERE
        # IT REDUCES A BIT THE VANISH GRADIENT
        x = self.norm2(x + xs)

        # SHRINK
        x = self.conv(x)

        return x

### Decoder

In [27]:
class Decoder(nn.Module):
    # THE DECODER BLOCK IS A SIMPLE LSTM,
    # WHICH TAKES THE DISTILLED INFORMATION FROM THE
    # ENCODER AS HIDDEN STATE AND AUTOREGRESSIVLY
    # FORSEES THE NEXT FRAMES
    def __init__(self, hidden_dim, num_layers=1):
        super(Decoder, self).__init__()
        # LSTM
        self.lstm = nn.LSTM(66, hidden_dim, num_layers=num_layers)
        # LINEAR self.out TO RETURN THE ORIGINAL DIMENSIONS
        self.out = nn.Linear(hidden_dim, 66)

    def forward(self, hidden, num_steps):
        # CELL STATE IS INITIALIZED TO ZEROS
        cell = nn.Parameter(torch.zeros(*hidden.shape)).to(device)
        batch_size = hidden.size(1)
        # THE <START> IS INITIALIZED TO ONES
        input = torch.ones((batch_size,66), dtype=torch.float).unsqueeze(0).to(device)
        outputs = torch.zeros((num_steps,batch_size,66), dtype=torch.float).to(device)

        for t in range(num_steps):
            # FORCAST
            decoder_output, (_,_) = self.lstm(input, (hidden,cell))
            # SET TO THE ORIGINAL DIMENSIONS
            decoder_output = self.out(decoder_output[-1])
            outputs[t] = decoder_output
            # STACK TO THE INPUT
            input = torch.cat((input,decoder_output.unsqueeze(0)),0)
        return outputs

### Model

In [28]:
def conv_init(conv):
    nn.init.kaiming_normal_(conv.weight, nonlinearity='relu')

In [29]:
def fc_init(fc):
    nn.init.kaiming_normal_(fc.weight, nonlinearity='relu')

In [30]:
def bn_init(bn, scale):
    nn.init.constant_(bn.weight, scale)
    nn.init.constant_(bn.bias, 0)

In [31]:
class Model(nn.Module):
    # THIS IS THE FINAL MODEL IN WHICH WE MERGED THE ENCODER AND DECODER PARTS
    def __init__(self, num_channels, num_frames_out,
                 old_frames, num_joints, num_heads, drop,
                 d_model = 512, config=None):
        super().__init__()

        # CONFIGURATION FOR THE ENCODER BLOCKS
        if config==None:
            self.config = [[d_model,10,8],[d_model,8,6],[d_model,6,4],[d_model,4,2],[d_model,2,1]]

        self.num_channels = num_channels
        self.num_frames_out = num_frames_out
        self.num_heads = num_heads
        self.num_joints = num_joints
        self.old_frames = old_frames
        self.d_model = d_model

        # LINEAR BLOCK TO PASS FROM INITIAL DIMENSION 66 TO d_model
        self.lin = nn.Sequential(nn.Linear(self.num_channels*self.num_joints,d_model),nn.BatchNorm1d(self.old_frames))
        self.norm = nn.BatchNorm2d(self.num_channels)

        # ENCODER
        self.blocks = nn.ModuleList()
        # d_ : d_model of the block;
        # in_ : number of frames of the sequences at the begin;
        # out_ : number of frames of the sequences at the end
        for index, (d_,in_,out_) in enumerate(self.config):
            self.blocks.append(EncoderBlock(num_heads=self.num_heads,
                                            d_model=d_, time_in=in_, time_out=out_,
                                            num_joints=self.num_joints,dropout=drop))

        # SINUSOIDAL POSITIONAL EMBEDDINGS
        self.pos = Pos_Embed(self.num_channels,self.old_frames,self.num_joints)
        # DECODER: HIDDEN STATE DIMENSION IS THE DIMENSION OF THE LAST ENCODER
        self.dec = Decoder(self.d_model)

        # WEIGHTS INITIALIZATION
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                conv_init(m)
            if isinstance(m, nn.Conv1d):
                conv_init(m)
            elif isinstance(m, nn.BatchNorm2d):
                bn_init(m,1)
            elif isinstance(m, nn.BatchNorm1d):
                bn_init(m,1)
            elif isinstance(m, nn.Linear):
                fc_init(m)

    def forward(self,x):
        x = x.view(-1,self.old_frames,self.num_joints,self.num_channels).permute(0,3,1,2)
        # SUM POSITIONAL EMBEDDING
        x = (x + self.pos(x)).permute(0,2,3,1).view(-1,self.old_frames,self.num_joints*self.num_channels)
        # LINEAR LAYER
        x = self.lin(x)
        # ENCODER BLOCKS
        for i, block in enumerate(self.blocks):
            x = block(x)
        # RETRIEVE CONTEXT FROM THE ENCODER
        context = x.view(-1,self.d_model).unsqueeze(0)
        # PASS TO THE DECODER AND RETURN THE RESULTS
        results = self.dec(hidden = context,num_steps = self.num_frames_out)
        results = results.permute(1,0,2)
        return results

## Training

In [32]:
# Argument for training
n_epochs = 5
log_step = 100
input_n = 10
output_n = 15

# The model name to save/load
datas = 'h36m'
model_path = datas + '_3d_' + str(output_n) + 'frames_ckpt'

model = Model(num_channels=3,
              num_frames_out=output_n,
              old_frames=input_n,
              num_joints=22,
              num_heads=8,
              drop=0.3).to(device)

# Arguments to setup the optimizer
lr = 5e-04 # learning rate
use_scheduler = True # use MultiStepLR scheduler
milestones = [2, 2, 2, 5, 5]   # the epochs after which the learning rate is adjusted by gamma
gamma = 0.5 #gamma correction to the learning rate, after reaching the milestone epochs
weight_decay = 0.00003 # weight decay (L2 penalty)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

if use_scheduler:
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=gamma)

In [33]:
# WE HAVE REWRITE THE TRAIN FUNCTION
# TO IMPLEMENT THE SPEED REPRESENTATION

save_and_plot = True
def train_final(data_loader, vald_loader, clip_grad=None):
  train_loss = []
  val_loss = []
  val_loss_best = 1000

  # Initialize lists to store data from each checkpoint
  train_losses = []
  val_losses = []

  dim_used = np.array([6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 21, 22, 23, 24, 25,
                    26, 27, 28, 29, 30, 31, 32, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
                    46, 47, 51, 52, 53, 54, 55, 56, 57, 58, 59, 63, 64, 65, 66, 67, 68,
                    75, 76, 77, 78, 79, 80, 81, 82, 83, 87, 88, 89, 90, 91, 92])

  for epoch in range(n_epochs):
      running_loss = 0
      n = 0
      model.train()
      for cnt, batch in enumerate(data_loader):
          batch = batch.float().to(device)
          batch_dim = batch.shape[0]
          n += batch_dim

          # GET SPEED REPRESENTATION:
          # SET THE FIRST FRAME TO ZERO AND CALCULATE THE SPEED AS
          # Y_{N+1} = X_{N+1} - X_{N}
          sequences_train = torch.cat((torch.zeros(*batch[:, :1, dim_used].size()).to(device),batch[:, 1:10, dim_used]-batch[:, :9, dim_used]), 1)
          sequences_gt = batch[:, input_n:input_n + output_n, dim_used]

          optimizer.zero_grad()
          sequences_predict=model(sequences_train)
          # COME BACK POSITIONAL REPRESENTATION:
          # SUM EACH FRAME TO THE NEXT
          # X_{N+1} = Y_{N+1} + Y_{N}
          # ADD THE LAST FRAME OF THE TRAIN ONES
          sequences_predict[:, 1:output_n, :] = sequences_predict[:, 1:output_n, :] + sequences_predict[:, :output_n-1, :]
          sequences_predict = (sequences_predict + batch[:, (input_n-1):input_n, dim_used])


          loss = mpjpe_error(sequences_predict, sequences_gt) / output_n


          if cnt % log_step == 0:
            print('[Epoch: %d, Iteration: %5d]  Training loss: %.3f' %(epoch+1, cnt+1, loss.item()*output_n))

          loss.backward()
          if clip_grad is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

          optimizer.step()
          running_loss += loss*batch_dim*output_n

      train_loss.append(running_loss.detach().cpu()/n)
      model.eval()
      with torch.no_grad():
          running_loss = 0
          n = 0
          for cnt, batch in enumerate(vald_loader):
              batch = batch.float().to(device)
              batch_dim = batch.shape[0]
              n += batch_dim

              # GET SPEED REPRESENTATION:
              # SET THE FIRST FRAME TO ZERO AND CALCULATE THE SPEED AS
              # Y_{N+1} = X_{N+1} - X_{N}
              sequences_train = torch.cat((torch.zeros(*batch[:, :1, dim_used].size()).to(device), batch[:, 1:input_n, dim_used] - batch[:, :input_n-1, dim_used]), 1)
              sequences_gt = batch[:, input_n:input_n + output_n, dim_used]

              sequences_predict = model(sequences_train)

              # COME BACK POSITIONAL REPRESENTATION:
              # SUM EACH FRAME TO THE NEXT
              # X_{N+1} = Y_{N+1} + Y_{N}
              # ADD THE LAST FRAME OF THE TRAIN ONES
              sequences_predict[:, 1:output_n, :] = sequences_predict[:, 1:output_n, :] + sequences_predict[:, :(output_n-1), :]
              sequences_predict = (sequences_predict + batch[:, (input_n-1):input_n, dim_used])
              loss = mpjpe_error(sequences_predict, sequences_gt) / output_n

              if cnt % log_step == 0:
                print('[Epoch: %d, Iteration: %5d]  Validation loss: %.3f' %(epoch+1, cnt+1, loss.item()*output_n))
              running_loss += loss * batch_dim * output_n
          val_loss.append(running_loss.detach().cpu()/n)
          if running_loss/n < val_loss_best:
            val_loss_best = running_loss/n
          if (epoch+1) % 5 == 0:
            torch.save(model.state_dict(), './checkpoints/LSTM_final_checkpoint_' + str(epoch+1) + '.pt')
            train_losses.append(train_loss[-1])
            val_losses.append(val_loss[-1])

  if use_scheduler:
    scheduler.step()

  epochs=[5]#,10,15,20]

  # Create the plot
  plt.figure(figsize=(10, 6))
  plt.plot(epochs, train_losses, label='Train Loss', marker='o')
  plt.plot(epochs, val_losses, label='Validation Loss', marker='o')

  plt.xlabel('Epoch')
  plt.ylabel('Loss')
  plt.title('Training and Validation Loss Over Epochs')

  plt.legend()

  # Display the plot
  plt.grid(True)
  plt.show()
  return train_losses,val_losses

In [34]:
t_loss, v_loss = train_final(dataset, vald_dataset)

[Epoch: 1, Iteration:     1]  Training loss: 89.595


KeyboardInterrupt: ignored

### Analysis and test

In [None]:
def test(ckpt_path=None):
    model.load_state_dict(torch.load(ckpt_path))
    print('model loaded')
    model.eval()
    accum_loss = 0
    n_batches = 0
    actions = define_actions(actions_to_consider_test)
    dim_used = np.array([ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 21, 22, 23, 24, 25,
                          26, 27, 28, 29, 30, 31, 32, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
                          46, 47, 51, 52, 53, 54, 55, 56, 57, 58, 59, 63, 64, 65, 66, 67, 68,
                          75, 76, 77, 78, 79, 80, 81, 82, 83, 87, 88, 89, 90, 91, 92 ])
    # joints at same loc
    joint_to_ignore = np.array([16, 20, 23, 24, 28, 31])

    index_to_ignore = np.concatenate((joint_to_ignore * 3, joint_to_ignore * 3 + 1, joint_to_ignore * 3 + 2))
    joint_equal = np.array([13, 19, 22, 13, 27, 30])
    index_to_equal = np.concatenate((joint_equal*3, joint_equal*3+1, joint_equal*3+2))
    totalll = 0
    counter = 0
    for action in actions:
      running_loss = 0
      n = 0
      dataset_test = datasets.Datasets(path, input_n, 25, skip_rate, split=2, actions=[action])
      #print('>>> test action for sequences: {:d}'.format(dataset_test.__len__()))

      test_loader = DataLoader(dataset_test, batch_size=batch_size_test, shuffle=False, num_workers=0, pin_memory=True)
      for cnt,batch in enumerate(test_loader):
        with torch.no_grad():

          batch=batch.to(device)
          batch_dim=batch.shape[0]
          n+=batch_dim

          all_joints_seq=batch.clone()[:, input_n:input_n+25,:]

          # GET SPEED REPRESENTATION OF THE FIRST FRAMES:
          # SET THE FIRST FRAME TO ZERO AND CALCULATE THE SPEED AS
          # Y_{N+1} = X_{N+1} - X_{N}
          sequences_train1=torch.cat((torch.zeros(*batch[:,:1,dim_used].size()).to(device),batch[:,1:input_n,dim_used]-batch[:,:(input_n-1),dim_used]),1)
          sequences_gt=batch[:,10:35,:]


          running_time = time.time()
          sequences_predict1 = model(sequences_train1)

          # COME BACK POSITIONAL REPRESENTATION OF THE FIRST PREDICTION:
          # SUM EACH FRAME TO THE NEXT
          # X_{N+1} = Y_{N+1} + Y_{N}
          # ADD THE LAST FRAME OF THE TRAIN ONES
          sequences_predict1[:,1:output_n,:]=sequences_predict1[:,1:output_n,:]+sequences_predict1[:,:(output_n-1),:]
          sequences_predict1=(sequences_predict1+batch[:,(input_n-1):input_n,dim_used])

          # GET SPEED REPRESENTATION OF THE PREDICTED STEPS:
          # SET THE FIRST FRAME TO ZERO AND CALCULATE THE SPEED AS
          # Y_{N+1} = X_{N+1} - X_{N}
          sequences_train2=torch.cat((torch.zeros(*batch[:,:1,dim_used].size()).to(device),
                                     sequences_predict1[:,6:15,:]-sequences_predict1[:,5:14,:]),1)

          sequences_predict2=model(sequences_train2)

          # COME BACK POSITIONAL REPRESENTATION OF THE SECOND PREDICTION:
          # SUM EACH FRAME TO THE NEXT
          # X_{N+1} = Y_{N+1} + Y_{N}
          # ADD THE LAST FRAME OF THE TRAIN ONES
          sequences_predict2[:,1:output_n,:]=sequences_predict2[:,1:output_n,:]+sequences_predict2[:,:(output_n-1),:]
          sequences_predict2=(sequences_predict2+sequences_predict1[:,(output_n-1):output_n,:])

          # STACK THE TWO PREDICTED SEQUENCES
          sequences_predict = torch.cat((sequences_predict1,sequences_predict2[:,:10,:]),1)

          totalll += time.time()-running_time
          counter += 1

          all_joints_seq[:,:,dim_used] = sequences_predict


          all_joints_seq[:,:,index_to_ignore] = all_joints_seq[:,:,index_to_equal]

          loss = mpjpe_error(all_joints_seq.view(-1,25,32,3),sequences_gt.view(-1,25,32,3))
          running_loss += loss*batch_dim
          accum_loss += loss*batch_dim

      #print('loss at test subject for action : '+str(action)+ ' is: '+ str(running_loss/n))
      print(str(action),': ', str(np.round((running_loss/n).item(),1)))
      n_batches+=n
    print('Average: ' + str(np.round((accum_loss/n_batches).item(),1)))
    print('Prediction time: ', totalll/counter)

In [None]:
path = './data/h3.6m/h3.6m/dataset'
skip_rate = 1
batch_size_test = 8
actions_to_consider_test = 'all'
ckpt_path = './checkpoints/LSTM_final_checkpoint_5.pt'

test(ckpt_path)

# Bayesian LSTM version
- Encoder: Transformer + CNN
- Decoder: Bayesian LSTM

In [None]:
!pip install blitz-bayesian-pytorch

In [8]:
# to use the blitz-bayesian library
from blitz.modules import BayesianLinear
from blitz.modules import BayesianLSTM

In [7]:
# to use the bayesian-torch library
from bayesian_torch.layers import LinearReparameterization as BayesianLinear
from bayesian_torch.layers import LSTMReparameterization as BayesianLSTM

## Definition

### Encoder

In [9]:
class Attention(nn.Module):
    # CLASSICAL ATTENTION MECHANISM
    # IT'S THE SAME OF THE TEORICAL PART

    def __init__(self, attn_dropout):
        super().__init__()
        self.dropout = nn.Dropout(attn_dropout)
    def forward(self, query, key, value, mask=None):
        attn = torch.matmul(query, key.transpose(-2, -1))
        d_k = query.size(-1)
        attn = attn / (d_k ** 0.5)
        if mask is not None:
            attn = attn.masked_fill(mask == 0, -1e9)
        attn = self.dropout(F.softmax(attn,-1))
        output = torch.matmul(attn, value)
        return output, attn

In [10]:
class MultiHeadAttention(nn.Module):
    # ALSO THE MULTIHEAD ATTENTION MECHANISM IS THE SAME OF TEORICAL PART

    def __init__(self, num_heads, d_model, dropout):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0
        #  We assume d_v always equals d_k
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        self.query_ff = nn.Linear(d_model, d_model)
        self.key_ff = nn.Linear(d_model, d_model)
        self.value_ff = nn.Linear(d_model, d_model)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
        self.attention = Attention(attn_dropout=dropout)
    def forward(self, query, key, value, mask=None, return_attention=False):
        if mask is not None:
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)
        query = self.query_ff(query).view(nbatches, -1, self.num_heads, self.d_k).transpose(1, 2)
        key = self.key_ff(key).view(nbatches, -1, self.num_heads, self.d_k).transpose(1, 2)
        value = self.value_ff(value).view(nbatches, -1, self.num_heads, self.d_k).transpose(1, 2)
        x, self.attn = self.attention(query, key, value, mask)
        x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.num_heads * self.d_k)
        if return_attention:
            return self.attn_ff(x), self.attn
        return x

In [11]:
class EncoderBlock(nn.Module):
    # THE ENCODER BLOCK IS A TIPICAL TRANSFORMER ENCODER,
    # BUT WE HAVE ADDED A CONVOLUTIONAL LAYER AT THE END:
    # self.conv IS USED TO SHRINK THE INFORMATION ALONG THE ENCODER BLOCKS
    # THIS IS A METHOD TO DISTILL INFORMATION SHOULD BE PASSED TO THE DECODER
    # AND MOREOVER TO SAVE A LOT OF TIME

    def __init__(self,num_heads,d_model,time_in,time_out,num_joints,dropout):
        super().__init__()

        self.num_joints = num_joints
        self.d_model = d_model

        self.self_attn = MultiHeadAttention(num_heads, d_model, dropout)

        # LAYERNORM LAYERS AND DROPOUT
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

        # MLP
        self.lin_net = nn.Sequential(
            nn.Linear(d_model,2*d_model),
            nn.ReLU(),
            nn.Linear(2*d_model,d_model),
            nn.ReLU()
        )

        # A CNN SHRINKS THE FRAMES FROM time_it TO time_out
        # AND THE d_model IS HALVED AFTER EACH BLOCK
        self.conv = nn.Conv1d(time_in,time_out,3,padding=1)

    def forward(self, xs, mask=None):
        x = xs
        att = self.self_attn(x, x, x, mask)
        x = self.relu(x + att)
        x = self.norm1(x)

        lin_output = self.lin_net(x)
        x = self.relu(x + self.dropout(lin_output))

        # WE ADD A RESIDUAL CONNECTION ALSO HERE
        # IT REDUCES A BIT THE VANISH GRADIENT
        x = self.norm2(x + xs)

        # SHRINK
        x = self.conv(x)

        return x

### Decoder

In [12]:
class Decoder(nn.Module):
    # THE DECODER BLOCK IS A SIMPLE LSTM,
    # WHICH TAKES THE DISTILLED INFORMATION FROM THE
    # ENCODER AS HIDDEN STATE AND AUTOREGRESSIVLY
    # FORSEES THE NEXT FRAMES
    def __init__(self, hidden_dim, num_layers=1):
        super(Decoder, self).__init__()
        # LSTM
        self.lstm = BayesianLSTM(66, hidden_dim)
        # LINEAR self.out TO RETURN THE ORIGINAL DIMENSIONS
        self.out = nn.Linear(hidden_dim, 66)

    def forward(self, hidden, num_steps):
        # CELL STATE IS INITIALIZED TO ZEROS
        cell = nn.Parameter(torch.zeros(*hidden.shape)).to(device)
        batch_size = hidden.size(1)
        # THE <START> IS INITIALIZED TO ONES
        input = torch.ones((batch_size,66), dtype=torch.float).unsqueeze(0).to(device)
        outputs = torch.zeros((num_steps,batch_size,66), dtype=torch.float).to(device)

        for t in range(num_steps):
            # FORCAST
            decoder_output, (_,_) = self.lstm(input) # if blitz-bayesian library
            #decoder_output, _, kl = self.lstm(input) # if bayesian-torch library
            # SET TO THE ORIGINAL DIMENSIONS
            decoder_output = self.out(decoder_output[-1])
            outputs[t] = decoder_output
            # STACK TO THE INPUT
            input = torch.cat((input,decoder_output.unsqueeze(0)),0)
        return outputs

### Model

In [13]:
def conv_init(conv):
    nn.init.kaiming_normal_(conv.weight, nonlinearity='relu')

In [14]:
def fc_init(fc):
    nn.init.kaiming_normal_(fc.weight, nonlinearity='relu')

In [15]:
def bn_init(bn, scale):
    nn.init.constant_(bn.weight, scale)
    nn.init.constant_(bn.bias, 0)

In [16]:
class Model(nn.Module):
    # THIS IS THE FINAL MODEL IN WHICH WE MERGED THE ENCODER AND DECODER PARTS
    def __init__(self, num_channels, num_frames_out,
                 old_frames, num_joints, num_heads, drop,
                 d_model = 512, config=None):
        super().__init__()

        # CONFIGURATION FOR THE ENCODER BLOCKS
        if config==None:
            self.config = [[d_model,10,8],[d_model,8,6],[d_model,6,4],[d_model,4,2],[d_model,2,1]]

        self.num_channels = num_channels
        self.num_frames_out = num_frames_out
        self.num_heads = num_heads
        self.num_joints = num_joints
        self.old_frames = old_frames
        self.d_model = d_model

        # LINEAR BLOCK TO PASS FROM INITIAL DIMENSION 66 TO d_model
        self.lin = nn.Sequential(nn.Linear(self.num_channels*self.num_joints,d_model),nn.BatchNorm1d(self.old_frames))
        self.norm = nn.BatchNorm2d(self.num_channels)

        # ENCODER
        self.blocks = nn.ModuleList()
        # d_ : d_model of the block;
        # in_ : number of frames of the sequences at the begin;
        # out_ : number of frames of the sequences at the end
        for index, (d_,in_,out_) in enumerate(self.config):
            self.blocks.append(EncoderBlock(num_heads=self.num_heads,
                                            d_model=d_, time_in=in_, time_out=out_,
                                            num_joints=self.num_joints,dropout=drop))

        # SINUSOIDAL POSITIONAL EMBEDDINGS
        self.pos = Pos_Embed(self.num_channels,self.old_frames,self.num_joints)
        # DECODER: HIDDEN STATE DIMENSION IS THE DIMENSION OF THE LAST ENCODER
        self.dec = Decoder(self.d_model)

        # WEIGHTS INITIALIZATION
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                conv_init(m)
            if isinstance(m, nn.Conv1d):
                conv_init(m)
            elif isinstance(m, nn.BatchNorm2d):
                bn_init(m,1)
            elif isinstance(m, nn.BatchNorm1d):
                bn_init(m,1)
            elif isinstance(m, nn.Linear):
                fc_init(m)

    def forward(self,x):
        x = x.view(-1,self.old_frames,self.num_joints,self.num_channels).permute(0,3,1,2)
        # SUM POSITIONAL EMBEDDING
        x = (x + self.pos(x)).permute(0,2,3,1).view(-1,self.old_frames,self.num_joints*self.num_channels)
        # LINEAR LAYER
        x = self.lin(x)
        # ENCODER BLOCKS
        for i, block in enumerate(self.blocks):
            x = block(x)
        # RETRIEVE CONTEXT FROM THE ENCODER
        context = x.view(-1,self.d_model).unsqueeze(0)
        # PASS TO THE DECODER AND RETURN THE RESULTS
        results = self.dec(hidden = context,num_steps = self.num_frames_out)
        results = results.permute(1,0,2)
        return results

## Training

In [17]:
# Argument for training
n_epochs = 5
log_step = 50
input_n = 10
output_n = 15

# The model name to save/load
datas = 'h36m'
model_path = datas + '_3d_' + str(output_n) + 'frames_ckpt'

model = Model(num_channels=3,
              num_frames_out=output_n,
              old_frames=input_n,
              num_joints=22,
              num_heads=8,
              drop=0.3).to(device)

# Arguments to setup the optimizer
lr = 5e-04 # learning rate
use_scheduler = True # use MultiStepLR scheduler
milestones = [2, 2, 2, 5, 5]   # the epochs after which the learning rate is adjusted by gamma
gamma = 0.5 #gamma correction to the learning rate, after reaching the milestone epochs
weight_decay = 0.00003 # weight decay (L2 penalty)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

if use_scheduler:
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=gamma)

In [18]:
# WE HAVE REWRITE THE TRAIN FUNCTION
# TO IMPLEMENT THE SPEED REPRESENTATION

save_and_plot = False
def train_final(data_loader, vald_loader, clip_grad=None):
  train_loss = []
  val_loss = []
  val_loss_best = 1000

  # Initialize lists to store data from each checkpoint
  train_losses = []
  val_losses = []

  dim_used = np.array([6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 21, 22, 23, 24, 25,
                    26, 27, 28, 29, 30, 31, 32, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
                    46, 47, 51, 52, 53, 54, 55, 56, 57, 58, 59, 63, 64, 65, 66, 67, 68,
                    75, 76, 77, 78, 79, 80, 81, 82, 83, 87, 88, 89, 90, 91, 92])

  for epoch in range(n_epochs):
      running_loss = 0
      n = 0
      model.train()
      for cnt, batch in enumerate(data_loader):
          batch = batch.float().to(device)
          batch_dim = batch.shape[0]
          n += batch_dim

          # GET SPEED REPRESENTATION:
          # SET THE FIRST FRAME TO ZERO AND CALCULATE THE SPEED AS
          # Y_{N+1} = X_{N+1} - X_{N}
          sequences_train = torch.cat((torch.zeros(*batch[:, :1, dim_used].size()).to(device),batch[:, 1:10, dim_used]-batch[:, :9, dim_used]), 1)
          sequences_gt = batch[:, input_n:input_n + output_n, dim_used]

          optimizer.zero_grad()
          sequences_predict=model(sequences_train)
          # COME BACK POSITIONAL REPRESENTATION:
          # SUM EACH FRAME TO THE NEXT
          # X_{N+1} = Y_{N+1} + Y_{N}
          # ADD THE LAST FRAME OF THE TRAIN ONES
          sequences_predict[:, 1:output_n, :] = sequences_predict[:, 1:output_n, :] + sequences_predict[:, :output_n-1, :]
          sequences_predict = (sequences_predict + batch[:, (input_n-1):input_n, dim_used])


          loss = mpjpe_error(sequences_predict, sequences_gt) / output_n


          if cnt % log_step == 0:
            print('[Epoch: %d, Iteration: %5d]  Training loss: %.3f' %(epoch+1, cnt+1, loss.item()*output_n))

          loss.backward()
          if clip_grad is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

          optimizer.step()
          running_loss += loss*batch_dim*output_n

      train_loss.append(running_loss.detach().cpu()/n)
      model.eval()
      with torch.no_grad():
          running_loss = 0
          n = 0
          for cnt, batch in enumerate(vald_loader):
              batch = batch.float().to(device)
              batch_dim = batch.shape[0]
              n += batch_dim

              # GET SPEED REPRESENTATION:
              # SET THE FIRST FRAME TO ZERO AND CALCULATE THE SPEED AS
              # Y_{N+1} = X_{N+1} - X_{N}
              sequences_train = torch.cat((torch.zeros(*batch[:, :1, dim_used].size()).to(device), batch[:, 1:input_n, dim_used] - batch[:, :input_n-1, dim_used]), 1)
              sequences_gt = batch[:, input_n:input_n + output_n, dim_used]

              sequences_predict = model(sequences_train)

              # COME BACK POSITIONAL REPRESENTATION:
              # SUM EACH FRAME TO THE NEXT
              # X_{N+1} = Y_{N+1} + Y_{N}
              # ADD THE LAST FRAME OF THE TRAIN ONES
              sequences_predict[:, 1:output_n, :] = sequences_predict[:, 1:output_n, :] + sequences_predict[:, :(output_n-1), :]
              sequences_predict = (sequences_predict + batch[:, (input_n-1):input_n, dim_used])
              loss = mpjpe_error(sequences_predict, sequences_gt) / output_n

              if cnt % log_step == 0:
                print('[Epoch: %d, Iteration: %5d]  Validation loss: %.3f' %(epoch+1, cnt+1, loss.item()*output_n))
              running_loss += loss * batch_dim * output_n
          val_loss.append(running_loss.detach().cpu()/n)
          if running_loss/n < val_loss_best:
            val_loss_best = running_loss/n
          if (epoch+1) % 5 == 0:
            torch.save(model.state_dict(), './checkpoints/LSTM_final_checkpoint_' + str(epoch+1) + '.pt')
            train_losses.append(train_loss[-1])
            val_losses.append(val_loss[-1])

  if use_scheduler:
    scheduler.step()

  epochs=[5]#,10,15,20]

  # Create the plot
  plt.figure(figsize=(10, 6))
  plt.plot(epochs, train_losses, label='Train Loss', marker='o')
  plt.plot(epochs, val_losses, label='Validation Loss', marker='o')

  plt.xlabel('Epoch')
  plt.ylabel('Loss')
  plt.title('Training and Validation Loss Over Epochs')

  plt.legend()

  # Display the plot
  plt.grid(True)
  plt.show()
  return train_losses, val_losses

In [None]:
t_loss, v_loss = train_final(dataset, vald_dataset)

### Analysis and test

In [None]:
def test(ckpt_path=None):
    model.load_state_dict(torch.load(ckpt_path))
    print('model loaded')
    model.eval()
    accum_loss = 0
    n_batches = 0
    actions = define_actions(actions_to_consider_test)
    dim_used = np.array([ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 21, 22, 23, 24, 25,
                          26, 27, 28, 29, 30, 31, 32, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
                          46, 47, 51, 52, 53, 54, 55, 56, 57, 58, 59, 63, 64, 65, 66, 67, 68,
                          75, 76, 77, 78, 79, 80, 81, 82, 83, 87, 88, 89, 90, 91, 92 ])
    # joints at same loc
    joint_to_ignore = np.array([16, 20, 23, 24, 28, 31])

    index_to_ignore = np.concatenate((joint_to_ignore * 3, joint_to_ignore * 3 + 1, joint_to_ignore * 3 + 2))
    joint_equal = np.array([13, 19, 22, 13, 27, 30])
    index_to_equal = np.concatenate((joint_equal*3, joint_equal*3+1, joint_equal*3+2))
    totalll = 0
    counter = 0
    for action in actions:
      running_loss = 0
      n = 0
      dataset_test = datasets.Datasets(path, input_n, 25, skip_rate, split=2, actions=[action])
      #print('>>> test action for sequences: {:d}'.format(dataset_test.__len__()))

      test_loader = DataLoader(dataset_test, batch_size=batch_size_test, shuffle=False, num_workers=0, pin_memory=True)
      for cnt,batch in enumerate(test_loader):
        with torch.no_grad():

          batch=batch.to(device)
          batch_dim=batch.shape[0]
          n+=batch_dim

          all_joints_seq=batch.clone()[:, input_n:input_n+25,:]

          # GET SPEED REPRESENTATION OF THE FIRST FRAMES:
          # SET THE FIRST FRAME TO ZERO AND CALCULATE THE SPEED AS
          # Y_{N+1} = X_{N+1} - X_{N}
          sequences_train1=torch.cat((torch.zeros(*batch[:,:1,dim_used].size()).to(device),batch[:,1:input_n,dim_used]-batch[:,:(input_n-1),dim_used]),1)
          sequences_gt=batch[:,10:35,:]


          running_time = time.time()
          sequences_predict1 = model(sequences_train1)

          # COME BACK POSITIONAL REPRESENTATION OF THE FIRST PREDICTION:
          # SUM EACH FRAME TO THE NEXT
          # X_{N+1} = Y_{N+1} + Y_{N}
          # ADD THE LAST FRAME OF THE TRAIN ONES
          sequences_predict1[:,1:output_n,:]=sequences_predict1[:,1:output_n,:]+sequences_predict1[:,:(output_n-1),:]
          sequences_predict1=(sequences_predict1+batch[:,(input_n-1):input_n,dim_used])

          # GET SPEED REPRESENTATION OF THE PREDICTED STEPS:
          # SET THE FIRST FRAME TO ZERO AND CALCULATE THE SPEED AS
          # Y_{N+1} = X_{N+1} - X_{N}
          sequences_train2=torch.cat((torch.zeros(*batch[:,:1,dim_used].size()).to(device),
                                     sequences_predict1[:,6:15,:]-sequences_predict1[:,5:14,:]),1)

          sequences_predict2=model(sequences_train2)

          # COME BACK POSITIONAL REPRESENTATION OF THE SECOND PREDICTION:
          # SUM EACH FRAME TO THE NEXT
          # X_{N+1} = Y_{N+1} + Y_{N}
          # ADD THE LAST FRAME OF THE TRAIN ONES
          sequences_predict2[:,1:output_n,:]=sequences_predict2[:,1:output_n,:]+sequences_predict2[:,:(output_n-1),:]
          sequences_predict2=(sequences_predict2+sequences_predict1[:,(output_n-1):output_n,:])

          # STACK THE TWO PREDICTED SEQUENCES
          sequences_predict = torch.cat((sequences_predict1,sequences_predict2[:,:10,:]),1)

          totalll += time.time()-running_time
          counter += 1

          all_joints_seq[:,:,dim_used] = sequences_predict


          all_joints_seq[:,:,index_to_ignore] = all_joints_seq[:,:,index_to_equal]

          loss = mpjpe_error(all_joints_seq.view(-1,25,32,3),sequences_gt.view(-1,25,32,3))
          running_loss += loss*batch_dim
          accum_loss += loss*batch_dim

      #print('loss at test subject for action : '+str(action)+ ' is: '+ str(running_loss/n))
      print(str(action),': ', str(np.round((running_loss/n).item(),1)))
      n_batches+=n
    print('Average: ' + str(np.round((accum_loss/n_batches).item(),1)))
    print('Prediction time: ', totalll/counter)

In [None]:
path = './data/h3.6m/h3.6m/dataset'
skip_rate = 1
batch_size_test = 8
actions_to_consider_test = 'all'
ckpt_path = './checkpoints/LSTM_final_checkpoint_5.pt'

test(ckpt_path)

# Bayesian GRU version
- Encoder: Transformer + CNN
- Decoder: Bayesian GRU

In [None]:
!pip install blitz-bayesian-pytorch

In [8]:
from blitz.modules import BayesianLinear
from blitz.modules import BayesianGRU

## Definition

### Encoder

In [9]:
class Attention(nn.Module):
    # CLASSICAL ATTENTION MECHANISM
    # IT'S THE SAME OF THE TEORICAL PART

    def __init__(self, attn_dropout):
        super().__init__()
        self.dropout = nn.Dropout(attn_dropout)
    def forward(self, query, key, value, mask=None):
        attn = torch.matmul(query, key.transpose(-2, -1))
        d_k = query.size(-1)
        attn = attn / (d_k ** 0.5)
        if mask is not None:
            attn = attn.masked_fill(mask == 0, -1e9)
        attn = self.dropout(F.softmax(attn,-1))
        output = torch.matmul(attn, value)
        return output, attn

In [10]:
class MultiHeadAttention(nn.Module):
    # ALSO THE MULTIHEAD ATTENTION MECHANISM IS THE SAME OF TEORICAL PART

    def __init__(self, num_heads, d_model, dropout):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0
        #  We assume d_v always equals d_k
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        self.query_ff = nn.Linear(d_model, d_model)
        self.key_ff = nn.Linear(d_model, d_model)
        self.value_ff = nn.Linear(d_model, d_model)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
        self.attention = Attention(attn_dropout=dropout)
    def forward(self, query, key, value, mask=None, return_attention=False):
        if mask is not None:
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)
        query = self.query_ff(query).view(nbatches, -1, self.num_heads, self.d_k).transpose(1, 2)
        key = self.key_ff(key).view(nbatches, -1, self.num_heads, self.d_k).transpose(1, 2)
        value = self.value_ff(value).view(nbatches, -1, self.num_heads, self.d_k).transpose(1, 2)
        x, self.attn = self.attention(query, key, value, mask)
        x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.num_heads * self.d_k)
        if return_attention:
            return self.attn_ff(x), self.attn
        return x

In [11]:
class EncoderBlock(nn.Module):
    # THE ENCODER BLOCK IS A TIPICAL TRANSFORMER ENCODER,
    # BUT WE HAVE ADDED A CONVOLUTIONAL LAYER AT THE END:
    # self.conv IS USED TO SHRINK THE INFORMATION ALONG THE ENCODER BLOCKS
    # THIS IS A METHOD TO DISTILL INFORMATION SHOULD BE PASSED TO THE DECODER
    # AND MOREOVER TO SAVE A LOT OF TIME

    def __init__(self,num_heads,d_model,time_in,time_out,num_joints,dropout):
        super().__init__()

        self.num_joints = num_joints
        self.d_model = d_model

        self.self_attn = MultiHeadAttention(num_heads, d_model, dropout)

        # LAYERNORM LAYERS AND DROPOUT
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

        # MLP
        self.lin_net = nn.Sequential(
            nn.Linear(d_model,2*d_model),
            nn.ReLU(),
            nn.Linear(2*d_model,d_model),
            nn.ReLU()
        )

        # A CNN SHRINKS THE FRAMES FROM time_it TO time_out
        # AND THE d_model IS HALVED AFTER EACH BLOCK
        self.conv = nn.Conv1d(time_in,time_out,3,padding=1)

    def forward(self, xs, mask=None):
        x = xs
        att = self.self_attn(x, x, x, mask)
        x = self.relu(x + att)
        x = self.norm1(x)

        lin_output = self.lin_net(x)
        x = self.relu(x + self.dropout(lin_output))

        # WE ADD A RESIDUAL CONNECTION ALSO HERE
        # IT REDUCES A BIT THE VANISH GRADIENT
        x = self.norm2(x + xs)

        # SHRINK
        x = self.conv(x)

        return x

### Decoder

In [12]:
class Decoder(nn.Module):
    # THE DECODER BLOCK IS A SIMPLE GRU,
    # WHICH TAKES THE DISTILLED INFORMATION FROM THE
    # ENCODER AS HIDDEN STATE AND AUTOREGRESSIVLY
    # FORSEES THE NEXT FRAMES
    def __init__(self, hidden_dim, num_layers=1):
        super(Decoder, self).__init__()
        # GRU
        self.gru = BayesianGRU(66, hidden_dim)
        # LINEAR self.out TO RETURN THE ORIGINAL DIMENSIONS
        self.out = nn.Linear(hidden_dim, 66)

    def forward(self, hidden, num_steps):
        # CELL STATE IS INITIALIZED TO ZEROS
        cell = nn.Parameter(torch.zeros(*hidden.shape)).to(device)
        batch_size = hidden.size(1)
        # THE <START> IS INITIALIZED TO ONES
        input = torch.ones((batch_size,66), dtype=torch.float).unsqueeze(0).to(device)
        outputs = torch.zeros((num_steps,batch_size,66), dtype=torch.float).to(device)

        for t in range(num_steps):
            # FORCAST
            decoder_output, _ = self.gru(input)
            # SET TO THE ORIGINAL DIMENSIONS
            decoder_output = self.out(decoder_output[-1])
            outputs[t] = decoder_output
            # STACK TO THE INPUT
            input = torch.cat((input,decoder_output.unsqueeze(0)),0)
        return outputs

### Model

In [13]:
def conv_init(conv):
    nn.init.kaiming_normal_(conv.weight, nonlinearity='relu')

In [14]:
def fc_init(fc):
    nn.init.kaiming_normal_(fc.weight, nonlinearity='relu')

In [15]:
def bn_init(bn, scale):
    nn.init.constant_(bn.weight, scale)
    nn.init.constant_(bn.bias, 0)

In [16]:
class Model(nn.Module):
    # THIS IS THE FINAL MODEL IN WHICH WE MERGED THE ENCODER AND DECODER PARTS
    def __init__(self, num_channels, num_frames_out,
                 old_frames, num_joints, num_heads, drop,
                 d_model = 512, config=None):
        super().__init__()

        # CONFIGURATION FOR THE ENCODER BLOCKS
        if config==None:
            self.config = [[d_model,10,8],[d_model,8,6],[d_model,6,4],[d_model,4,2],[d_model,2,1]]

        self.num_channels = num_channels
        self.num_frames_out = num_frames_out
        self.num_heads = num_heads
        self.num_joints = num_joints
        self.old_frames = old_frames
        self.d_model = d_model

        # LINEAR BLOCK TO PASS FROM INITIAL DIMENSION 66 TO d_model
        self.lin = nn.Sequential(nn.Linear(self.num_channels*self.num_joints,d_model),nn.BatchNorm1d(self.old_frames))
        self.norm = nn.BatchNorm2d(self.num_channels)

        # ENCODER
        self.blocks = nn.ModuleList()
        # d_ : d_model of the block;
        # in_ : number of frames of the sequences at the begin;
        # out_ : number of frames of the sequences at the end
        for index, (d_,in_,out_) in enumerate(self.config):
            self.blocks.append(EncoderBlock(num_heads=self.num_heads,
                                            d_model=d_, time_in=in_, time_out=out_,
                                            num_joints=self.num_joints,dropout=drop))

        # SINUSOIDAL POSITIONAL EMBEDDINGS
        self.pos = Pos_Embed(self.num_channels,self.old_frames,self.num_joints)
        # DECODER: HIDDEN STATE DIMENSION IS THE DIMENSION OF THE LAST ENCODER
        self.dec = Decoder(self.d_model)

        # WEIGHTS INITIALIZATION
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                conv_init(m)
            if isinstance(m, nn.Conv1d):
                conv_init(m)
            elif isinstance(m, nn.BatchNorm2d):
                bn_init(m,1)
            elif isinstance(m, nn.BatchNorm1d):
                bn_init(m,1)
            elif isinstance(m, nn.Linear):
                fc_init(m)

    def forward(self,x):
        x = x.view(-1,self.old_frames,self.num_joints,self.num_channels).permute(0,3,1,2)
        # SUM POSITIONAL EMBEDDING
        x = (x + self.pos(x)).permute(0,2,3,1).view(-1,self.old_frames,self.num_joints*self.num_channels)
        # LINEAR LAYER
        x = self.lin(x)
        # ENCODER BLOCKS
        for i, block in enumerate(self.blocks):
            x = block(x)
        # RETRIEVE CONTEXT FROM THE ENCODER
        context = x.view(-1,self.d_model).unsqueeze(0)
        # PASS TO THE DECODER AND RETURN THE RESULTS
        results = self.dec(hidden = context,num_steps = self.num_frames_out)
        results = results.permute(1,0,2)
        return results

## Training

In [17]:
# Argument for training
n_epochs = 5
log_step = 50
input_n = 10
output_n = 15

# The model name to save/load
datas = 'h36m'
model_path = datas + '_3d_' + str(output_n) + 'frames_ckpt'

model = Model(num_channels=3,
              num_frames_out=output_n,
              old_frames=input_n,
              num_joints=22,
              num_heads=8,
              drop=0.3).to(device)

# Arguments to setup the optimizer
lr = 5e-04 # learning rate
use_scheduler = True # use MultiStepLR scheduler
milestones = [2, 2, 2, 5, 5]   # the epochs after which the learning rate is adjusted by gamma
gamma = 0.5 #gamma correction to the learning rate, after reaching the milestone epochs
weight_decay = 0.00003 # weight decay (L2 penalty)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

if use_scheduler:
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=gamma)

In [18]:
# WE HAVE REWRITE THE TRAIN FUNCTION
# TO IMPLEMENT THE SPEED REPRESENTATION

save_and_plot = False
def train_final(data_loader, vald_loader, clip_grad=None):
  train_loss = []
  val_loss = []
  val_loss_best = 1000

  # Initialize lists to store data from each checkpoint
  train_losses = []
  val_losses = []

  dim_used = np.array([6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 21, 22, 23, 24, 25,
                      26, 27, 28, 29, 30, 31, 32, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
                      46, 47, 51, 52, 53, 54, 55, 56, 57, 58, 59, 63, 64, 65, 66, 67, 68,
                      75, 76, 77, 78, 79, 80, 81, 82, 83, 87, 88, 89, 90, 91, 92])

  for epoch in range(n_epochs):
      running_loss = 0
      n = 0
      model.train()
      for cnt, batch in enumerate(data_loader):
          batch = batch.float().to(device)
          batch_dim = batch.shape[0]
          n += batch_dim

          # GET SPEED REPRESENTATION:
          # SET THE FIRST FRAME TO ZERO AND CALCULATE THE SPEED AS
          # Y_{N+1} = X_{N+1} - X_{N}
          sequences_train = torch.cat((torch.zeros(*batch[:, :1, dim_used].size()).to(device),batch[:, 1:10, dim_used]-batch[:, :9, dim_used]), 1)
          sequences_gt = batch[:, input_n:input_n + output_n, dim_used]

          optimizer.zero_grad()
          sequences_predict=model(sequences_train)
          # COME BACK POSITIONAL REPRESENTATION:
          # SUM EACH FRAME TO THE NEXT
          # X_{N+1} = Y_{N+1} + Y_{N}
          # ADD THE LAST FRAME OF THE TRAIN ONES
          sequences_predict[:, 1:output_n, :] = sequences_predict[:, 1:output_n, :] + sequences_predict[:, :output_n-1, :]
          sequences_predict = (sequences_predict + batch[:, (input_n-1):input_n, dim_used])


          loss = mpjpe_error(sequences_predict, sequences_gt) / output_n


          if cnt % log_step == 0:
            print('[Epoch: %d, Iteration: %5d]  Training loss: %.3f' %(epoch+1, cnt+1, loss.item()*output_n))

          loss.backward()
          if clip_grad is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

          optimizer.step()
          running_loss += loss*batch_dim*output_n

      train_loss.append(running_loss.detach().cpu()/n)
      model.eval()
      with torch.no_grad():
          running_loss = 0
          n = 0
          for cnt, batch in enumerate(vald_loader):
              batch = batch.float().to(device)
              batch_dim = batch.shape[0]
              n += batch_dim

              # GET SPEED REPRESENTATION:
              # SET THE FIRST FRAME TO ZERO AND CALCULATE THE SPEED AS
              # Y_{N+1} = X_{N+1} - X_{N}
              sequences_train = torch.cat((torch.zeros(*batch[:, :1, dim_used].size()).to(device), batch[:, 1:input_n, dim_used] - batch[:, :input_n-1, dim_used]), 1)
              sequences_gt = batch[:, input_n:input_n + output_n, dim_used]

              sequences_predict = model(sequences_train)

              # COME BACK POSITIONAL REPRESENTATION:
              # SUM EACH FRAME TO THE NEXT
              # X_{N+1} = Y_{N+1} + Y_{N}
              # ADD THE LAST FRAME OF THE TRAIN ONES
              sequences_predict[:, 1:output_n, :] = sequences_predict[:, 1:output_n, :] + sequences_predict[:, :(output_n-1), :]
              sequences_predict = (sequences_predict + batch[:, (input_n-1):input_n, dim_used])
              loss = mpjpe_error(sequences_predict, sequences_gt) / output_n

              if cnt % log_step == 0:
                print('[Epoch: %d, Iteration: %5d]  Validation loss: %.3f' %(epoch+1, cnt+1, loss.item()*output_n))
              running_loss += loss * batch_dim * output_n
          val_loss.append(running_loss.detach().cpu()/n)
          if running_loss/n < val_loss_best:
            val_loss_best = running_loss/n
          if (epoch+1) % 5 == 0:
            torch.save(model.state_dict(), './checkpoints/LSTM_final_checkpoint_' + str(epoch+1) + '.pt')
            train_losses.append(train_loss[-1])
            val_losses.append(val_loss[-1])

  if use_scheduler:
    scheduler.step()

  epochs=[5]#,10,15,20]

  # Create the plot
  plt.figure(figsize=(10, 6))
  plt.plot(epochs, train_losses, label='Train Loss', marker='o')
  plt.plot(epochs, val_losses, label='Validation Loss', marker='o')

  plt.xlabel('Epoch')
  plt.ylabel('Loss')
  plt.title('Training and Validation Loss Over Epochs')

  plt.legend()

  # Display the plot
  plt.grid(True)
  plt.show()
  return train_losses,val_losses

In [19]:
t_loss, v_loss = train_final(dataset, vald_dataset)

[Epoch: 1, Iteration:     1]  Training loss: 77.170
[Epoch: 1, Iteration:    51]  Training loss: 84.953
[Epoch: 1, Iteration:   101]  Training loss: 80.441
[Epoch: 1, Iteration:   151]  Training loss: 63.808
[Epoch: 1, Iteration:   201]  Training loss: 76.087
[Epoch: 1, Iteration:   251]  Training loss: 75.209
[Epoch: 1, Iteration:   301]  Training loss: 82.337
[Epoch: 1, Iteration:   351]  Training loss: 72.179
[Epoch: 1, Iteration:   401]  Training loss: 72.022


KeyboardInterrupt: ignored

### Analysis and test

In [None]:
def test(ckpt_path=None):
    model.load_state_dict(torch.load(ckpt_path))
    print('model loaded')
    model.eval()
    accum_loss = 0
    n_batches = 0
    actions = define_actions(actions_to_consider_test)
    dim_used = np.array([ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 21, 22, 23, 24, 25,
                          26, 27, 28, 29, 30, 31, 32, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
                          46, 47, 51, 52, 53, 54, 55, 56, 57, 58, 59, 63, 64, 65, 66, 67, 68,
                          75, 76, 77, 78, 79, 80, 81, 82, 83, 87, 88, 89, 90, 91, 92 ])
    # joints at same loc
    joint_to_ignore = np.array([16, 20, 23, 24, 28, 31])

    index_to_ignore = np.concatenate((joint_to_ignore * 3, joint_to_ignore * 3 + 1, joint_to_ignore * 3 + 2))
    joint_equal = np.array([13, 19, 22, 13, 27, 30])
    index_to_equal = np.concatenate((joint_equal*3, joint_equal*3+1, joint_equal*3+2))
    totalll = 0
    counter = 0
    for action in actions:
      running_loss = 0
      n = 0
      dataset_test = datasets.Datasets(path, input_n, 25, skip_rate, split=2, actions=[action])
      #print('>>> test action for sequences: {:d}'.format(dataset_test.__len__()))

      test_loader = DataLoader(dataset_test, batch_size=batch_size_test, shuffle=False, num_workers=0, pin_memory=True)
      for cnt,batch in enumerate(test_loader):
        with torch.no_grad():

          batch=batch.to(device)
          batch_dim=batch.shape[0]
          n+=batch_dim

          all_joints_seq=batch.clone()[:, input_n:input_n+25,:]

          # GET SPEED REPRESENTATION OF THE FIRST FRAMES:
          # SET THE FIRST FRAME TO ZERO AND CALCULATE THE SPEED AS
          # Y_{N+1} = X_{N+1} - X_{N}
          sequences_train1=torch.cat((torch.zeros(*batch[:,:1,dim_used].size()).to(device),batch[:,1:input_n,dim_used]-batch[:,:(input_n-1),dim_used]),1)
          sequences_gt=batch[:,10:35,:]


          running_time = time.time()
          sequences_predict1 = model(sequences_train1)

          # COME BACK POSITIONAL REPRESENTATION OF THE FIRST PREDICTION:
          # SUM EACH FRAME TO THE NEXT
          # X_{N+1} = Y_{N+1} + Y_{N}
          # ADD THE LAST FRAME OF THE TRAIN ONES
          sequences_predict1[:,1:output_n,:]=sequences_predict1[:,1:output_n,:]+sequences_predict1[:,:(output_n-1),:]
          sequences_predict1=(sequences_predict1+batch[:,(input_n-1):input_n,dim_used])

          # GET SPEED REPRESENTATION OF THE PREDICTED STEPS:
          # SET THE FIRST FRAME TO ZERO AND CALCULATE THE SPEED AS
          # Y_{N+1} = X_{N+1} - X_{N}
          sequences_train2=torch.cat((torch.zeros(*batch[:,:1,dim_used].size()).to(device),
                                     sequences_predict1[:,6:15,:]-sequences_predict1[:,5:14,:]),1)

          sequences_predict2=model(sequences_train2)

          # COME BACK POSITIONAL REPRESENTATION OF THE SECOND PREDICTION:
          # SUM EACH FRAME TO THE NEXT
          # X_{N+1} = Y_{N+1} + Y_{N}
          # ADD THE LAST FRAME OF THE TRAIN ONES
          sequences_predict2[:,1:output_n,:]=sequences_predict2[:,1:output_n,:]+sequences_predict2[:,:(output_n-1),:]
          sequences_predict2=(sequences_predict2+sequences_predict1[:,(output_n-1):output_n,:])

          # STACK THE TWO PREDICTED SEQUENCES
          sequences_predict = torch.cat((sequences_predict1,sequences_predict2[:,:10,:]),1)

          totalll += time.time()-running_time
          counter += 1

          all_joints_seq[:,:,dim_used] = sequences_predict


          all_joints_seq[:,:,index_to_ignore] = all_joints_seq[:,:,index_to_equal]

          loss = mpjpe_error(all_joints_seq.view(-1,25,32,3),sequences_gt.view(-1,25,32,3))
          running_loss += loss*batch_dim
          accum_loss += loss*batch_dim

      #print('loss at test subject for action : '+str(action)+ ' is: '+ str(running_loss/n))
      print(str(action),': ', str(np.round((running_loss/n).item(),1)))
      n_batches+=n
    print('Average: ' + str(np.round((accum_loss/n_batches).item(),1)))
    print('Prediction time: ', totalll/counter)

In [None]:
path = './data/h3.6m/h3.6m/dataset'
skip_rate = 1
batch_size_test = 8
actions_to_consider_test = 'all'
ckpt_path = './checkpoints/LSTM_final_checkpoint_5.pt'

test(ckpt_path)