<a href="https://colab.research.google.com/github/LuisFerRosas/ia3/blob/nuevo/nuevo__modelo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [73]:
import torch
import torch.nn as nn
import numpy as np
from torch import Tensor 
import math
import torch.nn.functional as F  

In [21]:
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)


class PositionalEncoding(nn.Module):
  def __init__(self,
                emb_size: int,
                dropout: float,
                maxlen: int = 6000):
      super(PositionalEncoding, self).__init__()
      den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
      pos = torch.arange(0, maxlen).reshape(maxlen, 1)
      pos_embedding = torch.zeros((maxlen, emb_size))
      pos_embedding[:, 0::2] = torch.sin(pos * den)
      pos_embedding[:, 1::2] = torch.cos(pos * den)
      pos_embedding = pos_embedding.unsqueeze(-2)

      self.dropout = nn.Dropout(dropout)
      self.register_buffer('pos_embedding', pos_embedding)

  def forward(self, token_embedding: Tensor):
      return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

In [55]:
class TmusicTrasforms(nn.Module):
  def __init__(self,tgt_vocabulario):
    super().__init__() 
    
    ################ TRANSFORMER BLOCK #############################
    # maxpool the input feature map/tensor to the transformer 
    # a rectangular kernel worked better here for the rectangular input spectrogram feature map/tensor
    self.transformer_maxpool = nn.MaxPool2d(kernel_size=[1,4], stride=[1,4])
    
    
    self.tgt_tok_emb = TokenEmbedding(tgt_vocabulario, 512)
    self.positional_encoding = PositionalEncoding(512, dropout=0.3)
    
    transformer_layer = nn.TransformerEncoderLayer(
        d_model=40, # input feature (frequency) dim after maxpooling 40*282 -> 40*70 (MFC*time)
        nhead=4, # 4 self-attention layers in each multi-head self-attention layer in each encoder block
        dim_feedforward=512, # 2 linear layers in each encoder block's feedforward network: dim 40-->512--->40
        dropout=0.4, 
        activation='relu' # ReLU: avoid saturation/tame gradient/reduce compute time
    )
    
    self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers=4)
    
    transformer_decoder_layer=nn.TransformerDecoderLayer(
        d_model=512,
        nhead=4,
        dim_feedforward=512,
        dropout=0.4,
        activation='relu'
    )
    self.transformer_decoder=nn.TransformerDecoder(transformer_decoder_layer,num_layers=6)

    ############### 1ST PARALLEL 2D CONVOLUTION BLOCK ############
    # 3 sequential conv2D layers: (1,40,282) --> (16, 20, 141) -> (32, 5, 35) -> (64, 1, 8)
    self.conv2Dblock1 = nn.Sequential(
        
        # 1st 2D convolution layer
        nn.Conv2d(
            in_channels=1, # input volume depth == input channel dim == 1
            out_channels=8, # expand output feature map volume's depth to 16
            kernel_size=3, # typical 3*3 stride 1 kernel
            stride=1,
            padding=1
                  ),
        nn.BatchNorm2d(8), # batch normalize the output feature map before activation
        nn.ReLU(), # feature map --> activation map
        nn.MaxPool2d(kernel_size=2, stride=2), #typical maxpool kernel size
        nn.Dropout(p=0.3), #randomly zero 30% of 1st layer's output feature map in training
        
        # 2nd 2D convolution layer identical to last except output dim, maxpool kernel
        nn.Conv2d(
            in_channels=8, 
            out_channels=16, # expand output feature map volume's depth to 32
            kernel_size=3,
            stride=1,
            padding=1
                  ),
        nn.BatchNorm2d(16),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=5, stride=5), # increase maxpool kernel for subsequent filters
        nn.Dropout(p=0.3), 

        # 2nd 2D convolution layer identical to last except output dim, maxpool kernel
        nn.Conv2d(
            in_channels=16, 
            out_channels=32, # expand output feature map volume's depth to 32
            kernel_size=3,
            stride=1,
            padding=1
                  ),
        nn.BatchNorm2d(32),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2), # increase maxpool kernel for subsequent filters
        nn.Dropout(p=0.3), 
        
        # 3rd 2D convolution layer identical to last except output dim
        nn.Conv2d(
            in_channels=32,
            out_channels=64, # expand output feature map volume's depth to 64
            kernel_size=3,
            stride=1,
            padding=1
                  ),
        nn.BatchNorm2d(64),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),
        nn.Dropout(p=0.3),
    )
    ############### 2ND PARALLEL 2D CONVOLUTION BLOCK ############
    # 3 sequential conv2D layers: (1,40,282) --> (16, 20, 141) -> (32, 5, 35) -> (64, 1, 8)
    self.conv2Dblock2 = nn.Sequential(
      # 1st 2D convolution layer
        nn.Conv2d(
            in_channels=1, # input volume depth == input channel dim == 1
            out_channels=8, # expand output feature map volume's depth to 16
            kernel_size=3, # typical 3*3 stride 1 kernel
            stride=1,
            padding=1
                  ),
        nn.BatchNorm2d(8), # batch normalize the output feature map before activation
        nn.ReLU(), # feature map --> activation map
        nn.MaxPool2d(kernel_size=2, stride=2), #typical maxpool kernel size
        nn.Dropout(p=0.3), #randomly zero 30% of 1st layer's output feature map in training
        
        # 2nd 2D convolution layer identical to last except output dim, maxpool kernel
        nn.Conv2d(
            in_channels=8, 
            out_channels=16, # expand output feature map volume's depth to 32
            kernel_size=3,
            stride=1,
            padding=1
                  ),
        nn.BatchNorm2d(16),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=5, stride=5), # increase maxpool kernel for subsequent filters
        nn.Dropout(p=0.3), 

        # 2nd 2D convolution layer identical to last except output dim, maxpool kernel
        nn.Conv2d(
            in_channels=16, 
            out_channels=32, # expand output feature map volume's depth to 32
            kernel_size=3,
            stride=1,
            padding=1
                  ),
        nn.BatchNorm2d(32),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2), # increase maxpool kernel for subsequent filters
        nn.Dropout(p=0.3), 
        
        # 3rd 2D convolution layer identical to last except output dim
        nn.Conv2d(
            in_channels=32,
            out_channels=64, # expand output feature map volume's depth to 64
            kernel_size=3,
            stride=1,
            padding=1
                  ),
        nn.BatchNorm2d(64),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),
        nn.Dropout(p=0.3),
    
    )

    ################# FINAL LINEAR BLOCK ####################
    # Linear softmax layer to take final concatenated embedding tensor 
    #    from parallel 2D convolutional and transformer blocks, output 8 logits 
    # Each full convolution block outputs (64*1*8) embedding flattened to dim 512 1D array 
    # Full transformer block outputs 40*70 feature map, which we time-avg to dim 40 1D array
    # 512*2+40 == 1064 input features --> 8 output emotions 
    
    self.fc2_linear = nn.Linear(512,tgt_vocabulario)
    
    ### Softmax layer for the 8 output logits from final FC linear layer 
    self.softmax_out = nn.Softmax(dim=1) # dim==1 is the freq embedding
    
    # define one complete parallel fwd pass of input feature tensor thru 2*conv+1*transformer blocks
  def forward(self,x,partitura_tok):
    
    ############ 1st parallel Conv2D block: 4 Convolutional layers ############################
    # create final feature embedding from 1st convolutional layer 
    # input features pased through 4 sequential 2D convolutional layers
    conv2d_embedding1 = self.conv2Dblock1(x) # x == N/batch * channel * freq * time
    # print("conv2d_embedding1 : "+str(conv2d_embedding1.shape))
    # flatten final 64*1*8 feature map from convolutional layers to length 512 1D array 
    # skip the 1st (N/batch) dimension when flattening
    conv2d_embedding1 = torch.flatten(conv2d_embedding1, start_dim=1) 
    # print("conv2d_embedding1 flatten : "+str(conv2d_embedding1.shape))
    ############ 2nd parallel Conv2D block: 4 Convolutional layers #############################
    # create final feature embedding from 2nd convolutional layer 
    # input features pased through 4 sequential 2D convolutional layers
    conv2d_embedding2 = self.conv2Dblock2(x) # x == N/batch * channel * freq * time
    # print("conv2d_embedding2 : "+str(conv2d_embedding2.shape))
    # flatten final 64*1*8 feature map from convolutional layers to length 512 1D array 
    # skip the 1st (N/batch) dimension when flattening
    conv2d_embedding2 = torch.flatten(conv2d_embedding2, start_dim=1) 
    # print("conv2d_embedding2 flatten : "+str(conv2d_embedding2.shape))
      
    ########## 4-encoder-layer Transformer block w/ 40-->512-->40 feedfwd network ##############
    # maxpool input feature map: 1*40*282 w/ 1*4 kernel --> 1*40*70 
    x_maxpool = self.transformer_maxpool(x)
    # print("x_maxpool : "+str(x_maxpool.shape))
    # remove channel dim: 1*40*70 --> 40*70
    x_maxpool_reduced = torch.squeeze(x_maxpool,1)
    # print("x_maxpool_reduced : "+str(x_maxpool_reduced.shape))
    # convert maxpooled feature map format: batch * freq * time ---> time * batch * freq format
    # because transformer encoder layer requires tensor in format: time * batch * embedding (freq)
    x = x_maxpool_reduced.permute(2,0,1) 
    # print("x------>entrada transformer : "+str(x.shape))
    # finally, pass reduced input feature map x into transformer encoder layers
    transformer_output = self.transformer_encoder(x)
    # print("salida transformer : "+str(transformer_output.shape))
    
    # create final feature emedding from transformer layer by taking mean in the time dimension (now the 0th dim)
    # transformer outputs 2x40 (MFCC embedding*time) feature map, take mean of columns i.e. take time average
    transformer_embedding = torch.mean(transformer_output, dim=0) # dim 40x70 --> 40
    # print("transformer_embedding media : "+str(transformer_embedding.shape))
  
    complete_embedding = torch.cat([conv2d_embedding1, conv2d_embedding2,transformer_embedding], dim=1)  
    # print("complete_embedding  : "+str(complete_embedding.shape))
    cambDim = complete_embedding.unsqueeze(2)
    # print("cambDim  : "+str(cambDim.shape))

    expandido=cambDim.expand([2,5544,512])
    # print("expandido  : "+str(expandido.shape))

    memory=expandido.transpose(1,0)
    # print("memory  : "+str(memory.shape))
      
    partitura_tok=partitura_tok.transpose(1,0)
    # print("partitura_tok  : "+str(partitura_tok.shape))
    partitura_tok= self.tgt_tok_emb(partitura_tok)
    # print("partitura_tok2  : "+str(partitura_tok.shape))
    tgt_emb = self.positional_encoding(partitura_tok)
    # print("tgt_emb_positional  : "+str(tgt_emb.shape))

    mask_tgt=generate_square_subsequent_mask(tgt_emb.shape[0])
    ouput_decoder=self.transformer_decoder(tgt_emb,memory,mask_tgt)
    # print("ouput_decoder  : "+str(ouput_decoder.shape))
    linear_decoder=self.fc2_linear(ouput_decoder)
    # print("linear_decoder  : "+str(linear_decoder.shape))
   
   
    ######### Final Softmax layer: use logits from FC linear, get softmax for prediction ######
    output_softmax = self.softmax_out(linear_decoder)
    # print("output_softmax  : "+str(output_softmax.shape))
    # need output logits to compute cross entropy loss, need softmax probabilities to predict class
    return  output_softmax 
  
  

def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


In [151]:
from sklearn.metrics import precision_score
def make_train_step(model, criterion, optimizer):
    
  def train_step(espectro,partituraT,lenPartituras):
          
    output_softmax = model(espectro,partituraT)
    salida =output_softmax.transpose(0,1)
   
   
    sumAcuraccy=0
    for i in range(salida.shape[0]):
      predictions=torch.argmax(salida[i],dim=1)
      acuraccy=precision_score(predictions,partituraT[i],average='micro')
      
      # print("acuraccy :: "+str(acuraccy))
      
      sumAcuraccy+=acuraccy

    sumAcuraccy=sumAcuraccy/salida.shape[0]
    
    output_lengths = torch.full((output_softmax.shape[1],), output_softmax.shape[0], dtype=torch.long)
    
    loss = criterion(output_softmax,partituraT,output_lengths,lenPartituras) 
    
    # compute gradients for the optimizer to use 
    # loss.backward()
    
    # update network parameters based on gradient stored (by calling loss.backward())
    # optimizer.step()
    
    # zero out gradients for next pass
    # pytorch accumulates gradients from backwards passes (convenient for RNNs)
    # optimizer.zero_grad() 
    
    return loss.item(),sumAcuraccy
  return train_step

In [None]:
def make_validate_fnc(model,criterion):
  def validate(espectro,partituraT,lenPartituras):
      
    # don't want to update any network parameters on validation passes: don't need gradient
    # wrap in torch.no_grad to save memory and compute in validation phase: 
    with torch.no_grad(): 
        
      # set model to validation phase i.e. turn off dropout and batchnorm layers 
      model.eval()

      # get the model's predictions on the validation set
      output_softmax = model(espectro,partituraT)
      salida =output_softmax.transpose(0,1)
      sumAcuraccy=0
      for i in range(salida.shape[0]):
        predictions=torch.argmax(salida[i],dim=1)
        acuraccy=precision_score(predictions,partituraT[i],average='micro')
        # print("acuraccy :: "+str(acuraccy))      
        sumAcuraccy+=acuraccy

    sumAcuraccy=sumAcuraccy/salida.shape[0]
    
    output_lengths = torch.full((output_softmax.shape[1],), output_softmax.shape[0], dtype=torch.long)
    
    loss = criterion(output_softmax,partituraT,output_lengths,lenPartituras) 
        
    return loss.item(),sumAcuraccy
  return validate

In [None]:
def make_save_checkpoint(): 
    def save_checkpoint(optimizer, model, epoch, filename):
        checkpoint_dict = {
            'optimizer': optimizer.state_dict(),
            'model': model.state_dict(),
            'epoch': epoch
        }
        torch.save(checkpoint_dict, filename)
    return save_checkpoint

def load_checkpoint(optimizer, model, filename):
    checkpoint_dict = torch.load(filename)
    epoch = checkpoint_dict['epoch']
    model.load_state_dict(checkpoint_dict['model'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint_dict['optimizer'])
    return epoch

In [81]:
import torch
inputt = torch.randn(2,1, 40,1723, requires_grad=True)
partitura = torch.randint( 0,132, (2,100))
pato = torch.randn(2,5544, requires_grad=True)
lenParitura = torch.randint( 30,50, (2,))
prueba = torch.randn(100,2,512, requires_grad=True)
print(inputt.shape)
print(partitura.shape)
print(lenParitura)
print(prueba.shape[0])

torch.Size([2, 1, 40, 1723])
torch.Size([2, 100])
tensor([43, 35])
100


In [56]:
model=TmusicTrasforms(tgt_vocabulario=233)
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

model=model.to(DEVICE)
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

In [96]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
criterion = nn.CTCLoss()

In [152]:
train_step=make_train_step(model,criterion,optimizer)
loss=train_step(inputt,partitura,lenParitura)
loss

conv2d_embedding1 : torch.Size([2, 64, 1, 43])
conv2d_embedding1 flatten : torch.Size([2, 2752])
conv2d_embedding2 : torch.Size([2, 64, 1, 43])
conv2d_embedding2 flatten : torch.Size([2, 2752])
x_maxpool : torch.Size([2, 1, 40, 430])
x_maxpool_reduced : torch.Size([2, 40, 430])
x------>entrada transformer : torch.Size([430, 2, 40])
salida transformer : torch.Size([430, 2, 40])
transformer_embedding media : torch.Size([2, 40])
complete_embedding  : torch.Size([2, 5544])
cambDim  : torch.Size([2, 5544, 1])
expandido  : torch.Size([2, 5544, 512])
memory  : torch.Size([5544, 2, 512])
partitura_tok  : torch.Size([100, 2])
partitura_tok2  : torch.Size([100, 2, 512])
tgt_emb_positional  : torch.Size([100, 2, 512])
ouput_decoder  : torch.Size([100, 2, 512])
linear_decoder  : torch.Size([100, 2, 233])
output_softmax  : torch.Size([100, 2, 233])


(-3.7511916160583496, 0.005)

In [57]:
model.train()
ouput=model(inputt,partitura)
ouput

conv2d_embedding1 : torch.Size([2, 64, 1, 43])
conv2d_embedding1 flatten : torch.Size([2, 2752])
conv2d_embedding2 : torch.Size([2, 64, 1, 43])
conv2d_embedding2 flatten : torch.Size([2, 2752])
x_maxpool : torch.Size([2, 1, 40, 430])
x_maxpool_reduced : torch.Size([2, 40, 430])
x------>entrada transformer : torch.Size([430, 2, 40])
salida transformer : torch.Size([430, 2, 40])
transformer_embedding media : torch.Size([2, 40])
complete_embedding  : torch.Size([2, 5544])
cambDim  : torch.Size([2, 5544, 1])
expandido  : torch.Size([2, 5544, 512])
memory  : torch.Size([5544, 2, 512])
partitura_tok  : torch.Size([100, 2])
partitura_tok2  : torch.Size([100, 2, 512])
tgt_emb_positional  : torch.Size([100, 2, 512])
ouput_decoder  : torch.Size([100, 2, 512])
linear_decoder  : torch.Size([100, 2, 233])
output_softmax  : torch.Size([100, 2, 233])


tensor([[[0.9762, 0.8009, 0.5210,  ..., 0.3210, 0.2620, 0.7164],
         [0.0238, 0.1991, 0.4790,  ..., 0.6790, 0.7380, 0.2836]],

        [[0.4998, 0.6676, 0.0292,  ..., 0.8694, 0.4351, 0.4776],
         [0.5002, 0.3324, 0.9708,  ..., 0.1306, 0.5649, 0.5224]],

        [[0.0164, 0.4358, 0.6146,  ..., 0.8432, 0.2852, 0.3257],
         [0.9836, 0.5642, 0.3854,  ..., 0.1568, 0.7148, 0.6743]],

        ...,

        [[0.6765, 0.2190, 0.9363,  ..., 0.9026, 0.3726, 0.5867],
         [0.3235, 0.7810, 0.0637,  ..., 0.0974, 0.6274, 0.4133]],

        [[0.5789, 0.6902, 0.6287,  ..., 0.5929, 0.6572, 0.3143],
         [0.4211, 0.3098, 0.3713,  ..., 0.4071, 0.3428, 0.6857]],

        [[0.4938, 0.7316, 0.7019,  ..., 0.8561, 0.6548, 0.8672],
         [0.5062, 0.2684, 0.2981,  ..., 0.1439, 0.3452, 0.1328]]],
       grad_fn=<SoftmaxBackward>)

In [None]:
from torchsummary import summary

# need device to instantiate model
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# instantiate model for 8 emotions and move to GPU 
model = TmusicTrasforms(8).to(device)

# include input feature map dims in call to summary()
summary(model, input_size=[(1,40,1723),(1,100)])


In [None]:
alto=torch.max(pato)
alto

In [None]:
emnb =nn.Embedding(133,512)

salida=emnb(partitura.long())
salida.shape

In [None]:
pato[0]

In [None]:
pato.shape

In [None]:
x = pato.unsqueeze(2)
print(x.size())

x=x.expand([2,5544,512])
print(x.size())

In [None]:
x[0]

In [47]:
ys = torch.ones(1, 1).fill_(1).type(torch.long)
ys.shape

torch.Size([1, 1])

In [None]:
ys = torch.cat([ys,torch.ones(1, 1).type_as().fill_(45)], dim=0)

In [70]:
log_probs = torch.randn(100, 2, 233).log_softmax(2).detach().requires_grad_()
print(log_probs.shape)#salida del modelo
targets = torch.randint(0, 233, (2, 100), dtype=torch.long)
print(targets.shape)#tokens del las partituras
input_lengths = torch.full((2,), 50, dtype=torch.long)
print(input_lengths)#salida de 100 caracteristicas salida decoder
target_lengths = torch.randint(10,30,(2,), dtype=torch.long)
print(target_lengths)#tamanio de las partituras

torch.Size([100, 2, 233])
torch.Size([2, 100])
tensor([50, 50])
tensor([18, 16])


In [72]:
log_probs.shape[1]

2

In [75]:
import torch.nn.functional as F  
criterion =nn.CTCLoss()
loss=criterion(log_probs, targets, input_lengths, target_lengths)
loss

tensor(13.9402, grad_fn=<MeanBackward0>)