# BERT com camadas Wav2Vec

# Preparando Dados , Imports e Instalações

In [1]:
!pip install -q install transformers pytorch_lightning neptune-client==0.9.8

[K     |████████████████████████████████| 3.1 MB 4.1 MB/s 
[K     |████████████████████████████████| 525 kB 60.8 MB/s 
[K     |████████████████████████████████| 231 kB 67.4 MB/s 
[K     |████████████████████████████████| 829 kB 49.4 MB/s 
[K     |████████████████████████████████| 53 kB 2.6 MB/s 
[K     |████████████████████████████████| 180 kB 85.6 MB/s 
[K     |████████████████████████████████| 63 kB 2.3 MB/s 
[K     |████████████████████████████████| 895 kB 69.6 MB/s 
[K     |████████████████████████████████| 61 kB 692 kB/s 
[K     |████████████████████████████████| 3.3 MB 62.3 MB/s 
[K     |████████████████████████████████| 596 kB 69.7 MB/s 
[K     |████████████████████████████████| 332 kB 67.2 MB/s 
[K     |████████████████████████████████| 132 kB 83.5 MB/s 
[K     |████████████████████████████████| 1.1 MB 63.1 MB/s 
[K     |████████████████████████████████| 271 kB 55.9 MB/s 
[K     |████████████████████████████████| 192 kB 90.0 MB/s 
[K     |██████████████████████

In [2]:
import torch
import random
from torch.utils.data import DataLoader
import torchmetrics
from torch import nn
import numpy as np
import pytorch_lightning as pl
from pytorch_lightning import LightningModule, Trainer
from transformers import BertModel
from transformers import AutoTokenizer
import neptune.new as neptune
from transformers import Wav2Vec2Model

pl.utilities.seed.seed_everything(seed=123)

Global seed set to 123


123

In [3]:
if torch.cuda.is_available(): 
   dev = "cuda:0"
   print(torch. cuda. get_device_name(dev))
else: 
   dev = "cpu" 
print(dev)
device = torch.device(dev)

Tesla P100-PCIE-16GB
cuda:0


In [4]:
!wget -nc http://files.fast.ai/data/aclImdb.tgz 
!tar -xzf aclImdb.tgz

--2021-12-08 02:26:25--  http://files.fast.ai/data/aclImdb.tgz
Resolving files.fast.ai (files.fast.ai)... 104.26.2.19, 172.67.69.159, 104.26.3.19, ...
Connecting to files.fast.ai (files.fast.ai)|104.26.2.19|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://files.fast.ai/data/aclImdb.tgz [following]
--2021-12-08 02:26:25--  https://files.fast.ai/data/aclImdb.tgz
Connecting to files.fast.ai (files.fast.ai)|104.26.2.19|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 145982645 (139M) [application/x-gtar-compressed]
Saving to: ‘aclImdb.tgz’


2021-12-08 02:26:40 (9.94 MB/s) - ‘aclImdb.tgz’ saved [145982645/145982645]



In [5]:
import os

max_valid = 5000

def load_texts(folder):
    texts = []
    for path in os.listdir(folder):
        with open(os.path.join(folder, path)) as f:
            texts.append(f.read())
    return texts

x_train_pos = load_texts('aclImdb/train/pos')
x_train_neg = load_texts('aclImdb/train/neg')
x_test_pos = load_texts('aclImdb/test/pos')
x_test_neg = load_texts('aclImdb/test/neg')

x_train = x_train_pos + x_train_neg
x_test = x_test_pos + x_test_neg
y_train = [True] * len(x_train_pos) + [False] * len(x_train_neg)
y_test = [True] * len(x_test_pos) + [False] * len(x_test_neg)

# Embaralhamos o treino para depois fazermos a divisão treino/valid.
c = list(zip(x_train, y_train))
random.shuffle(c)
x_train, y_train = zip(*c)

x_valid = x_train[-max_valid:]
y_valid = y_train[-max_valid:]
x_train = x_train[:-max_valid]
y_train = y_train[:-max_valid]

print(len(x_train), 'amostras de treino.')
print(len(x_valid), 'amostras de desenvolvimento.')
print(len(x_test), 'amostras de teste.')

print('3 primeiras amostras treino:')
for x, y in zip(x_train[:3], y_train[:3]):
    print(y, x[:100])

print('3 últimas amostras treino:')
for x, y in zip(x_train[-3:], y_train[-3:]):
    print(y, x[:100])

print('3 primeiras amostras validação:')
for x, y in zip(x_valid[:3], y_test[:3]):
    print(y, x[:100])

print('3 últimas amostras validação:')
for x, y in zip(x_valid[-3:], y_valid[-3:]):
    print(y, x[:100])

20000 amostras de treino.
5000 amostras de desenvolvimento.
25000 amostras de teste.
3 primeiras amostras treino:
False Whoa nelly! I've heard a ton of mixed reviews for this...but one of my go to hardcore horror reviewe
False I have seen it. It's not "good" but interesting in an understated way. The boys in it are quite natu
True Having lived in Ontario my whole life, in the same town that Marlene Moore grew up in, I've heard st
3 últimas amostras treino:
False It really impresses me that it got made. The director/writer/actor must be really charismatic in rea
True OK, I saw this in the theaters when it came out and I don't know why. I haven't seen it since, but I
True There is not much more I can say about this movie than all of the commentaries on page one, except -
3 primeiras amostras validação:
True I first saw Thief as a child which makes me almost as old as the Jinn I guess. As any kid would be, 
True This film is worth seeing alone for Jared Harris' outstanding portrayal of Jo

# Defindo Dataset e Dataloader

In [6]:
class IMDBDataset(torch.utils.data.Dataset):   
    def __init__(self, sentences, labels,max_len,model_tokens):
        super().__init__()

        self.tokenizer = AutoTokenizer.from_pretrained(model_tokens)
        self.tokens = self.tokenizer(sentences, padding = "max_length", max_length = max_len, truncation=True)["input_ids"]
        self.mask_attention = self.tokenizer(sentences, padding = "max_length", max_length = max_len, truncation=True)["attention_mask"]
        self.labels = labels 
      
    def __len__(self):
        return len(self.labels)
         
    def __getitem__(self, idx):
      return torch.tensor(self.tokens[idx]).long(), torch.tensor(self.mask_attention[idx]).long(), torch.tensor(self.labels[idx]).long()
    

class IMDBDataModule(pl.LightningDataModule):
    def __init__(self,model_tokens,
                 x_train,y_train, 
                 x_val,y_val, 
                 x_test,y_test,
                 batch_size: int = 50, 
                 num_workers: int = 2,
                 max_len = 512):
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.model_token = model_tokens
        self.max_len = max_len
        self.x_train =  x_train
        self.y_train =  y_train
        self.x_val = x_val
        self.y_val = y_val
        self.x_test = x_test
        self.y_test = y_test
        
        
                  
    def setup(self, stage=None):
        if stage == 'fit' or stage is None:
          self.train_dataset= IMDBDataset(list(self.x_train), list(self.y_train), self.max_len, model_tokens = self.model_token)
          self.val_dataset = IMDBDataset(list(self.x_val), list(self.y_val), self.max_len, model_tokens = self.model_token)
        if stage == 'test' or stage is None:
            self.test_dataset =  IMDBDataset(list(self.x_test), list(self.y_test), self.max_len, model_tokens = self.model_token)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size,drop_last=True)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size,drop_last=True)

# Model

In [7]:
def _make_trainable(module):
    """Unfreeze a given module.
    Operates in-place.
    Parameters
    ----------
    module : instance of `torch.nn.Module`
    """
    for param in module.parameters():
        param.requires_grad = True
    module.train()

In [8]:
def freeze(module, n=-1, train_bn=False):
    """Freeze the layers up to index n.
    Operates in-place.
    Parameters
    ----------
    module : instance of `torch.nn.Module`
    n : int
        By default, all the layers will be frozen. Otherwise, an integer
        between 0 and `len(module.children())` must be given.
    train_bn : bool (default: True)
        If True, the BatchNorm layers will remain in training mode.
    """
    idx = 0
    children = list(module.children())
    n_max = len(children) if n == -1 else int(n)
    for child in children:
        if idx < n_max:
            _recursive_freeze(module=child, train_bn=train_bn)
        else:
            _make_trainable(module=child)

In [9]:
def _recursive_freeze(module, train_bn=False):
    """Freeze the layers of a given module.
    Operates in-place.
    Parameters
    ----------
    module : instance of `torch.nn.Module`
    train_bn : bool (default: True)
        If True, the BatchNorm layers will remain in training mode.
        Otherwise, they will be set to eval mode along with the other modules.
    """
    children = list(module.children())
    if not children:
        if not (isinstance(module, torch.nn.LayerNorm) and train_bn):
            for param in module.parameters():
                param.requires_grad = False
            module.eval()
        else:
            # Make the BN layers trainable
            _make_trainable(module)
    else:
        for child in children:
            _recursive_freeze(module=child, train_bn=train_bn)

In [10]:
def randomize_model(model):
    for module_ in model.named_modules(): 
        if isinstance(module_[1],(torch.nn.Linear, torch.nn.Embedding)):
            nn.init.xavier_uniform_(module_[1].weight.data, gain=1.0)
        elif isinstance(module_[1], torch.nn.LayerNorm):
            module_[1].bias.data.zero_()
            module_[1].weight.data.fill_(1.0)
        if isinstance(module_[1], torch.nn.Linear) and module_[1].bias is not None:
            module_[1].bias.data.zero_()
    return model

In [11]:
class ReviewClassifier(nn.Module):
    def __init__(self,num_class, 
                 bert_model = 'bert-large-uncased'):
        super().__init__()

        
        self.num_class = num_class
        self.bert_layer = BertModel.from_pretrained(bert_model)
        self.wav2vec_layer = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large")

        # LOAD TRANSFORMER WEIGHTS
        # Alterando camada 0 do BERT para adicionar camada 0 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[0].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[0].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[0].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[0].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[0].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[0].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[0].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[0].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[0].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[0].layer_norm.state_dict())
        self.bert_layer.encoder.layer[0].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[0].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[0].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[0].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[0].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[0].final_layer_norm.state_dict())

        # Alterando camada 1 do BERT para adicionar camada 1 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[1].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[1].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[1].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[1].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[1].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[1].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[1].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[1].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[1].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[1].layer_norm.state_dict())
        self.bert_layer.encoder.layer[1].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[1].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[1].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[1].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[1].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[1].final_layer_norm.state_dict())

        # Alterando camada 2 do BERT para adicionar camada 2 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[2].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[2].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[2].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[2].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[2].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[2].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[2].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[2].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[2].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[2].layer_norm.state_dict())
        self.bert_layer.encoder.layer[2].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[2].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[2].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[2].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[2].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[2].final_layer_norm.state_dict())
        
        # Alterando camada 3 do BERT para adicionar camada 3 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[3].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[3].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[3].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[3].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[3].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[3].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[3].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[3].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[3].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[3].layer_norm.state_dict())
        self.bert_layer.encoder.layer[3].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[3].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[3].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[3].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[3].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[3].final_layer_norm.state_dict())

        # Alterando camada 4 do BERT para adicionar camada 4 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[4].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[4].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[4].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[4].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[4].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[4].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[4].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[4].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[4].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[4].layer_norm.state_dict())
        self.bert_layer.encoder.layer[4].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[4].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[4].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[4].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[4].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[4].final_layer_norm.state_dict())

        # Alterando camada 5 do BERT para adicionar camada 5 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[5].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[5].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[5].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[5].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[5].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[5].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[5].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[5].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[5].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[5].layer_norm.state_dict())
        self.bert_layer.encoder.layer[5].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[5].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[5].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[5].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[5].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[5].final_layer_norm.state_dict())

        # Alterando camada 6 do BERT para adicionar camada 6 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[6].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[6].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[6].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[6].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[6].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[6].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[6].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[6].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[6].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[6].layer_norm.state_dict())
        self.bert_layer.encoder.layer[6].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[6].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[6].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[6].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[6].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[6].final_layer_norm.state_dict())

        # Alterando camada 7 do BERT para adicionar camada 7 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[7].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[7].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[7].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[7].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[7].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[7].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[7].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[7].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[7].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[7].layer_norm.state_dict())
        self.bert_layer.encoder.layer[7].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[7].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[7].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[7].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[7].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[7].final_layer_norm.state_dict())

        # Alterando camada 8 do BERT para adicionar camada 8 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[8].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[8].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[8].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[8].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[8].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[8].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[8].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[8].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[8].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[8].layer_norm.state_dict())
        self.bert_layer.encoder.layer[8].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[8].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[8].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[8].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[8].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[8].final_layer_norm.state_dict())
        
        # Alterando camada 9 do BERT para adicionar camada 9 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[9].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[9].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[9].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[9].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[9].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[9].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[9].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[9].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[9].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[9].layer_norm.state_dict())
        self.bert_layer.encoder.layer[9].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[9].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[9].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[9].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[9].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[9].final_layer_norm.state_dict())

        # Alterando camada 10 do BERT para adicionar camada 10 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[10].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[10].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[10].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[10].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[10].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[10].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[10].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[10].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[10].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[10].layer_norm.state_dict())
        self.bert_layer.encoder.layer[10].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[10].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[10].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[10].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[10].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[10].final_layer_norm.state_dict())

        # Alterando camada 11 do BERT para adicionar camada 11 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[11].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[11].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[11].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[11].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[11].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[11].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[11].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[11].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[11].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[11].layer_norm.state_dict())
        self.bert_layer.encoder.layer[11].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[11].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[11].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[11].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[11].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[11].final_layer_norm.state_dict())

        # Alterando camada 12 do BERT para adicionar camada 12 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[12].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[12].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[12].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[12].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[12].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[12].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[12].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[12].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[12].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[12].layer_norm.state_dict())
        self.bert_layer.encoder.layer[12].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[12].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[12].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[12].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[12].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[12].final_layer_norm.state_dict())

        # Alterando camada 13 do BERT para adicionar camada 13 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[13].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[13].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[13].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[13].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[13].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[13].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[13].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[13].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[13].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[13].layer_norm.state_dict())
        self.bert_layer.encoder.layer[13].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[13].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[13].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[13].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[13].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[13].final_layer_norm.state_dict())

        # Alterando camada 14 do BERT para adicionar camada 14 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[14].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[14].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[14].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[14].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[14].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[14].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[14].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[14].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[14].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[14].layer_norm.state_dict())
        self.bert_layer.encoder.layer[14].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[14].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[14].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[14].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[14].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[14].final_layer_norm.state_dict())

        # Alterando camada 15 do BERT para adicionar camada 15 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[15].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[15].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[15].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[15].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[15].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[15].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[15].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[15].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[15].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[15].layer_norm.state_dict())
        self.bert_layer.encoder.layer[15].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[15].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[15].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[15].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[15].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[15].final_layer_norm.state_dict())

        # Alterando camada 16 do BERT para adicionar camada 16 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[16].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[16].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[16].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[16].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[16].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[16].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[16].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[16].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[16].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[16].layer_norm.state_dict())
        self.bert_layer.encoder.layer[16].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[16].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[16].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[16].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[16].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[16].final_layer_norm.state_dict())

        # Alterando camada 17 do BERT para adicionar camada 17 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[17].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[17].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[17].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[17].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[17].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[17].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[17].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[17].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[17].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[17].layer_norm.state_dict())
        self.bert_layer.encoder.layer[17].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[17].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[17].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[17].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[17].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[17].final_layer_norm.state_dict())

        # Alterando camada 18 do BERT para adicionar camada 18 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[18].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[18].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[18].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[18].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[18].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[18].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[18].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[18].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[18].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[18].layer_norm.state_dict())
        self.bert_layer.encoder.layer[18].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[18].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[18].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[18].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[18].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[18].final_layer_norm.state_dict())

        # Alterando camada 19 do BERT para adicionar camada 19 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[19].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[19].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[19].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[19].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[19].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[19].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[19].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[19].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[19].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[19].layer_norm.state_dict())
        self.bert_layer.encoder.layer[19].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[19].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[19].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[19].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[19].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[19].final_layer_norm.state_dict())

        # Alterando camada 20 do BERT para adicionar camada 20 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[20].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[20].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[20].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[20].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[20].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[20].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[20].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[20].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[20].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[20].layer_norm.state_dict())
        self.bert_layer.encoder.layer[20].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[20].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[20].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[20].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[20].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[20].final_layer_norm.state_dict())

        # Alterando camada 21 do BERT para adicionar camada 21 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[21].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[21].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[21].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[21].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[21].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[21].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[21].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[21].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[21].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[21].layer_norm.state_dict())
        self.bert_layer.encoder.layer[21].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[21].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[21].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[21].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[21].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[21].final_layer_norm.state_dict())

        # Alterando camada 22 do BERT para adicionar camada 22 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[22].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[22].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[22].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[22].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[22].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[22].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[22].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[22].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[22].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[22].layer_norm.state_dict())
        self.bert_layer.encoder.layer[22].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[22].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[22].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[22].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[22].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[22].final_layer_norm.state_dict())

        # Alterando camada 23 do BERT para adicionar camada 23 do Wav2Vec
        # Camadas de atenção
        self.bert_layer.encoder.layer[23].attention.self.query.load_state_dict(self.wav2vec_layer.encoder.layers[23].attention.q_proj.state_dict())
        self.bert_layer.encoder.layer[23].attention.self.key.load_state_dict(self.wav2vec_layer.encoder.layers[23].attention.k_proj.state_dict())
        self.bert_layer.encoder.layer[23].attention.self.value.load_state_dict(self.wav2vec_layer.encoder.layers[23].attention.v_proj.state_dict())

        # Camadas Densas e Normalizações
        self.bert_layer.encoder.layer[23].attention.output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[23].attention.out_proj.state_dict())
        self.bert_layer.encoder.layer[23].attention.output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[23].layer_norm.state_dict())
        self.bert_layer.encoder.layer[23].intermediate.dense.load_state_dict(self.wav2vec_layer.encoder.layers[23].feed_forward.intermediate_dense.state_dict())
        self.bert_layer.encoder.layer[23].output.dense.load_state_dict(self.wav2vec_layer.encoder.layers[23].feed_forward.output_dense.state_dict())
        self.bert_layer.encoder.layer[23].output.LayerNorm.load_state_dict(self.wav2vec_layer.encoder.layers[23].final_layer_norm.state_dict())

        #Classification layer
        self.cls_layer = nn.Linear(self.bert_layer.config.hidden_size, self.num_class)

        # "Delete" Unused                
        self.wav2vec_layer = None

        # Perform Freezing
        freeze(self.bert_layer.encoder)

        # Perform Initialization
        randomize_model(self.cls_layer)

        print('Camadas e requires_grad:')
        for name, param in self.bert_layer.named_parameters():                
            print(f'{name}:{param.requires_grad}')

    def forward(self, seq, attn_masks):
        out_bert = self.bert_layer(seq, attention_mask = attn_masks)
        logits = self.cls_layer(out_bert.pooler_output)
        return logits     

class LiteModel(pl.LightningModule):
    def __init__(self, hparams):
        super().__init__()
        self.params = hparams
        self.best_valid_loss = 16e9
        self.criterion = torch.nn.CrossEntropyLoss()
        self.model = ReviewClassifier( num_class = hparams['n_classes'],
                                      bert_model = hparams['bert_model'])
        
    def forward(self, x_indexs, x_att_mask):
      logits = self.model(x_indexs, x_att_mask)
      preds = logits.argmax(dim=1)
      return logits, preds

    def training_step(self, train_batch, batch_idx):
      x_indexs, x_att_mask, y = train_batch
      logits = self.model(x_indexs, x_att_mask)
      batch_losses = self.criterion(logits.squeeze(-1), y)
      loss = batch_losses.mean()
      run['train/batch_loss'].log(loss)
      return {'loss': loss, 'batch_losses': batch_losses}

    def training_epoch_end(self, outputs):
        avg_loss = torch.stack([output['batch_losses'] for output in outputs]).mean()
        run['train/loss'].log(avg_loss)
        self.log('train_loss', avg_loss, on_epoch=True, prog_bar=True)
        
        return
  
    def validation_step(self, val_batch, batch_idx):
        x_indexs, x_att_mask, y = val_batch
        logits, preds = self.forward(x_indexs, x_att_mask)
        batch_losses = self.criterion(logits, y)
        batch_accuracy = (preds == y)
        val_f1 = torchmetrics.functional.f1(preds, y, num_classes=2, average='weighted')
        return {'batch_losses': batch_losses, 'batch_accuracy': batch_accuracy,
                "batch_f1": val_f1}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([output['batch_losses'] for output in outputs]).mean()
        accuracy = torch.stack([output['batch_accuracy'] for output in outputs]).float().mean()
        f1 = torch.stack([output['batch_f1'] for output in outputs]).float().mean()
        run['valid/loss'].log(avg_loss)
        run['valid/acuracy'].log(accuracy)
        run['valid/F1'].log(f1)
        metrics = {'valid_loss': avg_loss.item(), 'accuracy': accuracy.item(), 'f1-score':f1.item()}
        output =  {'progress_bar': metrics, 'valid_loss': avg_loss.item()}
               
        if avg_loss < self.best_valid_loss:
            torch.save(self.model.state_dict(), '/content/'+self.params['bert_model']+'best_model.pt')
            self.best_valid_loss = avg_loss
        torch.save(self.model.state_dict(), '/content/'+self.params['bert_model']+'trainer_model.pt')

        self.log('validate_loss', avg_loss, on_epoch=True, prog_bar=True)
        self.log('validate_acc', accuracy, on_epoch=True, prog_bar=True)
        self.log('validate_f1', f1, on_epoch=True, prog_bar=True)
        return output
  
    def test_step(self, val_batch, batch_idx):
        x_indexs, x_att_mask, y = val_batch
        logits, preds = self.forward(x_indexs, x_att_mask)
        batch_losses = self.criterion(logits, y)
        batch_accuracy = (preds == y)
        test_f1 = torchmetrics.functional.f1(preds, y, num_classes=2, average='weighted')
        return {'batch_losses': batch_losses, 'batch_accuracy': batch_accuracy,
                'batch_f1':test_f1}

    def test_epoch_end(self, outputs):
        avg_loss = torch.stack([output['batch_losses'] for output in outputs]).mean()
        accuracy = torch.stack([output['batch_accuracy'] for output in outputs]).float().mean()
        f1 = torch.stack([output['batch_f1'] for output in outputs]).float().mean()

        metrics = {'Test loss': avg_loss.item(), 'test accuracy': accuracy.item(), 'test f1':f1.item()}
        output =  {'progress_bar': metrics}

        run['test/loss'].log(avg_loss)
        run['test/acuracy'].log(accuracy)
        run['test/F1'].log(f1)

        self.log('test_loss', avg_loss, on_epoch=True, prog_bar=True)
        self.log('test_acc', accuracy,  on_epoch=True, prog_bar=True)
        self.log('test_f1', f1,  on_epoch=True, prog_bar=True)
        return output

    def configure_optimizers(self):
        optimizer = torch.optim.Adamax(filter(lambda p: p.requires_grad, self.model.parameters()), lr=self.params['learning_rate'])
        scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lambda epoch: 1.0)
        return [optimizer], [scheduler]  # Forma de retorno para associar um otimizador a um scheduler.


# Experiment

In [12]:
run = neptune.init(
    project="d230640/Projeto-Final-2021-2",
    api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiJmZTg1OGU3Yi1jOWE1LTRjMjEtYTJjNS1hMjMwM2Y3NDRjOTUifQ==",
)  # your credentials

https://app.neptune.ai/d230640/Projeto-Final-2021-2/e/PROJ1-125


In [13]:
# definindo os hyperparametros
hparams = {
     'max_epochs': 11,
     'max_len':512,
     'n_classes':2,
     'bert_model':'bert-large-uncased',
     'learning_rate': 1e-4,
     'bs':4
    
}


dm = IMDBDataModule(batch_size=hparams['bs'],x_train = x_train, y_train = y_train,
                     x_val=x_valid, y_val = y_valid, 
                     x_test= x_test, y_test = y_test,
                     model_tokens = hparams['bert_model'],max_len=hparams['max_len'])
dm.setup()

pl_model = LiteModel(hparams=hparams)

trainer = pl.Trainer(max_epochs=hparams['max_epochs'],
                     progress_bar_refresh_rate = 1,
                     accumulate_grad_batches = 20,
                     gpus=1,
                     log_every_n_steps=1) 

%time trainer.fit(pl_model, dm)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-large were not used when initializing Wav2Vec2Model: ['quantizer.weight_proj.bias', 'project_hid.bias', 'quantizer.codevectors', 'quantizer.weight_proj.weight', 'project_q.bias', 'project_q.weight', 'project_hid.weight']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Camadas e requires_grad:
embeddings.word_embeddings.weight:True
embeddings.position_embeddings.weight:True
embeddings.token_type_embeddings.weight:True
embeddings.LayerNorm.weight:True
embeddings.LayerNorm.bias:True
encoder.layer.0.attention.self.query.weight:False
encoder.layer.0.attention.self.query.bias:False
encoder.layer.0.attention.self.key.weight:False
encoder.layer.0.attention.self.key.bias:False
encoder.layer.0.attention.self.value.weight:False
encoder.layer.0.attention.self.value.bias:False
encoder.layer.0.attention.output.dense.weight:False
encoder.layer.0.attention.output.dense.bias:False
encoder.layer.0.attention.output.LayerNorm.weight:False
encoder.layer.0.attention.output.LayerNorm.bias:False
encoder.layer.0.intermediate.dense.weight:False
encoder.layer.0.intermediate.dense.bias:False
encoder.layer.0.output.dense.weight:False
encoder.layer.0.output.dense.bias:False
encoder.layer.0.output.LayerNorm.weight:False
encoder.layer.0.output.LayerNorm.bias:False
encoder.layer.1.

  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  f"DataModule.{name} has already been called, so it will not be called again. "
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



encoder.layer.21.attention.self.key.bias:False
encoder.layer.21.attention.self.value.weight:False
encoder.layer.21.attention.self.value.bias:False
encoder.layer.21.attention.output.dense.weight:False
encoder.layer.21.attention.output.dense.bias:False
encoder.layer.21.attention.output.LayerNorm.weight:False
encoder.layer.21.attention.output.LayerNorm.bias:False
encoder.layer.21.intermediate.dense.weight:False
encoder.layer.21.intermediate.dense.bias:False
encoder.layer.21.output.dense.weight:False
encoder.layer.21.output.dense.bias:False
encoder.layer.21.output.LayerNorm.weight:False
encoder.layer.21.output.LayerNorm.bias:False
encoder.layer.22.attention.self.query.weight:False
encoder.layer.22.attention.self.query.bias:False
encoder.layer.22.attention.self.key.weight:False
encoder.layer.22.attention.self.key.bias:False
encoder.layer.22.attention.self.value.weight:False
encoder.layer.22.attention.self.value.bias:False
encoder.layer.22.attention.output.dense.weight:False
encoder.layer.2


  | Name      | Type             | Params
-----------------------------------------------
0 | criterion | CrossEntropyLoss | 0     
1 | model     | ReviewClassifier | 335 M 
-----------------------------------------------
32.8 M    Trainable params
302 M     Non-trainable params
335 M     Total params
1,340.576 Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
Global seed set to 123
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Training: 0it [00:00, ?it/s]

  f"One of the returned values {set(extra.keys())} has a `grad_fn`. We will detach it automatically"


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

CPU times: user 9h 49min 33s, sys: 2min 54s, total: 9h 52min 28s
Wall time: 9h 50min 53s


In [14]:
#teste
%time trainer.test(test_dataloaders=dm.test_dataloader())

  "`trainer.test(test_dataloaders)` is deprecated in v1.4 and will be removed in v1.6."
  f"`.{fn}(ckpt_path=None)` was called without a model."
  f"DataModule.{name} has already been called, so it will not be called again. "
Restoring states from the checkpoint path at /content/lightning_logs/version_0/checkpoints/epoch=10-step=2749.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at /content/lightning_logs/version_0/checkpoints/epoch=10-step=2749.ckpt
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.6362400054931641,
 'test_f1': 0.719274640083313,
 'test_loss': 0.6613730788230896}
--------------------------------------------------------------------------------
CPU times: user 28min 8s, sys: 5.33 s, total: 28min 13s
Wall time: 28min 7s


[{'test_acc': 0.6362400054931641,
  'test_f1': 0.719274640083313,
  'test_loss': 0.6613730788230896}]

In [15]:
run.stop()