# BERT com camadas Wav2Vec

# Preparando Dados, Imports e Instalações

In [None]:
!pip install -q install transformers pytorch_lightning neptune-client==0.9.8

In [None]:
import torch
import random
from torch.utils.data import DataLoader
import torchmetrics
from torch import nn
import numpy as np
import pytorch_lightning as pl
from pytorch_lightning import LightningModule, Trainer
from transformers import BertModel
from transformers import AutoTokenizer
import neptune.new as neptune
from transformers import Wav2Vec2Model

# Seed Python's `random`, NumPy and PyTorch in one call so the shuffle/split and the
# weight initialisation below are repeatable. Uses the public top-level entry point.
pl.seed_everything(seed=123)

Global seed set to 123


123

In [None]:
# Use the first CUDA device when one is visible; otherwise fall back to the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
if dev != "cpu":
    # Report which accelerator was picked up.
    print(torch.cuda.get_device_name(dev))
print(dev)
device = torch.device(dev)

Tesla P100-PCIE-16GB
cuda:0


In [None]:
# Fetch the IMDB review archive (`-nc` skips the download when the file is already present)
# and unpack it into ./aclImdb.
!wget -nc http://files.fast.ai/data/aclImdb.tgz 
!tar -xzf aclImdb.tgz

File ‘aclImdb.tgz’ already there; not retrieving.



In [None]:
import os

# Number of shuffled training reviews held out as the validation split.
max_valid = 5000

def load_texts(folder):
    """Read every file in ``folder`` and return the contents as a list of strings.

    Files are visited in sorted filename order so the result (and therefore the
    seeded shuffle / train-validation split built on top of it) does not depend
    on the arbitrary order returned by the filesystem.

    Args:
        folder: path of a directory whose entries are all text files.

    Returns:
        A list with one string per file, ordered by filename. Empty when the
        directory is empty.
    """
    texts = []
    for path in sorted(os.listdir(folder)):
        # Decode explicitly as UTF-8 instead of relying on the platform default encoding.
        with open(os.path.join(folder, path), encoding='utf-8') as f:
            texts.append(f.read())
    return texts

# Load the raw reviews: one list of texts per (split, sentiment) folder.
x_train_pos = load_texts('aclImdb/train/pos')
x_train_neg = load_texts('aclImdb/train/neg')
x_test_pos = load_texts('aclImdb/test/pos')
x_test_neg = load_texts('aclImdb/test/neg')

# Positive reviews come first, so the labels are True for the first block and False after it.
x_train = x_train_pos + x_train_neg
x_test = x_test_pos + x_test_neg
y_train = [True] * len(x_train_pos) + [False] * len(x_train_neg)
y_test = [True] * len(x_test_pos) + [False] * len(x_test_neg)

# Shuffle texts and labels together so the train/validation split below is random.
c = list(zip(x_train, y_train))
random.shuffle(c)
x_train, y_train = zip(*c)

# The last `max_valid` shuffled samples become the validation set; the rest stay in training.
x_valid = x_train[-max_valid:]
y_valid = y_train[-max_valid:]
x_train = x_train[:-max_valid]
y_train = y_train[:-max_valid]

print(len(x_train), 'amostras de treino.')
print(len(x_valid), 'amostras de desenvolvimento.')
print(len(x_test), 'amostras de teste.')

# Preview a few (label, first 100 characters) pairs from each split as a sanity check.
print('3 primeiras amostras treino:')
for x, y in zip(x_train[:3], y_train[:3]):
    print(y, x[:100])

print('3 últimas amostras treino:')
for x, y in zip(x_train[-3:], y_train[-3:]):
    print(y, x[:100])

print('3 primeiras amostras validação:')
# Validation texts must be paired with validation labels (not the test labels).
for x, y in zip(x_valid[:3], y_valid[:3]):
    print(y, x[:100])

print('3 últimas amostras validação:')
for x, y in zip(x_valid[-3:], y_valid[-3:]):
    print(y, x[:100])

20000 amostras de treino.
5000 amostras de desenvolvimento.
25000 amostras de teste.
3 primeiras amostras treino:
False DOWN TO EARTH / (2001) * (out of four)<br /><br />By Blake French:<br /><br />"Down to Earth" is suc
False I'm sorry to say this, but the acting in this film is horrible. The dialogue sounds as if they are r
True Terrfic film with a slightyly slow start - give it a chance to start cooking. Story builds in intere
3 últimas amostras treino:
False This must be one of the worst movies I've ever seen, the graphics are ridiculous, and the script pat
True One thing about Hollywood, someone has a success and it's always rushed to be copied. And another th
True After seeing several movies of Villaronga, I had a pretty clear opinion about him -- he concentrates
3 primeiras amostras validação:
True I had seen this movie long time back, but found it amazing and to this day it has never stopped amaz
True This movie was incredible. I would recommend it to anyone, much better than w

# Definindo Dataset e DataLoader

In [None]:
class IMDBDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes all reviews up front.

    Args:
        sentences: list of review texts.
        labels: sequence of labels aligned with ``sentences``.
        max_len: every sequence is padded/truncated to exactly this many tokens.
        model_tokens: identifier passed to ``AutoTokenizer.from_pretrained``.
    """

    def __init__(self, sentences, labels, max_len, model_tokens):
        super().__init__()

        self.tokenizer = AutoTokenizer.from_pretrained(model_tokens)
        # Tokenize once and read both fields from the same result instead of
        # running the tokenizer over the whole corpus a second time.
        encoded = self.tokenizer(sentences, padding="max_length", max_length=max_len, truncation=True)
        self.tokens = encoded["input_ids"]
        self.mask_attention = encoded["attention_mask"]
        self.labels = labels

    def __len__(self):
        """Number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for sample ``idx`` as int64 tensors."""
        return (
            torch.tensor(self.tokens[idx]).long(),
            torch.tensor(self.mask_attention[idx]).long(),
            torch.tensor(self.labels[idx]).long(),
        )
    

class IMDBDataModule(pl.LightningDataModule):
    """LightningDataModule that wraps the train/validation/test splits in ``IMDBDataset``.

    Args:
        model_tokens: tokenizer identifier forwarded to ``IMDBDataset``.
        x_train, y_train: training texts and labels.
        x_val, y_val: validation texts and labels.
        x_test, y_test: test texts and labels.
        batch_size: batch size used by all three dataloaders.
        num_workers: number of worker processes used by all three dataloaders.
        max_len: token length every sample is padded/truncated to.
    """

    def __init__(self, model_tokens,
                 x_train, y_train,
                 x_val, y_val,
                 x_test, y_test,
                 batch_size: int = 50,
                 num_workers: int = 2,
                 max_len = 512):
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.model_token = model_tokens
        self.max_len = max_len
        self.x_train = x_train
        self.y_train = y_train
        self.x_val = x_val
        self.y_val = y_val
        self.x_test = x_test
        self.y_test = y_test

    def setup(self, stage=None):
        """Build the datasets needed for ``stage`` (``'fit'``, ``'test'`` or ``None`` for both)."""
        if stage == 'fit' or stage is None:
            # The splits may arrive as tuples (e.g. from zip); the tokenizer is given lists.
            self.train_dataset = IMDBDataset(list(self.x_train), list(self.y_train), self.max_len, model_tokens=self.model_token)
            self.val_dataset = IMDBDataset(list(self.x_val), list(self.y_val), self.max_len, model_tokens=self.model_token)
        if stage == 'test' or stage is None:
            self.test_dataset = IMDBDataset(list(self.x_test), list(self.y_test), self.max_len, model_tokens=self.model_token)

    def train_dataloader(self):
        """Shuffled loader over the training set."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True,
                          num_workers=self.num_workers)

    def val_dataloader(self):
        """Loader over the validation set.

        ``drop_last=True`` keeps every evaluation batch the same size; a trailing
        partial batch is discarded.
        """
        return DataLoader(self.val_dataset, batch_size=self.batch_size, drop_last=True,
                          num_workers=self.num_workers)

    def test_dataloader(self):
        """Loader over the test set (same settings as the validation loader)."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size, drop_last=True,
                          num_workers=self.num_workers)

# BERT from Scratch

In [None]:
def randomize_model(model):
    """Re-initialise, in place, the parameters of ``model`` and all of its sub-modules.

    * ``Linear`` and ``Embedding`` weights get Xavier-uniform values (gain 1.0).
    * ``LayerNorm`` is reset to bias 0 and weight 1.
    * ``Linear`` biases, when present, are zeroed.

    Returns:
        The same ``model`` object that was passed in.
    """
    for _, layer in model.named_modules():
        is_linear = isinstance(layer, torch.nn.Linear)
        if is_linear or isinstance(layer, torch.nn.Embedding):
            nn.init.xavier_uniform_(layer.weight.data, gain=1.0)
        elif isinstance(layer, torch.nn.LayerNorm):
            layer.bias.data.zero_()
            layer.weight.data.fill_(1.0)
        if is_linear and layer.bias is not None:
            layer.bias.data.zero_()
    return model

In [None]:
class ReviewClassifier(nn.Module):
    """BERT encoder with re-initialised transformer layers and a linear classification head.

    Args:
        num_class: number of output classes (size of the logits vector).
        bert_model: identifier passed to ``BertModel.from_pretrained``.
    """

    def __init__(self, num_class,
                 bert_model='bert-base-uncased'):
        super().__init__()

        self.num_class = num_class

        self.bert_layer = BertModel.from_pretrained(bert_model)

        # NOTE(review): this Wav2Vec2 model is loaded and registered as a sub-module but is
        # never used in forward(); it is kept so existing state_dict keys do not change.
        # Confirm whether it should be wired into the network or removed.
        self.wav2vec_layer = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")

        # Re-initialise every encoder layer exactly once, whatever the depth of the loaded
        # checkpoint. The embeddings and the pooler are not passed to randomize_model, so
        # they keep their loaded weights.
        for encoder_layer in self.bert_layer.encoder.layer:
            randomize_model(encoder_layer)

        # Classification head on top of BERT's pooled output.
        self.cls_layer = nn.Linear(self.bert_layer.config.hidden_size, self.num_class)

        # Give the head the same initialisation scheme as the encoder layers.
        randomize_model(self.cls_layer)

    def forward(self, seq, attn_masks):
        """Return the classification logits for a batch.

        Args:
            seq: token ids forwarded to BERT as its first positional argument.
            attn_masks: attention mask forwarded to BERT as ``attention_mask``.

        Returns:
            Output of the linear head applied to BERT's ``pooler_output``.
        """
        out_bert = self.bert_layer(seq, attention_mask=attn_masks)
        logits = self.cls_layer(out_bert.pooler_output)
        return logits

class LiteModel(pl.LightningModule):
    """Lightning wrapper that trains and evaluates a ``ReviewClassifier``.

    Experiment tracking goes through the notebook-level Neptune ``run`` object, which
    must exist before any training/validation/test hook executes.

    Args:
        hparams: dict providing at least ``'n_classes'``, ``'bert_model'`` and
            ``'learning_rate'``.
    """

    def __init__(self, hparams):
        super().__init__()
        self.params = hparams
        # Large sentinel so the first validation epoch always counts as an improvement.
        self.best_valid_loss = 16e9
        self.criterion = torch.nn.CrossEntropyLoss()
        self.model = ReviewClassifier(num_class=hparams['n_classes'],
                                      bert_model=hparams['bert_model'])

    def forward(self, x_indexs, x_att_mask):
        """Inference entry point: return ``(logits, predicted class indices)``."""
        logits = self.model(x_indexs, x_att_mask)
        preds = logits.argmax(dim=1)
        return logits, preds

    @staticmethod
    def _log_grad_norm(key, parameter):
        """Log the L2 norm of ``parameter.grad`` to Neptune; skip when no gradient exists yet."""
        grad = parameter.grad
        if grad is not None:
            run[key].log(float(torch.norm(grad, p=2)))

    def training_step(self, train_batch, batch_idx):
        """Compute the cross-entropy loss for one batch and log loss and gradient norms."""
        x_indexs, x_att_mask, y = train_batch
        logits = self.model(x_indexs, x_att_mask)

        # CrossEntropyLoss already averages over the batch, so this is a scalar.
        loss = self.criterion(logits, y)
        run['train/batch_loss'].log(loss.item())

        # Gradients seen here were produced by earlier backward passes, so there is
        # nothing to report on the very first step.
        if self.trainer.global_step != 0:
            bert = self.model.bert_layer
            self._log_grad_norm('train/grad2.0normemb', bert.embeddings.word_embeddings.weight)
            self._log_grad_norm('train/grad2.0norm0', bert.encoder.layer[0].attention.self.query.weight)
            self._log_grad_norm('train/grad2.0norm5', bert.encoder.layer[5].attention.self.query.weight)
            self._log_grad_norm('train/grad2.0norm11', bert.encoder.layer[11].attention.self.query.weight)

        # Lightning expects the key 'loss'. The extra copy is detached so the autograd
        # graph is not kept alive until the end of the epoch.
        return {'loss': loss, 'batch_losses': loss.detach()}

    def training_epoch_end(self, outputs):
        """Log the mean of the per-batch training losses for the epoch."""
        avg_loss = torch.stack([output['batch_losses'] for output in outputs]).mean()

        run['train/loss'].log(avg_loss.item())
        self.log('train_loss', avg_loss, on_epoch=True, prog_bar=True)

    def _evaluate_batch(self, batch):
        """Shared validation/test step: batch loss plus raw predictions and targets."""
        x_indexs, x_att_mask, y = batch
        logits, preds = self.forward(x_indexs, x_att_mask)
        batch_loss = self.criterion(logits, y)
        # Predictions and targets are returned (not per-batch metrics) so that accuracy
        # and F1 can be computed once over the whole epoch.
        return {'batch_losses': batch_loss, 'preds': preds, 'targets': y}

    def _epoch_metrics(self, outputs):
        """Aggregate step outputs into ``(avg_loss, accuracy, f1)`` tensors for the epoch.

        F1 is computed over all predictions at once. Averaging per-batch F1 scores gives a
        different (and, when batches contain a single class, inflated) number.
        """
        preds = torch.cat([output['preds'] for output in outputs])
        targets = torch.cat([output['targets'] for output in outputs])

        # Weight each batch loss by its size so a smaller final batch does not bias the mean.
        losses = torch.stack([output['batch_losses'] for output in outputs])
        sizes = torch.tensor([output['targets'].numel() for output in outputs],
                             dtype=losses.dtype, device=losses.device)
        avg_loss = (losses * sizes).sum() / sizes.sum()

        accuracy = (preds == targets).float().mean()
        f1 = torchmetrics.functional.f1(preds, targets,
                                        num_classes=self.params['n_classes'],
                                        average='weighted').float()
        return avg_loss, accuracy, f1

    def validation_step(self, val_batch, batch_idx):
        """Evaluate one validation batch."""
        return self._evaluate_batch(val_batch)

    def validation_epoch_end(self, outputs):
        """Log validation metrics and save the current and the best model weights."""
        avg_loss, accuracy, f1 = self._epoch_metrics(outputs)

        run['valid/loss'].log(avg_loss.item())
        run['valid/acuracy'].log(accuracy.item())
        run['valid/F1'].log(f1.item())

        metrics = {'valid_loss': avg_loss.item(), 'accuracy': accuracy.item(), 'f1-score': f1.item()}
        output = {'progress_bar': metrics, 'valid_loss': avg_loss.item()}

        # Keep a copy of the weights with the lowest validation loss seen so far...
        if avg_loss.item() < self.best_valid_loss:
            torch.save(self.model.state_dict(), '/content/'+self.params['bert_model']+'best_model.pt')
            self.best_valid_loss = avg_loss.item()
        # ...and always overwrite the snapshot of the latest weights.
        torch.save(self.model.state_dict(), '/content/'+self.params['bert_model']+'trainer_model.pt')

        self.log('validate_loss', avg_loss, on_epoch=True, prog_bar=True)
        self.log('validate_acc', accuracy, on_epoch=True, prog_bar=True)
        self.log('validate_f1', f1, on_epoch=True, prog_bar=True)
        return output

    def test_step(self, val_batch, batch_idx):
        """Evaluate one test batch (same computation as the validation step)."""
        return self._evaluate_batch(val_batch)

    def test_epoch_end(self, outputs):
        """Log test loss, accuracy and F1 computed over the whole test epoch."""
        avg_loss, accuracy, f1 = self._epoch_metrics(outputs)

        metrics = {'Test loss': avg_loss.item(), 'test accuracy': accuracy.item(), 'test f1': f1.item()}
        output = {'progress_bar': metrics}

        run['test/loss'].log(avg_loss.item())
        run['test/acuracy'].log(accuracy.item())
        run['test/F1'].log(f1.item())

        self.log('test_loss', avg_loss, on_epoch=True, prog_bar=True)
        self.log('test_acc', accuracy, on_epoch=True, prog_bar=True)
        self.log('test_f1', f1, on_epoch=True, prog_bar=True)
        return output

    def configure_optimizers(self):
        """Adamax over the classifier's parameters, paired with a constant-factor scheduler."""
        optimizer = torch.optim.Adamax(self.model.parameters(), lr=self.params['learning_rate'])

        # Placeholder scheduler: multiplying by 1.0 every epoch leaves the learning rate unchanged.
        scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lambda epoch: 1.0)
        return [optimizer], [scheduler]  # Lightning's format for pairing an optimizer with a scheduler.


# Experiment

In [None]:
# Start the Neptune run used for experiment tracking (change the project to your own).
# The API token is a secret and must not be stored in the notebook: export it as the
# NEPTUNE_API_TOKEN environment variable before running this cell.
# NOTE(review): a token was previously committed here in plain text and should be revoked.
run = neptune.init(
    project="d230640/Projeto-Final-2021-2-Resultados",
    api_token=os.environ["NEPTUNE_API_TOKEN"],
)

https://app.neptune.ai/d230640/Projeto-Final-2021-2-Resultados/e/PROJ2-17


In [None]:
# definindo os hyperparametros
hparams = {
    'max_epochs': 11,
    'max_len':512,
    'n_classes':2,
    'bert_model':'bert-base-uncased',
    'learning_rate': 1e-4,
    'bs':16
    
}


dm = IMDBDataModule(batch_size=hparams['bs'],x_train = x_train, y_train = y_train,
                    x_val=x_valid, y_val = y_valid, 
                    x_test= x_test, y_test = y_test,
                    model_tokens = hparams['bert_model'],max_len=hparams['max_len'])
dm.setup()

pl_model = LiteModel(hparams=hparams)

trainer = pl.Trainer(max_epochs=hparams['max_epochs'],
                     progress_bar_refresh_rate = 1,
                     accumulate_grad_batches = 5,
                     gpus=1,
                     log_every_n_steps=1) 

%time trainer.fit(pl_model, dm)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "
Some weights of the model checkpoint at facebook/w

Validation sanity check: 0it [00:00, ?it/s]

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
Global seed set to 123
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Training: 0it [00:00, ?it/s]

  f"One of the returned values {set(extra.keys())} has a `grad_fn`. We will detach it automatically"


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

CPU times: user 3h 33min 24s, sys: 1min 47s, total: 3h 35min 11s
Wall time: 3h 34min 26s


In [None]:
#teste
%time trainer.test(test_dataloaders=dm.test_dataloader())

  "`trainer.test(test_dataloaders)` is deprecated in v1.4 and will be removed in v1.6."
  f"`.{fn}(ckpt_path=None)` was called without a model."
  f"DataModule.{name} has already been called, so it will not be called again. "
Restoring states from the checkpoint path at /content/lightning_logs/version_1/checkpoints/epoch=10-step=2749.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at /content/lightning_logs/version_1/checkpoints/epoch=10-step=2749.ckpt
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.8655970096588135,
 'test_f1': 0.9254817366600037,
 'test_loss': 0.5006301999092102}
--------------------------------------------------------------------------------
CPU times: user 7min 45s, sys: 3.44 s, total: 7min 49s
Wall time: 7min 46s


[{'test_acc': 0.8655970096588135,
  'test_f1': 0.9254817366600037,
  'test_loss': 0.5006301999092102}]

In [None]:
# Stop the Neptune run that was started with neptune.init in the Experiment section.
run.stop()