<a href="https://colab.research.google.com/github/GiovanniPioDelvecchio/GCNs_on_text/blob/issue-%235/bertweet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install emoji

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m97.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m89.1 MB/s[0m eta [36m0:00:00[0m
Col

In [1]:
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
import pandas as pd

import numpy as np

In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla T4


In [3]:
train_frame = pd.read_csv("train_split.csv")
test_frame = pd.read_csv("test_split.csv")
val_frame = pd.read_csv("val_split.csv")

In [4]:
train_tweets = list(train_frame.OriginalTweet.values)
train_labels = list(train_frame.Sentiment.values)
val_tweets = list(val_frame.OriginalTweet.values)
val_labels = list(val_frame.Sentiment.values)
test_tweets = list(test_frame.OriginalTweet.values)
test_labels = list(test_frame.Sentiment.values)

In [5]:
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
def encode_split(tweet_list, max_len=None):
  pad = 'max_length' if max_len is not None else False
  idxs = []
  attn_masks = []
  for t in tweet_list:
    tok_seq = tokenizer.encode_plus(t,
        add_special_tokens=True,
        max_length=max_len,
        padding=pad,
        #pad_to_max_length=True,
        truncation=True,
        return_attention_mask=True,
        #return_tensors='pt'

    )
    idxs.append(tok_seq.get('input_ids'))
    attn_masks.append(tok_seq.get('attention_mask'))

  return idxs, attn_masks

tok_train, _ = encode_split(train_tweets)
print(np.percentile([len(t) for t in tok_train], 90))

59.0


In [6]:
tok_train, train_mask = encode_split(train_tweets, 60)
tok_val, val_mask = encode_split(val_tweets, 60)
tok_test, test_mask = encode_split(test_tweets, 60)

In [7]:
tok_train = torch.tensor(tok_train)
train_mask = torch.tensor(train_mask)
tok_test = torch.tensor(tok_test)
test_mask = torch.tensor(test_mask)
tok_val = torch.tensor(tok_val)
val_mask = torch.tensor(val_mask)

In [8]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Convert other data types to torch.Tensor
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 32

# Create the DataLoader for our training set
train_data = TensorDataset(tok_train, train_mask, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
val_data = TensorDataset(tok_val, val_mask, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [9]:
# Create the BertClassfier class
class FTBert(nn.Module):
    """Bert Model for Classification Tasks.
    """
    def __init__(self,bert, num_classes, freeze_bert=False):
        """
        @param    bert: a BertModel object
        @param    num_classes: number of classes
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model
        """
        super(FTBert, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        bert_hidden_size = bert.config.hidden_size
        self.linear_1 = nn.Linear(bert_hidden_size, bert_hidden_size//2)
        self.relu = nn.ReLU()
        self.linear_2 = nn.Linear(bert_hidden_size//2, num_classes)

        # Instantiate BERT model
        self.bert = bert

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   out (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        out = outputs.last_hidden_state[:,0,:]
        out = self.linear_1(out)
        out = self.relu(out)
        out = self.linear_2(out)

        return out

In [10]:
def train(model, optimizer, scheduler, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Train the BertClassifier model.
    """
    # Start training loop
    print("Start training...\n")
    loss_fn = nn.CrossEntropyLoss()
    for epoch_i in range(epochs):
    # =======================================
    #               Training
    # =======================================
    # Print the header of the result table
      print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9}")
      print("-"*70)
      # Reset tracking variables at the beginning of each epoch
      total_loss, batch_loss, batch_counts = 0, 0, 0
      # Put the model into the training mode
      model.train()
      # For each batch of training data...
      for step, batch in enumerate(train_dataloader):
        batch_counts +=1
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
        # Zero out any previously calculated gradients
        model.zero_grad()
        # Perform a forward pass. This will return logits.
        logits = model(b_input_ids, b_attn_mask)
        # Compute loss and accumulate the loss values
        loss = loss_fn(logits, b_labels)
        batch_loss += loss.item()
        total_loss += loss.item()
        # Perform a backward pass to calculate gradients
        loss.backward()
        # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update parameters and the learning rate
        optimizer.step()
        scheduler.step()
        # Print the loss values and time elapsed for every 20 batches
        if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
          # Calculate time elapsed for 20 batches
          # Print training results
          print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9}")
          # Reset batch tracking variables
          batch_loss, batch_counts = 0, 0
          # Calculate the average loss over the entire training data
          avg_train_loss = total_loss / len(train_dataloader)
          print("-"*70)

      if evaluation == True:
        # After the completion of each training epoch, measure the model's performance
        # on our validation set.
        val_loss, val_accuracy = evaluate(model, val_dataloader)
        print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f}")

def evaluate(model, dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()
    loss_fn = nn.CrossEntropyLoss()
    # Tracking variables
    accuracies = []
    losses = []
    # For each batch in our validation set...
    for batch in dataloader:
      # Load batch to GPU
      b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
      # Compute logits
      with torch.no_grad():
        logits = model(b_input_ids, b_attn_mask)
        # Compute loss
        loss = loss_fn(logits, b_labels)
        losses.append(loss.item())
        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()
        # Calculate the accuracy rate
        acc = accuracy(preds, b_labels)
        accuracies.append(acc)
    # Compute the average accuracy and loss over the validation set.
    loss = np.mean(losses)
    acc = np.mean(accuracies)
    return loss, acc

def accuracy(pred_y, y):
    """Calculate accuracy."""
    return ((pred_y == y).sum() / len(y)).item()

In [12]:
print(len(np.unique(train_labels)))

5


In [11]:
bert_tweet = AutoModel.from_pretrained("vinai/bertweet-base")
bert_tweet.to(device)
n_classes = len(np.unique(train_labels))

epochs = 4
optimizer = AdamW(bert_tweet.parameters(),
                      lr=5e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

# Total number of training steps
total_steps = len(train_dataloader) * epochs

# Set up the learning rate scheduler
scheduler =get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0, # Default value
                                            num_training_steps=total_steps)

model = FTBert(bert_tweet, len(np.unique(train_labels)))
model.to(device)




FTBert(
  (linear_1): Linear(in_features=768, out_features=384, bias=True)
  (relu): ReLU()
  (linear_2): Linear(in_features=384, out_features=5, bias=True)
  (bert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          

In [12]:
train(model, optimizer, scheduler, train_dataloader, val_dataloader, 4, True)

Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc 
----------------------------------------------------------------------
   1    |   20    |   1.584475   |     -      |     -    
----------------------------------------------------------------------
   1    |   40    |   1.542093   |     -      |     -    
----------------------------------------------------------------------
   1    |   60    |   1.463528   |     -      |     -    
----------------------------------------------------------------------
   1    |   80    |   1.422577   |     -      |     -    
----------------------------------------------------------------------
   1    |   100   |   1.373463   |     -      |     -    
----------------------------------------------------------------------
   1    |   120   |   1.294061   |     -      |     -    
----------------------------------------------------------------------
   1    |   140   |   1.292121   |     -      |     -    
--------------------

In [16]:
# Create the DataLoader for our validation set
test_labels = torch.tensor(test_labels)
test_data = TensorDataset(tok_test, test_mask, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
test_loss, test_acc = evaluate(model, test_dataloader)
print(f"Test accuracy {test_acc}")

  test_labels = torch.tensor(test_labels)


Test accuracy 0.819
