In [1]:
!pip install pytorch_pretrained_bert pytorch-nlp



In [2]:
!pip install transformers



In [3]:
import sys
import numpy as np
import random as rn
import torch
from torch import nn

from transformers import BertModel
from transformers import BertTokenizer
from transformers import BertConfig


from torchnlp.datasets import imdb_dataset
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output


In [4]:
rn.seed(321)
np.random.seed(321)
torch.manual_seed(321)
torch.cuda.manual_seed(321)

Prepare data

In [5]:
train_data, test_data = imdb_dataset(train=True, test=True)
rn.shuffle(train_data)
rn.shuffle(test_data)
train_data = train_data[:1000]
test_data = test_data[:100]

In [6]:
train_texts, train_labels = list(zip(*map(lambda d: (d['text'], d['sentiment']), train_data)))
test_texts, test_labels = list(zip(*map(lambda d: (d['text'], d['sentiment']), test_data)))

len(train_texts), len(train_labels), len(test_texts), len(test_labels)

(1000, 1000, 100, 100)

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
tokenizer.tokenize('Hi my name is Dima')

['hi', 'my', 'name', 'is', 'dim', '##a']

In [9]:
batch_train = tokenizer(train_texts, padding=True, truncation=True, return_tensors="pt")
batch_test = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt")

In [10]:
train_tokens_ids = batch_train['input_ids']
test_tokens_ids = batch_test['input_ids']

In [11]:
train_masks = batch_train['attention_mask']
test_masks = batch_test['attention_mask']

In [12]:
train_y = np.array(train_labels) == 'pos'
test_y = np.array(test_labels) == 'pos'
train_y.shape, test_y.shape, np.mean(train_y), np.mean(test_y)

((1000,), (100,), 0.489, 0.5)

Baseline

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

In [14]:
baseline_model = make_pipeline(CountVectorizer(ngram_range=(1,3)), LogisticRegression()).fit(train_texts, train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
baseline_predicted = baseline_model.predict(test_texts)

In [16]:
print(classification_report(test_labels, baseline_predicted))

              precision    recall  f1-score   support

         neg       0.81      0.86      0.83        50
         pos       0.85      0.80      0.82        50

    accuracy                           0.83       100
   macro avg       0.83      0.83      0.83       100
weighted avg       0.83      0.83      0.83       100



BERT model

In [17]:
class BertBinaryClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(BertBinaryClassifier, self).__init__()

        #model_config = BertConfig.from_pretrained('bert-base-uncased', output_hidden_states=True)
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens, masks=None):
        outputs = self.bert(tokens, attention_mask=masks)
        pooled_output = outputs.pooler_output  # This is the correct way to access the pooled output with newer versions.
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba


In [18]:
class BertWithSinusoidalPositionalEmbedding(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()  # Adding a sigmoid layer
        self.position_embeddings = nn.Parameter(self.sinusoidal_embeddings(512, config.hidden_size), requires_grad=False)

    def sinusoidal_embeddings(self, num_positions, hidden_size):
        position = torch.arange(num_positions).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, hidden_size, 2) * -(math.log(10000.0) / hidden_size))
        sinusoidal = torch.zeros(num_positions, hidden_size)
        sinusoidal[:, 0::2] = torch.sin(position * div_term)
        sinusoidal[:, 1::2] = torch.cos(position * div_term)
        return sinusoidal

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        sequence_output += self.position_embeddings[:sequence_output.size(1), :]
        pooled_output = sequence_output[:, 0]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        probabilities = self.sigmoid(logits)  # Apply sigmoid to convert logits to probabilities
        return probabilities



In [19]:
import torch
import torch.nn as nn

class ALiBiPositionalEmbedding(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim
        self.w = nn.Parameter(torch.randn(dim // 2) * 0.02)  # initialize weights

    def forward(self, length):
        positions = torch.arange(length, device=self.w.device).float().unsqueeze(1)
        angles = positions * self.w.view(1, -1)
        return torch.cat((angles.sin(), angles.cos()), dim=1)

class BertWithALiBi(BertBinaryClassifier):
    def __init__(self, dropout=0.1):
        super().__init__(dropout)
        self.alibi_embeddings = ALiBiPositionalEmbedding(768)  # Assuming BERT hidden size is 768

    def forward(self, tokens, masks=None):
        outputs = self.bert(tokens, attention_mask=masks)

        # Apply ALiBi embeddings
        seq_length = tokens.size(1)
        alibi_emb = self.alibi_embeddings(seq_length)
        alibi_emb = alibi_emb.unsqueeze(0).repeat(tokens.size(0), 1, 1)  # Repeat for batch size

        # Modify embeddings
        inputs_embeds = outputs.last_hidden_state + alibi_emb
        outputs = self.bert(inputs_embeds=inputs_embeds, attention_mask=masks)

        pooled_output = outputs.pooler_output
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba




In [20]:
class RoPEPositionalEmbedding(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim
        # Generate frequencies for half the dimensions
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)

    def forward(self, pos):
        # Create the full range sinusoidal pattern
        sinusoid_inp = torch.ger(pos, self.inv_freq)
        pos_emb = torch.cat((sinusoid_inp.sin(), sinusoid_inp.cos()), dim=-1)
        return pos_emb

    @staticmethod
    def apply_rotary_pos_emb(x, sinusoid_inp):
        sinusoid_inp = sinusoid_inp.unsqueeze(0)  # Expand for batch size
        sin, cos = sinusoid_inp.split(sinusoid_inp.shape[-1] // 2, dim=-1)
        x1 = x[..., 0::2]
        x2 = x[..., 1::2]
        rotated_x1 = x1 * cos - x2 * sin
        rotated_x2 = x2 * cos + x1 * sin
        return torch.cat([rotated_x1, rotated_x2], dim=-1)

class BertWithRoPE(BertBinaryClassifier):
    def __init__(self, dropout=0.1):
        super().__init__(dropout)
        self.rope_embeddings = RoPEPositionalEmbedding(768)  # Assuming BERT's hidden size is 768

    def forward(self, tokens, masks=None):
        outputs = self.bert(tokens, attention_mask=masks)
        seq_length = tokens.size(1)
        position = torch.arange(seq_length, device=tokens.device).float()
        rope_emb = self.rope_embeddings(position)

        # Apply rotary embeddings
        transformed = self.rope_embeddings.apply_rotary_pos_emb(outputs.last_hidden_state, rope_emb)

        pooled_output = outputs.pooler_output
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba



In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [22]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'0.0M'

In [23]:
bert_clf = BertBinaryClassifier()
#config = BertConfig.from_pretrained('bert-base-uncased')
#bert_clf = BertWithSinusoidalPositionalEmbedding(config).to(device)
#bert_clf = BertWithALiBi()
#bert_clf = bert_clf.to(device)
#bert_clf = BertWithRoPE().to(device)

bert_clf = bert_clf.cuda()

In [24]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'439.07328M'

In [25]:
#x = torch.tensor(train_tokens_ids[:3]).to(device)
#y, pooled = bert_clf.bert(x)
#x.shape, y.shape, pooled.shape

x = torch.tensor(train_tokens_ids[:3]).to(device)
outputs = bert_clf.bert(x)  # Access outputs from the model
pooled = outputs.pooler_output  # Access the pooled output
y = outputs.last_hidden_state  # Access the sequence of hidden-states at the output of the last layer


print(x.shape, y.shape, pooled.shape)


  x = torch.tensor(train_tokens_ids[:3]).to(device)
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


torch.Size([3, 512]) torch.Size([3, 512, 768]) torch.Size([3, 768])


In [26]:
y = bert_clf(x)
y.cpu().detach().numpy()

array([[0.48952854],
       [0.42610326],
       [0.43423972]], dtype=float32)

In [27]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'3185.033728M'

In [28]:
y, x, pooled = None, None, None
torch.cuda.empty_cache()
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'1816.313344M'

FINE tune BERT

In [29]:
BATCH_SIZE = 8
EPOCHS = 10

In [30]:
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()

test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()

train_masks_tensor = train_masks
test_masks_tensor = test_masks

str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

  train_tokens_tensor = torch.tensor(train_tokens_ids)
  test_tokens_tensor = torch.tensor(test_tokens_ids)


'1816.313344M'

In [31]:
train_dataset = TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

In [32]:
param_optimizer = list(bert_clf.sigmoid.named_parameters())
optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

In [33]:
optimizer = Adam(bert_clf.parameters(), lr=3e-6)

In [34]:
torch.cuda.empty_cache()

In [35]:
for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
        #print(str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')
        logits = bert_clf(token_ids, masks)

        loss_func = nn.BCELoss()

        batch_loss = loss_func(logits, labels)
        train_loss += batch_loss.item()


        bert_clf.zero_grad()
        batch_loss.backward()


        clip_grad_norm_(parameters=bert_clf.parameters(), max_norm=1.0)
        optimizer.step()

        #clear_output(wait=True)

    print('Epoch: ', epoch_num + 1)
    print("{0}/{1} train loss: {2} ".format(step_num, len(train_data) / BATCH_SIZE, train_loss / (step_num + 1)))

    bert_clf.eval()
    bert_predicted = []
    all_logits = []
    test_loss = 0
    with torch.no_grad():
        for step_num_e, batch_data in enumerate(test_dataloader):

            token_ids, masks, labels = tuple(t.to(device) for t in batch_data)

            logits = bert_clf(token_ids, masks)
            loss_func = nn.BCELoss()
            loss = loss_func(logits, labels)
            test_loss += loss.item()
            numpy_logits = logits.cpu().detach().numpy()

            bert_predicted += list(numpy_logits[:, 0] > 0.5)
            all_logits += list(numpy_logits[:, 0])

    print("{0}/{1} val loss: {2} ".format(step_num_e, len(test_data) / BATCH_SIZE, test_loss / (step_num_e + 1)))


Epoch:  1
124/125.0 train loss: 0.6792915062904358 
12/12.5 val loss: 0.6376867523560157 
Epoch:  2
124/125.0 train loss: 0.5829170470237732 
12/12.5 val loss: 0.49626650030796343 
Epoch:  3
124/125.0 train loss: 0.4149294217824936 
12/12.5 val loss: 0.35381848422380596 
Epoch:  4
124/125.0 train loss: 0.28490878629684446 
12/12.5 val loss: 0.30162300627965194 
Epoch:  5
124/125.0 train loss: 0.20884878569841384 
12/12.5 val loss: 0.27722866374712724 
Epoch:  6
124/125.0 train loss: 0.16598083633184432 
12/12.5 val loss: 0.39180204157645887 
Epoch:  7
124/125.0 train loss: 0.1231381486132741 
12/12.5 val loss: 0.3530760524221338 
Epoch:  8
124/125.0 train loss: 0.09181714333593845 
12/12.5 val loss: 0.46327959284043085 
Epoch:  9
124/125.0 train loss: 0.0682623934186995 
12/12.5 val loss: 0.598803683793029 
Epoch:  10
124/125.0 train loss: 0.06323728488571942 
12/12.5 val loss: 0.5894786770670459 


In [36]:
bert_clf.eval()
bert_predicted = []
all_logits = []
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):

        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)

        logits = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        loss = loss_func(logits, labels)
        numpy_logits = logits.cpu().detach().numpy()

        bert_predicted += list(numpy_logits[:, 0] > 0.5)
        all_logits += list(numpy_logits[:, 0])

In [37]:
np.mean(bert_predicted)

0.57

In [38]:
print(classification_report(test_y, bert_predicted))

              precision    recall  f1-score   support

       False       0.93      0.80      0.86        50
        True       0.82      0.94      0.88        50

    accuracy                           0.87       100
   macro avg       0.88      0.87      0.87       100
weighted avg       0.88      0.87      0.87       100



In [39]:
for i in range(5):
  clear_output(wait=True)
  print('a', i+1)
  print("{0}/{1} loss: {2} ".format(i, i / 5, i / (i + 1)))

a 5
4/0.8 loss: 0.8 


Sinusoidal