In [1]:
#https://github.com/harenlin/IMDB-Sentiment-Analysis-Using-BERT-Fine-Tuning/blob/main/BERT_Fine_Tune.ipynb
#https://www.kaggle.com/code/atulanandjha/bert-testing-on-imdb-dataset-extensive-tutorial
#https://github.com/jlealtru/website_tutorials/blob/main/notebooks/BigBird%20text%20classification.ipynb
!pip -q install transformers

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset, SubsetRandomSampler
import torch.optim as opt
from transformers import ElectraTokenizer, ElectraModel
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix, auc, roc_curve
from copy import copy, deepcopy
import zipfile
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
#os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # specify which GPU(s) to be used
#torch.backends.cudnn.benchmark = True'''

Mounted at /content/drive


In [3]:
zf = zipfile.ZipFile('/content/drive/My Drive/Colab Notebooks/text classification/imdb 50k/archive.zip')
df = pd.read_csv(zf.open('IMDB Dataset.csv'))
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [None]:
seq_len = [len(sent) for sent in df['review']]
fig = go.Figure()
#fig.add_trace(go.Scatter(x=np.arange(len(df)), y=seq_len, mode='markers', name='Seq len'))
fig.add_trace(go.Scatter(x=np.arange(len(df)), y=[np.mean(seq_len)]*len(seq_len), mode='lines', name='Avg seq len'))
fig.add_trace(go.Scatter(x=np.arange(len(df)), y=[np.median(seq_len)]*len(seq_len), mode='lines', name='Med seq len'))
fig.show()

In [13]:
tokenizer = ElectraTokenizer.from_pretrained("google/electra-base-discriminator")

'''example_text = ['I will watch Memento tonight']
bert_input = tokenizer(example_text,padding='max_length', max_length = 10, truncation=True, return_tensors="pt")
tokenizer.decode(bert_input.input_ids[1])'''
text_column, out_column = 'review', 'sentiment'
labels = dict(zip(df[out_column].unique(), range(df[out_column].nunique())))
df.replace({out_column: labels}, inplace=True)
df_train, df_valid, df_test = np.split(df.sample(frac=1, random_state=42),  [int(.8*len(df)), int(.9*len(df))])
df_train.shape, df_valid.shape, df_test.shape

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

((40000, 2), (5000, 2), (5000, 2))

In [None]:
len(df[text_column].values[26582]), df[text_column].values[26582]

In [None]:
tmp = tokenizer([df[text_column].values[26582]], padding = 'max_length', truncation=True, return_tensors='pt')
tmp['input_ids'].shape, tmp['input_ids'], tmp

In [None]:
tokenizer.decode(tmp['input_ids'][0])

In [None]:
df_train

Unnamed: 0,review,sentiment
33553,I really liked this Summerslam due to the look...,0
9427,Not many television shows appeal to quite as m...,0
199,The film quickly gets to a major chase scene w...,1
12447,Jane Austen would definitely approve of this o...,0
39489,Expectations were somewhat high for me when I ...,1
...,...,...
1559,"This kind of ""inspirational"" saccharine is eno...",1
13313,"When people nowadays hear of a 1940s drama, th...",0
13528,This is a low budget Roger Corman horror/creat...,1
25017,"First off, let it be known that I came into th...",0


In [16]:
class SeqClsLoader(Dataset): #Sequence classification dataloader
    def __init__(self, encoded_data, labels):
        self.labels = labels
        self.encoded_data = encoded_data

    def classes(self):
        return self.labels
 
    def __len__(self):
        return len(self.encoded_data['input_ids'])
 
    def __getitem__(self, idx):
        print(idx)
        return self.encoded_data['input_ids'][idx], self.encoded_data['attention_mask'][idx], self.labels[idx]

#encoded_train = tokenizer(df_train[text_column].tolist(), padding = 'max_length', truncation=True, return_tensors='pt')
train_load = DataLoader(SeqClsLoader(encoded_train,  df_train[out_column].values.astype(np.float32)[:,None]), batch_size=2, sampler=SubsetRandomSampler(np.arange(len(df_train))))  # DATALOADER obj
for i, (ids, mask, Y) in enumerate(train_load):
  print(i, ids.shape, mask.shape, Y.shape)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
24295
16885
521 torch.Size([2, 512]) torch.Size([2, 512]) torch.Size([2, 1])
21296
23235
522 torch.Size([2, 512]) torch.Size([2, 512]) torch.Size([2, 1])
2172
32545
523 torch.Size([2, 512]) torch.Size([2, 512]) torch.Size([2, 1])
27451
16740
524 torch.Size([2, 512]) torch.Size([2, 512]) torch.Size([2, 1])
39308
22340
525 torch.Size([2, 512]) torch.Size([2, 512]) torch.Size([2, 1])
13511
9167
526 torch.Size([2, 512]) torch.Size([2, 512]) torch.Size([2, 1])
17808
24324
527 torch.Size([2, 512]) torch.Size([2, 512]) torch.Size([2, 1])
15345
36768
528 torch.Size([2, 512]) torch.Size([2, 512]) torch.Size([2, 1])
34236
2663
529 torch.Size([2, 512]) torch.Size([2, 512]) torch.Size([2, 1])
14862
5686
530 torch.Size([2, 512]) torch.Size([2, 512]) torch.Size([2, 1])
13045
6130
531 torch.Size([2, 512]) torch.Size([2, 512]) torch.Size([2, 1])
2278
16081
532 torch.Size([2, 512]) torch.Size([2, 512]) torch.Size([2, 1])
25967
32112
533 t

KeyboardInterrupt: ignored

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class SeqClsLoader(Dataset): #Sequence classification dataloader
    def __init__(self, encoded_data, labels):
        self.labels = labels
        self.encoded_data = encoded_data

    def classes(self):
        return self.labels
 
    def __len__(self):
        return len(self.encoded_data['input_ids'])
 
    def __getitem__(self, idx):
        return self.encoded_data['input_ids'][idx], self.encoded_data['attention_mask'][idx], self.labels[idx] if self.labels is not None else None

class NNModel(nn.Module):
    def __init__(self, input_shape, units=None, factors=None, activ=True, norm=False, dropout=False, slops=None):
        super().__init__()
        self.input_shape = input_shape
        self.units = units
        self.factors = factors
        self.activ, self.norm = activ, norm
        self.network = nn.ModuleList()
        if self.factors:
            self.units = np.round(self.input_shape * np.asarray(self.factors)).astype(int)
        if self.units is not None:
            self.dropout = np.zeros_like(self.units) if not dropout else dropout
            self.slops = np.full(len(self.units), 1) if slops is None else slops
            for i, j, k in zip(self.units, self.dropout, self.slops):
                if i >= 1:
                    block = self.__build_block__(input_shape, i, p=j, slop=k)
                    self.network.extend(block)
                    input_shape = i
        self.output_shape = input_shape
        self.reset_parameters()
    
    def __build_block__(self, input_shape, units, p, slop):
        block = []
        block.append(nn.Linear(input_shape, units, bias=not self.norm))
        if self.norm:
            block.append(nn.BatchNorm1d(units))
            #block.append(nn.LayerNorm(units, eps=1e-5))
        if self.activ:
            #block.append(nn.LeakyReLU())
            block.append(nn.ELU(slop))
            #block.append(nn.GELU())
        if p > 0:
            block.append(nn.Dropout(p))
        return block
 
    def forward(self, x):
        for layer in self.network:
          tmp = layer(x)
          x = tmp
        return x
 
    def reset_parameters(self):
        for layer in self.network:
            if isinstance(layer, nn.Linear):
                nn.init.xavier_normal_(layer.weight)
                layer.bias.data.fill_(0.1)
 

class TransSeqClassifier(nn.Module):
    def __init__(self, model, emb_dim, mlp_units, mlp_dropout, nb_class):
        super(TransSeqClassifier, self).__init__()
        self.model = model
        self.mlp = NNModel(emb_dim, units=mlp_units, factors=None, dropout=[mlp_dropout]*len(mlp_units)) if mlp_units is not None else None
        cls_units = self.mlp.output_shape if mlp_units is not None else emb_dim
        self.classifier = nn.Linear(cls_units, nb_class)

    def forward(self, input_id, mask):
        _, pooled_output = self.model(input_ids= input_id, attention_mask=mask,return_dict=False)
        z = self.mlp(pooled_output) if self.mlp is not None else pooled_output
        pred = self.classifier(z)
        return z, pooled_output, pred

class BaseSeqClassifier:
    def __init__(self, model, tokenizer):
        self.model = model.to(device)
        self.losses = {'Epoch': [], 'Train': [], 'Test': [], 'BState': [], 'LState': [], 'LR': []}
        self.tokenizer = tokenizer
 
    def train_model(self, optim, train_loader, grad_clip, l2_reg):
          total_loss = 0
          self.model = self.model.train()
        #with autograd.detect_anomaly():
          for i, (ids, mask, Y) in enumerate(train_loader):
              ids, mask, Y = ids.to(device), mask.to(device), Y.to(device)
              #self.model.get_weight()
              optim.zero_grad()
              loss = self.loss_function(ids, mask, Y, l2_reg)
              loss.backward()
              torch.nn.utils.clip_grad_norm_(self.model.parameters(), grad_clip)
              optim.step()
              total_loss += loss.item()
          return total_loss/(i+1)
        
 
    def eval_model(self, test_loader):
        self.model = self.model.eval()
        total_loss = 0
        for i, (ids, mask, Y) in enumerate(test_loader):
            ids, mask, Y = ids.to(device), mask.to(device), Y.to(device)
            loss = self.loss_function(ids, mask, Y, l2_reg=0)
            total_loss += loss.item()
        return total_loss/(i+1)#np.abs(-100. - total_loss)
 
    def fit(self, X_train, Y_train, epoch, lr, opt_kwarg, batch_size=None,  grad_clip=100, momentum=0.9, X_test=None, Y_test=None, l2_reg=0, verbose=True, save=True):
        batch_size = len(Y_train) if batch_size is None else batch_size
        encoded_train = self.tokenizer(X_train, padding = 'max_length', truncation=True, return_tensors='pt')
        train_load = DataLoader(SeqClsLoader(encoded_train, Y_train), batch_size=batch_size, shuffle=True)  # DATALOADER obj
        if X_test is not None:
            encoded_test = self.tokenizer(X_test, padding = 'max_length', truncation=True, return_tensors='pt')
            test_load = DataLoader(SeqClsLoader(encoded_test, Y_test), batch_size=batch_size, shuffle=True)
 
        best_loss = 1e100
        optim = opt.Adam(self.model.parameters(), lr=lr)
        #optim = opt.SGD(self.model.parameters(), lr=lr, momentum=momentum, nesterov=True)

        scheduler = None
        #scheduler = opt.lr_scheduler.CyclicLR(optim, **opt_kwarg)
        #scheduler = opt.lr_scheduler.ReduceLROnPlateau(optim, **opt_kwarg)
        #scheduler = opt.lr_scheduler.MultiStepLR(optim, milestones=[28, 120], gamma=0.1)

        eval_score = ''
        for i in range(epoch):
            if verbose:
                print('##### EPOCH ' + str(i) + ' #####')
               
            train_loss = self.train_model(optim, train_load, grad_clip, l2_reg)
            self.losses['LState'] = deepcopy(self.model.state_dict())
    
            if verbose:
                print('train loss : ', train_loss)
            self.losses['Epoch'].append(i), self.losses['Train'].append(train_loss)
    
            if df_test is not None:
                valid_loss = self.eval_model(test_load)

                if verbose:
                    print('test loss : ', valid_loss)
                self.losses['Test'].append(valid_loss)
    
                if scheduler is not None:
                    scheduler.step(valid_loss)
                    self.losses['LR'].append(optim.param_groups[0]['lr'])
                    '''scheduler.step()
                    self.losses['LR'].append(scheduler.get_last_lr()[0])'''
    
                if valid_loss < best_loss:
                    self.losses['BState'] = deepcopy(self.model.state_dict())
                    best_loss = valid_loss
                    print('===========SAVE===========')


class Binaryclass(BaseSeqClassifier):#Binaryclass classification
    def __init__(self, model, tokenizer):
        super(Binaryclass, self).__init__(model, tokenizer,)

    def loss_function(self, input_id, mask, Y, l2_reg):
        _, _, pred = self.model(input_id, mask)
        bce_loss = nn.BCEWithLogitsLoss()
        loss = bce_loss(pred, Y)
        return loss

    def prdict(self, X, batch_size):
        self.model.eval()
        encoded_data = self.tokenizer(X, add_special_tokens=True, return_attention_mask=True, pad_to_max_length=True, max_length=512, return_tensors='pt')
        data_load = DataLoader(TensorDataset(encoded_data['input_ids'],  encoded_data['attention_mask']),batch_size=batch_size)
        outputs = {'z': [], 'pooled_output': [], 'pred': []}
        for i, (ids, mask) in enumerate(data_load):
            ids, mask = ids.to(device), mask.to(device)
            z, pooled_output, pred = self.model(ids, mask)
            pred = nn.Sigmoid()(pred)
            z, pooled_output, pred = z.cpu().data.numpy(), pooled_output.cpu().data.numpy(), pred.cpu().data.numpy()
            outputs['z'].extend(z), outputs['pooled_output'].extend(pooled_output), outputs['pred'].extend(pred)
        return outputs

class Multiclass(BaseSeqClassifier):#multiclass classification
    def __init__(self, model, tokenizer):
        super(Multiclass, self).__init__(model, tokenizer,)

    def loss_function(self, input_id, mask, Y, l2_reg):
        _, _, pred = self.model(input_id, mask)
        ce_loss = nn.CrossEntropyLoss()
        loss = ce_loss(pred, Y)
        return loss

    def prdict(self, X, batch_size):
        self.model.eval()
        encoded_data = self.tokenizer(X, add_special_tokens=True, return_attention_mask=True, pad_to_max_length=True, max_length=512, return_tensors='pt')
        data_load = DataLoader(TensorDataset(encoded_data['input_ids'],  encoded_data['attention_mask']),batch_size=batch_size)
        outputs = {'z': [], 'pooled_output': [], 'pred': []}
        for i, (ids, mask) in enumerate(data_load):
            ids, mask = ids.to(device), mask.to(device)
            z, pooled_output, pred = self.model(ids, mask)
            pred = nn.Softmax()(pred)
            z, pooled_output, pred = z.cpu().data.numpy(), pooled_output.cpu().data.numpy(), pred.cpu().data.numpy()
            outputs['z'].extend(z), outputs['pooled_output'].extend(pooled_output), outputs['pred'].extend(pred)
        return outputs

def gradient_clipper(model: nn.Module, val: float) -> nn.Module:
    def process_grad(grad):
        grad[grad != grad] = 1e-10
        return torch.clamp(grad, -val, val)
    for parameter in model.parameters():
        parameter.register_hook(lambda grad: process_grad(grad))
    
    return model

In [None]:
epoch, lr, batch_size, d, mlp_d = 50000, 1e-6, 2, 0.000001, 1e-6
#cyclic_kwarg = {'base_lr': lr, 'max_lr': 1e-2, 'step_size_up':200, 'step_size_down':200}
plateau_kwarg = {'factor':0.5, 'patience':200, 'verbose':True, 'min_lr':1e-7, 'mode':'min'}

bigbird = BigBirdModel.from_pretrained("google/bigbird-roberta-base")
model = TransSeqClassifier(bigbird, 768, mlp_units=None, mlp_dropout=1e-6, nb_class=1)
model = gradient_clipper(model, 10)
#nn_model.load_state_dict(best_state)
print(device)
print(model)
print(sum(p.numel() for p in model.parameters() if p.requires_grad))
seq_bc = Binaryclass(model, tokenizer)
seq_bc.fit(df_train[text_column].tolist(), df_train[out_column].values.astype(np.float32)[:,None], epoch, lr, plateau_kwarg, batch_size=batch_size, grad_clip=10, momentum=0.9,
        X_test=df_valid[text_column].tolist(), Y_test=df_valid[out_column].values.astype(np.float32)[:,None], l2_reg=0, verbose=True)

Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BigBirdModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


cuda
TransSeqClassifier(
  (model): BigBirdModel(
    (embeddings): BigBirdEmbeddings(
      (word_embeddings): Embedding(50358, 768, padding_idx=0)
      (position_embeddings): Embedding(4096, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BigBirdEncoder(
      (layer): ModuleList(
        (0): BigBirdLayer(
          (attention): BigBirdAttention(
            (self): BigBirdBlockSparseAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): BigBirdSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
             

OutOfMemoryError: CUDA out of memory. Tried to allocate 180.00 MiB (GPU 0; 14.76 GiB total capacity; 13.22 GiB already allocated; 89.75 MiB free; 13.90 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
if torch.cuda.device_count() > 1:
  print("Let's use", torch.cuda.device_count(), "GPUs!")

Let's use 2 GPUs!


In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

best_state = deepcopy(bert_bc.losses['BState'])
bert_bc.model.load_state_dict(best_state)
print(np.min(bert_bc.losses['Test']))

fig = make_subplots(rows=3, cols=1)
s = 0
fig.append_trace(go.Scatter(x=bert_bc.losses['Epoch'][s:], y=bert_bc.losses['Train'][s:],mode='lines',name='Train'), row=1, col=1)
fig.append_trace(go.Scatter(x=bert_bc.losses['Epoch'][s:], y=bert_bc.losses['Test'][s:],mode='lines',name='Test'), row=2, col=1)
fig.append_trace(go.Scatter(x=bert_bc.losses['Epoch'][s:], y=bert_bc.losses['LR'][s:],mode='lines',name='LR'), row=3, col=1)
fig.update_layout(height=1000, width=1500, title_text="Stacked Subplots")
fig.show()

In [None]:
output = bert_bc.prdict(df_valid[text_column].values, 2)
pred = np.round(np.asarray(output['pred']))
auc = roc_auc_score(df_valid[out_column].values, pred)
pd.DataFrame({'AUC': auc, 'ACC': accuracy_score(df_valid[out_column].values, np.round(pred)), 
              'PRE': precision_score(df_valid[out_column].values, np.round(pred)), 'REC': recall_score(df_valid[out_column].values, np.round(pred)), 
              'F1':f1_score(df_valid[out_column].values, np.round(pred))}, index=[0])

In [None]:
output = bert_bc.prdict(df_test[text_column].values, 2)
pred = np.round(np.asarray(output['pred']))
auc = roc_auc_score(df_test[out_column].values, pred)
pd.DataFrame({'AUC': auc, 'ACC': accuracy_score(df_test[out_column].values, np.round(pred)), 
              'PRE': precision_score(df_test[out_column].values, np.round(pred)), 'REC': recall_score(df_test[out_column].values, np.round(pred)), 
              'F1':f1_score(df_test[out_column].values, np.round(pred))}, index=[0])

In [None]:
confusion_matrix(df_test[out_column].values, pred)

In [None]:
pd.concat((df_test.reset_index(drop=True), pd.DataFrame({'Pred': pred.ravel()})), axis=1)

In [None]:
tmp = tokenizer.batch_encode_plus([df_test[text_column].values[89]], add_special_tokens=True, return_attention_mask=True, pad_to_max_length=True, max_length=512, return_tensors='pt')
tmp

In [None]:
tokenizer.decode(tmp['input_ids'][0])