In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [68]:
import re
import math
import torch
import random
import shutil
import collections
import transformers
import numpy as np
import pandas as pd
import torch.nn as nn
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from transformers import AutoTokenizer
from sklearn.metrics import roc_auc_score
from torch.nn.utils import clip_grad_norm_
from torch.optim import AdamW, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import get_cosine_schedule_with_warmup
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [8]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

### $\text{Preparing data}$

In [9]:
data = pd.read_csv('/content/drive/MyDrive/IMDB Dataset.csv')

In [10]:
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [11]:
MAX_SEQ_LEN = 64

In [12]:
# def normilize_text(seq):
#     norm = re.sub(r'[^a-zA-Z1-9\s<br/>]', '', seq)
#     norm = re.sub(r'[\s+]', ' ', norm).strip()
#     norm = re.sub(r'<br\s?/?>', ' ', norm)

#     return norm

In [13]:
# data['review'] = data['review'].apply(normilize_text)
data['sentiment'] = data['sentiment'].apply(lambda x: 0 if x == 'negative' else 1)
# data['sentiment' == 'negative'] = 0
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [69]:
print(collections.Counter(data['sentiment']))

Counter({1: 25000, 0: 25000})


In [14]:
X_train, X_test, y_train, y_test = train_test_split(data['review'],
                                                    data['sentiment'],
                                                    test_size = 0.2,
                                                    random_state = 42)

In [71]:
class IMDBDataset(Dataset):
    def __init__(self, seqs, labels, tokenizer):
        self.seqs = seqs
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.seqs)

    def __getitem__(self, ind):
        seq = self.seqs[ind]
        label = torch.tensor(self.labels[ind], dtype=torch.long)

        inf = self.tokenizer.encode_plus(
            text=seq,
            max_length = MAX_SEQ_LEN,
            add_special_tokens = True,
            truncation = True,
            padding = 'max_length',
            return_tensors = 'pt',
        )

        tokenised = inf['input_ids'].view(-1)
        att_mask = inf['attention_mask'].unsqueeze(0)
        seq_len = torch.LongTensor([torch.count_nonzero(tokenised)])

        out = {'seq':       seq,
               'tokenized': tokenised,
               'att_mask':  att_mask,
               'seq_len':   seq_len,
               'label':     label
               }

        return out

In [72]:
train_data = IMDBDataset(X_train.values,
                         y_train.values,
                         tokenizer)
test_data = IMDBDataset(X_test.values,
                        y_test.values,
                        tokenizer)

In [73]:
test_data[10]['att_mask'].size()

torch.Size([1, 1, 64])

In [74]:
train_loader = DataLoader(train_data,
                          batch_size=128,
                          shuffle=True,
                          drop_last=True)

In [75]:
test_loader = DataLoader(test_data,
                         batch_size=128,
                         shuffle=True,
                         drop_last=True)

### $\text{Model components}$

In [76]:
class Embeddings(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super(Embeddings, self).__init__()

        self.vocab_size = vocab_size
        self.emb_size = emb_size

        self.emb = nn.Embedding(self.vocab_size, self.emb_size)

    def forward(self, X):
        emb = self.emb(X)

        return emb # size = [batch_size*seq_len*emb_size]

### $\text{Next, in BERT we use Positional Encoding}$

### $$\text{PE}_{(pos, 2i)} = \sin \left(\frac{pos}{10000^{\frac{2i}{d_{\text{model}}}}}\right)$$

### $$\text{PE}_{(pos, 2i + 1)} = \cos \left(\frac{pos}{10000^{\frac{2i}{d_{model}}}}\right)$$

In [77]:
class PositionalEncoding(nn.Module):
    def __init__(self, seq_len, emb_size=512):
        super(PositionalEncoding, self).__init__()

        self.seq_len = seq_len
        self.emb_size = emb_size

        pe = torch.zeros(size=(self.seq_len, self.emb_size))

        for position in range(self.seq_len):
            for i in range(0, self.emb_size, 2):
                pe[position, i] = math.sin(position/10000**(2*i/self.emb_size))
                if i + 1 < self.emb_size:
                    pe[position, i + 1] = math.cos(position/10000**(2*i/self.emb_size))

        pe = pe.unsqueeze(0)

        self.register_buffer('pe', pe)

    def forward(self, X):
        current_seq_len = X.size(1)
        pe = self.pe[:, :current_seq_len].to(device)  # [1, current_seq_len, emb_size]

        X = X + pe

        return X

In [78]:
class Sublayer(nn.Module):
    def __init__(self, emb_size=512, dropout_p=0.2):
        super(Sublayer, self).__init__()
        """
        param emb_size: size of model
        param dropout_p: probability of dropout
        """
        self.norm = nn.LayerNorm(emb_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, X, result):
        """
        param X: input data
        param result: x which we passed through some function
        """
        out = X + self.dropout(self.norm(result))

        return out

### $\text {FFN class}$

### $$\text{FFN}(x) = \max(0, xW_{1} + b_{1})W_{2} + b_{2}$$

In [79]:
class FFN(nn.Module):
    def __init__(self, emb_size=512, scaling_factor=4, dropout_p=0.2):
        super(FFN, self).__init__()

        """
        param emb_size: size of model
        param scaling_factor: scaling_factor*emb_size = size of hidden layer
        param dropout_p: probability of dropout

        """

        self.linear = nn.Sequential(
            nn.Linear(emb_size, emb_size*scaling_factor),
            nn.ReLU(),
            nn.Linear(emb_size*scaling_factor, emb_size)
        )

        self.dropout = nn.Dropout(dropout_p)

    def forward(self, X):
        out = self.dropout(self.linear(X))

        return out

### $\text {Next, we need to realize the main idea of transformer`s model - Multihead Attention}$

### $$\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^{T}}{\sqrt{d_\text{model}}}\right)V$$

In [92]:
class MultiheadAttention(nn.Module):
    def __init__(self, emb_size=512, n_heads=8):
        super(MultiheadAttention, self).__init__()

        self.emb_size = emb_size
        self.n_heads = n_heads

        self.single_head_dim = int(self.emb_size / self.n_heads)

        self.k_matrix = nn.Linear(self.single_head_dim, self.single_head_dim)
        self.q_matrix = nn.Linear(self.single_head_dim, self.single_head_dim)
        self.v_matrix = nn.Linear(self.single_head_dim, self.single_head_dim)

        self.out = nn.Linear(self.n_heads * self.single_head_dim, self.emb_size)

        self.att_weights = None

    def forward(self, query, key, value, mask=None):
        """
        key, query, value - same embeddings, after nn.Embedding layer in encoder

        key.shape=value.shape!=query.shape in decoder

        """

        batch_size = key.shape[0]
        seq_len = key.shape[1]

        # for decoder we use another length for query as the length will be different
        query_len = query.shape[1]

        # divide into 8 heads
        query = query.view(batch_size, query_len, self.n_heads, self.single_head_dim)
        key = key.view(batch_size, seq_len, self.n_heads, self.single_head_dim)
        value = value.view(batch_size, seq_len, self.n_heads, self.single_head_dim)

        k = self.k_matrix(key)
        q = self.q_matrix(query)
        v = self.v_matrix(value)

        # transpose matrix for better multiplication in Pytorch

        q = q.transpose(1, 2) #[batch_size*n_heads*query_len*single_head_dim]
        k = k.transpose(1, 2) #[batch_size*n_heads*seq_len*single_head_dim]
        v = v.transpose(1, 2) #[batch_size*n_heads*seq_len*single_head_dim]

        product = torch.matmul(q, k.transpose(2, 3))

        # scaling
        product = product / math.sqrt(self.single_head_dim)

        # print(f'product shape of q and k in multiheadatt: {product.size()}')
        # print(f'mask shape in multiheadatt: {mask.size()}')

        # for case in which we have mask

        if mask is not None:
            product = product.masked_fill(mask == 0, float('-inf'))

        att_weights = torch.softmax(product, dim=-1)
        self.att_weights = att_weights

        scores = torch.matmul(att_weights, v) # [batch_size*n_heads*seq_len*single_head_dim]

        concat = scores.transpose(1, 2).contiguous().view(batch_size,
                                                          query_len,
                                                          self.n_heads *
                                                          self.single_head_dim)

        # print(concat.shape)
        output = self.out(concat) # [batch_size*seq_len*emb_dim]

        return output

In [93]:
class EncoderBlock(nn.Module):
    def __init__(self, emb_size=512, dropout_p=0.2, scaling_factor=4, n_heads=8):
        super(EncoderBlock, self).__init__()

        self.att = MultiheadAttention(emb_size, n_heads)
        self.sublayer = Sublayer(emb_size, dropout_p)
        self.ffn = FFN(emb_size, scaling_factor)

    def forward(self, X, src_mask):
        # print(f'Shape of X in EncoderBlock: {X.shape}')
        # print(f'src_mask shape in EncoderBlock: {src_mask.shape}')
        out = self.sublayer(X, self.att(X, X, X, src_mask))
        out = self.sublayer(out, self.ffn(out))

        return out

### $\text{Model structure}$

In [94]:
class Encoder(nn.Module):
    def __init__(self,
                 seq_len,
                 vocab_size,
                 emb_size=512,
                 dropout_p=0.2,
                 scaling_factor=4,
                 n_heads=8,
                 num_layers=6):
        super(Encoder, self).__init__()

        self.emb = Embeddings(vocab_size, emb_size)
        self.pe = PositionalEncoding(seq_len, emb_size)
        self.enc_layers = nn.ModuleList([
            EncoderBlock(emb_size, dropout_p, scaling_factor, n_heads)
            for _ in range(num_layers)
        ])

    def forward(self, X, mask):
        embeded = self.emb(X)
        out = self.pe(embeded)

        for layer in self.enc_layers:
            out = layer(out, mask)

        return out

In [95]:
class BERT(nn.Module):
    def __init__(self, encoder, emb_size, vocab_size, n_labels):
        super(BERT, self).__init__()

        self.encoder = encoder

        # token prediction
        self.generator = nn.Sequential(
            nn.Linear(emb_size, emb_size),
            nn.GELU(),
            nn.Linear(emb_size, vocab_size)
        )

        # classification
        self.classifier = nn.Sequential(
            nn.Linear(emb_size, n_labels)
        )

    def forward(self, X, att_mask):
        out = self.encoder(X, att_mask)

        generator_out = self.generator(out)
        classifier_out = self.classifier(out[:, 0, :])

        return generator_out, classifier_out

In [96]:
emb_size = 512
n_heads = 8
vocab_size = tokenizer.vocab_size
dropout_p = 0.2
scaling_factor = 4
num_layers = 6
n_labels = 2

In [129]:
encoder = Encoder(seq_len=MAX_SEQ_LEN,
                  vocab_size=vocab_size,
                  emb_size=emb_size,
                  dropout_p=dropout_p,
                  scaling_factor=scaling_factor,
                  n_heads=n_heads,
                  num_layers=num_layers).to(device)
model = BERT(encoder=encoder,
             emb_size=emb_size,
             vocab_size=vocab_size,
             n_labels=n_labels).to(device)

### $\text{Initialize weights}$

In [130]:
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

model

BERT(
  (encoder): Encoder(
    (emb): Embeddings(
      (emb): Embedding(30522, 512)
    )
    (pe): PositionalEncoding()
    (enc_layers): ModuleList(
      (0-5): 6 x EncoderBlock(
        (att): MultiheadAttention(
          (k_matrix): Linear(in_features=64, out_features=64, bias=True)
          (q_matrix): Linear(in_features=64, out_features=64, bias=True)
          (v_matrix): Linear(in_features=64, out_features=64, bias=True)
          (out): Linear(in_features=512, out_features=512, bias=True)
        )
        (sublayer): Sublayer(
          (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (ffn): FFN(
          (linear): Sequential(
            (0): Linear(in_features=512, out_features=2048, bias=True)
            (1): ReLU()
            (2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
    )
  )
  (generat

### $\text{Let`s create a masking funcion for MLM task}$

In [131]:
def mask_tokens(batch):
    tokenized = batch['tokenized']
    labels = batch['label']
    seq_lens = batch['seq_len']
    n_seq = batch['tokenized'].size(0)

    n_masked_tokens = [math.ceil(seq_len.item()*0.15) for seq_len in seq_lens]
    masked_position = [random.sample(range(1, seq_lens[i].item()-1), n_masked_tokens[i])
                       if seq_lens[i] > 4 else []
                       for i in range(n_seq)]
    # Тензор для того, чтобы хранить только замаскированные токены
    masked_tokens = torch.full_like(batch["tokenized"], -100)
    for i in range(n_seq):
        # batch["att_mask"][i, 0, 0, masked_position[i]] = 0
        masked_tokens[i, masked_position[i]] = batch["tokenized"][i, masked_position[i]]
        batch["tokenized"][i, masked_position[i]] = tokenizer.mask_token_id

    return batch, masked_tokens

In [132]:
epochs = 5
lr = 1e-5

In [133]:
mlm_criterion = nn.CrossEntropyLoss()
clf_criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=lr)

### $\text{Training process}$

In [134]:
model.train()

for epoch in range(epochs):
    epoch_mlm_loss = []
    epoch_clf_loss = []
    for batch in tqdm(train_loader):
        batch, masked_tokens = mask_tokens(batch)

        # print(batch['tokenized'].size())
        # print(batch['att_mask'].size())
        gen_out, clf_out = model(batch['tokenized'].to(device),
                                 batch['att_mask'].to(device))
        # print(clf_out.shape)

        gen_loss = mlm_criterion(gen_out.view(-1, vocab_size),
                             masked_tokens.view(-1).to(device))
        clf_loss = clf_criterion(clf_out,
                            batch['label'].view(-1).to(device))

        loss = gen_loss + clf_loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        epoch_mlm_loss.append(gen_loss.item())
        epoch_clf_loss.append(clf_loss.item())

    print(f'Num of epoch: {epoch}')
    print(f'epoch_mlm_loss = {np.mean(epoch_mlm_loss)}')
    print(f'epoch_clf_loss = {np.mean(epoch_clf_loss)}')

100%|██████████| 312/312 [03:36<00:00,  1.44it/s]


Num of epoch: 0
epoch_mlm_loss = 7.9160190881826935
epoch_clf_loss = 1.788615046785428


100%|██████████| 312/312 [03:36<00:00,  1.44it/s]


Num of epoch: 1
epoch_mlm_loss = 6.720668208904756
epoch_clf_loss = 1.3103017050486345


100%|██████████| 312/312 [03:35<00:00,  1.45it/s]


Num of epoch: 2
epoch_mlm_loss = 6.701140767488724
epoch_clf_loss = 1.0729208242816803


100%|██████████| 312/312 [03:35<00:00,  1.45it/s]


Num of epoch: 3
epoch_mlm_loss = 6.7010243107111025
epoch_clf_loss = 0.9582189271847407


100%|██████████| 312/312 [03:35<00:00,  1.45it/s]

Num of epoch: 4
epoch_mlm_loss = 6.6848666148308
epoch_clf_loss = 0.8932566510943266





### $\text{Let`s validate or model}$

In [143]:
model.eval()
with torch.no_grad():
    test_mlm_loss = []
    test_clf_loss = []

    for batch in tqdm(test_loader):
        batch, masked_tokens = mask_tokens(batch)

        gen_out, clf_out = model(batch['tokenized'].to(device),
                                 batch['att_mask'].to(device))

        gen_loss = mlm_criterion(gen_out.view(-1, vocab_size),
                                 masked_tokens.view(-1).to(device))
        clf_loss = clf_criterion(clf_out,
                                 batch['label'].view(-1).to(device))

        test_mlm_loss.append(gen_loss.item())
        test_clf_loss.append(clf_loss.item())

    print('\n')
    print(f"Validation MLM loss: {np.mean(test_mlm_loss):.4f}")
    print(f"Validation CLS loss: {np.mean(test_clf_loss):.4f}")

100%|██████████| 78/78 [00:28<00:00,  2.74it/s]



Validation MLM loss: 6.6763
Validation CLS loss: 0.7020





### $\text{Inference}$

In [144]:
model.eval()

with torch.no_grad():
    msg = 'It`s a good film, I watched it twice'

    encoded_msg = tokenizer.encode_plus(text=msg,
                                        max_length=MAX_SEQ_LEN,
                                        add_special_tokens=True,
                                        truncation=True,
                                        padding="max_length",
                                        return_attention_mask=True,
                                        return_tensors="pt")

    _, cls_out = model(encoded_msg["input_ids"].to(device),
                       encoded_msg["attention_mask"].to(device))

    cls_out = cls_out.argmax(dim=-1).item()

    print(f"msg: {msg}")
    print(f"label: 1")
    print(f"predict: {cls_out}")

msg: It`s a good film, I watched it twice
label: 1
predict: 1
