In [1]:
import datasets
from datasets import load_dataset_builder, load_dataset
import os

  from .autonotebook import tqdm as notebook_tqdm


### Step 1 : Load Dataset
- Simple dataset for sentiment analysis

In [2]:
# Load the sentiment dataset; later cells read its 'train' split ('text' and 'label' columns).
sentiment_dataset = load_dataset("javalove93/sentiment-analysis-dataset")

In [3]:
import re
# First training example, used to try out sentence splitting.
text = sentiment_dataset['train']['text'][0]

# Split by both '.' and '!'
# (no capturing group, so the terminators themselves are discarded)
sentences = re.split(r'[.!]', text)
# Drop the last piece, which is empty when the text ends with a terminator.
sentences[:-1]

# Full label column of the training split.
sentiment_dataset['train']['label']

['positive',
 'negative',
 'negative',
 'positive',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'negative',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative']

**Matching label strategy**
- Split each sequence into single sentences.
- Regroup sentences that share the same label to create pairs: the focus is on the relationship between the paired sentences (even if they were not adjacent in the original dataset)!
- There are 2 sentiments here: positive or negative.
- Pairs should be a list of tuples (or a list of lists).

In [4]:
from typing import List, Tuple # for cool decorations of functions
MAX_LEN = 64

# Splitting initial sequences
def create_sequences(dataset: "datasets.DatasetDict") -> Tuple[List[str], List[str]]:
    """Split every training text into sentences and bucket them by sentiment.

    Each text of ``dataset['train']`` is cut after every '.' or '!' and the
    terminator is kept attached to its sentence. Text that follows the last
    terminator (or a text containing no terminator at all) is kept as a
    sentence of its own unless it is only whitespace.

    Args:
        dataset: mapping with a 'train' entry exposing parallel 'text' and
            'label' columns.

    Returns:
        ``(seq_pos, seq_neg)``: sentences from texts labelled 'positive',
        and sentences from texts carrying any other label.
    """
    sequences = dataset['train']
    seq_pos: List[str] = []
    seq_neg: List[str] = []
    for sequence, label in zip(sequences['text'], sequences['label']):
        # With a capturing group re.split returns
        # [text, terminator, text, terminator, ..., tail], i.e. an odd-length list.
        parts = re.split(r'([.!])', sequence)
        sentences = [parts[i] + parts[i + 1] for i in range(0, len(parts) - 1, 2)]

        # The tail is '' when the text ends with a terminator; otherwise it is
        # an unterminated final sentence that must not be lost.
        tail = parts[-1]
        if tail.strip():
            sentences.append(tail)

        if label == 'positive':
            seq_pos += sentences
        else:
            seq_neg += sentences
    return seq_pos, seq_neg

# Sentences of the training split, separated into positive and non-positive lists.
seq_pos, seq_neg = create_sequences(sentiment_dataset)

# Generating pairs
def generate_pairs(sequences: list) -> list:
    """Pair each element with its immediate successor.

    For ``[s0, s1, s2]`` the result is ``[(s0, s1), (s1, s2)]``; inputs with
    fewer than two elements give an empty list.
    """
    heads = sequences[:-1]
    tails = sequences[1:]
    return list(zip(heads, tails))

# Pairs are built separately per sentiment list, so both members of a pair
# always come from the same list.
pairs_pos = generate_pairs(seq_pos)
pairs_neg = generate_pairs(seq_neg)

# `sentences` is later passed to create_batches; `pairs` is later passed to BERTDataset.
sentences = seq_pos + seq_neg
pairs = pairs_pos + pairs_neg
pairs

[('I love this movie!', " It's amazing."),
 (" It's amazing.", 'What a great experience!'),
 ('What a great experience!', ' Highly recommended.'),
 (' Highly recommended.', "This is the best book I've ever read."),
 ("This is the best book I've ever read.", "I'm so happy with my purchase!"),
 ("I'm so happy with my purchase!", 'I had a fantastic time!'),
 ('I had a fantastic time!', 'Absolutely loved it!'),
 ('Absolutely loved it!', 'This is incredible!'),
 ('This is incredible!', "I'm very impressed with the performance."),
 ("I'm very impressed with the performance.", "I can't wait to try it again!"),
 ("I can't wait to try it again!", 'Excellent service and friendly staff.'),
 ('Excellent service and friendly staff.',
  'Highly satisfied with the results.'),
 ('Highly satisfied with the results.', 'This is a must-see!'),
 ('This is a must-see!', 'It was a wonderful evening.'),
 ('It was a wonderful evening.', 'I highly recommend this service.'),
 ('I highly recommend this service.',

### Step 2 : Tokenization
- Using WordPiece tokenizer to produce BERT inputs

In [5]:
import tqdm
from tokenizers import BertWordPieceTokenizer
from pathlib import Path
from transformers import BertTokenizer
# Creating batches: number of sentences written to each text file by create_batches
batch_size = 30


def create_batches(batch_size : int, sentences : list, data_dir : str = './data') -> None:
    """Write `sentences` to numbered text files, `batch_size` sentences per file.

    Files are named ``text_0.txt``, ``text_1.txt``, ... inside `data_dir`,
    UTF-8 encoded, one sentence per line.

    Args:
        batch_size: number of sentences per file.
        sentences: iterable of strings to write.
        data_dir: output directory; created if it does not exist yet.
    """
    out_dir = Path(data_dir)
    # Writing a file does not create missing parent directories, so make sure
    # the target exists (a fresh kernel / fresh checkout has no ./data).
    out_dir.mkdir(parents=True, exist_ok=True)

    def flush(lines: list, file_index: int) -> None:
        # One sentence per line, no trailing newline.
        (out_dir / f'text_{file_index}.txt').write_text('\n'.join(lines), encoding='utf-8')

    text_data = []
    file_count = 0
    for sentence in tqdm.tqdm(sentences):
        text_data.append(sentence)

        if len(text_data) == batch_size:
            flush(text_data, file_count)
            text_data = []
            file_count += 1

    # Write the remainder only when there is one, so that a sentence count
    # that is a multiple of batch_size does not leave an empty trailing file.
    if text_data:
        flush(text_data, file_count)

# Dump the sentences to ./data/text_<n>.txt files.
create_batches(batch_size,sentences)

# Every .txt file found under ./data (searched recursively) is used as tokenizer training input.
paths = [str(x) for x in Path('./data').glob('**/*.txt')]

# Training the tokenizer
# A separate name is used for the trainable WordPiece tokenizer so that
# `tokenizer` only ever refers to the final BertTokenizer used downstream.
wordpiece_tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=True
)

wordpiece_tokenizer.train(
    files=paths,
    vocab_size=30_000,
    min_frequency=5,
    limit_alphabet=1000,
    wordpieces_prefix='##',
    special_tokens=['[PAD]', '[CLS]', '[SEP]', '[MASK]', '[UNK]']
    )

# The vocabulary is written into ./bert-it-1; create the directory idempotently
# so the cell works both on a fresh checkout and when re-run.
os.makedirs('./bert-it-1', exist_ok=True)
wordpiece_tokenizer.save_model('./bert-it-1', 'bert-it')
tokenizer = BertTokenizer.from_pretrained('./bert-it-1/bert-it-vocab.txt', local_files_only=True)

# Quick sanity check: token ids for a short sentence
# (tokenizer.decode(enc) maps them back to text).
enc = tokenizer.encode("Hello friends")
enc

100%|██████████| 59/59 [00:00<00:00, 109935.11it/s]







[1, 16, 44, 43, 43, 48, 14, 88, 101, 47, 35, 2]

### Step 3 : Sequence Embedding
- Transform sentence pairs into a uniform, fixed-length representation that can be used as BERT inputs

In [6]:
import random
import torch
from torch.utils.data import Dataset


class BERTDataset(Dataset):
    """Sentence-pair dataset producing masked-token and next-sentence training samples.

    Each item is built from one ``(sentence_a, sentence_b)`` pair: with
    probability ~0.5 the second sentence is replaced by the second sentence of
    a randomly chosen pair (``is_next = 0``), tokens are randomly masked, and
    everything is truncated / padded to ``max_len``.

    Args:
        tokenizer: callable returning ``{'input_ids': [...]}`` (first and last
            ids are stripped as [CLS]/[SEP]) and exposing a ``vocab`` mapping
            with '[PAD]', '[CLS]', '[SEP]' and '[MASK]' entries.
        data_pair: list of ``(sentence_a, sentence_b)`` string pairs.
        max_len: fixed length of every returned sequence.
    """

    def __init__ (self, tokenizer, data_pair: List[Tuple[str, str]], max_len: int):
        self.tokenizer = tokenizer
        self.lines = data_pair
        self.max_len = max_len

    def __getitem__(self,index: int) -> dict:
        """Build one sample; the result is random, so two calls with the same index differ."""
        t1, t2, is_next_label = self.get_seq(index)

        t1_token_ids = self.tokenizer(t1)['input_ids'][1:-1] # remove CLS and SEP
        t2_token_ids = self.tokenizer(t2)['input_ids'][1:-1]

        t1_random, t1_label = self.random_word(t1_token_ids)
        t2_random, t2_label = self.random_word(t2_token_ids)

        vocab = self.tokenizer.vocab
        cls_id, sep_id, pad_id = vocab['[CLS]'], vocab['[SEP]'], vocab['[PAD]']

        # Layout: [CLS] sentence_a [SEP] sentence_b [SEP]
        t1 = [cls_id] + t1_random + [sep_id]
        t2 = t2_random + [sep_id]
        # Special-token positions carry the [PAD] id as label.
        t1_label = [pad_id] + t1_label + [pad_id]
        t2_label = t2_label + [pad_id]

        segment_label = ([1] * len(t1) + [2] * len(t2))[:self.max_len]
        bert_input = (t1 + t2)[:self.max_len]
        bert_label = (t1_label + t2_label)[:self.max_len]

        n_pad = self.max_len - len(bert_input)
        bert_input += [pad_id] * n_pad
        bert_label += [pad_id] * n_pad
        # Segment ids live in their own id space (1 = first sentence,
        # 2 = second sentence), so padding is the literal 0, not the [PAD] token id.
        segment_label += [0] * n_pad

        return {"bert_input": torch.tensor(bert_input),
                  "bert_label": torch.tensor(bert_label),
                  "segment_label": torch.tensor(segment_label),
                  "is_next": torch.tensor(is_next_label)}

    def __len__(self) -> int:
        return len(self.lines)

    def random_word(self,token_ids: list) -> Tuple[list, list]:
        """Randomly corrupt tokens for the masked-token objective.

        Each token is selected with probability ~0.15; a selected token is
        replaced by [MASK] (~80%), by a random vocabulary id (~10%) or left
        unchanged (~10%).

        Args:
            token_ids: token ids of one sentence; the list is not modified.

        Returns:
            ``(corrupted_ids, labels)`` where ``labels[i]`` is the original id
            at selected positions and 0 elsewhere.
        """
        output_label_id = [0] * len(token_ids) # real output for masked info
        output_token_id = list(token_ids) # masked output (copy: never mutate the caller's list)

        mask_id = self.tokenizer.vocab['[MASK]']
        vocab_size = len(self.tokenizer.vocab)

        for i,token_id in enumerate(token_ids):
            if random.random() > 0.15:
                continue

            p = random.random()
            if p <= 0.8:
                output_token_id[i] = mask_id
            elif p <= 0.9:
                output_token_id[i] = random.randrange(vocab_size)
            # else: keep the original token, but still predict it

            output_label_id[i] = token_id

        return output_token_id, output_label_id


    def get_seq(self, index):
        """
            Returns (sentence_a, sentence_b, is_next): is_next is 1 when
            sentence_b is the stored partner of sentence_a, 0 when it was
            drawn at random.
        """
        t1,t2 = self.get_corpus_line(index)

        if random.random() > 0.5:
            return t1, t2, 1
        else:
            # NOTE(review): the random draw can land on the true partner,
            # which is then still labelled 0.
            return t1, self.get_random_line(), 0

    def get_corpus_line(self, index):
        """Return the stored (sentence_a, sentence_b) pair at `index`."""
        return self.lines[index]

    def get_random_line(self):
        """Return the second sentence of a uniformly chosen pair."""
        return self.lines[random.randrange(len(self.lines))][1]
    
# Training dataset over the sentence pairs; every sample is truncated / padded to MAX_LEN ids.
train_data = BERTDataset(tokenizer,pairs,MAX_LEN)
# train_data[3]  # uncomment to inspect one sample

### Step 4 : Embeddings
- Token Embeddings : projection to vector space of each token
- Positional Embeddings : Keeps track of position of words within sequences, essential to get context about sentence structure
- Segment Embeddings : label specifying which part of the sequence (sentence A or sentence B) a token belongs to

In [13]:
import torch
import numpy as np

def positional_encoding(L: int, d_model: int, N: int = 10000) -> np.ndarray:
    """Build the sinusoidal positional-encoding table.

    Entry ``[pos, i]`` is ``sin(pos / N**(2*(i//2)/d_model))`` for even ``i``
    and ``cos(pos / N**(2*(i//2)/d_model))`` for odd ``i``.

    Args:
        L: number of positions (rows).
        d_model: embedding dimension (columns).
        N: base of the geometric progression of wavelengths.

    Returns:
        Float array of shape ``[L, d_model]``.
    """
    pos = np.arange(L)[:, np.newaxis] # [L,1]
    i = np.arange(d_model)[np.newaxis, :] # [1,d_model]

    # Columns 2k and 2k+1 share the same frequency.
    angle_rates = 1 / np.power(N, (2*(i//2)) / d_model)
    angle_rads = pos * angle_rates # [L,d_model]

    # Even columns take the sine, odd columns the cosine.
    return np.where(i % 2 == 0, np.sin(angle_rads), np.cos(angle_rads))

class PositionalEmbedding(torch.nn.Module):
    """Fixed (non-trainable) sinusoidal positional embedding table.

    Args:
        d_model: embedding dimension.
        max_len: number of positions in the table.
    """

    def __init__(self, d_model, max_len=64):
        super().__init__()

        # Convert the NumPy table explicitly: mixing a torch tensor with a
        # float64 ndarray in arithmetic would not reliably keep float32.
        pe = torch.from_numpy(positional_encoding(max_len, d_model)).float()

        # A buffer follows the module across .to(device) / .cuda() and never
        # receives gradients; persistent=False keeps it out of the state_dict.
        self.register_buffer('pe', pe.unsqueeze(0), persistent=False) # extra batch dimension : [1, max_len, d_model]

    def forward(self):
        """Return the whole table, shape ``[1, max_len, d_model]``."""
        return self.pe
    

class BERTEmbedding(torch.nn.Module):
    """Sum of token, positional and segment embeddings, followed by dropout.

    Args:
        vocab_size: number of token ids.
        embed_size: embedding dimension.
        max_len: maximum sequence length covered by the positional table.
        dropout: dropout probability applied to the summed embeddings.
    """

    def __init__(self, vocab_size: int, embed_size: int, max_len: int, dropout: float):
        super().__init__()
        self.embed_size = embed_size

        self.token = torch.nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.segment = torch.nn.Embedding(3, embed_size, padding_idx=0) # padding, seqA, seqB
        self.position = PositionalEmbedding(embed_size,max_len)
        self.dropout = torch.nn.Dropout(p=dropout)

    def forward(self, seq: torch.Tensor, segment_label: torch.Tensor) -> torch.Tensor:
        """Embed token ids.

        Args:
            seq: integer tensor of token ids whose last dimension is the
                sequence length (at most ``max_len``).
            segment_label: integer tensor of segment ids, same shape as `seq`.

        Returns:
            Tensor of embeddings; the leading batch dimension of the
            positional table broadcasts, so an un-batched input of shape
            ``[seq_len]`` yields ``[1, seq_len, embed_size]``.
        """
        # Slice the positional table to the actual sequence length so inputs
        # shorter than max_len broadcast correctly.
        positions = self.position()[:, :seq.size(-1)]
        embs = self.token(seq) + positions + self.segment(segment_label)
        return self.dropout(embs)

# Test PE
# d_model = 128
# pos_encoder = PositionalEmbedding(d_model,MAX_LEN)
# pos_encoder.forward().shape == torch.Size([1,64,128])

# Test Embeddings
d_model = 32
# Index the dataset once: every train_data[i] access re-draws the random masking
# and the random second sentence, so two separate accesses could return a
# bert_input and a segment_label that belong to different samples.
sample = train_data[3]
seq = sample["bert_input"]
segment_label = sample["segment_label"]
bert_emb = BERTEmbedding(vocab_size=len(tokenizer.vocab),embed_size=d_model, max_len=MAX_LEN, dropout=0.1)
# Call the module itself rather than .forward() so registered hooks are honoured.
inputs = bert_emb(seq=seq, segment_label=segment_label)
inputs.shape

torch.Size([1, 64, 32])

### Step 5 : Encoder Architecture

* Single Head Attention
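$$\underbrace{\vec{E}}_{n \times d_{model}}\,\underbrace{Q}_{d_{model} \times d_k} \rightarrow \underbrace{Q_{\vec{E}}}_{n \times d_k}, \qquad \underbrace{\vec{E}}_{n \times d_{model}}\,\underbrace{K}_{d_{model} \times d_k} \rightarrow \underbrace{K_{\vec{E}}}_{n \times d_k}$$
$$(Q_{\vec{E}}K_{\vec{E}}^T)_{i,j} = Q_{\vec{E}}[i]\,K_{\vec{E}}[j]^T$$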

* In the causal (decoder) setting, scores with j > i need to be **masked** to prevent future tokens from giving information to earlier ones; the encoder part does not use this mask.

In [None]:
import torch.nn.functional as F

class SingleHeadAttention(torch.nn.Module):
    """Single-head scaled dot-product self-attention.

    Args:
        d_model: dimension of the input (and output) embeddings.
        d_k: dimension of the query / key projections.
        dropout_rate: dropout probability applied to the attention weights.
    """

    def __init__(self,d_model: int,d_k: int,dropout_rate=0.1):
        super(SingleHeadAttention, self).__init__()

        self.query = torch.nn.Linear(d_model,d_k)
        self.key = torch.nn.Linear(d_model, d_k)
        self.values = torch.nn.Linear(d_model, d_model)
        self.dropout = torch.nn.Dropout(dropout_rate)
        ## TODO : improve self.vals with low-rank strat

    def forward(self, E: torch.Tensor, masked: bool= False) -> torch.Tensor:
        """Apply self-attention to `E`.

        Args:
            E: embeddings of shape ``[..., n, d_model]``.
            masked: when True, position i only attends to positions j <= i.

        Returns:
            Tensor with the same shape as `E`.
        """
        Q_emb = self.query(E)
        K_emb = self.key(E)
        V_emb = self.values(E)

        # d_k is the size of the projected queries (a Linear layer has no .size()).
        dk = Q_emb.size(-1)
        # Swap only the last two dims so batched inputs work; scale with a
        # plain Python float (torch.sqrt does not accept an int).
        scores = torch.matmul(Q_emb, K_emb.transpose(-2, -1)) / (dk ** 0.5)
        if masked: # no masking in the case of the encoder part
            mask = torch.triu(torch.ones_like(scores),diagonal=1).bool()
            # -inf gives exactly zero weight after softmax; the diagonal is
            # never masked, so no row is entirely -inf.
            scores = scores.masked_fill(mask, float('-inf'))

        attention_scores = F.softmax(scores, dim=-1) # softmax normalization on rows of QK^T
        attention_scores = self.dropout(attention_scores)

        attention_values = torch.matmul(attention_scores,V_emb)
        return attention_values

In [51]:
# Scratch cell: shape checks for the building blocks of the attention layer.
# (This first expression is not the cell's last one, so it is not displayed.)
inputs.shape
d_model = inputs.shape[2]
d_k = 16

# Cast to float32 before feeding a Linear layer, then project
# the first batch element from d_model to d_k features.
inputs = inputs.float()
linear = torch.nn.Linear(d_model, d_k)
output = linear(inputs[0])
output.shape

# (2,3) @ (4,3)^T -> (2,4): transpose(-2,-1) swaps the last two dimensions.
M1 = torch.randn(size=(2,3))
M2 = torch.randn(size=(4,3))

M = torch.matmul(M1,M2.transpose(-2,-1))
M.shape

# Softmax over the last dimension normalizes each row independently.
M = torch.Tensor(np.array([[1,2,3],[5,5,6]]))
F.softmax(M,dim=-1)

tensor([[0.0900, 0.2447, 0.6652],
        [0.2119, 0.2119, 0.5761]])