# todo
- Verify on a review-classification task that the resulting text embeddings are reasonable

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch

# Masked language modelling (MLM)

In [3]:
%%writefile masking_language_modelling/models.py

import torch

from multihead_attention import MultiHeadAttention

class BaseEncoderModel(torch.nn.Module):
    """Token embedding followed by two stacked ``MultiHeadAttention`` blocks.

    The three modules live in ``self.layers`` (a ``torch.nn.Sequential``):
    index 0 is the embedding table, indices 1 and 2 are the attention blocks.

    Args:
        vocab_size: Number of rows in the embedding table.
        n_heads: Passed to each ``MultiHeadAttention`` block.
        emb_size: Embedding width; also passed to each attention block.
        vdim: Accepted but not used by this class.
        kdim: Accepted but not used by this class.
        padding_idx: Forwarded to ``torch.nn.Embedding``.
    """

    def __init__(self, vocab_size, n_heads, emb_size, vdim=None, kdim=None, padding_idx=None):
        super().__init__()
        # Build the embedding first, then the attention blocks, so parameter
        # initialisation happens in the same order as the layers are applied.
        token_embedding = torch.nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)
        attention_blocks = [
            MultiHeadAttention(n_heads=n_heads, emb_size=emb_size)
            for _ in range(2)
        ]
        self.layers = torch.nn.Sequential(token_embedding, *attention_blocks)

    def forward(self, X: torch.Tensor):
        """Run integer token ids ``X`` through the embedding and both attention blocks."""
        encoded = self.layers(X)
        return encoded


class MLMHead(torch.nn.Module):
    """Masked-language-modelling head placed on top of an encoder.

    The wrapped ``model`` must expose ``model.layers`` where ``layers[0]`` is
    an embedding (its ``num_embeddings`` gives the vocabulary size) and
    ``layers[-1].ffn.inplace`` is a layer whose ``out_features`` gives the
    width of the encoder output.

    Args:
        model: Encoder whose output is projected onto the vocabulary.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
        embedding, last_block = model.layers[0], model.layers[-1]
        hidden_size = last_block.ffn.inplace.out_features
        # Bias-free projection from encoder states to one score per vocabulary entry.
        self.mlm_layer = torch.nn.Linear(hidden_size, embedding.num_embeddings, bias=False)

    def forward(self, X: torch.Tensor):
        """Return unnormalised vocabulary logits for every position of ``X``."""
        # Logits are returned for all positions: selecting only the masked
        # ones here would break the rectangular batch shape, so the caller
        # applies the mask. Softmax is likewise left to the loss function.
        return self.mlm_layer(self.model(X))

Overwriting masking_language_modelling/models.py


In [4]:
# Shape check: an embedding layer applied to integer ids of shape
# [batch_size, seq_len] yields a tensor of shape [batch_size, seq_len, emb_size].
batch_size, seq_len, emb_size = 2, 3, 7
vocab_size = 3
torch.nn.Embedding(vocab_size, emb_size)(torch.randint(vocab_size, size=(batch_size, seq_len))).shape

torch.Size([2, 3, 7])

In [5]:
from masking_language_modelling.models import BaseEncoderModel, MLMHead
from masking_language_modelling.dataproc import MLMDataset
# batch_size, seq_len, emb_size = 11, 30, 36
# X = torch.Tensor(batch_size, seq_len, emb_size).random_()



# Smoke test: push random token ids through the encoder + MLM head and
# display the output shape (expected [batch_size, seq_len, vocab_size]).
batch_size, seq_len, emb_size = 11, 47, 36
vocab_size = 1000
base_model = BaseEncoderModel(vocab_size=vocab_size, n_heads=12, emb_size=emb_size)
model = MLMHead(model=base_model)

X = torch.randint(vocab_size, size=(batch_size, seq_len))
# NOTE: despite the name, `probs` holds raw logits - MLMHead applies no softmax.
probs = model(X)
probs.shape

torch.Size([11, 47, 1000])

# Prepare dataset

In [12]:
%%writefile masking_language_modelling/dataproc.py

import torch
from torch.utils.data import Dataset
from collections import Counter
from itertools import chain
from typing import List, Optional

class MLMDataset(Dataset):
    """Line-per-sample text dataset backed by a single file read into memory.

    Every line of ``text_fpath`` becomes one sample and is returned exactly
    as read, trailing newline included. ``max_seq_len`` and ``mask_ratio``
    are only stored on the instance; this class does not use them itself.

    Args:
        text_fpath: Path of the text file to read (opened with the platform
            default encoding).
        max_seq_len: Stored as ``self.max_seq_len``.
        mask_ratio: Stored as ``self.mask_ratio``.
    """

    def __init__(self, text_fpath: str, 
                 max_seq_len: int, 
                 mask_ratio:float = 0.15
                ):
        self.mask_ratio = mask_ratio
        self.max_seq_len = max_seq_len
        # Iterating a text file yields its lines, newline characters kept.
        with open(text_fpath, 'r') as src:
            self.lines = list(src)

    def __len__(self):
        """Number of lines read from the file."""
        line_count = len(self.lines)
        return line_count

    def __getitem__(self, idx):
        """Return the raw line stored at position ``idx``."""
        return self.lines[idx]
    
import string
import re

class Tokenizer:
    """Word-level tokenizer with a frequency-capped vocabulary.

    Text is lower-cased, every punctuation character except ``-`` is replaced
    by a space, and the result is split on whitespace. ``fit`` builds the
    word -> id mapping (``self.vocab``) and must be called before ``apply``.

    Args:
        max_vocab_size: Target vocabulary size, the four special tokens
            included.
        truncation: If true, ``apply`` keeps at most ``max_seq_len`` words.
        max_seq_len: Number of word slots per encoded sequence (the leading
            CLS id is not counted). Required when ``truncation`` or
            ``padding`` is enabled.
        padding: If true, ``apply`` right-pads with the pad id up to
            ``max_seq_len`` words.

    Raises:
        ValueError: If ``max_seq_len`` is ``None`` while ``truncation`` or
            ``padding`` is enabled.
    """

    # '-' is left out so that it is not turned into a separator.
    punctuation = string.punctuation.replace('-', '')
    # Compiled once at class creation instead of on every `_preproc` call.
    _punctuation_re = re.compile(f'[{re.escape(punctuation)}]')

    def __init__(self, 
                 max_vocab_size: int,
                 truncation: bool = True,
                 max_seq_len: Optional[int] = None,
                 padding: bool = True,
                ):
        # An explicit exception (rather than `assert`) keeps the check alive
        # when Python runs with -O.
        if (padding or truncation) and max_seq_len is None:
            raise ValueError('max_seq_len is required when padding or truncation is enabled')

        self.pad_token = '<PAD>'
        self.mask_token = '<MASK>'
        self.cls_token = '<CLS>'
        self.sep_token = '<SEP>'
        self.special_tokens = [self.cls_token, self.sep_token, self.pad_token, self.mask_token]
        self.max_vocab_size = max_vocab_size
        self.truncation = truncation
        self.padding = padding
        # Always defined, so `apply` never hits a missing attribute.
        self.max_seq_len = max_seq_len

    def fit(self, dataset):
        """Build the vocabulary from an iterable of raw text strings.

        Special tokens get the first ids (in ``special_tokens`` order); the
        remaining ids go to the most frequent words, most frequent first.

        Returns:
            ``self``, so construction and fitting can be chained.
        """
        word_counts = Counter(chain.from_iterable(map(self._preproc, dataset)))
        word_budget = self.max_vocab_size - len(self.special_tokens)
        frequent_words = [word for word, _ in word_counts.most_common(word_budget)]
        self.vocab = {
            token: idx
            for idx, token in enumerate(self.special_tokens + frequent_words)
        }
        return self

    def apply(self, text: str):
        """Encode ``text`` as a list of ids that starts with the CLS id.

        Out-of-vocabulary words are encoded with the pad id. With the default
        flags the result always has ``max_seq_len + 1`` entries; truncation
        and padding are applied only when the corresponding flag is set.
        """
        pad_token_idx = self.vocab[self.pad_token]
        words = self._preproc(text)
        if self.truncation:
            words = words[:self.max_seq_len]
        token_ids = [self.vocab.get(word, pad_token_idx) for word in words]
        if self.padding:
            # A negative difference (sequence longer than max_seq_len with
            # truncation off) multiplies to an empty list, i.e. no padding.
            token_ids.extend([pad_token_idx] * (self.max_seq_len - len(token_ids)))
        return [self.vocab[self.cls_token]] + token_ids

    def _preproc(self, text: str) -> List[str]:
        """Lower-case ``text``, blank out punctuation and split on whitespace."""
        return Tokenizer._punctuation_re.sub(' ', text.lower()).split()
    

def spawn_collate_fn(tokenizer, mask_ratio=0.15):
    """Build a ``DataLoader`` collate function that applies random MLM masking.

    Args:
        tokenizer: Fitted tokenizer exposing ``vocab`` plus the ``cls_token``,
            ``sep_token``, ``pad_token`` and ``mask_token`` attributes.
        mask_ratio: Probability with which each eligible position is masked.

    Returns:
        A function mapping a batch (list of equal-length id lists) to a dict:
        ``'input_tokens'`` - the original ids as a long tensor;
        ``'masked_input_tokens'`` - the same ids with selected positions
        replaced by the mask id;
        ``'mlm_mask'`` - boolean tensor marking the replaced positions.
    """
    vocab = tokenizer.vocab
    mask_token_id = vocab[tokenizer.mask_token]
    # Ids that must never be swapped for the mask token. PAD is included so
    # the objective is not spent on predicting padding.
    protected_ids = (
        vocab[tokenizer.cls_token],
        vocab[tokenizer.sep_token],
        vocab[tokenizer.pad_token],
    )

    def mask_objective(batch_token_ids, mask_ratio):
        """Draw a random boolean mask and clear it at protected positions."""
        selected = torch.rand(batch_token_ids.shape) < mask_ratio
        for token_id in protected_ids:
            selected &= batch_token_ids != token_id
        return selected

    def custom_collate_fn(batch):
        # Build the long tensor directly; going through a float tensor first
        # would be an unnecessary (and lossy for large ids) round-trip.
        input_ids = torch.tensor(batch, dtype=torch.long)
        mlm_mask = mask_objective(input_ids, mask_ratio)
        masked_input_ids = input_ids.masked_fill(mlm_mask, mask_token_id)

        return {
            'input_tokens': input_ids,
            'masked_input_tokens': masked_input_ids,
            'mlm_mask': mlm_mask,
        }
    return custom_collate_fn

Overwriting masking_language_modelling/dataproc.py


# Train loop

In [13]:
%%writefile test.txt
kek
mda ok na
aagaaa
kek
mda ok na
aagaaa
kek
mda ok na
aagaaa
kek
mda ok na
aagaaa

Overwriting test.txt


In [14]:
from itertools import chain
from datasets import load_dataset
# tmp_dataset = load_dataset("rotten_tomatoes")
# # with open('rt.txt', 'w') as f:
# #     for x in tmp_dataset['train']:
# #         if len(x['text'].split())<10:
# #             continue
# #         f.write(x['text']+'\n')

# Dump the IMDB "unsupervised" split to a plain-text corpus, one review per
# line, skipping reviews with fewer than 10 whitespace-separated words.
# NOTE(review): assumes a review never contains a newline of its own;
# otherwise it would be split into several samples - TODO confirm.
tmp_dataset = load_dataset("imdb")
with open('imdb_100k.txt', 'w') as f:
    for x in tmp_dataset['unsupervised']:
        if len(x['text'].split())<10:
            continue
        f.write(x['text']+'\n')

Found cached dataset imdb (/home/grigory/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

data

In [25]:
from masking_language_modelling.dataproc import MLMDataset, spawn_collate_fn, Tokenizer
from torch.utils.data import DataLoader

# Data-pipeline hyper-parameters.
seq_len = 30
batch_size = 50
max_vocab_size = 30000

# Read the corpus and fit the vocabulary on it.
training_data = MLMDataset('imdb_100k.txt', seq_len)
tokenizer = Tokenizer(max_vocab_size=max_vocab_size, max_seq_len=seq_len)\
                        .fit(training_data)

# print(tokenizer.apply(f'kek mda {tokenizer.mask_token}'))
# Encode the whole corpus up front; every sample becomes a list of ids.
proc_train_dataset = list(map(tokenizer.apply, training_data))

# Masking happens inside the collate function, so it is re-drawn for each batch.
collate_fn = spawn_collate_fn(tokenizer)
train_dataloader = DataLoader(proc_train_dataset, batch_size=batch_size, shuffle=True, 
                              collate_fn=collate_fn)
# next(iter(train_dataloader))

model

In [26]:
# Build the encoder sized to the fitted vocabulary and wrap it with the MLM head.
emb_size = 54

# padding_idx is the PAD token's id, forwarded to the encoder's embedding layer.
base_model = BaseEncoderModel(n_heads=3, emb_size=emb_size, 
                              vocab_size=len(tokenizer.vocab),
                              padding_idx=tokenizer.vocab[tokenizer.pad_token])
model = MLMHead(model=base_model)

In [27]:
from tqdm.auto import tqdm
import numpy as np

def train(model, train_dataloader, num_epochs=1, lr=1e-3):
    """Train ``model`` on the masked-language-modelling objective with Adam.

    Each batch must be a dict with:
      * ``'masked_input_tokens'`` - ids with masked positions replaced,
        [batch_size, seq_len];
      * ``'input_tokens'`` - the original ids (targets), [batch_size, seq_len];
      * ``'mlm_mask'`` - boolean tensor marking masked positions,
        [batch_size, seq_len].

    The loss is the cross-entropy averaged over masked positions only, so the
    value shown in the progress bar is a per-masked-token cross-entropy.

    Args:
        model: Module mapping ids to logits of shape
            [batch_size, seq_len, vocab_size].
        train_dataloader: Iterable yielding batches as described above.
        num_epochs: Number of passes over ``train_dataloader``.
        lr: Adam learning rate.
    """
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Batches are moved to the model's device, so a model that was placed on
    # an accelerator trains without further changes.
    device = next(model.parameters()).device
    loss = torch.nn.CrossEntropyLoss(reduction='none')
    for epoch in range(num_epochs):
        # Running sum instead of a list, to avoid re-averaging every step.
        running_total, n_batches = 0.0, 0
        print(f'Epoch {epoch}')
        pbar = tqdm(train_dataloader)
        for batch in pbar:
            x = batch['masked_input_tokens'].to(device)
            y = batch['input_tokens'].to(device)
            masked_tokens = batch['mlm_mask'].to(device)

            optimizer.zero_grad()
            logits = model(x)
            # Per-position loss, flattened to [batch_size * seq_len].
            token_loss = loss(logits.reshape(-1, logits.shape[-1]), y.reshape(-1))
            flat_mask = masked_tokens.reshape(-1).to(token_loss.dtype)
            # Normalise by the number of masked positions (not by all
            # positions); clamp so a batch with no masked token gives 0, not NaN.
            n_masked = flat_mask.sum().clamp(min=1)
            avg_masked_loss = (token_loss * flat_mask).sum() / n_masked
            avg_masked_loss.backward()
            optimizer.step()

            # .item() works for any device, unlike .numpy().
            running_total += avg_masked_loss.item()
            n_batches += 1
            pbar.set_description(f'CEL: {running_total / n_batches:.3f}')
        
train(model, train_dataloader, num_epochs=100, lr=1e-3)

Epoch 0


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 1


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 2


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 3


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 4


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 5


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 6


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 7


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 8


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 9


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 10


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 11


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 12


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 13


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 14


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 15


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 16


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 17


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 18


  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 19


  0%|          | 0/1000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [28]:
import joblib

def save_state(model, tokenizer, save_path):
    """Persist the model and tokenizer together in one joblib file.

    The file holds a dict with the keys ``'tokenizer'`` and ``'model'``.
    joblib is used for simplicity; the objects are serialised whole.

    Args:
        model: Model object to store.
        tokenizer: Tokenizer object to store.
        save_path: Destination passed to ``joblib.dump``.
    """
    joblib.dump({'tokenizer': tokenizer, 'model': model}, save_path)
    
save_state(model, tokenizer, 'long_run_model.j')

sanity check

In [29]:
import pandas as pd
from scipy import spatial


def get_embedding(query, tokenizer, base_model):
    """Embed one text as the encoder output at the first sequence position.

    Args:
        query: Raw text to embed.
        tokenizer: Object whose ``apply(query)`` returns a list of token ids.
        base_model: Callable mapping an id tensor of shape [1, seq_len] to
            per-position outputs.

    Returns:
        NumPy array holding ``base_model``'s output at batch index 0,
        position 0.
    """
    token_ids = torch.tensor(tokenizer.apply(query)).view(1, -1)
    hidden = base_model(token_ids)
    first_position = hidden[0, 0]
    return first_position.detach().numpy()

# Sanity check: rank the first 1000 training lines by cosine similarity of
# their embedding to the embedding of a hand-written query.
queries = list(training_data)[:1000]
embs = []  # NOTE: never used below
base_model.eval()

query = 'fairytale for yound ladies. miracle fantasy'
query_emb = get_embedding(query, tokenizer, base_model)

res_lst = []
for entry in queries:
    entry_emb = get_embedding(entry, tokenizer, base_model)
    # scipy returns the cosine *distance*; 1 - distance is the similarity.
    score = 1 - spatial.distance.cosine(query_emb, entry_emb)
    res_lst.append((query, entry, score))

# Column 'a' is the query, 'b' the candidate line; most similar rows first.
scores_df = pd.DataFrame(res_lst, columns=['a', 'b', 'score'])
scores_df.sort_values('score', ascending=False)

Unnamed: 0,a,b,score
841,fairytale for yound ladies. miracle fantasy,La Moustache opens with a man thinking about p...,0.882530
966,fairytale for yound ladies. miracle fantasy,"Great movie, great cast. Why an American and a...",0.878254
688,fairytale for yound ladies. miracle fantasy,Old Bobby De Niro strolls through this film wi...,0.875376
410,fairytale for yound ladies. miracle fantasy,The Blues Brothers franchise self destructs. W...,0.857263
262,fairytale for yound ladies. miracle fantasy,This sequel picks up shortly after the first f...,0.853891
...,...,...,...
380,fairytale for yound ladies. miracle fantasy,I gave this a ten in a futile attempt to up th...,0.167051
786,fairytale for yound ladies. miracle fantasy,"i wanted to like this movie,and i really tied....",0.166706
672,fairytale for yound ladies. miracle fantasy,"I could tear this piece of crap apart, frame b...",0.145186
808,fairytale for yound ladies. miracle fantasy,"Sorry folks, but I had some real problems with...",0.141119


In [30]:
# baseline loss
kek_loss = torch.nn.CrossEntropyLoss(reduction='none')
n, C = 5, len(tokenizer.vocab)

a1 = torch.Tensor(np.random.uniform(size=(n, C)))
b1 = torch.Tensor(np.random.randint(C, size=n)).long()
kek_loss(a1, b1).mean()

tensor(10.1464)

## Estimate quality with a review classification task

In [31]:
import joblib
# Reload the dict written by save_state (keys 'model' and 'tokenizer').
# NOTE: joblib.load unpickles arbitrary objects - only load files you trust.
loaded_state = joblib.load('long_run_model.j')
l_model, l_tokenizer = loaded_state['model'], loaded_state['tokenizer']

In [32]:
import pandas as pd
from datasets import load_dataset
import torch
# Load the Rotten Tomatoes dataset and turn its train / validation / test
# splits into DataFrames.
tmp_dataset = load_dataset("rotten_tomatoes")
train_df, val_df, test_df = pd.DataFrame(tmp_dataset['train']), \
                            pd.DataFrame(tmp_dataset['validation']), \
                            pd.DataFrame(tmp_dataset['test'])

Found cached dataset rotten_tomatoes (/home/grigory/.cache/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46)


  0%|          | 0/3 [00:00<?, ?it/s]

In [33]:
import numpy as np

def batch_emb(inputs, bs = 500):
    """Yield first-position encoder embeddings for ``inputs.text`` in batches.

    Reads the notebook-level ``l_tokenizer`` and ``l_model`` objects.

    Args:
        inputs: Object with a ``text`` column supporting ``.apply`` and
            ``.tolist()`` (e.g. a pandas DataFrame).
        bs: Number of texts encoded per forward pass.

    Yields:
        One NumPy array per batch holding ``l_model.model``'s output at
        sequence position 0 for every text in the batch.
    """
    inputs_p = inputs.text.apply(l_tokenizer.apply).tolist()
    for start in range(0, len(inputs_p), bs):
        # Build the id tensor as int64 directly (no float round-trip).
        chunk = torch.tensor(inputs_p[start:start + bs], dtype=torch.long)
        # Inference only, so skip building the autograd graph. The context
        # is closed *before* yielding so that grad mode is not left disabled
        # in the caller while the generator is suspended.
        with torch.no_grad():
            first_token_state = l_model.model(chunk)[:, 0, :]
        yield first_token_state.detach().numpy()
    
# Embed every split; the per-batch arrays are stacked along the sample axis.
train_emb = np.concatenate(list(batch_emb(train_df)), axis=0)
val_emb = np.concatenate(list(batch_emb(val_df)), axis=0)
test_emb = np.concatenate(list(batch_emb(test_df)), axis=0)

# Classification targets taken from each split's `label` column.
y_tr, y_val, y_te = train_df.label.values, \
                    val_df.label.values, \
                    test_df.label.values

In [36]:
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

# Linear probe: fit logistic regression on the MLM embeddings.
lr = LogisticRegression(max_iter=2000).fit(train_emb, y_tr)
# lr = KNeighborsClassifier(n_neighbors=3).fit(train_emb, y_tr)
# lr = LGBMClassifier(n_estimators=500, max_depth=5, learning_rate=1e-3).fit(train_emb, y_tr)
# ROC AUC on each split; the displayed tuple is ordered (train, test, validation).
roc_auc_score(y_true=y_tr, y_score=lr.predict_proba(train_emb)[:, 1]),\
roc_auc_score(y_true=y_te, y_score=lr.predict_proba(test_emb)[:, 1]),\
roc_auc_score(y_true=y_val, y_score=lr.predict_proba(val_emb)[:, 1]),\


# scores_df.sort_values('score', ascending=False)[['b', 'score']].to_dict('records')

(0.6495478478138671, 0.6421825554667727, 0.6459137805406052)

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Reference point: TF-IDF features (vectorizer fitted on the training texts
# only) with the same logistic-regression classifier.
tfidf = TfidfVectorizer().fit(train_df.text)
tfidf_tr_emb = tfidf.transform(train_df.text)
tfidf_val_emb = tfidf.transform(val_df.text)
tfidf_te_emb = tfidf.transform(test_df.text)


lr = LogisticRegression(max_iter=2000).fit(tfidf_tr_emb, y_tr)
# ROC AUC on each split; the displayed tuple is ordered (train, test, validation).
roc_auc_score(y_true=y_tr, y_score=lr.predict_proba(tfidf_tr_emb)[:, 1]),\
roc_auc_score(y_true=y_te, y_score=lr.predict_proba(tfidf_te_emb)[:, 1]),\
roc_auc_score(y_true=y_val, y_score=lr.predict_proba(tfidf_val_emb)[:, 1]),\


# scores_df.sort_values('score', ascending=False)[['b', 'score']].to_dict('records')

(0.9601955445850725, 0.859547536159443, 0.8289444505067074)