In [1]:
# default_exp data_process

### Experimenting DataLoader/Dataset class
### Should we just use fast.ai? (load the model into learner class in fastai as well)

#### start from torch dataloader/dataset

In [2]:
from torch.utils.data import DataLoader, Dataset

In [3]:
# we can uncomment below as well
# from model_api import label2int
label2int = {
    "sadness": 0,
    "joy": 1,
    "love": 2,
    "anger": 3,
    "fear": 4, 
    "surprise": 5
}

In [4]:
# from example nb (01 nb)
# map style dataset: impl. __getitem__() (this one)
class EmoDataset(Dataset):
    def __init__(self, path):
        super().__init__()
        self.data_column = "text"
        self.class_column = "class"
        self.data = pd.read_csv(path, sep=";", header=None, names=[self.data_column, self.class_column],
                               engine="python")

    def __getitem__(self, idx):
        return self.data.loc[idx, self.data_column], label2int[self.data.loc[idx, self.class_column]]

    def __len__(self):
        return self.data.shape[0]

In [5]:
# from example as well
def create_dataloader(ds_path: str, shuffle=False, batch_size=32):
    from torch.data.utils import DataLoader, Dataset
    return DataLoader(EmoDataset(ds_path), batch_size=batch_size, shuffle=shuffle)

In [6]:
# subset of training dataset
# #of data = 128
PATH = "dev_train.txt"

#### Let's first rebuild the model class and see what happens

In [7]:
# export
import torch
import torch.nn as nn
import torch.nn.functional as F

In [8]:
# from 01 nb
class EmoModel(nn.Module):
    def __init__(self, base_model, n_classes, base_model_output_size=768, dropout=0.05):
        super().__init__()
        self.base_model = base_model
        
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(base_model_output_size, base_model_output_size),
            Mish(),
            nn.Dropout(dropout),
            nn.Linear(base_model_output_size, n_classes)
        )
        
        for layer in self.classifier:
            if isinstance(layer, nn.Linear):
                layer.weight.data.normal_(mean=0.0, std=0.02)
                if layer.bias is not None:
                    layer.bias.data.zero_()

    def forward(self, input_, *args):
        X, attention_mask = input_
        hidden_states = self.base_model(X, attention_mask=attention_mask)
        # maybe do some pooling / RNNs... go crazy here!
        # use the <s> representation
        return self.classifier(hidden_states[0][:, 0, :])

### feed w/ dev_train for speed

In [9]:
from transformers import AutoModelWithLMHead

In [10]:
@torch.jit.script
def mish(input):
    return input * torch.tanh(F.softplus(input))

class Mish(nn.Module):
    def forward(self, input):
        return mish(input)


In [11]:
model = EmoModel(AutoModelWithLMHead.from_pretrained("distilroberta-base").base_model, 6)



In [12]:
model.eval()

EmoModel(
  (base_model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,

#### get tokenizer

In [13]:
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
# needed for tokenize the user input
class TokenizersCollateFn:
    def __init__(self, max_tokens=512):

        # I still need this to parse the input
        # try to figure out where to store these tokens
        # instead of re-download it every time
        # reload is probably fine if the model is on AWS?
        ## RoBERTa uses BPE tokenizer similar to GPT
        t = ByteLevelBPETokenizer(
            "tokenizer/vocab.json",
            "tokenizer/merges.txt"
        )
        t._tokenizer.post_processor = BertProcessing(
            ("</s>", t.token_to_id("</s>")),
            ("<s>", t.token_to_id("<s>")),
        )
        t.enable_truncation(max_tokens)
        t.enable_padding(length=max_tokens, pad_id=t.token_to_id("<pad>"))
        self.tokenizer = t

    def __call__(self, batch):
        encoded = self.tokenizer.encode_batch([x[0] for x in batch])
        sequences_padded = torch.tensor([enc.ids for enc in encoded])
        attention_masks_padded = torch.tensor([enc.attention_mask for enc in encoded])
        labels = torch.tensor([x[1] for x in batch])

        return (sequences_padded, attention_masks_padded), labels

In [14]:
_ = TokenizersCollateFn()
tokenizer = _.tokenizer

In [15]:
# try to feed it
t = "Elvis is the king of rock"
# might not be "encode_plus since we're using ByteLevel... instead of AutoTokenizer"
enc = tokenizer.encode(t)
enc

Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [16]:
ref = tokenizer.encode_batch(t)

In [17]:
ref[0]

Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [18]:
len(enc.attention_mask)

512

In [19]:
tokenizer

Tokenizer(vocabulary_size=50265, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False)

In [20]:
X = torch.tensor(enc.ids).unsqueeze(0)
Attn = torch.tensor(enc.attention_mask).unsqueeze(0)

In [21]:
with torch.no_grad():
    rep = model((X,Attn))

In [22]:
rep

tensor([[-0.0937,  0.0017, -0.0608, -0.0633,  0.0188, -0.0855]])

In [23]:
# (input_batch_size, depth_of_model (see model = Emomodle(...)))
print(rep.shape)

torch.Size([1, 6])


In [24]:
# it worked !
tokenizer.decode(enc.ids)[:(len(t)+10)], len(tokenizer.decode(enc.ids))

('<s>Elvis is the king of rock</s><pa', 2547)

#### wrap string conversion into a function

In [25]:
#export
def get_tokenizer(max_tokens=512):
    from tokenizers import ByteLevelBPETokenizer
    from tokenizers.processors import BertProcessing
    # add error checking
    voc_file = "tokenizer/vocab.json"
    merg_file = "tokenizer/merges.txt"

    import os.path
    if not os.path.isfile(voc_file) or not os.path.isfile(merg_file):
        from EMO_AI.model_api import setup_tokenizer
        setup_tokenizer()

    t = ByteLevelBPETokenizer(
        voc_file,
        merg_file
    )
    t._tokenizer.post_processor = BertProcessing(
        ("</s>", t.token_to_id("</s>")),
        ("<s>", t.token_to_id("<s>")),
    )
    t.enable_truncation(max_tokens)
    t.enable_padding(length=max_tokens, pad_id=t.token_to_id("<pad>"))
    return t

In [26]:
#export
def convert_text_to_tensor(text, tokenizer=None):
    if tokenizer is None:
        tokenizer = get_tokenizer()
    enc = tokenizer.encode(text)
    X = torch.tensor(enc.ids).unsqueeze(0)
    Attn = torch.tensor(enc.attention_mask).unsqueeze(0)
    return (X, Attn)

In [27]:
t

'Elvis is the king of rock'

In [28]:
x, attn = convert_text_to_tensor(t)
with torch.no_grad():
    model.eval()
    rep = model((x,attn))
    print(rep)

tensor([[-0.0937,  0.0017, -0.0608, -0.0633,  0.0188, -0.0855]])


#### Let's write a function to load text from file (.txt, .csv, ..etc) too

In [29]:
# export
def load_text_from_file(file, use_iter=False, mode="r"):
    """change 'r' to read non-regular files, e.g., change to 'rb' to read byte files"""
    # TODO: figure out how to make this an iterator (e.g., use Dataset class in torch?)
    # since it'll save much RAM space
    if mode != 'r':
        print("attention, you're using not regular read mode\n the returning type is not guaranteed to work")
    ret = []
    with open(file, mode) as f:
        for line in f:
            ret.append(line)
    return ret

In [30]:
tmpfile = load_text_from_file(PATH)
with torch.no_grad():
    model.eval()
    repp = model(convert_text_to_tensor(tmpfile[1]))
    print(repp)

tensor([[-0.0900,  0.0021, -0.0561, -0.0747,  0.0262, -0.0774]])


#### Do prediction

In [49]:
# this is the output
# label are from 1 to 6
out = torch.argmax(repp, dim=1)
out

tensor([4])

In [50]:
label2int.values()

dict_values([0, 1, 2, 3, 4, 5])

In [52]:
for key in label2int:
    if label2int[key] == out:
        print(key, out)
print(label2int)

fear tensor([4])
{'sadness': 0, 'joy': 1, 'love': 2, 'anger': 3, 'fear': 4, 'surprise': 5}


In [32]:
assert 1 == 0, "stop here, below not working"

AssertionError: stop here, below not working

#### integrate w/ dataloader

In [None]:
create_dataloader??

[1;31mSignature:[0m [0mcreate_dataloader[0m[1;33m([0m[0mds_path[0m[1;33m:[0m [0mstr[0m[1;33m,[0m [0mshuffle[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m [0mbatch_size[0m[1;33m=[0m[1;36m32[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m <no docstring>
[1;31mSource:[0m   
[1;32mdef[0m [0mcreate_dataloader[0m[1;33m([0m[0mds_path[0m[1;33m:[0m [0mstr[0m[1;33m,[0m [0mshuffle[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m [0mbatch_size[0m[1;33m=[0m[1;36m32[0m[1;33m)[0m[1;33m:[0m[1;33m
[0m    [1;32mfrom[0m [0mtorch[0m[1;33m.[0m[0mdata[0m[1;33m.[0m[0mutils[0m [1;32mimport[0m [0mDataLoader[0m[1;33m,[0m [0mDataset[0m[1;33m
[0m    [1;32mreturn[0m [0mDataLoader[0m[1;33m([0m[0mEmoDataset[0m[1;33m([0m[0mds_path[0m[1;33m)[0m[1;33m,[0m [0mbatch_size[0m[1;33m=[0m[0mbatch_size[0m[1;33m,[0m [0mshuffle[0m[1;33m=[0m[0mshuffle[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mFile:[0m      c:\users

In [None]:
import pandas as pd

In [None]:
test_dl = create_data_loader("training.csv")

In [None]:
a = iter(test_dl)
a

<torch.utils.data.dataloader._SingleProcessDataLoaderIter at 0x23037de6f08>

In [None]:
# we don't actually need this for runtime inference
with torch.no_grad():
    model.eval()
    for i, batch_ in enumerate(test_dl, 0):
        (X, attn), y = batch_
        batch = X, attn
        y_pred = torch.argmax(model(batch), dim=1)
        break