In [None]:
from pathlib import Path
import tokenizers as tk
import mltrainer
# Display the installed mltrainer version in the cell output.
mltrainer.__version__


We load the IMDB dataset. This is the MNIST for language models

In [None]:
from mads_datasets import DatasetFactoryProvider, DatasetType
# Ask the provider for the factory registered under DatasetType.IMDB.
imdbdatasetfactory = DatasetFactoryProvider.create_factory(DatasetType.IMDB)
# Build the datasets; the result is indexed by split name ("train", "valid") in the next cell.
# NOTE(review): create_dataset() presumably downloads/caches the data on first use — confirm.
datasets = imdbdatasetfactory.create_dataset()

In [None]:
# Pick the two splits out of the object returned by create_dataset().
traindataset = datasets["train"]
# NOTE(review): the "valid" split is bound to the name `testdataset` — confirm this naming is intended.
testdataset = datasets["valid"]

It consists of 50k movie reviews, labeled positive or negative

Let's have a look at a single datapoint (the one at index 10).

In [None]:
# Each item unpacks into a pair: x is the review text, y its label.
x, y = traindataset[10]  # index 10, i.e. the eleventh item
x, y


In [None]:
# Walk through the training set and print the first review that contains
# an HTML line-break tag, then stop.
for x, y in traindataset:
    if "<br" not in x:
        continue
    print(x)
    break

This is messy data. We have uppercase letters, punctuation, and even HTML tags. Let's clean some of that in order to reduce dimensionality, without losing too much information about the sentiment.

In [None]:
# Sample review fragment reused in the cells below; it contains capitals, punctuation,
# apostrophes and an HTML "<br />" tag, so it exercises every cleaning step.
test = "I'd like to think myself as a fairly open minded guy and it takes a lot(!) for me to dislike a movie but this one is without a doubt one of the suckiest, crappiest movie I've ever seen!<br />From the decrepit ranks of the already over-saturated \'Hillybilly Horror\'"

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Lowercase, StripAccents, Sequence, NFD, Replace

In [None]:
# Normalisation pipeline applied to every text before tokenisation:
#   NFD          - Unicode-decompose characters so accents become separate marks
#   Replace      - turn the HTML line break "<br />" into a single space; replacing it
#                  with "" would glue the neighbouring words together
#                  (e.g. "great<br />acting" -> "greatacting")
#   StripAccents - drop the accent marks exposed by NFD
#   Lowercase    - fold case to shrink the vocabulary
normalizer = Sequence([NFD(), Replace("<br />", " "), StripAccents(), Lowercase()])
# Show the effect of the pipeline on the sample review.
normalizer.normalize_str(test)

In [None]:
# Byte-pair-encoding tokenizer. `unk_token` is handed to the model so that symbols never
# seen during training are mapped to "<unk>" instead of being skipped.
tokenizer = Tokenizer(BPE(unk_token="<unk>"))
# Special tokens receive the first ids in the order listed: "<pad>" -> 0, "<unk>" -> 1.
# Reserving id 0 for padding matches the pad id 0 used when batches are padded further
# down, so padding does not share an id with a real token.
trainer = BpeTrainer(special_tokens=["<pad>", "<unk>"])
tokenizer.pre_tokenizer = Whitespace()
tokenizer.normalizer = normalizer
# The dataset yields (text, label) pairs; train on the review text only so the label
# strings do not end up in the tokenizer's training corpus.
tokenizer.train_from_iterator((text for text, _ in traindataset), trainer=trainer)
print(f"the vocab size is {tokenizer.get_vocab_size()}")

In [None]:
# The vocabulary is large, so show only the 20 entries with the lowest ids
# (as (token, id) pairs) instead of dumping the whole mapping into the output.
sorted(tokenizer.get_vocab().items(), key=lambda item: item[1])[:20]

This maps a sentence of words to a sequence of integers

In [None]:
# Encode the sample review and show the integer id of every token.
tokenizer.encode(test).ids

And we can create a tensor with this.

In [None]:
import torch
# Wrap the list of token ids in a tensor.
torch.tensor(tokenizer.encode(test).ids)

In [None]:
from typing import List, Tuple, Optional, Callable
from torch.nn.utils.rnn import pad_sequence
import torch

Tensor = torch.Tensor


class Preprocessor:
    """Collate function that turns a batch of (text, label) pairs into padded tensors.

    Args:
        max: maximum number of token ids kept per text; longer texts are truncated.
        tokenizer: object whose ``encode(text)`` returns a result with an ``ids``
            list of integers.
        padding_value: id used to fill texts that are shorter than the longest
            text in the batch. Defaults to 0, the value that was previously
            hard-coded.
    """

    def __init__(
        self, max: int, tokenizer, padding_value: int = 0
    ) -> None:
        # `max` shadows the builtin, but it is part of the keyword interface callers use.
        self.max = max
        self.tokenizer = tokenizer
        self.padding_value = padding_value

    def cast_label(self, label: str) -> int:
        """Map the label "neg" to 0 and every other label to 1."""
        return 0 if label == "neg" else 1

    def __call__(self, batch: List) -> Tuple[Tensor, Tensor]:
        """Tokenise, truncate and pad a batch.

        Args:
            batch: iterable of ``(text, label)`` pairs.

        Returns:
            A tuple ``(token_ids, labels)``: ``token_ids`` is an int64 tensor with
            one row per text, padded with ``padding_value`` to the longest
            (truncated) text in the batch; ``labels`` holds the integer labels.
        """
        labels, text = [], []
        for x, y in batch:
            # Truncate the plain id list before building the tensor.
            ids = self.tokenizer.encode(x).ids[: self.max]
            # Explicit dtype: torch.tensor([]) is float32, so a text that encodes to
            # no tokens would otherwise turn the padded batch into floats.
            text.append(torch.tensor(ids, dtype=torch.long))
            labels.append(self.cast_label(y))

        text_ = pad_sequence(
            text, batch_first=True, padding_value=self.padding_value
        )
        return text_, torch.tensor(labels)


In [None]:
# Tiny hand-made batch of (text, label) pairs to try the Preprocessor on.
corpus = [("The cat sat on the mat", "neg"), ("Where is the cat?", "pos"), ("The cat is blasé", "neg")]
# max=5 keeps at most five token ids per sentence.
pp = Preprocessor(max = 5, tokenizer=tokenizer)
# Returns the padded token-id tensor and the tensor of integer labels.
pp(corpus)

Preprocessing is necessary to:
- cut off long sentences to get equal length. 100 words will be enough to get the sentiment in most cases
- we need to cast the labels "neg" and "pos" to integers
- we also pad if a sentence is shorter than the max length

We can feed the preprocessor to the default dataloader from torch

In [None]:
from torch.utils.data import DataLoader

# The Preprocessor acts as the collate function: every batch of 32 (text, label)
# pairs is tokenised, truncated to at most 512 token ids and padded.
preprocessor = Preprocessor(tokenizer=tokenizer, max=512)
dataloader = DataLoader(
    dataset=traindataset,
    batch_size=32,
    shuffle=True,
    collate_fn=preprocessor,
)


We now get batched sentences and labels

In [None]:
# Pull a single batch from the dataloader; the collate function returns (token ids, labels).
x, y = next(iter(dataloader))

# Shapes of the padded token-id tensor and of the label tensor.
x.shape, y.shape


In [None]:
# Token ids of the first review in the batch (padded with 0 up to the batch's longest sequence).
x[0]


All this code is wrapped into the DatasetFactoryProvider, which you can see in the next notebook.