# Preprocess text

In [2]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
#export
from exp.nb_11a import *

## Data

We will use the IMDB dataset that consists of 50,000 labeled reviews of movies (positive or negative) and 50,000 unlabelled ones.

In [3]:
# path = datasets.untar_data(datasets.URLs.IMDB)

In [6]:
path = Path('/home/jupyter/.fastai/data/imdb')

In [7]:
path.ls()

[PosixPath('/home/jupyter/.fastai/data/imdb/test'),
 PosixPath('/home/jupyter/.fastai/data/imdb/tmp_clas'),
 PosixPath('/home/jupyter/.fastai/data/imdb/imdb.vocab'),
 PosixPath('/home/jupyter/.fastai/data/imdb/README'),
 PosixPath('/home/jupyter/.fastai/data/imdb/unsup'),
 PosixPath('/home/jupyter/.fastai/data/imdb/train'),
 PosixPath('/home/jupyter/.fastai/data/imdb/tmp_lm')]

We define a subclass of `ItemList` that will read the texts in the corresponding filenames.

In [8]:
#export
def read_file(fn): 
    with open(fn, 'r', encoding = 'utf8') as f: return f.read()
    
class TextList(ItemList):
    @classmethod
    def from_files(cls, path, extensions='.txt', recurse=True, include=None, **kwargs):
        return cls(get_files(path, extensions, recurse=recurse, include=include), path, **kwargs)
    
    def get(self, i):
        if isinstance(i, Path): return read_file(i)
        return i

In [10]:
ItemList??

In [11]:
get_files??

Just in case there are some text log files, we restrict the ones we take to the training, test, and unsupervised folders.

In [12]:
il = TextList.from_files(path, include=['train', 'test', 'unsup'])

We should expect a total of 100,000 texts.

In [13]:
len(il.items)

100000

Here is the first one as an example.

In [15]:
txt = il[1]
txt

"I saw this on television more years ago than I can remember, but never forgot the performance of Sammy Davis, Jr. I just by chance thought to look for it on video. This rendition of Porgy and Bess is a treasure. I would love to see it again and introduce my son to it as well. I just can't imagine why it is not heralded as one of the greatest performances Sammy Davis, Jr. every gave. Whoever is responsible for not bringing this to audiences should be ashamed of his/her ignorance. I will continue to look for it though. Maybe the execs responsible for such things will come to realize the forgotten work of so many African American actors."

For text classification, we will split by the grand parent folder as before, but for language modeling, we take all the texts and just put 10% aside.

In [16]:
sd = SplitData.split_by_func(il, partial(random_splitter, p_valid=0.1))

In [17]:
sd

SplitData
Train: TextList (90111 items)
[PosixPath('/home/jupyter/.fastai/data/imdb/test/pos/6778_8.txt'), PosixPath('/home/jupyter/.fastai/data/imdb/test/pos/354_10.txt'), PosixPath('/home/jupyter/.fastai/data/imdb/test/pos/10746_8.txt'), PosixPath('/home/jupyter/.fastai/data/imdb/test/pos/4801_10.txt'), PosixPath('/home/jupyter/.fastai/data/imdb/test/pos/6579_7.txt'), PosixPath('/home/jupyter/.fastai/data/imdb/test/pos/4187_7.txt'), PosixPath('/home/jupyter/.fastai/data/imdb/test/pos/3827_9.txt'), PosixPath('/home/jupyter/.fastai/data/imdb/test/pos/12157_7.txt'), PosixPath('/home/jupyter/.fastai/data/imdb/test/pos/9054_8.txt'), PosixPath('/home/jupyter/.fastai/data/imdb/test/pos/8103_10.txt')...]
Path: /home/jupyter/.fastai/data/imdb
Valid: TextList (9889 items)
[PosixPath('/home/jupyter/.fastai/data/imdb/test/pos/7690_10.txt'), PosixPath('/home/jupyter/.fastai/data/imdb/test/pos/9764_10.txt'), PosixPath('/home/jupyter/.fastai/data/imdb/test/pos/6263_10.txt'), PosixPath('/home/jupyte

## Tokenizing

We need to tokenize the dataset first, which is splitting a sentence in individual tokens. Those tokens are the basic words or punctuation signs with a few tweaks: don't for instance is split between do and n't. We will use a processor for this, in conjunction with the [spacy library](https://spacy.io/).

In [18]:
#export
import spacy,html

Before even tokenizeing, we will apply a bit of preprocessing on the texts to clean them up (we saw the one up there had some HTML code). These rules are applied before we split the sentences in tokens.

In [19]:
#export
#special tokens
UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ = "xxunk xxpad xxbos xxeos xxrep xxwrep xxup xxmaj".split()

def sub_br(t):
    "Replaces the <br /> by \n"
    re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
    return re_br.sub("\n", t)

def spec_add_spaces(t):
    "Add spaces around / and #"
    return re.sub(r'([/#])', r' \1 ', t)

def rm_useless_spaces(t):
    "Remove multiple spaces"
    return re.sub(' {2,}', ' ', t)

def replace_rep(t):
    "Replace repetitions at the character level: cccc -> TK_REP 4 c"
    def _replace_rep(m:Collection[str]) -> str:
        c,cc = m.groups()
        return f' {TK_REP} {len(cc)+1} {c} '
    re_rep = re.compile(r'(\S)(\1{3,})')
    return re_rep.sub(_replace_rep, t)
    
def replace_wrep(t):
    "Replace word repetitions: word word word -> TK_WREP 3 word"
    def _replace_wrep(m:Collection[str]) -> str:
        c,cc = m.groups()
        return f' {TK_WREP} {len(cc.split())+1} {c} '
    re_wrep = re.compile(r'(\b\w+\W+)(\1{3,})')
    return re_wrep.sub(_replace_wrep, t)

def fixup_text(x):
    "Various messy things we've seen in documents"
    re1 = re.compile(r'  +')
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>',UNK).replace(' @.@ ','.').replace(
        ' @-@ ','-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))
    
default_pre_rules = [fixup_text, replace_rep, replace_wrep, spec_add_spaces, rm_useless_spaces, sub_br]
default_spec_tok = [UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ]

In [20]:
replace_rep('cccc')

' xxrep 4 c '

In [21]:
replace_wrep('word word word word word ')

' xxwrep 5 word  '

These rules are applied after the tokenization on the list of tokens.  (they apply at the word level so apply after you have split into words/tokens)

In [22]:
#export
def replace_all_caps(x):
    "Replace tokens in ALL CAPS by their lower version and add `TK_UP` before."
    res = []
    for t in x:
        if t.isupper() and len(t) > 1: res.append(TK_UP); res.append(t.lower())
        else: res.append(t)
    return res

def deal_caps(x):
    "Replace all Capitalized tokens in by their lower version and add `TK_MAJ` before."
    res = []
    for t in x:
        if t == '': continue
        if t[0].isupper() and len(t) > 1 and t[1:].islower(): res.append(TK_MAJ)
        res.append(t.lower())
    return res


# these are really important tokens because tell RNN to reset state. we are not longer talking about the same topic after EOS
def add_eos_bos(x): return [BOS] + x + [EOS]

default_post_rules = [deal_caps, replace_all_caps, add_eos_bos]

In [28]:
replace_all_caps(['I', 'AM', 'SHOUTING'])

['I', 'xxup', 'am', 'xxup', 'shouting']

In [29]:
deal_caps(['My', 'name', 'is', 'Jeremy'])

['xxmaj', 'my', 'name', 'is', 'xxmaj', 'jeremy']

In [27]:
deal_caps(["I"])

['i']

Since tokenizing and applying those rules takes a bit of time, we'll parallelize it using `ProcessPoolExecutor` to go faster.

In [30]:
#export
from spacy.symbols import ORTH
from concurrent.futures import ProcessPoolExecutor

def parallel(func, arr, max_workers=4):
    if max_workers<2: results = list(progress_bar(map(func, enumerate(arr)), total=len(arr)))
    else:
        with ProcessPoolExecutor(max_workers=max_workers) as ex:
            return list(progress_bar(ex.map(func, enumerate(arr)), total=len(arr)))
    if any([o is not None for o in results]): return results

In [31]:
#export
class TokenizeProcessor(Processor):
    def __init__(self, lang="en", chunksize=2000, pre_rules=None, post_rules=None, max_workers=4): 
        self.chunksize,self.max_workers = chunksize,max_workers
        self.tokenizer = spacy.blank(lang).tokenizer
        for w in default_spec_tok:
            self.tokenizer.add_special_case(w, [{ORTH: w}])
        self.pre_rules  = default_pre_rules  if pre_rules  is None else pre_rules
        self.post_rules = default_post_rules if post_rules is None else post_rules

    def proc_chunk(self, args):
        i,chunk = args
        chunk = [compose(t, self.pre_rules) for t in chunk]
        docs = [[d.text for d in doc] for doc in self.tokenizer.pipe(chunk)]
        docs = [compose(t, self.post_rules) for t in docs]
        return docs

    def __call__(self, items): 
        toks = []
        if isinstance(items[0], Path): items = [read_file(i) for i in items]
        chunks = [items[i: i+self.chunksize] for i in (range(0, len(items), self.chunksize))]
        toks = parallel(self.proc_chunk, chunks, max_workers=self.max_workers)
        return sum(toks, [])
    
    def proc1(self, item): return self.proc_chunk([item])[0]
    
    def deprocess(self, toks): return [self.deproc1(tok) for tok in toks]
    def deproc1(self, tok):    return " ".join(tok)

In [32]:
tp = TokenizeProcessor()

In [33]:
txt[:250]

'I saw this on television more years ago than I can remember, but never forgot the performance of Sammy Davis, Jr. I just by chance thought to look for it on video. This rendition of Porgy and Bess is a treasure. I would love to see it again and intro'

In [35]:
' • '.join(tp(il[:100])[1])[:400]

'xxbos • i • saw • this • on • television • more • years • ago • than • i • can • remember • , • but • never • forgot • the • performance • of • xxmaj • sammy • xxmaj • davis • , • xxmaj • jr. • i • just • by • chance • thought • to • look • for • it • on • video • . • xxmaj • this • rendition • of • xxmaj • porgy • and • xxmaj • bess • is • a • treasure • . • i • would • love • to • see • it • aga'

In [38]:
tp.deproc1('saw')

's a w'

In [42]:
tp.deprocess(["saw", "this"])

['s a w', 't h i s']

### Note that we are not removing stop words or stemmig (removig "ing"s) which is quite common in traditional NLP. It's a terrible idea because you are losing valuable info.

## Numericalizing

Once we have tokenized our texts, we replace each token by an individual number, this is called numericalizing. Again, we do this with a processor (not so different from the `CategoryProcessor`).

In [43]:
#export
import collections

class NumericalizeProcessor(Processor):
    def __init__(self, vocab=None, max_vocab=60000, min_freq=2): 
        self.vocab,self.max_vocab,self.min_freq = vocab,max_vocab,min_freq
    
    def __call__(self, items):
        #The vocab is defined on the first use.
        if self.vocab is None:
            freq = Counter(p for o in items for p in o)
            self.vocab = [o for o,c in freq.most_common(self.max_vocab) if c >= self.min_freq]
            for o in reversed(default_spec_tok):
                if o in self.vocab: self.vocab.remove(o)
                self.vocab.insert(0, o)
        if getattr(self, 'otoi', None) is None:
            self.otoi = collections.defaultdict(int,{v:k for k,v in enumerate(self.vocab)}) 
        return [self.proc1(o) for o in items]
    def proc1(self, item):  return [self.otoi[o] for o in item]
    
    def deprocess(self, idxs):
        assert self.vocab is not None
        return [self.deproc1(idx) for idx in idxs]
    def deproc1(self, idx): return [self.vocab[i] for i in idx]

When we do language modeling, we will infer the labels from the text during training, so there's no need to label. The training loop expects labels however, so we need to add dummy ones.

In [62]:
proc_tok,proc_num = TokenizeProcessor(max_workers=8),NumericalizeProcessor()

In [63]:
proc_num.vocab

In [64]:
%time ll = label_by_func(sd, lambda x: 0, proc_x = [proc_tok,proc_num])

CPU times: user 20.5 s, sys: 2.98 s, total: 23.5 s
Wall time: 1min 7s


Once the items have been processed they will become list of numbers, we can still access the underlying raw data in `x_obj` (or `y_obj` for the targets, but we don't have any here).

In [65]:
ll.train.x_obj(1)

"xxbos * this comment will probably have spoilers ! ! i checked the spoilers box just in case but it might not have spoilers , but be aware anyways if i say something that you might consider a spoiler and i don't ! * xxmaj wow ... best game since xxmaj super xxmaj mario 64 . i got this game the first day it came out , and before i got it , i went on some gaming websites to look at its ratings ( yes , they already reviewed it before it came out ) , and i was shocked . i was expecting something like xxmaj sunshine because lately all the xxmaj mario games have kind of been getting worse and worse . xxmaj but this one totally beat the other games . xxmaj the scores on this game even beat xxmaj halo 3 ! xxmaj it 's simply amazing . \n\n story : xxmaj not the best , xxmaj mario games are never known for their plots , and this one is n't really much of a difference . xxmaj bowser once again kidnaps xxmaj peach but this time invades the xxmaj mushroom xxmaj kingdom on a festival celebrated onc

Since the preprocessing tajes time, we save the intermediate result using pickle. Don't use any lambda functions in your processors or they won't be able to pickle.

In [66]:
pickle.dump(ll, open(path/'ld.pkl', 'wb'))

In [67]:
ll = pickle.load(open(path/'ld.pkl', 'rb'))

In [71]:
ll.train[1]

([2,
  195,
  19,
  947,
  105,
  261,
  41,
  1068,
  50,
  50,
  18,
  4789,
  8,
  1068,
  883,
  56,
  17,
  438,
  30,
  16,
  251,
  37,
  41,
  1068,
  10,
  30,
  43,
  1855,
  3806,
  62,
  18,
  157,
  158,
  20,
  32,
  251,
  1251,
  12,
  1324,
  11,
  18,
  3310,
  50,
  195,
  7,
  1373,
  92,
  138,
  473,
  252,
  7,
  1266,
  7,
  4315,
  11784,
  9,
  18,
  209,
  19,
  473,
  8,
  107,
  271,
  16,
  410,
  61,
  10,
  11,
  181,
  18,
  209,
  16,
  10,
  18,
  434,
  34,
  65,
  11346,
  15935,
  14,
  185,
  45,
  114,
  2974,
  36,
  445,
  10,
  46,
  494,
  6761,
  16,
  181,
  16,
  410,
  61,
  33,
  10,
  11,
  18,
  25,
  2230,
  9,
  18,
  25,
  1037,
  158,
  53,
  7,
  5048,
  106,
  4618,
  44,
  8,
  7,
  4315,
  1489,
  41,
  265,
  13,
  98,
  401,
  460,
  11,
  460,
  9,
  7,
  30,
  19,
  42,
  483,
  1444,
  8,
  101,
  1489,
  9,
  7,
  8,
  4161,
  34,
  19,
  473,
  76,
  1444,
  7,
  18073,
  389,
  50,
  7,
  16,
  22,
  346,
  525,
  9,
  

## Batching

We have a bit of work to convert our `LabelList` in a `DataBunch` as we don't just want batches of IMDB reviews. We want to stream through all the texts concatenated. We also have to prepare the targets that are the newt words in the text. All of this is done with the next object called `LM_PreLoader`. At the beginning of each epoch, it'll shuffle the articles (if `shuffle=True`) and create a big stream by concatenating all of them. We divide this big stream in `bs` smaller streams. That we will read in chunks of bptt length.

In [72]:
# Just using those for illustration purposes, they're not used otherwise.
from IPython.display import display,HTML
import pandas as pd

Let's say our stream is:

In [73]:
stream = """
In this notebook, we will go back over the example of classifying movie reviews we studied in part 1 and dig deeper under the surface. 
First we will look at the processing steps necessary to convert text into numbers and how to customize it. By doing this, we'll have another example of the Processor used in the data block API.
Then we will study how we build a language model and train it.\n
"""
tokens = np.array(tp([stream])[0])

Then if we split it in 6 batches it would give something like this:

In [74]:
bs,seq_len = 6,15
d_tokens = np.array([tokens[i*seq_len:(i+1)*seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False,header=None)))

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
xxbos,\n,xxmaj,in,this,notebook,",",we,will,go,back,over,the,example,of
classifying,movie,reviews,we,studied,in,part,1,and,dig,deeper,under,the,surface,.
\n,xxmaj,first,we,will,look,at,the,processing,steps,necessary,to,convert,text,into
numbers,and,how,to,customize,it,.,xxmaj,by,doing,this,",",we,'ll,have
another,example,of,the,xxmaj,processor,used,in,the,data,block,api,.,\n,xxmaj
then,we,will,study,how,we,build,a,language,model,and,train,it,.,\n\n


Then if we have a `bptt` of 5, we would go over those three batches.  (mean 6 batches??)

### We are taking a batch size of 6 lines. we have bptt of 5, so we are taking 5 tokens in each line.  We need the 5 tokens of each line from each batch to match up with its corresponding next tokens so that the rnn state makes sense.  So we see how see how the 1st line and the 7th line fit together here (they will be the first part of every batch so the state will continue.  Same with the second line and the 8th line, etc.

In [75]:
bs,bptt = 6,5
for k in range(3):
    d_tokens = np.array([tokens[i*seq_len + k*bptt:i*seq_len + (k+1)*bptt] for i in range(bs)])
    df = pd.DataFrame(d_tokens)
    display(HTML(df.to_html(index=False,header=None)))

0,1,2,3,4
xxbos,\n,xxmaj,in,this
classifying,movie,reviews,we,studied
\n,xxmaj,first,we,will
numbers,and,how,to,customize
another,example,of,the,xxmaj
then,we,will,study,how


0,1,2,3,4
notebook,",",we,will,go
in,part,1,and,dig
look,at,the,processing,steps
it,.,xxmaj,by,doing
processor,used,in,the,data
we,build,a,language,model


0,1,2,3,4
back,over,the,example,of
deeper,under,the,surface,.
necessary,to,convert,text,into
this,",",we,'ll,have
block,api,.,\n,xxmaj
and,train,it,.,\n\n


### This LM_PreLoader class creates our data loader. Our y for every x is just the x shifted over 1 since we are trying to predict the next word.  So our getitem gives us our x and then the y is x indexed over 1

In [76]:
#export
class LM_PreLoader():
    def __init__(self, data, bs=64, bptt=70, shuffle=False):
        self.data,self.bs,self.bptt,self.shuffle = data,bs,bptt,shuffle
        total_len = sum([len(t) for t in data.x])
        self.n_batch = total_len // bs
        self.batchify()
    
    def __len__(self): return ((self.n_batch-1) // self.bptt) * self.bs
    
    def __getitem__(self, idx):
        source = self.batched_data[idx % self.bs]
        seq_idx = (idx // self.bs) * self.bptt
        return source[seq_idx:seq_idx+self.bptt],source[seq_idx+1:seq_idx+self.bptt+1]
    
    def batchify(self):
        texts = self.data.x
        if self.shuffle: texts = texts[torch.randperm(len(texts))]
        stream = torch.cat([tensor(t) for t in texts])
        self.batched_data = stream[:self.n_batch * self.bs].view(self.bs, self.n_batch)

In [77]:
dl = DataLoader(LM_PreLoader(ll.valid, shuffle=True), batch_size=64)

Let's check it all works ok: `x1`, `y1`, `x2` and `y2` should all be of size `bs`  by `bptt`. The texts in each row of `x1` should continue in `x2`. `y1` and `y2` should have the same texts as their `x` counterpart, shifted of one position to the right.

In [78]:
iter_dl = iter(dl)
x1,y1 = next(iter_dl)
x2,y2 = next(iter_dl)

In [79]:
x1.size(),y1.size()

(torch.Size([64, 70]), torch.Size([64, 70]))

In [80]:
vocab = proc_num.vocab

In [81]:
" ".join(vocab[o] for o in x1[0])

'xxbos xxmaj lipstick is another glossy movie failure . i am trying to think of one good thing that i could say about the movie , and i am having trouble coming up with something . i guess the red dress that xxmaj margaux xxmaj hemingway was wearing in the end of the movie was the best part . xxmaj the writing and the script was not the worst that'

In [82]:
" ".join(vocab[o] for o in y1[0])

'xxmaj lipstick is another glossy movie failure . i am trying to think of one good thing that i could say about the movie , and i am having trouble coming up with something . i guess the red dress that xxmaj margaux xxmaj hemingway was wearing in the end of the movie was the best part . xxmaj the writing and the script was not the worst that i'

In [83]:
" ".join(vocab[o] for o in x2[0])

"i have ever encountered , but it could have been a lot better . xxmaj lipstick was very pleasing to the eye to view . xxmaj the sets were very glossy and nice to look at . xxmaj the cast was okay . i felt like xxmaj anne xxmaj bancroft 's character was the only feasible character in the entire movie . xxmaj it was sad to see xxmaj chris"

In [84]:
" ".join(vocab[o] for o in y2[0])

"have ever encountered , but it could have been a lot better . xxmaj lipstick was very pleasing to the eye to view . xxmaj the sets were very glossy and nice to look at . xxmaj the cast was okay . i felt like xxmaj anne xxmaj bancroft 's character was the only feasible character in the entire movie . xxmaj it was sad to see xxmaj chris xxmaj"

And let's prepare some convenience function to do this quickly.

In [86]:
#export
def get_lm_dls(train_ds, valid_ds, bs, bptt, **kwargs):
    return (DataLoader(LM_PreLoader(train_ds, bs, bptt, shuffle=True), batch_size=bs, **kwargs),
            DataLoader(LM_PreLoader(valid_ds, bs, bptt, shuffle=False), batch_size=2*bs, **kwargs))

def lm_databunchify(sd, bs, bptt, **kwargs):
    return DataBunch(*get_lm_dls(sd.train, sd.valid, bs, bptt, **kwargs))

In [87]:
bs,bptt = 64,70
data = lm_databunchify(ll, bs, bptt)

## Batching for classification

When we will want to tackle classification, gathering the data will be a bit different: first we will label our texts with the folder they come from, and then we will need to apply padding to batch them together. To avoid mixing very long texts with very short ones, we will also use `Sampler` to sort (with a bit of randomness for the training set) our samples by length.

First the data block API calls shold look familiar.

In [85]:
proc_cat = CategoryProcessor()

In [88]:
CategoryProcessor??

In [89]:
il = TextList.from_files(path, include=['train', 'test'])
sd = SplitData.split_by_func(il, partial(grandparent_splitter, valid_name='test'))
ll = label_by_func(sd, parent_labeler, proc_x = [proc_tok, proc_num], proc_y=proc_cat)

In [90]:
pickle.dump(ll, open(path/'ll_clas.pkl', 'wb'))

In [91]:
ll = pickle.load(open(path/'ll_clas.pkl', 'rb'))

Let's check the labels seem consistent with the texts.

In [92]:
[(ll.train.x_obj(i), ll.train.y_obj(i)) for i in [1,12552]]

[("xxbos xxmaj in a performance both volatile and graceful , xxmaj al xxmaj pacino re - teams with xxmaj sea of xxmaj love director , xxmaj harold xxmaj becker . \n\n xxmaj as xxmaj new xxmaj york xxmaj mayor xxmaj john xxmaj pappas in xxmaj city xxmaj hall . \n\n a savvy thriller that s the first film ever shot inside the lower xxmaj manhattan structure that 's ground zero for the xxmaj city 's government . \n\n xxmaj that the other nyc locations provide the vivid settings as an idealistic mayoral aide ( xxmaj john xxmaj cusack ) follows a trail of subversion and cover - up that may loop back to the man he serves and reveres . \n\n xxmaj bridget xxmaj fonda , xxmaj danny xxmaj aiello , xxmaj martin xxmaj landau , xxmaj tony xxmaj franciosa and xxmaj david xxmaj paymer add more starry brilliance to this gripping tale of power . \n\n xxmaj and the power behind power . xxeos",
  'pos'),
 ('xxbos a little girl lives with her father and brother in the middle of the countryside . xxmaj this

We saw samplers in notebook 03. For the validation set, we will simply sort the samples by length, and we begin with the longest ones for memory reasons (it's better to always have the biggest tensors first).

#### We sort so that all of the long documents are in the same batches.  This avoids you wasting gpu on going through padding of a shorter document mixed in with the longer documents. For validation set we can do a true sort. For training we want the order of our docs randomized so we do a "sortish" where we group into mega batches that are sorted and then shuffle within those

In [93]:
#export
from torch.utils.data import Sampler

class SortSampler(Sampler):
    def __init__(self, data_source, key): self.data_source,self.key = data_source,key
    def __len__(self): return len(self.data_source)
    def __iter__(self):
        return iter(sorted(list(range(len(self.data_source))), key=self.key, reverse=True))

For the training set, we want some kind of randomness on top of this. So first, we shuffle the texts and build megabatches of size `50 * bs`. We sort those megabatches by length before splitting them in 50 minibatches. That way we will have randomized batches of roughly the same length.

Then we make sure to have the biggest batch first and shuffle the order of the other batches. We also make sure the last batch stays at the end because its size is probably lower than batch size.

In [94]:
#export
class SortishSampler(Sampler):
    def __init__(self, data_source, key, bs):
        self.data_source,self.key,self.bs = data_source,key,bs

    def __len__(self) -> int: return len(self.data_source)

    def __iter__(self):
        idxs = torch.randperm(len(self.data_source))
        megabatches = [idxs[i:i+self.bs*50] for i in range(0, len(idxs), self.bs*50)]
        sorted_idx = torch.cat([tensor(sorted(s, key=self.key, reverse=True)) for s in megabatches])
        batches = [sorted_idx[i:i+self.bs] for i in range(0, len(sorted_idx), self.bs)]
        max_idx = torch.argmax(tensor([self.key(ck[0]) for ck in batches]))  # find the chunk with the largest key,
        batches[0],batches[max_idx] = batches[max_idx],batches[0]            # then make sure it goes first.
        batch_idxs = torch.randperm(len(batches)-2)
        sorted_idx = torch.cat([batches[i+1] for i in batch_idxs]) if len(batches) > 1 else LongTensor([])
        sorted_idx = torch.cat([batches[0], sorted_idx, batches[-1]])
        return iter(sorted_idx)

Padding: we had the padding token (that as an id of 1) at the end of each sequence to make them all the same size when batching them. Note that we need padding at the end to be able to use `PyTorch` convenience functions that will let us ignore that padding (see 12c).

In [95]:
#export
def pad_collate(samples, pad_idx=1, pad_first=False):
    max_len = max([len(s[0]) for s in samples])
    res = torch.zeros(len(samples), max_len).long() + pad_idx
    for i,s in enumerate(samples):
        if pad_first: res[i, -len(s[0]):] = LongTensor(s[0])
        else:         res[i, :len(s[0]) ] = LongTensor(s[0])
    return res, tensor([s[1] for s in samples])

In [96]:
bs = 64
train_sampler = SortishSampler(ll.train.x, key=lambda t: len(ll.train[int(t)][0]), bs=bs)
train_dl = DataLoader(ll.train, batch_size=bs, sampler=train_sampler, collate_fn=pad_collate)

In [97]:
iter_dl = iter(train_dl)
x,y = next(iter_dl)

In [98]:
lengths = []
for i in range(x.size(0)): lengths.append(x.size(1) - (x[i]==1).sum().item())
lengths[:5], lengths[-1]

([3311, 1782, 1577, 1399, 1371], 1016)

The last one is the minimal length. This is the first batch so it has the longest sequence, but if look at the next one that is more random, we see lengths are roughly the sames.

In [99]:
x,y = next(iter_dl)
lengths = []
for i in range(x.size(0)): lengths.append(x.size(1) - (x[i]==1).sum().item())
lengths[:5], lengths[-1]

([128, 128, 128, 127, 127], 115)

We can see the padding at the end:

In [100]:
x

tensor([[   2,    7, 4894,  ...,  754,    9,    3],
        [   2,   42,   13,  ...,  183,    9,    3],
        [   2,    7,   19,  ...,   15,    9,    3],
        ...,
        [   2,    7,  117,  ...,    1,    1,    1],
        [   2,    7,    8,  ...,    1,    1,    1],
        [   2,    7,   16,  ...,    1,    1,    1]])

And we add a convenience function:

In [101]:
#export
def get_clas_dls(train_ds, valid_ds, bs, **kwargs):
    train_sampler = SortishSampler(train_ds.x, key=lambda t: len(train_ds.x[t]), bs=bs)
    valid_sampler = SortSampler(valid_ds.x, key=lambda t: len(valid_ds.x[t]))
    return (DataLoader(train_ds, batch_size=bs, sampler=train_sampler, collate_fn=pad_collate, **kwargs),
            DataLoader(valid_ds, batch_size=bs*2, sampler=valid_sampler, collate_fn=pad_collate, **kwargs))

def clas_databunchify(sd, bs, **kwargs):
    return DataBunch(*get_clas_dls(sd.train, sd.valid, bs, **kwargs))

In [102]:
bs,bptt = 64,70
data = clas_databunchify(ll, bs)

## Export

In [None]:
!python notebook2script.py 12_text.ipynb