# Preprocess text

In [None]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [None]:
#export
from exp.nb_11 import *

## Data

We will use the IMDB dataset.

In [None]:
path = datasets.untar_data(datasets.URLs.IMDB)

In [None]:
path.ls()

[PosixPath('/home/ubuntu/.fastai/data/imdb/test'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/tmp_clas'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/README'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/unsup'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/train'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/tmp_lm'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/imdb.vocab'),
 PosixPath('/home/ubuntu/.fastai/data/imdb/ld.pkl')]

An ItemList that will read the texts in the corresponding filenames.

In [None]:
#export
def read_file(fn):
    "Return the full contents of the text file `fn`, decoded as UTF-8."
    with open(fn, mode='r', encoding='utf8') as handle:
        contents = handle.read()
    return contents
    
class TextList(ItemList):
    "An `ItemList` of text files: `get` returns the content of the file an item points to."
    @classmethod
    def from_files(cls, path, extensions=None, recurse=True, include=None, **kwargs):
        "Build the list from the files found under `path` with a suffix in `extensions` (`.txt` by default)."
        exts = {'.txt'} if extensions is None else extensions
        fnames = get_files(path, exts, recurse=recurse, include=include)
        return cls(fnames, path, **kwargs)

    def get(self, i):
        "Read the file when `i` is a `Path`; any other kind of item is returned unchanged."
        return read_file(i) if isinstance(i, Path) else i

In [None]:
il = TextList.from_files(path, include=['train', 'test', 'unsup'])

In [None]:
len(il.items)

100000

For text classification, we will split by the grandparent folder as before, but for language modeling, we take all the texts and just put 10% aside.

In [None]:
#export
@classmethod
def _split_by_rand_pct(cls, il, pct=0.2):
    "Randomly put a fraction `pct` of the items of `il` in the validation set and the rest in the training set."
    n_items = len(il.items)
    shuffled = np.random.permutation(range(n_items))
    n_valid = int(pct * n_items)
    # The first `n_valid` shuffled indices go to validation, the remainder to training
    valid_idx, train_idx = shuffled[:n_valid], shuffled[n_valid:]
    train = il.new(il[train_idx])
    valid = il.new(il[valid_idx])
    return cls(train, valid)

# Attach the function to `SplitData` so it can be called as `SplitData.split_by_rand_pct(il, pct)`
SplitData.split_by_rand_pct = _split_by_rand_pct

In [None]:
sd = SplitData.split_by_rand_pct(il, pct=0.1)

When we do language modeling, the labels are inferred from the text itself during training, so there is no need to label the data. The training loop still expects labels, however, so we need to add dummy ones. There is also a lot to do in terms of preprocessing the inputs!

### Tokenizing

We need to tokenize the dataset first. We will use a processor for this, in conjunction with the [spacy library](https://spacy.io/).

In [None]:
#export
import spacy,html

Before even tokenizing, we will apply a bit of preprocessing on the texts to clean them up:

In [None]:
#export
# Special tokens used by the preprocessing rules. BOS is spelled "xxbos"
# (beginning of stream), matching the `xx*` naming of the other tokens.
BOS, EOS = "xxbos", "xxeos"            # markers put at the start / end of each tokenized text
UNK, PAD = "xxunk", "xxpad"            # unknown-token marker / padding token
TK_REP, TK_WREP = "xxrep", "xxwrep"    # character-repetition / word-repetition markers
TK_UP, TK_MAJ = "xxup", "xxmaj"        # all-caps / capitalized-word markers

def sub_br(t):
    "Replace every HTML line-break tag (`<br>`, `<br/>`, `<br />`, in any case) in `t` by a newline."
    return re.sub(r'<\s*br\s*/?>', "\n", t, flags=re.IGNORECASE)

def spec_add_spaces(t):
    "Surround each `/` and `#` found in `t` with one space on both sides."
    special_chars = re.compile(r'([/#])')
    return special_chars.sub(r' \1 ', t)

def rm_useless_spaces(t):
    "Collapse every run of two or more spaces in `t` into a single space."
    multi_space = re.compile(' {2,}')
    return multi_space.sub(' ', t)

def replace_rep(t):
    "Replace a non-space character repeated 4 or more times in a row by ` TK_REP <count> <char> `."
    def _encode(match):
        char, extra = match.group(1), match.group(2)
        # `extra` holds the repeats that follow the first occurrence, hence the +1
        return f' {TK_REP} {len(extra)+1} {char} '
    return re.sub(r'(\S)(\1{3,})', _encode, t)
    
def replace_wrep(t):
    "Replace a word repeated 4 or more times in a row by ` TK_WREP <count> <word> `."
    def _encode(match):
        # `word` is the first occurrence together with the non-word characters that follow it
        word, extra = match.group(1), match.group(2)
        # `extra` holds the following repeats; counting its words and adding 1 gives the total
        return f' {TK_WREP} {len(extra.split())+1} {word} '
    return re.sub(r'(\b\w+\W+)(\1{3,})', _encode, t)

def fixup(x):
    "Clean leftovers of html/escaped sequences in `x`, then collapse runs of spaces."
    # (old, new) pairs, applied one after the other in this exact order
    substitutions = [
        ('#39;', "'"), ('amp;', '&'), ('#146;', "'"), ('nbsp;', ' '), ('#36;', '$'),
        ('\\n', "\n"), ('quot;', "'"), ('<br />', "\n"), ('\\"', '"'), ('<unk>', UNK),
        (' @.@ ', '.'), (' @-@ ', '-'), ('\\', ' \\ '),
    ]
    for old, new in substitutions:
        x = x.replace(old, new)
    # Decode the remaining html entities before squeezing repeated spaces
    unescaped = html.unescape(x)
    return re.sub(r'  +', ' ', unescaped)
    
# Rules applied in this order to the raw text, before tokenization
default_pre_rules = [fixup, replace_rep, replace_wrep, spec_add_spaces, rm_useless_spaces, sub_br]
# Special tokens, placed at the front of the vocab in this order.
# UNK must come first: NumericalizeProcessor looks tokens up in a `defaultdict(int)`,
# so any out-of-vocabulary token gets index 0, which has to be the unknown token.
# EOS is listed too so that every special token has a guaranteed vocab entry.
default_spec_tok = [UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ]

In [None]:
#export
def replace_all_caps(x):
    "Lowercase the ALL CAPS tokens of `x` (two characters or more) and insert `TK_UP` before each of them."
    out = []
    for tok in x:
        is_all_caps = tok.isupper() and len(tok) > 1
        # Other tokens are passed through untouched
        out.extend([TK_UP, tok.lower()] if is_all_caps else [tok])
    return out

def deal_caps(x):
    "Lowercase every token of `x`, inserting `TK_MAJ` before the ones that were Capitalized; empty tokens are dropped."
    out = []
    for tok in x:
        if tok != '':
            # Capitalized = upper-case first character followed by a lower-case remainder
            capitalized = tok[0].isupper() and len(tok) > 1 and tok[1:].islower()
            if capitalized: out.append(TK_MAJ)
            out.append(tok.lower())
    return out

def add_eos_bos(x):
    "Return the token list `x` with `BOS` prepended and `EOS` appended."
    with_bos = [BOS] + x
    return with_bos + [EOS]

# Rules applied in this order to the token list, after tokenization.
# `replace_all_caps` has to run before `deal_caps`: `deal_caps` lowercases every token,
# so running it first would leave no ALL CAPS token to mark with TK_UP.
default_post_rules = [replace_all_caps, deal_caps, add_eos_bos]

In [None]:
#export
from spacy.symbols import ORTH

class TokenizeProcessor(Processor):
    """Turn texts into lists of string tokens with a blank spacy pipeline.

    `pre_rules` are applied to each raw text before tokenization and `post_rules`
    to each list of tokens afterwards (defaults: `default_pre_rules` / `default_post_rules`).
    `process` handles the items `chunksize` texts at a time.
    """
    def __init__(self, lang="en", chunksize=5000, pre_rules=None, post_rules=None): 
        self.chunksize = chunksize
        self.tokenizer = spacy.blank(lang)
        # Register the special tokens as tokenizer special cases (one ORTH entry each)
        for w in default_spec_tok:
            self.tokenizer.tokenizer.add_special_case(w, [{ORTH: w}])
        self.pre_rules  = default_pre_rules  if pre_rules  is None else pre_rules
        self.post_rules = default_post_rules if post_rules is None else post_rules
    
    def process(self, items): 
        "Tokenize all `items`, chunk by chunk, and return one list of string tokens per item."
        toks = []
        for i in progress_bar(range(0, len(items), self.chunksize)):
            chunk = items[i: i+self.chunksize]
            chunk = [compose(t, self.pre_rules) for t in chunk]
            # Keep only the text of each spacy token: the post rules work on strings
            docs = [[d.text for d in doc] for doc in self.tokenizer.tokenizer.pipe(chunk)]
            docs = [compose(t, self.post_rules) for t in docs]
            toks += docs
        return toks
    
    def proc1(self, item): 
        "Tokenize a single text `item` and return its list of string tokens."
        text = compose(item, self.pre_rules)
        # Extract `.text` as in `process`: the post rules call str methods (`isupper`, `lower`...)
        # that spacy `Token` objects do not provide.
        toks = [t.text for t in self.tokenizer.tokenizer(text)]
        return compose(toks, self.post_rules)
    
    def deprocess(self, toks):
        "Join each list of tokens in `toks` back into a string."
        return [self.deproc1(tok) for tok in toks]

    def deproc1(self, tok):
        "Join the tokens of one item with single spaces."
        return " ".join(tok)

### Numericalizing

In [None]:
#export
import collections

class NumericalizeProcessor(Processor):
    """Map lists of tokens to lists of integer ids, and back.

    When no `vocab` is given, it is built on the first call to `process`: the (at most)
    `max_vocab` most common tokens seen at least `min_freq` times, with the tokens of
    `default_spec_tok` moved/inserted at the front. Tokens that are not in the vocab are
    mapped to index 0 (the default of the `defaultdict(int)` lookup table).
    """
    def __init__(self, vocab=None, max_vocab=60000, min_freq=2): 
        self.vocab,self.max_vocab,self.min_freq = vocab,max_vocab,min_freq
        # Token -> index table; built right away when a vocab is supplied, so `proc1`
        # works even if `process` has never been called.
        self.otoi = None if vocab is None else self._build_otoi(vocab)
    
    @staticmethod
    def _build_otoi(vocab):
        "Build the token -> index lookup for `vocab`; unknown tokens default to 0."
        return collections.defaultdict(int, {v:k for k,v in enumerate(vocab)})
    
    def process(self, items):
        "Numericalize every item of `items`, building the vocab first if there is none yet."
        #The vocab is defined on the first use.
        if self.vocab is None:
            freq = collections.Counter(p for o in items for p in o)
            self.vocab = [o for o,c in freq.most_common(self.max_vocab) if c >= self.min_freq]
            # Put the special tokens at the front, in the order of `default_spec_tok`
            for o in reversed(default_spec_tok):
                if o in self.vocab: self.vocab.remove(o)
                self.vocab.insert(0, o)
        # (Re)build the lookup whenever it is missing, whether the vocab was just computed,
        # passed to the constructor, or assigned on the instance afterwards.
        if getattr(self, 'otoi', None) is None:
            self.otoi = self._build_otoi(self.vocab)
        return [self.proc1(o) for o in items]

    def proc1(self, item):
        "Return the list of indices of the tokens in `item`."
        return [self.otoi[o] for o in item]
    
    def deprocess(self, idxs):
        "Convert each list of indices in `idxs` back to a list of tokens."
        assert self.vocab is not None
        return [self.deproc1(idx) for idx in idxs]

    def deproc1(self, idx):
        "Return the tokens corresponding to the indices in `idx`."
        return [self.vocab[i] for i in idx]

In [None]:
# Give every text the dummy label 0 (the training loop expects labels) and run the
# inputs through tokenization, then numericalization.
ll = label_by_func(sd, lambda x: 0, proc_x = [TokenizeProcessor(), NumericalizeProcessor()])

In [None]:
# Cache the processed data; the `with` block makes sure the file is flushed and closed
with open(path/'ld.pkl', 'wb') as f: pickle.dump(ll, f)

In [None]:
# Reload the cached data. NOTE: only unpickle files you created yourself,
# as loading a pickle can execute arbitrary code.
with open(path/'ld.pkl', 'rb') as f: ll = pickle.load(f)

### Batching

We have a bit of work to convert our `LabelList` into a `DataBunch` as we don't just want batches of IMDB reviews. We want to stream through all the texts concatenated. We also have to prepare the targets, which are the next words in the text.

In [None]:
#export
class LanguageModelPreLoader():
    """Dataset over the concatenation of all the texts in `data.x`.

    The token stream is cut into `bs` rows of `n_batch` tokens each. Item `idx` is a pair
    (input, target) of windows of up to `bptt` tokens taken from row `idx % bs`, the target
    being the input shifted by one token.
    """
    def __init__(self, data, bs=64, bptt=70, shuffle=False):
        self.data = data
        self.bs = bs
        self.bptt = bptt
        self.shuffle = shuffle
        n_tokens = sum(len(t) for t in data.x)
        # Length of each of the `bs` rows; tokens that do not fit are dropped in `batchify`
        self.n_batch = n_tokens // bs
        self.batchify()

    def __len__(self):
        # One token per row is kept aside because targets are shifted by one position
        n_windows = (self.n_batch - 1) // self.bptt
        return n_windows * self.bs

    def __getitem__(self, idx):
        row = idx % self.bs
        start = (idx // self.bs) * self.bptt
        stop = start + self.bptt
        source = self.batched_data[row]
        return source[start:stop], source[start+1:stop+1]

    def batchify(self):
        "Concatenate the texts (in random order if `shuffle`) and reshape the stream to `bs` rows."
        texts = self.data.x
        if self.shuffle:
            order = torch.randperm(len(texts))
            texts = texts[order]
        stream = torch.cat([tensor(t) for t in texts])
        usable = self.n_batch * self.bs
        self.batched_data = stream[:usable].view(self.bs, self.n_batch)

In [None]:
dl = DataLoader(LanguageModelPreLoader(ll.valid, shuffle=True), batch_size=64)

In [None]:
iter_dl = iter(dl)
x1,y1 = next(iter_dl)
x2,y2 = next(iter_dl)

In [None]:
x1.size(),y1.size()

(torch.Size([64, 70]), torch.Size([64, 70]))

In [None]:
vocab = ll.train.x.processors[1].vocab

In [None]:
" ".join(vocab[o] for o in x1[0])

"xxbox xxmaj when watching a xxmaj bug 's xxmaj life for the first time in a long while , i could n't help but see the comparisons with last year 's xxmaj happy xxmaj feet . xxmaj as far as the main storyline goes , they are very similar , an outcast doing what he can to fit in while also attempting to be special . xxmaj it just goes"

In [None]:
" ".join(vocab[o] for o in x2[0])

"to show you how much better that film could have been without its liberal diatribe conclusion . a lot of people disagree with me when i say that i really like xxmaj pixar 's sophomore effort . xxmaj sure it does n't manage to capture the splendor of xxmaj toy xxmaj story , nor is the animation out of this world . xxmaj however , the story is top -"

In [None]:
" ".join(vocab[o] for o in y1[0])

"xxmaj when watching a xxmaj bug 's xxmaj life for the first time in a long while , i could n't help but see the comparisons with last year 's xxmaj happy xxmaj feet . xxmaj as far as the main storyline goes , they are very similar , an outcast doing what he can to fit in while also attempting to be special . xxmaj it just goes to"

In [None]:
#export
def get_lm_dls(train_ds, valid_ds, bs, bptt, **kwargs):
    "Return the (training, validation) language-model `DataLoader`s; `kwargs` are passed to both."
    # Training: texts are shuffled before being concatenated, batches of `bs` items
    train_pl = LanguageModelPreLoader(train_ds, bs, bptt, shuffle=True)
    train_dl = DataLoader(train_pl, batch_size=bs, **kwargs)
    # Validation: texts kept in order, batches twice as large
    valid_pl = LanguageModelPreLoader(valid_ds, bs, bptt, shuffle=False)
    valid_dl = DataLoader(valid_pl, batch_size=2*bs, **kwargs)
    return train_dl, valid_dl

In [None]:
#export
def lm_databunchify(sd, bs, bptt, **kwargs):
    "Wrap the language-model dataloaders built from `sd.train` and `sd.valid` in a `DataBunch`."
    train_dl, valid_dl = get_lm_dls(sd.train, sd.valid, bs, bptt, **kwargs)
    return DataBunch(train_dl, valid_dl)

In [None]:
bs,bptt = 64,70
data = lm_databunchify(ll, bs, bptt)

## Export

In [None]:
!python notebook2script.py 12_text.ipynb

Converted 12_text.ipynb to exp/nb_12.py
