In [None]:
# Reload imported modules automatically before each cell runs, so edits to them are picked up live.
%load_ext autoreload
%autoreload 2

In [None]:
#export
from nb_007 import *
import pandas as pd, re, spacy, html, os
from spacy.symbols import ORTH
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

# IMDB

## Data

### Reading the texts

In [None]:
# Dataset root plus the two output folders written by the cells below
# (`clas` and `lm`); both are created up front if missing.
PATH = Path('../data/aclImdb/')
CLAS_PATH, LM_PATH = PATH/'clas', PATH/'lm'
for out_dir in (CLAS_PATH, LM_PATH):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# Special marker tokens: BOS is prepended to every text, FLD introduces a numbered text field.
BOS, FLD = 'xxbos', 'xxfld'

In [None]:
# Sub-folder names; the position in this list is the integer label given to the texts inside.
CLASSES = ['neg', 'pos', 'unsup']

def get_texts(path):
    """
    Read every file matching `*.*` in `path/<class>` for each class in `CLASSES`.

    Returns a pair of numpy arrays `(texts, labels)` where `labels[i]` is the index
    in `CLASSES` of the folder `texts[i]` was read from.
    """
    texts,labels = [],[]
    for idx,label in enumerate(CLASSES):
        for fname in (path/label).glob('*.*'):
            # Close each handle right away: this loop opens one file per review.
            with fname.open('r', encoding='utf8') as f: texts.append(f.read())
            labels.append(idx)
    return np.array(texts),np.array(labels)

# The `train` folder feeds the training arrays, the `test` folder the validation arrays.
(train_texts, train_labels), (valid_texts, valid_labels) = (get_texts(PATH/split) for split in ('train', 'test'))

In [None]:
# One random ordering per split (train first, then valid), applied to texts and labels in the next cell.
train_idx, valid_idx = (np.random.permutation(len(texts)) for texts in (train_texts, valid_texts))

In [None]:
# Reorder texts and labels with the same index array so each text keeps its label.
train_texts, train_labels = (arr[train_idx] for arr in (train_texts, train_labels))
valid_texts, valid_labels = (arr[valid_idx] for arr in (valid_texts, valid_labels))

In [None]:
# Column order is fixed to (labels, text): the CSVs are written without a header below.
train_df = pd.DataFrame({'labels': train_labels, 'text': train_texts}, columns=['labels','text'])
valid_df = pd.DataFrame({'labels': valid_labels, 'text': valid_texts}, columns=['labels','text'])

We set aside the reviews from the `unsup` folder (label 2) when building the classification data, but keep them for fine-tuning the language model.

In [None]:
# Drop rows whose label is 2 (the index of 'unsup' in CLASSES) from the classifier training set.
train_df.loc[train_df['labels'].ne(2)].to_csv(CLAS_PATH/'train.csv', index=False, header=False)
valid_df.to_csv(CLAS_PATH/'valid.csv', index=False, header=False)

In [None]:
# Pool every text, shuffle, and hold out the first 10% of the shuffled order for validation.
all_texts = np.concatenate([train_texts, valid_texts])
idx = np.random.permutation(len(all_texts))
cut = int(0.1 * len(idx))
# The label column is a constant 0 placeholder in both frames.
valid_df = pd.DataFrame({'labels': [0] * cut, 'text': all_texts[idx[:cut]]}, columns=['labels','text'])
train_df = pd.DataFrame({'labels': [0] * (len(idx) - cut), 'text': all_texts[idx[cut:]]}, columns=['labels','text'])

In [None]:
# Write both language-model frames as header-less, index-less CSV files.
for frame, target in ((train_df, LM_PATH/'train.csv'), (valid_df, LM_PATH/'valid.csv')):
    frame.to_csv(target, header=False, index=False)

### Tokenization + Numericalization

In [None]:
def partition(a, sz):
    """Cut the sliceable `a` into consecutive pieces of length `sz` (the last piece may be shorter)"""
    pieces = []
    for start in range(0, len(a), sz):
        pieces.append(a[start:start+sz])
    return pieces

def partition_by_cores(a, n_cpus):
    "Split `a` into consecutive pieces of size `len(a)//n_cpus + 1`, which yields at most `n_cpus` pieces"
    chunk_sz = len(a)//n_cpus + 1
    return partition(a, chunk_sz)

def num_cpus():
    "Number of CPUs this process may use; always an int >= 1"
    try:
        return len(os.sched_getaffinity(0))
    except AttributeError:
        # `os.sched_getaffinity` is not available on every platform, and `os.cpu_count()`
        # returns None when the count cannot be determined: fall back to a single CPU
        # so callers can safely do arithmetic on the result.
        return os.cpu_count() or 1

In [None]:
class SpacyTokenizer():
    "Thin adapter that exposes a spaCy pipeline's tokenizer as plain lists of strings"

    def __init__(self, lang):
        # Load the spaCy pipeline registered under the name `lang`.
        self.tok = spacy.load(lang)

    def tokenizer(self, t):
        "Tokenize the string `t` and return the text of each token"
        doc = self.tok.tokenizer(t)
        return [token.text for token in doc]

    def add_special_cases(self, toks):
        "Register each string in `toks` as a tokenizer special case whose only token is the string itself"
        for special in toks:
            self.tok.tokenizer.add_special_case(special, [{ORTH: special}])

In [None]:
# Smoke test: tokenize the text column (column 1) of the first row of the LM training CSV.
test_tok = SpacyTokenizer('en')
train_df = pd.read_csv(LM_PATH/'train.csv', header=None, chunksize=10)
trn_df = next(train_df)
first_row = trn_df.iloc[0]
test_txt = first_row[1]
test_tok.tokenizer(test_txt)

In [None]:
class Tokenizer():
    "Applies text `rules` and then a `tok_fn` tokenizer to collections of texts, optionally over several processes"

    def __init__(self, tok_fn=SpacyTokenizer, lang:str='en', rules:Collection[Callable[[str],str]]=None, 
                 special_cases:Collection[str]=None, n_cpus = None):
        self.tok_fn = tok_fn
        self.lang = lang
        self.special_cases = special_cases
        self.rules = rules or []
        # Rules that expose a `compile` method get a chance to prepare themselves once.
        for r in self.rules:
            if hasattr(r, 'compile'): r.compile()
        # Default to half of the available CPUs when `n_cpus` is not given (or is 0).
        self.n_cpus = n_cpus or num_cpus()//2

    def proc_text(self, t, tok):
        "Run `t` through every rule in order, then tokenize the result with `tok`"
        text = t
        for rule in self.rules:
            text = rule(text)
        return tok.tokenizer(text)

    def process_all_1thread(self, texts):
        "Tokenize `texts` in the current process with a freshly built tokenizer"
        tok = self.tok_fn(self.lang)
        if self.special_cases:
            tok.add_special_cases(self.special_cases)
        return [self.proc_text(text, tok) for text in texts]

    def process_all(self, texts):
        "Tokenize `texts`, splitting the work over `n_cpus` processes when `n_cpus` > 1"
        if self.n_cpus <= 1: return self.process_all_1thread(texts)
        chunks = partition_by_cores(texts, self.n_cpus)
        with ProcessPoolExecutor(self.n_cpus) as pool:
            per_chunk = pool.map(self.process_all_1thread, chunks)
            # Flatten the per-chunk results back into one list, in input order.
            return [doc for chunk_docs in per_chunk for doc in chunk_docs]

In [None]:
def sub_br(t):
    "Turn every HTML line-break tag (`<br>`, `<br/>`, `<BR />`, ...) in `t` into a newline character"
    return re.sub(r'<\s*br\s*/?>', "\n", t, flags=re.IGNORECASE)

def spec_add_spaces(t):
    "Surround every `/` and `#` in `t` with one space on each side"
    special_chars = re.compile(r'([/#])')
    return special_chars.sub(r' \1 ', t)

def rm_useless_spaces(t):
    "Collapse each run of two or more spaces in `t` into a single space"
    multi_space = re.compile(' {2,}')
    return multi_space.sub(' ', t)

def replace_rep(t):
    "Replace a non-space character repeated four or more times in a row by ` xxrep <count> <char> `"
    TK_REP = 'xxrep'
    def _replace_rep(m):
        # group 1 is the first occurrence, group 2 all the following ones
        char, rest = m.group(1), m.group(2)
        return f' {TK_REP} {len(rest)+1} {char} '
    return re.sub(r'(\S)(\1{3,})', _replace_rep, t)
    
def replace_wrep(t):
    "Replace a word (with its trailing separator) repeated four or more times in a row by ` xxwrep <count> <word> `"
    def _replace_wrep(m):
        TK_WREP = 'xxwrep'
        c,cc = m.groups()
        # `cc` is `c` repeated verbatim, so the length ratio is the exact number of extra
        # repetitions. Splitting `cc` on whitespace miscounts whenever the separator is not
        # a single run of whitespace (e.g. 'ha-ha-ha-ha-' or 'a - a - a - a - ').
        return f' {TK_WREP} {len(cc)//len(c)+1} {c} '
    re_wrep = re.compile(r'(\b\w+\W+)(\1{3,})')
    return re_wrep.sub(_replace_wrep, t)

def deal_caps(t):
    "Lowercase `t`, putting `xxup` before each all-uppercase chunk longer than two characters; chunks are joined by spaces"
    TOK_UP = 'xxup'
    pieces = []
    # Alternating runs of word and non-word characters, in order of appearance.
    for chunk in re.findall(r'\w+|\W+', t):
        if chunk.isupper() and len(chunk) > 2:
            pieces.append(TOK_UP)
        pieces.append(chunk.lower())
    return ' '.join(pieces)

def fixup(x):
    "Repair a fixed list of text artefacts in `x`, unescape HTML entities, then collapse runs of spaces"
    # Applied strictly in this order; each pair is (text to find, replacement).
    replacements = [('#39;', "'"), ('amp;', '&'), ('#146;', "'"), ('nbsp;', ' '), ('#36;', '$'),
                    ('\\n', "\n"), ('quot;', "'"), ('<br />', "\n"), ('\\"', '"'), ('<unk>','u_n'),
                    (' @.@ ','.'), (' @-@ ','-'), ('\\', ' \\ ')]
    for old, new in replacements:
        x = x.replace(old, new)
    return re.sub(r'  +', ' ', html.unescape(x))
    
# Order matters. `fixup` runs first so that entities such as `#39;` or `&#39;` are decoded
# before `spec_add_spaces` puts spaces around `#` (after which they can no longer be matched).
# `rm_useless_spaces` runs a second time at the end to collapse the extra spaces inserted by
# `replace_rep`, `replace_wrep` and `deal_caps`.
rules = [fixup, sub_br, spec_add_spaces, rm_useless_spaces, replace_rep, replace_wrep, deal_caps,
         rm_useless_spaces]

In [None]:
# Quick check: each of the two `<br />` tags should come back as a newline.
sub_br('end <br /><br /> begins again')

In [None]:
# Quick check: `#` should end up surrounded by spaces.
# Raw string: `\#` is not a valid escape sequence in a normal string literal (Python warns about it).
spec_add_spaces(r'\#%')

In [None]:
# Quick check: the long run of spaces should collapse to a single space.
rm_useless_spaces('this             is')

In [None]:
# Quick check: the six consecutive 'f' should be reported as a repetition of 6.
replace_rep('ffffffive .')

In [None]:
# Quick check: 'five ' appears four times in a row, so a word repetition of 4 is expected.
replace_wrep('five five five five .')

In [None]:
# Quick check: an all-caps word longer than two characters becomes 'xxup' plus its lowercase form.
deal_caps('ANGRY')

In [None]:
def get_total_length(csv_name, chunksize):
    "Count how many chunks of `chunksize` rows the header-less CSV `csv_name` is read in (not the number of rows)"
    reader = pd.read_csv(csv_name, header=None, chunksize=chunksize)
    return sum(1 for _ in reader)

In [None]:
class TextDataset():
    """
    Put a train.csv and valid.csv files in a folder and this will take care of the rest.

    The CSV files have no header; the first `n_labels` columns hold integer labels and every
    remaining column is a text field. Intermediate results are cached next to the CSVs:
    `{name}_tok.npy` / `{name}_lbl.npy` (tokens and labels), `{name}_ids.npy` (token ids) and
    `itos.pkl` (the vocabulary, index -> token). Existing cache files are reused.

    After construction `itos`, `train_ids` and `valid_ids` are available as attributes.
    """
    
    def __init__(self, path, tokenizer, max_vocab=30000, chunksize=10000, train_name='train', valid_name='valid',
                 min_freq=2, n_labels=1):
        self.path,self.tokenizer,self.max_vocab,self.min_freq = Path(path),tokenizer,max_vocab,min_freq
        self.chunksize,self.train_name,self.valid_name,self.n_labels = chunksize,train_name,valid_name,n_labels
        os.makedirs(self.path/'tmp', exist_ok=True)
        names = [self.train_name, self.valid_name]
        id_files = [self.path/f'{name}_ids.npy' for name in names] + [self.path/'itos.pkl']
        if not all(os.path.isfile(fname) for fname in id_files):
            tok_files = [self.path/f'{name}_tok.npy' for name in names]
            # Only re-tokenize when the token cache is incomplete; numericalizing is always redone.
            if not all(os.path.isfile(fname) for fname in tok_files):
                self.tokenize()
            self.numericalize()
        # NOTE: pickle (here and via `allow_pickle`) must only be used on cache files this class
        # wrote itself; never point `path` at a folder containing untrusted files.
        with open(self.path/'itos.pkl', 'rb') as f: self.itos = pickle.load(f)
        # Honour the configured split names, and allow object arrays (documents have different lengths).
        self.train_ids = np.load(self.path/f'{self.train_name}_ids.npy', allow_pickle=True)
        self.valid_ids = np.load(self.path/f'{self.valid_name}_ids.npy', allow_pickle=True)
    
    @staticmethod
    def _to_object_array(seqs):
        "Pack `seqs` into a 1-D object array holding one Python list per sequence, whatever their lengths"
        arr = np.empty(len(seqs), dtype=object)
        for i,s in enumerate(seqs): arr[i] = list(s)
        return arr
    
    def tokenize(self):
        "Tokenize both CSV files chunk by chunk and save `{name}_tok.npy` and `{name}_lbl.npy`"
        print('Tokenizing the texts. This might take a while so you should grab a coffee.')
        for name in [self.train_name, self.valid_name]:
            print(f'Tokenizing {name}')
            # First pass only counts the chunks so the progress bar knows its total.
            curr_len = get_total_length(self.path/f'{name}.csv', self.chunksize)
            dfs = pd.read_csv(self.path/f'{name}.csv', header=None, chunksize=self.chunksize)
            tokens,labels = [],[]
            for _ in progress_bar(range(curr_len), leave=False):
                df = next(dfs)
                lbls = df.iloc[:,range(self.n_labels)].values.astype(np.int64)
                # Text fields are numbered from 1: the first text column is field 1, the next field 2, ...
                texts = f'\n{BOS} {FLD} 1 ' + df[self.n_labels].astype(str)
                for i in range(self.n_labels+1, len(df.columns)): 
                    texts += f' {FLD} {i-self.n_labels+1} ' + df[i].astype(str)
                tokens += self.tokenizer.process_all(texts)
                # One row of `n_labels` integers per text.
                labels += list(lbls)
            np.save(self.path/f'{name}_tok.npy', self._to_object_array(tokens))
            np.save(self.path/f'{name}_lbl.npy', np.array(labels))
        
    def numericalize(self):
        "Build the vocabulary from the training tokens and save `itos.pkl` and `{name}_ids.npy`"
        print('Changing tokens to numbers.')
        train_tokens = np.load(self.path/f'{self.train_name}_tok.npy', allow_pickle=True)
        freq = Counter(p for o in train_tokens for p in o)
        # Keep the `max_vocab` most common tokens that occur strictly more than `min_freq` times.
        itos = [o for o,c in freq.most_common(self.max_vocab) if c > self.min_freq]
        itos.insert(0, 'xxpad')
        itos.insert(0, 'xxunk')
        # Index 0 is 'xxunk', so any token missing from the vocabulary maps to it.
        stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
        with open(self.path/'itos.pkl', 'wb') as f: pickle.dump(itos, f)
        for name in [self.train_name, self.valid_name]:
            toks = np.load(self.path/f'{name}_tok.npy', allow_pickle=True)
            ids = self._to_object_array([[stoi[w] for w in s] for s in toks])
            np.save(self.path/f'{name}_ids.npy', ids)

In [None]:
# The marker tokens are registered as special cases so the tokenizer treats each as one token.
tokenizer = Tokenizer(
    special_cases=[BOS, FLD, 'xxunk', 'xxpad'],
    rules=rules,
)

In [None]:
# Build (or load from cache) the tokenized and numericalized language-model dataset.
data = TextDataset(path=LM_PATH, tokenizer=tokenizer, chunksize=10000)