## Preprocessing version 3
### Details
This version:
- uses all data
- has validation split
- uses pretrained embedding

In [1]:
import csv
import sys
from multiprocessing import Pool
from tqdm import tqdm_notebook as tqdm
import json
import os
import blingfire
from joblib import Parallel, delayed

# Special tokens shared by the tokenizer and the vocabulary.
UNK = "[UNK]"  # id of this token is the fallback for out-of-vocabulary tokens (see tokens_to_ids)
BOS = "[CLS]"  # prepended to every tokenized sentence (see tokenize)
EOS = "[SEP]"  # appended to every tokenized sentence (see tokenize)
PAD = "[PAD]"  # reserved in the vocabulary; presumably used for padding downstream - TODO confirm

def tokenize(s, omit=None):
    """Lower-case ``s``, strip unwanted substrings and split it into tokens.

    Args:
        s: The raw text to tokenize.
        omit: Optional iterable of substrings to delete from the text before
            tokenizing. Removal happens *after* lower-casing, so entries
            containing upper-case characters will never match.

    Returns:
        A list of tokens produced by ``blingfire.text_to_words``, wrapped in
        the ``BOS`` and ``EOS`` marker tokens.
    """
    s = s.lower()
    # ``None`` replaces the former mutable ``[]`` default; both mean "omit nothing".
    for t in omit or ():
        s = s.replace(t, '')
    return [BOS] + blingfire.text_to_words(s).split() + [EOS]
def tokenize_add_vocab(sents, vocab, omit=None):
    """Tokenize every sentence and accumulate token counts into ``vocab``.

    Args:
        sents: Iterable of raw text strings (wrapped in a tqdm progress bar).
        vocab: Dict mapping token -> count; it is updated in place.
        omit: Optional list of substrings forwarded to ``tokenize`` for removal.

    Returns:
        A list with one token list per input sentence, in input order.
    """
    # Normalise here (instead of a mutable ``[]`` default) so ``tokenize``
    # always receives a real list.
    omit = [] if omit is None else omit
    sent_toks = []
    for s in tqdm(sents):
        toks = tokenize(s, omit)
        for t in toks:
            vocab[t] = vocab.get(t, 0) + 1
        sent_toks.append(toks)
    return sent_toks
def tokens_to_ids(tokens, vocab):
    """Map each token to its id, using the id of ``UNK`` for unknown tokens.

    Args:
        tokens: Iterable of token strings.
        vocab: Dict mapping token -> integer id; must contain ``UNK``.

    Returns:
        A list of ids, one per token, in the same order.

    Raises:
        KeyError: If ``UNK`` is not a key of ``vocab``.
    """
    # Look the fallback id up once rather than once per token.
    unk_id = vocab[UNK]
    return [vocab.get(t, unk_id) for t in tokens]

class Processor:
    """Base class holding a labelled text dataset and its id-encoded form.

    Subclasses fill the parallel lists ``score``, ``summary`` and ``text`` and
    set ``size``. ``make_vocab_and_ids`` then builds ``vocab``,
    ``summary_ids`` and ``text_ids``; ``dump`` writes them out as JSON.
    """

    def __init__(self, threads):
        """Create an empty dataset.

        Args:
            threads: Number of worker processes for id conversion; values
                below 2 select the single-process code path.
        """
        self.score = []
        self.summary = []
        self.text = []
        # Substrings deleted from every text before tokenization.
        self.omit = ["<br />"]
        self.n_worker = threads
        self.size = 0

    def make_vocab_and_ids(self, cutoff, verbose=True):
        """Tokenize summaries and texts, build the vocabulary and encode to ids.

        Sets ``self.vocab`` (token -> id), ``self.summary_ids`` and
        ``self.text_ids``.

        Args:
            cutoff: Maximum number of vocabulary entries kept (highest counts
                first).
            verbose: When true, print progress messages.
        """
        # Seed the special tokens with a large pseudo-count so they rank high
        # in the count-sorted vocabulary and survive the cutoff.
        # NOTE(review): any token counted more than 99999 times still outranks
        # them, so their ids are not fixed - confirm that downstream code looks
        # the ids up in the dumped vocabulary instead of assuming 0/1.
        vocab = {UNK: 99999, PAD: 99999}
        summary_tok = tokenize_add_vocab(self.summary, vocab, self.omit)
        if verbose:
            print("sum tok done")
        text_tok = tokenize_add_vocab(self.text, vocab, self.omit)
        if verbose:
            print("text tok done")
            print("original vocab", len(vocab))

        # Sort by descending count (stable, so ties keep insertion order) and
        # assign ids by rank.
        ranked = sorted(vocab.items(), key=lambda x: -x[1])[:cutoff]
        vocab = {tok: i for i, (tok, _) in enumerate(ranked)}
        self.vocab = vocab

        summary_gen = tqdm(summary_tok)
        text_gen = tqdm(text_tok)

        if self.n_worker < 2:
            self.summary_ids = [tokens_to_ids(t, vocab) for t in summary_gen]
            self.text_ids = [tokens_to_ids(t, vocab) for t in text_gen]
        else:
            self.summary_ids = Parallel(n_jobs=self.n_worker, backend='multiprocessing')(
                delayed(tokens_to_ids)(tokens, vocab) for tokens in summary_gen)
            self.text_ids = Parallel(n_jobs=self.n_worker, backend='multiprocessing')(
                delayed(tokens_to_ids)(tokens, vocab) for tokens in text_gen)

        if verbose:
            print("vocab and id done")

    def __len__(self):
        """Return the number of examples recorded in ``self.size``."""
        return self.size

    def dump(self, name, vocabname):
        """Write the encoded dataset, and optionally the vocabulary, as JSON.

        Args:
            name: Output path for a JSON object with the keys ``"text"``,
                ``"summary"`` and ``"score"``.
            vocabname: Output path for the vocabulary JSON, or ``None`` to
                skip writing it.
        """
        seqdata = {"text": self.text_ids, "summary": self.summary_ids, "score": self.score}
        # Context managers guarantee the files are flushed and closed.
        with open(name, 'w') as f:
            json.dump(seqdata, f)
        if vocabname is not None:
            with open(vocabname, 'w') as f:
                json.dump(self.vocab, f)

    @staticmethod
    def readcsv(input_file, quotechar="\""):
        """Reads a comma separated value file.

        Args:
            input_file: Path of a UTF-8 encoded CSV file.
            quotechar: Character used to quote fields.

        Returns:
            A list of rows, each row being a list of string cells.
        """
        # newline="" lets the csv module handle line endings itself, so
        # newlines embedded in quoted fields are preserved verbatim.
        with open(input_file, "r", encoding="utf-8", newline="") as f:
            reader = csv.reader(f, delimiter=",", quotechar=quotechar)
            return list(reader)
        
class FoodProcessor(Processor):
    """Dataset built from a review CSV: column 6 is the score, column 9 the text."""

    def __init__(self, name, threads):
        """Read the CSV at ``name`` and fill ``score``, ``summary`` and ``text``.

        Args:
            name: Path of the CSV file; its first row is skipped.
            threads: Worker count forwarded to ``Processor``.
        """
        super().__init__(threads)
        rows = self.readcsv(name)

        for row in rows[1:]:
            # Shift the integer score down by one (presumably 1..5 stars
            # become class ids 0..4 - TODO confirm against the data).
            self.score.append(int(row[6]) - 1)
            review = row[9].strip('"')
            # NOTE(review): summary is filled from the same column as text;
            # confirm this is intended rather than a separate summary column.
            self.summary.append(review)
            self.text.append(review)
        self.size = len(self.text)
            
class IMDBProcessor(Processor):
    """Dataset built from a directory containing ``neg`` and ``pos`` sub-folders."""

    def __init__(self, dirname, threads):
        """Read every file under ``dirname/neg`` and ``dirname/pos``.

        Each file becomes one example whose text (also used as the summary)
        is the stripped file content; files in ``neg`` get score 0 and files
        in ``pos`` get score 1.

        Args:
            dirname: Directory holding the ``neg`` and ``pos`` sub-folders.
            threads: Worker count forwarded to ``Processor``.
        """
        # The base class already initialises score/text/summary/omit.
        super().__init__(threads)

        for lb, subdir in enumerate(['neg', 'pos']):
            subdir = os.path.join(dirname, subdir)
            # os.listdir returns entries in arbitrary order; sort so the
            # dumped dataset is reproducible across runs and machines.
            for entry in sorted(os.listdir(subdir)):
                fname = os.path.join(subdir, entry)
                # Explicit encoding, matching readcsv, instead of the
                # platform-dependent default.
                with open(fname, 'r', encoding='utf-8') as f:
                    t = f.read().strip()
                self.score.append(lb)
                self.text.append(t)
                self.summary.append(t)

        self.size = len(self.text)
            
class BBCProcessor(Processor):
    """Dataset built from a two-column CSV of (label, text) rows."""

    def __init__(self, name, threads):
        """Read the CSV at ``name`` and fill ``score``, ``summary`` and ``text``.

        Args:
            name: Path of the CSV file; its first row is skipped and every
                other row must have exactly two cells.
            threads: Worker count forwarded to ``Processor``.
        """
        super().__init__(threads)
        label_ids = {}
        for label, body in self.readcsv(name)[1:]:
            # Labels are numbered 0, 1, 2, ... in order of first appearance.
            class_id = label_ids.setdefault(label, len(label_ids))
            self.score.append(class_id)
            # The text doubles as the summary.
            self.summary.append(body)
            self.text.append(body)
        self.size = len(self.text)

In [2]:
# take a break

In [3]:
# Pick the dataset to preprocess; threads=1 selects the single-process id-conversion path.
#p = IMDBProcessor("data/aclImdb/train", 1)
p = FoodProcessor("/tmp2/Food/Reviews.csv", 1)

In [4]:
# Build the vocabulary (at most 100000 highest-count entries) and encode all texts as ids.
p.make_vocab_and_ids(cutoff=100000)

HBox(children=(IntProgress(value=0, max=568454), HTML(value='')))


sum tok done


HBox(children=(IntProgress(value=0, max=568454), HTML(value='')))


text tok done
original vocab 149373


HBox(children=(IntProgress(value=0, max=568454), HTML(value='')))

HBox(children=(IntProgress(value=0, max=568454), HTML(value='')))

vocab and id done


In [5]:
# Write the id sequences and scores to data.json and the token -> id vocabulary to vocab.json.
#p.dump("data/IMDB/data.json", "data/IMDB/vocab.json")
p.dump("/tmp2/Food/data.json", "/tmp2/Food/vocab.json")

In [6]:
# Spot-check one encoded example by mapping its ids back to tokens.
index = 999
vocab_inv = {token_id: token for token, token_id in p.vocab.items()}

print(list(map(vocab_inv.__getitem__, p.text_ids[index])))

['[CLS]', 'i', 'have', 'to', 'admit', ',', 'i', 'was', 'a', 'sucker', 'for', 'the', 'large', 'quantity', ',', '12', 'oz', ',', 'when', 'shopping', 'for', 'hot', 'sauces', '.', '.', '.but', 'now', 'seeing', 'the', 'size', 'of', 'the', 'bottle', ',', 'it', 'reminds', 'of', 'wing', '-', 'sauce', 'bottle', 'sizes', '.', 'plastic', 'bottle', '.', 'it', 'does', 'have', 'a', 'convenient', 'squirt', 'top', '.', 'but', 'overall', ',', 'not', 'very', 'hot', 'or', 'tasty', ',', 'and', 'made', 'mostly', 'from', 'jalape&ntilde', ';', 'os', '.', 'if', 'i', 'had', 'seen', 'the', 'ingredients', 'list', 'i', 'would', 'not', 'have', 'bought', 'it', ':', 'jalapenoswatervinegarbrown', 'sugarlime', 'juicefish', 'saucecilantrohabanerogarlicspice', 'blendsaltpotassium', 'sorbatexanthan', 'gum', '[SEP]']


In [7]:
import matplotlib.pyplot as plt

# Histogram of encoded sequence lengths (in tokens) over the whole dataset.
lens = list(map(len, p.text_ids))
plt.hist(lens, bins=50)

(array([3.49468e+05, 1.42566e+05, 4.42840e+04, 1.64700e+04, 7.25800e+03,
        3.50400e+03, 1.92500e+03, 1.08300e+03, 5.57000e+02, 3.51000e+02,
        2.14000e+02, 1.77000e+02, 2.82000e+02, 7.40000e+01, 4.40000e+01,
        3.00000e+01, 3.80000e+01, 2.90000e+01, 2.30000e+01, 1.50000e+01,
        1.10000e+01, 1.20000e+01, 1.10000e+01, 9.00000e+00, 2.00000e+00,
        5.00000e+00, 3.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        4.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 3.00000e+00]),
 array([   5. ,   85.4,  165.8,  246.2,  326.6,  407. ,  487.4,  567.8,
         648.2,  728.6,  809. ,  889.4,  969.8, 1050.2, 1130.6, 1211. ,
        1291.4, 1371.8, 1452.2, 1532.6, 1613. , 1693.4, 1773.8, 1854.2,
        1934.6, 2015. , 2095.4, 2175.8, 2256.2, 2336

In [8]:
# Mean and standard deviation of the encoded sequence lengths (BOS/EOS tokens included).
import numpy as np
print(np.mean(lens), np.std(lens))

96.33421701668033 94.46542534643673
