## Preprocessing version 3
### Details
This version:
- uses all data
- has a validation split
- uses pretrained embeddings

In [1]:
# Input corpus files, read line by line: articles (documents) and titles (summaries).
doc_name = "data/train.article.txt"
summ_name = "data/train.title.txt"
# Output files: word -> id vocabulary (JSON), encoded training and validation
# sequences (JSON), and the embedding matrix (file name passed to np.save).
vocab_name = "vocab.json"
data_seq_name = "train_seq.json"
valid_seq_name = "valid_seq.json"
matrixname = "wv_matrix"
# A word enters the vocabulary only if its corpus count is strictly greater than this.
min_count = 7
# Worker count for the tokenization Pool and for FastText; a value <= 1 selects the serial tokenization path.
num_threads = 6
# Embedding dimensionality (FastText `size` and the width of the embedding matrix).
n_dim = 300
# Upper bound on the number of lines kept from the corpus.
corpus_size = 3803957
# Fraction of the corpus sampled for the validation set.
validation_split = 0.01
# Only referenced by the commented-out model.train(...) call.
epochs = 0

In [2]:
import os
import csv
import json
import numpy as np
from nltk import word_tokenize as tokenize
from tqdm import tqdm_notebook as tqdm
from multiprocessing import Pool
from gensim.models.callbacks import CallbackAny2Vec

In [None]:
def getlines(name):
    """Return the number of newline characters in the file `name`.

    This matches what `wc -l` reports (a final line without a trailing
    newline is not counted) but is done in pure Python, so it does not
    depend on IPython's `!` shell syntax, on `wc` being installed, or on
    the path being safe to interpolate into a shell command.

    Raises OSError (e.g. FileNotFoundError) if the file cannot be opened.
    """
    count = 0
    with open(name, 'rb') as f:
        # Read in 1 MiB chunks so large corpora are never fully loaded in memory.
        for chunk in iter(lambda: f.read(1 << 20), b''):
            count += chunk.count(b'\n')
    return count

In [None]:
# Load the raw corpus into memory: every line of each file becomes one
# whitespace-stripped string. The line count is only used to size the
# progress bar.
summaries = []
articles = []

with open(doc_name, newline='', encoding='utf-8') as doc_file:
    n_doc_lines = getlines(doc_name)
    articles.extend(raw.strip() for raw in tqdm(doc_file, total=n_doc_lines))

with open(summ_name, newline='', encoding='utf-8') as summ_file:
    n_summ_lines = getlines(summ_name)
    summaries.extend(raw.strip() for raw in tqdm(summ_file, total=n_summ_lines))

In [None]:
# Keep at most `corpus_size` (summary, article) pairs. Both lists are cut to
# the same length so that index i refers to the same pair in each of them,
# even when the two input files do not have the same number of lines.
n_pairs = min(corpus_size, len(summaries), len(articles))
summaries = summaries[:n_pairs]
articles = articles[:n_pairs]

In [None]:
def task(s):
    """Tokenize one line and wrap the tokens in <bos> / <eos> markers."""
    return ['<bos>'] + tokenize(s) + ['<eos>']


# Replace the raw strings in `summaries` / `articles` with token lists.
# Both branches must do this: the cells that follow iterate over
# `summaries` and `articles` expecting lists of tokens, not strings.
if num_threads > 1:
    with Pool(num_threads) as p:
        chunksize = 200
        summaries = list(tqdm(p.imap(task, summaries, chunksize=chunksize), total=len(summaries)))
        articles = list(tqdm(p.imap(task, articles, chunksize=chunksize), total=len(articles)))
else:
    summaries = [task(s) for s in tqdm(summaries)]
    articles = [task(s) for s in tqdm(articles)]

# Combined corpus (summaries first, then articles) used to build the vocabulary.
data = summaries + articles

In [None]:
# Number of summaries; reused by later cells as the number of pairs.
num_summ = len(summaries)
# Spot-check one tokenized (summary, document) pair. The index is clamped so
# that a corpus with fewer than 100000 pairs does not raise IndexError.
index = min(99999, num_summ - 1)
if num_summ > 0:
    print("[summary]", summaries[index])
    print("[documen]", articles[index])

In [None]:
from gensim.models import FastText
# Build a FastText model over the tokenized corpus. Only build_vocab() is run
# here; the training call is left commented out.
fasttext_params = dict(
    size=n_dim,
    window=5,
    min_count=1,  # presumably keeps every token so its count can be inspected via model.wv.vocab
    workers=num_threads,
)
model = FastText(**fasttext_params)
model.build_vocab(data)
# total_examples = model.corpus_count
# print(total_examples)
# logger = EpochLogger()
# model.train(data, total_examples=total_examples, epochs=epochs, callbacks=[logger])

In [None]:
# Number of entries in the model's vocabulary (the model was built with min_count=1).
VOCAB_SZ = len(model.wv.vocab)
print(VOCAB_SZ)

In [None]:
# Spot-check a single vocabulary entry. .get() is used so that a corpus that
# does not contain the probe word prints None instead of raising KeyError.
print(model.wv.vocab.get('isabelle'))

In [None]:
# Assign consecutive ids (0, 1, 2, ...) to the words whose corpus count is
# strictly greater than min_count, in the iteration order of model.wv.vocab.
# NOTE(review): the comparison is `>`, so words seen exactly min_count times
# are dropped -- confirm this is intended.
frequent_words = (word for word, entry in model.wv.vocab.items() if entry.count > min_count)
vocab = {word: idx for idx, word in enumerate(frequent_words)}
print(len(vocab))

In [None]:
# Append the special tokens after all corpus words. The membership check keeps
# this cell idempotent: without it, re-running the cell would assign the same
# new id to both '<unk>' and '<pad>'.
for special in ('<unk>', '<pad>'):
    if special not in vocab:
        vocab[special] = len(vocab)

# A context manager guarantees the file is flushed and closed after writing.
with open(vocab_name, "w") as vocab_file:
    json.dump(vocab, vocab_file)

In [None]:
# Encode every token as its vocabulary id and route each (summary, document)
# pair to either the training set or the validation set.
seqdata = {'summary': [], 'document': []}
valseqdata = {'summary': [], 'document': []}
# num_summ = len(data) / 2

# Draw the validation indices WITHOUT replacement so the validation set has
# exactly int(validation_split * num_summ) distinct pairs, and keep them in a
# set so the per-example membership test is O(1) instead of a linear scan.
n_val = int(validation_split * num_summ)
val_set = set(np.random.permutation(num_summ)[:n_val].tolist())

# Tokens missing from the vocabulary are mapped to the '<unk>' id.
unk_id = vocab["<unk>"]

# The same index set is applied to both corpora so pairs stay aligned.
for key, corpus in (('summary', summaries), ('document', articles)):
    for i, words in tqdm(enumerate(corpus), total=num_summ):
        seq = [vocab.get(w, unk_id) for w in words]
        if i in val_set:
            valseqdata[key].append(seq)
        else:
            seqdata[key].append(seq)
print(len(seqdata['document']), len(valseqdata['document']))

In [None]:
# Write the encoded training and validation sequences as JSON. Context
# managers ensure each file is flushed and closed once it has been written.
with open(data_seq_name, "w") as train_file:
    json.dump(seqdata, train_file)
with open(valid_seq_name, "w") as valid_file:
    json.dump(valseqdata, valid_file)

In [None]:
# Collect the length (number of ids) of every training sequence stored under `key`.
key = 'document'
lengths = [len(sequence) for sequence in seqdata[key]]

In [None]:
import matplotlib.pyplot as plt
# Histogram of sequence lengths, 50 bins, titled after the plotted key.
fig, ax = plt.subplots()
ax.hist(lengths, bins=50)
ax.set_title(key +' length')
plt.show()

In [None]:
# take a break

### Word Embedding

In [4]:
import numpy as np
import json
from gensim.models import FastText
# Path of the pretrained FastText binary model.
pretrained = 'data/cc.en.300.bin'
# NOTE: this overrides the `vocab_name` set in the configuration cell; the
# vocabulary is read from a different directory than the one it was written to.
vocab_name = "preprocessing-300d-all/vocab.json"
pmodel = FastText.load_fasttext_format(pretrained)
# Context manager so the vocabulary file handle is closed after loading.
with open(vocab_name, "r") as vocab_file:
    vocab = json.load(vocab_file)

In [5]:
def l2_normd_np(a):
    """Return `a` divided by np.linalg.norm(a).

    When the norm is exactly zero the division is skipped and `0 * a`
    (zeros with the same shape as `a`) is returned instead.
    """
    norm = np.linalg.norm(a)
    if norm == 0:
        return 0 * a
    return a / norm
# One embedding row per vocabulary entry.
VOCAB_SZ = len(vocab)

# Words for which the pretrained model raised KeyError.
oovs = []
# Start from uniform noise in [-0.1, 0.1); every row indexed by a vocabulary id
# is overwritten in the loop below.
wv_matrix = (np.random.rand(VOCAB_SZ, n_dim) - 0.5) / 5.0
for token, row in tqdm(vocab.items()):
    try:
        embedding = pmodel.wv[token]
    except KeyError:
        # No pretrained vector: record the word and fall back to fresh noise.
        oovs.append(token)
        embedding = (np.random.rand(n_dim) - 0.5) / 5.0
    # Rows are stored normalized via l2_normd_np.
    wv_matrix[row] = l2_normd_np(embedding)

print("done.")


HBox(children=(IntProgress(value=0, max=98058), HTML(value='')))


done.


In [6]:
# Report how many vocabulary words had no pretrained vector, then list them.
print(len(oovs), oovs, sep='\n')

157
['\\/', 'omv', 'gkn', 'wfp', 'ebrd', 'o.j', 'kfw', 'vsel', 'dpj', 'saez', 'wbf', 'ypf', '\\*', 'zew', 'ktg', 'ufj', 'kuok', 'paok', 'kfa', 'f\\/a', 'johl', 'md-', 'ctbt', 'usx', 'asml', 'rhj', 'vtb', '-bln', 'khd', 'gfk', 'g.i', 'nbg', 'kdd', 'cnh', 'txu', 'os\\/', 'seol', 'k+s', 'j.m', 'fivb', 'p-i', 'ifad', 'cbgb', 'ibsa', 'xwb', 'ldv', 'd.r', 'n.j', 'e.u', '_', 'k.t', 'luol', 'c.k', 'hjk', 'fasb', 'g.e', 'k.c', 'ksb', 'mihm', 'vsb', 'w.h', 'efp', 'f.w', 'vagn', 'fisu', 'j.s', 'wvt', 'mjib', 'ttwb', 'epzs', 'ofws', 'lafd', 'wnbf', 'cplp', 'jvp', 'mko', 'cpifa', 'wtca', 'hkia', 'zoc', 'csic', 'k.l', 'rvf', '``', '`', 'smmt', 'iipf', 'nzc', 'dgb', 'vonk', 'fuad', 'l.k', 'zbc', 'k.r', 'paek', 'agca', 'frmf', 'dgac', 'c.m', 'rthk', 'gatx', 'eqt', 'mv-', "''", 'icbl', 'm.r', 'rizk', 'csat', 'tfg', 'spdc', 'vvd', 'ldk', 'qunu', 'zccm', 'ecz', 'kc-', 'zib', 'zta', 'wpk', 'ljr', 'ibge', 'kcb', 'gtz', 'y.d', 'pvv', 'rfef', 'npf', 't.d', '__', 'r-pa', 'ep-', 'y.e', 'i.k', 'c.r', 'zec', 'f.

In [7]:
# Persist the embedding matrix; np.save appends ".npy" to the file name if it is missing.
np.save(matrixname, wv_matrix)