In [9]:
import sys, os
from gensim.corpora import WikiCorpus
from tqdm import tqdm
import pickle
import pandas as pd
import numpy as np
import regex
import lxml.etree as ET
from nltk.tokenize import sent_tokenize
import glob
from bs4 import BeautifulSoup
import random
from collections import Counter

### Wikinews

In [2]:
def clean_text(text):
    """Strip wiki/HTML markup from ``text`` and normalise it to lower-case plain text.

    The substitutions run in a fixed order (later ones rely on earlier ones
    having already removed their markup):

    1. external links ``[http... label]`` are reduced to ``label``; bare
       ``[http...]`` links are dropped,
    2. ``<ref>...</ref>`` blocks, remaining HTML tags and ``&name;`` entities
       are removed,
    3. ``{{...}}`` / ``{...}`` markup, the target part of ``[[target|label``
       links and ``[[prefix:...]]`` media links are removed,
    4. runs of 5, 3 and 2 apostrophes (bold/italic markers) are removed,
    5. every character that is not a space, CR, LF, Latin-script letter,
       digit, ``-``, ``'``, ``.``, ``?`` or ``!`` becomes a space,
    6. the text is lower-cased and runs of spaces are squeezed to one.

    Parameters
    ----------
    text : str
        Raw wiki markup.

    Returns
    -------
    str
        The cleaned text. Line breaks are preserved.
    """
    # All patterns are raw strings: sequences such as ``\[``, ``\p`` or ``\d``
    # are invalid escapes in a normal string literal and trigger a
    # SyntaxWarning on recent Python versions.
    text = regex.sub(r"\[http[^]]+? ([^]]+)]", r"\1", text)  # labelled external link -> label
    text = regex.sub(r"\[http[^]]+]", "", text)  # bare external link
    text = regex.sub(r"(?s)<ref>.+?</ref>", "", text)  # remove reference links
    text = regex.sub(r"(?s)<[^>]+>", "", text)  # remove html tags
    text = regex.sub(r"&[a-z]+;", "", text)  # remove html entities
    text = regex.sub(r"(?s){{.+?}}", "", text)  # remove markup tags
    text = regex.sub(r"(?s){.+?}", "", text)  # remove markup tags
    text = regex.sub(r"(?s)\[\[([^]]+\|)", "", text)  # remove link target strings
    text = regex.sub(r"(?s)\[\[([^]]+\:.+?]])", "", text)  # remove media links

    # Longest apostrophe runs first, so a 5-run is not consumed as 3 + 2.
    text = regex.sub(r"[']{5}", "", text)  # remove italic+bold symbols
    text = regex.sub(r"[']{3}", "", text)  # remove bold symbols
    text = regex.sub(r"[']{2}", "", text)  # remove italic symbols

    # Keep only Latin letters, digits, line breaks and basic punctuation.
    text = regex.sub(r"[^ \r\n\p{Latin}\d\-'.?!]", " ", text)
    text = text.lower()

    text = regex.sub(r"[ ]{2,}", " ", text)  # Squeeze spaces.
    return text

In [3]:
def token_adder(sentence):
    """Wrap ``sentence`` in sentence-boundary markers.

    Returns the string ``<s> {sentence} </s>``: the start marker, a space,
    the sentence, a space and the end marker.
    """
    pieces = ('<s>', sentence, '</s>')
    return ' '.join(pieces)

In [37]:
# Collect boundary-marked sentences from every Wikinews XML dump file.
result_text = list()
fs = glob.glob('../data/raw/wikinews/*.xml')
ns = "{http://www.mediawiki.org/xml/export-0.10/}" # namespace
for f in tqdm(fs):
    # Stream the <text> elements instead of loading a whole dump into memory.
    for _, elem in ET.iterparse(f, tag=ns+"text"):
        try:
            running_text = elem.text
            if running_text is None:
                # Empty <text/> element: nothing to extract.
                continue
            # Keep only the part before the first "===" heading marker.
            running_text = running_text.split("===")[0]
            running_text = clean_text(running_text)
            paras = running_text.split("\n")
            for para in paras:
                # Only paragraphs longer than 200 characters are used.
                if len(para) > 200 :
                    # Split into sentences and strip trailing ., ! and ? runs.
                    sents = [regex.sub(r"([.!?]+$)", "", sent) for sent in sent_tokenize(para.strip())]
                    result_text.extend(list(map(token_adder, sents)))
        except Exception:
            # Best effort: skip pages that fail to process. Unlike a bare
            # ``except`` this lets KeyboardInterrupt/SystemExit through.
            # NOTE(review): a systematic failure (e.g. missing tokenizer data)
            # would be skipped silently for every page and leave result_text empty.
            continue
        finally:
            # Always release the element, including on the skip paths above;
            # a ``continue`` before this call used to leave it in memory.
            elem.clear() # We need to save memory!

100%|██████████| 10/10 [03:25<00:00, 19.75s/it]


In [38]:
# Drop the temporaries left behind by the Wikinews parsing loop; result_text is kept.
# Raises NameError if one of them was never bound (e.g. `sents` when no paragraph
# exceeded the length filter).
del f, fs, elem, running_text, paras, para, sents

In [42]:
# Convert to an array so the splits can be selected with index lists.
# dtype=object stores references to the existing Python strings; the default
# fixed-width unicode dtype would pad every sentence to the length of the
# longest one and can use far more memory on a large corpus.
result_text = np.array(result_text, dtype=object)

In [43]:
# Sample 70% of the sentence positions (without replacement) for the training split.
# The population is range(len(result_text)): the previous `len(...) - 1` bound
# meant the last sentence could never be drawn for training.
# A dedicated Random instance with a fixed seed makes the split reproducible
# without changing the global `random` state.
train_index = random.Random(42).sample(range(len(result_text)), int(len(result_text) * 0.7))

In [44]:
def create_word_vocab(text, count_threshold=50, out_path="../data/wikinews/word_vocab.pkl"):
    """Build word<->index lookup tables from ``text`` and pickle them to ``out_path``.

    Tokens are obtained by whitespace-splitting each sentence. The vocabulary
    starts with the two special entries ``<EMP>`` (index 0) and ``<UNK>``
    (index 1), followed by every word whose count is strictly greater than
    ``count_threshold``, in order of first appearance.

    Parameters
    ----------
    text : iterable of str
        The sentences to count words in.
    count_threshold : int, default 50
        A word is kept only when it occurs more than this many times.
    out_path : str, default "../data/wikinews/word_vocab.pkl"
        Destination of the pickled ``(word2idx, idx2word)`` tuple.

    Returns
    -------
    None
        The result is only written to disk.
    """
    # Count sentence by sentence rather than joining the whole corpus into one
    # giant string first; the resulting counts and their order are the same.
    word2cnt = Counter()
    for sentence in text:
        word2cnt.update(sentence.split())

    vocab = ["<EMP>", "<UNK>"] + [word for word, cnt in word2cnt.items() if cnt > count_threshold]
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx2word = {idx: word for idx, word in enumerate(vocab)}

    # The context manager guarantees the file is flushed and closed.
    with open(out_path, "wb") as fout:
        pickle.dump((word2idx, idx2word), fout)


In [45]:
# Build and pickle the vocabulary from all extracted sentences, i.e. the word
# counts include sentences that end up in the test split as well.
create_word_vocab(result_text)

In [46]:
# The test split is every position that was not sampled into train_index:
# np.delete removes the train positions from 0..len(result_text)-1.
test_index = np.delete(np.arange(len(result_text)), train_index)

In [47]:
# Write the training split, one sentence per line.
# The encoding is explicit because the cleaned text can contain any
# Latin-script letter (accented ones included); the platform default
# encoding may be unable to represent them.
with open('../data/wikinews/wiki_train.txt', 'w', encoding='utf-8') as f :
    for sentence in result_text[train_index] :
        f.write(sentence + '\n')
        

In [48]:
# Write the test split, one sentence per line.
# The encoding is explicit because the cleaned text can contain any
# Latin-script letter (accented ones included); the platform default
# encoding may be unable to represent them.
with open('../data/wikinews/wiki_test.txt', 'w', encoding='utf-8') as f :
    for sentence in result_text[test_index] :
        f.write(sentence + '\n')
    

In [49]:
# The sentences are no longer needed in memory once both split files are written.
del result_text

### Keyboard prediction data

In [31]:
# Read the keyboard-prediction test split as a list of lines (line endings kept).
with open('../data/unked-clean-dict-15k/en-sents-shuf.00.test.txt') as f:
    test_data = list(f)

In [33]:
# Read the keyboard-prediction training split as a list of lines (line endings kept).
with open('../data/unked-clean-dict-15k/en-sents-shuf.00.train.txt') as f:
    train_data = list(f)

In [34]:
# Read the keyboard-prediction validation split as a list of lines (line endings kept).
with open('../data/unked-clean-dict-15k/en-sents-shuf.00.valid.txt') as f:
    valid_data = list(f)

In [50]:
# Count whitespace-separated tokens over all three keyboard-data splits.
c = Counter()
# Despite its name, `sentence` is bound to a whole split (a list of lines) here.
for sentence in (train_data, test_data, valid_data):
    c.update(token for line in sentence for token in line.split())

In [52]:
# Vocabulary: the two special entries first, then every word counted more than 50 times.
vocab = ["<EMP>", "<UNK>"] + [word for word, cnt in c.items() if cnt > 50]
word2idx = {word:idx for idx, word in enumerate(vocab)}
idx2word = {idx:word for idx, word in enumerate(vocab)}
# Use a context manager so the pickle file is flushed and closed deterministically
# instead of leaving an anonymous open handle behind.
with open("../data/unked-clean-dict-15k/keyboard_vocab.pkl", "wb") as f:
    pickle.dump( (word2idx, idx2word), f )

In [55]:
# Free the keyboard-data splits, the counter and the vocabulary tables now that
# the vocabulary has been pickled. Raises NameError if any name is not bound.
del c, sentence, test_data, train_data, valid_data, vocab, word2idx, idx2word