In [1]:
import sys, os
from gensim.corpora import WikiCorpus
from tqdm import tqdm
import pickle
import pandas as pd
import numpy as np
import regex
import lxml.etree as ET
from nltk.tokenize import sent_tokenize
import glob
from bs4 import BeautifulSoup
import random
from collections import Counter

### Wikinews

In [2]:
def clean_wiki(text):
    """Strip wiki/HTML markup from ``text`` and return lowercase plain text.

    The substitutions are applied strictly in the order written; several of
    them depend on the result of the previous one, so do not reorder them.

    All patterns are raw strings: the previous non-raw literals relied on
    invalid escape sequences, which newer Python versions warn about.

    Args:
        text: Raw wiki markup of one page (str).

    Returns:
        The cleaned, lowercased string. Newlines are preserved; every
        character that is not a space, CR/LF, Latin-script letter, digit or
        one of ``- ' . ? !`` is replaced by a space, and runs of spaces are
        squeezed to one.
    """
    text = regex.sub(r"\[http[^]]+? ([^]]+)]", r"\1", text)  # [url label] -> label
    text = regex.sub(r"\[http[^]]+]", "", text)  # drop bare [url] links
    text = regex.sub(r"(?s)<ref>.+?</ref>", "", text)  # remove reference links
    text = regex.sub(r"(?s)<[^>]+>", "", text)  # remove html tags
    text = regex.sub(r"&[a-z]+;", "", text)  # remove html entities
    text = regex.sub(r"(?s){{.+?}}", "", text)  # remove markup tags
    text = regex.sub(r"(?s){.+?}", "", text)  # remove markup tags
    # Remove link target strings. The greedy [^]]+ runs up to the LAST '|'
    # before a ']', so "[[a|b|c]]" becomes "c]]".
    text = regex.sub(r"(?s)\[\[([^]]+\|)", "", text)
    # Remove media links. Runs after the rule above, so it only sees
    # "[[...:...]]" links that contained no '|'.
    text = regex.sub(r"(?s)\[\[([^]]+\:.+?]])", "", text)

    # Longest quote run first so ''''' is not consumed piecemeal.
    text = regex.sub(r"[']{5}", "", text)  # remove italic+bold symbols
    text = regex.sub(r"[']{3}", "", text)  # remove bold symbols
    text = regex.sub(r"[']{2}", "", text)  # remove italic symbols

    # Blank out everything outside the allowed character set
    # (this also removes leftover brackets such as "]]").
    text = regex.sub(r"[^ \r\n\p{Latin}\d\-'.?!]", " ", text)
    text = text.lower()

    text = regex.sub(r"[ ]{2,}", " ", text)  # Squeeze spaces.
    return text

In [3]:
def token_adder(sentence):
    """Wrap ``sentence`` in sentence-boundary markers.

    Args:
        sentence: The sentence text (str).

    Returns:
        ``sentence`` prefixed with the start marker and suffixed with the end
        marker, each separated from the text by one space.
    """
    start_marker, end_marker = '<s> ', ' </s>'
    return start_marker + sentence + end_marker

In [4]:
# Collect marker-wrapped sentences from every <text> element of the Wikinews
# XML dumps. Each page is truncated at its first "===" and cleaned with
# clean_wiki; only paragraphs longer than 200 characters are sentence-split.
result_text = list()
fs = glob.glob('../data/raw/wikinews/*.xml')
ns = "{http://www.mediawiki.org/xml/export-0.10/}" # namespace
for f in tqdm(fs):
    for _, elem in ET.iterparse(f, tag=ns+"text"):
        try:
            running_text = elem.text
            if running_text is None:
                # Empty <text/> element: nothing to extract.
                continue
            running_text = running_text.split("===")[0]
            running_text = clean_wiki(running_text)
            paras = running_text.split("\n")
            for para in paras:
                if len(para) > 200 :
                    # Strip the trailing sentence punctuation before wrapping in markers.
                    sents = [regex.sub("([.!?]+$)", "", sent) for sent in sent_tokenize(para.strip())]
                    result_text.extend(list(map(token_adder, sents)))
        except Exception:
            # Best effort: skip pages that fail to process. Unlike a bare
            # `except:`, this does not swallow KeyboardInterrupt/SystemExit,
            # so the long-running loop can still be interrupted.
            # NOTE(review): setup errors raised per page (e.g. missing
            # tokenizer data) are skipped too and would leave result_text empty.
            continue
        finally:
            # Runs on success, on the skip paths and on errors alike, so a
            # failing element is released as well.
            elem.clear() # We need to save memory!

100%|██████████| 10/10 [02:56<00:00, 17.69s/it]


In [5]:
# Drop the temporaries left behind by the extraction loop; `result_text` and
# `ns` are kept. Raises NameError if any name was never bound (e.g. `sents`
# is only assigned once a paragraph longer than 200 characters was seen).
del f, fs, elem, running_text, paras, para, sents

In [6]:
def build_dataset(text, maximum_length):
    """Encode a token stream as fixed-width rows of vocabulary ids.

    Ids are assigned as: 0 -> '<unk>', 1 -> '<pad>', then every distinct
    token of ``text`` by descending frequency. A token in ``text`` that is
    literally '<unk>' or '<pad>' reuses the reserved id instead of overwriting
    it (overwriting used to hand the same id to two different words and broke
    the id <-> position correspondence of the returned vocabulary).

    Tokens are accumulated until a '</s>' token closes the sentence. A closed
    sentence longer than ``maximum_length`` (counting every token, markers
    included) is dropped; otherwise it is right-padded with the '<pad>' id.
    Tokens after the last '</s>' are discarded.

    Args:
        text: Re-iterable sequence of tokens (it is traversed twice).
        maximum_length: Row width / maximum number of tokens per sentence.

    Returns:
        A 4-tuple ``(data, seq_lengths, count, ordered_words)``:
        ``data`` is ``np.array`` of the kept, padded sentences;
        ``seq_lengths`` is ``np.array`` of their unpadded lengths;
        ``count`` is ``[['<unk>', -1], ['<pad>', -1]]`` followed by the
        ``(token, frequency)`` pairs from ``Counter.most_common()``;
        ``ordered_words`` is the vocabulary, where position equals id.
    """
    from collections import Counter

    unk_token, pad_token, end_token = '<unk>', '<pad>', '</s>'

    count = [[unk_token, -1], [pad_token, -1]]
    count.extend(Counter(text).most_common())

    dictionary = {}
    for word, _ in count:
        # setdefault keeps the first (reserved) id if the corpus itself
        # contains a token spelled like one of the special tokens.
        dictionary.setdefault(word, len(dictionary))
    pad_id = dictionary[pad_token]

    data = []
    seq_lengths = []
    sentence_data = []
    for word in tqdm(text):
        # Every token of `text` is in the vocabulary by construction, so no
        # out-of-vocabulary fallback is needed here.
        sentence_data.append(dictionary[word])
        if word == end_token:
            if len(sentence_data) <= maximum_length:
                seq_lengths.append(len(sentence_data))
                sentence_data.extend([pad_id] * (maximum_length - len(sentence_data)))
                data.append(sentence_data)
            # Start a fresh sentence whether the previous one was kept or dropped.
            sentence_data = []

    return np.array(data), np.array(seq_lengths), count, list(dictionary.keys())


In [7]:
# Flatten the marker-wrapped sentences into one whitespace-separated token
# stream and encode it; sentences longer than 30 tokens (markers included)
# are dropped by build_dataset.
data, seq_lengths, count, ordered_words = build_dataset(
    text=' '.join(result_text).split(),
    maximum_length=30,
)

100%|██████████| 80734990/80734990 [00:39<00:00, 2040692.14it/s]


In [8]:
# The raw sentence strings are no longer referenced below; drop the name so
# the list can be reclaimed.
del result_text

In [9]:
# Pick 70% of the row positions for training. The population is
# range(len(data)) -- the previous range(len(data) - 1) could never select the
# last row -- and a dedicated seeded generator makes the split reproducible
# across kernel restarts without touching the global `random` state.
train_index = random.Random(42).sample(range(len(data)), int(len(data) * 0.7))

In [10]:
# The test rows are every row position that was not sampled for training,
# in ascending order.
test_index = np.setdiff1d(np.arange(len(data)), train_index)

In [11]:
# The output directory is not the one the raw dumps were read from, so make
# sure it exists before the first artefact is written.
os.makedirs('../data/wikinews', exist_ok=True)
# Persist the vocabulary list returned by build_dataset.
with open('../data/wikinews/word_vocab.pkl', 'wb') as f :
    pickle.dump(ordered_words, f)

In [12]:
# Persist the training split as a two-item list:
# [padded id rows selected by train_index, their unpadded lengths].
with open('../data/wikinews/wiki_train.pkl', 'wb') as f :
    pickle.dump([data[train_index], seq_lengths[train_index]], f)     

In [13]:
# Persist the test split in the same two-item layout as the training file:
# [padded id rows selected by test_index, their unpadded lengths].
with open('../data/wikinews/wiki_test.pkl', 'wb') as f :
    pickle.dump([data[test_index], seq_lengths[test_index]], f)

In [14]:
# Release the Wikinews arrays and vocabulary; the same names are rebound for
# the next dataset below. train_index / test_index are left bound.
del data, seq_lengths, count, ordered_words

### Keyboard prediction data

In [2]:
# Start the keyboard-prediction corpus with the lines of the train shard
# (line endings are kept, exactly as readlines() would return them).
with open('../data/wordprediction/en-sents-shuf.00.train.txt', 'r') as f :
    temp_data = list(f)

In [3]:
# Append the lines of the validation shard to the same list.
with open('../data/wordprediction/en-sents-shuf.00.valid.txt', 'r') as f :
    temp_data += f.readlines()

In [4]:
# Append the lines of the test shard as well; the combined list is re-split
# further down.
with open('../data/wordprediction/en-sents-shuf.00.test.txt', 'r') as f :
    temp_data += f.readlines()

In [7]:
# Join all lines into one whitespace-separated token stream and encode it;
# sentences longer than 30 tokens are dropped by build_dataset.
# NOTE(review): build_dataset only closes a sentence on a '</s>' token --
# presumably these files already contain the markers; confirm.
data, seq_lengths, count, ordered_words = build_dataset(
    text=' '.join(temp_data).split(),
    maximum_length=30,
)

100%|██████████| 193939466/193939466 [01:42<00:00, 1890862.44it/s]


In [19]:
# Pick 70% of the row positions for training. The population is
# range(len(data)) -- the previous range(len(data) - 1) could never select the
# last row -- and a dedicated seeded generator makes the split reproducible
# across kernel restarts without touching the global `random` state.
train_index = random.Random(42).sample(range(len(data)), int(len(data) * 0.7))

In [20]:
# The test rows are every row position that was not sampled for training,
# in ascending order.
test_index = np.setdiff1d(np.arange(len(data)), train_index)

In [21]:
# Persist the vocabulary list returned by build_dataset for this corpus.
with open('../data/wordprediction/word_vocab.pkl', 'wb') as f :
    pickle.dump(ordered_words, f)

In [22]:
# Persist the training split as a two-item list:
# [padded id rows selected by train_index, their unpadded lengths].
with open('../data/wordprediction/wordprediction_train.pkl', 'wb') as f :
    pickle.dump([data[train_index], seq_lengths[train_index]], f)     

In [23]:
# Persist the test split in the same two-item layout as the training file:
# [padded id rows selected by test_index, their unpadded lengths].
with open('../data/wordprediction/wordprediction_test.pkl', 'wb') as f :
    pickle.dump([data[test_index], seq_lengths[test_index]], f)

In [24]:
# Release the encoded arrays and vocabulary of the keyboard-prediction corpus.
# Note that `temp_data` (the raw lines) is not deleted here and stays bound.
del data, seq_lengths, count, ordered_words