In [1]:
# Imports
import pickle
from os.path import join as join_path
from typing import Union

import numpy as np

# Seed NumPy's global RNG right away, before any of the imports below run
rng_seed = 399
np.random.seed(rng_seed)

from nltk.tokenize import sent_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tqdm.auto import tqdm

from utils import filter_word, clean_sents

[nltk_data] Downloading package stopwords to /Users/triki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/triki/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Download and preprocess data
We download the raw enwik8 data: http://mattmahoney.net/dc/textdata.html. Furthermore, we apply the wikifil.pl Perl script as specified in [Appendix A](http://mattmahoney.net/dc/textdata.html#appendixa) to parse the raw Wikipedia dump into clean text. We have made a minor adjustment to the last part of the Perl script so that the period "." character is preserved instead of being replaced by a space, which lets us split the text into sentences later (from `tr/a-z/ /cs;` to `tr/a-z\./ /cs;`).

In [2]:
# Switch for the one-off download / extract / filter step below.
# When False the whole step is skipped, so data/text8 must already exist
# for the cell that later reads it.
data_preprocessing = False
if data_preprocessing:
    
    # Download the raw data
    # (-c resumes a partial download, -P stores the archive in the data directory)
    !wget -c http://mattmahoney.net/dc/enwik8.zip -P data
    !unzip data/enwik8.zip -d data

    # A raw Wikipedia dump contains a lot of HTML / XML data.
    # We pre-process it with the wikifil.pl script
    # (originally developed by Matt Mahoney, and can be found on his website).
    # NOTE: this cell does not fetch data/wikifil.pl; it has to be in place already.
    !perl data/wikifil.pl data/enwik8 > data/text8

    # Sanity checking first words of new file
    # (prints the first 2000 bytes of the generated file)
    !head -c 2000 data/text8

In [3]:
# Define constants
# ----------------
# Directory that holds the corpus and every artifact derived from it
data_dir = 'data'

# Vocabulary size limit handed to the tokenizer
max_vocab_size = 1000

# Corpus file, tokenizer configuration and pickled sequences, all inside data_dir
text8_data_path, text8_data_tokenizer_config_path, text8_data_sequences_path = (
    join_path(data_dir, artifact_name)
    for artifact_name in ('text8', 'text8-tokenizer.json', 'text8-seqs.p')
)
# ----------------

## Clean dataset
To clean the dataset, we first use `sent_tokenize` from [NLTK](https://www.nltk.org/api/nltk.tokenize.html) to split the dataset into sentences. It is more convenient to use sentences instead of a big text, due to computational restrictions. To further clean the sentences, we convert the words to lowercase, apply lemmatization, filter out stopwords and remove words that are one character long.

In [4]:
# Read the whole pre-processed corpus into memory. The encoding is passed
# explicitly so the read does not depend on the platform's default locale.
with open(text8_data_path, 'r', encoding='utf-8') as file:
    text8_content = file.read()

# Split the corpus into sentences with NLTK's sentence tokenizer
text8_sents = sent_tokenize(text8_content)

# Remove stop words and lemmatize
text8_sents = clean_sents(text8_sents, verbose=True)

# Preview the first ten cleaned sentences
text8_sents[:10]

Cleaning sentences...


HBox(children=(FloatProgress(value=0.0, max=504749.0), HTML(value='')))




['anarchism originated term abuse first used early working class radical including digger english revolution sans culotte french revolution',
 'whilst term still used pejorative way describe act used violent mean destroy organization society also taken positive label self defined anarchist',
 'word anarchism derived greek without archons ruler chief king',
 'anarchism political philosophy belief ruler unnecessary abolished although differing interpretation mean',
 'anarchism also refers related social movement advocate elimination authoritarian institution particularly state',
 'word anarchy anarchist use imply chaos nihilism anomie rather harmonious anti authoritarian society',
 'place regarded authoritarian political structure coercive economic institution anarchist advocate social relation based upon voluntary association autonomous individual mutual aid self governance',
 'anarchism easily defined anarchist also offer positive vision believe truly free society',
 'however idea anar

Next, we use TensorFlow's `Tokenizer` class to tokenize the sentences. The tokenization process starts with calling `fit_on_texts`, which creates a vocabulary out of the top-N most common words. In our case, we are interested in the top 1000 words and fit the Tokenizer accordingly.

In [5]:
# Create vocabulary from texts
# Set to True to (re)fit the tokenizer on the cleaned sentences and save its
# configuration; leave False to reload a previously saved configuration.
fit_tokenizer = False
if fit_tokenizer:
    print('Creating vocabulary...')

    # Here we set the filters to empty string and lower case to false
    # because we have already performed the necessary preprocessing steps.
    # NOTE: per the Keras documentation, only the `num_words - 1` most common
    # words are kept when texts are converted to sequences.
    tokenizer = Tokenizer(num_words=max_vocab_size, filters='', lower=False)
    tokenizer.fit_on_texts(tqdm(text8_sents, unit='text'))

    # Save to file (explicit encoding so the file reads back the same everywhere)
    with open(text8_data_tokenizer_config_path, 'w', encoding='utf-8') as file:
        file.write(tokenizer.to_json())
    print('Done!')
else:
    print('Reading vocabulary...')

    # Read tokenizer from file
    # (`tokenizer_from_json` lives in tensorflow.keras.preprocessing.text)
    with open(text8_data_tokenizer_config_path, 'r', encoding='utf-8') as file:
        tokenizer = tokenizer_from_json(file.read())
    print('Done!')

# Effective vocabulary size: the configured cap, or the number of entries in
# the tokenizer's word index if that is smaller.
vocab_size = np.minimum(max_vocab_size, len(tokenizer.word_index))

Creating vocabulary...


HBox(children=(FloatProgress(value=0.0, max=504749.0), HTML(value='')))


Done!


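We now have a Tokenizer that has created a vocabulary for our sentences. Next, we convert the sentences into sequences of integers, where each integer is the index of a word in the vocabulary; the indices are ordered by how frequently the words occur in the sentences (lower index means more frequent). We also write the sequences to disk for easier use later on.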

In [6]:
# Turn every cleaned sentence into its sequence of integer word indices,
# with a progress bar over the sentences.
text8_seqs = tokenizer.texts_to_sequences(
    tqdm(text8_sents),
)

HBox(children=(FloatProgress(value=0.0, max=504749.0), HTML(value='')))




In [7]:
# Sanity checks: look up the vocabulary indices of two known words
tuple(tokenizer.word_index[word] for word in ('man', 'woman'))

(188, 239)

In [8]:
# Write text8 sequences to file
# (opened in binary mode, which pickle requires)
with open(text8_data_sequences_path, mode='wb') as sequences_file:
    pickle.dump(text8_seqs, sequences_file)