In [None]:
# Imports
import pickle
import numpy as np
# Seed NumPy's global RNG immediately after importing it, before the remaining
# imports run. No visible cell draws random numbers, so this presumably serves
# code outside this section — TODO confirm it is still needed.
rng_seed = 399
np.random.seed(rng_seed)
from os.path import join as join_path
from utils import clean_sents  # sentence-cleaning helper; not defined in this notebook
from nltk.tokenize import sent_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json
from tqdm.auto import tqdm

## Download and preprocess data
We download the raw enwik9 data from http://mattmahoney.net/dc/textdata.html. We then apply the `wikifil.pl` Perl script, as specified in [Appendix A](http://mattmahoney.net/dc/textdata.html#appendixa), to convert the raw Wikipedia dump into clean text. We have made a minor modification to the last part of the Perl script so that the period "." character is preserved, which lets us split the text into sentences later (from `tr/a-z/ /cs;` to `tr/a-z\./ /cs;`).

In [None]:
# One-off download and preprocessing of the enwik9 dump into data/fil9.
# The flag is left False so that a normal run skips these shell steps; set it
# to True when data/fil9 has not been generated yet.
# NOTE(review): data/wikifil.pl is not fetched by this cell — it must already be
# in place, including the modified `tr` line that keeps periods.
data_preprocessing = False
if data_preprocessing:
    
    # Download the raw data
    # (`-c` continues a partially downloaded file, `-P data` saves into data/)
    !wget -c http://mattmahoney.net/dc/enwik9.zip -P data
    # Extract the archive into data/
    !unzip data/enwik9.zip -d data

    # A raw Wikipedia dump contains a lot of HTML / XML data.
    # We pre-process it with the wikifil.pl script
    # (originally developed by Matt Mahoney, and can be found on his website).
    !perl data/wikifil.pl data/enwik9 > data/fil9

    # Sanity checking first words of new file
    # (prints the first 2000 bytes of data/fil9)
    !head -c 2000 data/fil9

In [None]:
# Paths of the dataset and of the files derived from it
# -----------------------------------------------------
data_dir = 'data'


def _data_file(filename):
    """Return the path of `filename` inside `data_dir`."""
    return join_path(data_dir, filename)


# Text produced by the wikifil.pl preprocessing step
fil9_data_path = _data_file('fil9')
# Tokenizer configuration (JSON)
fil9_data_tokenizer_config_path = _data_file('fil9-tokenizer.json')
# Pickled cleaned sentences
fil9_data_sentences_path = _data_file('fil9-sents.p')
# Pickled integer sequences
fil9_data_sequences_path = _data_file('fil9-seqs.p')
# -----------------------------------------------------

## Clean dataset
To clean the dataset, we first use `sent_tokenize` from [NLTK](https://www.nltk.org/api/nltk.tokenize.html) to split the dataset into sentences. Due to computational restrictions, it is more convenient to work with individual sentences than with one large text. To further clean the sentences, we convert the words to lowercase, apply lemmatization, filter out stopwords and remove words that are one character long.  

**TODO**: Elaborate on lemmatization, stopwords etc. Why are they used? This also applies to other terms later.

In [None]:
# Read the whole preprocessed corpus into memory. The encoding is passed
# explicitly so that decoding does not depend on the platform's locale default
# (the wikifil.pl output is presumably plain ASCII, which UTF-8 decodes unchanged).
with open(fil9_data_path, 'r', encoding='utf-8') as file:
    fil9_content = file.read()

# Split the corpus into sentences with NLTK
fil9_sents = sent_tokenize(fil9_content)

# Remove stop words and lemmatize
fil9_sents = clean_sents(fil9_sents, verbose=True)

# Preview the first ten cleaned sentences
fil9_sents[:10]

In [None]:
# Pickle the cleaned fil9 sentences to disk
with open(fil9_data_sentences_path, mode='wb') as sents_file:
    pickle.dump(fil9_sents, sents_file)

## Create vocabulary

In [None]:
# Create the vocabulary from the cleaned sentences, or reload a saved one.
# Set `fit_tokenizer` to False to read the tokenizer configuration from
# `fil9_data_tokenizer_config_path` instead of fitting it again.
fit_tokenizer = True
if fit_tokenizer:
    print('Creating vocabulary...')

    # Here we set the filters to empty string and lower case to false
    # because we have already performed the necessary preprocessing steps.
    tokenizer = Tokenizer(filters='', lower=False)
    tokenizer.fit_on_texts(tqdm(fil9_sents, unit='text'))

    # Save to file; the JSON is written as UTF-8 explicitly rather than with
    # the platform's locale-dependent default encoding.
    with open(fil9_data_tokenizer_config_path, 'w', encoding='utf-8') as file:
        file.write(tokenizer.to_json())
else:
    print('Reading vocabulary...')

    # Read tokenizer from file, using the same explicit encoding as the writer
    with open(fil9_data_tokenizer_config_path, 'r', encoding='utf-8') as file:
        tokenizer = tokenizer_from_json(file.read())
print('Done!')

# Number of entries in the tokenizer's word -> index mapping.
# NOTE(review): Keras word indices start at 1 (index 0 is reserved), so anything
# sized from this value may need `vocab_size + 1` — confirm against downstream use.
vocab_size = len(tokenizer.word_index)

In [None]:
# Convert every cleaned sentence into a sequence using the fitted tokenizer
# (tqdm only wraps the input to show progress)
fil9_seqs = tokenizer.texts_to_sequences(
    tqdm(fil9_sents)
)

In [None]:
# Sanity check: look up two words in the vocabulary and display their indices
# (a word missing from `word_index` raises KeyError)
tuple(tokenizer.word_index[word] for word in ('man', 'woman'))

In [None]:
# Pickle the fil9 sequences to disk
with open(fil9_data_sequences_path, mode='wb') as seqs_file:
    pickle.dump(fil9_seqs, seqs_file)