In [13]:
import bz2
import nltk
import numpy as np
import re
import os
import torch
from collections import Counter
from argparse import Namespace
from torch.utils.data import Dataset, DataLoader
# Fetch the NLTK "punkt" resource; presumably required by the nltk.sent_tokenize /
# nltk.word_tokenize calls used during preprocessing — TODO confirm for the installed NLTK version.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/leo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Below are some useful utility functions.

In [14]:
def collect_links(input_file, valid_links, INNER_SEP, OUTER_SEP):
    """Collect the link targets used to generate the target classes.

    A token is treated as link markup when it both starts and ends with
    ``OUTER_SEP`` and splits on ``OUTER_SEP`` into exactly two non-empty
    parts, ``<text>`` and ``<link>`` (e.g. ``|new_york|nyc|``).

    Args:
        input_file: sequence of sentences, each a sequence of string tokens.
        valid_links: container of link targets to keep (membership-tested).
        INNER_SEP: not used by this function; kept for interface compatibility.
        OUTER_SEP: single character that delimits the link markup.

    Returns:
        A tuple ``(input_file, links)``: the input object, returned unchanged,
        and a list holding one entry per accepted link occurrence, in reading
        order (duplicates are kept).
    """
    links = []
    for sentence in input_file:
        for word in sentence:
            # An empty token cannot be markup; testing it first avoids an
            # IndexError on word[0].
            if not word or word[0] != OUTER_SEP or word[-1] != OUTER_SEP:
                continue
            parts = [part for part in word.split(OUTER_SEP) if part]
            if len(parts) == 2 and parts[1] in valid_links:
                links.append(parts[1])
    return input_file, links
def encode_labels(text_corpus, link_to_idx, INNER_SEP, OUTER_SEP, default_no_link):
    """Strip link markup from the corpus and build token-aligned labels.

    A token that starts and ends with ``OUTER_SEP`` and splits on it into
    exactly two non-empty parts (``<text>``, ``<link>``) is expanded into the
    non-empty pieces of ``<text>`` split on ``INNER_SEP``; every piece is
    labelled with ``link_to_idx[<link>]`` (``default_no_link`` when the link is
    not in the mapping). Any other token is labelled ``default_no_link``;
    tokens wrapped in ``OUTER_SEP`` that are not well-formed markup have every
    ``OUTER_SEP`` removed, all remaining tokens are kept verbatim.

    Args:
        text_corpus: sequence of sentences, each a sequence of string tokens.
        link_to_idx: mapping from link target to class index.
        INNER_SEP: separator between the words inside a link's text.
        OUTER_SEP: single character that delimits the link markup.
        default_no_link: class index used for tokens without a known link.

    Returns:
        A tuple ``(labels, sentences)`` of two lists of lists;
        ``labels[i][j]`` is the class index of the token ``sentences[i][j]``.
    """
    labels = []
    sentences = []
    for sentence in text_corpus:
        _label = []
        _sentence = []
        for word in sentence:
            # An empty token cannot be markup; testing it first avoids an
            # IndexError on word[0].
            is_markup = bool(word) and word[0] == OUTER_SEP and word[-1] == OUTER_SEP
            _split = [part for part in word.split(OUTER_SEP) if part] if is_markup else []
            if len(_split) == 2:
                text, link = _split
                # Every word of the link text shares the same target class.
                link_idx = link_to_idx.get(link, default_no_link)
                for sub_word in text.split(INNER_SEP):
                    if sub_word:
                        _label.append(link_idx)
                        _sentence.append(sub_word)
            else:
                # Plain token, or malformed markup whose separators are dropped.
                _label.append(default_no_link)
                _sentence.append(word.replace(OUTER_SEP, '') if is_markup else word)
        labels.append(_label)
        sentences.append(_sentence)
    return labels, sentences

def pad_input(sentences, seq_len, pad_token):
    """Pad or trim every sentence of a batch to exactly ``seq_len`` entries.

    Args:
        sentences: sequence of sentences, each a sequence of numeric token ids.
        seq_len: number of columns of the returned array.
        pad_token: value written to the positions past the end of a sentence.

    Returns:
        Array of shape ``(len(sentences), seq_len)``; row ``i`` holds the first
        ``seq_len`` ids of ``sentences[i]`` followed by ``pad_token`` padding.
    """
    padded = np.ones((len(sentences), seq_len)) * pad_token
    # Iterating a 2-D array yields row views, so writes land in `padded`.
    for row, tokens in zip(padded, sentences):
        kept = np.array(tokens)[:seq_len]
        row[:len(kept)] = kept
    return padded

The following cell is responsible for preprocessing input files into sentences, creating the vocabularies and vectorizing them into useful information for the model.

In [15]:
INNER_SEP = '_'
OUTER_SEP = '|'

# Read input files; the context managers close the compressed streams once the text is decoded
with bz2.BZ2File('../input_data/train.txt.bz2') as compressed:
    train_file = compressed.read().decode('utf-8')
with bz2.BZ2File('../input_data/test.txt.bz2') as compressed:
    test_file = compressed.read().decode('utf-8')

# Tokenize sentences using NLTK
train_file = [[word.lower() for word in nltk.word_tokenize(sentence)] for sentence in nltk.sent_tokenize(train_file)]
test_file = [[word.lower() for word in nltk.word_tokenize(sentence)] for sentence in nltk.sent_tokenize(test_file)]

# Set a cutoff to consider a smaller subset of links (links seen fewer than link_cutoff times in train are dropped)
link_cutoff = 1
valid_links = Counter([word.split(OUTER_SEP)[-2] for sentence in train_file for word in sentence if word[0] == OUTER_SEP and word[-1] == OUTER_SEP])
valid_links = set([link for link, frequence in valid_links.items() if frequence >= link_cutoff])

# Collect links for the target vocabulary, in essence for creating output classes
train_file, train_links = collect_links(train_file, valid_links, INNER_SEP, OUTER_SEP)
test_file, test_links = collect_links(test_file, valid_links, INNER_SEP, OUTER_SEP)

# Create mappings for the output classes.
# collect_links returns one entry per link occurrence, so de-duplicate (keeping first-seen order):
# each link gets exactly one class index and idx_to_link is the exact inverse of link_to_idx.
output = list(dict.fromkeys(train_links + test_links)) + ["_TEXT"]
_NO_LINK = len(output) - 1  # class index of "_TEXT", i.e. "this token carries no link"
link_to_idx = {l: i for i, l in enumerate(output)}
idx_to_link = {i: l for i, l in enumerate(output)}

# Create vectorized classes and input files are cleared from link's markup, hence they are now simple text
train_labels, train_file = encode_labels(train_file, link_to_idx, INNER_SEP, OUTER_SEP, _NO_LINK)
test_labels, test_file = encode_labels(test_file, link_to_idx, INNER_SEP, OUTER_SEP, _NO_LINK)

# Create the input vocabulary (train split only), most frequent words first
MASK_TOKEN = "_MASK"
UNKNOWN_TOKEN = "_UNK"
vocabulary = Counter([word for sentence in train_file for word in sentence])
vocabulary = [MASK_TOKEN, UNKNOWN_TOKEN] + sorted(vocabulary, key=vocabulary.get, reverse=True)

# Create the mapping necessary for the vocabulary, in essence this is needed for vectorizing the input
word_to_idx = {w: i for i, w in enumerate(vocabulary)}
idx_to_word = {i: w for i, w in enumerate(vocabulary)} # probably vocabulary array is enough

In [16]:
# Encode words as integers. The vocabulary is built from the train split only, so
# words that appear only in the test split fall back to the UNKNOWN_TOKEN index.
unknown_idx = word_to_idx[UNKNOWN_TOKEN]

train_sentences = [[word_to_idx.get(word, unknown_idx) for word in sentence] for sentence in train_file]
test_sentences = [[word_to_idx.get(word, unknown_idx) for word in sentence] for sentence in test_file]

#train_sentences.sort(key=len, reverse=True)
#test_sentences.sort(key=len, reverse=True)

batch_size = 32



#train_lengths = [len(sentence) for sentence in train_sentences]
#test_lengths = [len(sentence) for sentence in test_sentences]