In [1]:
import bz2
import nltk
import numpy as np
import json
from collections import Counter
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/leo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [38]:
train_file = bz2.BZ2File('../input_data/train.json.bz2')
test_file = bz2.BZ2File('../input_data/test.json.bz2')
train_file = json.load(train_file)
test_file = json.load(test_file)
print(len(train_file))
print(len(test_file))

615243
126152


In [34]:
def process_input_file(input_file, valid_links, link_delim):
    links = []
    for i, sentence in enumerate(input_file):
        sentence, link = next(iter(sentence.items()))
        input_file[i] = sentence.lower().strip()
        if link in valid_links:
            input_file[i] = link_delim + input_file[i] + link_delim
            links.append(link)
    return input_file, links

def encode_labels(sentences_with_link_markup, links, link2idx, link_delim):
    encoded_labels = []
    link_index = 0;
    for sentence in sentences_with_link_markup:
        encoded_label = []
        for word in sentence:
            if (word[0] == link_delim and word[-1] == link_delim):
                encoded_label.append(link2idx[links[link_index]])
                link_index += 1
            else:
                encoded_label.append(0)
        encoded_labels.append(encoded_label)
    return encoded_labels

In [39]:
# Preprocess data ...

LINK_DELIM = '_'
link_treshold = 2
valid_links = Counter([next(iter(sentence.values())) for sentence in train_file if next(iter(sentence.values())) is not None])
valid_links = set([link for link,frequence in valid_links.items() if frequence >= link_treshold])

train_file, train_links = process_input_file(train_file, valid_links, LINK_DELIM)
test_file, test_links = process_input_file(test_file, valid_links, LINK_DELIM)

train_file = [nltk.word_tokenize(sentence) for sentence in nltk.sent_tokenize(" ".join(train_file))]
test_file = [nltk.word_tokenize(sentence) for sentence in nltk.sent_tokenize(" ".join(test_file))]
train_sentences = [[word.replace(LINK_DELIM,'') for word in sentence] for sentence in train_file]
test_sentences = [[word.replace(LINK_DELIM,'') for word in sentence] for sentence in test_file]

vocabulary = Counter([word for sentence in train_sentences for word in sentence for char in word])
vocabulary = ['_PAD','_UNK'] + sorted(vocabulary, key=vocabulary.get, reverse=True)
output = ["_TEXT"] + train_links

word2idx = {w:i for i,w in enumerate(vocabulary)}
idx2word = {i:w for i,w in enumerate(vocabulary)} # probably vocabulary array is enough
link2idx = {t:i for i,t in enumerate(output)}

train_encoded_labels = encode_labels(train_file, train_links, link2idx, LINK_DELIM)
test_encoded_labels = encode_labels(test_file, test_links, link2idx, LINK_DELIM)


In [41]:
y=10
print(train_encoded_labels[y])
print(len(train_encoded_labels[y]))
print(train_file[y])
print(train_sentences[y])
print(len(train_sentences[y]))

        
train_sentences_len = [len(sentence) for sentence in train_sentences]
max_train_sentences_len = max(train_sentences_len)

test_sentences_len = [len(sentence) for sentence in test_sentences]
max_test_sentences_len = max(test_sentences_len)


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 202144, 0, 0, 0, 0, 0, 0]
17
['a', 'polyhedron', 'is', 'a', '3-dimensional', 'example', 'of', 'the', 'more', 'general', '_polytope_', 'in', 'any', 'number', 'of', 'dimensions', '.']
['a', 'polyhedron', 'is', 'a', '3-dimensional', 'example', 'of', 'the', 'more', 'general', 'polytope', 'in', 'any', 'number', 'of', 'dimensions', '.']
17


In [None]:
# Encode words as integers ...

for i, sentence in enumerate(train_sentences):
    train_sentences[i] = [word2idx[word] if word in word2idx else 1 for word in sentence]
    
for i, sentence in enumerate(test_sentences):
    test_sentences[i] = [word2idx[word] if word in word2idx else 0 for word in sentence]

In [None]:
print([idx2word[x] for x in train_sentences[0]])
print([x for x in train_file[0]])