In [1]:
import bz2
import json
import re
import string
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
# Download the NLTK "punkt" package before nltk.sent_tokenize / nltk.word_tokenize
# are called in the preprocessing cell — presumably they rely on it; confirm
# against the NLTK documentation.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/leo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
def collect_links(input_file, valid_links, INNER_SEP, OUTER_SEP):
    """Gather the link target of every well-formed, valid link token.

    A token counts as a link token when it both starts and ends with
    ``OUTER_SEP``. Splitting it on ``OUTER_SEP`` and dropping empty pieces
    must give exactly two parts, ``[text, link]``; the link is collected
    only if it is a member of ``valid_links``.

    Args:
        input_file: iterable of sentences, each an iterable of string tokens.
        valid_links: container of link targets that may be collected.
        INNER_SEP: unused here; kept so the signature matches encode_labels.
        OUTER_SEP: single-character delimiter that wraps link tokens.

    Returns:
        A tuple ``(input_file, links)``. ``input_file`` is returned unchanged
        and ``links`` has one entry per matching occurrence, so duplicates
        are possible.
    """
    links = []
    for sentence in input_file:
        for word in sentence:
            # One-character slices instead of word[0] / word[-1]: an empty
            # token compares unequal rather than raising IndexError.
            if word[:1] != OUTER_SEP or word[-1:] != OUTER_SEP:
                continue
            parts = [part for part in word.split(OUTER_SEP) if part]
            if len(parts) == 2 and parts[1] in valid_links:
                links.append(parts[1])
    return input_file, links

def encode_labels(text_corpus, link2idx, INNER_SEP, OUTER_SEP, default_no_link):
    """Strip link markup from a tokenized corpus and produce per-token labels.

    For every token:
      * plain token: kept as is and labelled ``default_no_link``;
      * token wrapped in ``OUTER_SEP`` that splits into exactly
        ``[text, link]``: ``text`` is split on ``INNER_SEP`` and every
        non-empty piece becomes its own token, each labelled
        ``link2idx.get(link, default_no_link)``;
      * token wrapped in ``OUTER_SEP`` with any other number of parts: the
        separators are removed and the result is labelled
        ``default_no_link``.

    Args:
        text_corpus: iterable of sentences, each an iterable of string tokens.
        link2idx: mapping from link target to label id.
        INNER_SEP: separator between the words inside a link's text.
        OUTER_SEP: single-character delimiter that wraps link tokens.
        default_no_link: label for tokens that carry no known link.

    Returns:
        A tuple ``(labels, sentences)`` of parallel nested lists;
        ``labels[i][j]`` is the label of ``sentences[i][j]``.
    """
    labels = []
    sentences = []
    for sentence in text_corpus:
        sentence_labels = []
        sentence_words = []
        for word in sentence:
            # One-character slices instead of word[0] / word[-1]: an empty
            # token is treated as plain text rather than raising IndexError.
            is_marked = word[:1] == OUTER_SEP and word[-1:] == OUTER_SEP
            if not is_marked:
                sentence_labels.append(default_no_link)
                sentence_words.append(word)
                continue

            parts = [part for part in word.split(OUTER_SEP) if part]
            if len(parts) != 2:
                # Malformed markup: keep the text, drop the separators.
                sentence_labels.append(default_no_link)
                sentence_words.append(word.replace(OUTER_SEP, ''))
                continue

            text, link = parts
            label = link2idx.get(link, default_no_link)
            for piece in text.split(INNER_SEP):
                if piece:
                    sentence_labels.append(label)
                    sentence_words.append(piece)
        labels.append(sentence_labels)
        sentences.append(sentence_words)
    return labels, sentences

def pad_input(sentences, seq_len, pad_token):
    """Pad or trim every sentence in a batch to exactly ``seq_len`` entries.

    Args:
        sentences: sequence of sentences, each a sequence of numeric ids.
        seq_len: number of columns in the result.
        pad_token: value placed in positions a sentence does not fill.

    Returns:
        A NumPy array of shape ``(len(sentences), seq_len)``. Row ``i`` holds
        the first ``seq_len`` entries of ``sentences[i]``, followed by
        ``pad_token`` where the sentence is shorter.
    """
    shape = (len(sentences), seq_len)
    padded = np.ones(shape) * pad_token
    for row, sentence in enumerate(sentences):
        clipped = np.array(sentence)[:seq_len]
        padded[row, :len(clipped)] = clipped
    return padded

In [7]:
class Vocabulary(object):
    """Two-way mapping between tokens and integer indices.

    New tokens receive the index ``len(vocabulary)`` at insertion time. When
    ``add_unk`` is true, ``unk_token`` is registered on construction and its
    index is returned by ``lookup_token`` for any unknown token.
    """

    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        """
        Args:
            token_to_idx: optional existing token -> index mapping. It is
                copied, so later additions never modify the caller's dict.
            add_unk: whether to register ``unk_token`` on construction.
            unk_token: the token used for unknown words.
        """
        # Copy the mapping: add_token mutates it, and the caller (for example
        # the dict handed to from_serializable) must not see those changes.
        self._token_to_idx = dict(token_to_idx) if token_to_idx is not None else {}
        self._idx_to_token = {idx: token for token, idx in self._token_to_idx.items()}
        self._add_unk = add_unk
        self._unk_token = unk_token
        # -1 means "no unknown token"; lookup_token then raises on a miss.
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def to_serializable(self):
        """Return a dict of constructor arguments that recreates this vocabulary."""
        # Hand out a copy so edits to the result cannot corrupt this instance.
        return {'token_to_idx': dict(self._token_to_idx),
                'add_unk': self._add_unk,
                'unk_token': self._unk_token}

    @classmethod
    def from_serializable(cls, contents):
        """Build a Vocabulary from a dict produced by ``to_serializable``."""
        return cls(**contents)

    def add_token(self, token):
        """Register ``token`` if it is new and return its index."""
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def add_many(self, tokens):
        """Register every token in ``tokens``; return their indices in order."""
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        """Return the index of ``token``.

        Unknown tokens map to ``unk_index`` when an unknown token was
        registered.

        Raises:
            KeyError: if the token is unknown and there is no unknown token.
        """
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token stored at ``index``.

        Raises:
            KeyError: if ``index`` is not in the vocabulary.
        """
        if index not in self._idx_to_token:
            # %s rather than %d: a non-integer index must still produce the
            # KeyError, not a TypeError from string formatting.
            raise KeyError("the index (%s) is not in the Vocabulary" % (index,))
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

In [None]:
class ReviewVectorizer(object):
    """Pairs a review vocabulary with a rating vocabulary and encodes reviews."""

    def __init__(self, review_vocab, rating_vocab):
        """
        Args:
            review_vocab: vocabulary mapping review words to indices.
            rating_vocab: vocabulary mapping rating labels to indices.
        """
        self.review_vocab = review_vocab
        self.rating_vocab = rating_vocab

    def vectorize(self, review):
        """Encode ``review`` as a float32 vector with a 1 per token present.

        The review is split on single spaces. A token is skipped when it is a
        substring of ``string.punctuation``; that also skips the empty strings
        produced by consecutive spaces.

        Returns:
            A NumPy float32 array of length ``len(self.review_vocab)``.
        """
        encoding = np.zeros(len(self.review_vocab), dtype=np.float32)
        kept_tokens = (tok for tok in review.split(" ") if tok not in string.punctuation)
        for tok in kept_tokens:
            encoding[self.review_vocab.lookup_token(tok)] = 1
        return encoding

    @classmethod
    def from_dataframe(cls, review_df, cutoff=25):
        """Build a vectorizer from a frame with ``rating`` and ``review`` columns.

        Ratings are added in sorted order. A word enters the review vocabulary
        only when it occurs strictly more than ``cutoff`` times.
        """
        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk=False)

        for label in sorted(set(review_df.rating)):
            rating_vocab.add_token(label)

        # Count every non-punctuation token over all reviews.
        frequencies = Counter(
            token
            for text in review_df.review
            for token in text.split(" ")
            if token not in string.punctuation
        )
        for token, occurrences in frequencies.items():
            if occurrences > cutoff:
                review_vocab.add_token(token)

        return cls(review_vocab, rating_vocab)

    @classmethod
    def from_serializable(cls, contents):
        """Rebuild a vectorizer from the dict produced by ``to_serializable``."""
        return cls(
            review_vocab=Vocabulary.from_serializable(contents['review_vocab']),
            rating_vocab=Vocabulary.from_serializable(contents['rating_vocab']),
        )

    def to_serializable(self):
        """Return a dict holding the serialized form of both vocabularies."""
        serialized = {}
        serialized['review_vocab'] = self.review_vocab.to_serializable()
        serialized['rating_vocab'] = self.rating_vocab.to_serializable()
        return serialized

In [None]:
class ReviewDataset(Dataset):
    """Dataset over a review frame that is partitioned by its ``split`` column.

    One split ('train', 'val' or 'test') is active at a time; ``len()`` and
    item access refer to the active split.
    """

    def __init__(self, review_df, vectorizer):
        """
        Args:
            review_df: frame with ``review``, ``rating`` and ``split`` columns.
            vectorizer: object exposing ``vectorize(review)`` and a
                ``rating_vocab`` with ``lookup_token``.
        """
        self.review_df = review_df
        self._vectorizer = vectorizer

        split_labels = self.review_df.split
        self.train_df = self.review_df[split_labels == 'train']
        self.val_df = self.review_df[split_labels == 'val']
        self.test_df = self.review_df[split_labels == 'test']

        self.train_size = len(self.train_df)
        self.validation_size = len(self.val_df)
        self.test_size = len(self.test_df)

        self._lookup_dict = {
            'train': (self.train_df, self.train_size),
            'val': (self.val_df, self.validation_size),
            'test': (self.test_df, self.test_size),
        }

        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, review_csv):
        """Read ``review_csv`` and fit a new vectorizer on its 'train' rows."""
        review_df = pd.read_csv(review_csv)
        vectorizer = ReviewVectorizer.from_dataframe(review_df[review_df.split == 'train'])
        return cls(review_df, vectorizer)

    @classmethod
    def load_dataset_and_load_vectorizer(cls, review_csv, vectorizer_filepath):
        """Read ``review_csv`` and restore a vectorizer saved as JSON."""
        review_df = pd.read_csv(review_csv)
        return cls(review_df, cls.load_vectorizer_only(vectorizer_filepath))

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        """Restore a ReviewVectorizer from the JSON file at ``vectorizer_filepath``."""
        with open(vectorizer_filepath) as fp:
            contents = json.load(fp)
        return ReviewVectorizer.from_serializable(contents)

    def save_vectorizer(self, vectorizer_filepath):
        """Write the vectorizer's serializable form to ``vectorizer_filepath`` as JSON."""
        payload = self._vectorizer.to_serializable()
        with open(vectorizer_filepath, "w") as fp:
            json.dump(payload, fp)

    def get_vectorizer(self):
        """Return the vectorizer used by this dataset."""
        return self._vectorizer

    def set_split(self, split="train"):
        """Make ``split`` ('train', 'val' or 'test') the active split."""
        frame, size = self._lookup_dict[split]
        self._target_split = split
        self._target_df = frame
        self._target_size = size

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        """Return ``{'x_data': vectorized review, 'y_target': rating index}``
        for the row at position ``index`` of the active split."""
        row = self._target_df.iloc[index]
        features = self._vectorizer.vectorize(row.review)
        target = self._vectorizer.rating_vocab.lookup_token(row.rating)
        return {'x_data': features, 'y_target': target}

    def get_num_batches(self, batch_size):
        """Return how many full batches of ``batch_size`` fit in the active split."""
        full_batches = len(self) // batch_size
        return full_batches
    
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """Yield batches from ``dataset`` with every value moved to ``device``.

    Wraps a DataLoader built from ``dataset``, ``batch_size``, ``shuffle``
    and ``drop_last``. Each batch it produces is expected to be a dict; a
    new dict with the same keys is yielded, each value being the result of
    ``.to(device)`` on the original.
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    for batch in loader:
        yield {name: tensor.to(device) for name, tensor in batch.items()}

In [3]:
# Preprocess data: read both corpora, tokenize them, collect the link targets,
# turn the link markup into per-token labels and build the word vocabulary.

INNER_SEP = '_'
OUTER_SEP = '|'

# Context managers close the compressed files as soon as their text is read.
with bz2.BZ2File('../input_data/train.txt.bz2') as train_handle:
    train_file = train_handle.read().decode('utf-8')
with bz2.BZ2File('../input_data/test.txt.bz2') as test_handle:
    test_file = test_handle.read().decode('utf-8')

# Sentence-split, word-tokenize and lowercase: one list of tokens per sentence.
train_file = [[word.lower() for word in nltk.word_tokenize(sentence)] for sentence in nltk.sent_tokenize(train_file)]
test_file = [[word.lower() for word in nltk.word_tokenize(sentence)] for sentence in nltk.sent_tokenize(test_file)]

# A link target is valid when it occurs at least link_treshold times in the
# training corpus; the target is the second-to-last OUTER_SEP-separated field.
link_treshold = 1
valid_links = Counter([word.split(OUTER_SEP)[-2] for sentence in train_file for word in sentence if word[0] == OUTER_SEP and word[-1] == OUTER_SEP])
valid_links = set([link for link, frequence in valid_links.items() if frequence >= link_treshold])

train_file, train_links = collect_links(train_file, valid_links, INNER_SEP, OUTER_SEP)
test_file, test_links = collect_links(test_file, valid_links, INNER_SEP, OUTER_SEP)

# collect_links returns one entry per occurrence. Deduplicate (keeping
# first-seen order) so every link gets exactly one id and the ids are
# contiguous; otherwise the label space grows with the number of occurrences.
output = list(dict.fromkeys(train_links + test_links)) + ["_TEXT"]
# The trailing "_TEXT" entry is the label for tokens that carry no link.
_NO_LINK = len(output) - 1
link2idx = {l: i for i, l in enumerate(output)}
idx2link = {i: l for i, l in enumerate(output)}

train_labels, train_file = encode_labels(train_file, link2idx, INNER_SEP, OUTER_SEP, _NO_LINK)
test_labels, test_file = encode_labels(test_file, link2idx, INNER_SEP, OUTER_SEP, _NO_LINK)

# Word vocabulary: the two special tokens first, then the training words
# ordered from most to least frequent.
PAD_TOKEN = "_PAD"
UNKOWN_TOKEN = "_UNK"
vocabulary = Counter([word for sentence in train_file for word in sentence])
vocabulary = [PAD_TOKEN, UNKOWN_TOKEN] + sorted(vocabulary, key=vocabulary.get, reverse=True)

word2idx = {w: i for i, w in enumerate(vocabulary)}
idx2word = {i: w for i, w in enumerate(vocabulary)}  # probably vocabulary array is enough

In [4]:
# Encode words as integers: every token is replaced by its word2idx index,
# and tokens missing from word2idx fall back to the UNKOWN_TOKEN index.

train_sentences = [[word2idx.get(word, word2idx[UNKOWN_TOKEN]) for word in sentence] for sentence in train_file]
test_sentences = [[word2idx.get(word, word2idx[UNKOWN_TOKEN]) for word in sentence] for sentence in test_file]

# Sentences stay in corpus order; the length-based sorting and the
# per-sentence length lists sketched here earlier remain disabled.

batch_size = 32

In [7]:
# Show the integer encoding of the first training sentence.
print(list(train_sentences[0]))

[3, 842, 5, 4545, 6, 1970, 2830, 281, 8, 3, 5237, 4]


In [30]:
# Decode the first test sentence back to tokens to inspect the encoding.
print([idx2word[token_id] for token_id in test_sentences[0]])

['kerameikos', 'also', 'known', 'by', 'its', 'latinized', 'form', '_UNK', ',', 'is', 'an', 'area', 'of', 'athens', ',', 'greece', ',', 'located', 'to', 'the', 'northwest', 'of', 'the', '|acropolis|acropolis', ',', '_athens|', ',', 'which', 'includes', 'an', 'extensive', 'area', 'both', 'within', 'and', 'outside', 'the', 'ancient', 'city', 'walls', ',', 'on', 'both', 'sides', 'of', 'the', 'dipylon', 'gate', 'and', 'by', 'the', 'banks', 'of', 'the', '_UNK', 'river', '.']
