In [1]:
import os
import urllib
import urllib.request
import zipfile

import nltk
import numpy as np
import tensorflow as tf

# Method 1: build vocabulary from embedding table
 * build the vocabulary from the pre-trained embedding instead of the corpus
 * don't distinguish between unknown words; all of them map to a single UNK
     * embedding_table[0] = PAD;  embedding_table[-1] = UNK;
 * Uncased embedding
 * limit the vocabulary size
 * tf.embedding_lookup
 * Data structure
     * word2idx :  dictionary for mapping words to their index token - used for converting a sequence of words to sequence of integers for embedding lookup
     * idx2word : a list of words in order - used for decoding an integer sequence to words
     * weights : a matrix of size VOCAB_LENGTH x EMBEDDING_DIMENSION containing the vectors for each word
https://www.damienpontifex.com/2017/10/27/using-pre-trained-glove-embeddings-in-tensorflow/

In [47]:
# Available dimensions for 6B data is 50, 100, 200, 300
EMBEDDING_DIMENSION = 50
data_directory = '../data/glove'

# makedirs also creates a missing parent directory and does nothing if the folder already exists
os.makedirs(data_directory, exist_ok=True)

glove_weights_file_path = os.path.join(data_directory, 'glove.6B.{}d.txt'.format(EMBEDDING_DIMENSION))

# If the GloVe text file is missing, download the archive (unless the zip is already cached) and unpack it
if not os.path.isfile(glove_weights_file_path):
    # Glove embedding weights can be downloaded from https://nlp.stanford.edu/projects/glove/
    glove_fallback_url = 'http://nlp.stanford.edu/data/glove.6B.zip'
    local_zip_file_path = os.path.join(data_directory, os.path.basename(glove_fallback_url))
    if not os.path.isfile(local_zip_file_path):
        print('Retreiving glove weights from {}'.format(glove_fallback_url))
        urllib.request.urlretrieve(glove_fallback_url, local_zip_file_path)
    with zipfile.ZipFile(local_zip_file_path, 'r') as z:
        print('Extracting glove weights from {}'.format(local_zip_file_path))
        z.extractall(path=data_directory)

In [79]:
# Index 0 is reserved for padding; 'PAD' is the only uppercase entry added to the vocabulary
PAD_TOKEN = 0
# Only the first MAX_VOCAB_WORDS lines of the GloVe file are kept (the "top 40k terms")
MAX_VOCAB_WORDS = 40000

# word2idx: word -> row index, used to turn a token sequence into integers for the embedding lookup
word2idx = { 'PAD': PAD_TOKEN }
# weights: one vector per word in index order (the PAD row is inserted after reading the file)
weights = []
# idx2word: row index -> word, used to decode an integer sequence back into words
idx2word = []


# The encoding is given explicitly so reading does not depend on the platform default
# (the GloVe text files are distributed as UTF-8).
with open(glove_weights_file_path, 'r', encoding='utf-8') as file:
    for index, line in enumerate(file):
        # Word and weights are separated by whitespace
        values = line.split()
        # Word is the first symbol on each line
        word = values[0]
        # Remainder of the line is the weight vector for the word
        word_weights = np.asarray(values[1:], dtype=np.float32)
        # Shift by one because index 0 is reserved for PAD
        word2idx[word] = index + 1
        weights.append(word_weights)
        # Keep idx2word aligned with weights
        idx2word.append(word)

        if index + 1 == MAX_VOCAB_WORDS:
            # Limit vocabulary to the first MAX_VOCAB_WORDS terms
            break

# Insert the PAD weights at index 0 now that all word vectors are read
# NOTE(review): the PAD and UNK rows are drawn from an unseeded RNG, so they differ between runs
weights.insert(0, np.random.randn(EMBEDDING_DIMENSION))
idx2word.insert(0,'PAD')

# Append the unknown-word entry at the end of the vocab and initialize it randomly
UNKNOWN_TOKEN=len(weights)
word2idx['UNK'] = UNKNOWN_TOKEN
weights.append(np.random.randn(EMBEDDING_DIMENSION))
idx2word.append('UNK')
# Construct our final embedding matrix: one row per vocabulary entry (PAD, words, UNK)
weights = np.asarray(weights, dtype=np.float32)

VOCAB_SIZE=weights.shape[0]

In [80]:
# Embeddings in TensorFlow: tokenise a sample sentence, then map every token to its
# vocabulary index (tokens missing from word2idx fall back to UNKNOWN_TOKEN)
sample_tokens = nltk.word_tokenize('hello world')  # ['hello', 'world']
features = {
    'word_indices': [word2idx.get(tok, UNKNOWN_TOKEN) for tok in sample_tokens],
}
features

{'word_indices': [13076, 86]}

In [82]:
# Start from an empty default graph so re-running the cell does not hit a "variable already exists" error
tf.reset_default_graph()
# Fill the embedding variable with the GloVe matrix built above
glove_weights_initializer = tf.constant_initializer(weights)
embedding_weights = tf.get_variable(
    name='embedding_weights',
    shape=(VOCAB_SIZE, EMBEDDING_DIMENSION),
    initializer=glove_weights_initializer,
    trainable=False)
# Gather the embedding rows for the token indices stored in features['word_indices']
embedding = tf.nn.embedding_lookup(embedding_weights, features['word_indices'])
# tf.initialize_all_variables() is deprecated; tf.global_variables_initializer() is its replacement
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init_op)
    print(sess.run(embedding))

[[-0.38497   0.80092   0.064106 -0.28355  -0.026759 -0.34532  -0.64253
  -0.11729  -0.33257   0.55243  -0.087813  0.9035    0.47102   0.56657
   0.6985   -0.35229  -0.86542   0.90573   0.03576  -0.071705 -0.12327
   0.54923   0.47005   0.35572   1.2611   -0.67581  -0.94983   0.68666
   0.3871   -1.3492    0.63512   0.46416  -0.48814   0.83827  -0.9246
  -0.33722   0.53741  -1.0616   -0.081403 -0.67111   0.30923  -0.3923
  -0.55002  -0.68827   0.58049  -0.11626   0.013139 -0.57654   0.048833
   0.67204 ]
 [-0.41486   0.71848  -0.3045    0.87445   0.22441  -0.56488  -0.37566
  -0.44801   0.61347  -0.11359   0.74556  -0.10598  -1.1882    0.50974
   1.3511    0.069851  0.73314   0.26773  -1.1787   -0.148     0.039853
   0.033107 -0.27406   0.25125   0.41507  -1.6188   -0.81778  -0.73892
  -0.28997   0.57277   3.4719    0.73817  -0.044495 -0.15119  -0.93503
  -0.13152  -0.28562   0.76327  -0.83332  -0.6793   -0.39099  -0.64466
   1.0044   -0.2051    0.46799   0.99314  -0.16221  -0.46022  -0

In [87]:
# Sanity check: decode the two indices produced for 'hello world' back into words
print(*(idx2word[position] for position in (13076, 86)))

hello world


# 2nd Method

In [None]:
https://dashayushman.github.io/tutorials/2017/08/19/neural-language-model.html
https://medium.com/@TalPerry/getting-text-into-tensorflow-with-the-dataset-api-ffb832c8bec6
    https://github.com/guillaume-chevalier/GloVe-as-a-TensorFlow-Embedding-Layer
https://machinelearnings.co/tensorflow-text-classification-615198df9231

* spaCy for most of the pre-processing
    * normalization
        * lowercase, stemming, lemmatization
    * SEQUENCE_BEGIN and SEQUENCE_END
* create vocabulary from corpus
    * dict
        * Word2Idx: This dictionary has all the unique words(terms) as keys with a corresponding unique ID as values
        * Idx2Word: This is the reverse of Word2Idx. It has the unique IDs as keys and their corresponding words(terms) as values
* **collections library**
* count term **frequencies**, and to select the most commonly occurring terms in the vocabulary (as it covers most of the Natural Language).
* Uniform UNK
    * randomly initialize words that are in the training corpus but not in the pre-trained table
    * do not accept every word from the pre-trained table
    * words that are in the test corpus but not in the training corpus are treated as UNK
    
* "0" pad
* **!! How to match the frequency-ordered indices with the indices of the pre-trained table?** Re-extract the table in vocabulary order.

* steps: frequency --> vocabulary --> Word2Idx + Idx2Word --> re-build matched table.
         sentence --> tokens --> index tokens -->

In [None]:
# Hard-coded sample vocabulary: a flat list of token strings, including punctuation and the
# SEQUENCE_BEGIN / SEQUENCE_END markers.
# NOTE(review): `vocab` is reassigned from word_counter.most_common(200) in a later cell —
# confirm whether this literal is still needed.
vocab = ['today', 'generative', 'stimuli', 'loosely', 'natural', 'at', 'level', 'opposed', 'experts', 
         'be', 'broader', 'comparable', 'propositional', 'also', 'language', 'wealth', 'such', 'responses', 
         'methods', 'continued', 'Colorado', 'correspond', 'leading', 'CAP', 'part', 'finance', 'valid', 'competed', 
         'in', 'or', 'composition', 'discoveries', 'recurrent', 'patterns', 'algorithms', 'biological', 'neural', 
         'some', 'networks', 'network', 'cascade', 'many', 'analysis', 'agree', 'Drinker', ':', 'theft', 'family', 
         'filtering', 'hidden', 'Othniel', 'massive', 'Bone', 'are', 'efficient', 'these', 'Cope', 'remain', 'America',
         'may', 'depends', 'levels', ']', 'solved', 'system', 'sought', 'high', '-', 'divides', 'disgrace', 'once', 
         'assignment', 'excavation', 'that', 'Academy', 'dinosaurs', 'low', 'rich', 'descriptions', 'results', 'human',
         'threshold', 'computer', 'previous', 'than', 'has', 'Deep', 'history', 'using', 'light', 'resorting', 
         'include', 'SEQUENCE_BEGIN', 'mainly', 'bones', 'applied', 'Belief', 'processing', 'no', 'dinosaur', 'define',
         'prehistoric', 'destruction', 'been', 'recognition', 'Marsh', 'speech', 'attempts', 'coding', 'each', 'depth', 
         'derived', 'ruined', 'potentially', 'representations', 'Wars', 'superior', 'features.[8', 'abstraction', 
         'information', 'brain', 'researchers', 'used', 'but', 'describe', 'complicated', 'own', 'paleontologists', 
         'species', 'systems', 'hierarchy', 'shed', 'research', 'caps', 'class', 'form', 'translation', 'Museum', 
         'pattern', 'path', 'Edward', 'transformations', 'socially', 'The', 'other', 'create', 'features', 'lower', 
         'fossils', 'concepts', 'neuronal', 'deaths', 'life', 'influence', 'fields', 'layers', 'different', 'audio', 
         'scientific', 'procure', 'layer', 'is', 'algorithm', 'applications', 'Machines', 'plus', 'upon', 'latent', 
         'representation', 'connections', '1892', 'contributions', 'science', 'through', 'definitions', 'hunters', 
         'organized', 'shallow', 'transformation', 'data', 'partially', 'wise', 'number', 'feedforward', 'bone', 
         'variables', 'Philadelphia', 'age', 'they', 'to', 'nodes', 'belief', 'universally', 'including', 'by', 
         'wars', 'of', 'where', 'unsupervised', 'architectures', 'Networks', '/', 'fossil', 'American', ',', 'and',
         'artificial', 'more', 'boxes', 'interest', 'propagate', 'follow', 'Yale', 'have', 'problem', 'efforts', 
         'Sciences', 'new', 'task', 'bribery', 'beds', 'associated', 'for', 'hierarchical', 'were', 'sparked', 
         'scale', 'cases', 'paleontology', 'the', 'Wyoming', 'interpretation', 'classification', 'multiple', 'an', 
         'higher', 'forming', 'agreed', 'surge', 'nervous', 'chain', 'Peabody', 'Charles', 'causal', '32', '.', 'deep',
         'vision', 'services', 'unlimited', 'unlabeled', 'formulas.[9', 'bioinformatics', 'uses', 'successive', 'one',
         'on', 'produced', 'cap', 'specific', 'rivalries', 'sets', 'as', 'financially', 'feature', 'large', 'a', 'most',
         'from', 'during', 'History', 'based', 'underhanded', 'Nebraska', 'expeditions', 'with', 'relationship', 
         'Boltzmann', 'credit', 'communication', '–', 'found', 'North', 'learning', ';', 'led', 'SEQUENCE_END', 
         'between', 'after', 'models', 'machine', 'extraction', 'unopened', "'s", 'learn', 'input', 'decades', 'their', 
         'social', '1877', 'Natural', 'various', 'common', 'gilded', 'mutual', 'publications', 'public', 'can', 
         'supervised', 'field', 'use', 'output', 'nonlinear', 'signal', 'attacks', 'which', 'units']

In [None]:
# Tally how often every term occurs in the tokenised corpus
import collections

word_counter = collections.Counter()
for term in corpus_tokens:
    word_counter[term] += 1

# The vocabulary is the 200 highest-count (term, count) pairs
vocab = word_counter.most_common(200)
print('Vocab Size: {}'.format(len(vocab)))
# Only the 100 highest-count terms are printed, to keep the output short
print(word_counter.most_common(100))

In [None]:
# Reserve an 'UNKNOWN' entry after the counted terms (the PAD entry is added further down)
vocab.append(('UNKNOWN', 1))
# Term indices start at 1 so that 0 stays free for padding
Idx = range(1, len(vocab)+1)
# Keep only the first element of every vocabulary entry (drops the counts)
vocab = [entry[0] for entry in vocab]

# Forward and reverse lookup tables built from the aligned (term, index) pairs
Word2Idx = {term: position for term, position in zip(vocab, Idx)}
Idx2Word = {position: term for position, term in zip(Idx, vocab)}

# Index 0 is the padding symbol
Word2Idx['PAD'] = 0
Idx2Word[0] = 'PAD'


VOCAB_SIZE = len(Word2Idx)
print('Word2Idx Size: {}'.format(len(Word2Idx)))
print('Idx2Word Size: {}'.format(len(Idx2Word)))
print(Word2Idx)

In [None]:
# Re-extract the pre-trained vectors so that row i of w2v belongs to the word whose index is i.
# A word without a pre-trained vector keeps its random initialisation.
# NOTE(review): `nlp` is not defined in the visible cells — presumably a loaded spaCy pipeline; confirm.
# Rows are sized from the largest index (not len) so every index in Word2Idx has a row.
w2v = np.random.rand(max(Word2Idx.values()) + 1, 300) # We use 300 because Spacy provides us with vectors of size 300

for word, row in Word2Idx.items():
    # Look up the whole word (key[0] would only be its first character) and write its vector
    # into the single row given by the word's own index, not into every row from there on.
    parsed = nlp(word)
    if parsed.has_vector:
        w2v[row] = parsed.vector
EMBEDDING_SIZE = w2v.shape[-1]
print('Shape of w2v: {}'.format(w2v.shape))
print('Some Vectors')
print(w2v)

In [1]:
# Convert a sequence of words into a sequence of IDs given a Word2Idx dictionary
def word2idseq(data, word2idx):
    """Return the list of IDs for the words in `data`.

    A word that is not a key of `word2idx` is replaced by the ID stored under
    'UNKNOWN'; that key is only read when such a word is actually encountered.
    """
    return [
        word2idx[word] if word in word2idx else word2idx['UNKNOWN']
        for word in data
    ]

# Thanks to http://locallyoptimal.com/blog/2013/01/20/elegant-n-gram-generation-in-python/
# Generates n-grams by zipping the input with copies of itself shifted by 0..n-1 positions
def find_ngrams(input_list, n):
    """Return an iterator over the n-grams (as tuples) of `input_list`."""
    shifted_copies = (input_list[offset:] for offset in range(n))
    return zip(*shifted_copies)

# Encode both splits with the shared vocabulary
# NOTE(review): `train` and `validation` are not defined in the visible cells (the recorded
# output is a NameError) — they must be created before this cell runs.
train_id_seqs = word2idseq(train, Word2Idx)
validation_id_seqs = word2idseq(validation, Word2Idx)

# Show a slice from the end of each encoded split
for heading, id_seqs in (
    ('Sample Train IDs', train_id_seqs),
    ('Sample Validation IDs', validation_id_seqs),
):
    print(heading)
    print(id_seqs[-10:-1])

NameError: name 'train' is not defined

# Method 3:

In [None]:
import sklearn.preprocessing
import utils
import collections
import codecs
import utils_nlp
import re
import time
import token
import os
import pickle
import random

In [None]:
def _parse_dataset(self, dataset_filepath):
    token_count = collections.defaultdict(lambda: 0)
    label_count = collections.defaultdict(lambda: 0)
    character_count = collections.defaultdict(lambda: 0)

    line_count = -1
    tokens = []
    labels = []
    new_token_sequence = []
    new_label_sequence = []
    if dataset_filepath:
        f = codecs.open(dataset_filepath, 'r', 'UTF-8')
        for line in f:
            line_count += 1
            line = line.strip().split(' ')
            if len(line) == 0 or len(line[0]) == 0 or '-DOCSTART-' in line[0]:
                if len(new_token_sequence) > 0:
                    labels.append(new_label_sequence)
                    tokens.append(new_token_sequence)
                    new_token_sequence = []
                    new_label_sequence = []
                continue
            token = str(line[0])
            label = str(line[-1])
            token_count[token] += 1
            label_count[label] += 1

            new_token_sequence.append(token)
            new_label_sequence.append(label)

            for character in token:
                character_count[character] += 1

            if self.debug and line_count > 200: break# for debugging purposes

        if len(new_token_sequence) > 0:
            labels.append(new_label_sequence)
            tokens.append(new_token_sequence)
        f.close()
    return labels, tokens, token_count, label_count, character_count

In [None]:
def load_dataset(self, dataset_filepaths):
    """Reduced excerpt of the full loader: parse every split and merge the token counts.

    The original excerpt read `self` and `dataset_filepaths` without receiving
    them, so they are now explicit parameters. A later cell defines the complete
    `load_dataset`, which replaces this one once it runs.

    Args:
        self: object providing `_parse_dataset(filepath)`; the UNK/padding
            attributes below are set on it.
        dataset_filepaths: dict mapping 'train', 'valid', 'test', 'deploy' to a
            file path; missing keys are passed to `_parse_dataset` as None.

    Returns:
        token_count: dict with one entry per split plus 'all', where
        token_count['all'][token] is the sum of the token's counts over the
        four splits.
    """
    self.UNK_TOKEN_INDEX = 0
    self.PADDING_CHARACTER_INDEX = 0
    self.tokens_mapped_to_unk = []
    self.UNK = 'UNK'
    self.unique_labels = []
    labels = {}
    tokens = {}
    label_count = {}
    token_count = {}
    character_count = {}

    ####!!!!!!######
    for dataset_type in ['train', 'valid', 'test', 'deploy']:
        labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], character_count[dataset_type] \
                = self._parse_dataset(dataset_filepaths.get(dataset_type, None))

    # Summing indexes each split with every token, so the per-split counts must return 0 for
    # tokens they have not seen (the _parse_dataset in this notebook returns defaultdicts).
    token_count['all'] = {}
    for token in list(token_count['train'].keys()) + list(token_count['valid'].keys()) + list(token_count['test'].keys()) + list(token_count['deploy'].keys()):
        token_count['all'][token] = token_count['train'][token] + token_count['valid'][token] + token_count['test'][token] + token_count['deploy'][token]

    return token_count

In [None]:
def load_dataset(self, dataset_filepaths, parameters, token_to_vector=None):
        '''
        Parse all dataset splits and build the token / label / character index maps on self.

        dataset_filepaths : dictionary with keys 'train', 'valid', 'test', 'deploy'
            (missing keys are passed to self._parse_dataset as None)
        parameters : dictionary; the keys read here are
            'token_pretrained_embedding_filepath', 'use_pretrained_model',
            'pretrained_model_folder', 'load_all_pretrained_token_embeddings',
            'remap_unknown_tokens_to_unk', 'load_only_pretrained_token_embeddings'
            and 'tagging_format'
        token_to_vector : optional pre-loaded token -> vector mapping; when None and an
            embedding file path is configured it is loaded through
            utils_nlp.load_pretrained_token_embeddings

        Returns token_to_vector (an empty dict when no embedding file path is configured).

        Side effects: sets the index maps (token_to_index, index_to_token, character_to_index,
        index_to_character, label_to_index, index_to_label), the raw tokens/labels, the values
        returned by self._convert_to_indices, the size attributes (number_of_classes,
        vocabulary_size, alphabet_size) and the label bookkeeping lists on self.

        Raises AssertionError when a pretrained model is used and the data contains a label
        that the pretraining dataset does not know.
        '''
        start_time = time.time()
        print('Load dataset... ', end='', flush=True)
        if parameters['token_pretrained_embedding_filepath'] != '':
            if token_to_vector is None:
                token_to_vector = utils_nlp.load_pretrained_token_embeddings(parameters)
        else:
            token_to_vector = {}
        if self.verbose: print("len(token_to_vector): {0}".format(len(token_to_vector)))

        # Load pretraining dataset to ensure that index to label is compatible to the pretrained model,
        #   and that token embeddings that are learned in the pretrained model are loaded properly.
        all_tokens_in_pretraining_dataset = []
        all_characters_in_pretraining_dataset = []
        if parameters['use_pretrained_model']:
            # NOTE(security): pickle.load can execute arbitrary code; only point
            # 'pretrained_model_folder' at a trusted location.
            # The context manager closes the file handle, which the bare open() call leaked.
            with open(os.path.join(parameters['pretrained_model_folder'], 'dataset.pickle'), 'rb') as pickle_file:
                pretraining_dataset = pickle.load(pickle_file)
            all_tokens_in_pretraining_dataset = pretraining_dataset.index_to_token.values()
            all_characters_in_pretraining_dataset = pretraining_dataset.index_to_character.values()

        remap_to_unk_count_threshold = 1
        self.UNK_TOKEN_INDEX = 0
        self.PADDING_CHARACTER_INDEX = 0
        self.tokens_mapped_to_unk = []
        self.UNK = 'UNK'
        self.unique_labels = []
        labels = {}
        tokens = {}
        label_count = {}
        token_count = {}
        character_count = {}
        for dataset_type in ['train', 'valid', 'test', 'deploy']:
            labels[dataset_type], tokens[dataset_type], token_count[dataset_type], label_count[dataset_type], character_count[dataset_type] \
                = self._parse_dataset(dataset_filepaths.get(dataset_type, None))

            if self.verbose: print("dataset_type: {0}".format(dataset_type))
            if self.verbose: print("len(token_count[dataset_type]): {0}".format(len(token_count[dataset_type])))

        # Merge the per-split counts. Indexing a split that has not seen the token relies on the
        # per-split counts being defaultdicts (as returned by the _parse_dataset in this notebook);
        # the lookup also inserts the token with count 0 into that split, which the
        # `token_count['train'][token] == 0` test further down depends on.
        token_count['all'] = {}
        for token in list(token_count['train'].keys()) + list(token_count['valid'].keys()) + list(token_count['test'].keys()) + list(token_count['deploy'].keys()):
            token_count['all'][token] = token_count['train'][token] + token_count['valid'][token] + token_count['test'][token] + token_count['deploy'][token]

        # Tokens known only from the pretrained embeddings / pretraining dataset get the marker count -1
        if parameters['load_all_pretrained_token_embeddings']:
            for token in token_to_vector:
                if token not in token_count['all']:
                    token_count['all'][token] = -1
                    token_count['train'][token] = -1
            for token in all_tokens_in_pretraining_dataset:
                if token not in token_count['all']:
                    token_count['all'][token] = -1
                    token_count['train'][token] = -1

        character_count['all'] = {}
        for character in list(character_count['train'].keys()) + list(character_count['valid'].keys()) + list(character_count['test'].keys()) + list(character_count['deploy'].keys()):
            character_count['all'][character] = character_count['train'][character] + character_count['valid'][character] + character_count['test'][character] + character_count['deploy'][character]

        for character in all_characters_in_pretraining_dataset:
            if character not in character_count['all']:
                character_count['all'][character] = -1
                character_count['train'][character] = -1

        for dataset_type in dataset_filepaths.keys():
            if self.verbose: print("dataset_type: {0}".format(dataset_type))
            if self.verbose: print("len(token_count[dataset_type]): {0}".format(len(token_count[dataset_type])))

        label_count['all'] = {}
        for label in list(label_count['train'].keys()) + list(label_count['valid'].keys()) + list(label_count['test'].keys()) + list(label_count['deploy'].keys()):
            label_count['all'][label] = label_count['train'][label] + label_count['valid'][label] + label_count['test'][label] + label_count['deploy'][label]

        token_count['all'] = utils.order_dictionary(token_count['all'], 'value_key', reverse = True)
        label_count['all'] = utils.order_dictionary(label_count['all'], 'key', reverse = False)
        character_count['all'] = utils.order_dictionary(character_count['all'], 'value', reverse = True)
        if self.verbose: print('character_count[\'all\']: {0}'.format(character_count['all']))

        # Assign token indices in the order of token_count['all'], skipping the index reserved for UNK
        token_to_index = {}
        token_to_index[self.UNK] = self.UNK_TOKEN_INDEX
        iteration_number = 0
        number_of_unknown_tokens = 0
        if self.verbose: print("parameters['remap_unknown_tokens_to_unk']: {0}".format(parameters['remap_unknown_tokens_to_unk']))
        if self.verbose: print("len(token_count['train'].keys()): {0}".format(len(token_count['train'].keys())))
        for token, count in token_count['all'].items():
            if iteration_number == self.UNK_TOKEN_INDEX: iteration_number += 1

            # A token is remapped to UNK when remapping is enabled, it is absent from the train split
            # (or only pretrained embeddings are to be loaded), and neither the pretrained
            # embeddings nor the pretraining dataset contain it.
            if parameters['remap_unknown_tokens_to_unk'] == 1 and \
                (token_count['train'][token] == 0 or \
                parameters['load_only_pretrained_token_embeddings']) and \
                not utils_nlp.is_token_in_pretrained_embeddings(token, token_to_vector, parameters) and \
                token not in all_tokens_in_pretraining_dataset:
                if self.verbose: print("token: {0}".format(token))
                if self.verbose: print("token.lower(): {0}".format(token.lower()))
                # Raw strings keep the printed text identical while avoiding the invalid '\d' escape
                if self.verbose: print(r"re.sub('\d', '0', token.lower()): {0}".format(re.sub(r'\d', '0', token.lower())))
                token_to_index[token] =  self.UNK_TOKEN_INDEX
                number_of_unknown_tokens += 1
                self.tokens_mapped_to_unk.append(token)
            else:
                token_to_index[token] = iteration_number
                iteration_number += 1
        if self.verbose: print("number_of_unknown_tokens: {0}".format(number_of_unknown_tokens))

        # Indices of tokens seen in train at least once but no more than the threshold
        infrequent_token_indices = []
        for token, count in token_count['train'].items():
            if 0 < count <= remap_to_unk_count_threshold:
                infrequent_token_indices.append(token_to_index[token])
        if self.verbose: print("len(token_count['train']): {0}".format(len(token_count['train'])))
        if self.verbose: print("len(infrequent_token_indices): {0}".format(len(infrequent_token_indices)))

        # Ensure that both B- and I- versions exist for each label
        labels_without_bio = set()
        for label in label_count['all'].keys():
            new_label = utils_nlp.remove_bio_from_label_name(label)
            labels_without_bio.add(new_label)
        for label in labels_without_bio:
            if label == 'O':
                continue
            if parameters['tagging_format'] == 'bioes':
                prefixes = ['B-', 'I-', 'E-', 'S-']
            else:
                prefixes = ['B-', 'I-']
            for prefix in prefixes:
                l = prefix + label
                if l not in label_count['all']:
                    label_count['all'][l] = 0
        label_count['all'] = utils.order_dictionary(label_count['all'], 'key', reverse = False)

        if parameters['use_pretrained_model']:
            self.unique_labels = sorted(list(pretraining_dataset.label_to_index.keys()))
            # Make sure labels are compatible with the pretraining dataset.
            for label in label_count['all']:
                if label not in pretraining_dataset.label_to_index:
                    raise AssertionError("The label {0} does not exist in the pretraining dataset. ".format(label) +
                                         "Please ensure that only the following labels exist in the dataset: {0}".format(', '.join(self.unique_labels)))
            label_to_index = pretraining_dataset.label_to_index.copy()
        else:
            label_to_index = {}
            iteration_number = 0
            for label, count in label_count['all'].items():
                label_to_index[label] = iteration_number
                iteration_number += 1
                self.unique_labels.append(label)

        if self.verbose: print('self.unique_labels: {0}'.format(self.unique_labels))

        # Assign character indices in the order of character_count['all'], skipping the padding index
        character_to_index = {}
        iteration_number = 0
        for character, count in character_count['all'].items():
            if iteration_number == self.PADDING_CHARACTER_INDEX: iteration_number += 1
            character_to_index[character] = iteration_number
            iteration_number += 1

        if self.verbose: print('token_count[\'train\'][0:10]: {0}'.format(list(token_count['train'].items())[0:10]))
        token_to_index = utils.order_dictionary(token_to_index, 'value', reverse = False)
        if self.verbose: print('token_to_index: {0}'.format(token_to_index))
        index_to_token = utils.reverse_dictionary(token_to_index)
        # Several tokens can share the UNK index, so the reverse entry is pinned to the UNK symbol
        if parameters['remap_unknown_tokens_to_unk'] == 1: index_to_token[self.UNK_TOKEN_INDEX] = self.UNK
        if self.verbose: print('index_to_token: {0}'.format(index_to_token))

        if self.verbose: print('label_count[\'train\']: {0}'.format(label_count['train']))
        label_to_index = utils.order_dictionary(label_to_index, 'value', reverse = False)
        if self.verbose: print('label_to_index: {0}'.format(label_to_index))
        index_to_label = utils.reverse_dictionary(label_to_index)
        if self.verbose: print('index_to_label: {0}'.format(index_to_label))

        character_to_index = utils.order_dictionary(character_to_index, 'value', reverse = False)
        index_to_character = utils.reverse_dictionary(character_to_index)
        if self.verbose: print('character_to_index: {0}'.format(character_to_index))
        if self.verbose: print('index_to_character: {0}'.format(index_to_character))


        if self.verbose: print('labels[\'train\'][0:10]: {0}'.format(labels['train'][0:10]))
        if self.verbose: print('tokens[\'train\'][0:10]: {0}'.format(tokens['train'][0:10]))

        if self.verbose:
            # Print sequences of length 1 in train set
            for token_sequence, label_sequence in zip(tokens['train'], labels['train']):
                if len(label_sequence) == 1 and label_sequence[0] != 'O':
                    print("{0}\t{1}".format(token_sequence[0], label_sequence[0]))

        self.token_to_index = token_to_index
        self.index_to_token = index_to_token
        self.index_to_character = index_to_character
        self.character_to_index = character_to_index
        self.index_to_label = index_to_label
        self.label_to_index = label_to_index
        if self.verbose: print("len(self.token_to_index): {0}".format(len(self.token_to_index)))
        if self.verbose: print("len(self.index_to_token): {0}".format(len(self.index_to_token)))
        self.tokens = tokens
        self.labels = labels

        # The maps above are stored on self before this call
        # (presumably _convert_to_indices reads them — confirm)
        token_indices, label_indices, character_indices_padded, character_indices, token_lengths, characters, label_vector_indices = self._convert_to_indices(dataset_filepaths.keys())

        self.token_indices = token_indices
        self.label_indices = label_indices
        self.character_indices_padded = character_indices_padded
        self.character_indices = character_indices
        self.token_lengths = token_lengths
        self.characters = characters
        self.label_vector_indices = label_vector_indices

        self.number_of_classes = max(self.index_to_label.keys()) + 1
        self.vocabulary_size = max(self.index_to_token.keys()) + 1
        self.alphabet_size = max(self.index_to_character.keys()) + 1
        if self.verbose: print("self.number_of_classes: {0}".format(self.number_of_classes))
        if self.verbose: print("self.alphabet_size: {0}".format(self.alphabet_size))
        if self.verbose: print("self.vocabulary_size: {0}".format(self.vocabulary_size))

        # unique_labels_of_interest is used to compute F1-scores.
        self.unique_labels_of_interest = list(self.unique_labels)
        # Guarded: list.remove raises ValueError when the data contains no 'O' label at all
        if 'O' in self.unique_labels_of_interest:
            self.unique_labels_of_interest.remove('O')

        self.unique_label_indices_of_interest = []
        for lab in self.unique_labels_of_interest:
            self.unique_label_indices_of_interest.append(label_to_index[lab])

        self.infrequent_token_indices = infrequent_token_indices

        if self.verbose: print('self.unique_labels_of_interest: {0}'.format(self.unique_labels_of_interest))
        if self.verbose: print('self.unique_label_indices_of_interest: {0}'.format(self.unique_label_indices_of_interest))

        elapsed_time = time.time() - start_time
        print('done ({0:.2f} seconds)'.format(elapsed_time))

        return token_to_vector