# 1.Test create Vocabulary function

In [13]:
import sklearn.preprocessing
# import utils
import collections
import codecs
# import utils_nlp
import re
import time
import token
import os
import pickle
import random
import json
import timeit
import numpy as np

path_train = '../data/CoNLL2003/eng.train'
path_eval = '../data/CoNLL2003/eng.testa'
path_test = '../data/CoNLL2003/eng.testb'

In [14]:
# first 10 lines in test file
! head -10 ../data/CoNLL2003/eng.train

-DOCSTART- -X- O O

EU NNP I-NP I-ORG
rejects VBZ I-VP O
German JJ I-NP I-MISC
call NN I-NP O
to TO I-VP O
boycott VB I-VP O
British JJ I-NP I-MISC
lamb NN I-NP O


In [15]:
def get_glove_vocab(filename):
    vocab = set()
    with open(filename) as f:
        for line in f:
            word = line.strip().split(' ')[0]
            vocab.add(word)
    return vocab

In [16]:
def get_2idx(vocabu, save_idx = False, file_path = None):
    dictionary = dict()
    for idx, word in enumerate(vocabu):
        word = word.strip()
        dictionary[word] = idx
        
    # save index    
    if save_idx:
        if not os.path.exists(os.path.dirname(file_path)):
            try:
                os.makedirs(os.path.dirname(file_path))
            except OSError as exc: # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise                    
        with open(file_path, 'w+') as fp:
            json.dump(dictionary, fp, indent=4)
            
    return dictionary

In [17]:
def get_embedding_lookup_table(vocab, glove_filename, dim = 100, save_table = False, file_path = None):

    embeddings = np.zeros([len(vocab), dim])
    with open(glove_filename) as f:
        for line in f:
            line = line.strip().split(' ')
            word = line[0]
            embedding = [float(x) for x in line[1:]]
            if word in vocab:
                word_idx = vocab[word]
                embeddings[word_idx] = np.asarray(embedding)
                
    # save lookup table
    if save_table:
        if not os.path.exists(os.path.dirname(file_path)):
            try:
                os.makedirs(os.path.dirname(file_path))
            except OSError as exc: # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise                    
        with open(file_path, 'w+') as fp:
            np.savez_compressed(trimmed_filename, embeddings=embeddings)
            
    return embeddings

In [18]:
def get_vocabs(filepath, sparator = ' ', lowercase = True):
    
    count_token = collections.Counter()
    count_label = collections.Counter()
    count_character = collections.Counter()
    
    if filepath:
        f = codecs.open(filepath, 'r', 'UTF-8')
        for line in f:
            line = line.strip().split(sparator)

            #skip sentence separator
            if len(line) == 0 or len(line[0]) == 0 or '-DOCSTART-' in line[0]:
                continue
            
            token = str(line[0])
            for character in token:
                count_character.update({character: 1})
                
            # lowercase & digit
            if lowercase:
                token = str(line[0]).lower()
            else:
                token = str(line[0])
                
            # use the digit in pretrained embedding
#             if token.isdigit():
#                 token = '$NUM$'
                
            label = str(line[-1])
            count_token.update({token: 1})
            count_label.update({label: 1})              
        
        f.close()    
            
    return count_token, count_label, count_character

In [19]:
start = timeit.default_timer()
print("Building vocab...")
count_token = {} 
count_label = {} 
count_character = {}

datasets = [('train',path_train), ('eval', path_eval), ('test', path_test)]
for dataset in datasets:
    count_token[dataset[0]], count_label[dataset[0]], count_character[dataset[0]] = get_vocabs(dataset[1])

vocab_token_corpus = count_token['train'] + count_token['eval'] + count_token['test']
vocab_label = count_label['train'] + count_label['eval'] + count_label['test']
vocab_char = count_character['train'] + count_character['eval'] + count_character['test']

# sorted the vocabu by frequency 
vocab_token_corpus = [x[0] for x in vocab_token_corpus.most_common()]
vocab_label = [x[0] for x in vocab_label.most_common()]
vocab_char = [x[0] for x in vocab_char.most_common()]

# future features: limit the vocabulary by threshold
###############################################
# if config.vocabulary_threshold > 1:
#     vocab_token_corpus = 
###############################################

# vocab in pre-trained embedding
filename_glove = '../data/glove/glove.6B.100d.txt'
vocab_glove = get_glove_vocab(filename_glove)

# selected only common vocabs in corpus and pre-trained embedding(like glove)
vocab_token_final = [token for token in vocab_token_corpus if token.strip() in vocab_glove]
vocab_token_final = ['$UNK$'] + vocab_token_final


# generate 2idx mapping dict for token, char, label
path = '../data/idx/'
save_idx = True
paths = ['../data/idx/token2idx.json', '../data/idx/label2idx.json', '../data/idx/tag2idx.json']
token2idx = get_2idx(vocab_token_final, save_idx, paths[0])
char2idx = get_2idx(vocab_char, save_idx, paths[1])
label2idx = get_2idx(vocab_label, save_idx, paths[2])

# get embedding lookup table
lookup_table = get_embedding_lookup_table(token2idx, filename_glove)

stop = timeit.default_timer()
print("vocabulary for this corpus: {} tokens, {} labels, {} chars"
      .format(len(vocab_token_corpus), len(vocab_char),len(vocab_label)))
print("final vocabulary : {} vocabs"
      .format(len(vocab_token_final)))
print('vocabulary construction time: ', stop - start) 

Building vocab...
vocabulary for this corpus: 26869 tokens, 85 labels, 8 chars
final vocabulary : 22948 vocabs
vocabulary construction time:  19.17466149595566


### 1.1 check internal variable

In [20]:
token2idx

{'$UNK$': 0,
 'the': 1,
 ',': 2,
 '.': 3,
 'of': 4,
 'in': 5,
 'to': 6,
 'a': 7,
 '(': 8,
 ')': 9,
 'and': 10,
 '"': 11,
 'on': 12,
 'said': 13,
 "'s": 14,
 'for': 15,
 '-': 16,
 '1': 17,
 'at': 18,
 'was': 19,
 '2': 20,
 'with': 21,
 '3': 22,
 '0': 23,
 'that': 24,
 'he': 25,
 'from': 26,
 'by': 27,
 'it': 28,
 ':': 29,
 'is': 30,
 '4': 31,
 'as': 32,
 'his': 33,
 'had': 34,
 'were': 35,
 'an': 36,
 'but': 37,
 'not': 38,
 'after': 39,
 'has': 40,
 'be': 41,
 'have': 42,
 'new': 43,
 'first': 44,
 'who': 45,
 '5': 46,
 'will': 47,
 '6': 48,
 'two': 49,
 'they': 50,
 'u.s.': 51,
 '$': 52,
 'been': 53,
 'their': 54,
 'i': 55,
 'are': 56,
 'which': 57,
 'would': 58,
 '--': 59,
 'beat': 60,
 'friday': 61,
 'this': 62,
 '7': 63,
 'up': 64,
 'its': 65,
 'percent': 66,
 'one': 67,
 'out': 68,
 'we': 69,
 'year': 70,
 'thursday': 71,
 'over': 72,
 'last': 73,
 'million': 74,
 'government': 75,
 'police': 76,
 'against': 77,
 'results': 78,
 '10': 79,
 'world': 80,
 'when': 81,
 'second': 82,


In [21]:
print('chars that in test not in train:', count_character['test'].keys() - count_character['train'].keys())

chars that in test not in train: {'#'}


In [22]:
print('label vocabs:', vocab_label)

label vocabs: ['O', 'I-PER', 'I-ORG', 'I-LOC', 'I-MISC', 'B-MISC', 'B-ORG', 'B-LOC']


In [23]:
print('chars vocabs in train:{}, vs in total corpus:{}'.format(len(count_character['train'].keys()), 
                                                               len(vocab_char)))

chars vocabs in train:84, vs in total corpus:85


In [24]:
print('token vocabs that in corpus:',len(vocab_token_corpus))
print('token vocabs that in pretrained-embedding:',len(vocab_glove))
print('final token vocabs:',len(vocab_token_final))
print('shape of lookup table:',lookup_table.shape)

token vocabs that in corpus: 26869
token vocabs that in pretrained-embedding: 400000
final token vocabs: 22948
shape of lookup table: (22948, 100)


### 1.2 check finally result
* extract embedding directly from glove 
* extract embedding from lookup table by token2idx
* compare finally result

In [25]:
def get_glove(file_path):
    glove = dict()
    with open(file_path) as f:
        for line in f:
            line = line.strip().split(' ')
            word = line[0]
            embedding = [float(x) for x in line[1:]]
            glove[word] = embedding
    return glove

In [26]:
test = ['i', 'went', 'to', 'paris', 'yesterday', '.']
filename_glove = '../data/glove/glove.6B.100d.txt'
glove = get_glove(filename_glove)
voctors_from_glove = np.asarray([glove[token] for token in test])
voctors_from_table = np.asarray([lookup_table[token2idx[token]] for token in test])

In [27]:
print('if the finall result correct:',np.all(voctors_from_glove == voctors_from_table))

if the finall result correct: True


# 2. Split CoNLL columns into sentences + Mapping to Idx

In [28]:
def get_inputs(dataset_filepath, token2idx, char2idx, label2idx, sparator = ' ', lowercase = True):
    
    # collection per sentence
    # format [[[char_idxs], word_idx], ...]
    sentence_token = []
    # format [[label], ...]
    sentence_label = []
    
    # format [[sentence1_token], [sentence2_token], ...]
    tokens = []
    # format [[sentence1_label], [sentence2_label], ...]
    labels = []

    # go throught whole CoNLL file
    f = codecs.open(dataset_filepath, 'r', 'UTF-8')
    for line in f:
        line = line.strip().split(sparator)
        # encouter a new sentence
        if len(line) == 0 or len(line[0]) == 0 or '-DOCSTART-' in line[0]:
            if len(sentence_token) > 0:
                labels.append(sentence_label)
                tokens.append(sentence_token)
                sentence_label = []
                sentence_token = []
            continue
                
        token = str(line[0])
        label = str(line[-1])    
        # 1. preprocess word
        if lowercase:
            word = token.lower()
        # don't use NUM
#         if word.isdigit():
#             word = NUM

        # char idxs
        char_idxs = []
        for char in word:
            if char in char2idx:
                char_idxs += [char2idx[char]]  
            else:
                print("encounter UNK char:", char)
        
        # word idx
        if word in token2idx:
            word_idx = token2idx[word]
        else:
            word_idx = token2idx['$UNK$']
        
        # label idx
        if label in label2idx:
            label_idx = label2idx[label]
        else:
            print("encounter UNK label:", label)
            
        sentence_token.append((char_idxs, word_idx))
        sentence_label.append(label_idx)

    if len(sentence_token) > 0:
        tokens.append(sentence_token)
        labels.append(sentence_label)

    f.close()
    
    return tokens, labels

### 2.2 check finally result

In [29]:
! cat ../data/test_conll_small.txt

-DOCSTART- -X- O O

CRICKET NNP I-NP O
- : O O
LEICESTERSHIRE NNP I-NP I-ORG
TAKE NNP I-NP O
OVER IN I-PP O
AT NNP I-NP O
TOP NNP I-NP O
AFTER NNP I-NP O
INNINGS NNP I-NP O
VICTORY NN I-NP O
. . O O

LONDON NNP I-NP I-LOC
1996-08-30 CD I-NP O


test123 CD I-NP O
123 CD I-NP O
test CD I-NP O
I CD I-NP O
went CD I-NP O
to CD I-NP O
Paris CD I-NP I-LOC
yesterday CD I-NP O
. CD I-NP O

CRICKET NNP I-NP O
- : O O
LEICESTERSHIRE NNP I-NP I-ORG
TAKE NNP I-NP O
OVER IN I-PP O
AT NNP I-NP O
TOP NNP I-NP O
AFTER NNP I-NP O
INNINGS NNP I-NP O
VICTORY NN I-NP O
. . O O

LONDON NNP I-NP I-LOC
1996-08-30 CD I-NP O


test123 CD I-NP O
123 CD I-NP O
test CD I-NP O
I CD I-NP O
went CD I-NP O
to CD I-NP O
Paris CD I-NP I-LOC
yesterday CD I-NP O
. CD I-NP O


In [30]:
dataset_filepath = '../data/test_conll_small1.txt'
tokens, labels = get_inputs(dataset_filepath, token2idx, char2idx, label2idx)
print('number of sentence:', len(tokens))
print('number of token of test sentence:', len(tokens[-1]))

number of sentence: 6
number of token of test sentence: 9


In [31]:
print('shape of tokens:', np.asarray(tokens).shape)
print('shape of labels:', np.asarray(labels).shape)

shape of tokens: (6,)
shape of labels: (6,)


In [32]:
tokens[-1]

[([2, 0, 7, 2, 21, 29, 37], 0),
 ([21, 29, 37], 12655),
 ([2, 0, 7, 2], 404),
 ([4], 55),
 ([19, 0, 3, 2], 626),
 ([2, 5], 6),
 ([15, 1, 6, 4, 7], 400),
 ([18, 0, 7, 2, 0, 6, 9, 1, 18], 2529),
 ([17], 3)]

In [33]:
checking = [ '123', 'test', 'i', 'went', 'to', 'paris', 'yesterday', '.']
print('token for checking:', [token2idx[token] for token in checking])

token for checking: [12655, 404, 55, 626, 6, 400, 2529, 3]


In [34]:
checking = [ 'y', 'e', 's', 't', 'e', 'r','d','a', 'y', '.']
print('token for checking:', [char2idx[token] for token in checking])

token for checking: [18, 0, 7, 2, 0, 6, 9, 1, 18, 17]


In [35]:
labels[-1]

[0, 0, 0, 0, 0, 0, 3, 0, 0]

In [36]:
checking = [ 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'O']
print('token for checking:', [label2idx[token] for token in checking])

token for checking: [0, 0, 0, 0, 0, 0, 3, 0, 0]


# 3. shuffle data + Minibase

In [37]:
tokens

[[([11, 6, 4, 11, 28, 0, 2], 277),
  ([22], 16),
  ([8, 0, 4, 11, 0, 7, 2, 0, 6, 7, 10, 4, 6, 0], 1907),
  ([2, 1, 28, 0], 244),
  ([5, 26, 0, 6], 72),
  ([1, 2], 18),
  ([2, 5, 15], 385),
  ([1, 14, 2, 0, 6], 39),
  ([4, 3, 3, 4, 3, 16, 7], 284),
  ([26, 4, 11, 2, 5, 6, 18], 310),
  ([17], 3)],
 [([8, 5, 3, 9, 5, 3], 100), ([21, 31, 31, 32, 22, 24, 45, 22, 37, 24], 0)],
 [([2, 0, 7, 2, 21, 29, 37], 0),
  ([21, 29, 37], 12655),
  ([2, 0, 7, 2], 404),
  ([4], 55),
  ([19, 0, 3, 2], 626),
  ([2, 5], 6),
  ([15, 1, 6, 4, 7], 400),
  ([18, 0, 7, 2, 0, 6, 9, 1, 18], 2529),
  ([17], 3)],
 [([11, 6, 4, 11, 28, 0, 2], 277),
  ([22], 16),
  ([8, 0, 4, 11, 0, 7, 2, 0, 6, 7, 10, 4, 6, 0], 1907),
  ([2, 1, 28, 0], 244),
  ([5, 26, 0, 6], 72),
  ([1, 2], 18),
  ([2, 5, 15], 385),
  ([1, 14, 2, 0, 6], 39),
  ([4, 3, 3, 4, 3, 16, 7], 284),
  ([26, 4, 11, 2, 5, 6, 18], 310),
  ([17], 3)],
 [([8, 5, 3, 9, 5, 3], 100), ([21, 31, 31, 32, 22, 24, 45, 22, 37, 24], 0)],
 [([2, 0, 7, 2, 21, 29, 37], 0),
  ([

In [38]:
labels

[[0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0],
 [3, 0],
 [0, 0, 0, 0, 0, 0, 3, 0, 0],
 [0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0],
 [3, 0],
 [0, 0, 0, 0, 0, 0, 3, 0, 0]]

In [39]:
def next_batch(tokens, labels, batch_size = 1, shuffle = True):
    # shuffle the data at the beginning of each epoch
    data = np.array([[tokens[i], labels[i]] for i in range(len(tokens))])
    np.random.shuffle(data)
    token, labels = zip(*data)
#     print(token)
#     print()
#     print(labels)
    
    # generate mini batches
    for i in np.arange(0, len(token), batch_size):
        offset = min(i+batch_size, len(token))
        yield (token[i:offset], labels[i:offset])


In [40]:
for (x_batch, y_batch) in next_batch(tokens, labels,2):
    print(len(x_batch[0]),len(x_batch[1]))
    print(len(y_batch[0]),len(y_batch[1]))
    print()

11 11
11 11

9 9
9 9

2 2
2 2



# 4. Padding sentence into fixed length sequence
* reference https://guillaumegenthial.github.io/sequence-tagging-with-tensorflow.html

In [53]:
for (x_batch, y_batch) in next_batch(tokens, labels,6):
    sentences = [list(zip(*x))[1] for x in x_batch]
    char_sentences = [list(zip(*x))[0] for x in x_batch]
    print(len(sentences))
    print()
    y_labels = y_batch

6



In [50]:
len(sentences)

1

In [403]:
print(len(sentences[0]), len(sentences[1]))
print(len(char_sentences[0]), len(char_sentences[1]))
print(len(y_labels[0]), len(y_labels[1]))

11 9
11 9
11 9


In [410]:
def pad_sentence(batch_setence):
    
    # find the max_length
    max_length = max(map(lambda x : len(x), batch_setence))
    
    # padding
    sequence_padded = []
    sequence_length = []
    for seq in batch_setence:
        seq = list(seq)
        seq_ = seq[:max_length] + [-1]*max(max_length - len(seq), 0)
        sequence_padded +=  [seq_]
        sequence_length += [min(len(seq), max_length)]

    return sequence_padded, sequence_length

In [436]:
def pad_word(batch_setence_word):
    '''
    https://guillaumegenthial.github.io/sequence-tagging-with-tensorflow.html
    '''
    max_length_word = max([max(map(lambda x: len(x), seq))
                           for seq in batch_setence_word])
    sequence_padded, sequence_length = [], []
    for seq in batch_setence_word:
        # all words are same length now
        sp, sl = _pad_sequences(seq, -1, max_length_word)
        sequence_padded += [sp]
        sequence_length += [sl]

    max_length_sentence = max(map(lambda x : len(x), batch_setence_word))
    sequence_padded, _ = _pad_sequences(sequence_padded,
            [-1]*max_length_word, max_length_sentence)
    sequence_length, _ = _pad_sequences(sequence_length, -1,
            max_length_sentence)

    return sequence_padded, sequence_length

def _pad_sequences(sequences, pad_tok, max_length):
    sequence_padded, sequence_length = [], []

    for seq in sequences:
        seq = list(seq)
        seq_ = seq[:max_length] + [pad_tok]*max(max_length - len(seq), 0)
        sequence_padded +=  [seq_]
        sequence_length += [min(len(seq), max_length)]

    return sequence_padded, sequence_length

In [489]:
for (x_batch, y_batch) in next_batch(tokens, labels,2):
    sentences = [list(zip(*x))[1] for x in x_batch]
    char_sentences = [list(zip(*x))[0] for x in x_batch]
    y_labels = y_batch
    pad_sentences,sentence_length = pad_sentence(sentences)
    pad_char_sentences, word_length = pad_word(char_sentences)
    

In [438]:
sentences

[(277, 16, 1907, 244, 72, 18, 385, 39, 284, 310, 3), (100, 0)]

In [439]:
char_sentences

[([11, 6, 4, 11, 28, 0, 2],
  [22],
  [8, 0, 4, 11, 0, 7, 2, 0, 6, 7, 10, 4, 6, 0],
  [2, 1, 28, 0],
  [5, 26, 0, 6],
  [1, 2],
  [2, 5, 15],
  [1, 14, 2, 0, 6],
  [4, 3, 3, 4, 3, 16, 7],
  [26, 4, 11, 2, 5, 6, 18],
  [17]),
 ([8, 5, 3, 9, 5, 3], [21, 31, 31, 32, 22, 24, 45, 22, 37, 24])]

In [440]:
pad_sentences

[[277, 16, 1907, 244, 72, 18, 385, 39, 284, 310, 3],
 [100, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1]]

In [490]:
sentence_length

[9, 11]

In [491]:
pad_char_sentences

[[[2, 0, 7, 2, 21, 29, 37, -1, -1, -1, -1, -1, -1, -1],
  [21, 29, 37, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
  [2, 0, 7, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
  [4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
  [19, 0, 3, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
  [2, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
  [15, 1, 6, 4, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1],
  [18, 0, 7, 2, 0, 6, 9, 1, 18, -1, -1, -1, -1, -1],
  [17, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
  [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
  [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]],
 [[11, 6, 4, 11, 28, 0, 2, -1, -1, -1, -1, -1, -1, -1],
  [22, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
  [8, 0, 4, 11, 0, 7, 2, 0, 6, 7, 10, 4, 6, 0],
  [2, 1, 28, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
  [5, 26, 0, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
  [1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
  [2, 5, 15, -1, -1, -1, -1,

In [492]:
word_length

[[7, 3, 4, 1, 4, 2, 5, 9, 1, -1, -1], [7, 1, 14, 4, 4, 2, 3, 5, 7, 7, 1]]

* token_to_vector = utils_nlp.load_pretrained_token_embeddings(parameters) // do not modify like add new vectors

* token_count['all'] : {word, count} : for all in train, test, valid, dev 
* add to token_count['all'] the vocabu in token_to_vector {-1}
* same for chars
* same for chars
* order by frequency
* set: token_to_index {word, integer} : 
    if not in train & not in token_to_vector & not in pretrained dataset --> UNK
    == index ++
* infrequent_token_indices


* Label
    * aligin ['B-', 'I-', 'E-', 'S-']
    * order
    * label_to_index

* character
    * character_to_index

*idx2
    index_to_token
    label_to_index
    index_to_character
    
    
* _convert_to_indices
    * 2idx
    * max_length
    * padding

In [None]:
# Generators object
dev   = CoNLLDataset(config.filename_dev, processing_word)
    lowercase 
    digit