In [24]:
import sklearn.preprocessing
# import utils
import collections
import codecs
# import utils_nlp
import re
import time
import token
import os
import pickle
import random
import numpy as np
path_train = '../data/CoNLL2003/eng.train'
path_eval = '../data/CoNLL2003/eng.testa'
path_test = '../data/CoNLL2003/eng.testb'

### Here we analysis the vocabulary of CoNLL 2003 dataset for:
 * token
 * character
 * label<br><br>

And compare the token vocabulary OOV rate with glove.6B


In [2]:
# first 10 lines in test file
! head -20 ../data/CoNLL2003/eng.train

-DOCSTART- -X- O O

EU NNP I-NP I-ORG
rejects VBZ I-VP O
German JJ I-NP I-MISC
call NN I-NP O
to TO I-VP O
boycott VB I-VP O
British JJ I-NP I-MISC
lamb NN I-NP O
. . O O

Peter NNP I-NP I-PER
Blackburn NNP I-NP I-PER

BRUSSELS NNP I-NP I-LOC
1996-08-22 CD I-NP O

The DT I-NP O
European NNP I-NP I-ORG


In [4]:
def get_vocabs(filepath, sparator = ' ', lowercase = True):
    
    count_token = collections.Counter()
    count_label = collections.Counter()
    count_character = collections.Counter()
    
    if filepath:
        f = codecs.open(filepath, 'r', 'UTF-8')
        for line in f:
            line = line.strip().split(sparator)

            #skip sentence separator
            if len(line) == 0 or len(line[0]) == 0 or '-DOCSTART-' in line[0]:
                continue
            
            token = str(line[0])
            for character in token:
                count_character.update({character: 1})
                
            # lowercase & digit
            if lowercase:
                token = str(line[0]).lower()
            else:
                token = str(line[0])
            if token.isdigit():
                token = '$NUM$'
                
            label = str(line[-1])
            count_token.update({token: 1})
            count_label.update({label: 1})              
        
        f.close()    
            
    return count_token, count_label, count_character

In [12]:
count_token = {} 
count_label = {} 
count_character = {}

datasets = [('train',path_train), ('eval', path_eval), ('test', path_eval)]
for dataset in datasets:
    count_token[dataset[0]], count_label[dataset[0]], count_character[dataset[0]] = get_vocabs(dataset[1])
    
vocabs_corpus_token = count_token['train'] + count_token['test'] + count_token['eval']
vocabs_label = count_label['train'] + count_label['test'] + count_label['eval']
vocabs_char = count_character['train'] + count_character['test'] + count_character['eval']

### Label vocabulary

In [18]:
print('Label size of CoNLL dataset: ',len(vocabs_label))
vocabs_label

Label size of CoNLL dataset:  8


Counter({'B-LOC': 11,
         'B-MISC': 45,
         'B-ORG': 24,
         'I-LOC': 12474,
         'I-MISC': 7084,
         'I-ORG': 14185,
         'I-PER': 17426,
         'O': 255096})

### Character vocabulary

In [15]:
print('character size of CoNLL dataset: ',len(vocabs_char))
vocabs_char

character size of CoNLL dataset:  84


Counter({'!': 6,
         '"': 3458,
         '$': 628,
         '%': 23,
         '&': 65,
         "'": 3049,
         '(': 4222,
         ')': 4223,
         '*': 58,
         '+': 226,
         ',': 12315,
         '-': 12566,
         '.': 18254,
         '/': 664,
         '0': 10557,
         '1': 12343,
         '2': 8111,
         '3': 6040,
         '4': 5165,
         '5': 5220,
         '6': 6520,
         '7': 4004,
         '8': 4521,
         '9': 6931,
         ':': 1410,
         ';': 101,
         '=': 30,
         '?': 19,
         '@': 6,
         'A': 9594,
         'B': 4950,
         'C': 6639,
         'D': 3959,
         'E': 6196,
         'F': 3103,
         'G': 3434,
         'H': 3086,
         'I': 6799,
         'J': 2148,
         'K': 2462,
         'L': 4677,
         'M': 5385,
         'N': 6150,
         'O': 5038,
         'P': 3899,
         'Q': 213,
         'R': 5995,
         'S': 10345,
         'T': 8137,
         'U': 2855,
         'V': 1

<div class="alert alert-success" align="justify">
the character vocabulary in CoNLL 2003 data set is small than UTF-8-128, so i choose to build our characters embedding table from the character in corpus, not from UTF-8-128, since the UTF-8-128 also contain some un-visible character that make no means in understand the text.
</div>

### Word vocabulary

In [32]:
def get_glove_vocab(filename):
    vocab = set()
    with open(filename) as f:
        for line in f:
            word = line.strip().split(' ')[0]
            vocab.add(word)
    return vocab

In [37]:
# glove.6B.100d vocab 
vocab_glove = get_glove_vocab('../data/glove/glove.6B.100d.txt')
total_token_glove = len(vocab_glove)
total_token_corpus = np.sum(list(vocabs_corpus_token.values()))

print('vocab size of glove:', total_token_glove)
print('vocab size of CoNLL:',total_token_corpus)

vocab size of glove: 400000
vocab size of CoNLL: 306345


In [83]:
OOV = set(vocabs_corpus_token.keys()) - vocab_glove
print('OOV rate in CoNLL compare with glove :{}%'.format((np.sum([vocabs_corpus_token[x] for x in OOV])/total_token_corpus)*100))

OOV rate in CoNLL compare with glove :7.550310923958283%


#### Digit rate in CoNLL

In [49]:
print('digit rate in CoNLL :{}%'.format((vocabs_corpus_token['$NUM$']/total_token_corpus)*100))

digit rate in CoNLL :5.391307186342196%


vocabs_corpus_token.most_common()[-20:]

in CoNLL dataset it has many different digit formats like:
* '9.5',
* '73.83',
* '4:13.353',
* '14,668,000',
* '312-408-8787',
* '32.0-43.0'.

<div class="alert alert-success" align="justify">
the digit consist most of the OOV token in CoNLL data set.
</div>

In [127]:
def find_all_digit(filepath, sparator = ' ', lowercase = True):
    
    digit = []
    if filepath:
        f = codecs.open(filepath, 'r', 'UTF-8')
        for line in f:
            line = line.strip().split(sparator)

            #skip sentence separator
            if len(line) == 0 or len(line[0]) == 0 or '-DOCSTART-' in line[0]:
                continue
                

            token = str(line[0])
            token1 = token
            if len(token) > 1 and any(char.isdigit() for char in token):
                token = token.replace('-','1')
                token = token.replace(',','1')
                token = token.replace('.','1')
                token = token.replace(':','1')
            if token.isdigit():
                digit.append(token1)
                
        f.close()    
            
    return digit

In [128]:
digits = {}
datasets = [('train',path_train), ('eval', path_eval), ('test', path_eval)]
for dataset in datasets:
    digits[dataset[0]] = find_all_digit(dataset[1])
    
vocabs_digits = digits['train'] + digits['test'] + digits['eval']
vocabs_digits

['1996-08-22',
 '47,600',
 '4,275',
 '10',
 '17,000',
 '1996-08-22',
 '17,000',
 '10,925',
 '16,935',
 '1966',
 '1967',
 '16',
 '1966',
 '1969',
 '5,060',
 '7,845',
 '1970',
 '27',
 '1996-08-22',
 '1996-08-22',
 '14.2',
 '1996-08-22',
 '14.2',
 '356,725',
 '1996',
 '304,850',
 '15,613',
 '13.6',
 '2.2',
 '1995',
 '32.7',
 '77,719',
 '49,269',
 '16.4',
 '35,563',
 '11.7',
 '1996',
 '3,420',
 '5522',
 '554',
 '643',
 '1996-08-22',
 '3311812-4',
 '100',
 '1996-08-22',
 '100',
 '6.625',
 '100.92',
 '1.875',
 '99.32',
 '1-10-100',
 '0.275',
 '1.60',
 '7.0',
 '2001',
 '171',
 '542',
 '7658',
 '300',
 '1999',
 '1996-08-22',
 '300',
 '12.5',
 '99.956',
 '10',
 '1-10-100',
 '2',
 '5',
 '5',
 '171',
 '542',
 '8863',
 '10',
 '24',
 '1996-08-22',
 '1967',
 '1991',
 '1996-08-22',
 '100',
 '800',
 '1996-08-22',
 '1991',
 '100',
 '1996-08-22',
 '3.7504',
 '06',
 '1996-08-22',
 '1996-08-22',
 '1996-08-22',
 '1995',
 '76',
 '1996-08-22',
 '1990',
 '1995',
 '22',
 '1996-08-22',
 '12',
 '22',
 '1996-08-2

In [129]:
print('General digit rate in CoNLL  :{}%'.format((len(vocabs_digits)/total_token_corpus)*100))

General digit rate in CoNLL  :8.418613001681111%


#### and digit should not count some strings like
* 84th
* 20-month
* 25-YEAR