In [61]:
import deeppavlov
from deeppavlov.core.data.utils import download_decompress
# Fetch the CoNLL-2003 archive and place its contents under data/
# (behaviour inferred from the helper's name; requires network access).
download_decompress('http://lnsigo.mipt.ru/export/deeppavlov_data/conll2003_v2.tar.gz', 'data/')

2018-08-20 22:21:48.45 DEBUG in 'urllib3.connectionpool'['connectionpool'] at line 205: Starting new HTTP connection (1): 127.0.0.1:34799
2018-08-20 22:21:48.415 DEBUG in 'urllib3.connectionpool'['connectionpool'] at line 393: http://127.0.0.1:34799 "GET http://lnsigo.mipt.ru/export/deeppavlov_data/conll2003_v2.tar.gz HTTP/1.1" 302 None
2018-08-20 22:21:48.423 DEBUG in 'urllib3.connectionpool'['connectionpool'] at line 240: Resetting dropped connection: 127.0.0.1
2018-08-20 22:21:48.441 DEBUG in 'urllib3.connectionpool'['connectionpool'] at line 393: http://127.0.0.1:34799 "GET http://124.202.164.9/files/314600000C706EB6/lnsigo.mipt.ru/export/deeppavlov_data/conll2003_v2.tar.gz HTTP/1.1" 302 None
2018-08-20 22:21:48.448 DEBUG in 'urllib3.connectionpool'['connectionpool'] at line 240: Resetting dropped connection: 127.0.0.1
2018-08-20 22:21:48.458 DEBUG in 'urllib3.connectionpool'['connectionpool'] at line 393: http://127.0.0.1:34799 "GET http://202.112.144.234/files/3146000001CF3D15/ln

In [64]:
from pathlib import Path

class NerDatasetReader:
    """Reader for CoNLL-style NER files.

    Each non-blank line holds whitespace-separated columns with the token first
    and the NER tag last; blank lines separate sentences and lines containing
    ``DOCSTART`` separate documents.
    """

    def read(self, data_path, provide_pos=False):
        """Read ``train.txt``, ``valid.txt`` and ``test.txt`` from ``data_path``.

        Args:
            data_path: directory that contains the three split files.
            provide_pos: if True every sample is ``((tokens, pos_tags), tags)``,
                otherwise ``(tokens, tags)``.

        Returns:
            dict mapping ``'train'``, ``'valid'`` and ``'test'`` to the list of
            samples read from the corresponding file.
        """
        self.provide_pos = provide_pos
        dataset = {}
        for data_part in ('train', 'valid', 'test'):
            file_path = Path(data_path) / (data_part + '.txt')
            dataset[data_part] = self.read_file(str(file_path))
        return dataset

    def read_file(self, file_path):
        """Parse a single file into a list of samples.

        The buffers start with a synthetic ``<DOCSTART>`` token tagged ``'O'``, so a
        file that opens with a DOCSTART line followed by a blank line yields that
        placeholder as its first sample.

        Args:
            file_path: path of the file to parse.

        Returns:
            list of ``(tokens, tags)`` tuples, or ``((tokens, pos_tags), tags)``
            tuples when POS tags were requested through ``read``.
        """
        # Allow read_file to be used on its own, before read() has set the flag.
        provide_pos = getattr(self, 'provide_pos', False)

        def make_sample(tokens, pos_tags, tags):
            # Assemble one sample in the layout selected by provide_pos.
            if provide_pos:
                return ((tokens, pos_tags), tags)
            return (tokens, tags)

        samples = []
        with open(file_path, 'r', encoding='utf-8') as fin:
            tokens, pos_tags, tags = ['<DOCSTART>'], ['O'], ['O']
            # True while the buffers hold nothing but the synthetic placeholder above.
            placeholder_only = True
            for line in fin:
                if 'DOCSTART' in line:
                    # Document boundary: flush only if real tokens were collected.
                    if len(tokens) > 1:
                        samples.append(make_sample(tokens, pos_tags, tags))
                        tokens, pos_tags, tags = [], [], []
                        placeholder_only = False
                elif len(line) < 2 or not line.strip():
                    # Sentence boundary (blank or whitespace-only line).
                    if len(tokens) > 0:
                        samples.append(make_sample(tokens, pos_tags, tags))
                        tokens, pos_tags, tags = [], [], []
                        placeholder_only = False
                else:
                    if provide_pos:
                        # The column right before the tag is used as the POS tag.
                        token, *_, pos, tag = line.split()
                        pos_tags.append(pos)
                    else:
                        token, *_, tag = line.split()
                    tags.append(tag)
                    tokens.append(token)
                    placeholder_only = False
            # Keep the final sentence when the file does not end with a blank line.
            if tokens and not placeholder_only:
                samples.append(make_sample(tokens, pos_tags, tags))
        return samples

In [65]:
from collections import defaultdict, Counter
from itertools import chain
import numpy as np

In [66]:
class Vocab:
    """Two-way mapping between tokens (or tags) and integer indices.

    Special tokens receive the first indices; all other tokens are numbered in
    order of decreasing frequency. Strings that are not in the vocabulary map
    to index 0.
    """

    def __init__(self, special_tokens=tuple()):
        """Create an empty vocabulary.

        Args:
            special_tokens: tokens that must occupy the first indices, in order.
        """
        self.special_tokens = special_tokens
        self._t2i = defaultdict(lambda: 0)  # token -> index; missing keys default to 0
        self._i2t = []  # index -> token

    def fit(self, tokens):
        """Build the vocabulary from an iterable of token sequences.

        The mapping is rebuilt from scratch on every call, so fitting again
        (for instance by re-running a notebook cell) does not duplicate entries.

        Args:
            tokens: iterable of sequences of tokens.
        """
        self._t2i = defaultdict(lambda: 0)
        self._i2t = []
        self.freqs = Counter(chain(*tokens))  # token -> number of occurrences
        by_frequency = (token for token, _ in self.freqs.most_common())
        # Special tokens first, then the remaining tokens from most to least frequent.
        for token in chain(self.special_tokens, by_frequency):
            if token not in self._t2i:
                self._t2i[token] = len(self._i2t)
                self._i2t.append(token)

    def __call__(self, batch, **kwargs):
        """Convert a batch of sequences element-wise (tokens to indices or back)."""
        indices_batch = []
        for sample in batch:
            # self[item] dispatches on the item type, see __getitem__.
            indices_batch.append([self[item] for item in sample])
        return indices_batch

    def __getitem__(self, key):
        """Return the token for an integer key, or the index for a string key.

        Raises:
            NotImplementedError: if ``key`` is neither an integer nor a string.
        """
        if isinstance(key, (int, np.integer)):
            return self._i2t[key]
        elif isinstance(key, str):
            # .get() avoids inserting unknown tokens into the defaultdict.
            return self._t2i.get(key, 0)
        else:
            raise NotImplementedError("not implemented for type `{}`".format(type(key)))

    def __len__(self):
        """Number of distinct entries, special tokens included."""
        return len(self._i2t)


In [67]:
import random

class DatasetIterator:
    """Serves batches of raw samples from the train / valid / test splits."""

    def __init__(self, data, shuffle=True):
        """Store the dataset splits.

        Args:
            data: dict with the keys 'train', 'valid' and 'test', each holding a
                sequence of samples.
            shuffle: default shuffling behaviour, used by ``gen_batches`` when its
                own ``shuffle`` argument is None.
        """
        self.shuffle = shuffle
        self.data = {
            'train': data['train'],
            'valid': data['valid'],
            'test': data['test']
        }

    def gen_batches(self, batch_size, data_type='train', shuffle=True):
        """Return a generator, which serves for generation of raw (no preprocessing such as tokenization)
        batches
        Args:
            batch_size (int): number of samples in batch; a negative value puts the
                whole split into a single batch
            data_type (str): can be either 'train', 'test', or 'valid'
            shuffle (bool): whether to shuffle dataset before batching; None falls
                back to the value given to the constructor
        Returns:
            batch_gen (Generator): a generator, that iterates through the part (defined by data_type) of the dataset
        """
        if shuffle is None:
            shuffle = self.shuffle

        data = self.data[data_type]
        data_len = len(data)

        if data_len == 0:
            return

        # Positions of the samples; shuffling the positions leaves the data untouched.
        order = list(range(data_len))
        if shuffle:
            random.shuffle(order)

        if batch_size < 0:
            batch_size = data_len

        # Ceil division so that the last, possibly smaller, batch is served as well.
        for i in range((data_len - 1) // batch_size + 1):
            # zip(*samples) transposes the samples into one tuple per sample field.
            yield tuple(zip(*[data[o] for o in order[i * batch_size:(i + 1) * batch_size]]))



In [68]:
# Reader that parses the CoNLL-style train/valid/test files.
dataset_reader = NerDatasetReader()

In [69]:
dataset = dataset_reader.read('data/')

print(len(dataset['train']))
print(len(dataset['valid']))
print(len(dataset['test']))

# Sanity checks on the structure and the size of the loaded dataset. The expected
# sizes are the ones this reader reports for the downloaded files.
assert len(dataset) == 3, 'The dataset must be a dict with three fields: train, test, and valid'
assert len(set(dataset) & {'train', 'test', 'valid'}) == 3, 'The dataset keys must be exactly train, test, and valid'
assert isinstance(dataset['train'][0][0][0], str) and isinstance(dataset['train'][0][1][0], str), 'Both tokens and tags must be strings'
assert len(dataset['train']) == 14042, 'there must be exactly 14042 samples in train'
assert len(dataset['valid']) == 3251, 'there must be exactly 3251 samples in valid'
assert len(dataset['test']) == 3454, 'there must be exactly 3454 samples in test'

14042
3251
3454


AssertionError: there must be exactly 14041 samples in train

In [70]:
# Batch generator over the train/valid/test splits loaded above.
data_iterator = DatasetIterator(dataset)

In [71]:
# Alternative (disabled): DeepPavlov's own vocabulary class.
#from deeppavlov.core.data.simple_vocab import SimpleVocabulary

# Special entries occupy the first indices of each vocabulary, so '<UNK>' and '0'
# both end up at index 0, the index Vocab returns for unknown strings.
# NOTE(review): the special tag is the digit '0', not the CoNLL outside tag 'O' -
# confirm that it is meant as a dedicated padding label.
special_tokens = ['<UNK>']
special_tags = ['0']

token_vocab = Vocab(special_tokens)
tag_vocab = Vocab(special_tags)

#token_vocab = SimpleVocabulary(special_tokens, save_path='model/token.dict')
#tag_vocab = SimpleVocabulary(save_path='model/tag.dict')

# Vocab defines no __repr__/__str__, so this prints the default object representation.
print(tag_vocab)

<__main__.Vocab object at 0x7ff632662390>


In [72]:
# Split the training samples into parallel lists of token and tag sequences.
all_tokens_by_sentences = [tokens for tokens, tags in dataset['train']]
all_tags_by_sentences = [tags for tokens, tags in dataset['train']]

# Build both vocabularies from the training split only.
token_vocab.fit(all_tokens_by_sentences)
tag_vocab.fit(all_tags_by_sentences)

In [73]:
#token_vocab([[65,56,78]])

In [74]:
class Mask:
    """Callable that builds a padding mask for a batch of token sequences."""

    def __init__(self, *args, **kwargs):
        # Arguments are accepted for interface compatibility and ignored.
        pass

    def __call__(self, tokens_batch, **kwargs):
        """Build the mask for ``tokens_batch``.

        Args:
            tokens_batch: sequence of token sequences; may be empty.

        Returns:
            float32 array of shape ``[len(tokens_batch), longest sequence length]``
            holding 1 at the positions of real tokens and 0 at padded positions.
            An empty batch gives an array of shape ``(0, 0)``.
        """
        batch_size = len(tokens_batch)
        # default=0 keeps an empty batch from raising in max().
        max_len = max((len(utt) for utt in tokens_batch), default=0)
        mask = np.zeros([batch_size, max_len], dtype=np.float32)
        for n, utterance in enumerate(tokens_batch):
            # Row n: ones for the first len(utterance) positions.
            mask[n, :len(utterance)] = 1
        return mask


get_mask = Mask()

In [75]:
import numpy as np
import tensorflow as tf

# Seed NumPy's and TensorFlow's random number generators.
np.random.seed(42)
tf.set_random_seed(42)

class NerNetwork:
    """Convolutional token tagger built with the TensorFlow 1.x graph API.

    The graph embeds token indices, runs them through a stack of 1-D convolutions
    and projects every position onto the tag set. Constructing an instance also
    opens a session and initializes all global variables.
    """

    def __init__(self,
                 n_tokens,
                 n_tags,
                 token_emb_dim=100,
                 n_hidden_list=(128,),
                 cnn_filter_width=7,
                 use_batch_norm=False,
                 embeddings_dropout=False,
                 top_dropout=False,
                 **kwargs):
        """Build the graph, the loss, the train op and the session.

        Args:
            n_tokens: number of rows of the embedding matrix.
            n_tags: number of output classes per position.
            token_emb_dim: number of columns of the embedding matrix.
            n_hidden_list: number of filters of each convolutional layer, in order.
            cnn_filter_width: kernel width of every convolutional layer.
            use_batch_norm: accepted but not used anywhere in this constructor.
            embeddings_dropout: accepted but not used; dropout is always applied
                to the embeddings.
            top_dropout: accepted but not used; dropout is always applied to the
                convolutional output.
            **kwargs: accepted and ignored.
        """

        # ================ Building inputs =================

        # Scalars fed at run time.
        self.learning_rate_ph = tf.placeholder(tf.float32, [])
        self.dropout_keep_ph = tf.placeholder(tf.float32, [])
        # Rank-2 inputs with both dimensions left unspecified.
        self.token_ph = tf.placeholder(tf.int32, [None, None], name='token_ind_ph')
        self.mask_ph = tf.placeholder(tf.float32, [None, None], name='Mask_ph')
        self.y_ph = tf.placeholder(tf.int32, [None, None], name='y_ph')

        # ================== Building the network ==================

        # Embed the token indices with the helper below.

        def get_embeddings(indices, vocabulary_size, emb_dim):
            """Look up ``indices`` in a new trainable embedding matrix."""
            # Random gaussian matrix with dimensions [vocabulary_size, embedding_dimension].
            # Dividing by sqrt(emb_dim) makes the variance of the entries 1 / emb_dim.
            emb_mat = np.random.randn(vocabulary_size, emb_dim).astype(np.float32) / np.sqrt(emb_dim)
            emb_mat = tf.Variable(emb_mat, name='Embeddings', trainable=True)
            emb = tf.nn.embedding_lookup(emb_mat, indices)
            return emb

        emb = get_embeddings(self.token_ph, n_tokens, token_emb_dim)


        # The third positional argument is the dropout noise shape; its middle
        # dimension is 1 (see tf.nn.dropout for how noise_shape is broadcast).
        emb = tf.nn.dropout(emb, self.dropout_keep_ph, (tf.shape(emb)[0], 1, tf.shape(emb)[2]))

        # Multilayer CNN on top of the embeddings. The number of units in each
        # layer matches the corresponding number from n_hidden_list; every layer
        # is followed by a ReLU activation.

        def conv_net(units, n_hidden_list, cnn_filter_width, activation=tf.nn.relu):
            """Apply one 'same'-padded conv1d plus ``activation`` per entry of ``n_hidden_list``."""
            for n_hidden in n_hidden_list:
                units = tf.layers.conv1d(units,
                                         n_hidden,
                                         cnn_filter_width,
                                         padding='same')
                units = activation(units)
            return units

        units = conv_net(emb, n_hidden_list, cnn_filter_width)

        units = tf.nn.dropout(units, self.dropout_keep_ph, (tf.shape(units)[0], 1, tf.shape(units)[2]))
        # Linear projection of every position onto the tags, then the highest-scoring tag.
        logits = tf.layers.dense(units, n_tags, activation=None)
        self.predictions = tf.argmax(logits, 2)

        # ================= Loss and train ops =================
        # Cross-entropy loss computed with tf.nn.softmax_cross_entropy_with_logits_v2.

        def masked_cross_entropy(logits, label_indices, number_of_tags, mask):
            """Cross-entropy with the masked positions zeroed out, averaged to a scalar."""
            ground_truth_labels = tf.one_hot(label_indices, depth=number_of_tags)
            loss_tensor = tf.nn.softmax_cross_entropy_with_logits_v2(labels=ground_truth_labels, logits=logits)
            loss_tensor *= mask
            # NOTE(review): the mean runs over every entry, zeroed (masked) ones
            # included, so the loss scale depends on how much padding a batch has.
            loss = tf.reduce_mean(loss_tensor)
            return loss

        self.loss = masked_cross_entropy(logits, self.y_ph, n_tags, self.mask_ph)

        # Training operation that updates the network parameters: Adam optimizer
        # driven by the learning rate placeholder.

        optimizer = tf.train.AdamOptimizer(self.learning_rate_ph)
        self.train_op = optimizer.minimize(self.loss)

        # ================= Initialize the session =================

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def __call__(self, tok_batch, mask_batch):
        """Run the prediction op with dropout disabled (keep probability 1.0).

        Args:
            tok_batch: value fed to the token placeholder.
            mask_batch: value fed to the mask placeholder.

        Returns:
            The evaluated ``self.predictions`` (argmax of the logits over axis 2).
        """
        feed_dict = {self.token_ph: tok_batch,
                     self.mask_ph: mask_batch,
                     self.dropout_keep_ph: 1.0}
        return self.sess.run(self.predictions, feed_dict)

    def train_on_batch(self, tok_batch, tag_batch, mask_batch, dropout_keep_prob, learning_rate):
        """Run one optimizer step on a single batch; returns nothing.

        Args:
            tok_batch: value fed to the token placeholder.
            tag_batch: value fed to the label placeholder.
            mask_batch: value fed to the mask placeholder.
            dropout_keep_prob: value fed to the dropout keep-probability placeholder.
            learning_rate: value fed to the learning rate placeholder.
        """
        feed_dict = {self.token_ph: tok_batch,
                     self.y_ph: tag_batch,
                     self.mask_ph: mask_batch,
                     self.dropout_keep_ph: dropout_keep_prob,
                     self.learning_rate_ph: learning_rate}
        self.sess.run(self.train_op, feed_dict)

In [76]:
# Network sized to the two vocabularies, with two convolutional layers of 100 filters each.
nernet = NerNetwork(len(token_vocab),
                    len(tag_vocab),
                    n_hidden_list=[100, 100])

In [77]:
from deeppavlov.models.ner.evaluation import precision_recall_f1
# The function precision_recall_f1 takes two lists: y_true and y_predicted
# the tag sequences for each sentences should be merged into one big list 

from itertools import chain

#from deeppavlov.core.data.utils import zero_pad
# zero_pad takes a batch of lists of token indices, pad it with zeros to the
# maximal length and convert it to numpy matrix


def zero_pad(batch, dtype=np.float32):
    """Pad a batch of variable-length sequences with zeros into one array.

    Args:
        batch: sequence of utterances. Every utterance is a sequence of integer
            indices or a sequence of equally sized feature vectors.
        dtype: dtype of the result for feature-vector batches (and for the
            single-empty-utterance case); index batches are always int32.

    Returns:
        ``[batch_size, max_len]`` int32 array for index batches, or
        ``[batch_size, max_len, n_features]`` array of ``dtype`` for feature
        batches. A batch made of one empty utterance gives an empty 1-D array.
    """
    if len(batch) == 1 and len(batch[0]) == 0:
        return np.array([], dtype=dtype)
    batch_size = len(batch)
    max_len = max((len(utterance) for utterance in batch), default=0)
    # Inspect the first available element instead of batch[0][0], which fails
    # when the first utterance is empty.
    first_item = next((utterance[0] for utterance in batch if len(utterance) > 0), None)
    # np.integer replaces the np.int alias, which was removed from NumPy, and
    # also covers NumPy integer scalars.
    if first_item is None or isinstance(first_item, (int, np.integer)):
        padded_batch = np.zeros([batch_size, max_len], dtype=np.int32)
        for n, utterance in enumerate(batch):
            padded_batch[n, :len(utterance)] = utterance
    else:
        n_features = len(first_item)
        padded_batch = np.zeros([batch_size, max_len, n_features], dtype=dtype)
        for n, utterance in enumerate(batch):
            for k, token_features in enumerate(utterance):
                padded_batch[n, k] = token_features
    return padded_batch


def eval_valid(network, batch_generator):
    """Tag every batch with ``network`` and report precision / recall / F1.

    Args:
        network: callable taking ``(token_index_batch, mask)`` and returning the
            predicted tag indices for every position of the padded batch.
        batch_generator: iterable of ``(tokens_batch, tags_batch)`` pairs.

    Returns:
        Whatever ``precision_recall_f1`` returns for the flattened true and
        predicted tags (it is also asked to print the report).
    """
    total_true = []
    total_pred = []
    for x, y_true in batch_generator:

        # Token indices for the batch of tokens
        x_inds = token_vocab(x)

        # Pad the indices batch with zeros
        x_batch = zero_pad(x_inds)

        # Mask that marks the real (non-padded) positions
        mask = get_mask(x)

        # NerNetwork instances are callable and return predicted tag indices
        y_inds = network(x_batch, mask)

        # For every sentence keep only the predictions of its real tokens
        y_inds = [sentence_inds[:len(x[n])] for n, sentence_inds in enumerate(y_inds)]
        y_pred = tag_vocab(y_inds)

        # The metric expects flat lists, so the sentences are concatenated
        total_true.extend(chain(*y_true))
        total_pred.extend(chain(*y_pred))
    # Hand the metrics back to the caller instead of discarding them.
    return precision_recall_f1(total_true, total_pred, print_results=True)

In [78]:
batch_size = 16 # number of samples in each mini-batch
n_epochs = 20 # number of passes over the whole training set
learning_rate = 0.001

# Keep probability fed to the network's dropout placeholder during training.
# NerNetwork applies dropout to the embeddings and to the convolutional output;
# prediction (NerNetwork.__call__) always uses a keep probability of 1.0.
dropout_keep_prob = 0.5

In [79]:
# Train for n_epochs passes over the training split, evaluating on the
# validation split after every pass.
for epoch in range(n_epochs):
    for x, y in data_iterator.gen_batches(batch_size, 'train'):
        # Convert tokens to indices via Vocab
        x_inds = token_vocab(x)
        # Convert tags to indices via Vocab
        y_inds = tag_vocab(y)

        # Pad every sample with zeros to the length of the longest one in the batch
        x_batch = zero_pad(x_inds)
        y_batch = zero_pad(y_inds)

        # Mask of real (non-padded) positions, built from the raw tokens
        mask = get_mask(x)
        nernet.train_on_batch(x_batch, y_batch, mask, dropout_keep_prob, learning_rate)
    print('Evaluating the model on valid part of the dataset')
    eval_valid(nernet, data_iterator.gen_batches(batch_size, 'valid'))


<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class

<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class

2018-08-20 22:33:20.459 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 51363 tokens with 5942 phrases; found: 5160 phrases; correct: 3123.

precision:  60.52%; recall:  52.56%; FB1:  56.26

	: precision:  0.00%; recall:  0.00%; F1:  0.00 0

	LOC: precision:  60.46%; recall:  74.58%; F1:  66.78 2266

	MISC: precision:  53.18%; recall:  15.40%; F1:  23.89 267

	ORG: precision:  50.65%; recall:  40.79%; F1:  45.19 1080

	PER: precision:  68.78%; recall:  57.76%; F1:  62.79 1547




<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class

<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class

2018-08-20 22:33:24.638 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 51363 tokens with 5942 phrases; found: 5519 phrases; correct: 4321.

precision:  78.29%; recall:  72.72%; FB1:  75.40

	: precision:  0.00%; recall:  0.00%; F1:  0.00 0

	LOC: precision:  84.39%; recall:  83.89%; F1:  84.14 1826

	MISC: precision:  69.50%; recall:  67.46%; F1:  68.46 895

	ORG: precision:  72.41%; recall:  61.45%; F1:  66.48 1138

	PER: precision:  80.36%; recall:  72.42%; F1:  76.19 1660




<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class

<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class

2018-08-20 22:33:28.914 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 51363 tokens with 5942 phrases; found: 5595 phrases; correct: 4653.

precision:  83.16%; recall:  78.31%; FB1:  80.66

	: precision:  0.00%; recall:  0.00%; F1:  0.00 0

	LOC: precision:  84.22%; recall:  89.49%; F1:  86.78 1952

	MISC: precision:  80.33%; recall:  74.40%; F1:  77.25 854

	ORG: precision:  79.86%; recall:  68.31%; F1:  73.63 1147

	PER: precision:  85.69%; recall:  76.38%; F1:  80.77 1642




<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class

KeyboardInterrupt: 

In [80]:
# Report the metrics of the trained network on the test split.
eval_valid(nernet, data_iterator.gen_batches(batch_size, 'test'))

2018-08-20 22:33:40.483 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 46436 tokens with 5648 phrases; found: 4953 phrases; correct: 3900.

precision:  78.74%; recall:  69.05%; FB1:  73.58

	: precision:  0.00%; recall:  0.00%; F1:  0.00 0

	LOC: precision:  84.78%; recall:  83.15%; F1:  83.96 1636

	MISC: precision:  72.95%; recall:  64.53%; F1:  68.48 621

	ORG: precision:  72.13%; recall:  63.58%; F1:  67.58 1464

	PER: precision:  81.49%; recall:  62.09%; F1:  70.48 1232




In [34]:
# Tag a single sentence typed by the user; tokens are obtained by whitespace splitting.
sentence = input('Please use English:')
x = [sentence.split()]  # batch holding one sentence

# Same preprocessing as in training: indices, zero padding, mask.
x_inds = token_vocab(x)
x_batch = zero_pad(x_inds)
mask = get_mask(x)
y_inds = nernet(x_batch, mask)
print(x[0])
print(y_inds)
# Map the predicted indices back to tag strings.
print(tag_vocab(y_inds)[0])

Please use English:hi tony
['hi', 'tony']
[[0 0]]
['O', 'O']
