In [2]:
import pandas as pd
wnuttrain = 'https://storage.googleapis.com/wnut-2017_ner-shared-task/wnut17train_clean_tagged.txt'
train = pd.read_table(wnuttrain, header=None, names=['token', 'label', 'bio_only', 'upos'])  # don't drop the empty lines yet, they show up as NaN in the data frame
train.head(n=1)



Unnamed: 0,token,label,bio_only,upos
0,@paulwalk,O,O,NOUN


In [3]:
# Feature Extraction

import numpy as np
import itertools

# in order to convert word tokens to integers: list the set of token types
token_vocab = train.token.unique().tolist()
oov = len(token_vocab)  # OOV (out of vocabulary) token as vocab length (because that's max.index + 1)


# convert word tokens to integers
def token_index(tok):
  ind = tok
  if not pd.isnull(tok):  # new since last time: deal with the empty lines which we didn't drop yet
    if tok in token_vocab:  # if token in vocabulary
      ind = token_vocab.index(tok)
    else:  # else it's OOV
      ind = oov
  return ind


#### DONE: Convert the POS Tag Into Integer.

pos_vocab = train.upos.unique().tolist()
oov_pos = len(pos_vocab)  # OOV (out of vocabulary) token as vocab length (because that's max.index + 1)
# print(oov_pos)

# Convert POS tag into Integers
def pos_index(tok):
  ind = tok
  if not pd.isnull(tok):  # new since last time: deal with the empty lines which we didn't drop yet
    if tok in pos_vocab:  # if token in vocabulary
      ind = pos_vocab.index(tok)
    else:  # else it's OOV
      ind = oov_pos
  return ind


# training labels: convert BIO to integers
def bio_index(bio):
  ind = bio
  if not pd.isnull(bio):  # deal with empty lines
    if bio=='B':
      ind = 0
    elif bio=='I':
      ind = 1
    elif bio=='O':
      ind = 2
  return ind


# Working Create Character Level Embedding


# print(char_vocab)
all_chars = set()
for item in train.token:
  if not (pd.isnull(item)):
    for ch in item:
      all_chars.add(ch)

all_chars = list(all_chars)
char_vocab = (all_chars)
oov_char = len(char_vocab)


# Convert Ch tag into Integers
def char_index(word):
  ans = []
  if not pd.isnull(word):
    for ch in word:
      if ch in char_vocab:  # if token in vocabulary
        ind = char_vocab.index(ch)
      else:  # else it's OOV
        ind = oov_char
      ans.append(ind)
    return ans
  return []


# pass a data frame through our feature extractor
def extract_features(txt_orig,istest=False):
  txt = txt_orig.copy()
  tokinds = [token_index(u) for u in txt['token']]
  txt['token_indices'] = tokinds


  posinds = [pos_index(u) for u in txt['upos']]
  txt['pos_indices'] = posinds

  charsinds = [char_index(u) for u in txt['token']]

  txt['char_list'] = charsinds


  if not istest:  # can't do this with the test set
    bioints = [bio_index(b) for b in txt['bio_only']]
    txt['bio_only'] = bioints
  return txt

train_copy = extract_features(train)
train_copy.head(n=10)

Unnamed: 0,token,label,bio_only,upos,token_indices,pos_indices,char_list
0,@paulwalk,O,2.0,NOUN,0.0,0.0,"[75, 9, 26, 50, 43, 24, 26, 43, 2]"
1,It,O,2.0,PRON,1.0,1.0,"[36, 67]"
2,'s,O,2.0,AUX,2.0,2.0,"[74, 69]"
3,the,O,2.0,DET,3.0,3.0,"[67, 31, 49]"
4,view,O,2.0,NOUN,4.0,0.0,"[33, 32, 49, 24]"
5,from,O,2.0,ADP,5.0,4.0,"[41, 90, 13, 72]"
6,where,O,2.0,ADV,6.0,5.0,"[24, 31, 49, 90, 49]"
7,I,O,2.0,PRON,7.0,1.0,[36]
8,'m,O,2.0,X,8.0,6.0,"[74, 72]"
9,living,O,2.0,NOUN,9.0,0.0,"[43, 32, 33, 32, 40, 35]"


In [4]:
def tokens2sequences(txt_orig,istest=False):
  '''
  Takes panda dataframe as input, copies, and adds a sequence index based on full-stops.
  Outputs a dataframe with sequences of tokens, named entity labels, and token indices as lists.
  '''
  txt = txt_orig.copy()
  txt['sequence_num'] = 0
  seqcount = 0
  for i in txt.index:  # in each row...
    txt.loc[i,'sequence_num'] = seqcount  # set the sequence number
    if pd.isnull(txt.loc[i,'token']):  # increment sequence counter at empty lines
      seqcount += 1
  # now drop the empty lines, group by sequence number and output df of sequence lists
  txt = txt.dropna()
  if istest:  # test set doesn't have labels
    txt_seqs = txt.groupby(['sequence_num'],as_index=False)[['token', 'token_indices', "upos", "pos_indices","char_list"]].agg(lambda x: list(x))
  else:
    txt_seqs = txt.groupby(['sequence_num'],as_index=False)[['token', 'bio_only', 'token_indices', 'upos', "pos_indices","char_list"]].agg(lambda x: list(x))
  return txt_seqs

print("This cell takes a little while to run: be patient :)")
train_seqs = tokens2sequences(train_copy)
train_seqs.head(1)

This cell takes a little while to run: be patient :)


Unnamed: 0,sequence_num,token,bio_only,token_indices,upos,pos_indices,char_list
0,0,"[@paulwalk, It, 's, the, view, from, where, I,...","[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, ...","[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...","[NOUN, PRON, AUX, DET, NOUN, ADP, ADV, PRON, X...","[0.0, 1.0, 2.0, 3.0, 0.0, 4.0, 5.0, 1.0, 6.0, ...","[[75, 9, 26, 50, 43, 24, 26, 43, 2], [36, 67],..."


In [5]:
# use sequence number as the index and apply pandas explode to all other columns
train_back = train_seqs.set_index('sequence_num').apply(pd.Series.explode).reset_index()
train_back.head(5)

Unnamed: 0,sequence_num,token,bio_only,token_indices,upos,pos_indices,char_list
0,0,@paulwalk,2,0,NOUN,0,"[75, 9, 26, 50, 43, 24, 26, 43, 2]"
1,0,It,2,1,PRON,1,"[36, 67]"
2,0,'s,2,2,AUX,2,"[74, 69]"
3,0,the,2,3,DET,3,"[67, 31, 49]"
4,0,view,2,4,NOUN,0,"[33, 32, 49, 24]"


In [6]:
def find_longest_chars(txt,longest_seq):
  '''find the longest sequence in the dataframe'''
  for i in txt.index:
    seqlen = np.max(list(map(lambda x : len(x), (txt['char_list'][i]))))
    if seqlen > longest_seq:  # update high water mark if new longest sequence encountered
      longest_seq = seqlen
  return longest_seq


train_longest_char = find_longest_chars(train_seqs,0)
print('The longest chars in the training set is %i tokens long' % train_longest_char)


The longest chars in the training set is 121 tokens long


In [7]:
def find_longest_sequence(txt,longest_seq):
  '''find the longest sequence in the dataframe'''
  for i in txt.index:
    seqlen = len(txt['token'][i])
    if seqlen > longest_seq:  # update high water mark if new longest sequence encountered
      longest_seq = seqlen
  return longest_seq

train_longest = find_longest_sequence(train_seqs,0)
print('The longest sequence in the training set is %i tokens long' % train_longest)


The longest sequence in the training set is 41 tokens long


In [8]:
# the dev set
wnutdev = 'https://storage.googleapis.com/wnut-2017_ner-shared-task/wnut17dev_clean_tagged.txt'
dev = pd.read_table(wnutdev, header=None, names=['token', 'label', 'bio_only', 'upos'])
dev_copy = extract_features(dev)
dev_seqs = tokens2sequences(dev_copy)
dev_longest = find_longest_sequence(dev_seqs,0)
print('The longest sequence in the dev set is %i tokens long' % dev_longest)

dev_longest_char = find_longest_chars(dev_seqs,0)
print('The longest char in the dev set is %i tokens long' % dev_longest_char)

# the test set
wnuttest = 'https://storage.googleapis.com/wnut-2017_ner-shared-task/wnut17test_clean_tagged.txt'
test = pd.read_table(wnuttest, header=None, names=['token', 'upos'])
test_copy = extract_features(test, True)
test_seqs = tokens2sequences(test_copy, True)
test_longest = find_longest_sequence(test_seqs,0)
print('The longest sequence in the test set is %i tokens long' % test_longest)
test_longest_char = find_longest_chars(test_seqs,0)
print('The longest char in the test set is %i tokens long' % test_longest_char)

The longest sequence in the dev set is 82 tokens long
The longest char in the dev set is 66 tokens long
The longest sequence in the test set is 105 tokens long
The longest char in the test set is 195 tokens long


In [9]:
# def padd_char(seq):
#   X_char = []
#   for item in seq["char_list"]:
#      sent_seq = []
#      for i in range(seq_length):
#           word_seq = []
#           for j in range(char_seq_length):
#               try:
#                   word_seq.append(item[j])
#               except:
#                   word_seq.append(char_padtok)
#           sent_seq.append(word_seq)
#      X_char.append(np.array(sent_seq))
#   return X_char

In [10]:
def padd_char(seq):
  temp_char_seqs_padded = []
  for item in seq["char_list"]:
    temp_pad = pad_sequences(item, maxlen=char_seq_length,
                                  dtype='int32', padding='post', truncating='post', value=char_padtok)
  
    a = temp_pad
    b = [[char_padtok for i in range(char_seq_length)] for _ in range(0, seq_length - len(temp_pad))]
    c = np.concatenate((a, b))
    # print(len(c))
    temp_char_seqs_padded.append(c)
  # print(len(temp_char_seqs_padded))
  return temp_char_seqs_padded

In [11]:
from keras.preprocessing.sequence import pad_sequences

# set maximum sequence length
seq_length = test_longest
# char_seq_length = test_longest_char
char_seq_length = 5

# a new dummy token index, one more than OOV
padtok = oov+1
pos_padtok = oov_pos+1
char_padtok = oov_char + 1

print('The padding token index is %i' % padtok)

# use pad_sequences, padding or truncating at the end of the sequence (default is 'pre')
train_seqs_padded = pad_sequences(train_seqs['token_indices'].tolist(), maxlen=seq_length,
                                  dtype='int32', padding='post', truncating='post', value=padtok)



train_pos_seqs_padded = pad_sequences(train_seqs['pos_indices'].tolist(), maxlen=seq_length,
                                  dtype='int32', padding='post', truncating='post', value=pos_padtok)



# Pad Character Level 
train_char_seqs_padded = padd_char(train_seqs)

# for item in train_seqs["char_list"]:
#   temp_pad = pad_sequences(item, maxlen=char_seq_length,
#                                   dtype='int32', padding='post', truncating='post', value=char_padtok)
  
#   a = temp_pad
#   b = [[char_padtok for i in range(char_seq_length)] for _ in range(0, seq_length - len(temp_pad))]
#   c = np.concatenate((a, b))
#   # print(len(c))
#   train_char_seqs_padded.append(c)

print('Example of padded token sequence:')
print(train_seqs_padded[1])


print('Example of padded pos sequence:')
print(train_pos_seqs_padded[1])

The padding token index is 14802
Example of padded token sequence:
[   26    27    28    29    30    31    32    10    33    34    35    36
    13    37    38 14802 14802 14802 14802 14802 14802 14802 14802 14802
 14802 14802 14802 14802 14802 14802 14802 14802 14802 14802 14802 14802
 14802 14802 14802 14802 14802 14802 14802 14802 14802 14802 14802 14802
 14802 14802 14802 14802 14802 14802 14802 14802 14802 14802 14802 14802
 14802 14802 14802 14802 14802 14802 14802 14802 14802 14802 14802 14802
 14802 14802 14802 14802 14802 14802 14802 14802 14802 14802 14802 14802
 14802 14802 14802 14802 14802 14802 14802 14802 14802 14802 14802 14802
 14802 14802 14802 14802 14802 14802 14802 14802 14802]
Example of padded pos sequence:
[ 4  9  9  8  9  0  0  4  9  9  4  9  8  6  6 19 19 19 19 19 19 19 19 19
 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19
 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19
 19 19 19 19 19 19 19 19 19 19 19 19 19 19

In [12]:
assert len(train_char_seqs_padded) == len(train_seqs_padded)
for item in train_char_seqs_padded:
  assert len(item) == len(train_seqs_padded[0])

In [13]:
from keras.utils import to_categorical

# get lists of named entity labels, padded with a null label (=3)
padlab = 3
train_labs_padded = pad_sequences(train_seqs['bio_only'].tolist(), maxlen=seq_length,
                                  dtype='int32', padding='post', truncating='post', value=padlab)

# convert those labels to one-hot encoding
n_labs = 4  # we have 3 labels: B, I, O (0, 1, 2) + the pad label 3
train_labs_onehot = [to_categorical(i, num_classes=n_labs) for i in train_labs_padded]

# follow the print outputs below to see how the labels are transformed
print('Example of padded label sequence and one-hot encoding (first 10 tokens):')
print(train_seqs.loc[1])
print('Length of input sequence: %i' % len(train_labs_padded[1]))
print('Length of label sequence: %i' % len(train_labs_onehot[1]))
print(train_labs_padded[1][:11])
print(train_labs_onehot[1][:11])

Example of padded label sequence and one-hot encoding (first 10 tokens):
sequence_num                                                     1
token            [From, Green, Newsfeed, :, AHFA, extends, dead...
bio_only         [2.0, 2.0, 2.0, 2.0, 0.0, 2.0, 2.0, 2.0, 2.0, ...
token_indices    [26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 10....
upos             [ADP, PROPN, PROPN, PUNCT, PROPN, NOUN, NOUN, ...
pos_indices      [4.0, 9.0, 9.0, 8.0, 9.0, 0.0, 0.0, 4.0, 9.0, ...
char_list        [[8, 90, 13, 72], [12, 90, 49, 49, 40], [19, 4...
Name: 1, dtype: object
Length of input sequence: 105
Length of label sequence: 105
[2 2 2 2 0 2 2 2 2 2 2]
[[0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]]


In [14]:
# Prepare Dev Set
# now process the dev set in the same way: padding the tokens & labels, and one-hot encoding the labels
dev_seqs_padded = pad_sequences(dev_seqs['token_indices'].tolist(), maxlen=seq_length,
                                dtype='int32', padding='post', truncating='post', value=padtok)
dev_labs_padded = pad_sequences(dev_seqs['bio_only'].tolist(), maxlen=seq_length,
                                dtype='int32', padding='post', truncating='post', value=padlab)
dev_labs_onehot = [to_categorical(i, num_classes=n_labs) for i in dev_labs_padded]

dev_pos_seqs_padded = pad_sequences(dev_seqs['pos_indices'].tolist(), maxlen=seq_length,
                                  dtype='int32', padding='post', truncating='post', value=pos_padtok)

dev_char_seqs_padded = padd_char(dev_seqs)


print('Dev set padded label sequence and one-hot encoding (first 10 tokens):')
print(dev_seqs.loc[2])
print('Length of input sequence: %i' % len(dev_labs_padded[1]))
print('Length of label sequence: %i' % len(dev_labs_onehot[1]))
print(dev_labs_padded[2][:11])
print(dev_labs_onehot[2][:11])

Dev set padded label sequence and one-hot encoding (first 10 tokens):
sequence_num                                                     2
token            [All, I, ', ve, been, doing, is, BINGE, watchi...
bio_only         [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, ...
token_indices    [405.0, 7.0, 573.0, 12927.0, 90.0, 848.0, 52.0...
upos             [DET, PRON, PUNCT, NOUN, AUX, VERB, AUX, PROPN...
pos_indices      [3.0, 1.0, 8.0, 0.0, 2.0, 14.0, 2.0, 9.0, 14.0...
char_list        [[21, 43, 43], [36], [74], [33, 49], [22, 49, ...
Name: 2, dtype: object
Length of input sequence: 105
Length of label sequence: 105
[2 2 2 2 2 2 2 2 2 0 1]
[[0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]]


In [15]:
assert len(dev_char_seqs_padded) == len(dev_pos_seqs_padded)
for item in dev_char_seqs_padded:
  assert len(item) == len(dev_pos_seqs_padded[0])

In [16]:
# load Keras and TensorFlow
from tensorflow import keras
import tensorflow as tf
# from keras_contrib.layers import CRF

# prepare sequences and labels as numpy arrays, check dimensions
X = np.array(train_seqs_padded)
y = np.array(train_labs_onehot)
X_pos = np.array(train_pos_seqs_padded)
X_char = np.array(train_char_seqs_padded)

print('Input sequence dimensions (n.docs, seq.length):')
print(X.shape)
print('Input pos dimensions (n.docs, seq.length):')
print(X_pos.shape)
print('Input char dimensions (n.docs, seq.length):')
print(X_char.shape)
print('Label dimensions (n.docs, seq.length, one-hot encoding of 4 NER labels):')
print(y.shape)

# our final vocab size is the padding token + 1 (OR length of vocab + OOV + PAD)
vocab_size = padtok+1
assert (vocab_size==len(token_vocab)+2)


# Validate the index of pos tag.

pos_size = pos_padtok + 1
assert (pos_size==len(pos_vocab)+2)

char_size = char_padtok + 1
assert (char_size==len(char_vocab)+2)

embed_size = 128  # try an embedding size of 128 (could tune this)

# list of metrics to use: true & false positives, negatives, accuracy, precision, recall, area under the curve
METRICS = [
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
]

# our model has the option for an label prediction bias, it's sequential, starts with an embedding layer, then bi-LSTM,
# a dropout layer follows for regularisation, and a dense final layer with softmax activation to output class probabilities
# we compile with the Adam optimizer at a low learning rate, use categorical cross-entropy as our loss function
def make_model(metrics = METRICS, output_bias=None):
  if output_bias is not None:
    output_bias = tf.keras.initializers.Constant(output_bias)

  tok_input1 = keras.layers.Input(shape=(seq_length,), dtype='int32', name='tok_input1')
  pos_input2 = keras.layers.Input(shape=(seq_length,), dtype='int32', name='pos_input2')
  # ("CHAR INPUT")
  # print(char_input3)

  # Character Level Model




  # char_in = keras.layers.Input(shape=(seq_length,char_seq_length,), dtype='int32', name='char_in')
  char_input3 = keras.layers.Input(shape=(seq_length,char_seq_length), dtype='int32', name='char_input3')
  emb_char = keras.layers.TimeDistributed(keras.layers.Embedding(output_dim=embed_size, input_dim=char_size, input_length=5,  mask_zero=True, trainable=True))(char_input3)
  char_enc = keras.layers.TimeDistributed(keras.layers.Bidirectional(keras.layers.LSTM(units=50, return_sequences=False, dropout=0.2, recurrent_dropout=0.2)))(emb_char)



  # char_output = char_x_lstm
  # char_model = tf.keras.Model(inputs=char_in, outputs=char_output)

# input_inner = Input(shape=(4,), name='input_inner')
# output_inner = Dense(3, name='inner_dense')(input_inner)
# inner_model = Model(inputs=input_inner, outputs=output_inner)


  # x3 = char_model(char_input3)

  x1 = keras.layers.Embedding(output_dim=embed_size, input_dim=vocab_size,  input_length=seq_length,  mask_zero=True, trainable=True)(tok_input1)
  x2 = keras.layers.Embedding(output_dim=embed_size, input_dim=pos_size,  input_length=seq_length, mask_zero=True, trainable=True)(pos_input2)
  
  # print(":D")
  # print(tok_input1)
  

  x_cancat = keras.layers.concatenate([x1, x2, char_enc ])
  # x_cancat = keras.layers.concatenate([x1, x2])

  x_lstm = keras.layers.Bidirectional(keras.layers.LSTM(units=50, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))(x_cancat)
  x_drop = keras.layers.Dropout(0.5)(x_lstm)
  main_output = keras.layers.TimeDistributed(keras.layers.Dense(n_labs, activation='softmax', bias_initializer=output_bias))(x_drop)
 # model = keras.Sequential()
 # model.add(keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=seq_length, mask_zero=True, trainable=True))
# model.add(keras.layers.Bidirectional(keras.layers.LSTM(units=50, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
#  model.add(keras.layers.Dropout(0.5))
#  model.add(keras.layers.TimeDistributed(keras.layers.Dense(n_labs, activation='softmax', bias_initializer=output_bias)))
  # crf = CRF(4)  # CRF layer, n_tags+1(PAD)
  # main_output = crf(main_output)  # output

  model = keras.models.Model(inputs=[tok_input1, pos_input2, char_input3], outputs= main_output)

  model.compile(optimizer=keras.optimizers.Adam(lr=1e-3), loss=keras.losses.CategoricalCrossentropy(), metrics=metrics)
  return model

# early stopping criteria based on area under the curve: will stop if no improvement after 10 epochs
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc', verbose=1, patience=10, mode='max', restore_best_weights=True)

# the number of training epochs we'll use, and the batch size (how many texts are input at once)
EPOCHS = 100
BATCH_SIZE = 32

print('**Defining a neural network**')
model = make_model()
model.summary()

Input sequence dimensions (n.docs, seq.length):
(3375, 105)
Input pos dimensions (n.docs, seq.length):
(3375, 105)
Input char dimensions (n.docs, seq.length):
(3375, 105, 5)
Label dimensions (n.docs, seq.length, one-hot encoding of 4 NER labels):
(3375, 105, 4)
**Defining a neural network**
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input3 (InputLayer)        [(None, 105, 5)]     0                                            
__________________________________________________________________________________________________
tok_input1 (InputLayer)         [(None, 105)]        0                                            
__________________________________________________________________________________________________
pos_input2 (InputLayer)         [(None, 105)]        0                                            


In [17]:
# evaluate our initial model


results = model.evaluate([X,X_pos,X_char], y, batch_size=BATCH_SIZE, verbose=0)
print("Loss: {:0.4f}".format(results[0]))

Loss: 1.4288


In [18]:
# figure out the label distribution in our fixed-length texts
from collections import Counter

all_labs = [l for lab in train_labs_padded for l in lab]
label_count = Counter(all_labs)
total_labs = len(all_labs)
print(label_count)
print(total_labs)

# use this to define an initial model bias
initial_bias=[(label_count[0]/total_labs), (label_count[1]/total_labs),
              (label_count[2]/total_labs), (label_count[3]/total_labs)]
print('Initial bias:')
print(initial_bias)

# pass the bias to the model and re-evaluate
model = make_model(output_bias=initial_bias)
results = model.evaluate([X,X_pos,X_char], y, batch_size=BATCH_SIZE, verbose=0)
print("Loss: {:0.4f}".format(results[0]))


Counter({3: 292139, 2: 59095, 0: 1964, 1: 1177})
354375
Initial bias:
[0.005542151675485009, 0.0033213403880070548, 0.1667583774250441, 0.8243781305114638]
Loss: 0.9316


In [19]:
# # prepare the dev sequences and labels as numpy arrays
dev_X = np.array(dev_seqs_padded)
dev_X_pos = np.array(dev_pos_seqs_padded)
dev_X_char = np.array(dev_char_seqs_padded)
dev_y = np.array(dev_labs_onehot)


# # re-initiate model with bias
# model = make_model(output_bias=initial_bias)

# # and fit...
# model.fit([X,X_pos] , y, batch_size=BATCH_SIZE, epochs=EPOCHS, callbacks = [early_stopping], validation_data=([dev_X, dev_X_pos],dev_y))

In [20]:
# # use argmax to figure out the class with highest probability per token
# preds = np.argmax(model.predict([dev_seqs_padded,dev_pos_seqs_padded]), axis=-1)
# flat_preds = [p for pred in preds for p in pred]
# print(Counter(flat_preds))

In [21]:
# use deep copy to ensure we aren't updating original values
import copy
train_weights_onehot = copy.deepcopy(train_labs_onehot)

# our first-pass class weights: normal for named entities (0 and 1), down-weighted for non named entities (2 and 3)
class_wts = [1,1,.1,.1]

# apply our weights to the label lists
for i,labs in enumerate(train_weights_onehot):
  for j,lablist in enumerate(labs):
    lablistaslist = lablist.tolist()
    whichismax = lablistaslist.index(max(lablistaslist))
    train_weights_onehot[i][j][whichismax] = class_wts[whichismax]

# what's this like, before and after?
print('Initial one-hot label encoding:')
print(train_labs_onehot[1][:11])

print('Weighted label encoding:')
print(train_weights_onehot[1][:11])

Initial one-hot label encoding:
[[0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]]
Weighted label encoding:
[[0.  0.  0.1 0. ]
 [0.  0.  0.1 0. ]
 [0.  0.  0.1 0. ]
 [0.  0.  0.1 0. ]
 [1.  0.  0.  0. ]
 [0.  0.  0.1 0. ]
 [0.  0.  0.1 0. ]
 [0.  0.  0.1 0. ]
 [0.  0.  0.1 0. ]
 [0.  0.  0.1 0. ]
 [0.  0.  0.1 0. ]]


In [22]:
# TRAINTRAIN
# now try the weighted one-hot encoding
y = np.array(train_weights_onehot)
print('Label dimensions (n.docs, seq.length, one-hot encoding of 4 NER labels):')
print(np.shape(y))



model2 = make_model(output_bias=initial_bias)
model2.fit([X,X_pos,X_char], y, batch_size=BATCH_SIZE, epochs=EPOCHS, callbacks = [early_stopping], validation_data=([dev_X,dev_X_pos,dev_X_char], dev_y))


Label dimensions (n.docs, seq.length, one-hot encoding of 4 NER labels):
(3375, 105, 4)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 00013: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f65201f9ac8>

In [23]:
preds = np.argmax(model2.predict([dev_X,dev_X_pos,dev_X_char]), axis=-1)
flat_preds = [p for pred in preds for p in pred]
print(Counter(flat_preds))

Counter({3: 91669, 2: 11581, 0: 840, 1: 175})


In [24]:
# corrected_preds = []

# for item in (preds):
#   preds_temp = item

#   if preds_temp[0] == 1:
#       preds_temp[0] = 0
    
#   for i in range(1,len(preds_temp)):
#       if preds_temp[i] == 1:
#         if preds_temp[i-1] == 2 :
#           preds_temp[i] = 0

#   corrected_preds.append(preds_temp)
  
#   preds = corrected_preds

In [25]:
    dev_seqs['prediction'] = ''

    # for each text: get original sequence length and trim predictions accordingly
    # (_trim_ because we know that our seq length is longer than the longest seq in dev)
    for i in dev_seqs.index:
      this_seq_length = len(dev_seqs['token'][i])
      dev_seqs['prediction'][i] = preds[i][:this_seq_length].astype(int)

    dev_seqs.head(1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,sequence_num,token,bio_only,token_indices,upos,pos_indices,char_list,prediction
0,0,"[Stabilized, approach, or, not, ?, That, ´, s,...","[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, ...","[14801.0, 10361.0, 414.0, 556.0, 131.0, 1740.0...","[PROPN, NOUN, CCONJ, PART, PUNCT, PRON, SYM, P...","[9.0, 0.0, 16.0, 15.0, 8.0, 1.0, 10.0, 15.0, 1...","[[25, 67, 26, 22, 32, 43, 32, 3, 49, 30], [26,...","[0, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]"


In [26]:
dev_long = dev_seqs.set_index('sequence_num').apply(pd.Series.explode).reset_index()
dev_long.head(1)

Unnamed: 0,sequence_num,token,bio_only,token_indices,upos,pos_indices,char_list,prediction
0,0,Stabilized,2,14801,PROPN,9,"[25, 67, 26, 22, 32, 43, 32, 3, 49, 30]",0


In [None]:
# re-using the BIO integer-to-character function from last time
def reverse_bio(ind):
  bio = 'O'  # for any pad=3 predictions
  if ind==0:
    bio = 'B'
  elif ind==1:
    bio = 'I'
  elif ind==2:
    bio = 'O'
  return bio

bio_labs = [reverse_bio(b) for b in dev_long['bio_only']]
dev_long['bio_only'] = bio_labs
pred_labs = [reverse_bio(b) for b in dev_long['prediction']]
dev_long['prediction'] = pred_labs

dev_long.head()
dev_long.prediction.value_counts()

In [27]:
# re-using the BIO integer-to-character function from last time
def reverse_bio(ind):
  bio = 'O'  # for any pad=3 predictions
  if ind==0:
    bio = 'B'
  elif ind==1:
    bio = 'I'
  elif ind==2:
    bio = 'O'
  return bio

bio_labs = [reverse_bio(b) for b in dev_long['bio_only']]
dev_long['bio_only'] = bio_labs
pred_labs = [reverse_bio(b) for b in dev_long['prediction']]
dev_long['prediction'] = pred_labs

dev_long.head()
dev_long.prediction.value_counts()

O    14367
B      840
I      175
Name: prediction, dtype: int64

In [28]:
def wnut_evaluate(txt):
  '''row by row entity evaluation: we evaluate by whole named entities'''
  tp = 0; fp = 0; fn = 0
  in_entity = 0
  for i in txt.index:
    if txt['prediction'][i]=='B' and txt['bio_only'][i]=='B':
      if in_entity==1:  # if there's a preceding named entity which didn't have intervening O...
        tp += 1  # count a true positive
      in_entity = 1  # start tracking this entity (don't count it until we know full span of entity)
    elif txt['prediction'][i]=='B':
      fp += 1  # if not a B in gold annotations, it's a false positive
      in_entity = 0
    elif txt['prediction'][i]=='I' and txt['bio_only'][i]=='I':
      next  # correct entity continuation: do nothing
    elif txt['prediction'][i]=='I' and txt['bio_only'][i]=='B':
      fn += 1  # if a new entity should have begun, it's a false negative
      in_entity = 0
    elif txt['prediction'][i]=='I':  # if gold is O...
      if in_entity==1:  # and if tracking an entity, then the span is too long
        fp += 1  # it's a false positive
      in_entity = 0
    elif txt['prediction'][i]=='O':
      if txt['bio_only'][i]=='B':
        fn += 1  # false negative if there's B in gold but no predicted B
        if in_entity==1:  # also check if there was a named entity in progress
          tp += 1  # count a true positive
      elif txt['bio_only'][i]=='I':
        if in_entity==1:  # if this should have been a continued named entity, the span is too short
          fn += 1  # count a false negative
      elif txt['bio_only'][i]=='O':
        if in_entity==1:  # if a named entity has ended in right place
          tp += 1  # count a true positive
      in_entity = 0

  if in_entity==1:  # catch any final named entity
    tp += 1

  print('Sum of TP and FP = %i' % (tp+fp))
  print('Sum of TP and FN = %i' % (tp+fn))
  print('True positives = %i, False positives = %i, False negatives = %i' % (tp, fp, fn))
  prec = tp / (tp+fp)
  rec = tp / (tp+fn)
  f1 = (2*(prec*rec)) / (prec+rec)
  print('Precision = %.3f, Recall = %.3f, F1 = %.3f (max=1)' % (prec, rec, f1))
 
wnut_evaluate(dev_long)

Sum of TP and FP = 733
Sum of TP and FN = 797
True positives = 380, False positives = 353, False negatives = 417
Precision = 0.518, Recall = 0.477, F1 = 0.497 (max=1)


In [29]:
    dev_long.to_csv('FullChar_POS.txt', sep='\t', index=False)


In [30]:
from keras.models import Sequential
from keras.layers import Embedding
import numpy as np
 
model = Sequential()
model.add(Embedding(1000, 64, input_length=10))
# 输入大小为(None，10)，Nnoe是batch_size大小，10代表每一个batch中有10条样本
# 输出大小为(None, 10, 64),其中64代表输入中每个每条样本被embedding成了64维的向量
 
input_array = np.random.randint(1000, size=(32, 10))



model.compile('rmsprop', 'mse')
output_array = model.predict(input_array)
print(output_array)
assert output_array.shape == (32, 10, 64)

[[[-3.04704439e-02 -2.76493188e-02  2.69191600e-02 ...  8.50858539e-03
    2.46909373e-02 -8.24428722e-03]
  [-3.71010900e-02  4.32311185e-02  1.05293170e-02 ... -7.49167055e-03
    5.51857054e-04  4.66991924e-02]
  [-3.55623141e-02  4.45062630e-02 -2.62874719e-02 ...  3.10262926e-02
    9.06201452e-03  1.36376284e-02]
  ...
  [-7.28629902e-03  6.90187141e-03  2.21623071e-02 ...  3.22265737e-02
    1.68389119e-02 -2.97251344e-02]
  [-2.44478118e-02  4.82443720e-03  3.62755768e-02 ...  7.80452043e-04
    1.10337846e-02 -4.74399440e-02]
  [-3.79296765e-02 -1.76474564e-02  2.28854455e-02 ... -6.39337301e-03
    3.35512049e-02  2.52128728e-02]]

 [[ 2.28893198e-02  3.84732746e-02  4.30743769e-03 ...  1.71151012e-03
   -3.42347398e-02  4.54065688e-02]
  [ 3.24051045e-02  5.02653047e-03  4.08249758e-02 ... -2.93975826e-02
   -1.30207762e-02 -2.33641267e-02]
  [ 4.55523469e-02 -1.70677900e-03 -7.90727139e-03 ...  1.20730177e-02
    4.93777432e-02  2.08526738e-02]
  ...
  [-3.35530043e-02 -3.0

In [31]:
from keras.models import Model
from keras.layers import Input
from keras.layers import LSTM
from numpy import array
from keras.models import Sequential

from numpy.random import seed
seed(0)
import tensorflow
tensorflow.random.set_seed(0)


data = array([0.1,0.2,0.3]).reshape((1,3,1))
inputs1 = Input(shape=(3,1))
lstm1,state_h,state_c = LSTM(2,return_sequences=True,return_state=True)(inputs1) #第一层LSTM
lstm2 = LSTM(2,return_sequences=False)(lstm1)  #第二层LSTM
model = Model( inputs1,outputs = [lstm2])

print(model.predict(data))


UnknownError:  [_Derived_]  Fail to find the dnn implementation.
	 [[{{node CudnnRNN}}]]
	 [[model_3/lstm_6/StatefulPartitionedCall]] [Op:__inference_predict_function_53959]

Function call stack:
predict_function -> predict_function -> predict_function


In [None]:
from tensorflow import keras
from tensorflow.keras import layers

import numpy as np
import random
import io

In [None]:
path = keras.utils.get_file(
    "nietzsche.txt", origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt"
)
with io.open(path, encoding="utf-8") as f:
    text = f.read().lower()
text = text.replace("\n", " ")  # We remove newlines chars for nicer display
print("Corpus length:", len(text))

chars = sorted(list(set(text)))
print("Total chars:", len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i : i + maxlen])
    next_chars.append(text[i + maxlen])
print("Number of sequences:", len(sentences))

x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

In [None]:
print(sentences[0])

In [None]:
y