# Features

* So far we always worked purely with words and their embeddings
* Other features might be useful
* POS tagger as a running example: much can be guessed from the word's suffix: "this is an *ooobviously* good idea"
  * no embedding for *ooobviously*
  * but the suffix -ly and the context should tell us it's an adverb


In [2]:
## Read in the data
import json
import random
import numpy

def read_labeled_data(json_file):
    with open(json_file) as f:
        data=json.load(f)
        texts=[one_example["text"] for one_example in data]  #list of texts
        labels=[one_example["tags"] for one_example in data] # list of lists of output labels
    return texts,labels

texts_train,labels_train=read_labeled_data("data/pos_train_fi.json")
texts_devel,labels_devel=read_labeled_data("data/pos_devel_fi.json")       

In [4]:
## Read in pre-trained embeddings
from gensim.models import KeyedVectors

# English model: wiki-news-300d-1M.vec
# Finnish model: pb34_wf_200_v2_skgram.bin
# these models are under /home/bio in the classroom machines
#                        /home/ginter on the virtual server
#                         ...don't make a copy of that file on the virtual server, just use it from that path
#                         ...if you run things locally on your laptop, you can scp this model from the virtual machine
vector_model=KeyedVectors.load_word2vec_format("/home/bio/pb34_wf_200_v2_skgram.bin", binary=True, limit=100000)
word_embeddings=vector_model.vectors # these are the vectors themselves


In [5]:
# Just checking all is fine
print("word_embeddings shape=",word_embeddings.shape)
print("embeddings=",word_embeddings)

word_embeddings shape= (100000, 200)
embeddings= [[ 2.0013428e-03  2.2097016e-03 -1.9151306e-03 ...  9.9411009e-05
  -8.4304810e-04 -5.6327821e-04]
 [-1.0991982e-01 -1.7190212e-01  1.4615083e-01 ...  5.6789882e-02
   5.0900381e-02 -8.7465588e-03]
 [-9.0047391e-03 -1.0183236e-01  1.5222897e-01 ...  5.6943305e-02
   5.0679442e-02 -2.7512657e-03]
 ...
 [-1.7641033e-01 -3.0918688e-01  4.2229193e-01 ...  3.8911417e-01
   1.8602428e-01 -2.6177013e-01]
 [ 2.3696978e-01 -3.1057227e-02 -9.4661742e-02 ...  2.1558458e-02
   3.4130868e-01  2.6005360e-01]
 [ 2.0296814e-01  2.0556472e-01  6.9490981e-01 ...  2.3367092e-01
  -1.2235161e-02  2.4763262e-01]]


In [6]:
import keras.utils
# The embeddings have one row for every word, and they are indexed from 0 upwards
# For our tagger, we need words with index 0 and 1 to have a special meaning
#       0 is the mask
#       1 is OOV (out of vocabulary)
# We need to make space for the two words:
# 1) Add two rows into the word_embeddings matrix
# 2) Renumber indices in the gensim model by 2, so that what was word 0 is now word 2, word 1 becomes word 3, etc...

# ad 1:
# Two rows with the right number of columns, and filled with random numbers
two_random_rows=numpy.random.uniform(low=-0.01, high=0.01, size=(2,word_embeddings.shape[1]))
# stack the two rows, and the embedding matrix on top of each other
word_embeddings=numpy.vstack([two_random_rows,word_embeddings])

# Normalize to unit length, works better this way
word_embeddings=keras.utils.normalize(word_embeddings)

# Alternative normalization code
#norm=numpy.linalg.norm(word_embeddings,axis=1,keepdims=True) #magnitude of every row
#word_embeddings/=norm #divide every row by magnitude, results in unit length vectors

# Ad 2:
# Now renumber all word indices, shifting them up by two
for word_record in vector_model.vocab.values():
    word_record.index+=2

print("New embeddings shape=",word_embeddings.shape)
print(word_embeddings)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


New embeddings shape= (100002, 200)
[[ 0.08607108 -0.01166316 -0.06657191 ...  0.0680553  -0.0654337
  -0.11574859]
 [-0.03078522  0.06780943  0.11581672 ...  0.08902182  0.10278784
   0.00744082]
 [ 0.09913075  0.1094512  -0.09486048 ...  0.00492404 -0.04175796
  -0.02790036]
 ...
 [-0.04595862 -0.08054972  0.11001598 ...  0.10137248  0.04846326
  -0.06819666]
 [ 0.06736331 -0.00882863 -0.02690946 ...  0.00612841  0.09702369
   0.07392534]
 [ 0.0625686   0.06336904  0.21421851 ...  0.07203328 -0.00377171
   0.07633723]]


## Adding features

* So far, our examples were 2D matrices *sentence x word*
  * `M[5,7]` was the vocabulary index of the 8th word in the 6th sentence (counting from zero)
* We will add a dimension, so the matrices will look like sentence x word x feature like so:
  * `M[5,7,0]` is the vocabulary index of the 8th word in the 6th sentence as before
  * `M[5,7,1]` is the vocabulary index of the 1st feature of the 8th word in the 6th sentence
  * ...

In [7]:
def vectorize(texts,word_vocab,feature_vocab):
    vectorized_texts=[] # List of sentences, each sentence is a list of words, and each word is a list of features
    for one_text in texts:
        vectorized_text=[] # One sentence, ie list of words, each being a list of features
        for one_word in one_text:
            # feature vector of this one word
            # [ word_itself, last_character, last_two_characters, last_three_characters, 
            #                first character, first_two_characters, first_three_characters, ...]
            one_word_feature_vector=[]
            if one_word in word_vocab:
                one_word_feature_vector.append(word_vocab[one_word].index) # the .index comes from gensim's vocab
            else:
                one_word_feature_vector.append(1) # OOV
            #as a future-proof idea, let us mark the word with a beginning and end marker
            marked="^"+one_word+"$"
            for affix_length in range(2,5): #2,3,4
                suffix=marked[-affix_length:]  # g$  og$  dog$
                prefix=marked[:affix_length]   # ^d  ^do  ^dog
                if len(suffix)==affix_length: #if len(suffix) is less than the desired length, the word is too short
                    one_word_feature_vector.append(feature_vocab.setdefault(suffix,len(feature_vocab)))
                else:
                    one_word_feature_vector.append(1) #No such suffix
                if len(prefix)==affix_length: #if len(prefix) is less than the desired length, the word is too short
                    one_word_feature_vector.append(feature_vocab.setdefault(prefix,len(feature_vocab)))
                else:
                    one_word_feature_vector.append(1) #No such prefix
            
            #Done with the word
            vectorized_text.append(one_word_feature_vector)
        #Done with the text
        vectorized_texts.append(vectorized_text)
    return numpy.array(vectorized_texts)

feature_vocab={"<SPECIAL>":0,"<NOSUCHSUFFIX>":1} #these are just to reserve the indices 0 and 1
vectorized_train=vectorize(texts_train,vector_model.vocab,feature_vocab)
print("First 10 features",list(feature_vocab.items())[:10]) #first 10 features
print("Some text:",vectorized_train[100])

vectorized_devel=vectorize(texts_devel,vector_model.vocab,feature_vocab)

First 10 features [('<SPECIAL>', 0), ('<NOSUCHSUFFIX>', 1), ('i$', 2), ('^K', 3), ('ti$', 4), ('^Kä', 5), ('tti$', 6), ('^Käv', 7), ('I$', 8), ('^I', 9)]
Some text: [[1, 53, 15, 54, 1053, 160, 1054], [33026, 2, 21, 375, 274, 674, 686], [15478, 14, 66, 129, 80, 344, 82], [2199, 2, 49, 556, 50, 1055, 52], [4, 83, 84, 85, 85, 1, 1], [145, 26, 21, 119, 77, 121, 79], [80, 53, 36, 67, 158, 69, 711], [59588, 26, 72, 221, 227, 478, 1056], [1, 2, 278, 1057, 445, 1058, 1059], [3, 58, 59, 60, 60, 1, 1]]


In [None]:
import tensorflow as tf
### Only needed for me, not to block the whole GPU, you don't need this stuff
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.4
set_session(tf.Session(config=config))
### ---end of weird stuff


In [8]:
from keras.preprocessing.sequence import pad_sequences

# Now we pad the sequences to get everything into the right sizes
padded_train_data=pad_sequences(vectorized_train,padding="post")
print("Padded train shape (texts x words x features):",padded_train_data.shape)
_,longest_train_sent,_=padded_train_data.shape
padded_devel_data=pad_sequences(vectorized_devel,maxlen=longest_train_sent,padding="post")
print("Padded devel shape (texts x words x features):",padded_devel_data.shape)

Padded train shape (texts x words x features): (12217, 238, 7)
Padded devel shape (texts x words x features): (1364, 238, 7)


In [9]:
# Now the training data input part is done ... labels needed yet
# Easiest way is to make our own vectorizer
def vectorize_labels(labels,label_dictionary):
    vectorized=[]
    for one_text_labels in labels: #List like ["NOUN","VERB","VERB","PUNCT"]
        one_text_vectorized=[] #numerical indices of the labels
        for one_label in one_text_labels:
            one_text_vectorized.append(label_dictionary.setdefault(one_label,len(label_dictionary)))
        vectorized.append(one_text_vectorized) #done with the sentence
    return numpy.array(vectorized)

label_dictionary={}
vectorized_train_labels=vectorize_labels(labels_train,label_dictionary)
padded_train_labels=pad_sequences(vectorized_train_labels,padding="post")
print("padded_train_labels shape=",padded_train_labels.shape)
vectorized_devel_labels=vectorize_labels(labels_devel,label_dictionary)
padded_devel_labels=pad_sequences(vectorized_devel_labels,padding="post",maxlen=longest_train_sent)
print("padded_devel_labels shape=",padded_devel_labels.shape)

padded_train_labels shape= (12217, 238)
padded_devel_labels shape= (1364, 238)


In [10]:
# Almost there ... we yet need the mask, telling which parts of each padded sequence are real words
# and which are only the padding which should be ignored in the output

#                           where(condition,value_if_true,value_if_false)
# padded_train_data[:,:,0]  -> returns the first feature of every word, i.e. the index of this word in the vocabulary
# here zero means padding
sentence_mask_train = numpy.where(padded_train_data[:,:,0]>0,1,0)
print(sentence_mask_train[:3])

sentence_mask_devel = numpy.where(padded_devel_data[:,:,0]>0,1,0) 


[[1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 

In [11]:
# phew, finally everything in place:

print("padded_train_data.shape",padded_train_data.shape)
print("padded_train_labels.shape",padded_train_labels.shape)
print("padded_devel_data.shape",padded_devel_data.shape)
print("padded_devel_labels.shape",padded_devel_labels.shape)

padded_train_data.shape (12217, 238, 7)
padded_train_labels.shape (12217, 238)
padded_devel_data.shape (1364, 238, 7)
padded_devel_labels.shape (1364, 238)


In [25]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Activation, Conv1D, TimeDistributed, GlobalMaxPooling1D
from keras.layers import Bidirectional, Concatenate,Flatten,Reshape
from keras.optimizers import SGD, Adam
from keras.initializers import Constant
from keras.layers import CuDNNLSTM as LSTM  #massive speedup on graphics cards
#from keras.layers import LSTM
from keras.callbacks import EarlyStopping



example_count, sequence_len, feature_count = padded_train_data.shape
_,word_embedding_dim=word_embeddings.shape
feature_embedding_dim=100 #we need to decide on an embedding for the features

word_input=Input(shape=(sequence_len,))
feature_input=Input(shape=(sequence_len,feature_count-1)) #first feature is word, so feature_count-1
word_embeddings_layer=Embedding(len(vector_model.vocab)+2,\
                     word_embedding_dim, mask_zero=False,\
                     trainable=True, weights=[word_embeddings])(word_input)
feature_embeddings_layer=Embedding(len(feature_vocab),feature_embedding_dim,embeddings_initializer=Constant(value=0.1))(feature_input)
feature_embeddings_layer_concat=Reshape((sequence_len,(feature_count-1)*feature_embedding_dim))(feature_embeddings_layer)
word_and_f_emb_layer=Concatenate()([word_embeddings_layer,feature_embeddings_layer_concat])

#Yritin lisätä convolution layerin..:
filters = 50
conv_res = []
for width in range(2,4):
    conv_result = Conv1D(filters, width, padding='valid', activation='relu', strides=1)(word_and_f_emb_layer)
    pooled = (GlobalMaxPooling1D())(conv_result) 
    conv_res.append(pooled)
concatenated = (Concatenate())(conv_res)

hidden_layer=TimeDistributed(Dense(100,activation="tanh"))(concatenated)  #Simple
outp_layer=TimeDistributed(Dense(len(label_dictionary),activation="softmax"))(hidden_layer)


model=Model(inputs=[word_input,feature_input], outputs=[outp_layer])
model.compile(optimizer="adam",loss="sparse_categorical_crossentropy", sample_weight_mode='temporal',weighted_metrics=["acc"])

SyntaxError: invalid syntax (<ipython-input-25-2e93ef232ab4>, line 27)

In [16]:
print(model.summary())
word_input_data_train=padded_train_data[:,:,0]
feature_input_data_train=padded_train_data[:,:,1:]
labels_output_train=numpy.expand_dims(padded_train_labels,-1)

word_input_data_devel=padded_devel_data[:,:,0]
feature_input_data_devel=padded_devel_data[:,:,1:]
labels_output_devel=numpy.expand_dims(padded_devel_labels,-1)

print("word input shape",word_input_data_train.shape)
print("feature input shape",feature_input_data_train.shape)
print("output shape",labels_output_train.shape)
# train
# stop early
es_callback=EarlyStopping(monitor='val_weighted_acc', min_delta=0, patience=2, verbose=1, mode='auto')
hist=model.fit([word_input_data_train,feature_input_data_train],[labels_output_train],\
               validation_data=([word_input_data_devel,feature_input_data_devel],[labels_output_devel],sentence_mask_devel),\
               batch_size=200,sample_weight=sentence_mask_train,verbose=1,epochs=5,callbacks=[es_callback])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 238, 6)       0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 238)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 238, 6, 100)  1137500     input_2[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 238, 200)     20000400    input_1[0][0]                    
__________________________________________________________________________________________________
reshape_1 