# Lab Session 4 Edan 95

## Collecting Embeddings

Download the GloVe embeddings 6B from https://nlp.stanford.edu/projects/glove/ and keep the 100d vectors.

In [1]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


In [2]:
import numpy as np

def readGloveEmbeddings(file):
    f = open(file, 'r', encoding='UTF-8')
    word_dict = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embeddings = np.array([float(val) for val in splitLine[1:]])
        word_dict[word] = embeddings
    print("Number of words collected: ", len(word_dict))
    f.close()
    return word_dict
    
    

In [3]:
word_dict = readGloveEmbeddings('glove.6B.100d.txt')

Number of words collected:  400000


Another way of collecting embeddings given by Pierre

In [4]:
import os

glove_dir = r'C:\Users\David_000\Documents\EDAN95\edan95\Lab4'
embedding_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding = 'UTF-8')

for line in f:
    values = line.strip().split()
    word = values[0]
    embeddings = np.array(values[1:], dtype='float32')
    embedding_index[word] = embeddings
f.close()
print("Number of words collected: ", len(embedding_index))

Number of words collected:  400000


In [5]:
from sklearn.metrics.pairwise import cosine_similarity # Raises error for 2D vector.

In [6]:
import numpy.linalg as npl


def cos_sim(a,b):
    sim = np.dot(a, b)/(npl.norm(a)*npl.norm(b))
    return sim

In [7]:
comparison_words = ['table', 'france', 'sweden']
most_simular = {}

for word in comparison_words:
    sim = np.array([cos_sim(embedding_index[word], embedding_index[compare]) for compare in embedding_index.keys()])
    top_idx = np.argpartition(sim, -6)[-6:]
    print("Top index: ", top_idx[0:], " for word ", word)
    most_simular[word] = np.array([list(embedding_index.keys())[ind] for ind in top_idx[0:]])
        

Top index:  [ 437 2389  927  241 7221 1801]  for word  table
Top index:  [1035 1029  348  695 2975  387]  for word  france
Top index:  [2640 2238 3817 3384 2819 2038]  for word  sweden


In [8]:
print(most_simular['table'])
print(most_simular['france'])
print(most_simular['sweden'])

['side' 'bottom' 'room' 'place' 'tables' 'table']
['paris' 'spain' 'french' 'britain' 'belgium' 'france']
['austria' 'netherlands' 'finland' 'denmark' 'norway' 'sweden']


## Reading the Corpus and Collecting Building Indicies

You will read the corpus with programs available from https://github.com/pnugues/edan95. These programs will enable you to load the files in the form of a list of dictionaries.

In [19]:
from conll_dictorizer import CoNLLDictorizer, Token
import datasets

In [22]:
BASE_DIR = r'C:\Users\David_000\Documents\EDAN95\edan95\Lab4'


def load_conll2003_en():
    train_file = BASE_DIR + '\eng.train'
    dev_file = BASE_DIR + '\eng.valid'
    test_file = BASE_DIR + '\eng.test'
    column_names = ['form', 'ppos', 'pchunk', 'ner']
    train_sentences = open(train_file).read().strip()
    dev_sentences = open(dev_file).read().strip()
    test_sentences = open(test_file).read().strip()
    return train_sentences, dev_sentences, test_sentences, column_names


In [24]:
train_sentences, dev_sentences, test_sentences, column_names = load_conll2003_en()

conll_dict = CoNLLDictorizer(column_names, col_sep=' +')
train_dict = conll_dict.transform(train_sentences)
dev_dict = conll_dict.transform(dev_sentences)
test_dict = conll_dict.transform(test_sentences)

In [25]:
print(train_dict[1])

[{'form': 'EU', 'ppos': 'NNP', 'pchunk': 'I-NP', 'ner': 'I-ORG'}, {'form': 'rejects', 'ppos': 'VBZ', 'pchunk': 'I-VP', 'ner': 'O'}, {'form': 'German', 'ppos': 'JJ', 'pchunk': 'I-NP', 'ner': 'I-MISC'}, {'form': 'call', 'ppos': 'NN', 'pchunk': 'I-NP', 'ner': 'O'}, {'form': 'to', 'ppos': 'TO', 'pchunk': 'I-VP', 'ner': 'O'}, {'form': 'boycott', 'ppos': 'VB', 'pchunk': 'I-VP', 'ner': 'O'}, {'form': 'British', 'ppos': 'JJ', 'pchunk': 'I-NP', 'ner': 'I-MISC'}, {'form': 'lamb', 'ppos': 'NN', 'pchunk': 'I-NP', 'ner': 'O'}, {'form': '.', 'ppos': '.', 'pchunk': 'O', 'ner': 'O'}]


In [26]:
def build_sequences(corpus_dict, key_x='form', key_y='ner', tolower=True):
    """
    Creates sequences from a list of dictionaries
    :param corpus_dict:
    :param key_x:
    :param key_y:
    :return:
    """
    X = []
    Y = []
    for sentence in corpus_dict:
        x = []
        y = []
        for word in sentence:
            x += [word[key_x]]
            y += [word[key_y]]
        if tolower:
            x = list(map(str.lower, x))
        X += [x]
        Y += [y]
    return X, Y

In [27]:
X_train_cat, Y_train_cat = build_sequences(train_dict)
X_dev_cat, Y_dev_cat = build_sequences(dev_dict)
X_test_cat, Y_test_cat = build_sequences(test_dict)
print('First sentence, words', X_train_cat[0])
print('First sentence, NER', Y_train_cat[0])

First sentence, words ['-docstart-']
First sentence, NER ['O']


In [28]:
vocabulary_words = sorted(list(
    set([word for sentence 
         in X_train_cat for word in sentence])))
ner = sorted(list(set([ner for sentence 
                       in Y_train_cat for ner in sentence])))
print(ner)
NB_CLASSES = len(ner)

['B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']


In [29]:
def load(file):
    """
    Return the embeddings in the from of a dictionary
    :param file:
    :return:
    """
    file = file
    embeddings = {}
    glove = open(file,encoding='UTF-8')
    for line in glove:
        values = line.strip().split()
        word = values[0]
        vector = np.array(values[1:], dtype='float32')
        embeddings[word] = vector
    glove.close()
    embeddings_dict = embeddings
    embedded_words = sorted(list(embeddings_dict.keys()))
    return embeddings_dict

In [30]:
embedding_file = r'C:\Users\David_000\Documents\EDAN95\edan95\Lab4\glove.6B.100d.txt'
embeddings_dict = load(embedding_file)
embeddings_words = embeddings_dict.keys()
print('Words in GloVe:',  len(embeddings_dict.keys()))
vocabulary_words = sorted(list(set(vocabulary_words + 
                                   list(embeddings_words))))
cnt_uniq = len(vocabulary_words) + 2
print('# unique words in the vocabulary: embeddings and corpus:', 
      cnt_uniq)

Words in GloVe: 400000
# unique words in the vocabulary: embeddings and corpus: 402597


In [31]:
def to_index(X, idx):
    """
    Convert the word lists (or NER lists) to indexes
    :param X: List of word (or NER) lists
    :param idx: word to number dictionary
    :return:
    """
    X_idx = []
    for x in X:
        # We map the unknown words to one
        x_idx = list(map(lambda x: idx.get(x, 1), x))
        X_idx += [x_idx]
    return X_idx

In [33]:
rev_word_idx = dict(enumerate(vocabulary_words, start=2))
rev_ner_idx = dict(enumerate(ner, start=2))
word_idx = {v: k for k, v in rev_word_idx.items()}
ner_idx = {v: k for k, v in rev_ner_idx.items()}
print('word index:', list(word_idx.items())[:10])
print('NER index:', list(ner_idx.items())[:10])

# We create the parallel sequences of indexes
X_idx = to_index(X_train_cat, word_idx)
Y_idx = to_index(Y_train_cat, ner_idx)
X_idx_dev = to_index(X_dev_cat, word_idx)
X_idx_test = to_index(X_test_cat, word_idx)
Y_idx_dev = to_index(Y_dev_cat, ner_idx)
Y_idx_test = to_index(Y_test_cat, ner_idx)
print('First sentences, word indices', X_idx[:3])
print('First sentences, NER indices', Y_idx[:3])

word index: [('!', 2), ('!!', 3), ('!!!', 4), ('!!!!', 5), ('!!!!!', 6), ('!?', 7), ('!?!', 8), ('"', 9), ('#', 10), ('##', 11)]
NER index: [('B-LOC', 2), ('B-MISC', 3), ('B-ORG', 4), ('I-LOC', 5), ('I-MISC', 6), ('I-ORG', 7), ('I-PER', 8), ('O', 9)]
First sentences, word indices [[935], [142143, 307143, 161836, 91321, 363368, 83766, 85852, 218260, 936], [284434, 79019]]
First sentences, NER indices [[9], [7, 9, 6, 9, 9, 9, 6, 9, 9], [8, 8]]


In [68]:
X = pad_sequences(X_idx)
Y = pad_sequences(Y_idx)
max_len = X.shape[1]
X_dev = pad_sequences(X_idx_dev,maxlen = max_len)
Y_dev = pad_sequences(Y_idx_dev,maxlen = max_len)
X_test = pad_sequences(X_idx_test,maxlen = max_len)
Y_test = pad_sequences(Y_idx_test,maxlen = max_len)

max_len = X.shape[1]
print(X[0])
print(Y[0])

# The number of NER classes and 0 (padding symbol)
Y_train = to_categorical(Y, num_classes=len(ner) + 2)
Y_val = to_categorical(Y_dev, num_classes=len(ner) + 2)
print(Y_train[0])

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0 935]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 9]
[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [69]:
print(X.shape)
print(Y.shape)

(14987, 113)
(14987, 113)


# Create embedding Matrix

In [70]:
EMBEDDING_DIM = 100
rdstate = np.random.RandomState(1234567)
embedding_matrix = rdstate.uniform(-0.05, 0.05, 
                                   (len(vocabulary_words) + 2, 
                                    EMBEDDING_DIM))

In [71]:
for word in vocabulary_words:
    if word in embeddings_dict:
        # If the words are in the embeddings, we fill them with a value
        embedding_matrix[word_idx[word]] = embeddings_dict[word]

In [72]:
print('Shape of embedding matrix:', embedding_matrix.shape)
print('Embedding of table', embedding_matrix[word_idx['table']])
print('Embedding of the padding symbol, idx 0, random numbers', 
      embedding_matrix[0])

Shape of embedding matrix: (402597, 100)
Embedding of table [-0.61453998  0.89692998  0.56770998  0.39102    -0.22437     0.49035001
  0.10868     0.27410999 -0.23833001 -0.52152997  0.73550999 -0.32653999
  0.51304001  0.32415    -0.46709001  0.68050998 -0.25497001 -0.040484
 -0.54417998 -1.05480003 -0.46691999  0.23557     0.31233999 -0.34536999
  0.14793    -0.53745002 -0.43215001 -0.48723999 -0.51019001 -0.90509999
 -0.17918999 -0.018376    0.09719    -0.31623     0.75120002  0.92236
 -0.49965     0.14036    -0.28296    -0.97443002 -0.0094408  -0.62944001
  0.14711    -0.94375998  0.0075222   0.18565001 -0.99172002  0.072789
 -0.18474001 -0.52901     0.38995001 -0.45677    -0.21932     1.37230003
 -0.29635999 -2.2342     -0.36667001  0.04987     0.63420999  0.53275001
 -0.53955001  0.31398001 -0.44698    -0.38389     0.066668   -0.02168
  0.20558     0.59456003 -0.24891999 -0.52794999 -0.3761      0.077104
  0.75221997 -0.2647     -0.0587      0.67540997 -0.16559    -0.49278
 -0.26

# Create a simple RNN

In [73]:
from keras import models, layers

In [93]:
model = models.Sequential()
model.add(layers.Embedding(len(vocabulary_words) + 2, EMBEDDING_DIM, input_length=max_len, mask_zero=True))
model.add(layers.SimpleRNN(100,return_sequences=True))
model.add(layers.Dense(NB_CLASSES + 2,activation = 'sigmoid'))

model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = True

model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 113, 100)          40259700  
_________________________________________________________________
simple_rnn_6 (SimpleRNN)     (None, 113, 100)          20100     
_________________________________________________________________
dense_5 (Dense)              (None, 113, 10)           1010      
Total params: 40,280,810
Trainable params: 40,280,810
Non-trainable params: 0
_________________________________________________________________


In [94]:
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])


In [95]:
# Xre = np.reshape(X,(-1,1))
# Yre = np.reshape(Y_train, (-1,Y_train.shape[2]))

In [100]:
model.fit(X,Y_train,epochs=2, batch_size = 32)

Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x2bcecd68e48>

### Test set

In [97]:
Ycat_test = to_categorical(Y_test, num_classes=len(ner) + 2)
testloss, testacc = model.evaluate(X_test,Ycat_test)
print("Test Loss: ", testloss, "\nTest Accuracy: ", testacc)

Test Loss:  0.01834092480024317 
Test Accuracy:  0.9485585689544678


In [98]:
predictions = model.predict_classes(X_test)
predictions

array([[9, 9, 9, ..., 9, 9, 9],
       [9, 9, 9, ..., 9, 9, 9],
       [9, 9, 9, ..., 9, 8, 8],
       ...,
       [9, 9, 9, ..., 9, 9, 9],
       [9, 9, 9, ..., 9, 9, 9],
       [9, 9, 9, ..., 9, 8, 9]], dtype=int64)

In [99]:
Y_test

array([[0, 0, 0, ..., 0, 0, 9],
       [0, 0, 0, ..., 9, 9, 9],
       [0, 0, 0, ..., 0, 8, 8],
       ...,
       [0, 0, 0, ..., 9, 9, 9],
       [0, 0, 0, ..., 9, 9, 9],
       [0, 0, 0, ..., 9, 8, 9]])