### Train our embeddings  using  Skip-gram’s implementation from the Word2Vec module of the gensim library.

In [1]:
import nltk

In [9]:
from nltk.corpus import brown
from gensim.models import Word2Vec
import multiprocessing

In [10]:
sentences = brown.sents()

In [11]:
sentences.count

<bound method AbstractLazySequence.count of [['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]>

In [13]:
sentences[:1]

[['The',
  'Fulton',
  'County',
  'Grand',
  'Jury',
  'said',
  'Friday',
  'an',
  'investigation',
  'of',
  "Atlanta's",
  'recent',
  'primary',
  'election',
  'produced',
  '``',
  'no',
  'evidence',
  "''",
  'that',
  'any',
  'irregularities',
  'took',
  'place',
  '.']]

In [15]:
EMBED_DIM = 300

#Create the word2Vec instance
w2v = Word2Vec(sentences, size= EMBED_DIM, window=5, min_count=5, negative=15, iter=10, workers=multiprocessing.cpu_count())

In [19]:
word_vect = w2v.wv

In [21]:
word_vect.similar_by_word("praise")

[('adviser', 0.8923773169517517),
 ('voiced', 0.889167845249176),
 ('origins', 0.8832777142524719),
 ('candidacy', 0.8806986212730408),
 ('youthful', 0.8804792165756226),
 ('footsteps', 0.8756811022758484),
 ('splendor', 0.8754454851150513),
 ('disciples', 0.873500645160675),
 ('initiation', 0.8709819316864014),
 ('mania', 0.8686869144439697)]

In [22]:
word_vect.similar_by_word("Friday")

[('Wednesday', 0.9209637641906738),
 ('Tuesday', 0.9153870344161987),
 ('Sunday', 0.9120498895645142),
 ('Monday', 0.8928256034851074),
 ('Thursday', 0.8843043446540833),
 ('Saturday', 0.8753754496574402),
 ('November', 0.8415101170539856),
 ('October', 0.8350034952163696),
 ('April', 0.8303235769271851),
 ('Chicago', 0.825799822807312)]

In [23]:
word_vect.similar_by_word("Atlanta")

[('Beverly', 0.8987345695495605),
 ('News', 0.8955743312835693),
 ('Columbia', 0.8948028087615967),
 ('Colorado', 0.8942939043045044),
 ('Missouri', 0.8939505815505981),
 ('Springs', 0.8936272263526917),
 ('Portland', 0.8932808637619019),
 ('N.Y.', 0.8932299613952637),
 ('Minnesota', 0.8928429484367371),
 ('Order', 0.8927724361419678)]

### Using our embeddings as features in a Neural model


In [24]:
from nltk.corpus import conll2000

In [28]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, Activation, Flatten
from tensorflow.keras import Sequential
from tensorflow.keras.utils import to_categorical
import numpy as np
import collections

In [30]:
train_words = conll2000.tagged_words("train.txt")
test_words = conll2000.tagged_words("test.txt")

In [33]:
train_words[:10]

[('Confidence', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('pound', 'NN'),
 ('is', 'VBZ'),
 ('widely', 'RB'),
 ('expected', 'VBN'),
 ('to', 'TO'),
 ('take', 'VB'),
 ('another', 'DT')]

#### First we have to get rid of the tagged data in order to make it model-friendly

In [38]:
def get_tag_vocab(tagged_words):
    tag2id = {}
    for item in tagged_words:
        tag = item[1]
        tag2id.setdefault(tag, len(tag2id))
    return tag2id

In [35]:
word2id = {k:v.index for k,v in word_vect.vocab.items()}

In [36]:
word2id

{'The': 14,
 'Fulton': 5615,
 'County': 1280,
 'Grand': 5377,
 'said': 59,
 'Friday': 1852,
 'an': 34,
 'investigation': 2586,
 'of': 3,
 'recent': 595,
 'primary': 1162,
 'election': 1521,
 'produced': 1206,
 '``': 12,
 'no': 67,
 'evidence': 475,
 "''": 13,
 'that': 8,
 'any': 84,
 'irregularities': 9647,
 'took': 220,
 'place': 188,
 '.': 2,
 'jury': 1754,
 'further': 499,
 'in': 7,
 'the': 0,
 'City': 762,
 'Executive': 8895,
 'Committee': 1235,
 ',': 1,
 'which': 35,
 'had': 25,
 'over-all': 3165,
 'charge': 869,
 'deserves': 5880,
 'praise': 5616,
 'and': 4,
 'thanks': 3917,
 'Atlanta': 3166,
 'for': 11,
 'manner': 838,
 'was': 10,
 'conducted': 2046,
 'term': 1391,
 'been': 48,
 'charged': 1962,
 'by': 24,
 'Superior': 5881,
 'Court': 960,
 'Judge': 2861,
 'to': 5,
 'investigate': 7781,
 'reports': 1407,
 'possible': 254,
 'won': 1604,
 'Allen': 4961,
 'Jr.': 1469,
 'Only': 1062,
 'a': 6,
 'relative': 2530,
 'handful': 6888,
 'such': 91,
 'received': 609,
 'considering': 2862,
 

In [39]:
tag2id = get_tag_vocab(train_words)

In [40]:
tag2id

{'NN': 0,
 'IN': 1,
 'DT': 2,
 'VBZ': 3,
 'RB': 4,
 'VBN': 5,
 'TO': 6,
 'VB': 7,
 'JJ': 8,
 'NNS': 9,
 'NNP': 10,
 ',': 11,
 'CC': 12,
 'POS': 13,
 '.': 14,
 'VBP': 15,
 'VBG': 16,
 'PRP$': 17,
 'CD': 18,
 '``': 19,
 "''": 20,
 'VBD': 21,
 'EX': 22,
 'MD': 23,
 '#': 24,
 '(': 25,
 '$': 26,
 ')': 27,
 'NNPS': 28,
 'PRP': 29,
 'JJS': 30,
 'WP': 31,
 'RBR': 32,
 'JJR': 33,
 'WDT': 34,
 'WRB': 35,
 'RBS': 36,
 'PDT': 37,
 'RP': 38,
 ':': 39,
 'FW': 40,
 'WP$': 41,
 'SYM': 42,
 'UH': 43}

In [65]:
#get number representations of the data and keep a track of the words which are not in the vocab
def get_int_data(tagged_words, word2id, tag2id):
    X,Y = [],[] # X - for word ids, Y - for tag id
    unk_count  = 0 # for unknown words that don't have a representation 
    
    for word, tag in tagged_words:
        Y.append(tag2id.get(tag))
        if word in word2id:
            X.append(word2id.get(word))
        else:
            X.append(UNK_INDEX)
            unk_count+=1
    print("Percentage of unkown words: %.3f" % (unk_count/len(tagged_words)))
    return np.array(X), np.array(Y)
    
    


In [66]:
X_train, Y_train = get_int_data(train_words,word2id,tag2id)
X_test, Y_test = get_int_data(test_words,word2id,tag2id)

Percentage of unkown words: 0.143
Percentage of unkown words: 0.149


In [67]:
Y_train

array([ 0,  1,  2, ..., 10,  4, 14])

In [68]:
#One hot encoding for the tag indexes
Y_train, Y_test = to_categorical(Y_train), to_categorical(Y_test)

In [59]:
#Add the unkown word 
def add_new_word(new_word, new_vector, new_index, embed_matrix, word2id):
    embed_matrix = np.insert(embed_matrix, [new_index], [new_vector], axis = 0)
    #update the index of the word that follow the new word
    word2id = {word:(index+1) if index>= new_index else index for word,index in word2id.items()}
    word2id[new_word] = new_index
    return embed_matrix, word2id

In [60]:
UNK_INDEX = 0;
UNK_TOKEN = "UNK"
embed_matrix = word_vect.vectors
unk_vector = embed_matrix.mean(0)
embed_matrix, word2id = add_new_word(UNK_TOKEN, unk_vector, UNK_INDEX,embed_matrix, word2id)

In [61]:
embed_matrix

array([[ 0.03203953,  0.15326405,  0.04586557, ...,  0.01970407,
         0.13494614, -0.07527844],
       [ 0.6099406 , -0.08260303,  0.6045211 , ..., -0.14682685,
         0.5754098 , -0.3860158 ],
       [-0.00507205, -0.49630207, -0.45752096, ..., -0.33814788,
         0.58461475, -0.31399578],
       ...,
       [-0.01662082,  0.14667685,  0.06267399, ...,  0.03565331,
         0.05831501, -0.09306163],
       [-0.02909321,  0.17623653,  0.04378775, ...,  0.01513462,
         0.11138717, -0.03159484],
       [ 0.0347857 ,  0.14375113,  0.01076526, ...,  0.03773131,
         0.04941016, -0.05327427]], dtype=float32)

In [64]:
len(word2id)

15174

In [84]:
BATCH_SIZE = 128
HIDDEN_SIZE = 50

def define_model(embed_matrix, class_count):
    vocab_length = len(embed_matrix)
    model = Sequential()
    
    model.add(Embedding(input_dim = vocab_length,
                       output_dim = EMBED_DIM,
                       weights = [embed_matrix],
                         input_length = 1))
    model.add(Flatten())
    model.add(Dense(HIDDEN_SIZE))
    model.add(Activation("tanh"))
    model.add(Dense(class_count))
    model.add(Activation("softmax"))
    model.compile(optimizer="adam", loss="categorical_crossentropy",
                  metrics = ["accuracy"])
    return model


In [85]:
pos_model = define_model(embed_matrix, len(tag2id))
pos_model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 1, 300)            4552200   
_________________________________________________________________
flatten_2 (Flatten)          (None, 300)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 50)                15050     
_________________________________________________________________
activation_4 (Activation)    (None, 50)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 44)                2244      
_________________________________________________________________
activation_5 (Activation)    (None, 44)                0         
Total params: 4,569,494
Trainable params: 4,569,494
Non-trainable params: 0
____________________________________________

In [86]:
pos_model.fit(X_train,Y_train, batch_size = BATCH_SIZE, epochs=1, verbose = 1)

Train on 211727 samples


<tensorflow.python.keras.callbacks.History at 0x15944e050>

###https://blog.cambridgespark.com/tutorial-build-your-own-embedding-and-use-it-in-a-neural-network-e9cde4a81296