<a href="https://colab.research.google.com/github/JstnClmnt/NLP-Word-Embeddings/blob/master/Word_Embeddings_English.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import nltk
import tensorflow as tf
import tensorflow.keras as keras
import multiprocessing
from gensim.models import Word2Vec
from gensim.models import FastText

In [2]:
nltk.download("treebank")

tagged_sentences = nltk.corpus.treebank.tagged_sents()
 
print(tagged_sentences[0])
print("Tagged sentences: ", len(tagged_sentences))
print("Tagged words:", len(nltk.corpus.treebank.tagged_words()))

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
Tagged sentences:  3914
Tagged words: 100676


In [3]:
tagged_sentences = nltk.corpus.treebank.tagged_sents()
 
print(tagged_sentences[0])
print("Tagged sentences: ", len(tagged_sentences))
print("Tagged words:", len(nltk.corpus.treebank.tagged_words()))

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
Tagged sentences:  3914
Tagged words: 100676


In [4]:
import numpy as np
 
sentences, sentence_tags =[], [] 
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(sentence)
    sentence_tags.append(tags)
print(sentences[0])
print(sentence_tags[0])
# ['Lorillard' 'Inc.' ',' 'the' 'unit' 'of' 'New' 'York-based' 'Loews'
#  'Corp.' 'that' '*T*-2' 'makes' 'Kent' 'cigarettes' ',' 'stopped' 'using'
#  'crocidolite' 'in' 'its' 'Micronite' 'cigarette' 'filters' 'in' '1956'
# '.']
# ['NNP' 'NNP' ',' 'DT' 'NN' 'IN' 'JJ' 'JJ' 'NNP' 'NNP' 'WDT' '-NONE-' 'VBZ'
#  'NNP' 'NNS' ',' 'VBD' 'VBG' 'NN' 'IN' 'PRP$' 'NN' 'NN' 'NNS' 'IN' 'CD'
#  '.']]

('Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.')
('NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'MD', 'VB', 'DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NNP', 'CD', '.')


In [0]:
from sklearn.model_selection import train_test_split
 
train_sentences, test_sentences, train_tags, test_tags = train_test_split(sentences, sentence_tags, test_size=0.2)

words, tags = set([]), set([])
 
for s in train_sentences:
    for w in s:
        words.add(w.lower())

for ts in train_tags:
    for t in ts:
        tags.add(t)

word2index = {w: i + 2 for i, w in enumerate(list(words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs
 
tag2index = {t: i + 1 for i, t in enumerate(list(tags))}
tag2index['-PAD-'] = 0  # The special value used to padding

In [0]:
#Declare Model Parameters
cbow = 0
skipgram = 1
EMB_DIM = 300 #more dimensions, more computationally expensive to train
min_word_count = 3
workers = multiprocessing.cpu_count() #based on computer cpu count
context_size = 7
downsampling = 1e-3
learning_rate = 0.025 #initial learning rate
min_learning_rate = 0.025 #fixated learning rate
num_epoch = 15

In [7]:
w2v = Word2Vec(
    sg = skipgram,
    hs = 0, #hierarchical softmax
    negative = 7,
    size = EMB_DIM,
    min_count = min_word_count, 
    workers = workers,
    window = context_size, 
    sample = downsampling, 
    alpha = learning_rate, 
    min_alpha = min_learning_rate,
    seed=1
)
print('Vocabulary size: %d' % len(words))
w2v.build_vocab(train_sentences)
w2v.train(train_sentences,epochs=10,total_examples=w2v.corpus_count)
words_w2v = list(w2v.wv.vocab)
# save model in ASCII (word2vec) format
filename = 'embedding_word2vec.txt'
w2v.wv.save_word2vec_format(filename, binary=False)

Vocabulary size: 10149


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [8]:
vector = w2v.wv['computer']
print(vector)

[-9.48063880e-02 -1.83071475e-04  5.84658720e-02  1.34530783e-01
 -1.45727927e-02 -3.09020630e-03  1.72997098e-02 -1.79214135e-01
 -9.87414718e-02 -1.05974391e-01 -1.32804677e-01  7.87142068e-02
  1.05216421e-01  1.04272559e-01 -3.06221023e-02  3.05985170e-03
  4.38102055e-03 -3.01895570e-02  2.47776195e-01 -1.61331743e-02
 -5.07561415e-02  8.14820454e-03  9.00185555e-02  1.96949467e-01
  6.01170994e-02  9.73612964e-02  1.34601876e-01 -5.33512942e-02
  3.62782404e-02 -1.28585979e-01  1.10913925e-01  7.76477456e-02
 -6.77567674e-03  1.92401558e-02  5.78191243e-02 -8.30302015e-02
  5.49941286e-02 -3.28412047e-03  2.67380439e-02  1.38981650e-02
  1.14437900e-01  1.78134426e-01 -3.33158951e-03  6.12779707e-02
  1.85984135e-01 -4.34649922e-02  2.47458816e-01 -1.48701691e-03
  8.96465406e-02 -9.68495160e-02  5.85098639e-02 -5.79798268e-03
 -1.75435856e-01 -1.71247706e-01  1.92920908e-01  5.70610873e-02
 -5.85080218e-03 -5.17708473e-02 -1.33252069e-01 -9.19627771e-02
  8.68019313e-02  2.73979

In [9]:
w2v.most_similar("computer")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('models', 0.820219099521637),
 ('projects', 0.8163951635360718),
 ('high-tech', 0.8078446984291077),
 ('NEC', 0.7974915504455566),
 ('Take', 0.7906618714332581),
 ('giant', 0.7814673185348511),
 ('medical', 0.7806452512741089),
 ('car', 0.7796580791473389),
 ('television', 0.776167631149292),
 ('brokerage', 0.7723062634468079)]

In [10]:
w2v.most_similar("compute")

  """Entry point for launching an IPython kernel.


KeyError: ignored

In [11]:
fasttext = FastText(
    sg = skipgram,
    hs = 0, #hierarchical softmax
    negative = 7,
    size = EMB_DIM,
    min_count = min_word_count, 
    workers = workers,
    window = context_size, 
    sample = downsampling, 
    alpha = learning_rate, 
    min_alpha = min_learning_rate,
    seed=1,
    word_ngrams=1
)
print('Vocabulary size: %d' % len(words))
fasttext.build_vocab(train_sentences)
fasttext.train(train_sentences,epochs=10,total_examples=w2v.corpus_count)
words_fasttext = list(fasttext.wv.vocab)
# save model in ASCII (word2vec) format
filename = 'embedding_fasttext.txt'
fasttext.wv.save_word2vec_format(filename, binary=False)

Vocabulary size: 10149


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [12]:
fasttext.most_similar("computer")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('computers', 0.9316549301147461),
 ('supercomputer', 0.9286706447601318),
 ('Computer', 0.8386468887329102),
 ('heavily', 0.7870428562164307),
 ('electronic', 0.7486151456832886),
 ('electronics', 0.7109121680259705),
 ('Foster', 0.710498034954071),
 ('electrical', 0.7073049545288086),
 ('names', 0.7015905380249023),
 ('center', 0.6978118419647217)]

In [13]:
fasttext.most_similar("compute")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('computer', 0.9319080114364624),
 ('computers', 0.90720534324646),
 ('supercomputer', 0.8444979786872864),
 ('Computer', 0.727797269821167),
 ('heavily', 0.711861789226532),
 ('electrical', 0.629509687423706),
 ('electronic', 0.6275074481964111),
 ('introduced', 0.6167677640914917),
 ('high-tech', 0.6075800061225891),
 ('devices', 0.6040067076683044)]