In [None]:
from gensim.models import Word2Vec
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Toy training data.
# gensim's Word2Vec expects a "list of lists": the outer list holds the documents,
# and each inner list holds the tokens of one document.
corpus = [
    ['dog', 'bites', 'man'],
    ['man', 'bites', 'dog'],
    ['dog', 'eats', 'meat'],
    ['man', 'eats', 'food'],
]

# Train one model per architecture; min_count=1 keeps every word of this tiny corpus.
model_cbow = Word2Vec(sentences=corpus, min_count=1, sg=0)      # sg=0 -> CBOW
model_skipgram = Word2Vec(sentences=corpus, min_count=1, sg=1)  # sg=1 -> skip-gram



## Continuous Bag of Words (CBOW) 
In CBOW, the primary task is to build a language model that correctly predicts the center word given the context words in which the center word appears.

In [None]:
# Summarize the trained CBOW model
print(model_cbow)

# Summarize the vocabulary
words = list(model_cbow.wv.vocab)
print(words)

# Access the vector for one word.
# Word vectors live on the model's KeyedVectors object (`.wv`); indexing the model
# itself (`model_cbow['dog']`) is deprecated in favour of `model_cbow.wv['dog']`.
print(model_cbow.wv['dog'])


Word2Vec(vocab=6, size=100, alpha=0.025)
['dog', 'bites', 'man', 'eats', 'meat', 'food']
[ 0.00331508 -0.00488006  0.00229956 -0.00447445 -0.00407264  0.00172649
  0.00260756 -0.00272675 -0.00463857  0.00071664  0.00334291  0.00020597
 -0.00431047  0.00043294 -0.00439009 -0.00057162 -0.00164546 -0.00195808
  0.00162258  0.00327302 -0.00228783 -0.00387814 -0.00329031  0.00119494
  0.0007751  -0.00240479 -0.00101702  0.00448922  0.00212406 -0.00049628
 -0.00344776 -0.00169515  0.00492119 -0.00063559  0.00345495 -0.00312139
 -0.00124647 -0.00043837  0.00362671 -0.00263283 -0.0042666   0.0032261
  0.00449331 -0.00194364 -0.00347566  0.00273526  0.00346346  0.00129265
  0.00065297  0.00373967 -0.00313     0.00446249 -0.00465436 -0.00278388
  0.00472894 -0.00369471 -0.0016987  -0.00082774  0.00078832  0.00450934
 -0.00185647 -0.00315558 -0.00304945 -0.00205605 -0.00186332  0.00212172
  0.00428404  0.00392068  0.00416603 -0.00329837  0.00177305  0.00027518
 -0.00408334  0.00461931 -0.00138989

In [None]:
# Compute the similarity between word pairs.
# Similarity queries belong to the KeyedVectors object (`.wv`); calling
# `similarity` on the model itself is deprecated.
print("Similarity between eats and bites:",model_cbow.wv.similarity('eats', 'bites'))
print("Similarity between eats and man:",model_cbow.wv.similarity('eats', 'man'))


Similarity between eats and bites: 0.008178662
Similarity between eats and man: -0.10606599


From the above similarity scores we can conclude that "eats" is more similar to "bites" than to "man".

In [None]:
# Words most similar to 'meat', queried through the KeyedVectors object (`.wv`);
# calling `most_similar` on the model itself is deprecated.
model_cbow.wv.most_similar('meat')

[('food', 0.059630878269672394),
 ('dog', 0.025991111993789673),
 ('eats', 0.003866717219352722),
 ('man', -0.1195911169052124),
 ('bites', -0.11992672085762024)]

In [None]:
# Persist the full CBOW model to disk ...
cbow_model_path = 'model_cbow.bin'
model_cbow.save(cbow_model_path)

# ... then read it back and print its summary to confirm the round trip.
new_model_cbow = Word2Vec.load(cbow_model_path)
print(new_model_cbow)

Word2Vec(vocab=6, size=100, alpha=0.025)


## SkipGram
In skipgram, the task is to predict the context words from the center word.

In [None]:
# Summarize the trained skip-gram model
print(model_skipgram)

# Summarize the vocabulary
words = list(model_skipgram.wv.vocab)
print(words)

# Access the vector for one word.
# Word vectors live on the model's KeyedVectors object (`.wv`); indexing the model
# itself (`model_skipgram['dog']`) is deprecated in favour of `model_skipgram.wv['dog']`.
print(model_skipgram.wv['dog'])


Word2Vec(vocab=6, size=100, alpha=0.025)
['dog', 'bites', 'man', 'eats', 'meat', 'food']
[ 0.00331508 -0.00488006  0.00229956 -0.00447445 -0.00407264  0.00172649
  0.00260756 -0.00272675 -0.00463857  0.00071664  0.00334291  0.00020597
 -0.00431047  0.00043294 -0.00439009 -0.00057162 -0.00164546 -0.00195808
  0.00162258  0.00327302 -0.00228783 -0.00387814 -0.00329031  0.00119494
  0.0007751  -0.00240479 -0.00101702  0.00448922  0.00212406 -0.00049628
 -0.00344776 -0.00169515  0.00492119 -0.00063559  0.00345495 -0.00312139
 -0.00124647 -0.00043837  0.00362671 -0.00263283 -0.0042666   0.0032261
  0.00449331 -0.00194364 -0.00347566  0.00273526  0.00346346  0.00129265
  0.00065297  0.00373967 -0.00313     0.00446249 -0.00465436 -0.00278388
  0.00472894 -0.00369471 -0.0016987  -0.00082774  0.00078832  0.00450934
 -0.00185647 -0.00315558 -0.00304945 -0.00205605 -0.00186332  0.00212172
  0.00428404  0.00392068  0.00416603 -0.00329837  0.00177305  0.00027518
 -0.00408334  0.00461931 -0.00138989

In [None]:
# Compute the similarity between word pairs.
# Similarity queries belong to the KeyedVectors object (`.wv`); calling
# `similarity` on the model itself is deprecated.
print("Similarity between eats and bites:",model_skipgram.wv.similarity('eats', 'bites'))
print("Similarity between eats and man:",model_skipgram.wv.similarity('eats', 'man'))


Similarity between eats and bites: 0.008187433
Similarity between eats and man: -0.106057286


From the above similarity scores we can conclude that "eats" is more similar to "bites" than to "man".

In [None]:
# Words most similar to 'meat', queried through the KeyedVectors object (`.wv`);
# calling `most_similar` on the model itself is deprecated.
model_skipgram.wv.most_similar('meat')

[('food', 0.05963088572025299),
 ('dog', 0.025991126894950867),
 ('eats', 0.003793695941567421),
 ('man', -0.1195911169052124),
 ('bites', -0.11992672085762024)]

In [None]:
# Save the full skip-gram model to disk
model_skipgram.save('model_skipgram.bin')

# Load it back and print the *reloaded* model, so the output actually
# confirms that the save/load round trip worked.
new_model_skipgram = Word2Vec.load('model_skipgram.bin')
print(new_model_skipgram)

Word2Vec(vocab=6, size=100, alpha=0.025)


## Training Your Embedding on Wiki Corpus

##### The corpus download page : https://dumps.wikimedia.org/enwiki/20200120/
The entire wiki corpus as of 28/04/2020 is just over 16GB in size.
We will take a part of this corpus due to computation constraints and train our word2vec and fasttext embeddings.


In [None]:
# Create the target directory (no error if it already exists) and download one
# part of the English Wikipedia dump into it.
# NOTE(review): the URL points at the `latest` dump directory, so this exact file
# name (with its page-range suffix) may change over time - confirm it still exists.
# The recorded output below shows a download of roughly 420 MB.
!mkdir -p data/en/
!wget -P data/en/ https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream14.xml-p6197595p7697594.bz2 

--2020-07-08 03:21:01--  https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream14.xml-p6197595p7697594.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 208.80.154.7, 2620:0:861:1:208:80:154:7
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.154.7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 440723201 (420M) [application/octet-stream]
Saving to: ‘data/en/enwiki-latest-pages-articles-multistream14.xml-p6197595p7697594.bz2’


2020-07-08 03:42:24 (336 KB/s) - ‘data/en/enwiki-latest-pages-articles-multistream14.xml-p6197595p7697594.bz2’ saved [440723201/440723201]



In [None]:
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.word2vec import Word2Vec
from gensim.models.fasttext import FastText
import time

In [None]:
# Build the training corpus from the downloaded Wikipedia dump.
# - lemmatize=False: use the plain tokens, no lemmatization step.
# - dictionary={}: hand WikiCorpus a ready (empty) dictionary; presumably this skips
#   the extra pass over the dump that builds a vocabulary - verify against the gensim docs.
wiki_dump_path = 'data/en/enwiki-latest-pages-articles-multistream14.xml-p6197595p7697594.bz2'
wiki = WikiCorpus(wiki_dump_path, lemmatize=False, dictionary={})

# Materialize every token list in memory so both models can train on the same data.
sentences = [tokens for tokens in wiki.get_texts()]


### Hyperparameters


1.   sg - Selects the training algorithm: 1 for skip-gram, otherwise 0 for CBOW. The default is CBOW.
2.   min_count - Ignores all words with a total frequency lower than this.<br>
There are many more hyperparameters; the full list can be found in the official documentation [here.](https://radimrehurek.com/gensim/models/word2vec.html)


In [None]:
# Train a CBOW model (sg=0) on the Wikipedia sentences and time it.
# min_count=10 ignores words that occur fewer than 10 times in total.
start = time.time()
word2vec_cbow = Word2Vec(sentences=sentences, min_count=10, sg=0)
end = time.time()

# Report the wall-clock training time in hours.
elapsed_hours = (end - start) / 3600.0
print("CBOW Model Training Complete.\nTime taken for training is:{:.2f} hrs ".format(elapsed_hours))


CBOW Model Training Complete.
Time taken for training is:0.12 hrs 


In [None]:
# Summarize the trained CBOW model
print(word2vec_cbow)
print("-"*30)

# Summarize the vocabulary.
# The vocabulary holds well over 100k words, so print only a small sample
# instead of flooding the cell output; `words` still contains the full list.
words = list(word2vec_cbow.wv.vocab)
print(words[:50])
print("-"*30)

# Access the vector for one word through the KeyedVectors object (`.wv`);
# indexing the model itself is deprecated.
print(word2vec_cbow.wv['film'])
print("-"*30)

# Compute similarity (also via `.wv`; `model.similarity` is deprecated)
print("Similarity between film and drama:",word2vec_cbow.wv.similarity('film', 'drama'))
print("Similarity between film and tiger:",word2vec_cbow.wv.similarity('film', 'tiger'))
print("-"*30)


Word2Vec(vocab=161018, size=100, alpha=0.025)
------------------------------
------------------------------
[-0.05278917 -2.2432473  -0.6968726   2.6487577   1.5946589   0.23525089
 -0.47907975 -0.78968483 -2.8959844   3.726202    1.0541613  -1.1372422
 -1.3833123  -0.13650467  0.05466807 -0.18252724 -1.6289347   3.5304248
  0.95799816  0.11003758  0.97561294  5.3836365  -2.926578   -0.661086
 -0.97229356  1.2039522   0.34759814  0.8269918   1.3703564  -0.07136638
 -4.7125483   0.61925966  0.7808032   1.860589   -0.05967012 -1.2537986
  1.7102126   0.801111   -0.6330668   2.9158566  -2.1082087   0.47208667
  2.1758597  -1.1709138  -5.503552    2.8895037  -0.01359996 -4.399968
 -0.3254155  -0.8703112   3.9359555  -0.35001296 -0.06683039  1.6437399
 -1.0910201  -2.3790524   2.6753175  -1.0751883   0.34626892  1.3782827
  2.3898885  -0.43576995 -1.6203228   0.61654866 -0.72969776 -1.4565114
  1.0598677   2.4900308   2.8398023  -2.5287054  -1.7698632   0.3529609
 -2.5632315   2.2514613  -1

In [None]:
# save model
# Export only the word vectors (`.wv`) in the binary word2vec format.
from gensim.models import Word2Vec, KeyedVectors   
word2vec_cbow.wv.save_word2vec_format('word2vec_cbow.bin', binary=True)

# # load model
# # A file written by save_word2vec_format holds word vectors only, so it is read back
# # with KeyedVectors.load_word2vec_format rather than Word2Vec.load (which expects a
# # file written by `.save()`).
# new_word2vec_cbow = KeyedVectors.load_word2vec_format('word2vec_cbow.bin', binary=True)
# print(new_word2vec_cbow)

In [None]:
# Train a skip-gram model (sg=1) on the same Wikipedia sentences and time it.
# min_count=10 ignores words that occur fewer than 10 times in total.
start = time.time()
word2vec_skipgram = Word2Vec(sentences=sentences, min_count=10, sg=1)
end = time.time()

# Report the wall-clock training time in hours.
elapsed_hours = (end - start) / 3600.0
print("SkipGram Model Training Complete\nTime taken for training is:{:.2f} hrs ".format(elapsed_hours))


SkipGram Model Training Complete
Time taken for training is:0.61 hrs 


In [None]:
# Summarize the trained skip-gram model
print(word2vec_skipgram)
print("-"*30)

# Summarize the vocabulary.
# The vocabulary holds well over 100k words, so print only a small sample
# instead of flooding the cell output; `words` still contains the full list.
words = list(word2vec_skipgram.wv.vocab)
print(words[:50])
print("-"*30)

# Access the vector for one word through the KeyedVectors object (`.wv`);
# indexing the model itself is deprecated.
print(word2vec_skipgram.wv['film'])
print("-"*30)

# Compute similarity (also via `.wv`; `model.similarity` is deprecated).
# Both calls need the skip-gram model as the receiver.
print("Similarity between film and drama:",word2vec_skipgram.wv.similarity('film', 'drama'))
print("Similarity between film and tiger:",word2vec_skipgram.wv.similarity('film', 'tiger'))
print("-"*30)


Word2Vec(vocab=161018, size=100, alpha=0.025)
------------------------------
------------------------------
[-0.35990748  0.20537794 -0.08471059  0.3329436  -0.0092362  -0.07147974
 -0.30981717 -0.21653889  0.34842396 -0.28996146  0.45289078  0.64880157
 -0.14537139 -0.6593135   0.46777877  0.01224853 -0.30658954 -0.2638053
  0.09966122 -0.37593904  0.16245331 -0.10233496  0.5540646   0.27518794
  0.25898558  0.2793495  -0.2394661  -0.43676636 -0.43892923  0.04013366
 -0.38420838 -0.8375879  -0.10474149 -0.42934766 -0.07724728  0.32263622
 -0.08061744 -0.3264137  -0.13703062  0.40950722  0.6043124   0.29718038
 -0.24458958 -0.3338308   0.00118772 -1.0104022  -0.26405972  0.15964566
 -0.29186898 -0.21630155  0.32755786  0.24951488 -0.29366866  0.15838994
  0.45849612  0.2894191   0.6303711  -0.26540717 -0.24943331 -0.42347214
 -0.4279108   0.24273829 -0.14081118 -0.4153366  -0.3691591  -0.08119843
 -0.28623915  0.1835074  -0.19500887 -0.20063516  0.34779832  0.31729746
 -0.13560808 -0.2

In [None]:
# save model
# Export the skip-gram model's word vectors (not the CBOW ones) in the binary
# word2vec format, so that 'word2vec_sg.bin' really contains the skip-gram vectors.
word2vec_skipgram.wv.save_word2vec_format('word2vec_sg.bin', binary=True)

# # load model
# # A file written by save_word2vec_format holds word vectors only, so it is read back
# # with KeyedVectors.load_word2vec_format rather than Word2Vec.load (which expects a
# # file written by `.save()`).
# from gensim.models import KeyedVectors
# new_word2vec_skipgram = KeyedVectors.load_word2vec_format('word2vec_sg.bin', binary=True)
# print(new_word2vec_skipgram)

#### An interesting observation, if you noticed, is that CBOW trains faster than SkipGram in both cases.
We will leave it to the reader to figure out why. A hint: refer to how CBOW and skip-gram work.