### Word2Vec -- Comparing Word Vectors in both Bag of Words (CBOW) & Skip-Gram (SKIPG) Models

In [1]:
import pandas as pd

import gensim 
from gensim.models import Word2Vec 

from nltk.tokenize import sent_tokenize, word_tokenize 

### Import and Tokenize Lord of the Rings text 

In [2]:
with open('./the_lord_of_the_rings/Lord_of_the_Rings_complete.txt', 'r') as file:
    lotr = file.read().replace('\n', '')

In [3]:
lotr_sent = []
# iterate through each sentence in the file 
for i in sent_tokenize(lotr): 
    temp = [] 
      
    # tokenize the sentence into words 
    for j in word_tokenize(i): 
        temp.append(j.lower()) 
  
    lotr_sent.append(temp) 

In [4]:
lotr_sent[1971]

['gandalf',
 'stayed',
 'in',
 'the',
 'shire',
 'for',
 'over',
 'two',
 'months',
 '.']

In [5]:
# Create CBOW model 

wvec = gensim.models.Word2Vec(lotr_sent,     # Corpus of data.
                              min_count = 1, # Ignores words below this threshold.
                              size = 100,    # How many dimensions do you want in your word vector?
                              window = 5)    # Number of "worker threads" to use (parallelizes process).

In [6]:
# Create Skip Gram model for comparison

skipg = gensim.models.Word2Vec(lotr_sent,      # Corpus of data.
                                min_count = 1, # Ignores words below this threshold.
                                size = 100,    # How many dimensions do you want in your word vector?
                                window = 5,    # Number of "worker threads" to use (parallelizes process).
                                sg = 1)        # SG = 1 uses SkipGram, SG = 0 uses CBOW (default).

In [7]:
wvec.wv.most_similar('ring', topn=25)

[('enemy', 0.8938414454460144),
 ('journey', 0.88094562292099),
 ('company', 0.8700255155563354),
 ('word', 0.8694908022880554),
 ('story', 0.8671610355377197),
 ('strength', 0.8656622171401978),
 ('place', 0.8581945300102234),
 ('work', 0.8575025796890259),
 ('council', 0.8515850305557251),
 ('just', 0.8482983708381653),
 ('tale', 0.8472678661346436),
 ('parting', 0.8464469909667969),
 ('watch', 0.8458412885665894),
 ('choice', 0.8444128036499023),
 ('day', 0.8408998250961304),
 ('beginning', 0.8391226530075073),
 ('home', 0.8389843702316284),
 ('ending', 0.8388870358467102),
 ('burden', 0.8375438451766968),
 ('same', 0.8366794586181641),
 ('given', 0.8358065485954285),
 ('power', 0.8343549370765686),
 ('way', 0.8336449861526489),
 ('errand', 0.8334192037582397),
 ('quarrel', 0.8332650065422058)]

In [8]:
skipg.wv.most_similar('ring', topn=25)

[('enemy', 0.8354686498641968),
 ('burden', 0.806827187538147),
 ('council', 0.7940381765365601),
 ('strength', 0.7914930582046509),
 ('wisdom', 0.7823638916015625),
 ('fate', 0.7779597640037537),
 ('saruman', 0.7767810821533203),
 ('family', 0.7741245031356812),
 ('sauron', 0.7740589380264282),
 ('account', 0.7728832364082336),
 ('finger', 0.7724927067756653),
 ('folly', 0.771098792552948),
 ('grief', 0.7706279158592224),
 ('death', 0.7664724588394165),
 ('weight', 0.7655009031295776),
 ('treasure', 0.7619128227233887),
 ('desire', 0.7619084119796753),
 ('quest', 0.7606112360954285),
 ('master', 0.7590110301971436),
 ('word', 0.7563979029655457),
 ('final', 0.7549488544464111),
 ('madness', 0.7526630759239197),
 ('key', 0.7524664998054504),
 ('bearer', 0.7504841685295105),
 ('mood', 0.7502917051315308)]

In [9]:
wvec.wv.most_similar('gollum')

[('bilbo', 0.9474315643310547),
 ('she', 0.9363054037094116),
 ('boromir', 0.9242907762527466),
 ('gandalf', 0.9128257036209106),
 ('faramir', 0.9108299612998962),
 ('frodo', 0.9093053936958313),
 ('strider', 0.9077584147453308),
 ('treebeard', 0.9054824113845825),
 ('sam', 0.89302659034729),
 ('everyone', 0.8924288749694824)]

In [10]:
skipg.wv.most_similar('gollum')

[('pippin', 0.8794245719909668),
 ('frodo', 0.8741874098777771),
 ('treebeard', 0.8672569990158081),
 ('he', 0.8524508476257324),
 ('sam', 0.8519383668899536),
 ('shagrat', 0.8455409407615662),
 ('strider', 0.8425599336624146),
 ('himself', 0.8330941200256348),
 ('grishnbkh', 0.8315867185592651),
 ('gandalf', 0.826464056968689)]

In [11]:
wvec.wv.most_similar('frodo')

[('sam', 0.9837819933891296),
 ('pippin', 0.9562926292419434),
 ('gandalf', 0.9426122903823853),
 ('merry', 0.9376106262207031),
 ('aragorn', 0.9370484948158264),
 ('she', 0.9118833541870117),
 ('strider', 0.9112873673439026),
 ('faramir', 0.9096335768699646),
 ('gollum', 0.9093053936958313),
 ('boromir', 0.9041380286216736)]

In [12]:
skipg.wv.most_similar('frodo')

[('pippin', 0.9122840166091919),
 ('gollum', 0.8741874098777771),
 ('sam', 0.8641220331192017),
 ('merry', 0.8606770634651184),
 ('strider', 0.8568487167358398),
 ('butterbur', 0.8523883819580078),
 ('treebeard', 0.8486931920051575),
 ('beregond', 0.843740701675415),
 ('farmer', 0.8267589211463928),
 ('bilbo', 0.8228003978729248)]

In [13]:
wvec.wv.most_similar('gandalf')

[('aragorn', 0.9658728837966919),
 ('merry', 0.9483356475830078),
 ('strider', 0.9426972270011902),
 ('frodo', 0.9426122903823853),
 ('sam', 0.9408676624298096),
 ('boromir', 0.9362186193466187),
 ('faramir', 0.9255779981613159),
 ('pippin', 0.9237229228019714),
 ('legolas', 0.919039785861969),
 ('gollum', 0.9128257036209106)]

In [14]:
skipg.wv.most_similar('gandalf')

[('faramir', 0.9170253872871399),
 ('aragorn', 0.916953444480896),
 ('strider', 0.911169171333313),
 ('beregond', 0.9089959263801575),
 ('jomer', 0.9006110429763794),
 ('boromir', 0.8966839909553528),
 ('treebeard', 0.8934426307678223),
 ('haldir', 0.8845245838165283),
 ('glorfindel', 0.883939266204834),
 ('farmer', 0.8756296634674072)]

In [15]:
wvec.wv.most_similar('arwen')

[('heralds', 0.9766462445259094),
 ('cracking', 0.9760091304779053),
 ('thorin', 0.9759398698806763),
 ('arod', 0.9752633571624756),
 ('3', 0.9744923710823059),
 ('terrific', 0.9742535948753357),
 ('den', 0.9732915163040161),
 ('cracker', 0.9715051651000977),
 ('water-lilies', 0.9709609746932983),
 ('shivering', 0.9694401025772095)]

In [16]:
skipg.wv.most_similar('arwen')

[('arod', 0.9686731696128845),
 ('alert', 0.9684125185012817),
 ('bregalad', 0.9645286202430725),
 ('stuttering', 0.9597437977790833),
 ('drowsily', 0.9588397741317749),
 ('pillows', 0.9583908319473267),
 ('solemnly', 0.9567469358444214),
 ('shivering', 0.9551138877868652),
 ('stammering', 0.9547591209411621),
 ('raising', 0.9533906579017639)]

In [17]:
wvec.wv.most_similar('bombadil')

[('l®thien', 0.9885807037353516),
 ('palan-dnriel', 0.9783605337142944),
 ('maggot', 0.9774553775787354),
 ('limb', 0.9769629240036011),
 ('sn', 0.9761889576911926),
 ('goldberry', 0.9756543636322021),
 ('hints', 0.9738003015518188),
 ('thorin', 0.9735133647918701),
 ('edrendil', 0.9727005362510681),
 ('shouting', 0.972480058670044)]

In [18]:
skipg.wv.most_similar('bombadil')

[('balin', 0.9466532468795776),
 ('hbma', 0.9405666589736938),
 ("'gandalf", 0.93732750415802),
 ('ranger', 0.9370142221450806),
 ('party', 0.9356539845466614),
 ('halfling', 0.9303222298622131),
 ('gamling', 0.9298248291015625),
 ('council', 0.9267533421516418),
 ('maggot', 0.9261330366134644),
 ('galdor', 0.9255490303039551)]

In [19]:
wvec.wv.most_similar('beginning')

[("o'clock", 0.975773811340332),
 ('rapids', 0.9673560857772827),
 ('events', 0.9587860703468323),
 ('moot', 0.958209216594696),
 ('daylight', 0.955962061882019),
 ('supper', 0.955619752407074),
 ('archet', 0.9545978307723999),
 ('buckland', 0.9536428451538086),
 ('amazing', 0.9528564810752869),
 ('bree', 0.9511289596557617)]

In [20]:
skipg.wv.most_similar('beginning')

[('feast', 0.9360736608505249),
 ('recall', 0.9293016791343689),
 ('finding', 0.9236054420471191),
 ('adventure', 0.9222742319107056),
 ('discovered', 0.9220582246780396),
 ('crickhollow', 0.9219976663589478),
 ('ordered', 0.9208730459213257),
 ('parting', 0.9206583499908447),
 ('perceive', 0.9204188585281372),
 ('impossible', 0.920349657535553)]

In [21]:
wvec.wv.most_similar('shire', topn=25)

[('ringwraiths', 0.9055027365684509),
 ('enemy', 0.8884632587432861),
 ('war', 0.8869511485099792),
 ('elves', 0.8822416067123413),
 ('nine', 0.881231963634491),
 ('paths', 0.8787387609481812),
 ('world', 0.8738198280334473),
 ('battle', 0.8674814105033875),
 ('fulfil', 0.8671588897705078),
 ('rohirrim', 0.8663228750228882),
 ('future', 0.86480712890625),
 ('gondor', 0.8631766438484192),
 ('windings', 0.8627133369445801),
 ('ores', 0.8615931272506714),
 ('beornings', 0.8596514463424683),
 ('city', 0.8558559417724609),
 ('isengard', 0.8553903698921204),
 ('outlands', 0.8550843000411987),
 ('custom', 0.8544521331787109),
 ('parts', 0.8540695905685425),
 ('rivendell', 0.8529644012451172),
 ('peace', 0.8524543642997742),
 ('riders', 0.8520135879516602),
 ('bred', 0.8506495952606201),
 ('ents', 0.8503402471542358)]

In [22]:
skipg.wv.most_similar('shire', topn=25)

[('nine', 0.8754252791404724),
 ('truth', 0.8732672929763794),
 ('middle-earth', 0.8700699806213379),
 ('rivendell', 0.8648386001586914),
 ('perished', 0.8646013736724854),
 ('galadhrim', 0.8599073886871338),
 ('quest', 0.8585939407348633),
 ('defence', 0.8573663234710693),
 ('halflings', 0.857085645198822),
 ('children', 0.8564490675926208),
 ('story', 0.8534996509552002),
 ('legends', 0.8528523445129395),
 ('order', 0.8528256416320801),
 ('history', 0.8527851104736328),
 ('peril', 0.8516143560409546),
 ('mirkwood', 0.8510025143623352),
 ('tales', 0.8496300578117371),
 ('bree', 0.849562406539917),
 ('count', 0.8490866422653198),
 ('doings', 0.8482289910316467),
 ('lothlurien', 0.8458220958709717),
 ('hunters', 0.8453497290611267),
 ('wilderness', 0.8449330925941467),
 ('news', 0.8446534872055054),
 ('perils', 0.8430689573287964)]

In [23]:
wvec.wv.most_similar('ring', topn=25)

[('enemy', 0.8938414454460144),
 ('journey', 0.88094562292099),
 ('company', 0.8700255155563354),
 ('word', 0.8694908022880554),
 ('story', 0.8671610355377197),
 ('strength', 0.8656622171401978),
 ('place', 0.8581945300102234),
 ('work', 0.8575025796890259),
 ('council', 0.8515850305557251),
 ('just', 0.8482983708381653),
 ('tale', 0.8472678661346436),
 ('parting', 0.8464469909667969),
 ('watch', 0.8458412885665894),
 ('choice', 0.8444128036499023),
 ('day', 0.8408998250961304),
 ('beginning', 0.8391226530075073),
 ('home', 0.8389843702316284),
 ('ending', 0.8388870358467102),
 ('burden', 0.8375438451766968),
 ('same', 0.8366794586181641),
 ('given', 0.8358065485954285),
 ('power', 0.8343549370765686),
 ('way', 0.8336449861526489),
 ('errand', 0.8334192037582397),
 ('quarrel', 0.8332650065422058)]

In [24]:
skipg.wv.most_similar('ring', topn=25)

[('enemy', 0.8354686498641968),
 ('burden', 0.806827187538147),
 ('council', 0.7940381765365601),
 ('strength', 0.7914930582046509),
 ('wisdom', 0.7823638916015625),
 ('fate', 0.7779597640037537),
 ('saruman', 0.7767810821533203),
 ('family', 0.7741245031356812),
 ('sauron', 0.7740589380264282),
 ('account', 0.7728832364082336),
 ('finger', 0.7724927067756653),
 ('folly', 0.771098792552948),
 ('grief', 0.7706279158592224),
 ('death', 0.7664724588394165),
 ('weight', 0.7655009031295776),
 ('treasure', 0.7619128227233887),
 ('desire', 0.7619084119796753),
 ('quest', 0.7606112360954285),
 ('master', 0.7590110301971436),
 ('word', 0.7563979029655457),
 ('final', 0.7549488544464111),
 ('madness', 0.7526630759239197),
 ('key', 0.7524664998054504),
 ('bearer', 0.7504841685295105),
 ('mood', 0.7502917051315308)]

### Print Results of Cosine Similarity

In [31]:
# CBOW results

f"Cosine similarity between 'ring' and 'story' - CBOW : ", wvec.wv.similarity('ring', 'story')

("Cosine similarity between 'ring' and 'story' - CBOW : ", 0.8671610517935457)

In [32]:
f"Cosine similarity between 'shire' and 'war' - CBOW : ", wvec.wv.similarity('shire', 'war')

("Cosine similarity between 'shire' and 'war' - CBOW : ", 0.8869511711975223)

In [33]:
# Skip-Gram results

f"Cosine similarity between 'ring' and 'story' - CBOW : ", skipg.wv.similarity('ring', 'story')

("Cosine similarity between 'ring' and 'story' - CBOW : ", 0.7298129787501337)

In [34]:
f"Cosine similarity between 'shire' and 'war' - CBOW : ", skipg.wv.similarity('shire', 'war')

("Cosine similarity between 'shire' and 'war' - CBOW : ", 0.8056954717263576)