# Play with Word2Vec using Gensim (https://radimrehurek.com/gensim/intro.html)

In [1]:
!pip install gensim



## 1. Find similar words

In [2]:
from gensim.models import KeyedVectors
from gensim import downloader

# Load the pre-trained vectors published under the name "glove-wiki-gigaword-100".
# NOTE(review): the name indicates GloVe embeddings (presumably 100-dimensional),
# not word2vec-trained ones as the notebook title suggests — confirm if that matters.
model = downloader.load("glove-wiki-gigaword-100")
# Indexing by a word yields that word's embedding vector (the float array in the output).
model['bread']



array([-0.66146  ,  0.94335  , -0.72214  ,  0.17403  , -0.42524  ,
        0.36303  ,  1.0135   , -0.14802  ,  0.25817  , -0.20326  ,
       -0.64338  ,  0.16632  ,  0.61518  ,  1.397    , -0.094506 ,
        0.0041843, -0.18976  , -0.55421  , -0.39371  , -0.22501  ,
       -0.34643  ,  0.32076  ,  0.34395  , -0.7034   ,  0.23932  ,
        0.69951  , -0.16461  , -0.31819  , -0.34034  , -0.44906  ,
       -0.069667 ,  0.35348  ,  0.17498  , -0.95057  , -0.2209   ,
        1.0647   ,  0.23231  ,  0.32569  ,  0.47662  , -1.1206   ,
        0.28168  , -0.75172  , -0.54654  , -0.66337  ,  0.34804  ,
       -0.69058  , -0.77092  , -0.40167  , -0.069351 , -0.049238 ,
       -0.39351  ,  0.16735  , -0.14512  ,  1.0083   , -1.0608   ,
       -0.87314  , -0.29339  ,  0.68278  ,  0.61634  , -0.088844 ,
        0.88094  ,  0.099809 , -0.27161  , -0.58026  ,  0.50364  ,
       -0.93814  ,  0.67576  , -0.43124  , -0.10517  , -1.2404   ,
       -0.74353  ,  0.28637  ,  0.29012  ,  0.89377  ,  0.6740

In [3]:
# Ten nearest neighbours of 'bread', returned as (word, similarity score) pairs.
model.most_similar('bread', topn=10)

[('flour', 0.7654520869255066),
 ('baked', 0.7607272863388062),
 ('cake', 0.7605516910552979),
 ('loaf', 0.7457114458084106),
 ('toast', 0.7397798895835876),
 ('cheese', 0.7374635338783264),
 ('potato', 0.7367483973503113),
 ('butter', 0.7279618978500366),
 ('potatoes', 0.7085272669792175),
 ('pasta', 0.7071876525878906)]

In [4]:
# Same query for 'beer': the ten closest words with their similarity scores.
model.most_similar('beer', topn=10)

[('drink', 0.8182137608528137),
 ('drinks', 0.7982838153839111),
 ('liquor', 0.7321157455444336),
 ('coffee', 0.7253385782241821),
 ('vodka', 0.7246952056884766),
 ('wine', 0.724450945854187),
 ('bottle', 0.717643141746521),
 ('beers', 0.6993280053138733),
 ('bottled', 0.6970836520195007),
 ('coke', 0.6947750449180603)]

In [5]:
# Proper nouns are queried in lower case, matching the lower-case words in the outputs.
model.most_similar('china', topn=10)

[('beijing', 0.8378757834434509),
 ('chinese', 0.8370456695556641),
 ('taiwan', 0.8251469135284424),
 ('mainland', 0.7728948593139648),
 ('korea', 0.7463889122009277),
 ('hong', 0.7441108226776123),
 ('japan', 0.7408212423324585),
 ('vietnam', 0.7311980724334717),
 ('shanghai', 0.7294734120368958),
 ('thailand', 0.721368670463562)]

In [6]:
# 'hongkong' written as a single token; compare with the separate 'hong' / 'kong' neighbours.
model.most_similar('hongkong', topn=10)

[('kong', 0.6236236095428467),
 ('hk', 0.5994575023651123),
 ('hong', 0.5972347855567932),
 ('hsbc', 0.5900639891624451),
 ('boc', 0.5869478583335876),
 ('telecom', 0.5810696482658386),
 ('singapore', 0.5557530522346497),
 ('cathay', 0.5556235909461975),
 ('unicom', 0.5548734068870544),
 ('citic', 0.55435711145401)]

## 2. Analogy

In [7]:
# An analogy states that the relation between w1 and w2 mirrors the one between w3 and w4:
#     (man, king)   ~ (woman, queen)
#     (italy, rome) ~ (china, beijing)
#
# most_similar() does the vector arithmetic for this: it adds the `positive` vectors,
# subtracts the `negative` ones, and lists the known vectors closest to the resulting point.
#
# Solve (man, king), (?, queen):  ? = queen + man - king
model.most_similar(
    positive=['queen', 'man'],
    negative=['king'],
)

[('woman', 0.8183383345603943),
 ('girl', 0.7466668486595154),
 ('she', 0.695443332195282),
 ('her', 0.6720750331878662),
 ('mother', 0.6705916523933411),
 ('boy', 0.6660704016685486),
 ('teenager', 0.6439697742462158),
 ('herself', 0.6383519172668457),
 ('wife', 0.6235859394073486),
 ('young', 0.6189526319503784)]

In [8]:
# Solve (rome, italy), (?, china):  ? = china + rome - italy  (expected: beijing)
model.most_similar(
    positive=['china', 'rome'],
    negative=['italy'],
)

[('beijing', 0.8046145439147949),
 ('shanghai', 0.6948860287666321),
 ('chinese', 0.686490535736084),
 ('taipei', 0.654118001461029),
 ('hu', 0.6354351043701172),
 ('jiang', 0.6279569268226624),
 ('hanoi', 0.6215198636054993),
 ('wu', 0.6035358905792236),
 ('guangzhou', 0.6007013320922852),
 ('wang', 0.5984368920326233)]

In [9]:
# The same recipe works for other analogies, e.g. comparative adjectives.
# Solve (taller, tall), (?, long):  ? = long + taller - tall
model.most_similar(
    positive=['long', 'taller'],
    negative=['tall'],
)

[('longer', 0.7338849306106567),
 ('even', 0.7002877593040466),
 ('far', 0.6744087934494019),
 ('much', 0.6737490296363831),
 ('because', 0.6579323410987854),
 ('though', 0.6556640863418579),
 ('less', 0.6539509892463684),
 ('shorter', 0.6517217755317688),
 ('too', 0.6515427231788635),
 ('better', 0.6394868493080139)]

In [10]:
# Solve (sushi, japan), (?, china):  ? = china + sushi - japan
model.most_similar(
    positive=['china', 'sushi'],
    negative=['japan'],
)

[('seafood', 0.6261202692985535),
 ('takeout', 0.5936048626899719),
 ('chefs', 0.5764648914337158),
 ('cuisine', 0.5731133222579956),
 ('dessert', 0.5686538219451904),
 ('gourmet', 0.5683862566947937),
 ('restaurant', 0.5582833290100098),
 ('cooks', 0.5500884652137756),
 ('dishes', 0.5473605990409851),
 ('delicious', 0.544467568397522)]

In [11]:
# Solve (fish, cat), (?, dog):  ? = dog + fish - cat
model.most_similar(
    positive=['dog', 'fish'],
    negative=['cat'],
)

[('meat', 0.7357233762741089),
 ('tuna', 0.6953045725822449),
 ('seafood', 0.6752336025238037),
 ('shrimp', 0.659959614276886),
 ('salmon', 0.6539468765258789),
 ('eggs', 0.6352959275245667),
 ('fishing', 0.6342913508415222),
 ('eat', 0.6289576292037964),
 ('animal', 0.6197546720504761),
 ('chicken', 0.6173664927482605)]