# Word embeddings
We will use the [Gensim](https://github.com/RaRe-Technologies/gensim) library to load and play around with the word2vec word embeddings.

In [0]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
import gensim.downloader as api
import warnings
# Suppress warning output for the rest of the notebook to keep cell outputs short.
# NOTE(review): "ignore" with no category/module filter hides every warning,
# including deprecation notices — consider narrowing the filter.
warnings.filterwarnings("ignore") 

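Download the pretrained word2vec embeddings released by Google (the `word2vec-google-news-300` model: 300-dimensional vectors trained on Google News).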

In [0]:
# Load the pretrained Google News word2vec vectors via gensim's downloader API.
# Every later cell looks words up through this `wv` object.
wv = api.load('word2vec-google-news-300')
# Alternative model, left disabled:
# wv = api.load('word2vec-ruscorpora-300') #Smaller corpus (text data)
# NOTE(review): presumably a Russian-language corpus, so the English lookups
# below would likely need different query words — confirm before enabling.



# Basics

In [0]:
# wv["some_word"] returns the vector for "some_word"
vector = wv["programming"]
print(vector)

[-0.16699219 -0.00335693  0.10498047  0.2578125   0.05566406 -0.0703125
  0.2890625   0.06494141 -0.09179688 -0.17578125 -0.18652344  0.02709961
 -0.31835938  0.07421875 -0.04150391  0.0703125  -0.05761719  0.08349609
  0.26757812 -0.21191406  0.00878906  0.22265625  0.16992188  0.28515625
  0.02636719 -0.16796875  0.09277344  0.17089844 -0.06787109 -0.19726562
 -0.03564453 -0.012146   -0.03588867  0.24511719  0.11328125 -0.34179688
  0.05395508  0.19238281  0.18261719 -0.03295898  0.02636719 -0.02099609
  0.28320312  0.09570312 -0.36328125  0.11914062 -0.12011719  0.08789062
  0.10449219  0.04785156 -0.11816406 -0.33789062 -0.11132812  0.10644531
 -0.0111084  -0.16992188  0.02307129 -0.08056641 -0.28125     0.03149414
  0.109375   -0.11865234 -0.07666016 -0.05029297 -0.01257324 -0.00491333
 -0.05444336  0.07470703  0.00265503 -0.0324707   0.15625    -0.15625
 -0.05932617 -0.14257812  0.01623535  0.07275391  0.11328125 -0.36914062
  0.02600098  0.0267334   0.15332031  0.10839844 -0.177

In [0]:
# Find the dimension of vector (it's 300)
print(len(vector))

300


In [0]:
# Find the words whose vectors are most similar to a given vector.
# The query here is the vector of "programming" itself, so that word comes back
# first with similarity 1.0, followed by its nearest neighbours.
wv.most_similar(positive=[vector])

[('programming', 1.0),
 ('programing', 0.8606169819831848),
 ('Programming', 0.6899746060371399),
 ('NLP_neuro_linguistic', 0.6174068450927734),
 ('broadcasts', 0.5984179973602295),
 ('primetime_programming', 0.5968459248542786),
 ('ensnare_accelerators', 0.5962189435958862),
 ('telecasts', 0.5839172601699829),
 ('sublicensing_rights', 0.5721215009689331),
 ('primetime', 0.5611954927444458)]

In [0]:
def print_similar_words(word=None, wv=None):
    """Print the nearest neighbours of ``word`` in the embedding space.

    Each neighbour is printed on its own line as a ``(word, similarity)``
    tuple, in the order returned by ``wv.most_similar``. The lookup is done
    with the word's *vector* rather than the word itself, so the query word
    is normally listed first with similarity 1.0.

    Args:
        word: Word to look up. When ``None``, prints "No word given".
        wv: Embedding object supporting ``wv[word]`` and
            ``wv.most_similar(positive=[...])``. When omitted, the
            notebook-level ``wv`` is used; it is resolved at call time so a
            model re-loaded in a later cell is picked up without re-running
            this definition.

    Returns:
        None. Results (or a short message) are written to stdout.
    """
    if word is None:
        print("No word given")
        return

    if wv is None:
        # Late binding: read the module-level model on every call instead of
        # freezing whichever object existed when this cell was executed.
        wv = globals()["wv"]

    try:
        query_vector = wv[word]
    except KeyError:
        # Out-of-vocabulary word: report it instead of dumping a traceback.
        print("{!r} is not in the vocabulary".format(word))
        return

    for neighbour in wv.most_similar(positive=[query_vector]):
        print(neighbour)

In [0]:
print_similar_words("test")

('test', 1.0)
('tests', 0.8223524689674377)
('testing', 0.7770693898200989)
('tested', 0.6987471580505371)
('Testing', 0.6231584548950195)
('testings', 0.5537309646606445)
('Chlamydia_Rapid', 0.5230551362037659)
('extramarital_affairs_flunk', 0.5202082395553589)
('exam', 0.5065851211547852)
('pretests', 0.5008640885353088)


In [0]:
print_similar_words("coffee") #Food

('coffee', 1.0)
('coffees', 0.721267819404602)
('gourmet_coffee', 0.7057087421417236)
('Coffee', 0.6900455951690674)
('o_joe', 0.6891065835952759)
('Starbucks_coffee', 0.6874972581863403)
('coffee_beans', 0.6749705076217651)
('latté', 0.664122462272644)
('cappuccino', 0.6625496745109558)
('brewed_coffee', 0.6621608734130859)


In [0]:
print_similar_words("hockey") #Sports

('hockey', 1.0)
('Hockey', 0.7227486371994019)
('Ice_Hockey', 0.6408184170722961)
('lacrosse', 0.6390798091888428)
('peewee_hockey', 0.6332175135612488)
('soccer', 0.6270937323570251)
('Hockey_League', 0.6250644326210022)
('pee_wee_hockey', 0.6238211393356323)
('basketball', 0.6131463646888733)
('midget_hockey', 0.6043297052383423)


In [0]:
print_similar_words("Beethoven") #Music composers

('Beethoven', 1.0)
('Mozart', 0.8049119710922241)
('Brahms', 0.801065981388092)
('Tchaikovsky', 0.7523819208145142)
('Liszt', 0.7483128309249878)
('JS_Bach', 0.7367250323295593)
('Rachmaninoff', 0.7326319217681885)
('Shostakovich', 0.7264085412025452)
('Debussy', 0.7217074632644653)
('Scriabin', 0.7209811210632324)


In [0]:
print_similar_words("Shakira") #Music artists

('Shakira', 1.0)
('singer_Shakira', 0.642167866230011)
('Beyonce', 0.6416192650794983)
('Fijación_Oral_Vol', 0.617068886756897)
('Alicia_Keys', 0.599069356918335)
('Enrique_Iglesias', 0.5965898036956787)
('Beyoncé', 0.5943112373352051)
('manager_Fifi_Kurzman', 0.5922718644142151)
('Nelly_Furtado', 0.5856097936630249)
('Cape_Town_Freshlyground', 0.5709559917449951)


In [0]:
print_similar_words("love") #Abstract concepts

('love', 1.0)
('loved', 0.6907791495323181)
('adore', 0.6816873550415039)
('loves', 0.6618633270263672)
('passion', 0.6100709438323975)
('hate', 0.600395679473877)
('loving', 0.5886635780334473)
('Ilove', 0.5702950954437256)
('affection', 0.5664337873458862)
('undying_love', 0.5547305345535278)


In [0]:
print_similar_words("Vaibhav") #Names

('Vaibhav', 1.0)
('Prashant', 0.7736366987228394)
('Gaurav', 0.7660701870918274)
('Rohit', 0.7647777795791626)
('Rajesh', 0.7497475743293762)
('Sumit', 0.7495461702346802)
('Deepak', 0.7480446100234985)
('Saurabh', 0.7463053464889526)
('Vinay', 0.7428852915763855)
('Vishal', 0.7413797378540039)


# Analogies

In [0]:
# India : Delhi :: Germany : ??
# Solved with vector arithmetic: Germany - India + Delhi.
# The query words themselves can show up among the results (see the output).
vec = wv['Germany'] - wv['India'] + wv['Delhi'] # Capitals
wv.most_similar(positive=[vec])

[('Berlin', 0.7211145162582397),
 ('Germany', 0.6761950254440308),
 ('Dusseldorf', 0.6079208850860596),
 ('Delhi', 0.6058413982391357),
 ('Leipzig', 0.6056665778160095),
 ('Frankfurt', 0.5892257690429688),
 ('Cologne', 0.5859718322753906),
 ('Munich', 0.5853998064994812),
 ('Hamburg', 0.581586480140686),
 ('Münster', 0.5783610343933105)]

In [0]:
# life : death :: white : ??
# Solved with vector arithmetic: white - life + death.
vec = wv['white'] - wv['life'] + wv['death'] # Philosophy
wv.most_similar(positive=[vec])

[('white', 0.6257379055023193),
 ('death', 0.5636220574378967),
 ('black', 0.5485159158706665),
 ('deaths', 0.4514094293117523),
 ('nooses_dangling', 0.4427206516265869),
 ('blue', 0.4416115880012512),
 ('slaying', 0.43937206268310547),
 ('homicide', 0.4355132579803467),
 ('bloodstained_robe', 0.4230232238769531),
 ('murder', 0.42059534788131714)]

In [0]:
# man : uncle :: woman : ??
# Solved with vector arithmetic: uncle - man + woman.
vec = wv['uncle'] - wv['man'] + wv['woman'] # Gender
wv.most_similar(positive=[vec])

[('uncle', 0.8322185277938843),
 ('aunt', 0.8243412971496582),
 ('mother', 0.8033039569854736),
 ('niece', 0.7899689674377441),
 ('father', 0.7580469846725464),
 ('grandmother', 0.7534991502761841),
 ('daughter', 0.7406191229820251),
 ('husband', 0.7206656336784363),
 ('nephew', 0.7067581415176392),
 ('sister', 0.7060453295707703)]

In [0]:
# quick : quickest :: fast : ??
# Solved with vector arithmetic: fast - quick + quickest.
vec = wv['fast'] - wv['quick'] + wv['quickest'] # Degrees of adjectives
wv.most_similar(positive=[vec])

[('quickest', 0.7896946668624878),
 ('fastest', 0.7499547600746155),
 ('fast', 0.6063127517700195),
 ('slowest', 0.5907609462738037),
 ('faster', 0.5143135786056519),
 ('2min_##.###sec', 0.5023351907730103),
 ("#'##_.7", 0.49874669313430786),
 ('#m##.###_seconds', 0.4948148727416992),
 ('#.###secs_slower', 0.4833757281303406),
 ('1min_##.###s_##.##mph', 0.47246843576431274)]

## Bias!
![](https://i.imgur.com/4WuyKeJ.png)
- Bolukbasi et al., 2016

In [0]:
# man : computer_programmer :: woman : ??
# Same analogy arithmetic as above: computer_programmer - man + woman.
vec = wv['computer_programmer'] - wv['man'] + wv['woman']
wv.most_similar(positive=[vec])

[('computer_programmer', 0.9105811715126038),
 ('homemaker', 0.5771316289901733),
 ('schoolteacher', 0.5500192046165466),
 ('graphic_designer', 0.5464699268341064),
 ('mechanical_engineer', 0.539836585521698),
 ('electrical_engineer', 0.533705472946167),
 ('housewife', 0.5274524688720703),
 ('programmer', 0.5096209049224854),
 ('businesswoman', 0.5029540657997131),
 ('keypunch_operator', 0.4974639415740967)]

In [0]:
# man : doctor :: woman : ??
# Same analogy arithmetic as above: doctor - man + woman.
vec = wv['doctor'] - wv['man'] + wv['woman']
wv.most_similar(positive=[vec])

[('doctor', 0.883492112159729),
 ('gynecologist', 0.7276507019996643),
 ('nurse', 0.6698512434959412),
 ('physician', 0.6674121022224426),
 ('doctors', 0.6649492979049683),
 ('pediatrician', 0.6398377418518066),
 ('nurse_practitioner', 0.6237459778785706),
 ('obstetrician', 0.6188926696777344),
 ('midwife', 0.6041982769966125),
 ('dentist', 0.5999662280082703)]