In [1]:
import os
os.environ['GENSIM_DATA_DIR'] = './models'

In [2]:
# pandas number format
import pandas as pd
pd.options.display.float_format = '{:.0f}'.format

In [3]:
import gensim.downloader as api

info_df = pd.DataFrame.from_dict(api.info()['models'], orient='index')
info_df[['file_size', 'base_dataset', 'parameters']].head(5)

Unnamed: 0,file_size,base_dataset,parameters
fasttext-wiki-news-subwords-300,1005007116,"Wikipedia 2017, UMBC webbase corpus and statmt...",{'dimension': 300}
conceptnet-numberbatch-17-06-300,1225497562,"ConceptNet, word2vec, GloVe, and OpenSubtitles...",{'dimension': 300}
word2vec-ruscorpora-300,208427381,Russian National Corpus (about 250M words),"{'dimension': 300, 'window_size': 10}"
word2vec-google-news-300,1743563840,Google News (about 100 billion words),{'dimension': 300}
glove-wiki-gigaword-50,69182535,"Wikipedia 2014 + Gigaword 5 (6B tokens, uncased)",{'dimension': 50}


In [4]:
model = api.load("glove-wiki-gigaword-50")



IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [5]:
v_king = model['king']
v_queen = model['queen']

print("Vector size:", model.vector_size)
print("v_king  =", v_king[:10])
print("v_queen =", v_queen[:10])
print("similarity:", model.similarity('king', 'queen'))

Vector size: 50
v_king  = [ 0.50451   0.68607  -0.59517  -0.022801  0.60046  -0.13498  -0.08813
  0.47377  -0.61798  -0.31012 ]
v_queen = [ 0.37854   1.8233   -1.2648   -0.1043    0.35829   0.60029  -0.17538
  0.83767  -0.056798 -0.75795 ]
similarity: 0.7839043


In [6]:
model.most_similar('king', topn=3)

[('prince', 0.8236179947853088),
 ('queen', 0.7839042544364929),
 ('ii', 0.7746230363845825)]

In [7]:
v_lion = model['lion']
v_nano = model['nanotechnology']

model.cosine_similarities(v_king, [v_queen, v_lion, v_nano])

array([ 0.78390425,  0.47800115, -0.25490996], dtype=float32)

In [9]:
model.most_similar(positive=['woman', 'king'], negative=['man'], topn=3)

[('queen', 0.8523604273796082),
 ('throne', 0.7664334177970886),
 ('prince', 0.7592144012451172)]

In [10]:
model.most_similar(positive=['paris', 'germany'], negative=['france'], topn=3)

[('berlin', 0.9203965663909912),
 ('frankfurt', 0.8201637268066406),
 ('vienna', 0.8182449340820312)]

In [11]:
model.most_similar(positive=['france', 'capital'], topn=1)

[('paris', 0.7835100293159485)]

In [12]:
model.most_similar(positive=['greece', 'capital'], topn=3)

[('central', 0.7972516417503357),
 ('western', 0.7565553188323975),
 ('region', 0.7500612735748291)]

In [15]:
import sqlite3

In [17]:
db_name = "./data/reddit-selfposts-ch10.db"
con = sqlite3.connect(db_name)
df = pd.read_sql("select subreddit, lemmas, text from posts_nlp", con)
con.close()

df['lemmas'] = df['lemmas'].str.lower().str.split() # lower case tokens
sents = df['lemmas'] # our training "sentences"

In [18]:
from gensim.models.phrases import Phrases, npmi_scorer
import gensim

# solved compatibility issue for Gensim 4.x
if gensim.__version__[0] > '3': # gensim 4.x string delimiter
    delim = '-'
else: # gensim 3.x - byte delimiter
    delim = b'-'

phrases = Phrases(sents, min_count=10, threshold=0.3, 
                  delimiter=delim, scoring=npmi_scorer)

In [19]:
sent = "I had to replace the timing belt in my mercedes c300".split()
phrased = phrases[sent]
print('|'.join(phrased))

I|had|to|replace|the|timing-belt|in|my|mercedes-c300


In [20]:
# solved compatibility issue for Gensim 4.x
if gensim.__version__[0] > '3': # gensim 4.x - find_phrases / string phrases

    phrase_df = pd.DataFrame(phrases.find_phrases(sents), 
                             columns =['phrase', 'score'])
    phrase_df = pd.DataFrame.from_dict(phrases.find_phrases(sents), orient='index').reset_index()
    phrase_df.columns = ['phrase', 'score']
    phrase_df = phrase_df[['phrase', 'score']].drop_duplicates() \
            .sort_values(by='score', ascending=False).reset_index(drop=True)

else: # gensim 3.x - export_phrases / byte phrases
    phrase_df = pd.DataFrame(phrases.export_phrases(sents, out_delimiter=delim), 
                             columns =['phrase', 'score'])
    phrase_df = phrase_df[['phrase', 'score']].drop_duplicates() \
        .sort_values(by='score', ascending=False).reset_index(drop=True)
    phrase_df['phrase'] = phrase_df['phrase'].map(lambda p: p.decode('utf-8'))

In [21]:
phrase_df[phrase_df['phrase'].str.contains('mercedes')] .head(3)

Unnamed: 0,phrase,score
83,mercedes-benz,1
1416,mercedes-c300,0


In [22]:
phrase_df.query('score > 0.7').sample(100)

Unnamed: 0,phrase,score
152,c-class,1
182,intake-manifold,1
98,z-axi,1
231,panoramic-sunroof,1
207,west-coast,1
...,...,...
35,keyless-entry,1
95,american-muscle,1
86,horror-story,1
120,acura-tl,1


In [25]:
phrases = Phrases(sents, min_count=10, threshold=0.7, 
                  delimiter=delim, scoring=npmi_scorer)

df['phrased_lemmas'] = df['lemmas'].map(lambda s: phrases[s])
sents = df['phrased_lemmas']

In [26]:
from gensim.models import Word2Vec

model = Word2Vec(sents,           # tokenized input sentences
                 vector_size=100, # size of word vectors (default 100)
                 window=2,        # context window size (default 5)
                 sg=1,            # use skip-gram (default 0 = CBOW)
                 negative=5,      # number of negative samples (default 5)
                 min_count=5,     # ignore infrequent words (default 5)
                 workers=4,       # number of threads (default 3)
                 epochs=5)         # number of epochs (default 5)