In [None]:
"""https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html
"""

In [1]:
import os
import gensim
# set file names for train and test data
test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
lee_train_file = os.path.join(test_data_dir, 'lee_background.cor')
lee_test_file = os.path.join(test_data_dir, 'lee.cor')

ModuleNotFoundError: No module named 'gensim'

In [15]:
# preprocess the file (tokenize, remove punctuation, lower-case)
# one line = a doc, the file = the corpus

import smart_open

def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding='iso-8859-1') as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # for training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

train_corpus = list(read_corpus(lee_train_file))
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))

n_docs = len(train_corpus)
#n_words = len(model.wv.key_to_index)

print(n_docs)
#print(n_words)

300


In [43]:
print(len(train_corpus))
print(len(test_corpus))
print("")
print(train_corpus[2])
print("")
print(test_corpus[2])  # no doc tag

300
50

TaggedDocument(['the', 'national', 'road', 'toll', 'for', 'the', 'christmas', 'new', 'year', 'holiday', 'period', 'stands', 'at', 'eight', 'fewer', 'than', 'for', 'the', 'same', 'time', 'last', 'year', 'people', 'have', 'died', 'on', 'new', 'south', 'wales', 'roads', 'with', 'eight', 'fatalities', 'in', 'both', 'queensland', 'and', 'victoria', 'western', 'australia', 'the', 'northern', 'territory', 'and', 'south', 'australia', 'have', 'each', 'recorded', 'three', 'deaths', 'while', 'the', 'act', 'and', 'tasmania', 'remain', 'fatality', 'free'], [2])

['the', 'united', 'states', 'government', 'has', 'said', 'it', 'wants', 'to', 'see', 'president', 'robert', 'mugabe', 'removed', 'from', 'power', 'and', 'that', 'it', 'is', 'working', 'with', 'the', 'zimbabwean', 'opposition', 'to', 'bring', 'about', 'change', 'of', 'administration', 'as', 'scores', 'of', 'white', 'farmers', 'went', 'into', 'hiding', 'to', 'escape', 'round', 'up', 'by', 'zimbabwean', 'police', 'senior', 'bush', 'ad

In [44]:
## build and train the model

# initialize the model
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

# build the vocabulary
model.build_vocab(train_corpus)

print(model.wv.index_to_key)  # list of the vocabulary
print("")
print(model.wv.get_vecattr('penalty', 'count'))  # number of appearance of 'penalty'


4


In [45]:
# actually training the model
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [46]:
# a few functions and tests

vector = model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])
print(vector)  # some randomization -> not always exactly the same vector
print("")

# most similar docs to doc 1
similar_doc = model.dv.most_similar(1)
print(similar_doc)
print("")

# similarity
print(model.dv.similarity(1,1))
print(model.dv.similarity(131,1))
print("")
print(model.wv.wmdistance(train_corpus[1][0], ['wind'])) # distance between a doc and a word
print("")

# get a doc vector
print(model.dv[1])

[-2.9283923e-01 -4.0880138e-01 -1.5998605e-01  2.1287189e-01
 -1.1779668e-01 -3.7680645e-02  9.3359619e-02  1.3919352e-02
 -2.4870393e-01 -1.7914987e-01  1.2609284e-01 -2.9767849e-04
  1.6952196e-02 -7.7888690e-02 -1.2198131e-01 -4.6229117e-02
  1.2895429e-01  1.9961989e-01  7.6181434e-02 -1.1011779e-01
  2.2354845e-02  6.0970351e-02  1.3074282e-01  5.7402097e-02
 -1.4611398e-02  5.4038428e-03 -2.7368551e-01  3.9636787e-02
 -1.4585263e-01  7.2339840e-02  2.8679675e-01 -7.2431043e-02
  1.2950321e-01 -6.6164799e-02  2.0271251e-01  1.2278174e-01
  7.7841920e-03 -2.2162078e-01 -1.6593821e-01  4.8543286e-02
 -9.9956535e-02 -6.4829826e-02 -6.2253837e-02 -9.0442002e-02
  1.4013019e-01  2.9873710e-02 -7.7813938e-02 -1.5028825e-01
  1.7250584e-01  1.2777558e-02]

[(223, 0.8572266101837158), (141, 0.7918969988822937), (143, 0.7896435856819153), (85, 0.7753568887710571), (51, 0.7677361369132996), (208, 0.7642424702644348), (122, 0.7624757289886475), (277, 0.760118842124939), (242, 0.7353090047836

In [47]:
## assessing the model

# test with the training set


# loop over the documents
# inferred_vector: doc vectors generated with the words in the current doc
# sims: similarity between inferred_vectors and the trained corpus
# rank: similarity rank of the original doc compared to its generated copy
#     => should be very small each time


ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    second_ranks.append(sims[1])
print(ranks)

# print the repartition of the ranks
import collections

counter = collections.Counter(ranks)
print(counter)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Counter({0: 291, 1: 9})


In [48]:
# Pick a random document from the corpus and infer a vector from the model
import random
doc_id = random.randint(0, len(train_corpus) - 1)

# Compare and print the second-most-similar document
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))

Train Document (290): «third case of mad cow disease has been confirmed in japan panel of experts at japan health ministry has confirmed that another cow has the disease officials say all meat and organs from the dairy cow will be incinerated it is japan third case of mad cow disease and ministerial spokesman says he cannot tell how many more cases will be found as nationwide test continues the government has not determined the source of the outbreak»

Similar Document (210, 0.6455407738685608): «the australian government is continuing to talk to indian authorities about man who has confessed to planning attacks against australia amongst other countries twenty eight year old mohammed afroz who undertook pilot training in australia in and has been charged with waging war against india he has also made claims about planning terrorist attacks in australia with the rialto towers in melbourne one target while the government is taking the claims seriously there is some skepticism about them 

In [49]:
# test with the testing set

# pic a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (16): «beijing has abruptly withdrawn new car registration system after drivers demonstrated an unhealthy fixation with symbols of western military and industrial strength such as fbi and senior officials have been infuriated by popular demonstration of interest in american institutions such as the fbi particularly galling was one man choice of tmd which stands for theatre missile defence us designed missile system that is regularly vilified by chinese propaganda channels»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (66, 0.6726648211479187): «argentina government has crumbled after at least people were killed and hundreds injured in nationwide riots argentina president fernando de la rua has resigned and called for national unity government with the opposition peronists the president resignation followed hours of rioting across the country people took to the streets protesting against the government economic austerity program argentina 

In [50]:
# test if wmd works here

print(test_corpus[0])
print(test_corpus[1])
print("")

distance = model.wv.wmdistance(test_corpus[0], test_corpus[1])
print("wmd:", distance)

['the', 'national', 'executive', 'of', 'the', 'strife', 'torn', 'democrats', 'last', 'night', 'appointed', 'little', 'known', 'west', 'australian', 'senator', 'brian', 'greig', 'as', 'interim', 'leader', 'shock', 'move', 'likely', 'to', 'provoke', 'further', 'conflict', 'between', 'the', 'party', 'senators', 'and', 'its', 'organisation', 'in', 'move', 'to', 'reassert', 'control', 'over', 'the', 'party', 'seven', 'senators', 'the', 'national', 'executive', 'last', 'night', 'rejected', 'aden', 'ridgeway', 'bid', 'to', 'become', 'interim', 'leader', 'in', 'favour', 'of', 'senator', 'greig', 'supporter', 'of', 'deposed', 'leader', 'natasha', 'stott', 'despoja', 'and', 'an', 'outspoken', 'gay', 'rights', 'activist']
['cash', 'strapped', 'financial', 'services', 'group', 'amp', 'has', 'shelved', 'million', 'plan', 'to', 'buy', 'shares', 'back', 'from', 'investors', 'and', 'will', 'raise', 'million', 'in', 'fresh', 'capital', 'after', 'profits', 'crashed', 'in', 'the', 'six', 'months', 'to', 

In [51]:
doc1 = train_corpus[1][0]
print(doc1)
print("")

print(model.similarity_unseen_docs(doc_words1=doc1, doc_words2=['indian']))
# as the ['indian'] doc vector is generated each time (as not in the db), the result is slightly different each time

['indian', 'security', 'forces', 'have', 'shot', 'dead', 'eight', 'suspected', 'militants', 'in', 'night', 'long', 'encounter', 'in', 'southern', 'kashmir', 'the', 'shootout', 'took', 'place', 'at', 'dora', 'village', 'some', 'kilometers', 'south', 'of', 'the', 'kashmiri', 'summer', 'capital', 'srinagar', 'the', 'deaths', 'came', 'as', 'pakistani', 'police', 'arrested', 'more', 'than', 'two', 'dozen', 'militants', 'from', 'extremist', 'groups', 'accused', 'of', 'staging', 'an', 'attack', 'on', 'india', 'parliament', 'india', 'has', 'accused', 'pakistan', 'based', 'lashkar', 'taiba', 'and', 'jaish', 'mohammad', 'of', 'carrying', 'out', 'the', 'attack', 'on', 'december', 'at', 'the', 'behest', 'of', 'pakistani', 'military', 'intelligence', 'military', 'tensions', 'have', 'soared', 'since', 'the', 'raid', 'with', 'both', 'sides', 'massing', 'troops', 'along', 'their', 'border', 'and', 'trading', 'tit', 'for', 'tat', 'diplomatic', 'sanctions', 'yesterday', 'pakistan', 'announced', 'it', 'h

In [52]:
# comparing docvec of a word and a word vector

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


# closest word neighbour of a document

docvec = model.dv[1]  # assuming such a doc-tag exists
similar_words = model.wv.most_similar(positive=[docvec])
print("Best words for doc 1:")
print(similar_words)
print("")


# comparing a single-word doc to its word

doc_vec = model.infer_vector(['indian'])
print("Best words for ['indian'] document:")
print(model.wv.most_similar(positive=[doc_vec]))
print("")

print("Similarity 'indian' (word) and ['indian'] (doc):")
word_vec = model.wv['indian']

print(cosine_similarity([doc_vec], [word_vec]))

# checking that similarity is equal to normalized scalar product
sims = model.wv.most_similar(positive=[doc_vec], topn=len(model.wv.key_to_index))
for word, sim in sims:
    if word == 'indian':
        print(sim)
# it is!
# however, its value changes, as the indian doc is generated (and slightly different) each time

Best words for doc 1:
[('tensions', 0.890010416507721), ('warplanes', 0.8727753758430481), ('kashmiri', 0.8454746603965759), ('arresting', 0.8430169224739075), ('armed', 0.8388275504112244), ('massing', 0.8366424441337585), ('palace', 0.8321982622146606), ('groups', 0.8282104134559631), ('possibly', 0.8191446661949158), ('bomber', 0.8183948397636414)]

Best words for ['indian'] document:
[('prompting', 0.8862071633338928), ('atal', 0.8838209509849548), ('airspace', 0.8808726072311401), ('la', 0.8774158358573914), ('criticised', 0.8739700317382812), ('de', 0.8736332654953003), ('funding', 0.8683246970176697), ('fernando', 0.8631479740142822), ('presidents', 0.8601347208023071), ('billion', 0.8543084263801575)]

Similarity 'indian' (word) and ['indian'] (doc):
[[0.8093329]]
0.8093329071998596


In [53]:
# save model

model.save('w2v_tuto_model')
model.wv.save('word_vectors')
model.dv.save('doc_vectors')

In [14]:
# tests on loading models
import gensim
Doc2Vec = gensim.models.doc2vec.Doc2Vec


# load the full model
model = Doc2Vec.load('w2v_tuto_model')
wv = model.wv
print(wv['indian'])
print("")
print(model.dv[1])

print("")
print(model.vector_size) # should be 50
print(model.min_count)  # should be 2
print(model.epochs)  # should be 40

[-0.08053529 -0.31170493 -0.6246018   0.08785795 -1.7932534   1.1329191
  1.5318613   0.7997973  -1.4399565  -1.753942   -0.4591222   1.6116158
  0.17891823  0.44310346 -0.42870468  1.7388625   0.48369253 -0.4495798
 -1.2298552  -0.26455963 -0.32319427  0.67901236  1.0954758  -0.41132
  0.68269557 -0.01654905 -0.50618726 -0.07899472 -1.0640057   0.6894789
  1.1052592  -0.66796803 -0.3543477  -0.19738258 -0.2954523   0.37720394
 -1.2817628   0.19632885  1.1323453   0.82990915  0.40673476  0.08475801
 -1.1504569  -0.16489862  0.22001448 -0.2515836   0.26171848  0.02284985
  1.0731535   0.86899257]

[-0.6598845  -1.6887672  -0.6530931   0.4611041  -0.9966809   1.3497047
  0.25404143  0.31817505 -2.0864186  -1.1775217  -0.9185994   1.0908557
 -0.00444157  0.44348875 -0.35954046  0.82380265  0.45617577  0.753337
 -0.39640644 -0.5452258   0.491928   -0.09105577  1.4061642  -1.2189744
  1.4214972   0.15950783 -1.0791452   0.6936278  -1.235952    0.3959979
  0.6448099  -0.09816211  0.77896786 

In [13]:
# load subparts
import gensim
Doc2Vec = gensim.models.doc2vec.Doc2Vec

model = Doc2Vec()
model.wv = model.wv.load('word_vectors')
model.dv = model.dv.load('doc_vectors')

print(model.wv['wind'])
print("")
print(model.dv[1])
print("")
print(model.dv.most_similar(1))
print("")

# doesn't remember correctly these
print(model.vector_size) # should be 50
print(model.min_count)  # should be 2
print(model.epochs)  # should be 40

[-0.95529085 -0.0025306  -0.1585522   0.10220353  0.31530416  0.15018095
  0.41613835  0.23638444 -0.01177099 -0.10067333 -0.01750907 -0.5921014
 -0.01215258 -0.53711647  0.31549335 -0.32618922  0.39495748  0.8168934
  0.09353938  0.08008084  0.02198712  0.01110487  0.49316376 -0.40439475
  0.10861848  0.09116187 -0.5288482  -0.76469976  0.27945432 -0.3553097
  0.55315834  0.17749658  0.03566943  0.329717    0.2068358   0.3937574
  0.7548316  -0.7015027   0.28713027  0.25565478  0.41316122  0.08341684
 -0.36993167 -0.5580509   0.38938388  0.50712013  0.11416388 -0.4494152
  0.41166827  0.06591243]

[-0.6598845  -1.6887672  -0.6530931   0.4611041  -0.9966809   1.3497047
  0.25404143  0.31817505 -2.0864186  -1.1775217  -0.9185994   1.0908557
 -0.00444157  0.44348875 -0.35954046  0.82380265  0.45617577  0.753337
 -0.39640644 -0.5452258   0.491928   -0.09105577  1.4061642  -1.2189744
  1.4214972   0.15950783 -1.0791452   0.6936278  -1.235952    0.3959979
  0.6448099  -0.09816211  0.7789678

In [16]:
# test on pickle

import pickle

model = Doc2Vec.load('w2v_tuto_model')

with open("pickle_w2v", 'wb') as file:
    pickle.dump(model, file)

with open('pickle_w2v', 'rb') as file:
    m_d2v = pickle.load(file)
    
    print(m_d2v.dv[1])
    print("")
    print(m_d2v.vector_size) # should be 50
    print(m_d2v.min_count)  # should be 2
    print(m_d2v.epochs)  # should be 40

    
# without using a file
model = Doc2Vec.load('w2v_tuto_model')

pickled_model = pickle.dumps(model)
print(type(pickled_model))

m_d2v = pickle.loads(pickled_model)

print(m_d2v.dv[1])
print("")
print(m_d2v.vector_size) # should be 50
print(m_d2v.min_count)  # should be 2
print(m_d2v.epochs)  # should be 40

[-0.6598845  -1.6887672  -0.6530931   0.4611041  -0.9966809   1.3497047
  0.25404143  0.31817505 -2.0864186  -1.1775217  -0.9185994   1.0908557
 -0.00444157  0.44348875 -0.35954046  0.82380265  0.45617577  0.753337
 -0.39640644 -0.5452258   0.491928   -0.09105577  1.4061642  -1.2189744
  1.4214972   0.15950783 -1.0791452   0.6936278  -1.235952    0.3959979
  0.6448099  -0.09816211  0.77896786 -0.02370657 -0.71020687  0.5951086
 -1.347144   -0.3200763   0.9068389   0.35159966  1.1433625   0.33211815
 -0.598028   -0.6451648   0.28997627  0.6139304  -0.02604933  0.49811757
  0.5400547   0.45566353]

50
2
40
<class 'bytes'>
[-0.6598845  -1.6887672  -0.6530931   0.4611041  -0.9966809   1.3497047
  0.25404143  0.31817505 -2.0864186  -1.1775217  -0.9185994   1.0908557
 -0.00444157  0.44348875 -0.35954046  0.82380265  0.45617577  0.753337
 -0.39640644 -0.5452258   0.491928   -0.09105577  1.4061642  -1.2189744
  1.4214972   0.15950783 -1.0791452   0.6936278  -1.235952    0.3959979
  0.6448099  

In [None]:
### train further a pre trained model

In [11]:
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

In [10]:
wv = api.load("glove-wiki-gigaword-50")
wv['horse']  # it is actually a KeyedVector, not the global model

array([-0.20454  ,  0.23321  , -0.59158  , -0.29205  ,  0.29391  ,
        0.31169  , -0.94937  ,  0.055974 ,  1.0031   , -1.0761   ,
       -0.0094648,  0.18381  , -0.048405 , -0.35717  ,  0.26004  ,
       -0.41028  ,  0.51489  ,  1.2009   , -1.6136   , -1.1003   ,
       -0.23455  , -0.81654  , -0.15103  ,  0.37068  ,  0.477    ,
       -1.7027   , -1.2183   ,  0.038898 ,  0.23327  ,  0.028245 ,
        1.6588   ,  0.26703  , -0.29938  ,  0.99149  ,  0.34263  ,
        0.15477  ,  0.028372 ,  0.56276  , -0.62823  , -0.67923  ,
       -0.163    , -0.49922  , -0.8599   ,  0.85469  ,  0.75059  ,
       -1.0399   , -0.11033  , -1.4237   ,  0.65984  , -0.3198   ],
      dtype=float32)

In [16]:
new_model = Word2Vec()
new_model.wv = wv

new_model.build_vocab(train_corpus, update=True)

TypeError: unhashable type: 'list'