# Building paragraph vectors using Doc2Vec

### Import common text corpus, Doc2Vec algorithm and Tagged Document functionality from Gensim

In [1]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

### Corpus on which training will happen

In [2]:
common_texts
#Here's our training corpus:

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

### Building Tagged Documents from the corpus 

In [3]:
#We will now convert the tokenized documents into TaggedDocument format and validate this:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]

In [4]:
#Here is our corpus in the TaggedDocument form:
documents

[TaggedDocument(words=['human', 'interface', 'computer'], tags=[0]),
 TaggedDocument(words=['survey', 'user', 'computer', 'system', 'response', 'time'], tags=[1]),
 TaggedDocument(words=['eps', 'user', 'interface', 'system'], tags=[2]),
 TaggedDocument(words=['system', 'human', 'system', 'eps'], tags=[3]),
 TaggedDocument(words=['user', 'response', 'time'], tags=[4]),
 TaggedDocument(words=['trees'], tags=[5]),
 TaggedDocument(words=['graph', 'trees'], tags=[6]),
 TaggedDocument(words=['graph', 'minors', 'trees'], tags=[7]),
 TaggedDocument(words=['graph', 'minors', 'survey'], tags=[8])]

### Building a basic Doc2Vec model

In [5]:
#let's build and train a basic Doc2Vec model 
model = Doc2Vec(documents, vector_size=5, min_count=1, workers=4, epochs = 40)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

### What's the vector size?

In [6]:
#validate the vector size for the document embeddings
model.vector_size

5

### How many document vectors did we train?

In [7]:
#check whether the number of document vectors being built is equal to the number of documents 
#being used in the training process:
len(model.docvecs)

  len(model.docvecs)


9

### Let's check out the vocabulary information for the model we built

In [8]:
len(model.wv.vocab)
#len(model.wv.index_to_key)

AttributeError: The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
Use KeyedVector's .key_to_index dict, .index_to_key list, and methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.
See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4

In [None]:
#model.wv.index_to_key
model.wv.vocab
#Here's our vocabulary

### Let's infer a vector based on the trained Doc2Vec model

In [9]:
vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[ 0.0252012   0.09010135 -0.07160246 -0.04264753  0.08760676]


In [10]:
vector = model.infer_vector(['user', 'interface', 'for', 'computer', 'trees'])
print(vector)

[ 0.042032   -0.02170439  0.06077698  0.03611257 -0.04993226]


### Building a new model changing vector size and minimum count eligibility

In [11]:
model = Doc2Vec(documents, vector_size=50, min_count=3, epochs=40)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [12]:
len(model.wv.vocab)
#len(model.wv.index_to_key)

AttributeError: The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
Use KeyedVector's .key_to_index dict, .index_to_key list, and methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.
See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4

In [13]:
model.wv.vocab

AttributeError: The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
Use KeyedVector's .key_to_index dict, .index_to_key list, and methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.
See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4

In [14]:
vector1 = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector1)

[ 0.00275315  0.00927881 -0.00654727 -0.00375533  0.00865707  0.0030918
 -0.00983206  0.00781086 -0.00534719  0.0023765   0.00832855  0.00914602
  0.00722278 -0.0080494   0.00907288 -0.00941162 -0.00481918 -0.00206123
 -0.00371853 -0.00747079  0.00799334  0.0063884  -0.00626846  0.00345137
  0.00894975  0.00576471  0.00202684  0.00142986 -0.00124991  0.00473358
  0.00555094 -0.00711092 -0.00405471 -0.00299963 -0.00878438 -0.00835046
 -0.00179125  0.00224573 -0.00236428 -0.00667231  0.00669324  0.00735251
  0.00543838  0.001361    0.00751761  0.00958376 -0.00315277  0.00431433
  0.00614541 -0.00591232]


In [15]:
len(vector1)

50

In [16]:
vector2 = model.infer_vector(['user', 'interface', 'for', 'computer', 'trees'])
print(vector2)

[ 5.4323128e-03 -1.0688989e-03  8.0083860e-03  4.7749323e-03
 -5.6397715e-03 -5.3514340e-03 -4.8345756e-03 -3.1998379e-03
 -3.6909492e-03 -4.4625914e-03  7.7370653e-04  7.4750911e-03
  5.1512723e-03 -4.9892073e-03  3.3175389e-03  9.6221659e-03
 -5.9786118e-03 -3.5752922e-03 -6.7621320e-03  4.2291293e-03
  5.9762462e-03  1.7234727e-03 -8.5616549e-03 -2.6883094e-03
  6.2798010e-03  6.8307105e-03 -4.2846720e-03  9.9157430e-03
  3.5272821e-04  6.7129666e-03 -2.9019930e-03 -5.7344292e-03
  8.7347096e-03 -3.2570078e-03  7.0166835e-03 -8.3021913e-03
 -1.1807675e-03 -6.5778745e-03  2.4815386e-03 -3.3156641e-04
  7.7153952e-03 -5.0899419e-03  3.1003896e-05 -9.2797363e-03
  8.9662205e-03  7.0243294e-04  1.7250769e-04  7.1954587e-04
 -1.3388265e-03 -7.9992237e-03]


In [17]:
len(vector2)

50

## There are two approaches to build paragraph vectors: the PV-DM and PV-DBOW

### Doc2Vec based on the distributed memory model (dm=1)

In [18]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, dm=1)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [19]:
vector1 = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector1)

[ 0.00211574  0.00915337 -0.00693545 -0.00377622  0.00843657  0.00327316
 -0.01007721  0.00827041 -0.00596042  0.00268909  0.00865541  0.0096917
  0.00745454 -0.00838413  0.00936548 -0.00994017 -0.00480654 -0.00179974
 -0.00436808 -0.00758565  0.00761374  0.00643536 -0.00654957  0.00362568
  0.00885319  0.00580416  0.00164739  0.00079204 -0.00116876  0.00446676
  0.00611638 -0.00664235 -0.0047061  -0.00312531 -0.00881625 -0.00792156
 -0.00191989  0.00165224 -0.00216513 -0.0067455   0.00625954  0.00778112
  0.00525368  0.00075871  0.00718643  0.00992841 -0.0028768   0.0042999
  0.00656147 -0.00607138]


In [20]:
len(vector1)

50

### Doc2Vec built next would be based on the distributed bag of words approach (dm=0)

In [21]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, dm=0)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [22]:
vector2 = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector2)

[ 7.96577951e-04  8.92572477e-03 -7.53967091e-03 -4.11836011e-03
  7.97703955e-03  3.69162462e-03 -1.05823399e-02  9.33239609e-03
 -6.61957497e-03  3.48863285e-03  9.06384178e-03  1.06493644e-02
  7.93359801e-03 -8.85376055e-03  1.00990785e-02 -1.06949601e-02
 -5.47884498e-03 -1.62192597e-03 -5.00717293e-03 -7.37058837e-03
  6.98664133e-03  6.48449687e-03 -7.31107127e-03  3.67962942e-03
  8.58085137e-03  6.02903264e-03  1.18653907e-03 -3.68781446e-04
 -1.16685941e-03  4.25128406e-03  7.17442296e-03 -5.65095711e-03
 -5.95077593e-03 -2.76214327e-03 -9.03856475e-03 -6.83999807e-03
 -2.41128053e-03  9.68245498e-04 -1.83563703e-03 -6.96342764e-03
  5.50191477e-03  8.62030592e-03  4.91271028e-03 -6.57548080e-05
  6.12137699e-03  1.07721910e-02 -2.29483400e-03  4.46516369e-03
  7.25069223e-03 -6.37954660e-03]


In [23]:
len(vector2)

50

### Adding the window size which controls the maximum distance between current and predicted word

In [24]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=0)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [25]:
vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[ 7.96577951e-04  8.92572477e-03 -7.53967091e-03 -4.11836011e-03
  7.97703955e-03  3.69162462e-03 -1.05823399e-02  9.33239609e-03
 -6.61957497e-03  3.48863285e-03  9.06384178e-03  1.06493644e-02
  7.93359801e-03 -8.85376055e-03  1.00990785e-02 -1.06949601e-02
 -5.47884498e-03 -1.62192597e-03 -5.00717293e-03 -7.37058837e-03
  6.98664133e-03  6.48449687e-03 -7.31107127e-03  3.67962942e-03
  8.58085137e-03  6.02903264e-03  1.18653907e-03 -3.68781446e-04
 -1.16685941e-03  4.25128406e-03  7.17442296e-03 -5.65095711e-03
 -5.95077593e-03 -2.76214327e-03 -9.03856475e-03 -6.83999807e-03
 -2.41128053e-03  9.68245498e-04 -1.83563703e-03 -6.96342764e-03
  5.50191477e-03  8.62030592e-03  4.91271028e-03 -6.57548080e-05
  6.12137699e-03  1.07721910e-02 -2.29483400e-03  4.46516369e-03
  7.25069223e-03 -6.37954660e-03]


### Adding initial learning rate and to what value should the learning rate drop to linearly over training (alpha and min_alpha)

In [26]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=1, alpha=0.3, min_alpha=0.05)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [27]:
vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-0.30997148 -0.05419222 -0.20869155 -0.0094907  -0.12514514  0.12009168
 -0.17600422  0.1918814  -0.27752745  0.12041155  0.24283369  0.27641186
  0.1240024  -0.25481758  0.18184145 -0.3541592  -0.08445887  0.13247225
 -0.34137362 -0.04781209 -0.0639575   0.0759771  -0.176685    0.16863742
 -0.03636871  0.00173062 -0.20881394 -0.28400123  0.05654295 -0.1837985
  0.33333668  0.2160777  -0.35022303 -0.07953406 -0.09522511  0.1714073
 -0.02705434 -0.17897056 -0.02569358 -0.01905384 -0.11683214  0.14928779
  0.00940733 -0.31112286 -0.06431808  0.1757684   0.18200447 -0.00736232
  0.24387175 -0.18112005]


In [28]:
vector1 = model.infer_vector(['user', 'interface', 'for', 'computer', 'trees'])
print(vector1)

[-0.3183066  -0.09446383 -0.2057174   0.03354369 -0.10750768  0.09016126
 -0.15225726  0.17208654 -0.2296918   0.10481628  0.19542044  0.2910848
  0.09422569 -0.18281795  0.14324011 -0.20875874 -0.0382637   0.14597176
 -0.2458414   0.0191445  -0.14056654  0.05110768 -0.17930299  0.12107011
  0.02017868 -0.01917686 -0.19636363 -0.26353478  0.03660677 -0.12466112
  0.19426861  0.19536856 -0.28239316 -0.02848074 -0.07285984  0.20085602
 -0.04884389 -0.16789638 -0.08010467 -0.02761406 -0.13304755  0.07684526
 -0.08604465 -0.26050475 -0.033079    0.16747399  0.0999692   0.02130131
  0.23142117 -0.19943373]


### Adding the dm_concat parameter to use concatenation of the word vectors

In [29]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=1, alpha=0.3, min_alpha=0.05, dm_concat=1)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [30]:
vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[ 0.04736445 -0.19152007 -0.1187445   0.00652415  0.1621975   0.07260685
  0.01800431 -0.25172278  0.00330857  0.16784172 -0.09943117 -0.00070348
 -0.17003772 -0.17329094 -0.00398928 -0.16182724  0.03845131  0.0763863
 -0.04634567 -0.20069659  0.01315746  0.02568795 -0.09783111  0.03022138
 -0.22853899 -0.07779045 -0.26734018 -0.01142945  0.0744584  -0.16209894
  0.18346117 -0.03536795  0.05940613 -0.20905031  0.08863018 -0.15960747
  0.05738065 -0.11140674 -0.00863791  0.0303864   0.21128744 -0.00946644
  0.22708647 -0.21699703  0.05482844 -0.06493609  0.04571979 -0.00197637
  0.01464513 -0.15476249]


In [31]:
vector1 = model.infer_vector(['user', 'interface', 'for', 'computer', 'trees'])
print(vector1)

[ 0.04323404 -0.16983673 -0.07961286  0.07344522  0.1641275   0.02981976
  0.02511325 -0.26060644  0.03541024  0.10010695 -0.0991822  -0.02858623
 -0.18646541 -0.15313089 -0.00816079 -0.13503584  0.05226213  0.0659497
 -0.01745317 -0.13549705  0.02619831  0.03481792 -0.09810316  0.10668296
 -0.1635071  -0.08974929 -0.2531391   0.00346655  0.07761529 -0.19694258
  0.08558537 -0.06855388  0.10778017 -0.19971116  0.04375234 -0.1629273
  0.03714892 -0.09109904 -0.07635406  0.00886129  0.24953265 -0.06930447
  0.23476505 -0.15721983  0.1046213  -0.11670952 -0.0183713   0.0476134
 -0.01371263 -0.16642454]


### Adding the dm_mean parameter to use sum of the context word vectors (dm_mean=1)

In [32]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=1, dm_concat=0, dm_mean=1, alpha=0.3, min_alpha=0.05)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [33]:
vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-0.30997148 -0.05419222 -0.20869155 -0.0094907  -0.12514514  0.12009168
 -0.17600422  0.1918814  -0.27752745  0.12041155  0.24283369  0.27641186
  0.1240024  -0.25481758  0.18184145 -0.3541592  -0.08445887  0.13247225
 -0.34137362 -0.04781209 -0.0639575   0.0759771  -0.176685    0.16863742
 -0.03636871  0.00173062 -0.20881394 -0.28400123  0.05654295 -0.1837985
  0.33333668  0.2160777  -0.35022303 -0.07953406 -0.09522511  0.1714073
 -0.02705434 -0.17897056 -0.02569358 -0.01905384 -0.11683214  0.14928779
  0.00940733 -0.31112286 -0.06431808  0.1757684   0.18200447 -0.00736232
  0.24387175 -0.18112005]


### Adding the dm_mean parameter to use mean of the context word vectors (dm_mean=0)

In [34]:
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40, window=2, dm=1, dm_concat=0, dm_mean=0, alpha=0.3, min_alpha=0.05)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [35]:
vector = model.infer_vector(['user', 'interface', 'for', 'computer'])
print(vector)

[-0.30997148 -0.05419222 -0.20869155 -0.0094907  -0.12514514  0.12009168
 -0.17600422  0.1918814  -0.27752745  0.12041155  0.24283369  0.27641186
  0.1240024  -0.25481758  0.18184145 -0.3541592  -0.08445887  0.13247225
 -0.34137362 -0.04781209 -0.0639575   0.0759771  -0.176685    0.16863742
 -0.03636871  0.00173062 -0.20881394 -0.28400123  0.05654295 -0.1837985
  0.33333668  0.2160777  -0.35022303 -0.07953406 -0.09522511  0.1714073
 -0.02705434 -0.17897056 -0.02569358 -0.01905384 -0.11683214  0.14928779
  0.00940733 -0.31112286 -0.06431808  0.1757684   0.18200447 -0.00736232
  0.24387175 -0.18112005]
