# doc2vec: How To Implement doc2vec

### Train Our Own Model

In [1]:
# Read in data, clean it, and then split into train and test sets
import gensim
import pandas as pd
from sklearn.model_selection import train_test_split
# Widen column display to 100 characters so message text is readable in output
pd.set_option('display.max_colwidth', 100)

# Load the spam dataset (latin-1 encoded), drop the three "Unnamed" columns,
# and give the two remaining columns descriptive names.
messages = pd.read_csv('../../../data/spam.csv', encoding='latin-1')
messages = messages.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
messages.columns = ["label", "text"]

# Tokenize every message with gensim's simple_preprocess; the function is
# passed directly because wrapping it in a lambda adds nothing.
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)

# Hold out 20% of the rows for testing. random_state is fixed so that
# "Restart & Run All" reproduces the same split (and the same downstream
# tagged documents) on every run.
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'],
                                                    test_size=0.2,
                                                    random_state=42)

In [3]:
# First attempt: pair each training document with its positional index as a
# plain (index, tokens) tuple
tagged_docs = list(enumerate(X_train))

In [5]:
# Preview the first five entries, one per line
for entry in tagged_docs[:5]:
    print(entry)

(0, ['what', 'are', 'your', 'new', 'years', 'plans'])
(1, ['thk', 'some', 'of', 'em', 'find', 'wtc', 'too', 'far', 'weiyi', 'not', 'goin', 'rest', 'dunno', 'yet', 'ur', 'goin', 'dinner', 'den', 'might', 'able', 'to', 'join'])
(2, ['twinks', 'bears', 'scallies', 'skins', 'and', 'jocks', 'are', 'calling', 'now', 'don', 'miss', 'the', 'weekend', 'fun', 'call', 'at', 'min', 'stop', 'texts', 'call', 'nat', 'rate'])
(3, ['headin', 'towards', 'busetop'])
(4, ['double', 'mins', 'and', 'txts', 'months', 'free', 'bluetooth', 'on', 'orange', 'available', 'on', 'sony', 'nokia', 'motorola', 'phones', 'call', 'mobileupd', 'on', 'or', 'call', 'optout', 'dx'])


In [6]:
# Wrap each training document in a gensim TaggedDocument, using its positional
# index (as a one-element list) for the tag
tagged_docs = [
    gensim.models.doc2vec.TaggedDocument(words=tokens, tags=[idx])
    for idx, tokens in enumerate(X_train)
]

In [7]:
# Preview the first five entries, one per line
for entry in tagged_docs[:5]:
    print(entry)

TaggedDocument<['what', 'are', 'your', 'new', 'years', 'plans'], [0]>
TaggedDocument<['thk', 'some', 'of', 'em', 'find', 'wtc', 'too', 'far', 'weiyi', 'not', 'goin', 'rest', 'dunno', 'yet', 'ur', 'goin', 'dinner', 'den', 'might', 'able', 'to', 'join'], [1]>
TaggedDocument<['twinks', 'bears', 'scallies', 'skins', 'and', 'jocks', 'are', 'calling', 'now', 'don', 'miss', 'the', 'weekend', 'fun', 'call', 'at', 'min', 'stop', 'texts', 'call', 'nat', 'rate'], [2]>
TaggedDocument<['headin', 'towards', 'busetop'], [3]>
TaggedDocument<['double', 'mins', 'and', 'txts', 'months', 'free', 'bluetooth', 'on', 'orange', 'available', 'on', 'sony', 'nokia', 'motorola', 'phones', 'call', 'mobileupd', 'on', 'or', 'call', 'optout', 'dx'], [4]>


In [8]:
# Fit a baseline doc2vec model on the tagged training documents.
# Hyperparameters are gathered in one place so they are easy to tweak.
d2v_params = {
    'vector_size': 100,
    'window': 5,
    'min_count': 2,
}
d2v_model = gensim.models.Doc2Vec(tagged_docs, **d2v_params)

In [9]:
# What happens if we pass in a single word like we did for word2vec?
# infer_vector() rejects a bare string with a TypeError. The error is the point
# of this cell, so catch it and print it instead of letting it abort
# "Restart & Run All".
try:
    d2v_model.infer_vector('Gay')
except TypeError as err:
    print(f'{type(err).__name__}: {err}')

TypeError: Parameter doc_words of infer_vector() must be a list of strings (not a single string).

In [10]:
# Passing a list of tokens works and returns the inferred document vector.
# NOTE(review): the training tokens shown earlier are all lowercase, so 'I' is
# presumably out of vocabulary here — confirm.
sample_tokens = ['I', 'am', 'gay']
d2v_model.infer_vector(sample_tokens)

array([-0.00815558,  0.00435068,  0.00844627, -0.01270869, -0.00952795,
       -0.02947788, -0.00926636,  0.03616752, -0.00639116, -0.0120073 ,
       -0.00629653, -0.01861083,  0.00598847,  0.0107854 ,  0.0005481 ,
       -0.01934811,  0.00073374, -0.01855777,  0.00172599, -0.03146045,
        0.01091976,  0.01712501,  0.01684329, -0.01426483, -0.00590244,
        0.00376845, -0.01407465, -0.01225969, -0.01167423,  0.0015641 ,
        0.02643669,  0.00378451,  0.01154035, -0.02249581, -0.00876446,
        0.0267533 , -0.00704357, -0.01268446, -0.01390227, -0.02744064,
       -0.00872063, -0.00463759, -0.00046223, -0.02032328,  0.02243973,
       -0.00548267, -0.01710602, -0.00641512,  0.00721884,  0.00865342,
        0.00627967, -0.00907352,  0.00260557,  0.00211565, -0.00550139,
        0.01048372,  0.00533774, -0.00644588, -0.0190436 ,  0.00567503,
        0.00028225,  0.00132124, -0.00303744,  0.00214167, -0.02391867,
        0.0141179 ,  0.00289306,  0.00346482, -0.02734283,  0.02

### What About Pre-trained Document Vectors?

There are not as many pre-trained options for document vectors as there are for word vectors. There is also no easy API for loading them, as there is for `word2vec`, so using them is more time-consuming.

Pre-trained vectors from training on Wikipedia and Associated Press News can be found [here](https://github.com/jhlau/doc2vec). Feel free to explore on your own!