# doc2vec: How To Prep Document Vectors For Modeling

### Train Our Own Model

In [1]:
# Read in data, clean it, split it into train/test, and then train a doc2vec model
import os

import gensim
import pandas as pd
from sklearn.model_selection import train_test_split

pd.set_option('display.max_colwidth', 100)

# Location of spam.csv. Defaults to the original local path; set the SPAM_CSV
# environment variable to run the notebook on a different machine.
DATA_PATH = os.environ.get('SPAM_CSV', '/Users/JacobRaymond 1/Desktop/spam.csv')

# One seed shared by the train/test split and the doc2vec model so that a
# Restart & Run All does not reshuffle the data on every run.
SEED = 42

# Load the raw file, discard the three unnamed columns and give the two
# remaining columns readable names.
messages = (
    pd.read_csv(DATA_PATH, encoding='latin-1')
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
)
messages.columns = ["label", "text"]

# Turn every raw message into a list of tokens with gensim's simple_preprocess.
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)

# Hold out 20% of the messages for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'],
                                                    test_size=0.2,
                                                    random_state=SEED)

# doc2vec trains on TaggedDocument objects: each training message's tokens are
# tagged with that message's position in X_train.
tagged_docs_tr = [gensim.models.doc2vec.TaggedDocument(v, [i]) for i, v in enumerate(X_train)]

# Train 50-dimensional document vectors on the training messages only.
# NOTE: the gensim documentation states that exact run-to-run reproducibility
# additionally needs workers=1 and a fixed PYTHONHASHSEED; neither is forced here.
d2v_model = gensim.models.Doc2Vec(tagged_docs_tr,
                                  vector_size=50,
                                  window=2,
                                  min_count=2,
                                  seed=SEED)

In [2]:
# Quick reminder of what a document vector looks like: hand the trained model a
# short, already tokenized sentence and display the vector it infers.
sample_tokens = ['convert', 'words', 'to', 'vectors']
d2v_model.infer_vector(sample_tokens)

array([ 0.00549775, -0.00479138, -0.00453697,  0.00310753,  0.00496718,
       -0.0141829 , -0.00978521,  0.00475991, -0.00920011, -0.0198237 ,
        0.01978293, -0.02314577, -0.0131962 , -0.01974957, -0.01941078,
        0.02106815,  0.00937061, -0.00900033, -0.01053536, -0.0093924 ,
       -0.01030937,  0.00231917,  0.00744366,  0.00988739,  0.01410409,
        0.00065239, -0.0096357 , -0.00270245,  0.01245961, -0.01107858,
        0.02550416,  0.00590822, -0.00408787, -0.00393258,  0.0154953 ,
        0.00664163,  0.0090831 ,  0.01284913, -0.00244675, -0.0101524 ,
       -0.01219207,  0.00228777,  0.00476197,  0.00904806,  0.00158163,
        0.0142659 , -0.03730001, -0.00859052,  0.00307706, -0.01216811],
      dtype=float32)

In [3]:
# Getting the vectors ready for a machine learning model: infer one document
# vector for every tokenized message in the test set.
# NOTE(review): each inferred vector is wrapped in its own single-element list,
# so `vectors` is a list of [array] entries - confirm downstream code expects
# that extra level of nesting.
vectors = [
    [d2v_model.infer_vector(tokens)]
    for tokens in X_test
]

In [4]:
# Inspect the entry built from the first test message: a one-element list
# holding that message's inferred document vector.
vectors[0]

[array([ 0.0166454 , -0.024421  ,  0.0145875 ,  0.01300649, -0.01341675,
        -0.05900685, -0.00967271,  0.02800223, -0.00236587, -0.04638653,
         0.04136406, -0.05129437, -0.01588824, -0.04722268, -0.05877544,
         0.04227637,  0.03215004, -0.04348295, -0.03935796, -0.0240198 ,
        -0.01556802,  0.03021969,  0.03468537,  0.02838834,  0.02227793,
        -0.01700861, -0.03907032, -0.04402911,  0.06017286, -0.00885053,
         0.0482679 , -0.00273001, -0.02167987, -0.00608855,  0.02277244,
         0.03922192,  0.00222365,  0.0026599 , -0.0153467 , -0.03201162,
        -0.02324964, -0.00532325,  0.0221997 , -0.00034635, -0.01663011,
         0.04976422, -0.10098164, -0.00433089, -0.01163869, -0.04852285],
       dtype=float32)]