In [1]:
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

# Fixed seed for the train/test split so that a fresh "Restart Kernel -> Run All"
# produces the same partition (and the same row order in X_train) every time.
SEED = 42

# Load the raw SMS data (read with latin-1 encoding), drop the three unnamed
# extra columns, and give the two remaining columns descriptive names.
messages = pd.read_csv('Data/spam.csv', encoding='latin-1')
messages = messages.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
messages.columns = ['label', 'text']

# Tokenize each message with gensim's simple_preprocess; the resulting token
# lists are stored in a new 'text_clean' column.
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)

# Hold out 20% of the messages for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    messages['text_clean'],
    messages['label'],
    test_size=0.2,
    random_state=SEED,
)

messages.head()

Unnamed: 0,label,text,text_clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, until, jurong, point, crazy, available, only, in, bugis, great, world, la, buffet, cine, th..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, in, wkly, comp, to, win, fa, cup, final, tkts, st, may, text, fa, to, to, receive,..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, don, think, he, goes, to, usf, he, lives, around, here, though]"


In [2]:
# Wrap every training message's token list in a TaggedDocument, tagged with its
# position in X_train, so the collection can be used to train the model
tagged_docs = [
    gensim.models.doc2vec.TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(X_train)
]

In [3]:
# Display the first tagged document to inspect its words/tags structure
tagged_docs[0]

TaggedDocument(words=['stop', 'calling', 'everyone', 'saying', 'might', 'have', 'cancer', 'my', 'throat', 'hurts', 'to', 'talk', 'can', 'be', 'answering', 'everyones', 'calls', 'if', 'get', 'one', 'more', 'call', 'not', 'babysitting', 'on', 'monday'], tags=[0])

In [4]:
# Fit a baseline Doc2Vec model on the tagged training documents
d2v_model = gensim.models.Doc2Vec(
    documents=tagged_docs,
    vector_size=100,  # dimensionality of each document vector
    window=5,         # context window size used during training
    min_count=2,      # ignore words that occur fewer than 2 times
)

In [5]:
# What happens if we pass in a single word like we did for word2vec?
# infer_vector() rejects a bare string with a TypeError. The error is the point
# of this cell, so catch it and print its message instead of letting the
# exception stop a "Restart Kernel -> Run All" of the notebook.
try:
    d2v_model.infer_vector('text')
except TypeError as err:
    print('TypeError:', err)

TypeError: Parameter doc_words of infer_vector() must be a list of strings (not a single string).

In [6]:
# Passing a list of tokens (rather than a bare string) returns the inferred
# document vector for that token sequence
d2v_model.infer_vector(doc_words=['i', 'am', 'learning', 'nlp'])

array([ 9.60972160e-04,  1.21568656e-02, -4.75017494e-03,  5.06986445e-03,
        1.13065131e-02, -9.98045504e-03,  8.18353333e-03, -4.55971994e-03,
       -1.27405450e-02,  6.71711890e-03,  9.04162735e-05,  4.31471411e-03,
        2.74440111e-03, -1.09607177e-02,  6.80300919e-03, -3.77165480e-03,
       -4.68598586e-03,  1.77999469e-03,  8.01777281e-03,  1.17838643e-02,
       -6.79766107e-03,  1.13808913e-02,  4.60355124e-03,  1.17149567e-02,
        9.47538204e-03, -1.13394903e-03, -2.11673509e-03, -5.77386981e-03,
       -7.63371028e-03,  1.27107627e-03, -9.27049387e-03, -9.84835345e-03,
       -3.91052198e-03,  1.94672134e-03, -1.17537528e-02, -1.08299227e-02,
        4.29700688e-03, -4.04704222e-03,  1.30519678e-03,  1.24423904e-02,
       -6.43567974e-03, -3.03326105e-03, -4.97345021e-03, -6.50324393e-03,
       -1.36413714e-02, -3.59274913e-03,  5.66283334e-03, -1.03945490e-02,
       -1.50131423e-03, -2.10987730e-03, -5.00960741e-03, -1.08786151e-02,
        5.16096130e-03, -