In [2]:
import gensim
import pandas as pd
from sklearn.model_selection import train_test_split

pd.set_option('display.max_colwidth', 100)

messages = pd.read_csv("data/spam.csv", encoding='latin-1')
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
messages.columns = ['label', 'text']
messages['text_clean'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x))
messages.head()

Unnamed: 0,label,text,text_clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, until, jurong, point, crazy, available, only, in, bugis, great, world, la, buffet, cine, th..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, in, wkly, comp, to, win, fa, cup, final, tkts, st, may, text, fa, to, to, receive,..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, don, think, he, goes, to, usf, he, lives, around, here, though]"


In [3]:
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'], messages['label'], test_size=0.2, random_state=42)

In [6]:
print(X_train.shape)
print(X_test.shape)

(4457,)
(1115,)


In [7]:
tagged_docs = [gensim.models.doc2vec.TaggedDocument(v, [i]) for i, v in enumerate(X_train)]
tagged_docs[0]

TaggedDocument(words=['no', 'in', 'the', 'same', 'boat', 'still', 'here', 'at', 'my', 'moms', 'check', 'me', 'out', 'on', 'yo', 'half', 'naked'], tags=[0])

In [8]:
d2v_model = gensim.models.Doc2Vec(tagged_docs, vector_size=100, window=5, min_count=2)

In [9]:
d2v_model.infer_vector(['i', 'am', 'learning', 'NLP'])

array([-0.01642826,  0.0087899 ,  0.00727921,  0.00163909, -0.00260686,
       -0.03123031,  0.00052865,  0.05391577, -0.02602936, -0.00615708,
       -0.01955118, -0.02150568,  0.0069836 ,  0.0173604 ,  0.00884985,
       -0.02627418,  0.00964367, -0.02220846, -0.00315863, -0.03980689,
        0.01643363,  0.00845358,  0.01330932, -0.01494466,  0.00075984,
       -0.00826531, -0.01473922, -0.00870075, -0.01209086, -0.00403463,
        0.02912069,  0.00592557,  0.01747879, -0.00634774, -0.00648373,
        0.03498967, -0.00035372, -0.01826677, -0.01749962, -0.03224976,
        0.00222359, -0.02247489,  0.00380694, -0.01501684,  0.01033392,
       -0.00820219, -0.00511041, -0.01247929,  0.00768868,  0.01953254,
        0.00646099, -0.01438479, -0.00085483, -0.00680471, -0.01489087,
        0.01936117,  0.01085061,  0.00940092, -0.02120397,  0.00916996,
        0.00803589, -0.00912922, -0.00338499, -0.0060307 , -0.03246205,
        0.02913532, -0.0007981 ,  0.02153483, -0.0265501 ,  0.03

In [11]:
vectors = [[d2v_model.infer_vector(words)] for words in X_test]

In [12]:
vectors[0]

[array([-0.05151838,  0.01446807,  0.04297442,  0.02152314,  0.01306247,
        -0.09142785,  0.00306269,  0.16295826, -0.08739644, -0.02904986,
        -0.07639629, -0.10112891, -0.00725739,  0.03282927,  0.0106625 ,
        -0.07848337,  0.03185017, -0.07524703,  0.00520316, -0.1505978 ,
         0.02326543,  0.04620664,  0.04721679, -0.02116672,  0.00855021,
        -0.0153354 , -0.05339457, -0.01738951, -0.04247596, -0.0140232 ,
         0.07569564,  0.01535793,  0.05590167, -0.01145789, -0.04396398,
         0.08959933,  0.0123566 , -0.08512382, -0.05466197, -0.11431774,
        -0.00863689, -0.08400486, -0.00654267, -0.03630346,  0.01660048,
        -0.04184675, -0.0219517 , -0.0442448 ,  0.02756467,  0.06765153,
         0.02949198, -0.04231436,  0.01468367, -0.0058047 , -0.05899435,
         0.04045742,  0.05262645,  0.009686  , -0.04640332,  0.04331235,
         0.02419148, -0.02977659,  0.01002352, -0.02989593, -0.10468096,
         0.0903938 ,  0.00676139,  0.07438606, -0.0