# word2vec: How To Prep Word Vectors For Modeling

### Train Our Own Model

In [1]:
# Read in the data, clean it, split it into train and test sets, and then train a word2vec model
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

# Fixed seed so the train/test split (and the word2vec initialisation) is the same
# on every Restart & Run All; without it every output below changes between runs.
SEED = 42

# Load the spam dataset, drop the three "Unnamed" columns and name the remaining two
messages = pd.read_csv('../../../data/spam.csv', encoding='latin-1')
messages = messages.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
messages.columns = ["label", "text"]

# Tokenize each message into a list of tokens with gensim's simple_preprocess
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)

# Hold out 20% of the messages for testing
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'],
                                                    test_size=0.2,
                                                    random_state=SEED)

# Train word2vec on the training messages only.
# NOTE(review): per the gensim docs, fully deterministic training additionally needs
# workers=1 and a fixed PYTHONHASHSEED; `seed` on its own only seeds the RNG.
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=2,
                                   seed=SEED)

### Prep Word Vectors

In [7]:
# Show the first 10 words the word2vec model learned word vectors for.
# `index_to_key` is the model's vocabulary as a list of words
# (presumably ordered most-frequent first -- confirm against the gensim docs).
w2v_model.wv.index_to_key[:10]

['to', 'you', 'the', 'and', 'in', 'is', 'me', 'my', 'it', 'for']

In [3]:
# Build, for every test sentence, an array holding the word vector of each token the model knows.
# Tokens without a learned vector are skipped, so a sentence's array can be shorter than the
# sentence itself (or empty).
# Membership is checked against `key_to_index` (a dict) instead of `index_to_key` (a list):
# same result, but a constant-time lookup rather than a scan of the whole vocabulary per token.
w2v_vect = np.array([np.array([w2v_model.wv[word] for word in ls if word in w2v_model.wv.key_to_index])
                     for ls in X_test], dtype=object)

In [4]:
# Why is the length of the sentence different than the length of the sentence vector?
# Tokens the model has no vector for were filtered out when w2v_vect was built, so the
# second number (vectors kept) can be smaller than the first (tokens in the sentence).
for tokens, word_vectors in zip(X_test.iloc[:10], w2v_vect[:10]):
    print(len(tokens), len(word_vectors))

7 7
20 17
9 9
29 24
50 43
15 15
6 6
7 7
15 15
31 29


In [5]:
# Compute one fixed-length vector per sentence by averaging its word vectors element-wise.
# A sentence with no in-vocabulary words has nothing to average, so it gets an all-zero
# vector instead. The length of that fallback is read from the model rather than hard-coded,
# so it stays in sync with the `vector_size` the model was trained with.
w2v_vect_avg = [
    vect.mean(axis=0) if len(vect) != 0 else np.zeros(w2v_model.wv.vector_size)
    for vect in w2v_vect
]

In [6]:
# Are our sentence vector lengths consistent?
# Print (number of tokens in the sentence, length of its averaged vector) for the first 10 rows.
for tokens, sentence_vector in zip(X_test.iloc[:10], w2v_vect_avg[:10]):
    print(len(tokens), len(sentence_vector))

7 100
20 100
9 100
29 100
50 100
15 100
6 100
7 100
15 100
31 100
