# word2vec: How To Prep Word Vectors For Modeling

### Train Our Own Model

In [1]:
# Read in the data, clean it, split it into train and test sets, and then train a word2vec model
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

# NOTE(review): machine-specific absolute path -- point this at your local copy of spam.csv
SPAM_CSV_PATH = '/Users/JacobRaymond 1/Desktop/spam.csv'
# Fixed seed so the train/test split (and everything derived from it: the learned
# vocabulary, the test messages inspected below) is the same on every Restart & Run All
SPLIT_SEED = 42

# Load the raw file, discard the three unnamed extra columns and give the two
# remaining columns meaningful names
messages = pd.read_csv(SPAM_CSV_PATH, encoding='latin-1')
messages = messages.drop(labels=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
messages.columns = ["label", "text"]

# Turn every message into a list of tokens with gensim's simple_preprocess
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'],
                                                    test_size=0.2,
                                                    random_state=SPLIT_SEED)

# Train word2vec on the training messages only: 100-dimensional vectors, a context
# window of 5 words, and words seen fewer than min_count=2 times get no vector.
# (`size` is the gensim 3.x keyword; gensim 4 renamed it to `vector_size`.)
w2v_model = gensim.models.Word2Vec(X_train,
                                   size=100,
                                   window=5,
                                   min_count=2)

### Prep Word Vectors

In [2]:
# List every word the word2vec model learned a vector for, i.e. every word that
# occurred at least twice (min_count=2) in the training messages
learned_vocab = w2v_model.wv.index2word
learned_vocab

['you',
 'to',
 'the',
 'and',
 'in',
 'is',
 'me',
 'my',
 'it',
 'for',
 'your',
 'of',
 'call',
 'have',
 'that',
 'on',
 'now',
 'are',
 'can',
 'so',
 'not',
 'but',
 'or',
 'at',
 'get',
 'we',
 'be',
 'with',
 'do',
 'ur',
 'no',
 'will',
 'if',
 'just',
 'this',
 'lt',
 'gt',
 'how',
 'up',
 'when',
 'what',
 'from',
 'ok',
 'free',
 'go',
 'all',
 'out',
 'll',
 'am',
 'know',
 'then',
 'like',
 'come',
 'got',
 'good',
 'there',
 'time',
 'its',
 'day',
 'he',
 'was',
 'only',
 'send',
 'want',
 'as',
 'love',
 'text',
 'going',
 'txt',
 'home',
 'today',
 'about',
 'need',
 'she',
 'one',
 'by',
 'still',
 'don',
 'lor',
 'sorry',
 'stop',
 'da',
 'our',
 'see',
 'back',
 'tell',
 'reply',
 'dont',
 'new',
 'take',
 'hi',
 'her',
 'think',
 'please',
 'pls',
 'any',
 'mobile',
 'later',
 'they',
 'did',
 'here',
 'been',
 'week',
 'dear',
 'much',
 'who',
 'phone',
 'ì_',
 'oh',
 'has',
 're',
 'him',
 'wat',
 'well',
 'claim',
 'some',
 'great',
 'night',
 'where',
 'an',
 

In [4]:
# For every test message, stack the word vectors of its tokens into one 2-D array.
# Only words the model learned from the training data have a vector; any other token
# in a test message is skipped, so a message can end up with fewer vectors than tokens.
known_words = set(w2v_model.wv.index2word)  # set lookup instead of scanning the vocab list per token

# Fill an object array slot by slot: calling np.array() on a ragged list of arrays is
# rejected by recent NumPy versions, and it would build one dense 3-D array instead of
# an array of per-message arrays if every message happened to keep the same number of words.
w2v_vect = np.empty(len(X_test), dtype=object)
for row, tokens in enumerate(X_test):
    w2v_vect[row] = np.array([w2v_model.wv[tok] for tok in tokens if tok in known_words])
w2v_vect

array([array([[ 5.60949266e-01,  8.84910673e-02,  1.35544062e-01,
        -5.81646478e-03,  2.81139743e-02,  2.17227072e-01,
         1.18596286e-01,  8.94862562e-02,  6.75938046e-03,
         5.01265407e-01,  8.32193121e-02, -9.45343971e-02,
        -5.37489772e-01, -2.93896496e-01, -1.50942579e-01,
        -7.32686460e-01,  3.20114672e-01,  2.83345073e-01,
         5.38903832e-01,  1.18621238e-01, -2.37606183e-01,
        -2.35526145e-01,  2.36559659e-01,  1.00819588e-01,
        -2.71831661e-01, -4.28385854e-01, -2.41813853e-01,
        -1.21396951e-01, -5.72697818e-01, -5.04797935e-01,
         5.15912354e-01,  9.91237238e-02, -1.66614279e-01,
        -8.17288995e-01,  4.05429482e-01, -3.98451686e-01,
        -2.82906085e-01,  1.21654859e-02, -7.03095317e-01,
         2.80946456e-02, -2.21761301e-01,  1.06747262e-01,
        -5.08869824e-04,  8.03457499e-02,  2.38460213e-01,
        -6.49377644e-01, -4.48597252e-01,  3.61209661e-01,
        -3.82942799e-03,  1.93347067e-01,  6.7742

In [5]:
# Why does the number of tokens in a message differ from the number of word vectors kept for it?
for tokens, word_vecs in zip(X_test, w2v_vect):
    print(len(tokens), len(word_vecs))

# Each printed pair is (tokens in the message, word vectors found for those tokens).
# The second number varies from message to message,
# whereas a model expects the same number of features for every sample.

4 4
22 22
24 19
8 7
7 6
8 7
21 21
8 7
31 29
5 5
18 18
6 6
20 19
6 6
18 16
18 17
17 17
16 13
8 7
24 22
8 8
24 24
6 5
5 4
25 25
9 8
9 7
10 8
12 12
12 11
8 3
7 7
10 9
21 21
3 1
25 23
11 8
13 10
8 8
23 7
19 14
8 7
11 11
4 4
12 10
19 19
10 9
15 14
6 6
22 20
22 22
3 3
9 8
5 5
9 9
11 10
17 16
19 19
9 8
5 5
4 4
23 22
10 10
6 6
9 8
19 17
28 28
7 6
5 4
7 7
24 18
7 7
9 8
22 20
13 12
11 8
4 4
9 7
13 13
60 55
24 19
10 10
14 14
4 4
5 5
10 10
7 7
6 6
22 13
23 18
16 16
5 3
4 4
7 7
8 8
6 5
19 19
13 8
63 57
6 5
22 22
20 17
9 9
8 7
15 13
19 17
26 19
13 12
9 8
6 6
23 22
26 26
8 7
31 27
22 20
5 3
22 19
10 7
30 28
16 15
16 16
6 4
31 26
7 7
21 19
5 5
15 11
7 6
7 7
6 4
11 11
6 5
8 6
7 6
0 0
12 12
18 18
47 38
9 8
21 20
13 13
8 8
12 12
8 3
8 7
24 21
10 8
14 14
10 10
3 2
4 3
17 17
15 15
31 30
10 7
4 4
20 20
23 23
6 5
8 6
5 5
26 22
24 20
6 5
6 6
22 17
10 9
22 20
8 8
4 4
21 18
14 12
14 13
6 6
21 10
10 8
11 11
6 6
13 10
9 7
26 23
13 10
7 5
8 8
11 11
21 12
11 8
24 24
14 13
22 20
5 3
18 15
9 8
36 35
8 8
4 4
16 15
15 

In [6]:
# Collapse each message's stack of word vectors into a single fixed-length sentence
# vector by averaging over its words (axis 0). A message with no known words has
# nothing to average, so it is represented by an all-zero vector of the same length.
vector_size = w2v_model.wv.vector_size  # read from the model instead of hard-coding 100

w2v_vect_avg = [
    vect.mean(axis=0) if len(vect) != 0 else np.zeros(vector_size)
    for vect in w2v_vect
]

In [8]:
# Sanity check: every averaged sentence vector should now have the same length,
# however many tokens the original message contained
for tokens, sentence_vec in zip(X_test, w2v_vect_avg):
    print(len(tokens), len(sentence_vec))

4 100
22 100
24 100
8 100
7 100
8 100
21 100
8 100
31 100
5 100
18 100
6 100
20 100
6 100
18 100
18 100
17 100
16 100
8 100
24 100
8 100
24 100
6 100
5 100
25 100
9 100
9 100
10 100
12 100
12 100
8 100
7 100
10 100
21 100
3 100
25 100
11 100
13 100
8 100
23 100
19 100
8 100
11 100
4 100
12 100
19 100
10 100
15 100
6 100
22 100
22 100
3 100
9 100
5 100
9 100
11 100
17 100
19 100
9 100
5 100
4 100
23 100
10 100
6 100
9 100
19 100
28 100
7 100
5 100
7 100
24 100
7 100
9 100
22 100
13 100
11 100
4 100
9 100
13 100
60 100
24 100
10 100
14 100
4 100
5 100
10 100
7 100
6 100
22 100
23 100
16 100
5 100
4 100
7 100
8 100
6 100
19 100
13 100
63 100
6 100
22 100
20 100
9 100
8 100
15 100
19 100
26 100
13 100
9 100
6 100
23 100
26 100
8 100
31 100
22 100
5 100
22 100
10 100
30 100
16 100
16 100
6 100
31 100
7 100
21 100
5 100
15 100
7 100
7 100
6 100
11 100
6 100
8 100
7 100
0 100
12 100
18 100
47 100
9 100
21 100
13 100
8 100
12 100
8 100
8 100
24 100
10 100
14 100
10 100
3 100
4 100
17 100
15 10