# word2vec: How To Prep Word Vectors For Modeling

### Train Our Own Model

In [1]:
# Read in the data, clean it, split it into train and test sets, and then train a word2vec model
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

messages = pd.read_csv('data/spam.csv', encoding='latin-1')
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]


messages['text_clean'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x))
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'], test_size=0.2)

w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=2)

### Prep Word Vectors

In [4]:
# Generate a list of words the word2vec model learned word vectors for
words = set(w2v_model.wv.index_to_key)
words

{'convey',
 'enjoyed',
 'enuff',
 'weekly',
 'yep',
 'gold',
 'thanks',
 'blur',
 'convincing',
 'prizes',
 'respect',
 'whenever',
 'kb',
 'benefits',
 'santa',
 'blame',
 'rights',
 'thnk',
 'cancel',
 'tmr',
 'mall',
 'plaza',
 'wiv',
 'inc',
 'good',
 'help',
 'called',
 'contents',
 'worlds',
 'nearly',
 'drinkin',
 'yummy',
 'challenge',
 'luxury',
 'hope',
 'reveal',
 'workin',
 'open',
 'wife',
 'spoiled',
 'prepared',
 'ym',
 'frndship',
 'discuss',
 'accordingly',
 'so',
 'askd',
 'blackberry',
 'stamps',
 'big',
 'callcost',
 'student',
 'questioned',
 'vodka',
 'evrey',
 'hold',
 'january',
 'prabha',
 'drivin',
 'calm',
 'idiot',
 'roads',
 'exact',
 'truly',
 'somtimes',
 'suprman',
 'outstanding',
 'hurry',
 'getting',
 'ad',
 'morow',
 'talk',
 'closed',
 'roommates',
 'dificult',
 'havnt',
 'sake',
 'perwksub',
 'tells',
 'make',
 'window',
 'shoppin',
 'gym',
 'wkly',
 'its',
 'thousands',
 'un',
 'lookatme',
 'okie',
 'anywhere',
 'cash',
 'women',
 'which',
 'rest',

In [6]:
# Generate aggregated sentence vectors based on the word vectors for each word in the sentence
w2v_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in w2v_model.wv.index_to_key])
                     for ls in X_test])

  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
# Why is the length of the sentence different than the length of the sentence vector?
for i, v in enumerate(w2v_vect):
    print(len(X_test.iloc[i]), len(v))

18 16
3 3
17 16
21 21
5 5
20 20
12 12
25 23
22 20
52 48
6 3
14 14
4 4
5 4
14 13
10 10
6 6
9 9
7 7
55 49
6 4
6 4
9 9
35 28
10 10
9 9
6 5
7 7
12 9
6 6
5 5
25 24
11 11
4 4
8 8
5 5
23 19
9 9
4 4
5 5
18 17
23 22
24 22
11 11
13 12
6 6
15 15
19 19
25 20
19 18
22 20
13 10
19 16
5 4
28 25
30 28
11 10
11 11
6 6
7 5
34 30
30 22
5 5
19 19
14 14
7 7
6 6
14 12
9 8
7 7
10 10
16 16
3 3
14 13
20 17
14 14
4 4
5 5
5 3
26 24
19 14
7 6
18 17
14 13
22 22
7 6
13 10
17 15
29 28
16 10
23 22
5 5
6 5
24 14
9 9
19 16
19 19
7 6
32 31
29 28
27 27
8 8
7 2
24 23
6 6
11 10
6 5
6 5
24 23
5 5
20 20
12 12
17 12
10 10
9 9
14 14
14 14
11 11
8 8
6 5
6 6
6 6
7 7
19 14
12 10
22 20
18 16
14 14
27 26
4 3
38 35
19 17
4 4
10 10
11 9
7 5
25 23
37 34
7 6
27 26
8 8
8 5
5 3
23 23
40 33
4 4
6 6
5 5
6 6
23 23
10 10
22 21
21 20
27 27
25 24
28 25
19 17
8 8
22 19
4 4
9 8
7 6
7 7
24 23
19 19
14 13
6 0
12 11
9 9
9 9
9 8
8 7
12 10
12 12
27 26
9 9
31 26
8 8
6 4
17 16
19 19
20 17
24 23
18 17
7 7
23 23
28 22
26 26
9 8
6 4
9 6
4 4
8 8
21 20
8 8


In [9]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
w2v_vect_avg = []

for vect in w2v_vect:
    if len(vect)!=0:
        w2v_vect_avg.append(vect.mean(axis=0))
    else:
        w2v_vect_avg.append(np.zeros(100))

In [10]:
# Are our sentence vector lengths consistent?
for i, v in enumerate(w2v_vect_avg):
    print(len(X_test.iloc[i]), len(v))

18 100
3 100
17 100
21 100
5 100
20 100
12 100
25 100
22 100
52 100
6 100
14 100
4 100
5 100
14 100
10 100
6 100
9 100
7 100
55 100
6 100
6 100
9 100
35 100
10 100
9 100
6 100
7 100
12 100
6 100
5 100
25 100
11 100
4 100
8 100
5 100
23 100
9 100
4 100
5 100
18 100
23 100
24 100
11 100
13 100
6 100
15 100
19 100
25 100
19 100
22 100
13 100
19 100
5 100
28 100
30 100
11 100
11 100
6 100
7 100
34 100
30 100
5 100
19 100
14 100
7 100
6 100
14 100
9 100
7 100
10 100
16 100
3 100
14 100
20 100
14 100
4 100
5 100
5 100
26 100
19 100
7 100
18 100
14 100
22 100
7 100
13 100
17 100
29 100
16 100
23 100
5 100
6 100
24 100
9 100
19 100
19 100
7 100
32 100
29 100
27 100
8 100
7 100
24 100
6 100
11 100
6 100
6 100
24 100
5 100
20 100
12 100
17 100
10 100
9 100
14 100
14 100
11 100
8 100
6 100
6 100
6 100
7 100
19 100
12 100
22 100
18 100
14 100
27 100
4 100
38 100
19 100
4 100
10 100
11 100
7 100
25 100
37 100
7 100
27 100
8 100
8 100
5 100
23 100
40 100
4 100
6 100
5 100
6 100
23 100
10 100
22 100
