# word2vec: How To Prep Word Vectors For Modeling

### Train Our Own Model

In [1]:
# Read in the data, clean it, split it into train and test sets, and then train a word2vec model
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

messages = pd.read_csv('../../../data/spam.csv', encoding='latin-1')
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]


messages['text_clean'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x))
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'], test_size=0.2)

w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=2)

In [23]:
messages.shape
messages.head()


Unnamed: 0,label,text,text_clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, until, jurong, point, crazy, available, only, in, bugis, great, world, la, buffet, cine, th..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, in, wkly, comp, to, win, fa, cup, final, tkts, st, may, text, fa, to, to, receive,..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, don, think, he, goes, to, usf, he, lives, around, here, though]"


### Prep Word Vectors

In [2]:
# Generate a list of words the word2vec model learned word vectors for
w2v_model.wv.index_to_key

['you',
 'to',
 'the',
 'and',
 'in',
 'is',
 'me',
 'it',
 'my',
 'for',
 'your',
 'of',
 'call',
 'that',
 'have',
 'on',
 'now',
 'are',
 'can',
 'but',
 'not',
 'so',
 'or',
 'do',
 'at',
 'we',
 'get',
 'with',
 'be',
 'ur',
 'no',
 'if',
 'will',
 'just',
 'this',
 'how',
 'gt',
 'lt',
 'when',
 'from',
 'ok',
 'up',
 'free',
 'what',
 'go',
 'all',
 'll',
 'out',
 'know',
 'am',
 'day',
 'like',
 'good',
 'then',
 'he',
 'there',
 'was',
 'got',
 'its',
 'time',
 'come',
 'love',
 'only',
 'send',
 'want',
 'text',
 'txt',
 'one',
 'as',
 'about',
 'by',
 'home',
 'going',
 'sorry',
 'she',
 'today',
 'stop',
 'see',
 'our',
 'need',
 'back',
 'don',
 'lor',
 'still',
 'mobile',
 'dont',
 'pls',
 'think',
 'later',
 'reply',
 'tell',
 'been',
 'any',
 'da',
 'hi',
 'take',
 'please',
 'dear',
 'well',
 'new',
 'some',
 'week',
 'phone',
 'they',
 'her',
 'did',
 'who',
 'here',
 'much',
 'has',
 'great',
 'hey',
 'night',
 'hope',
 'an',
 'claim',
 'ì_',
 'him',
 'where',
 're',

In [7]:
len(w2v_model.wv['free'])

100

In [12]:
X_test

5109                                                                                   [oh, you, got, many]
4929    [hi, the, sexychat, girls, are, waiting, for, you, to, text, them, text, now, for, great, night,...
4719              [forgot, to, tell, ì_, smth, can, ì_, like, number, the, sections, so, that, it, clearer]
1333                                                                 [oh, icic, lor, den, meet, other, day]
4693    [pls, give, her, the, food, preferably, pap, very, slowly, with, loads, of, sugar, you, can, tak...
                                                       ...                                                 
939     [better, made, up, for, friday, and, stuffed, myself, like, pig, yesterday, now, feel, bleh, but...
4140    [beautiful, truth, expression, of, the, face, could, be, seen, by, everyone, but, the, depressio...
5150    [happy, new, year, to, and, ur, family, may, this, new, year, bring, happiness, stability, and, ...
4811                        

In [24]:
max_len = max(len(ls) for ls in X_test) 
max_len

w2v_vect = [np.array([w2v_model.wv[i] if i in w2v_model.wv.index_to_key else np.zeros(100) for i in ls] + [np.zeros(100)] * (max_len - len(ls))) for ls in X_test]

In [33]:
# w2v_vect = np.vstack([np.array([w2v_model.wv[i] for i in ls if i in w2v_model.wv.index_to_key] or [np.zeros(100)]) for ls in X_test])
# w2v_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in w2v_model.wv.index_to_key]) for ls in X_test])
w2v_vect = [np.array([w2v_model.wv[i] for i in ls if i in w2v_model.wv.index_to_key]) for ls in X_test]
w2v_vect[0:2]

[array([[-0.19997314,  0.29739287,  0.02854843,  0.04640778,  0.10774346,
         -0.5791091 ,  0.26312023,  0.7571186 , -0.29510304, -0.348881  ,
         -0.19531594, -0.6796173 , -0.02252132,  0.16435447,  0.19086671,
         -0.29333663,  0.05761117, -0.45026943, -0.02443293, -0.7126424 ,
          0.26291877,  0.29872987,  0.14668882, -0.20118624, -0.07120603,
          0.03741152, -0.3581561 , -0.28194207, -0.44366527,  0.0718427 ,
          0.31125015,  0.10162631,  0.15961413, -0.44784734, -0.21245445,
          0.43474838, -0.08147884, -0.25366697, -0.2514893 , -0.67671657,
          0.12074463, -0.4324356 , -0.27451602,  0.04734866,  0.35193917,
         -0.0981661 , -0.22302508, -0.07424755,  0.16763717,  0.22930141,
          0.04732389, -0.32749227, -0.01273228, -0.09027507, -0.06575549,
          0.05605656,  0.24350779, -0.10795879, -0.4206674 ,  0.21648133,
          0.11331315,  0.078687  ,  0.07727103, -0.11739239, -0.38242498,
          0.40158165,  0.07309917,  0.

In [28]:
# Why is the length of the sentence different than the length of the sentence vector?
for i, v in enumerate(w2v_vect):
    print(len(X_test.iloc[i]), len(v))


4 4
23 22
15 13
7 6
30 28
13 12
6 6
5 5
15 14
9 7
25 20
25 22
4 4
7 7
19 19
35 35
4 3
16 13
8 8
15 9
18 16
24 21
7 7
16 15
20 20
22 20
15 13
28 28
7 4
20 19
7 5
19 16
27 25
6 6
16 12
14 13
4 4
9 9
25 25
9 6
7 7
5 5
8 7
8 8
19 12
18 17
7 6
6 6
25 25
27 26
13 13
21 19
14 12
5 4
16 13
19 19
5 2
15 14
10 9
7 7
32 31
5 5
6 5
28 24
6 6
18 14
10 10
6 5
29 29
15 15
29 28
4 1
28 21
2 2
2 0
15 14
18 16
10 7
19 17
8 8
25 19
24 23
7 2
25 24
28 26
4 4
13 13
9 9
5 5
6 6
25 23
7 6
5 5
10 9
16 13
20 20
21 18
30 28
13 13
14 13
9 9
16 15
5 5
8 8
20 19
8 8
9 7
29 26
7 7
9 8
7 7
11 9
20 19
13 12
21 21
17 9
4 4
7 6
58 53
42 38
6 4
25 24
21 18
9 9
2 2
9 9
8 7
9 8
6 6
11 11
12 11
17 16
7 7
6 3
13 13
7 6
2 2
5 5
22 21
18 15
14 11
6 6
6 5
9 9
5 4
7 7
17 17
16 16
7 7
14 11
7 5
5 4
11 11
15 14
21 14
7 5
7 7
12 11
9 8
5 5
4 4
4 4
18 17
17 16
24 18
27 24
9 9
24 24
49 43
21 19
18 18
6 6
5 5
9 8
10 10
6 5
7 6
23 23
4 4
6 6
8 3
8 8
9 9
21 21
16 14
4 4
4 4
28 27
10 9
10 8
14 12
9 9
5 3
14 14
9 9
7 5
37 34
28 24
17 14


In [37]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
w2v_vect_avg = []

for vect in w2v_vect:
    if len(vect)!=0:
        w2v_vect_avg.append(vect.mean(axis=0))
    else:
        w2v_vect_avg.append(np.zeros(100))

In [38]:
# Are our sentence vector lengths consistent?
for i, v in enumerate(w2v_vect_avg):
    print(len(X_test.iloc[i]), len(v))

4 100
23 100
15 100
7 100
30 100
13 100
6 100
5 100
15 100
9 100
25 100
25 100
4 100
7 100
19 100
35 100
4 100
16 100
8 100
15 100
18 100
24 100
7 100
16 100
20 100
22 100
15 100
28 100
7 100
20 100
7 100
19 100
27 100
6 100
16 100
14 100
4 100
9 100
25 100
9 100
7 100
5 100
8 100
8 100
19 100
18 100
7 100
6 100
25 100
27 100
13 100
21 100
14 100
5 100
16 100
19 100
5 100
15 100
10 100
7 100
32 100
5 100
6 100
28 100
6 100
18 100
10 100
6 100
29 100
15 100
29 100
4 100
28 100
2 100
2 100
15 100
18 100
10 100
19 100
8 100
25 100
24 100
7 100
25 100
28 100
4 100
13 100
9 100
5 100
6 100
25 100
7 100
5 100
10 100
16 100
20 100
21 100
30 100
13 100
14 100
9 100
16 100
5 100
8 100
20 100
8 100
9 100
29 100
7 100
9 100
7 100
11 100
20 100
13 100
21 100
17 100
4 100
7 100
58 100
42 100
6 100
25 100
21 100
9 100
2 100
9 100
8 100
9 100
6 100
11 100
12 100
17 100
7 100
6 100
13 100
7 100
2 100
5 100
22 100
18 100
14 100
6 100
6 100
9 100
5 100
7 100
17 100
16 100
7 100
14 100
7 100
5 100
11 100