### Train Our Own Model

In [2]:
import warnings
# Install a blanket "ignore" filter: no category/module is given, so every
# warning raised after this point is suppressed for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used
# later in the notebook — narrow the filter if those need to be seen.
warnings.filterwarnings('ignore')

In [3]:
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Let pandas show up to 100 characters per cell so message text is readable.
pd.set_option('display.max_colwidth', 100)

# Load the raw CSV, drop the three 'Unnamed' columns and name the remaining two.
messages = pd.read_csv("Data/spam.csv", encoding='latin-1')
messages = messages.drop(labels=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
messages.columns = ['label', 'text']

# Tokenize every message with gensim's simple_preprocess (one token list per row).
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)

# Hold out 20% of the rows for testing. random_state pins the shuffle so that
# Restart & Run All yields the same train/test rows (and therefore the same
# vocabulary and downstream outputs) on every run.
X_train, X_test, y_train, y_test = train_test_split(
    messages['text_clean'],
    messages['label'],
    test_size=0.2,
    random_state=42,
)

# Train word2vec on the training tokens only.
# NOTE(review): `size=` is the gensim 3.x keyword; gensim 4.x renamed it to
# `vector_size` — confirm against the installed gensim version.
w2v_model = gensim.models.Word2Vec(X_train, size=100, window=5, min_count=2)

### Prep Word Vectors

In [4]:
# Generate a list of words the word2vec model learned word vectors for.
# The bare expression displays the whole vocabulary list stored on the
# model's keyed vectors.
# NOTE(review): `index2word` is the gensim 3.x attribute name; gensim 4.x
# exposes it as `index_to_key` — confirm against the installed version.
w2v_model.wv.index2word

['to',
 'you',
 'the',
 'and',
 'in',
 'is',
 'me',
 'my',
 'it',
 'your',
 'for',
 'of',
 'call',
 'have',
 'that',
 'on',
 'are',
 'now',
 'so',
 'can',
 'not',
 'but',
 'or',
 'at',
 'get',
 'do',
 'if',
 'we',
 'ur',
 'be',
 'will',
 'no',
 'with',
 'just',
 'this',
 'gt',
 'lt',
 'up',
 'what',
 'how',
 'from',
 'when',
 'go',
 'free',
 'll',
 'ok',
 'all',
 'out',
 'know',
 'was',
 'day',
 'am',
 'there',
 'good',
 'got',
 'like',
 'then',
 'he',
 'come',
 'its',
 'time',
 'love',
 'only',
 'want',
 'send',
 'text',
 'txt',
 'by',
 'going',
 'as',
 'don',
 'one',
 'home',
 'need',
 'sorry',
 'she',
 'stop',
 'lor',
 'today',
 'about',
 'see',
 'da',
 'back',
 'still',
 'our',
 'mobile',
 'take',
 'reply',
 'later',
 'dont',
 'tell',
 'think',
 'did',
 'they',
 'been',
 'some',
 'ì_',
 'please',
 'hi',
 'phone',
 'here',
 'an',
 'pls',
 'her',
 'night',
 'week',
 'new',
 'great',
 'any',
 'who',
 'where',
 'has',
 'well',
 'him',
 'claim',
 'dear',
 'msg',
 'hope',
 'much',
 're',

In [8]:
# Generate aggregated sentence vectors based on the word vectors for each word in the sentence.
# For every test message, stack the vectors of the tokens the model knows;
# tokens without a learned vector are skipped, so a message may yield fewer
# rows than it has tokens (or none at all).
# The vocabulary is copied into a set once so each membership check is a hash
# lookup instead of a linear scan of the `index2word` list per token.
w2v_vocab = set(w2v_model.wv.index2word)
w2v_vect = np.array(
    [
        np.array([w2v_model.wv[word] for word in tokens if word in w2v_vocab])
        for tokens in X_test
    ],
    # Messages keep different numbers of vectors, so the rows are ragged.
    # Newer NumPy releases refuse to build ragged arrays implicitly; asking
    # for an object array explicitly works on old and new versions alike.
    dtype=object,
)

In [9]:
# Print, for each test message, its token count next to the number of word
# vectors collected for it. The two differ whenever some tokens were skipped
# while building w2v_vect because the model has no vector for them.
for idx, word_vectors in enumerate(w2v_vect):
    tokens = X_test.iloc[idx]
    print(len(tokens), len(word_vectors))

20 17
9 8
27 25
25 19
13 10
6 6
29 26
19 19
4 4
3 1
21 21
15 15
7 5
4 4
32 27
6 6
14 13
22 22
15 15
18 15
0 0
10 9
17 16
7 7
16 13
5 5
21 20
12 11
8 8
4 4
8 8
6 6
4 1
4 4
27 26
8 7
11 11
8 8
7 6
4 3
6 6
8 8
11 9
15 13
20 14
5 4
8 8
7 6
26 23
11 9
7 7
31 31
13 10
9 9
23 20
3 2
4 4
28 25
6 6
26 26
8 8
6 6
5 5
3 3
15 14
7 4
32 30
9 4
6 6
13 10
5 5
25 25
11 11
5 4
8 8
22 20
12 10
25 24
25 22
9 9
9 7
22 20
25 24
7 7
6 6
12 12
24 17
7 7
28 25
11 8
14 12
8 8
14 14
7 7
5 5
35 34
9 9
24 22
2 2
11 11
25 25
12 10
3 2
34 32
11 11
19 19
8 7
7 5
11 11
11 10
24 20
8 5
27 25
8 8
26 26
12 12
24 20
6 6
36 33
18 16
6 6
18 14
7 7
18 16
18 16
25 23
8 8
6 6
7 6
10 7
25 24
14 14
18 18
12 10
11 9
13 11
5 3
21 19
8 8
9 8
17 15
20 20
7 5
10 10
14 10
5 5
6 5
9 9
6 6
27 25
9 8
8 8
9 8
1 1
6 6
5 2
21 19
30 30
22 17
12 11
15 15
18 16
28 28
10 9
7 7
5 5
7 5
44 41
12 11
30 30
26 24
8 7
27 27
9 8
6 4
13 11
4 4
10 10
20 20
9 9
6 6
6 6
12 9
12 9
20 18
8 7
12 9
6 6
25 19
5 5
7 7
16 11
20 18
7 7
7 5
9 9
14 14
8 6
9 9
10 9

In [10]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence.
# A message with no known words has nothing to average, so it gets an all-zero
# vector instead. The width of that fallback is read from the trained model
# rather than hard-coded, so it stays in step with the embedding size chosen
# at training time.
vector_size = w2v_model.wv.vector_size
w2v_vect_avg = [
    vect.mean(axis=0) if len(vect) != 0 else np.zeros(vector_size)
    for vect in w2v_vect
]


In [12]:
# Are our sentence vector lengths consistent?
# Print each test message's token count next to the length of its averaged
# sentence vector.
for idx, sentence_vector in enumerate(w2v_vect_avg):
    tokens = X_test.iloc[idx]
    print(len(tokens), len(sentence_vector))

20 100
9 100
27 100
25 100
13 100
6 100
29 100
19 100
4 100
3 100
21 100
15 100
7 100
4 100
32 100
6 100
14 100
22 100
15 100
18 100
0 100
10 100
17 100
7 100
16 100
5 100
21 100
12 100
8 100
4 100
8 100
6 100
4 100
4 100
27 100
8 100
11 100
8 100
7 100
4 100
6 100
8 100
11 100
15 100
20 100
5 100
8 100
7 100
26 100
11 100
7 100
31 100
13 100
9 100
23 100
3 100
4 100
28 100
6 100
26 100
8 100
6 100
5 100
3 100
15 100
7 100
32 100
9 100
6 100
13 100
5 100
25 100
11 100
5 100
8 100
22 100
12 100
25 100
25 100
9 100
9 100
22 100
25 100
7 100
6 100
12 100
24 100
7 100
28 100
11 100
14 100
8 100
14 100
7 100
5 100
35 100
9 100
24 100
2 100
11 100
25 100
12 100
3 100
34 100
11 100
19 100
8 100
7 100
11 100
11 100
24 100
8 100
27 100
8 100
26 100
12 100
24 100
6 100
36 100
18 100
6 100
18 100
7 100
18 100
18 100
25 100
8 100
6 100
7 100
10 100
25 100
14 100
18 100
12 100
11 100
13 100
5 100
21 100
8 100
9 100
17 100
20 100
7 100
10 100
14 100
5 100
6 100
9 100
6 100
27 100
9 100
8 100
9 100
1