## Train word2vec model


In [1]:
!pip install pip install gensim==4.1.1

Collecting install
  Using cached install-1.3.5-py3-none-any.whl (3.2 kB)
Collecting gensim==4.1.1
  Downloading gensim-4.1.1-cp36-cp36m-win_amd64.whl (24.0 MB)
Collecting Cython==0.29.23
  Downloading Cython-0.29.23-cp36-cp36m-win_amd64.whl (1.6 MB)
Collecting smart-open>=1.8.1
  Using cached smart_open-6.3.0-py3-none-any.whl (56 kB)
Installing collected packages: install, Cython, smart-open, gensim
Successfully installed Cython-0.29.23 gensim-4.1.1 install-1.3.5 smart-open-6.3.0


In [5]:
!pip install numpy



In [3]:
!pip install pandas

Collecting pandas
  Downloading pandas-1.1.5-cp36-cp36m-win_amd64.whl (8.7 MB)
Collecting pytz>=2017.2
  Using cached pytz-2022.7.1-py2.py3-none-any.whl (499 kB)
Installing collected packages: pytz, pandas
Successfully installed pandas-1.1.5 pytz-2022.7.1


In [4]:
! pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-0.24.2-cp36-cp36m-win_amd64.whl (6.8 MB)
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting joblib>=0.11
  Downloading joblib-1.1.1-py2.py3-none-any.whl (309 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.1.1 scikit-learn-0.24.2 threadpoolctl-3.1.0


In [6]:
# Read in the data and clean up column names
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Show up to 100 characters of each text cell when a frame is displayed
pd.options.display.max_colwidth = 100

# Load the SMS data set, drop the three unnamed columns and give the two
# remaining columns descriptive names
unused_columns = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
messages = pd.read_csv('spam.csv', encoding='latin-1')
messages = messages.drop(columns=unused_columns)
messages.columns = ["label", "text"]
messages.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [7]:
# Full, untruncated text of the first message
messages.loc[0, "text"]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [8]:
# Tokenise every message with gensim's built-in preprocessor and keep the token
# lists in a new column (in the recorded output the tokens are lower-cased and
# digits / one-letter words are gone)
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)
messages.head(10)

Unnamed: 0,label,text,text_clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, until, jurong, point, crazy, available, only, in, bugis, great, world, la, buffet, cine, th..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, in, wkly, comp, to, win, fa, cup, final, tkts, st, may, text, fa, to, to, receive,..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, don, think, he, goes, to, usf, he, lives, around, here, though]"
5,spam,FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for ...,"[freemsg, hey, there, darling, it, been, week, now, and, no, word, back, like, some, fun, you, u..."
6,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]"
7,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,"[as, per, your, request, melle, melle, oru, minnaminunginte, nurungu, vettam, has, been, set, as..."
8,spam,WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To ...,"[winner, as, valued, network, customer, you, have, been, selected, to, receivea, prize, reward, ..."
9,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with came...,"[had, your, mobile, months, or, more, entitled, to, update, to, the, latest, colour, mobiles, wi..."


In [11]:
# Put the raw text of message 5 next to its tokenised form
sample_row = messages.loc[5]
print(sample_row['text'])
sample_row['text_clean']


FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv


['freemsg',
 'hey',
 'there',
 'darling',
 'it',
 'been',
 'week',
 'now',
 'and',
 'no',
 'word',
 'back',
 'like',
 'some',
 'fun',
 'you',
 'up',
 'for',
 'it',
 'still',
 'tb',
 'ok',
 'xxx',
 'std',
 'chgs',
 'to',
 'send',
 'to',
 'rcv']

In [12]:
# Encode the string labels as integers (ham -> 1, spam -> 0).
# Series.map turns every value that is missing from the mapping into NaN, so
# running this cell a second time would wipe the already-encoded labels. Skip
# the conversion when the column is already encoded, and fail loudly on labels
# the mapping does not know instead of silently producing NaN targets.
label_codes = {'ham': 1, 'spam': 0}
if not messages['label'].isin(list(label_codes.values())).all():
    encoded_labels = messages['label'].map(label_codes)
    if encoded_labels.isna().any():
        unknown = sorted(set(messages['label'][encoded_labels.isna()].astype(str)))
        raise ValueError('Unexpected label values: {}'.format(unknown))
    messages['label'] = encoded_labels

In [13]:
# Split data into train and test sets (80% / 20%).
# A fixed random_state makes the split, and everything trained on it,
# repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'],
                                                    test_size=0.2,
                                                    random_state=42)

In [14]:
# Train the word2vec model on the tokenised training messages.
# vector_size: width of every word vector; window: context size;
# min_count: words seen fewer than 2 times get no vector.
# seed plus a single worker thread remove the run-to-run variation caused by
# random initialisation and thread scheduling. (For bit-identical results across
# interpreter launches PYTHONHASHSEED must be fixed as well.)
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=2,
                                   seed=42,
                                   workers=1)

In [17]:
# index_to_key lists every word the Word2Vec model learned a vector for, i.e.
# every word that occurs at least twice (min_count=2) in the training messages.
# Show the first ten entries.
w2v_model.wv.index_to_key[:10]

['to', 'you', 'the', 'and', 'in', 'is', 'me', 'my', 'for', 'it']

In [18]:
# Ten nearest neighbours of "king" in the vector space learned by our model
# NOTE(review): the recorded scores are all ~0.994 for unrelated words, which
# suggests the vectors are barely differentiated on this small corpus - confirm.
query_word = 'king'
w2v_model.wv.most_similar(query_word, topn=10)

[('heart', 0.994536280632019),
 ('every', 0.9943959712982178),
 ('on', 0.9943388104438782),
 ('ve', 0.9943382740020752),
 ('number', 0.9943357110023499),
 ('for', 0.9943258166313171),
 ('urgent', 0.9942834973335266),
 ('orange', 0.9942804574966431),
 ('thanks', 0.9942728877067566),
 ('www', 0.9942715167999268)]

In [19]:
# Replace the words in each text message with the learned word vectors.
# Only words the model has a vector for are kept; everything else is skipped.
words = set(w2v_model.wv.index_to_key)


def sentences_to_word_vectors(sentences):
    """Turn tokenised sentences into a 1-D object array of word-vector arrays.

    Parameters
    ----------
    sentences : sized iterable of token lists
        One list of tokens per message.

    Returns
    -------
    numpy.ndarray
        Object array with one entry per sentence. Each entry is the array of
        vectors of the in-vocabulary tokens (an empty array when none of the
        tokens is known to the model).
    """
    # Sentences have different lengths, so the result is ragged. Handing such a
    # ragged list straight to np.array() emits a VisibleDeprecationWarning on
    # older NumPy and raises on newer releases; filling a pre-allocated object
    # array element by element avoids both.
    vectors = np.empty(len(sentences), dtype=object)
    for position, tokens in enumerate(sentences):
        vectors[position] = np.array([w2v_model.wv[token]
                                      for token in tokens if token in words])
    return vectors


X_train_vect = sentences_to_word_vectors(X_train)
X_test_vect = sentences_to_word_vectors(X_test)

  """
  import sys


In [21]:
# Token count vs. number of word vectors for the first 12 training messages.
# The second number can be smaller: tokens that are missing from the model's
# vocabulary were skipped when the vectors were built.
for tokens, token_vectors in zip(X_train.iloc[:12], X_train_vect[:12]):
    print(len(tokens), len(token_vectors))

9 9
20 20
7 7
20 19
5 5
14 13
15 15
15 13
35 35
13 13
10 10
21 20


In [22]:
def average_word_vectors(sentence_vectors, vector_size):
    """Collapse each sentence's word vectors into one fixed-length vector.

    Parameters
    ----------
    sentence_vectors : iterable of numpy.ndarray
        One array of word vectors per sentence; may be empty for a sentence
        whose words the model never learned.
    vector_size : int
        Length of the zero vector used for sentences without any word vector.

    Returns
    -------
    list of numpy.ndarray
        One vector per sentence: the element-wise mean of its word vectors, or
        zeros when there is nothing to average.
    """
    averaged = []
    for vectors in sentence_vectors:
        if vectors.size:
            averaged.append(vectors.mean(axis=0))
        else:
            # Nothing to average - fall back to an all-zero vector
            averaged.append(np.zeros(vector_size, dtype=float))
    return averaged


# Read the vector width from the trained model instead of repeating the literal
# 100, so the fallback vector always matches the model's vector_size.
embedding_size = w2v_model.wv.vector_size
X_train_vect_avg = average_word_vectors(X_train_vect, embedding_size)
X_test_vect_avg = average_word_vectors(X_test_vect, embedding_size)

In [24]:
# Token count vs. averaged-vector length for the first 12 training messages:
# the vector length should be the same whatever the sentence length
for tokens, sentence_vector in zip(X_train.iloc[:12], X_train_vect_avg[:12]):
    print(len(tokens), len(sentence_vector))

9 100
20 100
7 100
20 100
5 100
14 100
15 100
15 100
35 100
13 100
10 100
21 100


## Fit RandomForestClassifier On Top Of Word Vectors

In [25]:
# Instantiate and fit a basic Random Forest model on top of the averaged sentence vectors
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the fitted forest, and therefore the reported
# scores, repeatable from run to run.
rf = RandomForestClassifier(random_state=42)
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [26]:
# Predict a label for every averaged sentence vector of the hold-out set
y_pred = rf_model.predict(X=X_test_vect_avg)

In [27]:
# Score the hold-out predictions: precision, recall and plain accuracy
from sklearn.metrics import precision_score, recall_score

# NOTE(review): precision_score / recall_score treat label 1 as the positive
# class by default, and this notebook encodes ham as 1 - so these two figures
# describe ham detection, not spam detection. Confirm that is intended.
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = (y_pred == y_test).sum() / len(y_pred)
rounded_scores = [round(score, 3) for score in (precision, recall, accuracy)]
print('Precision: {} / Recall: {} / Accuracy: {}'.format(*rounded_scores))

Precision: 0.967 / Recall: 0.99 / Accuracy: 0.962


## Inference

In [20]:
# Tokenise one raw message with the same preprocessor used for the training text
raw_message = "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat"
sentence = gensim.utils.simple_preprocess(raw_message)


In [28]:
# Look up the learned vector of every in-vocabulary token of the sentence and
# wrap the result in a batch of one, mirroring the layout of the training vectors
words = set(w2v_model.wv.index_to_key)
known_word_vectors = [w2v_model.wv[token] for token in sentence if token in words]
test_vect = np.array([np.array(known_word_vectors)])

In [30]:
# Average the word vectors of each sentence; a sentence without any known word
# gets a vector of 100 zeros instead
test_vect_avg = [
    v.mean(axis=0) if v.size else np.zeros(100, dtype=float)
    for v in test_vect
]

In [33]:
# Classify the single averaged sentence vector.
# NOTE(review): this rebinds y_pred, replacing the hold-out predictions computed
# earlier - re-running the evaluation cell afterwards would compare against this
# one-element result.
y_pred = rf_model.predict(X=test_vect_avg)

In [35]:
# Predicted class of the single message (the notebook's label mapping encodes ham as 1, spam as 0)
y_pred

array([1])