### Read in the data and clean up column names

In [183]:
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
pd.set_option('display.max_colwidth',100)

messages = pd.read_csv('spam.csv', encoding='latin-1')
messages = messages.drop(labels=["Unnamed: 2","Unnamed: 3","Unnamed: 4"], axis = 1)
messages.columns = ["label","text"]
messages.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [184]:
print(messages.groupby('label').count())
drop_random=np.random.choice(messages[messages['label']=='ham'].index,size=4000,replace=False)
messages=messages.drop(drop_random)


       text
label      
ham    4825
spam    747


In [185]:
messages.groupby('label').count()


Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
ham,825
spam,747


### Clean data using the built in cleaner in gensim

In [186]:
#messages['text_clean'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x))
messages['text_list'] = messages['text'].apply(lambda x: x.split(" "))
messages.head()

Unnamed: 0,label,text,text_list
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[Free, entry, in, 2, a, wkly, comp, to, win, FA, Cup, final, tkts, 21st, May, 2005., Text, FA, t..."
5,spam,FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for ...,"[FreeMsg, Hey, there, darling, it's, been, 3, week's, now, and, no, word, back!, I'd, like, some..."
8,spam,WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To ...,"[WINNER!!, As, a, valued, network, customer, you, have, been, selected, to, receivea, å£900, pri..."
9,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with came...,"[Had, your, mobile, 11, months, or, more?, U, R, entitled, to, Update, to, the, latest, colour, ..."
11,spam,"SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, ...","[SIX, chances, to, win, CASH!, From, 100, to, 20,000, pounds, txt>, CSH11, and, send, to, 87575...."


### Encoding the label column

In [187]:
messages['label'] = messages['label'].map({'ham':1,'spam':0})


### Split data into train and test sets

In [188]:
# X_train, X_test, y_train,y_test = train_test_split(messages['text_clean'],messages['label'],test_size=0.2)
X_train, X_test, y_train,y_test = train_test_split(messages['text_list'],messages['label'],test_size=0.2)


### Train the word2vec model

In [189]:
w2v_model = gensim.models.Word2Vec(X_train , vector_size = 100 , window = 5 , min_count = 2)

##### Represents all of the words that our Word2Vec model learned a vector for. Or put another way, it's all of the words that appeared in the training data at least twice. So you can exp

In [190]:
w2v_model.wv.index_to_key[90:99]

['&lt;#&gt;', 'Text', 'come', 'This', 'n', 'Please', 'then', 'awarded', 'they']

### Find most similar words to "King" based on word vectors from our trained model

In [191]:
words = set(w2v_model.wv.index_to_key )
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train],dtype=object)
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test],dtype=object)

In [192]:
len(w2v_model.wv.index_to_key)

2457

### Generate aggregated sentence vectors based on the word vectors for each word in the sentence

In [193]:

# from keras_preprocessing.sequence import pad_sequences

# max_len = 100 # set the maximum length of the sequences
# words = set(w2v_model.wv.index_to_key )
# X_train_vect = pad_sequences([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_train], maxlen=max_len, dtype='float32', padding='post', truncating='post', value=0.0)
# X_test_vect = pad_sequences([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_test], maxlen=max_len, dtype='float32', padding='post', truncating='post', value=0.0)

#### Why is the length of the sentence different than the length of the sentence vector?

In [194]:
for i,v in enumerate(X_train_vect):
    print(len(X_train.iloc[i]),len(v))
    if i>10:
        break

24 24
23 18
9 9
28 25
28 21
10 8
24 24
27 24
30 20
10 9
7 6
4 3


### Compute sentence vectors by avergaing the word vectors for the words contained in the sentence

In [195]:

X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

#### Are our sentence vector lengths consistent?

In [196]:
for i,v in enumerate(X_train_vect_avg):
    print(len(X_train.iloc[i]),len(v))
    if i > 10:
        break

24 100
23 100
9 100
28 100
28 100
10 100
24 100
27 100
30 100
10 100
7 100
4 100


#### Fit RandomForestClassifier On Top Of Word Vectors

In [197]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

### Prediction 

In [198]:
y_pred = rf_model.predict(X_test_vect_avg)

In [199]:
from sklearn.metrics import precision_score, recall_score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Precision: 0.898 / Recall: 0.863 / Accuracy: 0.863


In [200]:
svc = SVC()
svc_model = rf.fit(X_train_vect_avg,y_train.values.ravel())

In [201]:
y_pred = svc_model.predict(X_test_vect_avg)

In [202]:
from sklearn.metrics import precision_score, recall_score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Precision: 0.888 / Recall: 0.869 / Accuracy: 0.86


In [203]:
sentence_test = "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv"
#sentence_test = "Nah I don't think he goes to usf, he lives around here though"
sentence_test =  gensim.utils.simple_preprocess(sentence_test)
words = set(w2v_model.wv.index_to_key )
sentence_test_vect = np.array([np.array([w2v_model.wv[i] for i in sentence_test if i in words])])

sentence_test_vect_avg = []
for v in sentence_test_vect:
    if v.size:
        sentence_test_vect_avg.append(v.mean(axis=0))
    else:
        sentence_test_vect_avg.append(np.zeros(100, dtype=float))
rf.predict(sentence_test_vect_avg)

array([0], dtype=int64)

In [204]:
from sklearn.metrics import confusion_matrix

conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

[[112  20]
 [ 24 159]]
