In [1]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.utils import simple_preprocess
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
dataset = pd.read_csv('SMSSpamCollection.txt', sep='\t', names=['label', 'message'])
lemmatizer = WordNetLemmatizer()

In [17]:
dataset.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
corpus = []
for i in range(len(dataset)):
    review = re.sub('[^a-zA-Z]', ' ', dataset['message'][i])
    review = review.lower()
    review = review.split()
    review = ' '.join([lemmatizer.lemmatize(word) for word in review])
    corpus.append(review)

In [5]:
dataset = dataset[list(map(lambda x: len(x)>0, corpus))]
y = pd.get_dummies(dataset['label'], drop_first=True).values.flatten('C')
y.shape

(5569,)

In [6]:
words = []
for sent in corpus:
    sent_token = sent_tokenize(sent)
    for token in sent_token:
        words.append(simple_preprocess(token))

In [7]:
model = Word2Vec(words)

In [8]:
model.corpus_count, model.epochs

(5569, 5)

In [9]:
def avg_word2vec(doc):
    valid_words = [model.wv[word] for word in doc if word in model.wv.index_to_key]
    return np.mean(valid_words, axis=0) if valid_words else np.zeros(model.vector_size)


In [10]:
X = []
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))
X = np.array(X)

100%|███████████████████████████████████████████████████████████████████████████| 5569/5569 [00:00<00:00, 11135.18it/s]


In [11]:
df = pd.DataFrame()
for i in range(len(X)):
    df = pd.concat([df, pd.DataFrame(X[i].reshape(1,-1))], ignore_index=True)

df['output'] = y

In [12]:
df.dropna(axis=0, inplace=True)
X = df.drop('output', axis=1).values
y = df['output'].values

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(X_train, y_train)

RandomForestClassifier()

In [15]:
y_pred = classifier.predict(X_test)
accuracy_score(y_test, y_pred)

0.9748653500897666

In [16]:
np.mean(cross_val_score(RandomForestClassifier(), X, y))

0.9683969926170395