In [19]:
import numpy as np
import pandas as pd
import nltk

In [3]:
df = pd.read_csv('./spam.csv',encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df = df[['v1','v2']]

In [5]:
df = df.rename(columns={'v1':'label','v2':'text'})

In [6]:
df.shape

(5572, 2)

In [8]:
df.isnull().sum()

label    0
text     0
dtype: int64

In [7]:
df.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

##### Remove Punctuations

In [9]:
import string

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
def remove_punct(text):
    text = ''.join([char for char in text if char not in string.punctuation])
    return text

In [14]:
df['test_clean'] = df.text.apply(lambda x: remove_punct(x))
df.head()

Unnamed: 0,label,text,test_clean
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...


##### Tokenize

In [15]:
import re

def tokenize(x):
    tokens = re.split('\W',x)
    return tokens

In [16]:
df['text_tokens'] = df['test_clean'].apply(lambda x: tokenize(x.lower()))

In [17]:
df.head()

Unnamed: 0,label,text,test_clean,text_tokens
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l..."


##### Remove Stopwords

In [27]:
import requests
stopwords_list = requests.get("https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt").content
stopwords = set(stopwords_list.decode().splitlines()) 

In [30]:
def remove_stopwords(text_tokens):
    text = [word for word in text_tokens if word not in stopwords]
    return text

In [31]:
df['without_stopwords'] = df['text_tokens'].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,label,text,test_clean,text_tokens,without_stopwords
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o...","[jurong, point, crazy, bugis, great, buffet, c..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, cup, final, ..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[dun, early, hor]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, usf, lives]"


##### Vectorize

In [32]:
def clean_text(text):
    text = "".join([char.lower() for char in text if char not in string.punctuation ])
    tokens = re.split("\W",text)
    no_stopwords = [word for word in tokens if word not in stopwords]
    return no_stopwords

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer = clean_text)
X_tfidf = tfidf_vect.fit_transform(df['text'])

In [34]:
X_features = pd.DataFrame(X_tfidf.toarray())
X_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8852,8853,8854,8855,8856,8857,8858,8859,8860,8861
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##### ML Model

In [35]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X_features,df['label'],test_size=0.3,random_state=0)

In [36]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train,y_train)
y_pred = rf_model.predict(X_test)

In [38]:
from sklearn.metrics import accuracy_score,precision_score,recall_score

accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred,pos_label='spam')
recall = recall_score(y_test,y_pred,pos_label='spam')

print('accuracy:',round(accuracy,3))
print('precision:',round(precision,3))
print('recall:',round(recall,3))

accuracy: 0.967
precision: 1.0
recall: 0.769
