# Ham or Spam

## (0) The NLTK library (Natural Language Toolkit)

In [1]:
# !pip install nltk

In [2]:
import nltk

# Fetch the NLTK data packages used further down in this notebook.
nltk.download('stopwords', quiet=True)  # word lists read through nltk.corpus.stopwords
nltk.download('punkt', quiet=True)      # For nltk<3.9.0
nltk.download('punkt_tab', quiet=True)  # For nltk>=3.9.0
nltk.download('wordnet', quiet=True)    # presumably the data behind WordNetLemmatizer — confirm
nltk.download('omw-1.4', quiet=True)    # NOTE(review): looks like a companion package to wordnet — confirm it is still required

True

In [27]:
# Imports
import string
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score

In [11]:
# Load the labelled e-mail corpus (path is relative to the notebook's folder)
# and preview the first rows.
emails_csv_path = "../data/nlp/ham_spam_emails.csv"
df = pd.read_csv(emails_csv_path)
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [12]:
# Size of the loaded frame as (rows, columns).
df.shape

(5728, 2)

In [13]:
# Class balance: relative frequency of each value of the `spam` label.
df["spam"].value_counts(normalize=True)

spam
0    0.761173
1    0.238827
Name: proportion, dtype: float64

## (1) Cleaning the (text) dataset

### (1.1) Remove Punctuation

In [14]:
# Translation table that deletes every character listed in `string.punctuation`.
# Built once so each call is a single pass over the text instead of one
# `str.replace` pass per punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return `text` with all ASCII punctuation characters removed.

    Only the characters in `string.punctuation` are stripped; any other
    character (letters, digits, whitespace, non-ASCII punctuation such as
    curly quotes) is kept unchanged. Characters are deleted, not replaced
    by a space, so "a,b" becomes "ab".

    Parameters
    ----------
    text : str
        The raw string to clean.

    Returns
    -------
    str
        A new string without punctuation characters.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Create the working column from the raw text; the following cleaning steps
# keep overwriting this same column.
df["clean_text"] = df["text"].apply(remove_punctuation)
df.head()

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,Subject naturally irresistible your corporate ...
1,Subject: the stock trading gunslinger fanny i...,1,Subject the stock trading gunslinger fanny is...
2,Subject: unbelievable new homes made easy im ...,1,Subject unbelievable new homes made easy im w...
3,Subject: 4 color printing special request add...,1,Subject 4 color printing special request addi...
4,"Subject: do not have money , get software cds ...",1,Subject do not have money get software cds fr...


### (1.2) Lowercase

In [16]:
def lowercase(text):
    """Return a lower-cased copy of the string `text`."""
    return text.lower()

# Lower-case the working column and preview the result.
df["clean_text"] = df["clean_text"].apply(lowercase)
df.head()

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,subject naturally irresistible your corporate ...
1,Subject: the stock trading gunslinger fanny i...,1,subject the stock trading gunslinger fanny is...
2,Subject: unbelievable new homes made easy im ...,1,subject unbelievable new homes made easy im w...
3,Subject: 4 color printing special request add...,1,subject 4 color printing special request addi...
4,"Subject: do not have money , get software cds ...",1,subject do not have money get software cds fr...


### (1.3) Remove Numbers

In [18]:
def remove_numbers(text):
    """Return `text` with every digit character dropped.

    A character counts as a digit when `str.isdigit()` is true for it.
    Digits are deleted rather than replaced, so the surrounding
    characters (including spaces) are left exactly as they were.
    """
    kept = []
    for symbol in text:
        if symbol.isdigit():
            continue
        kept.append(symbol)
    return ''.join(kept)

# Strip digits from the working column and preview the result.
df["clean_text"] = df["clean_text"].apply(remove_numbers)
df.head()

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,subject naturally irresistible your corporate ...
1,Subject: the stock trading gunslinger fanny i...,1,subject the stock trading gunslinger fanny is...
2,Subject: unbelievable new homes made easy im ...,1,subject unbelievable new homes made easy im w...
3,Subject: 4 color printing special request add...,1,subject color printing special request addit...
4,"Subject: do not have money , get software cds ...",1,subject do not have money get software cds fr...


### (1.4) Remove Stopwords

In [20]:
# English stop words, held in a set so membership checks are fast.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Tokenize `text` and drop every token found in `stop_words`.

    Parameters
    ----------
    text : str
        The string to tokenize with `word_tokenize`.

    Returns
    -------
    list
        The remaining tokens, in their original order. Note that this is
        a list of tokens, not a re-joined string.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token not in stop_words:
            kept_tokens.append(token)
    return kept_tokens

# Replace each cleaned string by its list of non-stop-word tokens.
# After this cell `clean_text` holds token lists rather than strings, so this
# cell is likely not safe to re-run without re-running the earlier cleaning cells.
df["clean_text"] = df["clean_text"].apply(remove_stopwords)
df.head()

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,"[subject, naturally, irresistible, corporate, ..."
1,Subject: the stock trading gunslinger fanny i...,1,"[subject, stock, trading, gunslinger, fanny, m..."
2,Subject: unbelievable new homes made easy im ...,1,"[subject, unbelievable, new, homes, made, easy..."
3,Subject: 4 color printing special request add...,1,"[subject, color, printing, special, request, a..."
4,"Subject: do not have money , get software cds ...",1,"[subject, money, get, software, cds, software,..."


### (1.5) Lemmatize

In [22]:
def lemma(text):
    """Lemmatize each token and join the results into one string.

    Parameters
    ----------
    text : iterable of str
        Despite the name, this is a sequence of word tokens (in this
        notebook, the token lists produced by `remove_stopwords`), not a
        single string.

    Returns
    -------
    str
        The lemmatized tokens separated by single spaces.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    return " ".join(map(wordnet_lemmatizer.lemmatize, text))

# Lemmatize the token lists and turn each row back into a single
# space-separated string.
df["clean_text"] = df["clean_text"].apply(lemma)
df.head()

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,subject naturally irresistible corporate ident...
1,Subject: the stock trading gunslinger fanny i...,1,subject stock trading gunslinger fanny merrill...
2,Subject: unbelievable new homes made easy im ...,1,subject unbelievable new home made easy im wan...
3,Subject: 4 color printing special request add...,1,subject color printing special request additio...
4,"Subject: do not have money , get software cds ...",1,subject money get software cd software compati...


# (2) Bag-of-words Modeling

## (2.1) Converting the text data into numerical features

In [24]:
# Fit the vectorizer on the cleaned e-mails and build the bag-of-words
# matrix from them in one step.
count_vectorizer = CountVectorizer()
X_bow = count_vectorizer.fit_transform(raw_documents=df["clean_text"])

## (2.2) Multinomial Naive Bayes Modeling

In [32]:
# 10-fold cross-validation of a Multinomial Naive Bayes classifier on the
# bag-of-words matrix; the cell displays the mean accuracy over the folds.
# NOTE(review): X_bow comes from a CountVectorizer fitted on the whole corpus,
# so each fold's vocabulary also covers its held-out e-mails. Consider fitting
# the vectorizer inside each fold (e.g. with a scikit-learn Pipeline).
cv_results = cross_validate(
    estimator=MultinomialNB(),
    X=X_bow,
    y=df["spam"],
    scoring=["accuracy"],
    cv=10,
)
cv_results["test_accuracy"].mean()

np.float64(0.9900502202858223)