# Ham or Spam?

In [1]:
# when installing nltk for the first time we need to also download a few built in libraries
!pip install nltk
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')



[nltk_data] Downloading package stopwords to
[nltk_data]     /home/useradd/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/useradd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/useradd/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import pandas as pd

# Load the e-mail dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("emails.csv")

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


The dataset is made up of emails that are classified as ham (`0`) or spam (`1`). You need to clean the dataset before training a prediction model.

## Remove Punctuation

👇 Create a function to remove the punctuation. Apply it to the entire dataset and add the output as a new column in the dataframe called `clean_text`.

In [3]:
import string

def remove_punctuation(text):
    """Return `text` with every character listed in `string.punctuation` deleted.

    All other characters (letters, digits, whitespace) are left untouched.
    """
    # Translation table that maps each punctuation character to "delete".
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)

# Store the punctuation-free version of every e-mail in a new `clean_text` column.
df['clean_text'] = df['text'].apply(remove_punctuation)
df.head()

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,Subject naturally irresistible your corporate ...
1,Subject: the stock trading gunslinger fanny i...,1,Subject the stock trading gunslinger fanny is...
2,Subject: unbelievable new homes made easy im ...,1,Subject unbelievable new homes made easy im w...
3,Subject: 4 color printing special request add...,1,Subject 4 color printing special request addi...
4,"Subject: do not have money , get software cds ...",1,Subject do not have money get software cds fr...


## Lower Case

👇 Create a function to lower case the text. Apply it to `clean_text`.

In [4]:
def lower_text(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

# Overwrite `clean_text` with its lower-cased version.
df['clean_text'] = df['clean_text'].apply(lower_text)

df.head()

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,subject naturally irresistible your corporate ...
1,Subject: the stock trading gunslinger fanny i...,1,subject the stock trading gunslinger fanny is...
2,Subject: unbelievable new homes made easy im ...,1,subject unbelievable new homes made easy im w...
3,Subject: 4 color printing special request add...,1,subject 4 color printing special request addi...
4,"Subject: do not have money , get software cds ...",1,subject do not have money get software cds fr...


## Remove Numbers

👇 Create a function to remove numbers from the text. Apply it to `clean_text`.

In [5]:
def remove_numbers(text):
    """Return `text` with every digit character removed.

    The string is scanned character by character, so digits embedded in a
    token are dropped too (e.g. "mp3" becomes "mp"); surrounding whitespace
    is left as it was.
    """
    kept_chars = []
    for char in text:
        if char.isdigit():
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)

# Overwrite `clean_text` with its digit-free version.
df['clean_text'] = df['clean_text'].apply(remove_numbers)

df.head()

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,subject naturally irresistible your corporate ...
1,Subject: the stock trading gunslinger fanny i...,1,subject the stock trading gunslinger fanny is...
2,Subject: unbelievable new homes made easy im ...,1,subject unbelievable new homes made easy im w...
3,Subject: 4 color printing special request add...,1,subject color printing special request addit...
4,"Subject: do not have money , get software cds ...",1,subject do not have money get software cds fr...


## Remove StopWords

👇 Create a function to remove stopwords from the text. Apply it to `clean_text`.

In [6]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

# NLTK's English stopword list, held in a set for membership tests.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Tokenize `text` and return the list of tokens that are not in `stop_words`.

    Note that the result is a list of tokens, not a string.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token not in stop_words:
            kept_tokens.append(token)
    return kept_tokens

# Replace each cleaned string with its list of non-stopword tokens.
df['clean_text'] = df['clean_text'].apply(remove_stopwords)

df

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,"[subject, naturally, irresistible, corporate, ..."
1,Subject: the stock trading gunslinger fanny i...,1,"[subject, stock, trading, gunslinger, fanny, m..."
2,Subject: unbelievable new homes made easy im ...,1,"[subject, unbelievable, new, homes, made, easy..."
3,Subject: 4 color printing special request add...,1,"[subject, color, printing, special, request, a..."
4,"Subject: do not have money , get software cds ...",1,"[subject, money, get, software, cds, software,..."
...,...,...,...
5723,Subject: re : research and development charges...,0,"[subject, research, development, charges, gpg,..."
5724,"Subject: re : receipts from visit jim , than...",0,"[subject, receipts, visit, jim, thanks, invita..."
5725,Subject: re : enron case study update wow ! a...,0,"[subject, enron, case, study, update, wow, day..."
5726,"Subject: re : interest david , please , call...",0,"[subject, interest, david, please, call, shirl..."


## Lemmatize

👇 Create a function to lemmatize the text. Make sure the output is a single string, not a list of words. Apply it to `clean_text`.

In [7]:
from nltk.stem import WordNetLemmatizer

# Single lemmatizer instance shared by every call to `lemmatize_text`.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Return a list holding the lemmatized form of each item in `text`.

    `text` is iterated item by item, so it is expected to be a sequence of
    word tokens rather than a raw string.
    """
    return list(map(lemmatizer.lemmatize, text))

def list_to_string(list):
    """Join the strings in `list` into one string separated by single spaces."""
    # NOTE: the parameter name shadows the built-in `list` inside this
    # function; it is kept as-is so existing callers are unaffected.
    separator = ' '
    return separator.join(list)

# Lemmatize each token list, then join the tokens back into a single string
# per e-mail so `clean_text` ends up holding plain text again.
df['clean_text'] = (
    df['clean_text']
    .apply(lemmatize_text)
    .apply(list_to_string)
)

df.head()

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,subject naturally irresistible corporate ident...
1,Subject: the stock trading gunslinger fanny i...,1,subject stock trading gunslinger fanny merrill...
2,Subject: unbelievable new homes made easy im ...,1,subject unbelievable new home made easy im wan...
3,Subject: 4 color printing special request add...,1,subject color printing special request additio...
4,"Subject: do not have money , get software cds ...",1,subject money get software cd software compati...


## Bag-of-words Modelling

👇 Vectorize the `clean_text` to a Bag-of-Words representation with a default CountVectorizer. Save as `X_bow`.

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer with all default settings, as the exercise asks.
vectorizer = CountVectorizer()

# Fit the vectorizer on the cleaned e-mails and transform them in one step.
X = vectorizer.fit_transform(df.clean_text)

# Display the result as a dense array (one row per e-mail).
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [9]:
# Column labels for the bag-of-words frame: one per vocabulary term.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `get_feature_names_out()`; fall back to the old method only
# when running on a release that predates the new one.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense bag-of-words table: one row per e-mail, one column per term.
X_bow = pd.DataFrame(X.toarray(), columns=feature_names)

X_bow



Unnamed: 0,aa,aaa,aaaenerfax,aadedeji,aagrawal,aal,aaldous,aaliyah,aall,aanalysis,...,zwzm,zxghlajf,zyban,zyc,zygoma,zymg,zzmacmac,zzn,zzncacst,zzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5723,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5724,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5725,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5726,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


👇 Cross-validate a MultinomialNB model with the Bag-of-words. Score the model's accuracy.

In [11]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

# Target labels: 1 = spam, 0 = ham.
y = df['spam']

# Fit on the full dataset so `model` itself is left trained after this cell.
# This fit does not influence the score below: `cross_val_score` works on
# fresh clones of the estimator, refitted on each training fold.
model.fit(X_bow, y)

# Mean accuracy over 5 folds. `cv` and `scoring` are spelled out so the
# reported number does not depend on the installed scikit-learn's defaults.
cross_val_score(model, X_bow, y, cv=5, scoring='accuracy').mean()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.9895252901681946

⚠️ Please push the exercise once you are done 🙃

## 🏁