# Ham or Spam?

In [59]:
# when installing nltk for the first time we need to also download a few built in libraries
!pip install nltk
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')



[nltk_data] Downloading package stopwords to /home/mz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [60]:
import pandas as pd
# Read the labelled e-mails; the `spam` column is 1 for spam and 0 for ham.
df = pd.read_csv("emails.csv")
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


The dataset is made up of emails that are classified as ham [0] or spam [1]. You need to clean the dataset before training a prediction model.

## Remove Punctuation

👇 Create a function to remove the punctuation. Apply it to the entire data and add the output as a new column in the dataframe called `clean_text`

In [61]:
import string 
# Display the characters that the punctuation-removal step below strips out.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [62]:
# Deletion table built once: maps every punctuation character to "removed".
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punct(x):
    """Return ``x`` with every character of ``string.punctuation`` removed.

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        The same text without punctuation; all other characters (letters,
        digits, whitespace) are kept in their original order.
    """
    # A single translate() pass replaces one str.replace() scan (and one
    # intermediate string copy) per punctuation character.
    return x.translate(_PUNCT_TABLE)

In [63]:
# Strip punctuation from every raw e-mail and keep the result in a new column.
df["clean_text"] = df["text"].apply(remove_punct)
df.head()

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,Subject naturally irresistible your corporate ...
1,Subject: the stock trading gunslinger fanny i...,1,Subject the stock trading gunslinger fanny is...
2,Subject: unbelievable new homes made easy im ...,1,Subject unbelievable new homes made easy im w...
3,Subject: 4 color printing special request add...,1,Subject 4 color printing special request addi...
4,"Subject: do not have money , get software cds ...",1,Subject do not have money get software cds fr...


## Lower Case

👇 Create a function to lower case the text. Apply it to `clean_text`

In [64]:
# Lower-case the cleaned text with pandas' vectorized string accessor instead
# of a per-row Python lambda; missing values pass through rather than raising.
df["clean_text"] = df["clean_text"].str.lower()
df.head()

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,subject naturally irresistible your corporate ...
1,Subject: the stock trading gunslinger fanny i...,1,subject the stock trading gunslinger fanny is...
2,Subject: unbelievable new homes made easy im ...,1,subject unbelievable new homes made easy im w...
3,Subject: 4 color printing special request add...,1,subject 4 color printing special request addi...
4,"Subject: do not have money , get software cds ...",1,subject do not have money get software cds fr...


## Remove Numbers

👇 Create a function to remove numbers from the text. Apply it to `clean_text`

In [65]:
def remove_numbers(string):
    """Return ``string`` with every digit character dropped.

    The text is walked character by character; anything for which
    ``str.isdigit()`` is true is skipped, everything else is kept in order.
    """
    kept_chars = []
    for char in string:
        if char.isdigit():
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)

# Drop the digits from every cleaned e-mail, overwriting the column in place.
df["clean_text"] = df["clean_text"].apply(remove_numbers)

df.head()

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,subject naturally irresistible your corporate ...
1,Subject: the stock trading gunslinger fanny i...,1,subject the stock trading gunslinger fanny is...
2,Subject: unbelievable new homes made easy im ...,1,subject unbelievable new homes made easy im w...
3,Subject: 4 color printing special request add...,1,subject color printing special request addit...
4,"Subject: do not have money , get software cds ...",1,subject do not have money get software cds fr...


## Remove StopWords

👇 Create a function to remove stopwords from the text. Apply it to `clean_text`.

In [66]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# English stop-word list, stored as a set for fast membership checks in remove_stopwords.
stop_words = set(stopwords.words('english'))

In [67]:
def remove_stopwords(text):
    """Tokenize ``text`` and return it without the tokens found in ``stop_words``.

    The surviving tokens are re-joined with single spaces, so the result is
    one string rather than a list of words.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token not in stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Filter the stop words out of every cleaned e-mail, overwriting the column in place.
df["clean_text"] = df["clean_text"].apply(remove_stopwords)

In [68]:
# Preview the first rows after stop-word removal.
df.head()

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,subject naturally irresistible corporate ident...
1,Subject: the stock trading gunslinger fanny i...,1,subject stock trading gunslinger fanny merrill...
2,Subject: unbelievable new homes made easy im ...,1,subject unbelievable new homes made easy im wa...
3,Subject: 4 color printing special request add...,1,subject color printing special request additio...
4,"Subject: do not have money , get software cds ...",1,subject money get software cds software compat...


## Lemmatize

👇 Create a function to lemmatize the text. Make sure the output is a single string, not a list of words. Apply it to `clean_text`.

In [54]:
import spacy

# Load spaCy's small English pipeline with the "parser" and "ner" components
# disabled; the lemmatizer function below only reads each token's lemma.
nlp = spacy.load('en_core_web_sm',  disable=["parser", "ner"])

def lemmatizer(text):
    """Return ``text`` with each token replaced by its lemma.

    The text is run through the loaded spaCy pipeline ``nlp`` and the lemmas
    are joined with single spaces into one string.
    """
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

# NOTE(review): the lemmas are stored in a separate "lemmatized" column, while
# the Bag-of-Words cell further down vectorizes "clean_text" - confirm which
# column the model is meant to use.
df["lemmatized"] = df["clean_text"].apply(lemmatizer)

In [70]:
# Preview the first rows, now including the lemmatized column.
df.head()

Unnamed: 0,text,spam,clean_text,lemmatized
0,Subject: naturally irresistible your corporate...,1,subject naturally irresistible corporate ident...,subject naturally irresistible corporate ident...
1,Subject: the stock trading gunslinger fanny i...,1,subject stock trading gunslinger fanny merrill...,subject stock trading gunslinger fanny merrill...
2,Subject: unbelievable new homes made easy im ...,1,subject unbelievable new homes made easy im wa...,subject unbelievable new home make easy I m wa...
3,Subject: 4 color printing special request add...,1,subject color printing special request additio...,subject color print special request additional...
4,"Subject: do not have money , get software cds ...",1,subject money get software cds software compat...,subject money get software cd software compati...


## Bag-of-words Modelling

👇 Vectorize the `clean_text` to a Bag-of-Words representation with a default CountVectorizer. Save as `X_bow`.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Default CountVectorizer, as the exercise asks for.
vectorizer = CountVectorizer()

X = df["clean_text"]

# Learn the vocabulary and build the document-term count matrix in one step.
X_bow = vectorizer.fit_transform(X)

# Display the sparse matrix summary instead of calling .toarray(): a dense
# copy holds one cell per (document, vocabulary word) pair and is not needed
# just to inspect the result.
X_bow

👇 Cross-validate a MultinomialNB model with the Bag-of-words. Score the model's accuracy.

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

y = df.spam
nb_model = MultinomialNB()

# Cross-validate instead of fitting and scoring on the same rows: accuracy
# measured on the training data overstates how well the model generalizes.
# cross_val_score fits clones on each fold, so nb_model itself stays unfitted.
cv_scores = cross_val_score(nb_model, X_bow, y, cv=5, scoring="accuracy")

# Printed explicitly so the score is shown even when more code follows in the cell.
print(f"Mean 5-fold CV accuracy: {cv_scores.mean():.3f}")

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

# Chain the vectorizer and the classifier so both are tuned together.
pipe = make_pipeline(CountVectorizer(), MultinomialNB())

# Candidate values, keyed as <step name>__<parameter>; make_pipeline names
# each step after its lower-cased class name.
parameters = {
    'countvectorizer__ngram_range': ((1, 1), (2, 2)),
    'multinomialnb__alpha': (0.1, 1),
}

# Try every combination with 5-fold cross-validated accuracy, then refit the
# best one on the full data.
grid_search = GridSearchCV(
    pipe,
    parameters,
    scoring="accuracy",
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(df.clean_text, df.spam)

grid_search.best_params_

⚠️ Please push the exercise once you are done 🙃

## 🏁 