# Ham or Spam?

In [43]:
# when installing nltk for the first time we need to also download a few built in libraries
!pip install nltk
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/humbert/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/humbert/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/humbert/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/humbert/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [3]:
import pandas as pd

df = pd.read_csv("emails.csv")

df

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


The dataset is made up of email that are classified as ham [0] or spam[1]. You need to clean the dataset before training a prediction model.

## Remove Punctuation

👇 Create a function to remove the punctuation. Apply it to the entire data and add the output as a new column in the dataframe called `clean_text`

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


In [18]:
df['spam'].dtype

dtype('int64')

In [11]:
[s for s in df.astype('object')]

['text', 'spam']

In [27]:
import string 

def remove_punct(df_to_treat):
    dataf = df_to_treat.copy()
    for col in dataf:
        if dataf[col].dtype == 'O':
            for punct in string.punctuation:
                dataf[col] = [text.replace(punct, '') for text in dataf[col]]
    return dataf

In [28]:
clean_text = remove_punct(df)

## Lower Case

👇 Create a function to lower case the text. Apply it to `clean_text`

In [29]:
def lower_func(df_to_treat):
    dataf = df_to_treat.copy()
    for col in dataf:
        if dataf[col].dtype == 'O':
            for punct in string.punctuation:
                dataf[col] = [text.lower() for text in dataf[col]]
    return dataf

In [31]:
lower_text = lower_func(clean_text)

## Remove Numbers

👇 Create a function to remove numbers from the text. Apply it to `clean_text`

In [35]:
def remove_nb(df_to_treat):
    dataf = df_to_treat.copy()
    for col in dataf:
        if dataf[col].dtype == 'O':
            dataf[col] = [''.join(word for word in text if not word.isdigit()) for text in dataf[col]]
    return dataf

In [39]:
clean_text = remove_nb(lower_text)
clean_text

Unnamed: 0,text,spam
0,subject naturally irresistible your corporate ...,1
1,subject the stock trading gunslinger fanny is...,1
2,subject unbelievable new homes made easy im w...,1
3,subject color printing special request addit...,1
4,subject do not have money get software cds fr...,1
...,...,...
5723,subject re research and development charges t...,0
5724,subject re receipts from visit jim thanks ...,0
5725,subject re enron case study update wow all ...,0
5726,subject re interest david please call shi...,0


## Remove StopWords

👇 Create a function to remove stopwords from the text. Apply it to `clean_text`.

In [37]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english')) 

def remove_sw(df_to_treat):
    dataf = df_to_treat.copy()
    for col in dataf:
        if dataf[col].dtype == 'O':
            dataf[col] = [[w for w in word_tokenize(text) if not w in stop_words] for text in dataf[col]]
    return dataf


In [40]:
no_stopwords_df = remove_sw(clean_text)
no_stopwords_df

Unnamed: 0,text,spam
0,"[subject, naturally, irresistible, corporate, ...",1
1,"[subject, stock, trading, gunslinger, fanny, m...",1
2,"[subject, unbelievable, new, homes, made, easy...",1
3,"[subject, color, printing, special, request, a...",1
4,"[subject, money, get, software, cds, software,...",1
...,...,...
5723,"[subject, research, development, charges, gpg,...",0
5724,"[subject, receipts, visit, jim, thanks, invita...",0
5725,"[subject, enron, case, study, update, wow, day...",0
5726,"[subject, interest, david, please, call, shirl...",0


## Lemmatize

👇 Create a function to lemmatize the text. Make sure the output is a single string, not a list of words. Apply it to `clean_text`.

In [54]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemm_func(df_to_treat):
    dataf = df_to_treat.copy()
    for col in dataf:
        if dataf[col].dtype == 'O':
            dataf[col] = [" ".join([lemmatizer.lemmatize(word) for word in text]) for text in dataf[col]]
    return dataf

In [48]:
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('trading')

'trading'

In [55]:
clean_text = lemm_func(no_stopwords_df)
clean_text

Unnamed: 0,text,spam
0,subject naturally irresistible corporate ident...,1
1,subject stock trading gunslinger fanny merrill...,1
2,subject unbelievable new home made easy im wan...,1
3,subject color printing special request additio...,1
4,subject money get software cd software compati...,1
...,...,...
5723,subject research development charge gpg forwar...,0
5724,subject receipt visit jim thanks invitation vi...,0
5725,subject enron case study update wow day super ...,0
5726,subject interest david please call shirley cre...,0


## Bag-of-words Modelling

👇 Vectorize the `clean_text` to a Bag-of-Words representation with a default CountVectorizer . Save as `X_bow`.

In [56]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

X_bow = vectorizer.fit_transform(clean_text.text)

X_bow.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

👇 Cross-validate a MultinomialNB model with the Bag-of-words. Score the model's accuracy.

In [59]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

y = clean_text.spam

nb_model = MultinomialNB()

cv = cross_val_score(nb_model, X_bow, y, scoring = 'accuracy')

In [62]:
score = cv.mean()
score

0.9895252901681946

⚠️ Please push the exercise once you are done 🙃

## 🏁 