## Import Packages

In [1]:
import matplotlib.pyplot as plt
import re
import os
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk import PorterStemmer
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.metrics import confusion_matrix

import gensim

## Load Data

In [2]:
data = pd.read_csv("Spam Email.csv", usecols=["CATEGORY", "MESSAGE"])

In [3]:
data

Unnamed: 0,CATEGORY,MESSAGE
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ..."
1,1,ATTENTION: This is a MUST for ALL Computer Use...
2,1,This is a multi-part message in MIME format.\n...
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...
4,1,This is the bottom line. If you can GIVE AWAY...
...,...,...
5791,0,"I'm one of the 30,000 but it's not working ver..."
5792,0,Damien Morton quoted:\n\n>W3C approves HTML 4 ...
5793,0,"On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\..."
5794,0,"Once upon a time, Manfred wrote :\n\n\n\n> I w..."


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5796 entries, 0 to 5795
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   CATEGORY  5796 non-null   int64 
 1   MESSAGE   5796 non-null   object
dtypes: int64(1), object(1)
memory usage: 90.7+ KB


In [5]:
data["CATEGORY"].value_counts()

0    3900
1    1896
Name: CATEGORY, dtype: int64

## Preprocessing

In [6]:
# remove non alphabets
remove_non_alphabets = lambda x: re.sub(r'[^a-zA-Z]',' ',x)

# tokenn alphabets-only list
tokenize = lambda x: word_tokenize(x)

# assign ps to a lambda function to run on each line of value
ps = PorterStemmer()
stem = lambda w: [ ps.stem(x) for x in w ]

# assign lemmatizer to a lambda function to run on each line of value
lemmatizer = WordNetLemmatizer()
leammtizer = lambda x: [ lemmatizer.lemmatize(word) for word in x ]

In [7]:
# apply all above methods to the column ''
print('Processing : [=', end='')
data['MESSAGE'] = data['MESSAGE'].apply(remove_non_alphabets)
print('=', end='')
data['MESSAGE'] = data['MESSAGE'].apply(tokenize)
print('=', end='')
data['MESSAGE'] = data['MESSAGE'].apply(stem)
print('=', end='')
data['MESSAGE'] = data['MESSAGE'].apply(leammtizer)
print('=', end='')
data['MESSAGE'] = data['MESSAGE'].apply(lambda x: ' '.join(x))
print('] : Completed', end='')
data.head()

Processing : [=====] : Completed

Unnamed: 0,CATEGORY,MESSAGE
0,1,dear homeown interest rate are at their lowest...
1,1,attent thi is a must for all comput user new s...
2,1,thi is a multi part messag in mime format next...
3,1,import inform the new domain name are final av...
4,1,thi is the bottom line If you can give away CD...


## Split Train Test Sets

In [8]:
train_corpus, test_corpus, train_labels, test_labels = train_test_split(data["MESSAGE"],
                                                                        data["CATEGORY"],
                                                                        test_size=0.3)

## Features of Machine Learning

### Bag of Words

In [9]:
bow_vectorizer=CountVectorizer(min_df=1, ngram_range=(1,1))
bow_train_features = bow_vectorizer.fit_transform(train_corpus)
bow_test_features = bow_vectorizer.transform(test_corpus)

### TFIDF

In [10]:
tfidf_vectorizer=TfidfVectorizer(min_df=1, 
                                 norm='l2',
                                 smooth_idf=True,
                                 use_idf=True,
                                 ngram_range=(1,1))
tfidf_train_features = tfidf_vectorizer.fit_transform(train_corpus)  
tfidf_test_features = tfidf_vectorizer.transform(test_corpus)

### Binary

In [11]:
binary_vectorizer=CountVectorizer(binary = True,min_df=1, ngram_range=(1,1))
binary_train_features = binary_vectorizer.fit_transform(train_corpus)
binary_test_features = binary_vectorizer.transform(test_corpus)

## Define Evaluation Function

In [12]:
def get_metrics(true_labels, predicted_labels):
    metrics_dict = dict(zip(["accuracy", "precision", "recall", "f1"], [None]*4))
    #metrics_dict = {i:None for i in ["accuracy", "precision", "recall", "f1"]}
    for m in metrics_dict.keys():
        exec('''metrics_dict['{}'] = np.round(                                                    
                        metrics.{}_score(true_labels, 
                                               predicted_labels),
                        2)'''.format(m, m))
    return metrics_dict

## Define an Easy-to-use Function for Train/Test/Evaluate

In [13]:
def train_predict_evaluate_model(classifier, 
                                 train_features, train_labels, 
                                 test_features, test_labels):
    # build model    
    classifier.fit(train_features, train_labels)
    # predict using model
    predictions = classifier.predict(test_features) 
    # evaluate model prediction performance   
    '''get_metrics(true_labels=test_labels, 
                predicted_labels=predictions)'''
    print(confusion_matrix(test_labels, predictions))
    print('\n')
    print(metrics.classification_report(test_labels,predictions))
    #return predictions, get_metrics(true_labels=test_labels, predicted_labels=predictions)
    return confusion_matrix(test_labels, predictions)

## Import Classifiers

In [14]:
from sklearn.naive_bayes import MultinomialNB # import naive bayes
from sklearn.tree import DecisionTreeClassifier # import Decision Tree
from sklearn.ensemble import RandomForestClassifier

## Train and Test on BOW features

### Naive Bayes

In [15]:
mnb = MultinomialNB()

# predict and evaluate naive bayes
mnb_bow_predictions, mnb_bow_metrics = train_predict_evaluate_model(classifier=mnb,
                                           train_features=bow_train_features,
                                           train_labels=train_labels,
                                           test_features=bow_test_features,
                                           test_labels=test_labels)

[[1157    5]
 [ 144  433]]


              precision    recall  f1-score   support

           0       0.89      1.00      0.94      1162
           1       0.99      0.75      0.85       577

    accuracy                           0.91      1739
   macro avg       0.94      0.87      0.90      1739
weighted avg       0.92      0.91      0.91      1739



In [16]:
matrix_nb_bow = (mnb_bow_predictions, mnb_bow_metrics)

In [17]:
print(matrix_nb_bow)

(array([1157,    5]), array([144, 433]))


### Decision Tree

In [18]:
# assign decision tree function to an object
dt = DecisionTreeClassifier()

# predict and evaluate decision tree
dt_bow_predictions, dt_bow_metrics = train_predict_evaluate_model(classifier=dt,
                                                               train_features=bow_train_features,
                                                               train_labels=train_labels,
                                                               test_features=bow_test_features,
                                                               test_labels=test_labels)

[[1116   46]
 [  41  536]]


              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1162
           1       0.92      0.93      0.92       577

    accuracy                           0.95      1739
   macro avg       0.94      0.94      0.94      1739
weighted avg       0.95      0.95      0.95      1739



In [19]:
matrix_dt_bow = (dt_bow_predictions, dt_bow_metrics )

### Random Forest

In [20]:
# assign random forest function to an object
rf = RandomForestClassifier(criterion="entropy")

# predict and evaluate random forest
rf_bow_predictions, rf_bow_metrics = train_predict_evaluate_model(classifier=rf,
                                           train_features=bow_train_features,
                                           train_labels=train_labels,
                                           test_features=bow_test_features,
                                           test_labels=test_labels)

[[1158    4]
 [  38  539]]


              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1162
           1       0.99      0.93      0.96       577

    accuracy                           0.98      1739
   macro avg       0.98      0.97      0.97      1739
weighted avg       0.98      0.98      0.98      1739



In [21]:
matrix_rf_bow = (rf_bow_predictions, rf_bow_metrics )

## Train and Test on TFIDF features

### Naive Bayes

In [22]:
# predict and evaluate naive bayes
mnb_tfidf_predictions, mnb_tfidf_metrics = train_predict_evaluate_model(classifier=mnb,
                                           train_features=tfidf_train_features,
                                           train_labels=train_labels,
                                           test_features=tfidf_test_features,
                                           test_labels=test_labels)

[[1160    2]
 [ 217  360]]


              precision    recall  f1-score   support

           0       0.84      1.00      0.91      1162
           1       0.99      0.62      0.77       577

    accuracy                           0.87      1739
   macro avg       0.92      0.81      0.84      1739
weighted avg       0.89      0.87      0.86      1739



In [23]:
matrix_nb_tfidf = (mnb_tfidf_predictions, mnb_tfidf_metrics)

### Decision Tree

In [24]:
dt_tfidf_predictions, dt_tfidf_metrics = train_predict_evaluate_model(classifier=dt,
                                           train_features=tfidf_train_features,
                                           train_labels=train_labels,
                                           test_features=tfidf_test_features,
                                           test_labels=test_labels)

[[1120   42]
 [  36  541]]


              precision    recall  f1-score   support

           0       0.97      0.96      0.97      1162
           1       0.93      0.94      0.93       577

    accuracy                           0.96      1739
   macro avg       0.95      0.95      0.95      1739
weighted avg       0.96      0.96      0.96      1739



In [25]:
matrix_dt_tfidf = ( dt_tfidf_predictions, dt_tfidf_metrics)

### Random Forest

In [26]:
# predict and evaluate random forest
rf_tfidf_predictions, rf_tfidf_metrics = train_predict_evaluate_model(classifier=rf,
                                           train_features=tfidf_train_features,
                                           train_labels=train_labels,
                                           test_features=tfidf_test_features,
                                           test_labels=test_labels)

[[1158    4]
 [  41  536]]


              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1162
           1       0.99      0.93      0.96       577

    accuracy                           0.97      1739
   macro avg       0.98      0.96      0.97      1739
weighted avg       0.97      0.97      0.97      1739



In [27]:
matrix_rf_tfidf = ( rf_tfidf_predictions, rf_tfidf_metrics)

## Train and Test on Binary features

### Naive Bayes

In [28]:
mnb_binary_predictions, mnb_binary_metrics = train_predict_evaluate_model(classifier=mnb,
                                           train_features=binary_train_features,
                                           train_labels=train_labels,
                                           test_features=binary_test_features,
                                           test_labels=test_labels)

[[1160    2]
 [ 157  420]]


              precision    recall  f1-score   support

           0       0.88      1.00      0.94      1162
           1       1.00      0.73      0.84       577

    accuracy                           0.91      1739
   macro avg       0.94      0.86      0.89      1739
weighted avg       0.92      0.91      0.90      1739



In [29]:
matrix_nb_binary = (mnb_binary_predictions, mnb_binary_metrics)

### Decision Tree

In [30]:
dt_binary_predictions, dt_binary_metrics = train_predict_evaluate_model(classifier=dt,
                                           train_features=binary_train_features,
                                           train_labels=train_labels,
                                           test_features=binary_test_features,
                                           test_labels=test_labels)

[[1123   39]
 [  42  535]]


              precision    recall  f1-score   support

           0       0.96      0.97      0.97      1162
           1       0.93      0.93      0.93       577

    accuracy                           0.95      1739
   macro avg       0.95      0.95      0.95      1739
weighted avg       0.95      0.95      0.95      1739



In [31]:
matrix_dt_binary = (dt_binary_predictions, dt_binary_metrics)

### Random Forest

In [32]:
rf_binary_predictions, rf_binary_metrics = train_predict_evaluate_model(classifier=rf,
                                           train_features=binary_train_features,
                                           train_labels=train_labels,
                                           test_features=binary_test_features,
                                           test_labels=test_labels)

[[1156    6]
 [  39  538]]


              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1162
           1       0.99      0.93      0.96       577

    accuracy                           0.97      1739
   macro avg       0.98      0.96      0.97      1739
weighted avg       0.97      0.97      0.97      1739



In [33]:
matrix_rf_binary = (rf_binary_predictions, rf_binary_metrics)

## Visualize Perfomance Matrix

In [34]:
# create a dictionary that stores all the accuracy information
performance_dict = {}

for me in ["accuracy", "precision", "recall", "f1"]:
    performance_dict[me] = {}
    for m in ["mnb","dt","rf"]:
        performance_dict[me][m] = {}
        for f in ["bow","tfidf","binary"]:
            exec('performance_dict["{}"]["{}"]["{}"] = {}_{}_metrics["{}"]'.format(me, m, f, m, f, me))
        
#Accuracy Matrix
print("\n\033[1;31mAccuracy Matrix\n\033[0m")
print(pd.DataFrame(performance_dict["accuracy"]).rename(columns={"mnb":"Naive Bayes", 
                                            "dt":"Decision Tree", 
                                            "rf":"Random Forest"}, 
                                   index={"bow":"Bag-of-words", 
                                          "tfidf":"TFIDF", 
                                         "binary":"Binary" }))

#Precision Matrix
print("\n\033[1;31mPrecision Matrix\n\033[0m")
print(pd.DataFrame(performance_dict["precision"]).rename(columns={"mnb":"Naive Bayes", 
                                            "dt":"Decision Tree", 
                                            "rf":"Random Forest"}, 
                                   index={"bow":"Bag-of-words", 
                                          "tfidf":"TFIDF", 
                                          "binary":"Binary"}))

#Recall Matrix
print("\n\033[1;31mRecall Matrix\n\033[0m")
print(pd.DataFrame(performance_dict["recall"]).rename(columns={"mnb":"Naive Bayes", 
                                            "dt":"Decision Tree", 
                                            "rf":"Random Forest"}, 
                                   index={"bow":"Bag-of-words", 
                                          "tfidf":"TFIDF", 
                                         "binary":"Binary" }))

#F1 Score Matrix
print("\n\033[1;31mF1 Score Matrix\n\033[0m")
print(pd.DataFrame(performance_dict["f1"]).rename(columns={"mnb":"Naive Bayes", 
                                            "dt":"Decision Tree", 
                                            "rf":"Random Forest"}, 
                                   index={"bow":"Bag-of-words", 
                                          "tfidf":"TFIDF", 
                                         "binary":"Binary"  }))

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

## Cost for Bag of Words (Frequency)

In [35]:
bow_nb_cost = matrix_nb_bow [1][0] * 5 + matrix_nb_bow[0][1] * 100
bow_dt_cost = matrix_dt_bow [1][0] * 5 + matrix_dt_bow[0][1] * 100
bow_rf_cost = matrix_rf_bow [1][0] * 5 + matrix_rf_bow[0][1] * 100

## Cost for TFIDF

In [36]:
tfidf_nb_cost = matrix_nb_tfidf[1][0] * 5 + matrix_nb_tfidf [0][1] * 100
tfidf_dt_cost = matrix_dt_tfidf[1][0] * 5 + matrix_dt_tfidf [0][1] * 100
tfidf_rf_cost = matrix_rf_tfidf[1][0] * 5 + matrix_rf_tfidf[0][1] * 100

## Cost for Binary

In [37]:
binary_nb_cost = matrix_nb_binary[1][0] * 5 + matrix_nb_binary [0][1] * 100
binary_dt_cost = matrix_dt_binary[1][0] * 5 + matrix_dt_binary [0][1] * 100
binary_rf_cost = matrix_rf_binary[1][0] * 5 + matrix_rf_binary [0][1] * 100

In [38]:
print('bow_nb_cost = ',bow_nb_cost)
print('bow_dt_cost = ',bow_dt_cost)
print('bow_rf_cost = ',bow_rf_cost)
print('tfidf_nb_cost = ',tfidf_nb_cost)
print('tfidf_dt_cost = ', tfidf_dt_cost)
print('tfidf_rf_cost = ',tfidf_rf_cost)
print('binary_nb_cost = ', binary_nb_cost)
print('binary_dt_cost = ', binary_dt_cost)
print('binary_rf_cost = ', binary_rf_cost)

bow_nb_cost =  1220
bow_dt_cost =  4805
bow_rf_cost =  590
tfidf_nb_cost =  1285
tfidf_dt_cost =  4380
tfidf_rf_cost =  605
binary_nb_cost =  985
binary_dt_cost =  4110
binary_rf_cost =  795
