In [None]:
### import packages

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
# Do not change the random seed, or else the tests will fail!
# Seeds NumPy's global random state. train_test_split is later called without
# a random_state argument, so the train/test split presumably depends on this
# seed -- TODO confirm against the installed scikit-learn version.
np.random.seed(0)
%matplotlib inline

### Dataset

For this lab, we'll be working with the Spambase Dataset from UC Irvine's Machine Learning Repository.

This dataset contains emails that have already been vectorized, as well as summary statistics about each email that can also be useful in classification. In this case, the Data Dictionary containing the names and descriptions of each column is stored in a separate file from the dataset itself. For ease of use, we have included the spambase.csv file in this repo. However, neither the Data Dictionary nor the column names are included, so we will load the CSV without a header row and add the column names ourselves.

In [None]:
# Read the raw Spambase table. header=None tells pandas the file has no
# header row, so the first line is treated as data.
df = pd.read_csv(
    'spambase.csv',
    header=None,
)
# Preview the first rows.
df.head()

In [None]:
### Adding in column names

# Column labels, grouped by feature family. Order matters: the labels are
# applied to the DataFrame positionally.

# Word-frequency features (48 columns).
_word_freq_columns = [
    'word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d',
    'word_freq_our', 'word_freq_over', 'word_freq_remove', 'word_freq_internet',
    'word_freq_order', 'word_freq_mail', 'word_freq_receive', 'word_freq_will',
    'word_freq_people', 'word_freq_report', 'word_freq_addresses', 'word_freq_free',
    'word_freq_business', 'word_freq_email', 'word_freq_you', 'word_freq_credit',
    'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money',
    'word_freq_hp', 'word_freq_hpl', 'word_freq_george', 'word_freq_650',
    'word_freq_lab', 'word_freq_labs', 'word_freq_telnet', 'word_freq_857',
    'word_freq_data', 'word_freq_415', 'word_freq_85', 'word_freq_technology',
    'word_freq_1999', 'word_freq_parts', 'word_freq_pm', 'word_freq_direct',
    'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project',
    'word_freq_re', 'word_freq_edu', 'word_freq_table', 'word_freq_conference',
]

# Character-frequency features (6 columns).
_char_freq_columns = [
    'char_freq_;', 'char_freq_(', 'char_freq_[',
    'char_freq_!', 'char_freq_$', 'char_freq_#',
]

# Capital-letter run-length features (3 columns).
_capital_run_columns = [
    'capital_run_length_average',
    'capital_run_length_longest',
    'capital_run_length_total',
]

# The target label comes last.
column_names = [
    *_word_freq_columns,
    *_char_freq_columns,
    *_capital_run_columns,
    'is_spam',
]

# Relabel the columns of df in place. The assignment is positional, so
# column_names must hold exactly one name per column, in column order.
df.columns = column_names
# Preview the first rows to confirm the new labels.
df.head()

In [None]:
# Null check: count the missing values in each column, then collapse those
# counts to their distinct values (a lone 0 means nothing is missing).
nulls_per_column = df.isna().sum()
nulls_per_column.unique()

In [None]:
# Per-column summary statistics, useful for eyeballing scales and outliers.
df.describe()

In [None]:
# Class balance: fraction of rows whose is_spam value is 1
# (sum of the column divided by the number of rows).
spam_flags = df['is_spam']
spam_flags.sum() / len(spam_flags)

In [None]:
# Correlation heatmap of every pair of columns in df.

sns.set(style='white')

corr = df.corr()

# Mask the upper triangle (diagonal included): the correlation matrix is
# symmetric, so those cells only repeat the lower triangle.
# Use the builtin `bool` here; the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(22, 18))

cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw onto the axes created above rather than relying on the implicit
# "current axes"; vmax=.5 caps the colour scale.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.5, center=0,
            square=True, linewidths=.5, ax=ax)
# The trailing semicolon keeps the notebook from echoing the return value.
ax.set_title('Pairwise correlation of Spambase columns');

In [None]:
# Split the frame into the label column and the feature columns.
target = df['is_spam']
clean_df = df.drop(columns=['is_spam'])

# Create the train and test partitions, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    clean_df,
    target,
    stratify=target,
)

In [None]:
# Fit a Gaussian Naive Bayes classifier on the training split.
clf = GaussianNB()
# The stray `GaussianNB(priors=None)` line that used to follow was pasted
# cell output: it built and displayed a second, unfitted estimator. fit() is
# now the cell's last expression, so the fitted model is what gets displayed.
clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split.
preds = clf.predict(X_test)

# Score the predictions against the true test labels.
accuracy = accuracy_score(y_test, preds)
f1 = f1_score(y_test, preds)

# Report both scores as percentages, formatted to four significant digits.
accuracy_report = "Accuracy Score for model: {:.4}%".format(100 * accuracy)
f1_report = "F1 Score for model: {:.4}%".format(100 * f1)
print(accuracy_report)
print(f1_report)

In [None]:
# confusion matrix to interpret results
def confusion_matrix(predictions, labels):
    """Count true/false positives and negatives for binary predictions.

    A prediction equal to 1 is treated as the positive class; any other
    value is treated as negative.

    Parameters
    ----------
    predictions : iterable
        Predicted class for each sample (list, NumPy array, pandas Series...).
    labels : iterable
        True class for each sample, in the same order as ``predictions``.

    Returns
    -------
    dict
        Counts keyed by 'TP', 'TN', 'FP' and 'FN'.

    Raises
    ------
    ValueError
        If ``predictions`` and ``labels`` differ in length.
    """
    # Materialise both inputs as plain lists so the pairing is positional.
    # Indexing a pandas Series with predictions[i] is label-based and breaks
    # (or silently mispairs) when its index is not 0..n-1.
    predictions = list(predictions)
    labels = list(labels)
    if len(predictions) != len(labels):
        raise ValueError(
            "predictions and labels must have the same length "
            "(got {} and {})".format(len(predictions), len(labels))
        )

    cm = {'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0}
    for pred, label in zip(predictions, labels):
        if pred == label:
            cm['TP' if pred == 1 else 'TN'] += 1
        else:
            cm['FP' if pred == 1 else 'FN'] += 1

    return cm

# Predict on the training split as well, so both splits can be compared.
training_preds = clf.predict(X_train)

# One confusion matrix per split; `preds` holds the test-set predictions.
training_cm = confusion_matrix(predictions=training_preds, labels=y_train)
testing_cm = confusion_matrix(predictions=preds, labels=y_test)

for report_line in (
    "Training Confusion Matrix: {}".format(training_cm),
    "Testing Confusion Matrix: {}".format(testing_cm),
):
    print(report_line)