In [1]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# read datasets:
sp_data0 = pd.read_csv("./enronSpamSubset.csv").drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
sp_data1 = pd.read_csv("./lingSpam.csv").drop(['Unnamed: 0'], axis=1)
sp_data1.drop(sp_data1.tail(1).index,inplace=True) # drop last row since it summarizes all rows again
sp_data = pd.concat([sp_data0, sp_data1]) 
sp_data.head()

Unnamed: 0,Body,Label
0,Subject: stock promo mover : cwtd\n * * * urge...,1
1,Subject: are you listed in major search engine...,1
2,"Subject: important information thu , 30 jun 20...",1
3,Subject: = ? utf - 8 ? q ? bask your life with...,1
4,"Subject: "" bidstogo "" is places to go , things...",1


In [10]:
# test train split
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(sp_data, test_size=0.2, random_state=42)
test_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2521 entries, 8466 to 9802
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Body    2521 non-null   object
 1   Label   2521 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 59.1+ KB


In [11]:
# check if we have duplicates
print("There are {} duplicates.".format(np.sum(train_set.duplicated())))

# remove duplicates
if False:
    sp_data = train_set.drop_duplicates() 

There are 217 duplicates.


In [12]:
train_set.info() # check if we have missing values

# delete missing values
if False:
    train_set.dropna(inplace=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10083 entries, 4299 to 7270
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Body    10083 non-null  object
 1   Label   10083 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 236.3+ KB


In [117]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')


def prep_mails(text_col, features=[1,1,1,1,1]):
    
    text_col = [text[7:] for text in text_col] # do not include "Subject:" prefix
    
    if features[0]:
        # remove links since they end up in tokens with no meaning
        text_col = [re.sub(r'http\S+', '', text) for text in text_col]
    
    if features[1]:
        # remove everything exept alphabetical characters and numbers
        pattern = ["[^a-zA-Z0-9]", "[^a-zA-Z]"]
        text_col = [re.sub(pattern[1]," ",text) for text in text_col]
    
    if features[2]:
        # convert uppercase chars in lowercase chars
        text_col = [text.lower() for text in text_col]
    
    # turn sentences into seperate worlds
    data_tokenized = [nltk.word_tokenize(text) for text in text_col]
    
    if features[3]:
        # lemmatize all world, that is convert them into most basic form
        lemma = WordNetLemmatizer()
        data_tokenized = [[lemma.lemmatize(word) for word in text] for text in data_tokenized]
    
    if features[4]:
        stopwords = nltk.corpus.stopwords.words("english")
        data_tokenized = [[word for word in text if word not in stopwords] for text in data_tokenized]
    
    return data_tokenized

[nltk_data] Downloading package wordnet to /home/fabian/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/fabian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/fabian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [118]:
# clean up text data 
train_set_prepped = prep_mails(train_set["Body"])

In [119]:
def split_words(document_list):
    return [" ".join(text) for text in document_list]

n_features = 300 # maximum number in bag of words extractor
# count occurences of attibutes in mails
# reduced number of attributes to the N-attributes with most occureces
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_bow = CountVectorizer(max_features=n_features)
# Learn the vocabulary dictionary and return document-term matrix.
train_vectors = vectorizer_bow.fit_transform(split_words(train_set_prepped)).toarray()
#vectorizer_bow.get_feature_names()


In [137]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 2)
knn.fit(train_vectors, train_set["Label"] )

KNeighborsClassifier(n_neighbors=2)

In [128]:
# apply cleanup pipeline to testset
test_set_prepped = prep_mails(test_set["Body"])

In [138]:
# vectorize testset by same vectorizer as train set
test_vectors = vectorizer_bow.transform(split_words(test_set_prepped)).toarray()
pred_test = knn.predict(test_vectors)

In [139]:
# evaluate results
from sklearn.metrics import classification_report , confusion_matrix , accuracy_score
accuracy_score(test_set["Label"], pred_test)

0.9044030146767156

In [168]:
# count ony if word occurs or not
train_vectors[train_vectors>0] = 1
test_vectors[test_vectors>0] = 1
knn1 = KNeighborsClassifier(n_neighbors = 1,weights="distance", algorithm="ball_tree")
knn1.fit(train_vectors, train_set["Label"] )
pred_test = knn1.predict(test_vectors)
accuracy_score(test_set["Label"], pred_test)

0.882189607298691

1