In [195]:
#Import libraries 
import os 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import sklearn as scikit_learn
import re 
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
import string
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/solougbane/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/solougbane/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/solougbane/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [196]:
def wordcleaner(text):
    #Remove special characters, punctuations and numbers 
    text = re.sub(r'[^a-zA-Z\s]','',text)
    
    # turn text to lowercase
    text = text.lower()
    
    #Tokenize text
    tokens = word_tokenize(text)
    
    #Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # lemmatize test 
    lemmatizer = WordNetLemmatizer()
    lemwords = [lemmatizer.lemmatize(token) for token in tokens]
    
    #Join tokens back in single string 
    processed_text = ' '.join(lemwords)
    return processed_text 

In [197]:
#Write spam training files into training dataframe 
spamtrain_directory = '/Users/solougbane/Desktop/Project_1/emails/spamtraining'

spamdata = []

for filename in os.listdir(spamtrain_directory):
    if filename.endswith('.txt'):
        # read file contents 
        with open(os.path.join(spamtrain_directory, filename), 'r',errors = 'ignore') as file:
            spamcontent = file.read()
            #Pre-process the text 
            processed_text = wordcleaner(spamcontent)
            # add processed text to list 
            spamdata.append({'content': processed_text, 'spam' : 1})
            
#Write ham training files into training dataframe
hamtrain_directory = '/Users/solougbane/Desktop/Project_1/emails/hamtraining'
for filename in os.listdir(hamtrain_directory):
    if filename.endswith('.txt'):
        # read file contents 
        with open(os.path.join(hamtrain_directory, filename), 'r') as file:
            hamcontent = file.read()
            #Pre-process the ham text 
            processed_text = wordcleaner(hamcontent)
            #add processed ham text to list 
            spamdata.append({'content': processed_text, 'spam' : 0})

train_df = pd.DataFrame(spamdata)


In [198]:
print(content_filtered)

['subject', '', 'communicating', 'effectively', 'course', 'offering', 'daren', '', 'effective', 'communication', 'course', 'december', '7', '', 'thursday', '', '1', '', '5', 'pm', '', 'one', 'suggested', '', '', 'may', 'attend', 'one', '', 'please', 'let', 'know', '', 'listed', 'ernie', '', '200', '', 'thanks', 'looking', '']


In [199]:
print("raw text:", content)

raw text: Subject: communicating effectively course offering
daren -
there is an effective communication course on december 7 ( thursday ) from
1 - 5 pm . is this the one you suggested for me , and if it is , may i attend this
one ? please let me know . it is listed through ernie and is $ 200 . thanks
for looking into this .


In [200]:
print("filtered text:", processed_text)

filtered text: subject communicating effectively course offering daren effective communication course december thursday pm one suggested may attend one please let know listed ernie thanks looking


In [201]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  200 non-null    object
 1   spam     200 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 3.3+ KB


In [202]:
train_df.head(5)

Unnamed: 0,content,spam
0,subject quick way buy soft ware variety top ma...,1
1,subject heisser fetish mann war da ein wochene...,1
2,subject nb real vallum x anax l evitra soma mu...,1
3,subject work wondder dear sir madam please pol...,1
4,subject get free ibm thinkpad computer,1


In [203]:
train_df.tail(5)

Unnamed: 0,content,spam
195,subject neon verse see tonight love scripture doc,0
196,subject fw king ranch balancing xl saxet flowi...,0
197,subject hpl delivery meter cheryl documentatio...,0
198,subject devon availability may forwarded victo...,0
199,subject communicating effectively course offer...,0


In [204]:
# Turn training data (words/content) into vectors 
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
train_cv = cv.fit_transform(train_df['content'])

In [205]:
train_cv.shape

(200, 5946)

In [206]:
#assign X and Y training data
x_train = train_cv
train_label = train_df['spam']
y_train = train_label

In [207]:
#Import testing data 
spamtest_directory = '/Users/solougbane/Desktop/Project_1/emails/spamtesting'

spamtest = []

for filename in os.listdir(spamtest_directory):
    if filename.endswith('.txt'):
        # read spam testing files 
        with open(os.path.join(spamtest_directory, filename), 'r',errors = 'ignore') as file:
            spamtest_content = file.read()
            # pre-process the spam testing data
            processed_text = wordcleaner(spamtest_content)
            # add spam testing to list 
            spamtest.append({'content': processed_text, 'spam' : 1})

hamtest_directory = '/Users/solougbane/Desktop/Project_1/emails/hamtesting'
for filename in os.listdir(hamtest_directory):
    if filename.endswith('.txt'):
        # read ham testing files 
        with open(os.path.join(hamtest_directory, filename), 'r') as file:
            hamtest_content = file.read()
            # pre-processing the ham testing data
            processed_text = wordcleaner(hamtest_content)
            # add ham testing data to list 
            spamtest.append({'content': processed_text, 'spam' : 0})

test_df = pd.DataFrame(spamtest)

In [208]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  200 non-null    object
 1   spam     200 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 3.3+ KB


In [209]:
# turn testing data into vectors
test_cv = cv.transform(test_df['content'])
x_test = test_cv
testlabel = test_df['spam']
y_test = testlabel

Applying SVC 

In [210]:
#import SVC module 
from sklearn.svm import SVC
svm_classifier = SVC(kernel = 'rbf', random_state = 0)
svm_classifier.fit(x_train,y_train)

Evaluate Model 

In [211]:
from sklearn.metrics import classification_report,confusion_matrix
conmat = confusion_matrix(y_train,y_train_predict)
print(conmat)

[[ 74  26]
 [  0 100]]


In [212]:
from sklearn.metrics import classification_report,confusion_matrix
y_train_predict = svm_classifier.predict(x_train)
#predict labels 
y_test_predict = svm_classifier.predict(x_test)
print(classification_report(y_test,y_test_predict))

              precision    recall  f1-score   support

           0       1.00      0.44      0.61       100
           1       0.64      1.00      0.78       100

    accuracy                           0.72       200
   macro avg       0.82      0.72      0.70       200
weighted avg       0.82      0.72      0.70       200

