## Spam Classification with SVMs

### Email Preprocessing

###### To use an SVM to classify emails into Spam v.s. Non-Spam, you first need to convert each email into a vector of features. In this part,you will implement the preprocessing steps for each email. You should complete the code in processEmail.m to produce a word indices vector for a given email.

In [1]:
import numpy as np
import random
import re
import pandas as pd
import scipy.io as sio   ## for loading octave matlab file
import math
from sklearn.model_selection import train_test_split
from scipy.special import expit #Vectorized sigmoid function
import matplotlib.pyplot as plt
import scipy.optimize #fmin_cg to train neural network
from sklearn import svm
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn import metrics
from nltk.stem import PorterStemmer   ###### THIS LIBRARY IS USED FOR THE PROCESS OF STEMMING 



In [2]:
original_file = open(r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\emailSample1.txt','r')
email_contents = original_file.read()

In [3]:
'''
 GETVOCABLIST reads the fixed vocabulary list in vocab.txt and returns a
 cell array of the words list.

'''

def getVocabList():
    with open(r"C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\vocab.txt") as vocab:
        vocabList = vocab.read()
        
    splitted_list = vocabList.split('\n')
    
    vocab_dict = {}
    for i in range(len(splitted_list)-1):
        entry = splitted_list[i].split('\t')   
        vocab_dict[entry[1]] = entry[0]    ## note that we save the key-value pair as 'word':index
    return vocab_dict

In [4]:
vocab_dict = getVocabList()
print(" +++++++++++         raw email          ++++++++++++++++++\n")
print(email_contents)
vocab_dict

 +++++++++++         raw email          ++++++++++++++++++

> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors you're expecting.
This can be anywhere from less than 10 bucks a month to a couple of $100. 
You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 
if youre running something big..

To unsubscribe yourself from this mailing list, send an email to:
groupname-unsubscribe@egroups.com




{'aa': '1',
 'ab': '2',
 'abil': '3',
 'abl': '4',
 'about': '5',
 'abov': '6',
 'absolut': '7',
 'abus': '8',
 'ac': '9',
 'accept': '10',
 'access': '11',
 'accord': '12',
 'account': '13',
 'achiev': '14',
 'acquir': '15',
 'across': '16',
 'act': '17',
 'action': '18',
 'activ': '19',
 'actual': '20',
 'ad': '21',
 'adam': '22',
 'add': '23',
 'addit': '24',
 'address': '25',
 'administr': '26',
 'adult': '27',
 'advanc': '28',
 'advantag': '29',
 'advertis': '30',
 'advic': '31',
 'advis': '32',
 'ae': '33',
 'af': '34',
 'affect': '35',
 'affili': '36',
 'afford': '37',
 'africa': '38',
 'after': '39',
 'ag': '40',
 'again': '41',
 'against': '42',
 'agenc': '43',
 'agent': '44',
 'ago': '45',
 'agre': '46',
 'agreement': '47',
 'aid': '48',
 'air': '49',
 'al': '50',
 'alb': '51',
 'align': '52',
 'all': '53',
 'allow': '54',
 'almost': '55',
 'alon': '56',
 'along': '57',
 'alreadi': '58',
 'alsa': '59',
 'also': '60',
 'altern': '61',
 'although': '62',
 'alwai': '63',
 'am': 

In [5]:
## two methods for removing html tags
# method 1
def cleanhtml(test):
    cleaner = re.compile('<.*?>')
    cleantext = re.sub(cleaner, '' , test)
    return cleantext
test = """
<html>
<body>

<h1>My First Heading</h1>
<p>My first paragraph.</p>

</body>
</html>
"""
print(cleanhtml(test))

# method 2
text = re.sub('<[^<]+?>', ' ', test)
print(text)





My First Heading
My first paragraph.





 
 

 My First Heading 
 My first paragraph. 

 
 



In [6]:
'''
PROCESSEMAIL preprocesses a the body of an email and returns a list of word_indices preprocesses the body of an email and 
returns a list of indices of the words contained in the email. 
'''
def processEmail(email_contents,vocab_dict):
    
    word_indices = []

    hdrstart = email_contents.find('\n\n')
    email_contents = email_contents[hdrstart:]
    
    email_contents = email_contents.lower()       ## converting all uppercase to lower case
    '''
     Strip all HTML
     Looks for any expression that starts with < and ends with > and replace
     and does not have any < or > in the tag it with a space

     email_contents = cleanhtml(email_contents)    ## removing html tags

    '''
    email_contents = re.sub('<[^<]+?>', ' ', email_contents)  ## remove html tags, note it will not remove simply < or >

    # Handle Numbers Look for one or more characters between 0-9
    email_contents = re.sub('[0-9]+?', ' ', email_contents)     

    #Handle URLS  Look for strings starting with http:// or https://
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents)     

    #Handle Email Addresses Look for strings with @ in the middle
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)     

    # Handle $ sign
    email_contents = re.sub('[$]+', 'dollar', email_contents)

    #print( " =====================          processed Email              ======================= \n")
    #print(email_contents)


    #######        TOKENIZE EMAIL      #########

    tokens = re.split('[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\}\,\'\"\>\_\<\;\%\\n]', email_contents)

    ps = PorterStemmer()            ####   porter stemmer is used to stem the word Eg. ps.stem('include') = includ

    for i in tokens:
        if len(i) > 1:
            i = ps.stem(i)
    ##Look up the word in the dictionary and add to word_indices if found
            if i in vocab_dict.keys():                 ## check if that word is in the vocab list
                word_indices.append(vocab_dict[i])     ## adding the index if the word is found 
    
    return word_indices


In [7]:
word_indices = processEmail(email_contents,vocab_dict)
print(len(word_indices))

12


### Extracting features from email

In [8]:
# takes in a word_indices vector and 
# produces a feature vector from the word indices.
def emailFeatures(word_indices):
    n = len(vocab_dict)  #% Total number of words in the dictionary
    featureVector = np.zeros((n,1))
    for i in word_indices:
        featureVector[int(i) - 1] = 1
    return featureVector


In [9]:
features = emailFeatures(word_indices)

In [10]:

# Print Stats
print('Length of feature vector: \n', len(features));
print('Number of non-zero entries: \n', sum(features > 0));


Length of feature vector: 
 1899
Number of non-zero entries: 
 [11]


## Part 3: Train Linear SVM for Spam Classification

##### In this section, you will train a linear classifier to determine if an email is Spam or Not-Spam.

In [11]:
#Load the Spam Email dataset You will have X, y in your environment
spamTrainMat = sio.loadmat(r"C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\spamTrain.mat")
spamTestMat =  sio.loadmat(r"C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\spamTest.mat")

X=spamTrainMat['X']
y=spamTrainMat['y']          #### taking out X and y from the matrix

Xtest = spamTestMat['Xtest']
ytest = spamTestMat['ytest']          #### taking out Xtest and ytest from the matrix

C = 0.1                     ## C =0.1  parameter (1/Lambda)
                                      
linear_svm = SVC(C=0.1,kernel='linear')            ## making the model
linear_svm.fit(X,y.ravel())                        ## fitting the model
y_pred = linear_svm.predict(X)                     ## predicting the values
y_pred_test = linear_svm.predict(Xtest)                     ## predicting the values

In [12]:
print("accuracy on training data ")
print(metrics.accuracy_score(y,y_pred))
print("accuracy on testing data ")
print(metrics.accuracy_score(ytest,y_pred_test))


accuracy on training data 
0.99825
accuracy on testing data 
0.989


### Top Predictors of Spam

In [13]:
'''
Since the model we are training is a linear SVM, we can inspect the
%  weights learned by the model to understand better how it is determining
%  whether an email is spam or not. The following code finds the words with
%  the highest weights in the classifier. Informally, the classifier
%  'thinks' that these words are the most likely indicators of spam.'''
wts = linear_svm.coef_
so = -np.sort(-linear_svm.coef_)
print(wts)
print(so)
vocab_dict_values = list(vocab_dict.keys())
for i in range(0,15):
    index = list(wts[0]).index(so[0][i])
    print(vocab_dict_values[index],so[0][i])
    

[[ 0.00793208  0.01563324  0.05546492 ... -0.08670606 -0.00661274
   0.06506632]]
[[ 0.50061374  0.46591639  0.42286912 ... -0.42835516 -0.43807244
  -0.60513164]]
our 0.5006137361746403
click 0.465916390688888
remov 0.42286911706104086
guarante 0.38362160179406524
visit 0.367710398245535
basenumb 0.3450640979461706
dollar 0.3236320357963838
will 0.2697241060374008
price 0.2672977146177071
pleas 0.2611688867001495
most 0.2572981979518164
nbsp 0.2539414551595324
lo 0.25346652431419925
ga 0.24829699045568662
hour 0.24640435783158998


### Try Your Own Emails

In [14]:
'''
  Now that you've trained the spam classifier, you can use it on your own
%  emails! In the starter code, we have included spamSample1.txt,
%  spamSample2.txt, emailSample1.txt and emailSample2.txt as examples. 
%  The following code reads in one of these emails and then uses your 
%  learned SVM classifier to determine whether the email is Spam or 
%  Not Spam 
Set the file to be read in (change this to spamSample2.txt,
% emailSample1.txt or emailSample2.txt to see different predictions on
% different emails types). Try your own emails as well!
'''

## trying spamsample1.txt
my_sample1 = open(r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\emailSample2.txt','r')
email_contents = my_sample1.read()
word_indices = processEmail(email_contents,vocab_dict)
x = emailFeatures(word_indices)
y = linear_svm.predict(x.T)
if y == 1:
    print ('this is a SPAM')
else:
    print ('No, this is not a SPAM')
print(y)

No, this is not a SPAM
[0]


In [15]:
# some other examples 
email_list = [r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\my_email.txt',  #spam
             r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\my_email1.txt',  # not spam =
              r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\my_email2.txt', # not spam =
              r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\my_email3.txt', # not spam
              r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\my_email4.txt', # spam =
              r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\my_email5.txt', # spam =
              r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\my_email6.txt', # spam
              r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\my_email7.txt', # spam
              r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\my_email8.txt', # not spam
              r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\my_email9.txt', #  spam
              r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\my_email10.txt' # not spam
             ]

for i in email_list:
    my_sample1 = open(i,'r')                                 ## open
    email_contents = my_sample1.read()                       ## read
    word_indices = processEmail(email_contents,vocab_dict)   ## process 
    x = emailFeatures(word_indices)
    y = linear_svm.predict(x.T)
    if y == 1:
        print ('this is a SPAM')
    else:
        print ('No, this is not a SPAM')
    print(y)

this is a SPAM
[1]
this is a SPAM
[1]
this is a SPAM
[1]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
this is a SPAM
[1]
this is a SPAM
[1]
this is a SPAM
[1]
No, this is not a SPAM
[0]
this is a SPAM
[1]


In [16]:
# some other examples 
## doing the same for all emails in a directory
import os
for i in os.listdir(r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\easy_ham_2'):
    i = "".join([r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\easy_ham_2\\',i])
    my_sample1 = open(i,'r')
    email_contents = my_sample1.read()
    word_indices = processEmail(email_contents,vocab_dict)
    x = emailFeatures(word_indices)
    y = linear_svm.predict(x.T)
    if y == 1:
        print ('this is a SPAM')
    else:
        print ('No, this is not a SPAM')
    print(y)

No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
N

No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
N

No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
N

No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
N

No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
No, this is not a SPAM
[0]
N

## Build your own dataset

In [150]:
email_list =[]
## making a list of emails containing some easy to check spam and some spam
for i in os.listdir(r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\easy_ham'):
    email_list.append((i,0))
for i in os.listdir(r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\easy_ham1'):
    email_list.append((i,0))
for i in os.listdir(r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\easy_ham_2'):
    email_list.append((i,0))


for i in os.listdir(r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\hard_ham'):
    email_list.append((i,0))
for i in os.listdir(r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\hard_ham1'):
    email_list.append((i,0))
    
    
for i in os.listdir(r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\spam'):
    email_list.append((i,1))
for i in os.listdir(r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\spam_n'):
    email_list.append((i,1))
for i in os.listdir(r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\spam_2'):
    email_list.append((i,1))
for i in os.listdir(r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\spam_2nn'):
    email_list.append((i,1))  

    
    
    
print(len(email_list))

14053


In [151]:
random.shuffle(email_list)     ## shuffle

In [152]:
'''
This function will make the list of all words(processed) in the email_contents and return it
'''
def Email_vocab(email_contents):

    words = []

    hdrstart = email_contents.find('\n\n')
    email_contents = email_contents[hdrstart:]
    
    email_contents = email_contents.lower()       ## converting all uppercase to lower case
    '''
     Strip all HTML
     Looks for any expression that starts with < and ends with > and replace
     and does not have any < or > in the tag it with a space

     email_contents = cleanhtml(email_contents)    ## removing html tags

    '''
    email_contents = re.sub('<[^<]+?>', ' ', email_contents)  ## remove html tags, note it will not remove simply < or >

    # Handle Numbers Look for one or more characters between 0-9
    email_contents = re.sub('[0-9]+?', ' ', email_contents)     

    #Handle URLS  Look for strings starting with http:// or https://
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents)     

    #Handle Email Addresses Look for strings with @ in the middle
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)     

    # Handle $ sign
    email_contents = re.sub('[$]+', 'dollar', email_contents)

    #print( " =====================          processed Email              ======================= \n")
    #print(email_contents)


    #######        TOKENIZE EMAIL      #########

    tokens = re.split('[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\}\,\'\"\>\_\<\;\%\\n]', email_contents)

    ps = PorterStemmer()            ####   porter stemmer is used to stem the word Eg. ps.stem('include') = includ

    for i in tokens:
        if len(i) > 1:
            i = ps.stem(i)
            words.append(i)     ## adding the word
    
    return words


In [153]:
len(email_list)

14053

In [154]:
listoflistvocab = [] ##this list contains the list of vocab for each email
for email in email_list:
    i = "".join([r'C:\Users\user\Documents\Machine learning\machine-learning-ex6\ex6\all_emails\\',email[0]])
    my_sample1 = open(i,'r',errors='ignore')   ## ignore value error
    email_contents = my_sample1.read()         ## read
    listoflistvocab.append(Email_vocab(email_contents))   ## making list


In [155]:
(listoflistvocab[0])  ## u can see that at 0th index there is list. therefore, it is a list of lists

['url', 'httpaddr', 'date', 'img', 'httpaddr', 'bostonglob']

In [23]:
print(listoflistvocab[0])
from collections import defaultdict   
'''
 this default dict provides the functionality in a dict that whether there is a 
 dict key in this dictionary or not when we call it, this will automatically gives 
 a value of 0 to it , as you can see when we call Newvocab_dict[j], if j is not in dict 
 then it give a 0 to Newvocab_dict[j] and if it is present then there are no worries.
'''
Newvocab_dict = defaultdict(lambda:0)
for i in listoflistvocab:
    for j in i:
        Newvocab_dict[j] = Newvocab_dict[j] +1   ## increasing counter for each occurence
    


['on', 'mon', 'oct', 'roi', 'dayan', 'wrote', 'when', 'tri', 'to', 'use', 'apt', 'get', 'upgrad', 'it', 'want', 'to', 'instal', 'libusb', 'while', 'got', 'it', 'same', 'version', 'and', 'all', 'collaps', 'becaus', 'of', 'thi', 'had', 'that', 'too', 'remov', 'libusb', 'don', 'think', 'it', 'need', 'nodep', 'and', 'then', 'apt', 'get', 'instal', 'and', 'all', 'wa', 'well', 'brian', 'fahrländer', 'linux', 'zealot', 'conserv', 'and', 'technomad', 'evansvil', 'in', 'my', 'voyag', 'httpaddr', 'icq', 'angegangen', 'schlang', 'hüften', 'sein', 'es', 'ganz', 'rüber', 'jetzt', 'bügel', 'innen', 'fest', 'weil', 'es', 'ein', 'lang', 'süsse', 'fahrt', 'ist', 'rpm', 'list', 'mail', 'list', 'httpaddr']
defaultdict(<function <lambda> at 0x0000025B8B705B70>, {'on': 6218, 'mon': 138, 'oct': 180, 'roi': 24, 'dayan': 1, 'wrote': 951, 'when': 1325, 'tri': 871, 'to': 24128, 'use': 2790, 'apt': 271, 'get': 2475, 'upgrad': 137, 'it': 9152, 'want': 1168, 'instal': 476, 'libusb': 3, 'while': 485, 'got': 412, 's

In [156]:
from collections import Counter
c = Counter(Newvocab_dict)
final_vocab_list=(c.most_common(2500))    ## method to find the most common words 

In [157]:
print(len(final_vocab_list))

2500


In [158]:
final_vocab_list = (list(dict(final_vocab_list).keys()))

In [159]:
num_emails = len(email_list)
X = np.zeros((num_emails,2500))
y = np.zeros((num_emails,1))
X.shape

(14053, 2500)

In [160]:
'''
function to create feature vector for each email

'''

def process_feature_vector(email_vocab):
    
    word_indices = np.zeros(( 1, (len(final_vocab_list)) ))
    
    for i in email_vocab:
        try:
            word_indices[0][final_vocab_list.index(i)] = 1
        except:
            continue
    return word_indices
    

In [161]:
for i in range(num_emails):
    X[i] = process_feature_vector(listoflistvocab[i])
    y[i] = email_list[i][1]

In [162]:
print(X)
print(y)

[[0. 0. 0. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]]
[[0.]
 [0.]
 [0.]
 ...
 [1.]
 [1.]
 [0.]]


In [163]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)

In [164]:

C = 0.1                     ## C =0.1  parameter (1/Lambda)
                                      
linear_svm = SVC(C=0.1,kernel='linear')     
## making the model
linear_svm.fit(X_train,y_train.ravel())                        ## fitting the model


SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [165]:
y_pred = linear_svm.predict(X_train)                     ## predicting the values
y_pred_test = linear_svm.predict(X_test)                     ## predicting the values

In [166]:
print("accuracy on training data ")
print(metrics.accuracy_score(y_train,y_pred))
print("accuracy on testing data ")
print(metrics.accuracy_score(y_test,y_pred_test))


accuracy on training data 
0.9989834299074921
accuracy on testing data 
0.9952561669829222
