In [1]:
import re
import numpy as np
import pandas as pd
import os

def get_sample(fn):
    with open(fn, 'r') as f:
        content = f.read()
    return content

def word_tokenize(content):
    '''
    content: str - body of mail 
    return: list of tokens (str) e.g. ['>', 'Anyone', 'knows', 'how', 'much', 'it', 'costs', 'to', 'host', 'a']
    '''

    tokens = re.split(r'[ \n]', content)
    
    return tokens

def lower_case(tokens):
    tokens = [sub.lower() for sub in tokens]
    tokens = np.array(tokens)
    return tokens

def normalize_tokens (tokens):
    '''
    tokens: ndarry of str
    return: ndarry of tokens replaced with corresponding unified words
    '''

    # Function to replace numbers with "number"
    def replace_numbers(token):
        return re.sub(r'\d+', 'number', token)

    # Function to replace URLs with "httpaddr"
    def replace_urls(token):
        return re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 'httpaddr', token)

    # Function to replace emails with "emailaddr"
    def replace_emails(token):
        return re.sub(r'\S+@\S+', 'emailaddr', token)

    # Function to replace $ with "dollar"
    def replace_dollar(token):
        return re.sub(r'\$', 'dollar', token)

    # Function to remove punctuation and non-alphanumeric characters
    def remove_punctuation(token):
        return re.sub(r'[^a-zA-Z0-9]', '', token)

    # Apply the defined functions to each token
    tokens = [replace_numbers(token) for token in tokens]
    tokens = [replace_urls(token) for token in tokens]
    tokens = [replace_emails(token) for token in tokens]
    tokens = [replace_dollar(token) for token in tokens]
    tokens = [remove_punctuation(token) for token in tokens]
    
    return np.array(tokens)

def filter_short_tokens (tokens):
    '''
    tokens: ndarry of str
    return: ndarry of filtered tokens (str)
    '''
    original_tokens_len = len(tokens)
    
    for token in tokens:
        if len(token) < 1:
            tokens = np.delete(tokens, np.where(tokens == token))
   
    print('Original len = {}\nRemaining len = {}'.format(original_tokens_len, len(tokens)))    
    return tokens

from nltk.stem import PorterStemmer

def stem_tokens(tokens):
    '''
    tokens: ndarry of str
    return: ndarry of stemmed tokens e.g. array(['anyon', 'know', 'how', 'much', 'it', 'cost', 'to', 'host', 'a',
       'web', 'portal', 'well', 'it', 'depend', 'on', 'how', 'mani']...
    '''
    porter_stemmer = PorterStemmer()

    # Apply the stemmer to each token
    tokens = [porter_stemmer.stem(token) for token in tokens]
   
    return np.array(tokens)

def get_vocabulary(fn):
    '''
    fn: str - full path to file 
    return: ndarray of str e.g. array(['aa', 'ab', 'abil', ..., 'zdnet', 'zero', 'zip'], dtype=object)
    '''
    vocab_list = pd.read_table(fn, header=None)
    vocab = np.array(vocab_list)[:,1] # first columns is index, select only words column  
    print ('len(vocab)= {:,}'.format(len(vocab)))
    return vocab

def represent_features(tokens,vocab):
    '''
    tokens: ndarry of str
    tokens: ndarry of str
    return: ndarry of binary values 1 if word from vocabulary is in mail 0 otherwise
    '''
    
    tokens_represented = list()
    vocab = get_vocabulary('vocab.txt')

    for word in vocab:
        if word in tokens:
            tokens_represented.append(1)
        else:
            tokens_represented.append(0)
    tokens_represented = np.array(tokens_represented)   

    print ('{} word(s) from vocab are in the tokens.'.format(np.sum(tokens_represented)))

    return tokens_represented

def preprocess (content,vocab):
    '''
    content: str - body of mail 
    vocab: ndarray of str - list of considered words 
    '''
    # tokenize content    
    tokens = word_tokenize(content)
    
    # make lower case
    tokens = lower_case(tokens)

    # normalize tokens
    tokens = normalize_tokens(tokens)

    # remove zero words
    tokens = filter_short_tokens(tokens)
    
    # stem words
    tokens = stem_tokens(tokens)
    
    # convert to binary array of features  
    tokens_represented = represent_features(tokens, vocab)    
    
    return tokens_represented

In [2]:
from scipy.io import loadmat

fn = 'spamTrain.mat'

mat= loadmat(fn)
X_train= mat['X']
y_train= mat['y'].ravel()

print ('X_train.shape= {}',X_train.shape)
print ('y_train.shape= {}',y_train.shape)

fn = 'spamTest.mat'
mat= loadmat(fn)
X_test = mat['Xtest']
y_test = mat['ytest'].ravel() 

print ('X_test.shape= {}',X_test.shape)
print ('y_test.shape= {}',y_test.shape)
index = 0 
print ('Sample with index  ={}: \n{}'.format(index, X_train[index]))

X_train.shape= {} (4000, 1899)
y_train.shape= {} (4000,)
X_test.shape= {} (1000, 1899)
y_test.shape= {} (1000,)
Sample with index  =0: 
[0 0 0 ... 0 0 0]


In [3]:
from sklearn.svm import SVC 
from sklearn.svm import LinearSVC

C = .1
clf= LinearSVC(C=C)
clf.fit(X_train,y_train)
print ('Score train = {}'.format(clf.score(X_train,y_train)))
print ('Score test = {}'.format(clf.score(X_test,y_test)))

Score train = 0.99975
Score test = 0.992


In [4]:
coefs = clf.coef_[0]
sorted = np.sort(coefs)[::-1]
top_20 = sorted[:20]
top_20_indexes = np.full(20, np.nan)
for i in range(20):
    top_20_indexes[i] = np.where(coefs == top_20[i])[0][0]

vocab = get_vocabulary('vocab.txt')
# Get the corresponding words from the vocabulary
top_spam_contributors = [vocab[int(index)] for index in top_20_indexes]
print(top_spam_contributors)

len(vocab)= 1,899
['our', 'remov', 'click', 'basenumb', 'guarante', 'visit', 'bodi', 'will', 'numberb', 'price', 'dollar', 'nbsp', 'below', 'lo', 'most', 'send', 'dollarnumb', 'credit', 'wi', 'hour']


In [5]:
for sfn in [ 'emailSample1.txt', 'emailSample2.txt', 'spamSample1.txt', 'spamSample2.txt']:
    fn = sfn
    content = get_sample(fn)
    
    # YOUR_CODE.  Preprocess the sample and get prediction 0 or 1 (1 is spam)
    # START_CODE 
    preprocessed = preprocess(content, vocab)
    preprocessed = preprocessed.reshape(1, -1)
    prediction = clf.predict(preprocessed)
    # END_CODE    
    print(prediction)
    print ('{} is {}\n'.format(sfn, ('Not Spam','Spam')[prediction[0]]))

print ('Latter sample:\n{1}\n{0}\n{1}'.format(content, '='*50))

Original len = 67
Remaining len = 61
len(vocab)= 1,899
44 word(s) from vocab are in the tokens.
[0]
emailSample1.txt is Not Spam

Original len = 247
Remaining len = 222
len(vocab)= 1,899
122 word(s) from vocab are in the tokens.
[0]
emailSample2.txt is Not Spam

Original len = 141
Remaining len = 97
len(vocab)= 1,899
46 word(s) from vocab are in the tokens.
[1]
spamSample1.txt is Spam

Original len = 39
Remaining len = 31
len(vocab)= 1,899
18 word(s) from vocab are in the tokens.
[1]
spamSample2.txt is Spam

Latter sample:
Best Buy Viagra Generic Online

Viagra 100mg x 60 Pills $125, Free Pills & Reorder Discount, Top Selling 100% Quality & Satisfaction guaranteed!

We accept VISA, Master & E-Check Payments, 90000+ Satisfied Customers!
http://medphysitcstech.ru



