In [1]:
import os
import scipy.io as scio
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
import re
import nltk
import nltk.stem.porter
import email.message
import operator

#### 23. Создайте свой набор данных из оригинального корпуса текстов - http://spamassassin.apache.org/old/publiccorpus/.
#### 24. Постройте собственный словарь.

In [2]:
def readFile(filename):
    file = open(filename, 'r')
    content = file.read()
    file.close()
    return content

In [3]:
def tokenizeEmail(email_contents):
    email_contents = email_contents.lower()
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)
    email_contents = re.sub('[0-9]+', 'number', email_contents)
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents)
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)
    email_contents = re.sub('[$]+', 'dollar', email_contents)
    email_contents = re.sub('\s+', ' ', email_contents)
    
    # Tokenize Email
    stemmer = nltk.stem.porter.PorterStemmer()
    tokens = re.split('_|number|\W', email_contents)
    
    return tokens

In [4]:
def getListOfFiles(dirName, with_names=False):
    # create a list of file and sub directories 
    # names in the given directory 
    listOfFile = os.listdir(dirName)
    allFiles = list()
    # Iterate over all the entries
    for entry in listOfFile:
        # Create full path
        fullPath = os.path.join(dirName, entry)
        # If entry is a directory then get the list of files in this directory 
        if os.path.isdir(fullPath):
            allFiles = allFiles + getListOfFiles(fullPath, with_names)
        else:
            allFiles.append((dirName, fullPath)) if with_names else allFiles.append(fullPath)        
                
    return allFiles     

In [5]:
public_corpus_files = getListOfFiles('data/Lab 5/public_corpus')

good_files = 0
bad_files = 0
all_tokens = []

for index, public_corpus_file in enumerate(public_corpus_files, start=1):
    try:
        file_contents = readFile(public_corpus_file)
        b = email.message_from_string(file_contents)
        if b.is_multipart():
            for payload in b.get_payload():
                tokens = tokenizeEmail(payload.get_payload())
                all_tokens.append(tokens)
        else:
            tokens = tokenizeEmail(b.get_payload())
            all_tokens.append(tokens)
            
        good_files += 1
    except:
        bad_files += 1
    
    if index % 1000 == 0:
        print(f"'Done {index} / {len(public_corpus_files)}, good: {good_files}, bad: {bad_files}")

all_tokens_list = [item for sublist in all_tokens for item in sublist]
all_tokens_list = list(filter(lambda x: len(x) > 0, all_tokens_list))
all_tokens = set(all_tokens_list)

'Done 1000 / 10752, good: 862, bad: 138
'Done 2000 / 10752, good: 1749, bad: 251
'Done 3000 / 10752, good: 2637, bad: 363
'Done 4000 / 10752, good: 3562, bad: 438
'Done 5000 / 10752, good: 4508, bad: 492
'Done 6000 / 10752, good: 5441, bad: 559
'Done 7000 / 10752, good: 6305, bad: 695
'Done 8000 / 10752, good: 7238, bad: 762
'Done 9000 / 10752, good: 8180, bad: 820
'Done 10000 / 10752, good: 9109, bad: 891


In [6]:
all_tokens_grouped = {}
for token in all_tokens_list:
    all_tokens_grouped[token] = all_tokens_grouped.get(token, 0) + 1

tokens5000 = sorted(all_tokens_grouped.items(), key=operator.itemgetter(1), reverse=True)[:5000]

open('data/Lab 5/vocabPublicCorpus.txt', 'w').close()
with open('data/Lab 5/vocabPublicCorpus.txt', 'a') as file:
    for token_index, token in enumerate(tokens5000, start=1):
        token_name, tokens_number = token
        file.write(f'{token_index}\t{token_name}\t{tokens_number}\n')

In [7]:
file_contents = readFile('data/Lab 5/vocabPublicCorpus.txt').split("\n")
print("\n".join(file_contents[len(file_contents) - 20:len(file_contents)]))

4982	alternate	45
4983	antivirus	45
4984	clever	45
4985	certificate	45
4986	eastern	45
4987	motorola	45
4988	adobe	45
4989	residence	45
4990	locked	45
4991	fca	45
4992	bcd	45
4993	dac	45
4994	fbf	45
4995	slower	45
4996	informative	45
4997	chips	45
4998	individually	45
4999	au	45
5000	uninstall	45



#### 25. Как изменилось качество классификации? Почему?

In [8]:
def getVocabList(file='vocab.txt'):
    vocab = {}
    for line in open(f'data/Lab 5/{file}', 'r'):
        (val, key, *others) = line.split()
        vocab[key] = int(val)
    return vocab

In [9]:
def processEmail(email_contents, vocab_file='vocab.txt'):
    vocab = getVocabList(vocab_file)
    word_indices = []
    email_contents = email_contents.lower()
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)
    email_contents = re.sub('[0-9]+', 'number', email_contents)
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents)
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)
    email_contents = re.sub('[$]+', 'dollar', email_contents)
    email_contents = re.sub('\s+', ' ', email_contents)
    
    # Tokenize Email
    stemmer = nltk.stem.porter.PorterStemmer()
    tokens = re.split('_|number|\W', email_contents)
    
    for token in tokens:
        token = re.sub('[^a-zA-Z0-9]', '', token)
        token = stemmer.stem(token)
        if len(token) < 1:
            continue
        if token in vocab:
            word_indices.append(vocab[token])
            
    return word_indices

In [10]:
def emailFeatures(word_indices, number=1898):
    features = np.zeros((number + 1,), dtype=int)
    for w in word_indices:
        features[w] = 1
    return features

In [11]:
public_corpus_files = getListOfFiles('data/Lab 5/public_corpus', True)
good_files = 0
bad_files = 0
all_tokens = []

X = []
Y = []

for index, public_corpus_file in enumerate(public_corpus_files, start=1):
    dir_name, file_path = public_corpus_file
    try:
        file_contents = readFile(file_path)
        word_indices = processEmail(file_contents, 'vocabPublicCorpus.txt')
        X.append(emailFeatures(word_indices, 4999))
        Y.append(1) if re.search(r'spam', dir_name) else Y.append(0)
    except:
        pass

    if index % 1000 == 0:
        print(f"'Done {index} / {len(public_corpus_files)}")
        

'Done 1000 / 10752
'Done 2000 / 10752
'Done 3000 / 10752
'Done 4000 / 10752
'Done 5000 / 10752
'Done 6000 / 10752
'Done 7000 / 10752
'Done 8000 / 10752
'Done 9000 / 10752
'Done 10000 / 10752


In [15]:
Xtrain = X[:6000]
Ytrain = Y[:6000]

Xval = X[6001:9000]
Yval = Y[6001:9000]

Xtest = X[9001:]
Ytest = Y[9001:]

In [16]:
model = svm.SVC(C=10, kernel='rbf', gamma=1 / (2 * 10 ** 2))
model.fit(Xtrain, Ytrain)
p = model.predict(Xtest)
print('Test Accuracy: ', np.mean(p == Ytest) * 100)

Test Accuracy:  96.23287671232876


Качество классификации получилось немного меньше, думаю это можно объяснить тем, что мой словарь содержал большое количество общих слов, 
которые я не знал как полностью отфильтровать, и которые представляли из себя бесполезные признаки.