# Spam detector

## Fetch dataset

In [1]:
import tarfile
import os
import shutil
import requests

MAIN_URL = 'https://spamassassin.apache.org/old/publiccorpus/'
TAR_FILES = ['20030228_easy_ham.tar.bz2', '20030228_easy_ham_2.tar.bz2', '20030228_hard_ham.tar.bz2',
            '20030228_spam.tar.bz2', '20050311_spam_2.tar.bz2']
DATASET_PATH = './dataset'

if os.path.exists(DATASET_PATH):
    shutil.rmtree(DATASET_PATH)
os.makedirs(DATASET_PATH, exist_ok=True)

OUTPUT_FILE = os.path.join(DATASET_PATH, 'dataset.tar.bz2')

def fetch_dataset(url, path=DATASET_PATH):
    r = requests.get(url, stream=True)
    if r.status_code == 200:
        with open(OUTPUT_FILE, 'wb') as f:
            f.write(r.raw.read())
        file = tarfile.open(OUTPUT_FILE, 'r:bz2')
        file.extractall(path=DATASET_PATH)
        file.close()
      
for url in TAR_FILES:
    fetch_dataset(os.path.join(MAIN_URL, url))

## Move files to two directories: spam and ham (not spam)

In [2]:
SPAM_DIR = "spam"
HAM_DIR = "ham"

if os.path.exists(SPAM_DIR):
    shutil.rmtree(SPAM_DIR)

if os.path.exists(HAM_DIR):
    shutil.rmtree(HAM_DIR)

os.makedirs(SPAM_DIR)
os.makedirs(HAM_DIR)

spam_count = 0
not_spam_count = 0

for d in os.listdir(DATASET_PATH): 
    if '_ham' in d:
        for f in os.listdir(os.path.join(DATASET_PATH, d)):
            if 'cmds' in f:
                continue

            shutil.move(os.path.join(DATASET_PATH, *[d,f]), os.path.join(HAM_DIR, f'ham_{not_spam_count}'))
            not_spam_count += 1
    elif 'spam' in d:
        for f in os.listdir(os.path.join(DATASET_PATH, d)):
            if 'cmds' in f:
                continue
            shutil.move(os.path.join(DATASET_PATH, *[d,f]), os.path.join(SPAM_DIR, f'spam_{spam_count}'))
            spam_count += 1

#Delete dataset dir
if os.path.exists(DATASET_PATH):
    shutil.rmtree(DATASET_PATH)


## Make train and test set

In [17]:
import numpy as np
np.random.seed(42)

# Create 2d array (one row for every email) 
def create_emails_list():
    emails = []
    # 0 - spam
    # 1 - ham
    for i in range(len(os.listdir(SPAM_DIR))):
        emails.append([f'spam_{i}', 0])
    for i in range(len(os.listdir(HAM_DIR))):
        emails.append([f'ham_{i}',1])
    emails = np.array(emails)
    np.random.shuffle(emails)
    return emails

def split_train_test(emails, test_radio=0.2):
    thresh = int(test_radio * len(emails))
    return emails[:thresh,0], emails[:thresh,1], emails[thresh:,0], emails[thresh:,1] 
    
emails = create_emails_list()
test_x, test_y, train_x, train_y = split_train_test(emails)
print(test_x.shape)
print(train_x.shape)


(1209,)
(4837,)


## Classes for transforming each email to sparse word vector

In [24]:
import string
from sklearn.base import BaseEstimator, TransformerMixin

class EmailWordCounter(BaseEstimator, TransformerMixin):
    def __init__(self, keep_header=False):
        self.keep_header = keep_header
    
    def get_words(self, x):
        dirr = HAM_DIR if 'ham' in x else SPAM_DIR
        f = open(os.path.join(dirr, x), encoding='utf-8', errors='ignore')
        lines = f.readlines()
        f.close()
        # Strip off email header
        if not self.keep_header:
            for i, l in enumerate(lines):
                if l == '\n':
                    lines = lines[i+1:]
                    break
        # Join lines and remove punctuation
        text = " ".join(lines).lower()
        words = text.translate(str.maketrans('', '', string.punctuation)).split()
        return words
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        words_in_emails = []
        # Collect words
        for i, email in enumerate(x):
            wordsCounter = {}
            for w in self.get_words(email):
                if w.startswith('http') or w.startswith('www'):
                    w = 'URL'
                elif any(char.isdigit() for char in w):
                    w = 'NUMBER'

                if w in wordsCounter:
                    wordsCounter[w] += 1
                else:
                    wordsCounter[w] = 1
            words_in_emails.append(wordsCounter)
            
        return np.array(words_in_emails)

## Indexes to sklearn's sparse matrix

In [25]:
from scipy.sparse import csr_matrix
from collections import Counter

class WordsToSparse(BaseEstimator, TransformerMixin):
    def __init__(self, dict_size=1000):
        self.dict_size = dict_size
        self.dict = []

    def fit(self, x, y=None):
        final_words = Counter({})
        for email in x:
            final_words += email
        self.dict = [val[0] for val in final_words.most_common()][:self.dict_size]
        return self

    def transform(self, x):
        rows = []
        cols = []
        data = []
        for row, email in enumerate(x):
            for word, count in email.items():
                rows.append(row)
                data.append(count)
                col = 0 if word not in self.dict else self.dict.index(word) + 1
                cols.append(col)
        return csr_matrix((data, (rows, cols)), shape=(len(x), len(self.dict) + 1))

## Create pipeline to transform data

In [37]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('emailToWords', EmailWordCounter(keep_header=True)),
    ('WordsToSparse', WordsToSparse())
])

prepared = pipeline.fit_transform(train_x)
prepared

<4837x1001 sparse matrix of type '<class 'numpy.int64'>'
	with 592547 stored elements in Compressed Sparse Row format>

## Test RandomForestClassifier

In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

forest = RandomForestClassifier()

score = cross_val_score(forest, prepared, train_y, cv=3)
score.mean()

0.9805669352146564

In [39]:
from sklearn.metrics import precision_score, recall_score

forest.fit(prepared, train_y)
pred = forest.predict(prepared)
precision = precision_score(pred, train_y, pos_label='1')
recall = recall_score(pred, train_y, pos_label='1')

print(precision)
print(recall)

1.0
1.0


## Test LinearRegression

In [40]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(prepared, train_y)

pred = model.predict(prepared)
precision = precision_score(pred, train_y, pos_label='1')
recall = recall_score(pred, train_y, pos_label='1')
print(precision)
print(recall)

0.9940173496859108
0.9913484486873508


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Test on test set

In [41]:
prepared_test = pipeline.transform(test_x)

pred = forest.predict(prepared_test)

precision = precision_score(pred, test_y, pos_label='1')
recall = recall_score(pred, test_y, pos_label='1')

print(precision)
print(recall)

0.9838909541511772
0.9887920298879203
