
## Spam classifier

### Step 1: Download dataset
Download examples of spam and ham from Apache SpamAssassin’s public datasets. Split the datasets into a training set and a test set. 

In [2]:
import os
import numpy as np
import glob
from urllib.request import urlretrieve
import tarfile
import shutil
import sklearn.utils
from sklearn.model_selection import train_test_split


def download_dataset():
    """Download the SpamAssassin corpora and return the emails with labels.

    Three archives (spam_2, easy_ham_2 and hard_ham) are stored under
    ``data/tar`` (downloaded only when no readable copy is cached there),
    extracted under ``data``, and every extracted file is read as text.

    Returns:
        A tuple ``(X, y)``. ``X`` is a list of email strings ordered spam,
        easy ham, hard ham. ``y`` is a NumPy array of the same length that
        holds 1.0 for each spam email and 0.0 for each ham email.
    """

    def download_url(url, dataset_dir="data"):
        """Fetch and extract one archive; return the extracted directory."""
        tar_dir = os.path.join(dataset_dir, "tar")
        os.makedirs(tar_dir, exist_ok=True)

        filename = url.rsplit("/", 1)[-1]
        tarpath = os.path.join(tar_dir, filename)

        # Reuse the cached archive only if it opens cleanly; a missing or
        # unreadable file triggers a (re-)download. The probe handle is
        # closed immediately instead of being leaked.
        try:
            with tarfile.open(tarpath):
                pass
        except (OSError, EOFError, tarfile.TarError):
            urlretrieve(url, tarpath)

        with tarfile.open(tarpath) as tar:
            # Assumes the first archive member is the top-level directory
            # that contains all emails -- TODO confirm for new archives.
            dirname = os.path.join(dataset_dir, tar.getnames()[0])
            if os.path.isdir(dirname):
                # Start from a clean directory so stale files never mix in.
                shutil.rmtree(dirname)
            if hasattr(tarfile, "data_filter"):
                # Use the safe extraction filter where the interpreter
                # provides it (blocks path traversal and special files).
                tar.extractall(path=dataset_dir, filter="data")
            else:
                tar.extractall(path=dataset_dir)

        # Presumably "cmds" is a helper file shipped with the corpus rather
        # than an email; it is removed so it is not loaded as a sample.
        cmds_path = os.path.join(dirname, "cmds")
        if os.path.isfile(cmds_path):
            os.remove(cmds_path)

        return dirname

    def load_dataset(dirpath):
        """Read every regular file directly inside *dirpath* as a string."""
        emails = []
        # Sorted so the sample order does not depend on the filesystem.
        for path in sorted(glob.glob(os.path.join(dirpath, "*"))):
            if not os.path.isfile(path):
                continue
            with open(path, "rb") as f:
                # Bytes that are not valid UTF-8 are dropped, not raised on.
                emails.append(f.read().decode("utf-8", errors="ignore"))
        return emails

    spam_url = "https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2"
    easy_ham_url = "https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2"
    hard_ham_url = "https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2"

    spam = load_dataset(download_url(spam_url))
    easy_ham = load_dataset(download_url(easy_ham_url))
    hard_ham = load_dataset(download_url(hard_ham_url))

    X = spam + easy_ham + hard_ham
    # Labels follow the concatenation order above: spam first, then all ham.
    y = np.concatenate((
        np.ones(len(spam)),
        np.zeros(len(easy_ham) + len(hard_ham)),
    ))

    return X, y


# Fetch the raw emails (X) together with their labels (y).
X, y = download_dataset()

# Shuffle first, then hold out a stratified 20% of the samples for testing.
# Both steps use a fixed seed so the split is the same on every run.
X, y = sklearn.utils.shuffle(X, y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

print(f"The number of training samples: {len(X_train)}")
print(f"The number of test samples: {len(X_test)}")

The number of training samples: 2436
The number of test samples: 610


### Step 2: Feature extraction (5 points)

Next, we are going to do some data cleaning and feature extraction.

1. Some data cleaning functions have been provided to you. You'll need to implement `lower_letters()`, `convert_num_to_word()`, and `remove_punctuation()`. These functions convert an email to lowercase, replace every number with "NUM", and remove punctuation, respectively.
2. Convert each email into a feature vector. Your preparation pipeline should transform an email into a (sparse) vector that indicates the presence or absence of each possible word. For example, if all emails only ever contain four words, "Hello," "how," "are," "you," then the email "Hello you Hello Hello you" would be converted into a vector [1, 0, 0, 1] (meaning ["Hello" is present, "how" is absent, "are" is absent, "you" is present]), or [3, 0, 0, 2] if you prefer to count the number of occurrences of each word. You may check sklearn's `CountVectorizer` class for reference.

In [3]:
import enum
import re
import string
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline


class EmailCleaner(BaseEstimator, TransformerMixin):
    """Stateless transformer that normalises raw email text.

    Each cleaning step can be switched off through its constructor flag.
    ``transform`` applies the enabled steps in this order: header removal,
    lower-casing, URL replacement, number replacement, punctuation removal.

    Args:
        no_header: Drop everything before the first blank line.
        to_lowercase: Lower-case the whole email.
        url_to_word: Replace every token that starts like a URL by "URL".
        num_to_word: Replace every numeric token by "NUM".
        remove_punc: Delete all ``string.punctuation`` characters.
    """

    # Built once at class creation instead of on every call.
    _URL_PATTERN = re.compile(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|"
        r"[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
    _PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

    def __init__(self,
                 no_header=True,
                 to_lowercase=True,
                 url_to_word=True,
                 num_to_word=True,
                 remove_punc=True):
        self.no_header = no_header
        self.to_lowercase = to_lowercase
        self.url_to_word = url_to_word
        self.num_to_word = num_to_word
        self.remove_punc = remove_punc

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the cleaner has no state to learn."""
        return self

    def transform(self, X, y=None):
        """Return a list with the cleaned version of every email in *X*.

        Regardless of the flags, each email is split on whitespace and
        re-joined with single spaces, so runs of whitespace are collapsed.
        """
        X_cleaned = []
        for email in X:
            if self.no_header:
                email = self.remove_header(email)
            if self.to_lowercase:
                email = self.lower_letters(email)

            email_words = email.split()
            if self.url_to_word:
                email_words = self.convert_url_to_word(email_words)
            if self.num_to_word:
                email_words = self.convert_num_to_word(email_words)
            email = " ".join(email_words)
            # Punctuation goes last so URLs and numbers are still intact
            # when the two token-level steps above inspect them.
            if self.remove_punc:
                email = self.remove_punctuation(email)
            X_cleaned.append(email)
        return X_cleaned

    @staticmethod
    def remove_header(email):
        """Return *email* from its first blank line ("\\n\\n") onwards.

        The blank line itself is kept at the start of the result. An email
        that contains no blank line is returned unchanged instead of
        raising ``ValueError``.
        """
        header_end = email.find("\n\n")
        if header_end == -1:
            return email
        return email[header_end:]

    @staticmethod
    def is_url(s):
        """Return True when *s* starts with an http:// or https:// URL."""
        return EmailCleaner._URL_PATTERN.match(s) is not None

    @staticmethod
    def convert_url_to_word(words):
        """Return a new list where every URL token is replaced by "URL"."""
        return ["URL" if EmailCleaner.is_url(word) else word
                for word in words]

    @staticmethod
    def lower_letters(email):
        """Return *email* converted to lower case."""
        return email.lower()

    @staticmethod
    def convert_num_to_word(words):
        """Return a new list where every numeric token is replaced by "NUM".

        A token counts as numeric when it is an ``int`` or a string for
        which ``str.isnumeric()`` is true, so "10" matches but "10," and
        "3.5" do not.
        """
        return [
            "NUM" if isinstance(word, int) or word.isnumeric() else word
            for word in words
        ]

    @staticmethod
    def remove_punctuation(email):
        """Return *email* with every ``string.punctuation`` character removed."""
        return email.translate(EmailCleaner._PUNCTUATION_TABLE)

    @staticmethod
    def remove_extraInfo(email):
        """Placeholder: not implemented, returns None, unused by transform()."""
        pass

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

class countVectorizer:
    """Pipeline step that turns cleaned emails into a word-count matrix.

    Thin wrapper around scikit-learn's ``CountVectorizer`` with default
    settings. The vocabulary is learned in ``fit`` and reused by every
    later ``transform`` call, so separately transformed datasets share the
    same feature columns.
    """

    def fit(self, X, y=None):
        """Learn the vocabulary from the documents in *X* and return self."""
        self.vectorizer_ = CountVectorizer().fit(X)
        return self

    def transform(self, X, y=None):
        """Return the document-term count matrix for the documents in *X*."""
        vectorizer = getattr(self, "vectorizer_", None)
        if vectorizer is None:
            # Backward compatibility: before fit() has been called, keep the
            # old behaviour of building a throw-away vocabulary from X.
            return CountVectorizer().fit_transform(X)
        return vectorizer.transform(X)

    

In [5]:
# Sanity checks for the EmailCleaner helpers.
# The implementation must pass at least these tests.
# Feel free to add more tests if you'd like.

# lower_letters(): letters are lower-cased, everything else is untouched.
src_string = "Message-Id: <LISTMANAGERSQL-25343"
dst_string = "message-id: <listmanagersql-25343"
lowered = EmailCleaner.lower_letters(src_string)
assert lowered == dst_string

# convert_num_to_word(): purely numeric tokens become "NUM".
src_string = "Date: Wed, 10 Jul 2002"
src_words = src_string.split()
dst_words = ["Date:", "Wed,", "NUM", "Jul", "NUM"]
converted = EmailCleaner.convert_num_to_word(src_words)
assert converted == dst_words

# remove_punctuation(): punctuation is deleted, surrounding spaces remain.
src_string = "superstars -- you'll find investing more fun..."
dst_string = "superstars  youll find investing more fun"
stripped = EmailCleaner.remove_punctuation(src_string)
assert stripped == dst_string

In [6]:
# Pipeline stage 1: text cleaning with the default options.
email_cleaner = EmailCleaner()

# Pipeline stage 2: bag-of-words counts.
count_vectorizer = countVectorizer()

# Chain both stages into a single preprocessing pipeline.
prepare_pipeline = Pipeline(steps=[
    ("email_cleaner", email_cleaner),
    ("count_vectorizer", count_vectorizer),
])

# Preprocess training and test emails together so both end up with the
# same feature columns, then cut the matrix back into the two parts.
# NOTE(review): the vocabulary is therefore built from the test emails as
# well; fitting on X_train only would need a vectorizer step that keeps
# its fitted vocabulary between calls.
num_train = len(X_train)
X_all = X_train + X_test
X_all = prepare_pipeline.fit(X_all).transform(X_all)
X_train, X_test = X_all[:num_train], X_all[num_train:]

print(X_train.shape)
print(X_test.shape)

(2436, 108845)
(610, 108845)


### Step 3: Train a spam classifier (5 points)

Next, let's build a spam classifier, and train your classifier with the training set.

In [7]:
from sklearn.tree import DecisionTreeClassifier

# Baseline model: a decision tree with a fixed seed, trained on the
# training split. Its test-set predictions are kept for the evaluation step.
clf_one = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
y_prediction = clf_one.predict(X_test)




### Step 4: Evaluate your classifier

Test your classifier with the test set and print the precision and recall.

In [8]:
from sklearn.metrics import precision_score, recall_score

# Precision and recall of the decision tree's predictions on the test set.
test_precision = precision_score(y_test, y_prediction)
test_recall = recall_score(y_test, y_prediction)
print('Precision: %.3f' % test_precision)
print('Recall: %.3f' % test_recall)

Precision: 0.930
Recall: 0.904


### Step 5: Ensemble of classifiers (5 points)

1. Implement 4 new classifiers (in total you have 5 classifiers now).
2. Use hard or soft voting to ensemble those classifiers.
3. Train your ensemble model on the training set. Report training/testing precision and recall.

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron
from sklearn.ensemble import VotingClassifier, RandomForestClassifier


# Four additional base learners. Every estimator that accepts a seed gets
# random_state=0, matching clf_one, so repeated runs give the same scores.
clf_two = LogisticRegression(C=10, random_state=0, max_iter=1000)
clf_two.fit(X_train, y_train)

clf_three = KNeighborsClassifier(p=2)
clf_three.fit(X_train, y_train)

# probability=True is required for soft voting (it enables predict_proba);
# the seed makes the internal probability calibration repeatable.
clf_four = SVC(probability=True, random_state=0)
clf_four.fit(X_train, y_train)

clf_five = RandomForestClassifier(n_jobs=-1, random_state=0)
clf_five.fit(X_train, y_train)

# Soft-voting ensemble over all five classifiers. VotingClassifier fits its
# own clones of the estimators, so the individually fitted models above
# stay available for separate inspection.
estimators = [
    ('dt', clf_one),
    ('lr', clf_two),
    ('knn', clf_three),
    ('svm', clf_four),
    ('rf', clf_five),
]
ensemble_clf = VotingClassifier(estimators=estimators, voting="soft")
ensemble_clf.fit(X_train, y_train)
y_pred = ensemble_clf.predict(X_test)

print('Precision: %.3f' %precision_score(y_true=y_test, y_pred=y_pred))
print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred))

Precision: 0.989
Recall: 0.950
