## Preprocessors and modeling: Spam Classificator

In [1]:
import sys 
sys.path.append('../scripts')

In [2]:
from get_data import *
collect_update_data()



  soup_wrap = BeautifulSoup(response.read())


d:\repos\spam-classifier\data


In [3]:
from load_data import *
data_folder_names = get_folder_names(data_path)
emails = load_data(data_folder_names)


In [4]:
X, y = create_dataset(emails)

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)


In [7]:
html_spam_emails = [email for email in X_train[y_train == 1]
                    if get_structures(email) == "text/html"]
sample_html_spam = html_spam_emails[7]
print(sample_html_spam.get_content().strip()[:100], "...")


Dear cpunks ,

<BODY bgColor=#ffccff>
<TABLE border=0 cellPadding=0 cellSpacing=0 width=475>
  <TBOD ...


## Email Preprocessor

In [14]:
try:
    import nltk
    stemmer = nltk.PorterStemmer()
    for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive"):
        print(word, "=>", stemmer.stem(word))
except ImportError:
    print("Error: stemming requires the NLTK module.")


Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


In [15]:
try:
    import urlextract  # may require an Internet connection to download root domain names

    url_extractor = urlextract.URLExtract()
    print(url_extractor.find_urls(
        "Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s"))
except ImportError:
    print("Error: replacing URLs requires the urlextract module.")
    url_extractor = None


['github.com', 'https://youtu.be/7Pq-S557XQU?t=3m32s']


In [18]:
from sklearn.base import BaseEstimator, TransformerMixin
from collections import Counter
import numpy as np

In [21]:
class EmailTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, lower_case=True, 
                 remove_punctuation=True, replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case 
        self.remove_punctuation = remove_punctuation 
        self.replace_urls = replace_urls 
        self.replace_numbers = replace_numbers 
        self.stemming = stemming

        try:
            import urlextract  # may require an Internet connection to download root domain names
            url_extractor = urlextract.URLExtract()

        except ImportError:
            print("Error: replacing URLs requires the urlextract module.")
            url_extractor = None
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            text = email_to_text(email) or ""

            if self.lower_case:
                text = text.lower()
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)

                for url in urls:
                    text = text.replace(url, " URL ")
            
            if self.replace_numbers:
                text = re.sub('r\W+', ' ', text, flags=re.M)
            
            word_counts = Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()

                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                
                word_counts = stemmed_word_counts
            
            X_transformed.append(word_counts)
        
        return np.array(X_transformed)

            


In [28]:
X_subset = X_train[:10]
X_sub_counts = EmailTransformer().fit_transform(X_subset)
X_sub_counts

array([Counter({'you': 131, 'to': 98, 'and': 93, 'a': 80, 'the': 70, 'grant': 61, 'of': 56, 'fo': 53, 'busi': 42, 'free': 34, 'govern': 33, 'in': 32, 'is': 26, 'money': 25, 'get': 25, 'that': 24, 'will': 24, 'thi': 23, 'are': 23, 'how': 18, 'o': 18, 'home': 17, 'program': 16, 'complet': 16, 'guid': 16, 'step': 16, 'be': 14, 'can': 14, 'with': 14, 'if': 14, 'it': 13, 'peopl': 12, 'from': 12, 'by': 12, 'start': 11, 'an': 11, 'ove': 10, 'not': 10, 'appli': 10, 'as': 10, 'help': 10, 'have': 9, 'at': 8, 'need': 8, 'much': 8, 'on': 8, 'new': 8, 'dollar': 7, 'all': 7, 'keep': 7, 'what': 7, 'ord': 7, 'business.': 7, 'we': 7, 'just': 6, 'state': 6, 'feder': 6, 'yea': 6, 'into': 6, 'know': 6, 'own': 6, 'do': 6, 'access': 6, 'small': 6, 'secret': 6, 'plan': 6, 'base': 6, 'person': 5, 'qualifi': 5, '-': 5, 'each': 5, 'million': 5, 'i': 5, "it'": 5, 'receiv': 5, 'these': 5, "don't": 5, 'programs,': 5, 'about': 5, 'want': 5, 'so': 5, 'purchas': 5, 'ani': 5, 'more.': 5, 'obtain': 5, 'money,': 5, 'use

In [27]:
from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    
    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.most_common_ = most_common
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        return self
    
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []

        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))


In [30]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_sub_counts)
X_few_vectors

<10x11 sparse matrix of type '<class 'numpy.intc'>'
	with 96 stored elements in Compressed Sparse Row format>

In [31]:
X_few_vectors.toarray()

array([[2197,   70,   80,   98,   56,   93,   24,   32,   53,   26,   13],
       [ 407,   15,   10,    8,   15,   16,    9,    6,    3,    8,    7],
       [ 313,    4,    2,    4,    3,    1,    0,    0,    3,    0,    0],
       [ 115,    6,    2,    7,    0,    1,    0,    0,    3,    0,    0],
       [ 364,   33,    9,    7,   27,    7,    7,   10,    3,    2,    2],
       [ 110,    7,    7,    4,    2,    1,    0,    0,    2,    0,    1],
       [  85,    5,    1,    1,    2,    1,    0,    0,    1,    2,    3],
       [ 209,   23,    3,    6,    6,   11,    6,    5,    4,    5,    1],
       [ 215,    4,   11,    2,    3,    4,    7,    1,    2,    3,    4],
       [ 993,   66,   17,   25,   17,   23,   13,   16,    9,    9,   19]],
      dtype=int32)

In [32]:
vocab_transformer.vocabulary_

{'the': 1,
 'a': 2,
 'to': 3,
 'of': 4,
 'and': 5,
 'that': 6,
 'in': 7,
 'fo': 8,
 'is': 9,
 'it': 10}

In [33]:
from sklearn.pipeline import Pipeline

preprocess_pipeline = Pipeline([
    ("email_to_wordcount", EmailTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer()),
])

X_train_transformed = preprocess_pipeline.fit_transform(X_train)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver="liblinear", random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)
score.mean()
