# Preparations

In [1]:
import json
import os

## Mount Google Drive with raw data

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
! ls /content/drive/MyDrive/Colab\ Notebooks/Spam_Detector/Enron_raw_data/dataset

test  test.csv	train  train.csv


## Install this project package from github

In [4]:
!rm -rf /content/spam_detector/
!git clone https://github.com/NataliaTarasovaNatoshir/spam_detector.git
%cd spam_detector/
!git pull origin master
!python setup.py install

Cloning into 'spam_detector'...
remote: Enumerating objects: 92, done.[K
remote: Counting objects: 100% (92/92), done.[K
remote: Compressing objects: 100% (68/68), done.[K
remote: Total 92 (delta 46), reused 68 (delta 22), pack-reused 0[K
Unpacking objects: 100% (92/92), done.
/content/spam_detector
From https://github.com/NataliaTarasovaNatoshir/spam_detector
 * branch            master     -> FETCH_HEAD
Already up to date.
  "details." % version
running install
running bdist_egg
running egg_info
creating spam_detector.egg-info
writing spam_detector.egg-info/PKG-INFO
writing dependency_links to spam_detector.egg-info/dependency_links.txt
writing top-level names to spam_detector.egg-info/top_level.txt
writing manifest file 'spam_detector.egg-info/SOURCES.txt'
writing manifest file 'spam_detector.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running install_lib
running build_py
creating build
creating build/lib
creating build/lib/spam_detector
copying

In [5]:
# load config from package
with open("/content/spam_detector/spam_detector/config.json") as file:
  config = json.load(file)
config

{'dataset_build': {'raw_files_folder': '/content/drive/MyDrive/Colab Notebooks/Spam_Detector/Enron_raw_data/raw_files',
  'res_dataset_folder_name': '/content/drive/MyDrive/Colab Notebooks/Spam_Detector/Enron_raw_data/dataset',
  'test_share': 0.3},
 'testing': {'recall_level': 0.99}}

# Bag of words approach

In [72]:
import pandas as pd
dataset_folder = config['dataset_build']['res_dataset_folder_name']

dataset = pd.read_csv(os.path.join(dataset_folder, 'train.csv'))

dataset.head()

Unnamed: 0,subject,text,message_id,label
0,"oneok westex transmission interconnect , ward ...",we have finalized negotiations with oneok west...,3883.2001-07-03.lokay.ham.txt,0
1,fw : blank traveler ' s profile form,the travel profile . if you want to complete i...,3888.2001-07-05.lokay.ham.txt,0
2,allocation order,"michelle , to answer your question on the allo...",3896.2001-07-06.lokay.ham.txt,0
3,sporting clays reminder !,don ' t forget the nesa / hea 5 th annual spor...,3904.2001-07-09.lokay.ham.txt,0
4,enron cost savings guidelines,"effective , july 1 , 2001 , a number of measur...",3914.2001-07-12.lokay.ham.txt,0


In [73]:
# fill empty values
dataset['text'].fillna('', inplace=True)
dataset['subject'].fillna('', inplace=True)

In [74]:
# split dataset into training and validation
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(dataset[['subject', 'text']], dataset['label'], test_size=0.33, random_state=42, shuffle=True)

In [75]:
assert len(X_train) + len(X_valid) == len(dataset)
assert len(y_train) + len(y_valid) == len(dataset)
print('Total dataset size: {}'.format(len(dataset)))
print('Train size: {}, Valid size: {}'.format(len(X_train), len(X_valid)))
print('Spam share - Train: {0:.4f}, Test: {1:.4f}'.format(y_train.mean(), 
                                                          y_valid.mean()))

Total dataset size: 23588
Train size: 15803, Valid size: 7785
Spam share - Train: 0.5088, Test: 0.5093


## Vectorize the subject and the mail body separately or together?

In [76]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import SGDClassifier
penalty = 'l1'
clf = SGDClassifier(alpha=0.0001, max_iter=50, penalty=penalty, loss='modified_huber')

from sklearn.metrics import roc_auc_score

In [77]:
# vectorize together
X_train_joined = X_train.copy()
X_train_joined['joined_text'] = X_train_joined[['subject', 'text']].apply(
    lambda x: str(x[0]) + ' ' + str(x[1]), axis=1)
X_valid_joined = X_valid.copy()
X_valid_joined['joined_text'] = X_valid_joined[['subject', 'text']].apply(
    lambda x: str(x[0]) + ' ' + str(x[1]), axis=1)

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")

X_train_vectorized = vectorizer.fit_transform(X_train_joined['joined_text'].values)
X_valid_vectorized = vectorizer.transform(X_valid_joined['joined_text'].values)

clf.fit(X_train_vectorized, y_train)

roc_auc_joined = roc_auc_score(y_valid, 
                               clf.predict_proba(X_valid_vectorized)[:,1])
print("Joined approach roc-auc score: {0:.4f}".format(roc_auc_joined))

Joined approach roc-auc score: 0.9957


In [78]:
# vectorize separately
vectorizer_subject = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")
vectorizer_text = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")

X_train_subject_vectorized = vectorizer_subject.fit_transform(X_train['subject'].values)
X_valid_subject_vectorized = vectorizer_subject.transform(X_valid['subject'].values)
X_train_text_vectorized = vectorizer_text.fit_transform(X_train['text'].values)
X_valid_text_vectorized = vectorizer_text.transform(X_valid['text'].values)

from scipy.sparse import hstack
X_train_vectorized = hstack([X_train_subject_vectorized, 
                             X_train_text_vectorized])
X_valid_vectorized = hstack([X_valid_subject_vectorized, 
                             X_valid_text_vectorized])
                            
clf.fit(X_train_vectorized, y_train)

roc_auc_joined = roc_auc_score(y_valid, 
                               clf.predict_proba(X_valid_vectorized)[:,1])
print("Separate approach roc-auc score: {0:.4f}".format(roc_auc_joined))

Separate approach roc-auc score: 0.9884


Joined approach provides higher quality and is easier to support.
Decision: use joined information from e-mail subject and text below

## Select hyperparameters

In [81]:
from sklearn.pipeline import Pipeline
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV

In [84]:
pipeline = Pipeline(
    [
     ("vectorization", TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")),
     ("classification", SGDClassifier(max_iter=50, loss='modified_huber'))
     ]
)

distributions = dict(vectorization__analyzer=["word", "char", "char_wb"],
                     vectorization__ngram_range=[(1, 1), (1, 2)],
                     classification__alpha=uniform(loc=0.00001, scale=1))

clf = RandomizedSearchCV(pipeline, distributions, random_state=0)
search = clf.fit(X_train_joined['joined_text'].values, y_train)
search.best_params_

{'classification__alpha': 0.0871392997015407,
 'vectorization__analyzer': 'word',
 'vectorization__ngram_range': (1, 2)}

In [85]:
# see optimum model performance on validation data
optimum_pipeline = Pipeline(
    [
     ("vectorization", TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english", 
                                       analyzer='word', ngram_range=(1, 2))),
     ("classification", SGDClassifier(max_iter=50, loss='modified_huber', 
                                      alpha=0.0871))
     ]
)

optimum_pipeline.fit(X_train_joined['joined_text'].values, y_train)
roc_auc_optimum = roc_auc_score(y_valid, 
                                optimum_pipeline.predict_proba(X_valid_joined['joined_text'].values)[:,1])
print("Optimum parameters roc-auc score: {0:.4f}".format(roc_auc_optimum))

Optimum parameters roc-auc score: 0.9946


Hyperparameter search has not provided a notable gain in model quality.

# Performance assessment

## Build the model

In [6]:
from spam_detector.models import bow_model

In [7]:
optimal_model = bow_model.BoWModel(
    model_name='optimal_bow', 
    vectorizer_params=dict(sublinear_tf=True, max_df=0.5, stop_words="english", 
                           analyzer='word', ngram_range=(1, 2)),
    classifier_params=dict(max_iter=50, loss='modified_huber', alpha=0.0871))

In [8]:
import pandas as pd
dataset_folder = config['dataset_build']['res_dataset_folder_name']

dataset = pd.read_csv(os.path.join(dataset_folder, 'train.csv'))

dataset.head()

Unnamed: 0,subject,text,message_id,label
0,"oneok westex transmission interconnect , ward ...",we have finalized negotiations with oneok west...,3883.2001-07-03.lokay.ham.txt,0
1,fw : blank traveler ' s profile form,the travel profile . if you want to complete i...,3888.2001-07-05.lokay.ham.txt,0
2,allocation order,"michelle , to answer your question on the allo...",3896.2001-07-06.lokay.ham.txt,0
3,sporting clays reminder !,don ' t forget the nesa / hea 5 th annual spor...,3904.2001-07-09.lokay.ham.txt,0
4,enron cost savings guidelines,"effective , july 1 , 2001 , a number of measur...",3914.2001-07-12.lokay.ham.txt,0


In [9]:
optimal_model.train(X=dataset[['message_id', 'subject', 'text']], y=dataset['label'])

## Run an experiment on test data

In [10]:
from spam_detector import quality_assessment

In [11]:
results = quality_assessment.run_experiment(config=config, model_name='optimal_model', model=optimal_model)

Testing model optimal_model

Loading test dataset
Test dataset size = 10105 entries
Share of spam = 0.5090

Generating predictions
Predictions generated

Total inference time: 0:00:04.023293
Inference time per 1000 entries: 0:00:00.398149

Binary classification metrics:
roc-auc score: 0.9965
At recall 0.9899 precision = 0.9738

Confusion matrix:
[[4825  137]
 [  52 5091]]


In [12]:
results

{'model_name': 'optimal_model',
 'quality_metrics': {'precision': 0.9737949502677888,
  'recall': 0.9898891697452848,
  'roc_auc_score': 0.9965062493617641,
  'threshold': 0.5135137500040693},
 'runtime': {'runtime_per_1000': datetime.timedelta(microseconds=398149),
  'total_runtime': datetime.timedelta(seconds=4, microseconds=23293)},
 'test_dataset': {'size': 10105, 'spam_share': 0.5089559623948541}}

## Save the model to file

In [13]:
optimal_model.dump(folder='/content/drive/MyDrive/Colab Notebooks/Spam_Detector/models')

Model saved to /content/drive/MyDrive/Colab Notebooks/Spam_Detector/models/optimal_bow.joblib


In [14]:
# check that a model loaded from file behaves in the same way
new_model = bow_model.BoWModel(
    load_from_file=True, 
    filepath='/content/drive/MyDrive/Colab Notebooks/Spam_Detector/models/optimal_bow.joblib')

Loading the model from /content/drive/MyDrive/Colab Notebooks/Spam_Detector/models/optimal_bow.joblib
Model loaded successfully


In [16]:
new_results = quality_assessment.run_experiment(config=config, 
                                                model_name=new_model.model_name, 
                                                model=new_model)

Testing model optimal_bow

Loading test dataset
Test dataset size = 10105 entries
Share of spam = 0.5090

Generating predictions
Predictions generated

Total inference time: 0:00:03.919431
Inference time per 1000 entries: 0:00:00.387870

Binary classification metrics:
roc-auc score: 0.9965
At recall 0.9899 precision = 0.9738

Confusion matrix:
[[4825  137]
 [  52 5091]]
