# Preparations

In [1]:
import json
import os

## Mount Google Drive with raw data

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
! ls /content/drive/MyDrive/Colab\ Notebooks/Spam_Detector/Enron_raw_data/dataset

test  test.csv	train  train.csv


## Install this project package from github

In [4]:
!rm -rf /content/spam_detector/
!git clone https://github.com/NataliaTarasovaNatoshir/spam_detector.git
%cd spam_detector/
!git pull origin master
!python setup.py install

Cloning into 'spam_detector'...
remote: Enumerating objects: 68, done.[K
remote: Counting objects:   1% (1/68)[Kremote: Counting objects:   2% (2/68)[Kremote: Counting objects:   4% (3/68)[Kremote: Counting objects:   5% (4/68)[Kremote: Counting objects:   7% (5/68)[Kremote: Counting objects:   8% (6/68)[Kremote: Counting objects:  10% (7/68)[Kremote: Counting objects:  11% (8/68)[Kremote: Counting objects:  13% (9/68)[Kremote: Counting objects:  14% (10/68)[Kremote: Counting objects:  16% (11/68)[Kremote: Counting objects:  17% (12/68)[Kremote: Counting objects:  19% (13/68)[Kremote: Counting objects:  20% (14/68)[Kremote: Counting objects:  22% (15/68)[Kremote: Counting objects:  23% (16/68)[Kremote: Counting objects:  25% (17/68)[Kremote: Counting objects:  26% (18/68)[Kremote: Counting objects:  27% (19/68)[Kremote: Counting objects:  29% (20/68)[Kremote: Counting objects:  30% (21/68)[Kremote: Counting objects:  32% (22/68)[Kremote: Cou

In [5]:
# load config from package
with open("/content/spam_detector/spam_detector/config.json") as file:
  config = json.load(file)
config

{'dataset_build': {'raw_files_folder': '/content/drive/MyDrive/Colab Notebooks/Spam_Detector/Enron_raw_data/raw_files',
  'res_dataset_folder_name': '/content/drive/MyDrive/Colab Notebooks/Spam_Detector/Enron_raw_data/dataset',
  'test_share': 0.3},
 'testing': {'recall_level': 0.9}}

# Model testing

In [6]:
import pandas as pd
dataset_folder = config['dataset_build']['res_dataset_folder_name']

test = pd.read_csv(os.path.join(dataset_folder, 'test.csv'))

test.head()

Unnamed: 0,subject,text,message_id,label
0,prc for todd,"sally , attached below is a list of my accompl...",1496.2000-06-18.beck.ham.txt,0
1,var training,"sally , angela sprock forwarded your vm to me ...",1516.2000-06-19.beck.ham.txt,0
2,average eol and non - eol deals per day,"fyi - we are now distributing the "" john lavor...",1518.2000-06-19.beck.ham.txt,0
3,ena sap project,"as you know , we have substantially wrapped up...",1546.2000-06-20.beck.ham.txt,0
4,re : var training,"sally , i understand your comments . i ' ll ge...",1551.2000-06-20.beck.ham.txt,0


In [7]:
from numpy.random.mtrand import seed
import random
import numpy as np

class DummyModel():
  def __init__(self, seed):
    self.seed = seed
  
  def predict(self, X):
    np.random.seed(self.seed)
    return np.random.rand(len(X))

In [8]:
dummy_model = DummyModel(42)
dummy_model.predict(test)

array([0.37454012, 0.95071431, 0.73199394, ..., 0.98141572, 0.33775599,
       0.7030317 ])

In [9]:
from datetime import datetime

# calculate predictions and measure runtime
model_name = 'dummy_model'
print("Testing model {}\n".format(model_name))
print("Test dataset size = {} entries".format(len(test)))
print("Share of spam = {0:.4f}\n".format(test['label'].mean()))
print("Generating predictions")
start_time = datetime.now()
y_pred = dummy_model.predict(test[['message_id', 'subject', 'text']])
end_time = datetime.now()
print("Predictions generated\n")
runtime = end_time - start_time
runtime_per_1000 = 1000 * runtime / len(test)
print('Total inference time: {}'.format(runtime))
print('Inference time per 1000 entries: {}\n'.format(runtime_per_1000))

Testing model dummy_model

Test dataset size = 10105 entries
Share of spam = 0.5090

Generating predictions
Predictions generated

Total inference time: 0:00:00.003824
Inference time per 1000 entries: 0:00:00.000378



In [10]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, confusion_matrix

recall_level = 0.3

# binary classification metrics
y_true = test['label'].values
print('Binary classification metrics:')
# roc-auc as a general quality metric
roc_auc = roc_auc_score(y_true, y_pred)
print("roc-auc score: {0:.4f}".format(roc_auc))
# calculate precision at a chosen recall level
precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
for i in range(len(recall) - 1):
  if recall[i] <= recall_level: break
selected_threshold = thresholds[i]
selected_recall = recall[i]
selected_precision = precision[i]
print('At recall {0:.4f} precision = {1:.4f}\n'.format(selected_recall, selected_precision))
print('Confusion matrix:\n{}'.format(confusion_matrix(y_true, y_pred>=selected_threshold)))

Binary classification metrics:
roc-auc score: 0.4957
At recall 0.2998 precision = 0.5018

Confusion matrix:
[[3431 1531]
 [3601 1542]]


# Run testing pipeline

Check that everything works from the package too

In [11]:
from spam_detector import quality_assessment

In [13]:
results = quality_assessment.run_experiment(config=config, model_name='dummy_model', model=dummy_model)

Testing model dummy_model

Loading test dataset
Test dataset size = 10105 entries
Share of spam = 0.5090

Generating predictions
Predictions generated

Total inference time: 0:00:00.003044
Inference time per 1000 entries: 0:00:00.000301

Binary classification metrics:
roc-auc score: 0.4957
At recall 0.8999 precision = 0.5104

Confusion matrix:
[[ 522 4440]
 [ 515 4628]]


In [14]:
results

{'model_name': 'dummy_model',
 'quality_metrics': {'precision': 0.5103661226290251,
  'recall': 0.8998638926696481,
  'roc_auc_score': 0.49569659609414984,
  'threshold': 0.09957971354002892},
 'runtime': {'runtime_per_1000': datetime.timedelta(microseconds=301),
  'total_runtime': datetime.timedelta(microseconds=3044)},
 'test_dataset': {'size': 10105, 'spam_share': 0.5089559623948541}}