In [1]:
import datetime
import json
import os
import boto3
from dotenv import load_dotenv
from datetime import datetime
from flask import Flask, request, jsonify
from botocore.errorfactory import ClientError
import structlog  # for event logging
import pandas as pd
import numpy as np
import os
import json
import glob
from sklearn.feature_extraction.text import CountVectorizer
import scipy
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.cluster import DBSCAN
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:

# Load settings from the dotenv file(s) so the os.getenv lookups below can see them.
load_dotenv()

# Shared S3 handle used by the cells that follow. Endpoint and credentials are
# read from the environment instead of being hard-coded in the notebook.
s3_resource = boto3.resource(
    's3',
    endpoint_url=os.getenv('ENDPOINT_URL'),
    aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
    aws_session_token=None,
    config=boto3.session.Config(signature_version='s3v4'),
    # NOTE(review): verify=False disables TLS certificate verification for
    # this endpoint -- confirm that is intentional (e.g. a self-signed dev host).
    verify=False,
)

In [None]:
def read_from_s3_bucket(file_name: str, bucket_name: str = "joined-out"):
    """Concatenate the contents of every matching object in an S3 bucket.

    Every object in ``bucket_name`` is listed, and each one whose key contains
    ``file_name`` as a substring is downloaded. Bodies are appended in listing
    order with no separator inserted between them.

    Args:
        file_name: Substring to look for in each object key (e.g. ``"out/"``).
        bucket_name: Name of the bucket to scan via the module-level
            ``s3_resource``.

    Returns:
        The concatenated text of all matching objects (possibly an empty
        string if the matches are all empty), or the integer ``-1`` when no
        key matched. Callers must check for ``-1`` before treating the result
        as text.
    """
    bucket = s3_resource.Bucket(bucket_name)

    # Collect the pieces and join once at the end: the part files are tens of
    # megabytes each, so repeated `str +=` would keep re-copying the buffer.
    chunks = []
    for obj in bucket.objects.all():
        key = obj.key
        if file_name not in str(key):
            continue

        body = obj.get()['Body'].read()
        if isinstance(body, bytes):
            body = body.decode()
        elif not isinstance(body, str):
            body = str(body)

        print("%s : length = %s" % (key, len(body)))
        chunks.append(body)

    if not chunks:
        print("ERROR KEY NOT FOUND")
        return -1
    return "".join(chunks)

In [None]:

# Toggle for the diagnostic prints in the cells below.
DEBUG = True

# Read in all the data from S3 as one big string.
data = read_from_s3_bucket(file_name="out/")

# read_from_s3_bucket returns the integer -1 (instead of text) when no key
# matched. Stop here with a clear message rather than letting the parsing
# cell fail later with an unrelated AttributeError on an int.
if isinstance(data, int) and data == -1:
    raise FileNotFoundError("No objects with 'out/' in their key were found in the S3 bucket")

out/_SUCCESS : length = 0
out/part-00000-28743dcb-7f7e-4000-ab03-e38baf795fe1-c000.json : length = 44613071
out/part-00001-28743dcb-7f7e-4000-ab03-e38baf795fe1-c000.json : length = 44990855
out/part-00002-28743dcb-7f7e-4000-ab03-e38baf795fe1-c000.json : length = 44710523
out/part-00003-28743dcb-7f7e-4000-ab03-e38baf795fe1-c000.json : length = 45277188


In [None]:
# Split the payload into one JSON document per line. Splitting on '\n' is safe
# only because newlines inside JSON string values are escaped (as "\\n"), so a
# literal newline always marks a record boundary.
# Blank lines (including the empty tail left by a trailing newline) are skipped
# instead of unconditionally dropping the last element, so a payload without a
# trailing newline does not lose its final record. `data` itself is left
# untouched so this cell can be re-run.
lines = [line for line in data.split('\n') if line.strip()]
if DEBUG: print(len(lines))

# Convert every line to a dict and lift the e-mail fields to the top level.
records = []
for line in lines:
    record = json.loads(line)
    # 'email_object' arrives as a JSON-encoded string nested inside the record.
    email = json.loads(record['email_object'])
    record['email_object'] = email
    for field in ('to', 'body', 'from', 'subject'):
        record[field] = email[field]
    records.append(record)

# Guard on `records` so an empty payload does not raise IndexError here.
if DEBUG and records:
    print("Type: %s of type: %s, Len %d" % (type(records), type(records[0]), len(records)))
# if DEBUG: print(json.dumps(records[0]))

100000
Type: <class 'list'> of type: <class 'dict'>, Len 100000


In [None]:
# load the data into a dataframe and set proper types
df = pd.DataFrame(records)
df['received_timestamp'] = pd.to_datetime(df['received_timestamp'], utc=True)
df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)
df['label'] = df['label'].astype('category')
df['event'] = df['event'].astype('category')
df = df.sort_values(by='received_timestamp')
df[['to', 'from', 'body', 'subject']] = df[['to', 'from', 'body', 'subject']].fillna(value="")

if DEBUG: print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 39590 to 53604
Data columns (total 10 columns):
 #   Column              Non-Null Count   Dtype              
---  ------              --------------   -----              
 0   email_id            100000 non-null  int64              
 1   received_timestamp  100000 non-null  datetime64[ns, UTC]
 2   email_object        100000 non-null  object             
 3   event               64504 non-null   category           
 4   label               100000 non-null  category           
 5   timestamp           64504 non-null   datetime64[ns, UTC]
 6   to                  100000 non-null  object             
 7   body                100000 non-null  object             
 8   from                100000 non-null  object             
 9   subject             100000 non-null  object             
dtypes: category(2), datetime64[ns, UTC](2), int64(1), object(5)
memory usage: 7.1+ MB
None


In [None]:
# Peek at the first row (earliest received_timestamp, since df was sorted on
# that column) to sanity-check the flattened to/body/from/subject columns.
# NOTE(review): the rendered row shows e-mail addresses and message text --
# consider clearing this output before sharing the notebook.
df.head(1)

Unnamed: 0,email_id,received_timestamp,email_object,event,label,timestamp,to,body,from,subject
39590,45701,2023-01-11 20:26:26.701000+00:00,"{'to': 'the00@speedy.uwaterloo.ca', 'body': ' ...",email::id::label::put,spam,2023-01-11 20:26:26.741625+00:00,the00@speedy.uwaterloo.ca,\n\n\n\n\n\n\nDo you feel the pressure to perf...,"""Tomas Jacobs"" <RickyAmes@aol.com>","Generic Cialis, branded quality@"


In [None]:
# Perform Time Based Split
# Rows are consumed in order -- train, then validation, then test -- which is
# only a chronological split because df is already sorted by time.
n = len(df)
train_amt, test_amt, validation_amt = (int(frac * n) for frac in (0.7, 0.2, 0.1))

# Feature columns fed to the models.
# training_targets = ['to', 'from', 'body', 'subject']
training_targets = ['body']

# test_amt is informational only: the test slice simply takes every row that
# remains after the train and validation slices.
validation_end = train_amt + validation_amt
train_rows = df.iloc[:train_amt]
validation_rows = df.iloc[train_amt:validation_end]
test_rows = df.iloc[validation_end:]

train_x, train_y = train_rows[training_targets], train_rows['label']
validation_x, validation_y = validation_rows[training_targets], validation_rows['label']
test_x, test_y = test_rows[training_targets], test_rows['label']

if DEBUG:
    print("Train Size: %d\tValidation Size: %d\tTest Size: %d" % (len(train_x), len(validation_x), len(test_x)))
    

Train Size: 70000	Validation Size: 10000	Test Size: 20000


In [None]:
# Balance the training classes by randomly undersampling the majority class.
# Only the training split is resampled; the unbalanced inputs stay available
# as x_orig / y_orig.
y_orig, x_orig = train_y, train_x
if DEBUG:
    print('Original dataset shape {}'.format(Counter(y_orig)))

rus = RandomUnderSampler(random_state=42)
train_x, train_y = rus.fit_resample(x_orig, y_orig)
# Keep the short aliases bound as well.
x, y = train_x, train_y

if DEBUG:
    print('Resampled dataset shape {}'.format(Counter(y)))
    print(train_y.shape)
    print(train_x.shape)

Original dataset shape Counter({'spam': 45133, 'ham': 24867})
Resampled dataset shape Counter({'ham': 24867, 'spam': 24867})
(49734,)
(49734, 1)


In [None]:
"""
Performing Text Classification

GridSearch Params:
CountVectorizer parameters,
Model types,
Model Hyperparameters
"""

'\nPerforming Text Classification\n\nGridSearch Params:\nCountVectorizer parameters,\nModel types,\nModel Hyperparameters\n'

In [None]:
# References:
# https://stackoverflow.com/questions/65242617/sklearn-pipeline-with-countvectorizer-and-category-on-a-pandas-dataframe
# https://towardsdatascience.com/natural-language-processing-on-multiple-columns-in-python-554043e05308
# https://stackoverflow.com/a/55401454 for grid search info

# CountVectorizer apparently works on a single text column only, so other
# columns may need to be handled as categories.

# Seed for the stochastic estimators below (TruncatedSVD's randomized solver
# and the random forest) so repeated runs of the comparison are reproducible.
# Matches the seed already used for undersampling and the grid search.
RANDOM_STATE = 42

# --- Text preprocessors -----------------------------------------------------
# 1: raw counts reduced to 10 SVD components
text_preprocessing1 = Pipeline([
    ('Vect', CountVectorizer()),
    ('Tsvd', TruncatedSVD(n_components=10, random_state=RANDOM_STATE)),
])
# 2: raw counts only
text_preprocessing2 = Pipeline([
    ('Vect', CountVectorizer()),
])
# 3: counts without English stop words / rare terms, reduced to 10 components
text_preprocessing3 = Pipeline([
    ('Vect', CountVectorizer(stop_words='english', min_df=10)),
    ('Tsvd', TruncatedSVD(n_components=10, random_state=RANDOM_STATE)),
])
# 4: counts without English stop words / rare terms
text_preprocessing4 = Pipeline([
    ('Vect', CountVectorizer(stop_words='english', min_df=10)),
])
# 5: TF-IDF over 1- to 3-grams, capped at 1000 features
text_preprocessing5 = Pipeline([
    ('BOW', TfidfVectorizer(ngram_range=(1, 3), max_features=1000)),
])

# --- Classifiers --------------------------------------------------------------
classifier1 = Pipeline([
    ('svc', svm.SVC()),
])
classifier2 = Pipeline([
    ('ridge', RidgeClassifier()),
])
classifier3 = Pipeline([
    ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
])

# Every preprocessor is paired with every classifier in the search cell.
preprocessors = [text_preprocessing1, text_preprocessing2, text_preprocessing3, text_preprocessing4, text_preprocessing5]
classifiers = [classifier1, classifier2, classifier3]


In [None]:
# Short aliases for the (undersampled) training split and the held-out test
# split, followed by a shape check of each.
x_train, y_train = train_x, train_y.squeeze()
x_test, y_test = test_x, test_y
for split in (x_train, y_train, x_test, y_test):
    print(split.shape)

(49734, 1)
(49734,)
(20000, 1)
(20000,)


In [None]:
# Do not re-run by default: the results are recorded in the table below.
RUN_PRELIM_SEARCH = False

if RUN_PRELIM_SEARCH:
    # Fit and score every preprocessor/classifier combination on the test split.
    for preprocessor in preprocessors:
        for classifier in classifiers:
            # Apply the text preprocessor to the 'body' column only.
            preprocess = ColumnTransformer(
                [('txt0', preprocessor, 'body')],
                remainder='passthrough',
            )
            pipeline = Pipeline([
                ('preprocess', preprocess),
                ('clf', classifier),
            ])
            print(pipeline)

            pipe = pipeline.fit(x_train, y_train)
            y_pred = pipe.predict(x_test)

            # Check for decision_function explicitly instead of catching every
            # Exception: a broad except would also hide genuine failures in
            # decision_function / roc_auc_score behind the fallback path.
            if hasattr(pipe, 'decision_function'):
                y_decision = pipe.decision_function(x_test)
                print("ROC_AUC: %.2f" % (metrics.roc_auc_score(y_test, y_decision)))
            else:
                print("No Decision function. Using predict_proba")
                y_pred_prob = pipe.predict_proba(x_test)
                # Column 1 is the second class in the estimator's class order.
                print("ROC_AUC: %.2f" % (metrics.roc_auc_score(y_test, y_pred_prob[:, 1])))

            print("F1: %.2f" % (f1_score(y_test, y_pred, pos_label='spam')))
            print("------------------------------------")
        

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('txt0',
                                                  Pipeline(steps=[('Vect',
                                                                   CountVectorizer()),
                                                                  ('Tsvd',
                                                                   TruncatedSVD(n_components=10))]),
                                                  'body')])),
                ('clf', Pipeline(steps=[('svc', SVC())]))])
ROC_AUC: 0.91
F1: 0.90
------------------------------------
Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('txt0',
                                                  Pipeline(steps=[('Vect',
                                                                   CountVectorizer()),
                    

# Prelim Pipeline Search Results
| Preprocessor | Txt Process Had Args? | CLF | ROC AUC | F1 |
|--------------|-----------------------|-----|---------|----|
| CV + TSVD | No | svc | 0.91 | 0.90 |
| CV + TSVD | No | ridge | 0.83 | 0.83 |
| CV + TSVD | No | rf | 1.00 | 0.97 |
| CV | No | svc | 0.98 | 0.97 |
| CV | No | ridge | 1.00 | 0.98 |
| CV | No | rf | 1.00 | 0.98 |
| CV + TSVD | Yes | svc | 0.89 | 0.91 |
| CV + TSVD | Yes | ridge | 0.83 | 0.84 |
| CV + TSVD | Yes | rf | 1.00 | 0.97 |
| CV | Yes | svc | 0.99 | 0.97 |
| CV | Yes | ridge | 0.99 | 0.97 |
| CV | Yes | rf | 1.00 | 0.98 |
| TFIDF | Yes | svc | 0.98 | 0.98 |
| TFIDF | Yes | ridge | 0.99 | 0.97 |
| TFIDF | Yes | rf | 1.00 | 0.98 |

The table above shows the results of the semi-grid search over models and preprocessors. RandomForest had an area under the ROC curve of 1.00 (to two decimal places) in all cases, and it also had a high F1 score. TFIDF performed similarly to the CountVectorizer on its own but was much slower. Adding the TruncatedSVD after the CountVectorizer hurt the accuracy of the model but drastically improved training time. The Ridge classifier performed similarly to the RandomForest but was slightly faster. It is also worth noting that removing the stop words in the CountVectorizer hurt performance.

These results point towards the conclusion that a CountVectorizer with a Ridge classifier would give the best performance. However, this does not take time or legibility into account. For these reasons, it is likely better to move forward with a CountVectorizer followed by a TruncatedSVD and a RandomForest classifier.

In [None]:

# Chosen configuration: bag-of-words counts on 'body', reduced to 10 SVD
# components, feeding a random forest. (Ridge and SVC classifiers were the
# alternatives tried in this slot.)
text_preprocessing = Pipeline([
    ('Vect', CountVectorizer()),
    # Seeded so the randomized SVD is repeatable; the grid search already pins
    # the forest's random_state, but the SVD was left unseeded.
    ('Tsvd', TruncatedSVD(n_components=10, random_state=42)),
])

# Note: the transformer inside the ColumnTransformer and the outer pipeline
# step are both named 'pre'; the names are kept as-is so parameter paths such
# as 'pre__pre__...' and 'rf__...' remain valid.
preprocess = ColumnTransformer([('pre', text_preprocessing, 'body')], remainder='passthrough')

pipeline = Pipeline([
    ('pre', preprocess),
    ('rf', RandomForestClassifier()),
])

pipeline

In [None]:
# parameters = {
#   'rf__n_estimators':[1,10,100,1000],
#   'rf__min_samples_split': [2,3,4,5],
#   'rf__criterion': ['gini', 'entropy', 'log_loss'],
#   'rf__max_depth': [None, 10, 100],
#   'rf__max_features': [None, 'sqrt', 'log2'],
#   'rf__n_jobs': [-1],
#   'rf__random_state': [42]
#   }
# Reduced random-forest grid searched over the final pipeline; keys are routed
# to the 'rf' step via the 'rf__' prefix.
parameters = {
    'rf__n_estimators': [1, 10, 100, 1000],
    'rf__min_samples_split': [2, 4],
    'rf__n_jobs': [-1],
    'rf__random_state': [42],
}

# Number of cross-validation folds. The previous value of 1 is rejected by
# scikit-learn (k-fold splitting needs at least 2 folds), so the search could
# not start; 5 matches the "Fitting 5 folds" run recorded in this notebook.
CV_FOLDS = 5

# NOTE(review): validation_x / validation_y from the time-based split are not
# used here -- confirm whether the search was meant to score on that hold-out
# instead of k-fold CV over the training data.
# NOTE(review): n_jobs=-1 is requested both for the forest and for the search;
# confirm this does not oversubscribe the machine.
grid_pipeline = GridSearchCV(
    pipeline,
    parameters,
    cv=CV_FOLDS,
    verbose=2,
    n_jobs=-1,
    scoring='roc_auc',
)
grid_pipeline.fit(x_train, y_train)
grid_pipeline.best_params_

Pipeline(steps=[('pre',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pre',
                                                  Pipeline(steps=[('Vect',
                                                                   CountVectorizer()),
                                                                  ('Tsvd',
                                                                   TruncatedSVD(n_components=10))]),
                                                  'body')])),
                ('rf', RandomForestClassifier())])
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


: 

In [None]:

# pipe = pipeline.fit(x_train, y_train)
# print(pipeline.score(x_train, y_train))

# Score the fitted grid search on the held-out test split.
pipe = grid_pipeline

y_pred = pipe.predict(x_test)

# Check for decision_function explicitly instead of catching every Exception:
# a broad except would also hide genuine failures in decision_function /
# roc_auc_score behind the predict_proba fallback.
if hasattr(pipe, 'decision_function'):
    y_decision = pipe.decision_function(x_test)
    print("ROC_AUC: %.2f" % (metrics.roc_auc_score(y_test, y_decision)))
else:
    print("No Decision function. Using predict_proba")
    y_pred_prob = pipe.predict_proba(x_test)
    # Column 1 is the second class in the estimator's class order.
    print("ROC_AUC: %.2f" % (metrics.roc_auc_score(y_test, y_pred_prob[:, 1])))

# F1 is reported for the 'spam' class; precision and recall are
# class-weighted averages over both labels.
print("F1: %.2f" % (f1_score(y_test, y_pred, pos_label='spam')))
print("Accuracy: %.2f" % (metrics.accuracy_score(y_test, y_pred)))
print("Precision: %.2f" % (metrics.precision_score(y_test, y_pred, average='weighted', zero_division=0)))
print("Recall: %.2f" % (metrics.recall_score(y_test, y_pred, average='weighted')))
print("------------------------------------")

In [None]:


# pipe = pipeline.fit(x_train, y_train)
# # y_pred = pipe.predict(x_test)
# y_decision = pipe.decision_function(x_test)
# print("ROC_AUC: %.2f" % (metrics.roc_auc_score(y_test, y_decision)))
# svc_disp = metrics.RocCurveDisplay.from_estimator(pipe, x_test, y_test)
# plt.show()


In [None]:
# # prepare the data by running the count vectorizer and tsvd. 
# # Want Tfidf eventually. Also want to use other data than just body
# vectorizer = CountVectorizer(binary=True)
# vect = TruncatedSVD()
# x_train = vect.fit_transform(vectorizer.fit_transform(x['body']))
# y_train = y
# x_valid = vect.fit_transform(vectorizer.fit_transform(validation_x['body']))
# y_valid = validation_y
# x_test = vect.fit_transform(vectorizer.fit_transform(test_x['body']))
# y_test = test_y
# print(x_train.shape)
# print(y_train.shape)

In [None]:
# clf = svm.SVC(kernel='linear')
# clf.fit(x_train, y_train)
# y_pred = clf.predict(x_test)
# # y_pred_prob = clf.predict_proba(x_test)
# y_decision = clf.decision_function(x_test)
    

In [None]:
# # print("Accuracy: %.2f" % (metrics.accuracy_score(y_test, y_pred)))
# # print("Precision: %.2f" % (metrics.precision_score(y_test, y_pred, average='weighted', zero_division=0)))
# # print("Recall: %.2f" % (metrics.recall_score(y_test, y_pred, average='weighted')))
# print("ROC_AUC: %.2f" % (metrics.roc_auc_score(y_test, y_decision)))
# svc_disp = metrics.RocCurveDisplay.from_estimator(clf, x_test, y_test)
# plt.show()