In [1]:
#Import libraries
import sys
import os
import pandas as pd
import mmh3
import numpy as np

We start by defining the necessary functions to get document similarity (week 5 of course exercises). Please note that the listhash, minhash, and signatures are gone. We will not be using them in this model as we saw the performance of the model decrease from a 0.94 avg F1 score to 0.76. We believe this is due to the information loss as we minhash. And as we won't minhash there is no need to create hashes at all and therefore, there won't be any signatures. Just q-shingles!

In [34]:
# create shingle function
def shingles(string:str,q:int):
    output = set()
    for i in range(len(string)+1):
        if i < q:
            pass
        else:
            output.add(' '.join(string[i-q:i]))
    return output

#create jaccard sim function
def jaccard(doc1, doc2):
    doc1=set(doc1)
    doc2=set(doc2)
    intersect = doc1.intersection(doc2)
    union = doc1.union(doc2)
    if len(union) != 0:
        return len(intersect) / len(union)
    else:
        return 0
    
#create document sim function
def similarity(docs:dict):
    output = np.zeros((len(docs.keys()),len(docs.keys())))
    for key1, value1 in docs.items():
        for key2, value2 in docs.items():
            if key1 <= key2:
                pass
            else:
                jac_value = jaccard(value1,value2)
                output[key1,key2] = jac_value
    
    return np.tril(output) + np.triu(output.T, 1)

Now we create the "training" loop. The training actually happens as we create the document similarity matrix. This function just evaluates one fold of a cross validation ind gives us the predicted labels for the test set in that fold.

In [6]:
def weighted_knn(x, y_train,test_idx,low,high,k_neighbours=5):
    y_test = []
    mask = np.ones(len(x),bool)
    mask[y_test] = False
    for i in test_idx:
        ind = []
        temp = np.argpartition(x[i], -k_neighbours)[-k_neighbours:]
        temp = np.flip(temp)
        for idx in temp:
            if idx>=high or idx < low:
                ind.append(idx)
        topk = x[i][ind]
        labels = {j: y_train[j] for j in ind}
        ham = 0
        spam = 0
        for key, value in labels.items():
            if value == 'ham':
                ham += x[i][key]
            if value == 'spam':
                spam += x[i][key]
        if ham>spam:
            y_test.append('ham')
        if ham == spam:
            y_test.append('ham')
        if ham<spam:
            y_test.append('spam')
    return y_test

Now we get to a piece of code that we are not really proud of. However, we decided to use no sklearn models in this section (except for evaluating f1-scores and accuracies. This is a two-level 5-fold cross validation. The outer level splits the data into a train and test set. The inner level evaluates - through 5-fold cross validation on the outer training set - the best hyperparameter $q$ and returns that. Then the outer level takes that best $q$ and evaluates the scoring metrics on the outer test set.

We also keep the predictions of the outer test set in order to do McNemar tests between the shingle and non-shingle KNN.

In [147]:
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm
from ast import literal_eval

emails = pd.read_csv('data/clean_spam.csv', encoding='latin')
num_rows = len(emails)
kfold_outer = 5
test_set_percent_outer = ((num_rows/kfold_outer)/num_rows)
test_size_outer = round(test_set_percent_outer*num_rows)+1

#define variables to collect metrics:
f1_scores = []
accuracies = []
predicted = []
best_q = 0

qs = np.arange(2,9)
### Do outer cross validation
for cross_val,emails in tqdm(enumerate(pd.read_csv('data/clean_spam.csv',sep=',', chunksize=test_size_outer, encoding='latin'))):
    train1 = pd.read_csv('data/clean_spam.csv',sep=',', encoding='latin')
    train = train1.drop(emails.index, axis=0)
    
    train.reset_index(inplace=True)
    train = train.drop('index',axis=1)
    emails_out = emails.reset_index()
    
    f1s = {}
    accs = {}
    
    ### Do hyper parameter loop
    for q in qs:
        #get shingles
        email_shingles = {train.index[i]: shingles(train.iloc[:,1][i], q=q) for i in train.index}
        #get similarity
        email_similarity = similarity(email_shingles)
        #do training loop
        kfold_inner=5
        test_set_percent = ((len(train)/kfold_inner)/len(train))
        test_size = round(test_set_percent*len(train))
        former_test_idx = 0
        f1_inner_mean = []
        acc_inner_mean = []
        
        ### Do inner cross validation
        # Take mean of all metric scores
        for i in range(kfold_inner):
            y_test = train.iloc[:,0][former_test_idx:(i+1)*test_size]
            y_idx = y_test.index
            mask = np.ones(len(train), bool)
            mask[y_idx] = False
            y_train = train.iloc[:,0][mask]
            y_pred = weighted_knn(email_similarity,y_train,y_idx,former_test_idx,(i+1)*test_size,5)
            f1 = f1_score(y_test,y_pred, pos_label='spam')
            f1_inner_mean.append(f1)
            y_pred1 = y_pred1 + list(y_pred)
            acc_inner_mean.append(accuracy_score(y_test, y_pred))
            former_test_idx += test_size
        f1s[q] = np.mean(f1_inner_mean)
        accs[q] = np.mean(acc_inner_mean)
        
    max_q = max(f1s, key=f1s.get)
    emails_out = pd.concat([pd.DataFrame(train),pd.DataFrame(emails_out)])
    emails_out = emails_out.drop('index',axis=1)
    emails_out = emails_out.reset_index()
    emails_out = emails_out.drop('index',axis=1)
    email_out_shingles = {emails_out.index[i]: shingles(emails_out.iloc[:,1][i], q=max_q) for i in emails_out.index}
    sim_matrix = similarity(email_out_shingles)
    
    y_test = emails_out.iloc[:,0][-test_size_outer:]
    y_idx = y_test.index
    mask = np.ones(len(emails_out), bool)
    mask[y_idx] = False
    y_train = emails_out.iloc[:,0][mask]
    y_pred = weighted_knn(sim_matrix,y_train,y_idx,len(emails_out)-test_size_outer,len(emails_out),5)
    f1_scores.append(f1_score(y_test,y_pred, pos_label='spam'))
    accuracies.append(accuracy_score(y_test,y_pred))
    predicted = predicted + list(y_pred)
    best_q = max_q

5it [12:12, 146.48s/it]


In [148]:
f1_scores

[0.9096774193548388,
 0.916030534351145,
 0.8995983935742973,
 0.901023890784983,
 0.9230769230769231]

Now we do the same loop but for KNN without q-shingles. Here we can actually drop the two inner loops as we have no $q$ to optimise for.

In [156]:
from ast import literal_eval
emails = pd.read_csv('data/clean_spam.csv', encoding='latin')
emails.tokens = emails.tokens.apply(literal_eval)

In [157]:
#get bag of words as sets for each email
email_bow = {emails.index[i]: set(emails['tokens'][i]) for i in range(len(emails))}

In [158]:
#get similarity between documents
email_bow_sim = similarity(email_bow)

In [160]:
from sklearn.metrics import f1_score
kfold=5
f1_scores = []
test_set_percent = ((len(emails)/kfold)/len(emails))
test_size = round(test_set_percent*len(emails))
former_test_idx = 0
y_tests = []
y_pred2 = []
for i in range(kfold):
    y_test = emails['label'][former_test_idx:(i+1)*test_size]
    y_tests = y_tests + list(y_test)
    y_idx = y_test.index
    mask = np.ones(len(emails), bool)
    mask[y_idx] = False
    y_train = emails['label'][mask]
    y_pred = weighted_knn(email_bow_sim,y_train,y_idx,former_test_idx,(i+1)*test_size,5)
    y_pred2 = y_pred2 + list(y_pred)
    f1_scores.append(f1_score(y_test,y_pred, pos_label='spam'))
    former_test_idx += test_size
print(f1_scores)

[0.9235474006116209, 0.9138576779026217, 0.9011857707509882, 0.9072164948453608, 0.9110320284697508]


Now we define the McNemar test function. This function is inspired by Introduction to Machine Learning. Our null-hypothesis is going to be that the two models have the same accuracy.

If the p-value is below our level of significanse, $\alpha=0.05$, we can reject the null-hypothesis.

In [163]:
import scipy
#define McNemar test (inspired by a function made in Introduction to Machine Learning)
def mcnemar(y_true, y_pred1, y_pred2, alpha=0.05):
    nn = np.zeros((2,2))
    c1 = y_pred1 - y_true == 0
    c2 = y_pred2 - y_true == 0

    nn[0,0] = sum(c1 & c2)
    nn[0,1] = sum(c1 & ~c2)
    nn[1,0] = sum(~c1 & c2)
    nn[1,1] = sum(~c1 & ~c2)

    n = sum(nn.flat);
    n12 = nn[0,1]
    n21 = nn[1,0]

    thetahat = (n12-n21)/n
    Etheta = thetahat

    Q = n**2 * (n+1) * (Etheta+1) * (1-Etheta) / ( (n*(n12+n21) - (n12-n21)**2) )

    p = (Etheta + 1)*0.5 * (Q-1)
    q = (1-Etheta)*0.5 * (Q-1)

    CI = tuple(lm * 2 - 1 for lm in scipy.stats.beta.interval(1-alpha, a=p, b=q) )

    p = 2*scipy.stats.binom.cdf(min([n12,n21]), n=n12+n21, p=0.5)
    print(r'Result of McNemars test using $\alpha$ = ', alpha)
    print('Comparison matrix n')
    print(nn)
    if n12+n21 <= 10:
        print('Warning, n12+n21 is low: n12+n21=',(n12+n21))
    print(r'$\theta_hat$: ',thetahat)
    print(r'Approximate 1-$\alpha$ confidence interval of $\theta$: [$\theta_L$,$\theta_U$] = ', CI)
    print(r'p-value for two-sided test model 1 and model 2 have same accuracy (exact binomial test): p = ', p)

    return thetahat, CI, p 

Because of the design of this function, we need to vstack all y_test and y_pred to accomodate for all folds in the cross validation:

In [162]:
np_y_tests = np.zeros(len(y_tests))
np_y_pred1 = np.zeros(len(y_tests))
np_y_pred2 = np.zeros(len(y_tests))

for i in range(len(y_tests)):
    if y_tests[i] == 'spam':
        np_y_tests[i] = 1
    if y_pred1[i] == 'spam':
        np_y_pred1[i] = 1
    if y_pred2[i] == 'spam':
        np_y_pred2[i] = 1


mcresults = mcnemar(np_y_tests, np_y_pred1, np_y_pred2, alpha=0.05)

Result of McNemars test using $\alpha$ =  0.05
Comparison matrix n
[[4324.   34.]
 [1121.   91.]]
$\theta_hat$:  -0.19515260323159783
Approximate 1-$\alpha$ confidence interval of $\theta$: [$\theta_L$,$\theta_U$] =  (-0.20594472551654575, -0.18433712513451816)
p-value for two-sided test model 1 and model 2 have same accuracy (exact binomial test): p =  0.0


You can see that the p-value is very much lower than our level of confidence (alpha) of 0.05. Therefore, we can reject the null-hypothesis that the accuracies of the two models are equal!

Translated to English, this means that the first model based on signatures is significantly better than the model based on tokens.