In [241]:
#Import libraries
import sys
import os
import pandas as pd
import mmh3
import numpy as np

We start by defining the functions needed to compute document similarity (week 5 of the course exercises). Please note that the listhash, minhash, and signature functions are gone. We do not use them in this model because they decreased model performance. We did not analyse this extensively, but we believe the decrease is due to the information lost when minhashing. Since we do not minhash, there is no need to create hashes at all, and therefore there are no signatures. Just q-shingles!

In [283]:
# create shingle function
def shingles(string:str,q:int):
    """Return the set of all q-shingles (length-``q`` windows) of ``string``.

    Every window ``string[end-q:end]`` is glued together with ``''.join``, so
    ``string`` may be a ``str`` (character shingles) or a sequence of strings
    (consecutive items are concatenated). Inputs shorter than ``q`` give an
    empty set.
    """
    window_ends = range(len(string) + 1)
    # Only positions that have a full window of q items behind them qualify.
    return {''.join(string[end - q:end]) for end in window_ends if end >= q}

#create jaccard sim function
def jaccard(doc1, doc2):
    """Return the Jaccard similarity ``|A ∩ B| / |A ∪ B|`` of two documents.

    Parameters
    ----------
    doc1, doc2 : set, frozenset, list, tuple or numpy.ndarray
        The elements (e.g. shingles or tokens) of each document. A 0-d object
        array wrapping a set -- which is what ``np.array(a_set)`` produces --
        is unwrapped, so callers that pass ``np.array(set_of_shingles)`` get
        the similarity of the set *elements*. Any other scalar (including a
        plain string) counts as a single element.

    Returns
    -------
    float
        Size of the intersection divided by size of the union, or ``0`` when
        both documents are empty.
    """
    def _elements(doc):
        # Normalise any supported input to a Python set of its elements.
        if isinstance(doc, (set, frozenset)):
            return set(doc)
        arr = np.asarray(doc)
        if arr.ndim == 0:
            # np.array({..}) does NOT iterate the set: it stores the whole set
            # as one object. Comparing such arrays with intersect1d/union1d
            # only tells whether the two sets are identical.
            inner = arr.item()
            if isinstance(inner, (set, frozenset)):
                return set(inner)
            return {inner}
        return set(arr.ravel().tolist())

    first = _elements(doc1)
    second = _elements(doc2)
    union_size = len(first | second)
    if union_size != 0:
        return len(first & second) / union_size
    else:
        return 0
    
#create document sim function
def similarity(docs:dict):
    """Build the symmetric pairwise Jaccard similarity matrix of ``docs``.

    Parameters
    ----------
    docs : dict
        Maps a document key to the collection (e.g. set) of its elements.
        The keys are used directly as row/column indices of the result, so
        they must be integers in ``range(len(docs))``.

    Returns
    -------
    numpy.ndarray
        ``len(docs) x len(docs)`` matrix whose ``[a, b]`` entry is the Jaccard
        similarity of documents ``a`` and ``b``. The diagonal is left at 0
        (a document is never compared with itself).
    """
    # tqdm was used here without ever being imported; import it lazily and
    # fall back to a plain iterator so the function also runs without it.
    try:
        from tqdm.auto import tqdm
    except ImportError:
        def tqdm(iterable, **kwargs):
            return iterable

    n_docs = len(docs)
    output = np.zeros((n_docs, n_docs))
    # np.array(a_set) gives a 0-d object array wrapping the whole set, which
    # makes intersect1d/union1d compare entire sets instead of their elements.
    # Convert through a list, once per document instead of once per pair.
    # dtype=object avoids number/string dtype promotion when a document is empty.
    element_arrays = {
        key: np.array(list(value), dtype=object) for key, value in docs.items()
    }
    for key1, value1 in tqdm(element_arrays.items()):
        for key2, value2 in element_arrays.items():
            # Only the strict lower triangle is computed; it is mirrored below.
            if key1 <= key2:
                continue
            output[key1, key2] = jaccard(value1, value2)
    return np.tril(output) + np.triu(output.T, 1)

Now we create the "training" loop. The training actually happens as we create the document similarity matrix. This function just evaluates one fold of a cross-validation and gives us the predicted labels for the test set in that fold.

In [243]:
def weighted_knn(x, y_train,test_idx,low,high,k_neighbours=5):
    """Predict binary labels for one test fold with similarity-weighted k-NN.

    Parameters
    ----------
    x : 2-D array-like
        Similarity matrix; ``x[i][j]`` is the similarity of documents i and j.
    y_train : mapping or pandas.Series
        Label (0 = ham, 1 = spam) of every training document, looked up as
        ``y_train[j]`` with ``j`` the document's column index in ``x``.
    test_idx : iterable of int
        Row indices of the documents to classify.
    low, high : int
        Documents with index in ``[low, high)`` form the test fold and are
        never used as neighbours.
    k_neighbours : int, default 5
        Number of most similar training documents that vote.

    Returns
    -------
    list of int
        One prediction per entry of ``test_idx``: 1 if the summed similarity
        of the spam neighbours exceeds that of the ham neighbours, else 0
        (ties and "no neighbours" resolve to 0).
    """
    x = np.asarray(x)
    columns = np.arange(x.shape[-1])
    # Restrict the neighbour search to training documents *before* taking the
    # top k. Picking the top k over all documents and discarding test-fold
    # members afterwards leaves fewer than k (possibly zero) voters.
    candidates = columns[(columns < low) | (columns >= high)]
    # Never ask argpartition for more neighbours than exist.
    k = min(k_neighbours, len(candidates))

    y_test = []
    for i in test_idx:
        row = x[i]
        if k > 0:
            nearest = candidates[np.argpartition(row[candidates], -k)[-k:]]
        else:
            nearest = candidates[:0]

        # Each neighbour votes for its class with its similarity as weight.
        ham = 0
        spam = 0
        for j in nearest:
            label = y_train[j]
            if label == 0:
                ham += row[j]
            elif label == 1:
                spam += row[j]

        y_test.append(1 if spam > ham else 0)

    return y_test

Now we get to a piece of code that we are not really proud of. However, we decided to use no sklearn models in this section (except for evaluating f1-scores and accuracies). We evaluate the model with a $k$ (in KNN) of 5 and a $q$ of 5 as that was suggested by the course. We evaluate through a 5-fold cross validation and save all performance metrics from each run.

We also keep the predictions of the outer test set in order to do McNemar tests between models. This will be done in another notebook as this one has become quite extensive.

### SPAM (SMS)
Sorry for the variable names; we realised only later in the process that these are SMS messages and not emails.

##### Q-KNN (on raw)

In [270]:
from ast import literal_eval
# Load the cleaned SMS data set.
emails = pd.read_csv('clean_data/clean_spam.csv', encoding='latin')
# The 'tokens' column is read back as text; parse every cell into a Python object.
emails['tokens'] = emails['tokens'].apply(literal_eval)

In [252]:
# Build the q-shingle set of every message from the column at position 1
# (presumably the raw text -- confirm against the CSV layout), then compute
# the pairwise similarity matrix.
q = 5
email_shingles = {
    emails.index[row_label]: shingles(emails.iloc[:, 1][row_label], q=q)
    for row_label in emails.index
}
email_similarity = similarity(email_shingles)

In [253]:
# accuracy_score is called inside the loop but was never imported in this
# notebook, so import it here together with f1_score.
from sklearn.metrics import accuracy_score, f1_score
kfold=5
f1_scores = []       # F1 score of each fold
accuracies = []      # accuracy of each fold
# Fraction of rows held out per fold (equals 1/kfold) and the resulting fold size.
test_set_percent = ((len(emails)/kfold)/len(emails))
test_size = round(test_set_percent*len(emails))
# NOTE(review): folds are consecutive row ranges of width test_size; when
# kfold*test_size < len(emails) the trailing rows are never held out.
former_test_idx = 0  # first row position of the current test fold
y_tests = []         # true labels of all test folds, concatenated
predicted = []       # predicted labels of all test folds, concatenated
for i in range(kfold):
    # Rows [former_test_idx, (i+1)*test_size) form this fold's test set.
    y_test = emails['binary'].iloc[former_test_idx:(i+1)*test_size]
    y_tests.extend(y_test)
    y_idx = y_test.index
    # Every row outside the test fold is training data.
    mask = np.ones(len(emails), bool)
    mask[y_idx] = False
    y_train = emails['binary'][mask]
    y_pred = weighted_knn(email_similarity,y_train,y_idx,former_test_idx,(i+1)*test_size,5)
    predicted.extend(y_pred)
    f1_scores.append(f1_score(y_test,y_pred))
    accuracies.append(accuracy_score(y_test,y_pred))
    former_test_idx += test_size

In [254]:
# Convert the per-fold metrics to arrays and write them, plus the
# concatenated fold predictions, to text files.
f1_scores = np.array(f1_scores)
accuracies = np.array(accuracies)
np.savetxt(fname='q-knn_results/f1_q-knn_raw_spam.txt', X=f1_scores)
np.savetxt(fname='q-knn_results/acc_q-knn_raw_spam.txt', X=accuracies)
np.savetxt(fname='q-knn_results/pred_q-knn_raw_spam.txt', X=np.array(predicted), fmt="%s")

##### Bag of Words-KNN (on tokens)

In [255]:
#get bag of words as sets for each email
email_bow = {emails.index[i]: set(emails['tokens'][i]) for i in range(len(emails))}
#get similarity between documents
email_bow_sim = similarity(email_bow)

# accuracy_score is called inside the loop but was never imported in this
# notebook, so import it here together with f1_score.
from sklearn.metrics import accuracy_score, f1_score
kfold=5
f1_scores = []       # F1 score of each fold
accuracies = []      # accuracy of each fold
# Fraction of rows held out per fold (equals 1/kfold) and the resulting fold size.
test_set_percent = ((len(emails)/kfold)/len(emails))
test_size = round(test_set_percent*len(emails))
# NOTE(review): folds are consecutive row ranges of width test_size; when
# kfold*test_size < len(emails) the trailing rows are never held out.
former_test_idx = 0  # first row position of the current test fold
y_tests = []         # true labels of all test folds, concatenated
predicted = []       # predicted labels of all test folds, concatenated
for i in range(kfold):
    # Rows [former_test_idx, (i+1)*test_size) form this fold's test set.
    y_test = emails['binary'].iloc[former_test_idx:(i+1)*test_size]
    y_tests.extend(y_test)
    y_idx = y_test.index
    # Every row outside the test fold is training data.
    mask = np.ones(len(emails), bool)
    mask[y_idx] = False
    y_train = emails['binary'][mask]
    y_pred = weighted_knn(email_bow_sim,y_train,y_idx,former_test_idx,(i+1)*test_size,5)
    predicted.extend(y_pred)
    f1_scores.append(f1_score(y_test,y_pred))
    accuracies.append(accuracy_score(y_test,y_pred))
    former_test_idx += test_size

In [256]:
# Convert the per-fold metrics to arrays and write them, plus the
# concatenated fold predictions, to text files.
f1_scores = np.array(f1_scores)
accuracies = np.array(accuracies)
np.savetxt(fname='q-knn_results/f1_token-knn_spam.txt', X=f1_scores)
np.savetxt(fname='q-knn_results/acc_token-knn_spam.txt', X=accuracies)
np.savetxt(fname='q-knn_results/pred_token-knn_spam.txt', X=np.array(predicted), fmt="%s")

##### Q-KNN (on tokens)

In [257]:
#get q-shingles of the column at position 3 for each email
#(presumably the token lists, so a shingle joins q consecutive tokens -- confirm)
email_shingles = {emails.index[_]: shingles(emails.iloc[:,3][_], q=q) for _ in emails.index}
#get similarity between documents
email_similarity = similarity(email_shingles)

# accuracy_score is called inside the loop but was never imported in this
# notebook, so import it here together with f1_score.
from sklearn.metrics import accuracy_score, f1_score
kfold=5
f1_scores = []       # F1 score of each fold
accuracies = []      # accuracy of each fold
# Fraction of rows held out per fold (equals 1/kfold) and the resulting fold size.
test_set_percent = ((len(emails)/kfold)/len(emails))
test_size = round(test_set_percent*len(emails))
# NOTE(review): folds are consecutive row ranges of width test_size; when
# kfold*test_size < len(emails) the trailing rows are never held out.
former_test_idx = 0  # first row position of the current test fold
y_tests = []         # true labels of all test folds, concatenated
predicted = []       # predicted labels of all test folds, concatenated
for i in range(kfold):
    # Rows [former_test_idx, (i+1)*test_size) form this fold's test set.
    y_test = emails['binary'].iloc[former_test_idx:(i+1)*test_size]
    y_tests.extend(y_test)
    y_idx = y_test.index
    # Every row outside the test fold is training data.
    mask = np.ones(len(emails), bool)
    mask[y_idx] = False
    y_train = emails['binary'][mask]
    y_pred = weighted_knn(email_similarity,y_train,y_idx,former_test_idx,(i+1)*test_size,5)
    predicted.extend(y_pred)
    f1_scores.append(f1_score(y_test,y_pred))
    accuracies.append(accuracy_score(y_test,y_pred))
    former_test_idx += test_size

In [258]:
# Convert the per-fold metrics to arrays and write them, plus the
# concatenated fold predictions, to text files.
f1_scores = np.array(f1_scores)
accuracies = np.array(accuracies)
np.savetxt(fname='q-knn_results/f1_q-knn_spam.txt', X=f1_scores)
np.savetxt(fname='q-knn_results/acc_q-knn_spam.txt', X=accuracies)
np.savetxt(fname='q-knn_results/pred_q-knn_spam.txt', X=np.array(predicted), fmt="%s")

##### Bag of Words-KNN (on raw data)

In [273]:
# Tokenise the raw text by splitting on single spaces.
emails['token_raw'] = [emails['text'][i].split(' ') for i in emails.index]
#get bag of words as sets for each email
email_bow = {emails.index[i]: set(emails['token_raw'][i]) for i in range(len(emails))}
#get similarity between documents
email_bow_sim = similarity(email_bow)

# accuracy_score is called inside the loop but was never imported in this
# notebook, so import it here together with f1_score.
from sklearn.metrics import accuracy_score, f1_score
kfold=5
f1_scores = []       # F1 score of each fold
accuracies = []      # accuracy of each fold
# Fraction of rows held out per fold (equals 1/kfold) and the resulting fold size.
test_set_percent = ((len(emails)/kfold)/len(emails))
test_size = round(test_set_percent*len(emails))
# NOTE(review): folds are consecutive row ranges of width test_size; when
# kfold*test_size < len(emails) the trailing rows are never held out.
former_test_idx = 0  # first row position of the current test fold
y_tests = []         # true labels of all test folds, concatenated
predicted = []       # predicted labels of all test folds, concatenated
for i in range(kfold):
    # Rows [former_test_idx, (i+1)*test_size) form this fold's test set.
    y_test = emails['binary'].iloc[former_test_idx:(i+1)*test_size]
    y_tests.extend(y_test)
    y_idx = y_test.index
    # Every row outside the test fold is training data.
    mask = np.ones(len(emails), bool)
    mask[y_idx] = False
    y_train = emails['binary'][mask]
    y_pred = weighted_knn(email_bow_sim,y_train,y_idx,former_test_idx,(i+1)*test_size,5)
    predicted.extend(y_pred)
    f1_scores.append(f1_score(y_test,y_pred))
    accuracies.append(accuracy_score(y_test,y_pred))
    former_test_idx += test_size

In [274]:
# Convert the per-fold metrics to arrays and write them, plus the
# concatenated fold predictions, to text files.
f1_scores = np.array(f1_scores)
accuracies = np.array(accuracies)
np.savetxt(fname='q-knn_results/f1_token-knn_raw_spam.txt', X=f1_scores)
np.savetxt(fname='q-knn_results/acc_token-knn_raw_spam.txt', X=accuracies)
np.savetxt(fname='q-knn_results/pred_token-knn_raw_spam.txt', X=np.array(predicted), fmt="%s")

### SPAM ASSASSIN (EMAILS)
This is actually email data!

##### Q-KNN (on raw)

In [275]:
emails = pd.read_csv('clean_data/clean_completeSpamAssassin.csv', encoding='latin')
# Shuffle the rows reproducibly. The seed used to be set only AFTER sampling,
# so the shuffle (and the CSV written below) was not tied to the seed.
# Passing random_state pins the permutation; the global seed is still set so
# later code that relies on NumPy's global RNG starts from the same state.
np.random.seed(0)
emails = emails.sample(frac=1, random_state=0).reset_index(drop=True)
# Persist the shuffled order so other notebooks can use the same rows.
emails.to_csv('clean_data/clean_completeSpamAssassin_shuffled.csv', index=False)
# The 'tokens' column is read back as text; parse every cell into a Python object.
emails.tokens = emails.tokens.apply(literal_eval)

In [264]:
#get q-shingles of the column at position 0 for each email
#(presumably the raw text -- confirm against the CSV layout)
email_shingles = {emails.index[_]: shingles(emails.iloc[:,0][_], q=q) for _ in emails.index}
#get similarity between documents
email_similarity = similarity(email_shingles)

# accuracy_score is called inside the loop but was never imported in this
# notebook, so import it here together with f1_score.
from sklearn.metrics import accuracy_score, f1_score
kfold=5
f1_scores = []       # F1 score of each fold
accuracies = []      # accuracy of each fold
# Fraction of rows held out per fold (equals 1/kfold) and the resulting fold size.
test_set_percent = ((len(emails)/kfold)/len(emails))
test_size = round(test_set_percent*len(emails))
# NOTE(review): folds are consecutive row ranges of width test_size; when
# kfold*test_size < len(emails) the trailing rows are never held out.
former_test_idx = 0  # first row position of the current test fold
y_tests = []         # true labels of all test folds, concatenated
predicted = []       # predicted labels of all test folds, concatenated
for i in range(kfold):
    # Rows [former_test_idx, (i+1)*test_size) form this fold's test set.
    y_test = emails['binary'].iloc[former_test_idx:(i+1)*test_size]
    y_tests.extend(y_test)
    y_idx = y_test.index
    # Every row outside the test fold is training data.
    mask = np.ones(len(emails), bool)
    mask[y_idx] = False
    y_train = emails['binary'][mask]
    y_pred = weighted_knn(email_similarity,y_train,y_idx,former_test_idx,(i+1)*test_size,5)
    predicted.extend(y_pred)
    f1_scores.append(f1_score(y_test,y_pred))
    accuracies.append(accuracy_score(y_test,y_pred))
    former_test_idx += test_size

In [265]:
# Convert the per-fold metrics to arrays and write them, plus the
# concatenated fold predictions, to text files.
f1_scores = np.array(f1_scores)
accuracies = np.array(accuracies)
np.savetxt(fname='q-knn_results/f1_q-knn_raw_assassin.txt', X=f1_scores)
# NOTE(review): this file name is spelled "assasin" while every other output
# uses "assassin"; left unchanged in case another notebook reads this exact
# name -- confirm and align.
np.savetxt(fname='q-knn_results/acc_q-knn_raw_assasin.txt', X=accuracies)
np.savetxt(fname='q-knn_results/pred_q-knn_raw_assassin.txt', X=np.array(predicted), fmt="%s")

##### Bag of Words-KNN (on tokens)

In [266]:
#get bag of words as sets for each email
email_bow = {emails.index[i]: set(emails['tokens'][i]) for i in range(len(emails))}
#get similarity between documents
email_bow_sim = similarity(email_bow)

# accuracy_score is called inside the loop but was never imported in this
# notebook, so import it here together with f1_score.
from sklearn.metrics import accuracy_score, f1_score
kfold=5
f1_scores = []       # F1 score of each fold
accuracies = []      # accuracy of each fold
# Fraction of rows held out per fold (equals 1/kfold) and the resulting fold size.
test_set_percent = ((len(emails)/kfold)/len(emails))
test_size = round(test_set_percent*len(emails))
# NOTE(review): folds are consecutive row ranges of width test_size; when
# kfold*test_size < len(emails) the trailing rows are never held out.
former_test_idx = 0  # first row position of the current test fold
y_tests = []         # true labels of all test folds, concatenated
predicted = []       # predicted labels of all test folds, concatenated
for i in range(kfold):
    # Rows [former_test_idx, (i+1)*test_size) form this fold's test set.
    y_test = emails['binary'].iloc[former_test_idx:(i+1)*test_size]
    y_tests.extend(y_test)
    y_idx = y_test.index
    # Every row outside the test fold is training data.
    mask = np.ones(len(emails), bool)
    mask[y_idx] = False
    y_train = emails['binary'][mask]
    y_pred = weighted_knn(email_bow_sim,y_train,y_idx,former_test_idx,(i+1)*test_size,5)
    predicted.extend(y_pred)
    f1_scores.append(f1_score(y_test,y_pred))
    accuracies.append(accuracy_score(y_test,y_pred))
    former_test_idx += test_size

In [267]:
# Convert the per-fold metrics to arrays and write them, plus the
# concatenated fold predictions, to text files.
f1_scores = np.array(f1_scores)
accuracies = np.array(accuracies)
np.savetxt(fname='q-knn_results/f1_token-knn_assassin.txt', X=f1_scores)
np.savetxt(fname='q-knn_results/acc_token-knn_assassin.txt', X=accuracies)
np.savetxt(fname='q-knn_results/pred_token-knn_assassin.txt', X=np.array(predicted), fmt="%s")

##### Q-KNN (on tokens)

In [268]:
#get q-shingles of the column at position 3 for each email
#(presumably the token lists, so a shingle joins q consecutive tokens -- confirm)
email_shingles = {emails.index[_]: shingles(emails.iloc[:,3][_], q=q) for _ in emails.index}
#get similarity between documents
email_similarity = similarity(email_shingles)

# accuracy_score is called inside the loop but was never imported in this
# notebook, so import it here together with f1_score.
from sklearn.metrics import accuracy_score, f1_score
kfold=5
f1_scores = []       # F1 score of each fold
accuracies = []      # accuracy of each fold
# Fraction of rows held out per fold (equals 1/kfold) and the resulting fold size.
test_set_percent = ((len(emails)/kfold)/len(emails))
test_size = round(test_set_percent*len(emails))
# NOTE(review): folds are consecutive row ranges of width test_size; when
# kfold*test_size < len(emails) the trailing rows are never held out.
former_test_idx = 0  # first row position of the current test fold
y_tests = []         # true labels of all test folds, concatenated
predicted = []       # predicted labels of all test folds, concatenated
for i in range(kfold):
    # Rows [former_test_idx, (i+1)*test_size) form this fold's test set.
    y_test = emails['binary'].iloc[former_test_idx:(i+1)*test_size]
    y_tests.extend(y_test)
    y_idx = y_test.index
    # Every row outside the test fold is training data.
    mask = np.ones(len(emails), bool)
    mask[y_idx] = False
    y_train = emails['binary'][mask]
    y_pred = weighted_knn(email_similarity,y_train,y_idx,former_test_idx,(i+1)*test_size,5)
    predicted.extend(y_pred)
    f1_scores.append(f1_score(y_test,y_pred))
    accuracies.append(accuracy_score(y_test,y_pred))
    former_test_idx += test_size

In [269]:
# Convert the per-fold metrics to arrays and write them, plus the
# concatenated fold predictions, to text files.
f1_scores = np.array(f1_scores)
accuracies = np.array(accuracies)
np.savetxt(fname='q-knn_results/f1_q-knn_assassin.txt', X=f1_scores)
np.savetxt(fname='q-knn_results/acc_q-knn_assassin.txt', X=accuracies)
np.savetxt(fname='q-knn_results/pred_q-knn_assassin.txt', X=np.array(predicted), fmt="%s")

##### Bag of Words-KNN (on raw)

In [276]:
# Tokenise the raw text by splitting on single spaces.
emails['token_raw'] = [emails['text'][i].split(' ') for i in emails.index]
#get bag of words as sets for each email
email_bow = {emails.index[i]: set(emails['token_raw'][i]) for i in range(len(emails))}
#get similarity between documents
email_bow_sim = similarity(email_bow)

# accuracy_score is called inside the loop but was never imported in this
# notebook, so import it here together with f1_score.
from sklearn.metrics import accuracy_score, f1_score
kfold=5
f1_scores = []       # F1 score of each fold
accuracies = []      # accuracy of each fold
# Fraction of rows held out per fold (equals 1/kfold) and the resulting fold size.
test_set_percent = ((len(emails)/kfold)/len(emails))
test_size = round(test_set_percent*len(emails))
# NOTE(review): folds are consecutive row ranges of width test_size; when
# kfold*test_size < len(emails) the trailing rows are never held out.
former_test_idx = 0  # first row position of the current test fold
y_tests = []         # true labels of all test folds, concatenated
predicted = []       # predicted labels of all test folds, concatenated
for i in range(kfold):
    # Rows [former_test_idx, (i+1)*test_size) form this fold's test set.
    y_test = emails['binary'].iloc[former_test_idx:(i+1)*test_size]
    y_tests.extend(y_test)
    y_idx = y_test.index
    # Every row outside the test fold is training data.
    mask = np.ones(len(emails), bool)
    mask[y_idx] = False
    y_train = emails['binary'][mask]
    y_pred = weighted_knn(email_bow_sim,y_train,y_idx,former_test_idx,(i+1)*test_size,5)
    predicted.extend(y_pred)
    f1_scores.append(f1_score(y_test,y_pred))
    accuracies.append(accuracy_score(y_test,y_pred))
    former_test_idx += test_size

In [277]:
# Convert the per-fold metrics to arrays and write them, plus the
# concatenated fold predictions, to text files.
f1_scores = np.array(f1_scores)
accuracies = np.array(accuracies)
np.savetxt(fname='q-knn_results/f1_token-knn_raw_assassin.txt', X=f1_scores)
np.savetxt(fname='q-knn_results/acc_token-knn_raw_assassin.txt', X=accuracies)
np.savetxt(fname='q-knn_results/pred_token-knn_raw_assassin.txt', X=np.array(predicted), fmt="%s")