In [3]:
#Import libraries
import sys
import os
import pandas as pd
import mmh3
import numpy as np

We start by defining the necessary functions to get document similarity (week 5 of course exercises).

In [76]:
# create shingle function
def shingles(string:str,q:int):
    output = set()
    for i in range(len(string)+1):
        if i < q:
            pass
        else:
            output.add(' '.join(string[i-q:i]))
    return output

#create listhash function
def listhash(l,seeds):
    vals = set()

    for e in l:
        val = 0
        for seed in seeds:
            val = val ^ mmh3.hash(e, seed)
        vals.add(val)
    return vals

#create signatures function
def signatures(docs, q=9, k=20):
    sign = {}
    for key, value in docs.items():
        sign[key] = listhash(shingles(value,q=q),np.arange(k))
    return sign

#create jaccard sim function
def jaccard(doc1, doc2):
    doc1=set(doc1)
    doc2=set(doc2)
    intersect = doc1.intersection(doc2)
    union = doc1.union(doc2)
    if len(union) != 0:
        return len(intersect) / len(union)
    else:
        return 0
    
#create document sim function
def similarity(docs:dict):
    output = np.zeros((len(docs.keys()),len(docs.keys())))
    for key1, value1 in docs.items():
        for key2, value2 in docs.items():
            if key1 == key2:
                pass
            else:
                jac_value = jaccard(value1,value2)
                output[key1,key2] = jac_value
    return output
            

Load email data!

In [275]:
emails = pd.read_csv('data/clean_spam.csv', encoding='latin')

#determine q and k
q = 5 #number of characters in each shingle
k = 20 #number of hashes per shingle

In [276]:
#create signatures for emails (we keep count based on index in emails)
email_signatures = {emails.index[i]: listhash(shingles(emails['text'][i], q=q),np.arange(k)) for i in emails.index}

In [277]:
#create signatures for emails
email_similarity = similarity(email_signatures)

In [396]:
def weighted_knn(x, y_train,test_idx,low,high,k_neighbours=5):
    y_test = []
    mask = np.ones(len(x),bool)
    mask[y_test] = False
    for i in test_idx:
        ind = []
        temp = np.argpartition(x[i], -k_neighbours)[-k_neighbours:]
        temp = np.flip(temp)
        for idx in temp:
            if idx>=high or idx < low:
                ind.append(idx)
        topk = x[i][ind]
        labels = {j: y_train[j] for j in ind}
        ham = 0
        spam = 0
        for key, value in labels.items():
            if value == 'ham':
                ham += x[i][key]
            if value == 'spam':
                spam += x[i][key]
        if ham>spam:
            y_test.append('ham')
        if ham == spam:
            y_test.append('ham')
        if ham<spam:
            y_test.append('spam')
    return y_test


In [403]:
from sklearn.metrics import f1_score
kfold=5
f1_scores = []
test_set_percent = ((len(emails)/kfold)/len(emails))
test_size = round(test_set_percent*len(emails))
former_test_idx = 0
for i in range(kfold):
    y_test = emails['label'][former_test_idx:(i+1)*test_size]
    y_idx = y_test.index
    mask = np.ones(len(emails), bool)
    mask[y_idx] = False
    y_train = emails['label'][mask]
    y_pred = weighted_knn(email_similarity,y_train,y_idx,former_test_idx,(i+1)*test_size,5)
    f1_scores.append(f1_score(y_test,y_pred, pos_label='spam'))
    former_test_idx += test_size
print(f'Average f1-score: {round(np.mean(f1_scores),2)}')

Average f1-score: 0.94
