# The supervised models

This notebook trains and evaluates the following supervised models:

- Jaccard KNN
- TF-IDF KNN
- TF-IDF NB

In [3]:
# Clear the kernel namespace so the notebook starts from a clean state
%reset -f

## Initializing

In [49]:
import pandas as pd
import numpy as np
import math
from sklearn.feature_extraction.text import CountVectorizer
from mlxtend.frequent_patterns import apriori
from sklearn.model_selection import RepeatedKFold, KFold, train_test_split
from sklearn.metrics import f1_score, accuracy_score
import time
from mlxtend.preprocessing import TransactionEncoder

from ast import literal_eval
from tqdm import tqdm
import mmh3
from itertools import product
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
import scipy.stats

# Random seed handed to the training functions (fixes the cross-validation split in train)
seed = 42


### training function for TF-IDF models

In [47]:

# TF-IDF
def identity_tokenizer(text):
    """Return the input unchanged.

    Passed as ``tokenizer`` to TfidfVectorizer for columns that already hold
    token lists, so the vectorizer does not tokenize them again.
    """
    already_tokenized = text
    return already_tokenized

def train(model, model_name, dataset, df_results, seed):
    """Cross-validate ``model`` on TF-IDF features for four text representations.

    The dataset is read from ``../data/{dataset}.csv`` and must provide the
    columns ``text``, ``tokens`` (stringified token lists), ``str_tokens`` and
    ``label``. For each representation (raw/BOW, raw/Q, prep/BOW, prep/Q) a
    5-fold cross-validation is run: a TfidfVectorizer is fitted on the training
    fold only, ``model`` is fitted on the resulting vectors, and F1 (positive
    label 'spam') and accuracy are measured on the held-out fold.

    Parameters
    ----------
    model : classifier exposing ``fit``, ``predict_proba`` and ``classes_``
    model_name : str
        Value written to the 'Model' column.
    dataset : str
        CSV base name; 'clean_spam' is reported as 'SMS', anything else as 'Emails'.
    df_results : pandas.DataFrame
        Existing results; the new rows are added after them (not modified in place).
    seed : int
        ``random_state`` of the fold split.

    Returns
    -------
    pandas.DataFrame
        ``df_results`` followed by one row per fold and representation, with the
        columns Model, Dataset, Method, F1 and Accuracy.
    """
    if dataset == 'clean_spam':
        new_dataset_name = 'SMS'
        min_df = 1
    else:
        new_dataset_name = 'Emails'
        min_df = 0.001

    data = pd.read_csv(f'../data/{dataset}.csv', encoding='latin')
    data.tokens = data.tokens.apply(literal_eval)
    # whitespace-split version of the raw text (the "raw/BOW" representation)
    data['text_tokenized'] = data['text'].apply(lambda text: text.split(' '))

    look_up_for_look_ups = ['raw/BOW', 'raw/Q', 'prep/BOW', 'prep/Q']
    look_ups = ['text_tokenized', 'text', 'tokens', 'str_tokens']
    q = 5           # character n-gram (shingle) length
    num_folds = 5   # 5-fold cross-validation

    result_frames = [df_results]
    for idx, look_up in enumerate(look_ups):
        kf = RepeatedKFold(n_splits=num_folds, n_repeats=1, random_state=seed)
        f1_scores = []
        accuracies = []

        for train_idx, test_idx in tqdm(kf.split(data)):
            # kf.split yields positional indices, hence iloc
            df_train = data.iloc[train_idx]
            df_test = data.iloc[test_idx]

            if look_up in ['text', 'str_tokens']:
                # string columns: TF-IDF over character q-grams
                vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(q, q), min_df=min_df)
            else:
                # token-list columns: TF-IDF over the given tokens as they are
                vectorizer = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False, min_df=min_df)
            # the vocabulary is learned from the training fold only
            X = vectorizer.fit_transform(df_train[look_up])

            model.fit(X, df_train.label)

            test_vecs = vectorizer.transform(df_test[look_up])
            pred_prob = model.predict_proba(test_vecs)
            # predict_proba columns follow model.classes_, so map through it
            # instead of assuming a fixed ['ham', 'spam'] order
            y_pred = model.classes_[pred_prob.argmax(axis=1)]
            y_test = df_test.label

            f1_scores.append(f1_score(y_test, y_pred, pos_label='spam'))
            accuracies.append(accuracy_score(y_test, y_pred))

        # one row per fold; sized from the scores rather than a notebook global
        n_rows = len(f1_scores)
        result_frames.append(pd.DataFrame({'Model': [model_name] * n_rows,
                                           'Dataset': [new_dataset_name] * n_rows,
                                           'Method': [look_up_for_look_ups[idx]] * n_rows,
                                           'F1': f1_scores,
                                           'Accuracy': accuracies,
                                           }))

    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    return pd.concat(result_frames)


### dataframe with results

In [24]:
# general dataframe with results
df_template = pd.DataFrame(columns=["Model", "Dataset", "Method", 'F1', 'Accuracy'])    # empty template: one row per model/dataset/method/fold
# note: this binds the same object as df_template, it is not a copy
df_results = df_template
# number of result rows per model/dataset/method combination (one per cross-validation fold)
num_results = 5

## Evaluating the KNN model

In [25]:
# defining model
# k-nearest-neighbours classifier with the cosine metric
num_neighs = 5
model = KNeighborsClassifier(n_neighbors=num_neighs, metric='cosine')
model_name = 'TF-IDF KNN'

In [26]:
# running model on each dataset and for different data processing methods
datasets = ['clean_spam', 'clean_completeSpamAssassin']
for dataset in datasets:
    model_df_results = train(model, model_name, dataset, df_template, seed)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
    df_results = pd.concat([df_results, model_df_results])

5it [01:20, 16.16s/it]
5it [00:03,  1.38it/s]
5it [00:23,  4.62s/it]
5it [00:01,  2.56it/s]
5it [01:55, 23.02s/it]
5it [01:00, 12.04s/it]
5it [00:40,  8.12s/it]
5it [00:24,  4.95s/it]


#### Saving the results

In [27]:
# Saving the results collected so far; index=False leaves the row index out of the file
df_results.to_csv('results.csv', index=False)

## Evaluating the NB model

In [None]:
# Reload the results written to results.csv by the previous section
df_results = pd.read_csv(f'results.csv')

In [52]:
# defining model
# note: fit_prior=False uses a uniform (50/50) class prior, which gave better performance, i.e. F1 score
model = MultinomialNB(fit_prior=False)
model_name = 'TF-IDF NB'

In [53]:
# running model on each dataset and for different data processing methods
datasets = ['clean_spam', 'clean_completeSpamAssassin']
for dataset in datasets:
    model_df_results = train(model, model_name, dataset, df_template, seed)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
    df_results = pd.concat([df_results, model_df_results])

5it [01:26, 17.22s/it]
5it [00:03,  1.50it/s]
5it [00:27,  5.41s/it]
5it [00:01,  2.85it/s]
5it [01:53, 22.62s/it]
5it [00:55, 11.04s/it]
5it [00:35,  7.09s/it]
5it [00:23,  4.68s/it]


#### Saving the results

In [55]:
# Saving the results collected so far; index=False leaves the row index out of the file
df_results.to_csv('results.csv', index=False)

## Evaluating the Jaccard KNN model

We start by defining the functions needed to compute document similarity (week 5 of the course exercises). Please note that the listhash, minhash, and signature functions are gone: we do not use them in this model because they decreased model performance. We did not perform any extensive analysis, but we believe this is due to the information lost when minhashing. Since we do not minhash, there is no need to create hashes at all, and therefore there are no signatures. Just q-shingles!

### Defining functions

In [93]:
# create shingle function
def shingles(string:str,q:int):
    """Return the set of all q-shingles (length-q windows) of ``string``.

    Each window ``string[end-q:end]`` is joined into one string, so the input
    may be a str or any sliceable sequence of strings. Inputs shorter than
    ``q`` give an empty set.
    """
    return {
        ''.join(string[end - q:end])
        for end in range(len(string) + 1)
        if end >= q
    }

# create jaccard sim function
def jaccard(doc1, doc2):
    """Jaccard similarity |A ∩ B| / |A ∪ B| of the unique elements of two array-likes.

    Returns 0 when both inputs are empty (empty union).
    """
    union_size = len(np.union1d(doc1, doc2))
    if union_size == 0:
        return 0
    shared_size = len(np.intersect1d(doc1, doc2))
    return shared_size / union_size
    
# create document sim function
def similarity(docs:dict):
    """Build the symmetric pairwise Jaccard similarity matrix of ``docs``.

    Parameters
    ----------
    docs : mapping (dict or pandas Series) from document key to its elements
        (tokens or shingles). Keys are used directly as row/column indices of
        the result, so they must be the integers 0..n-1.

    Returns
    -------
    numpy.ndarray of shape (n, n), where entry [i, j] is the Jaccard similarity
    of documents i and j. The diagonal is left at 0.
    """
    # Convert every document once, not inside the O(n^2) loop. Sets must go
    # through list(): np.array(a_set) gives a 0-d object array, which would make
    # jaccard compare whole sets as single elements.
    arrays = {
        key: np.array(list(value)) if isinstance(value, (set, frozenset)) else np.array(value)
        for key, value in docs.items()
    }
    n_docs = len(arrays)
    output = np.zeros((n_docs, n_docs))
    for key1, value1 in tqdm(arrays.items()):
        for key2, value2 in arrays.items():
            # fill the strict lower triangle only; it is mirrored below
            if key1 > key2:
                output[key1, key2] = jaccard(value1, value2)
    return np.tril(output) + np.triu(output.T, 1)


# the model
def weighted_knn(x, y_train,test_idx,low,high,k_neighbours=5):
    """Predict a binary label per test document by similarity-weighted k-NN voting.

    Parameters
    ----------
    x : 2-D array-like of pairwise similarities; ``x[i][j]`` is the similarity
        between documents i and j.
    y_train : labels addressable as ``y_train[j]`` for every training document
        position j (0 = ham, 1 = spam).
    test_idx : iterable of row indices to classify.
    low, high : the test fold occupies positions ``low <= j < high``; these
        columns never vote.
    k_neighbours : number of training neighbours that vote.

    Returns
    -------
    list of int
        One prediction (0 or 1) per entry of ``test_idx``. Ties, including the
        case of no available neighbour, resolve to 0.
    """
    x = np.asarray(x)
    positions = np.arange(x.shape[1])
    # Restrict to training columns BEFORE choosing the top k. Filtering after the
    # selection could leave fewer than k (even zero) voters whenever other
    # test-fold documents were among the most similar ones.
    candidates = positions[(positions < low) | (positions >= high)]
    k = min(k_neighbours, len(candidates))

    y_test = []
    for i in test_idx:
        ham = 0
        spam = 0
        if k > 0:
            sims = x[i][candidates]
            nearest = candidates[np.argpartition(sims, -k)[-k:]]
            for j in nearest:
                label = y_train[int(j)]
                # each neighbour votes with its similarity as weight
                if label == 0:
                    ham += x[i][j]
                elif label == 1:
                    spam += x[i][j]
        y_test.append(1 if spam > ham else 0)

    return y_test

Now we create the "training" loop. The training actually happens as we create the document similarity matrix. This function just evaluates one fold of a cross-validation and gives us the predicted labels for the test set in that fold.

Now we get to a piece of code that we are not really proud of. However, we decided to use no sklearn models in this section (except for evaluating f1-scores and accuracies). We evaluate the model with a $k$ (in KNN) of 5 and a $q$ of 5 as that was suggested by the course. We evaluate through a 5-fold cross validation and save all performance metrics from each run.

We also keep the predictions of the outer test set in order to do McNemar tests between models. This will be done in another notebook as this one has become quite extensive.

In [91]:
def jknn_train(model_name, dataset, df_results, seed):
    """Evaluate the Jaccard k-NN model on one dataset for four text representations.

    The dataset is read from ``../data/{dataset}.csv`` and must provide the
    columns ``text``, ``tokens`` (stringified token lists), ``str_tokens`` and
    ``binary`` (0/1 labels). For each representation a full pairwise Jaccard
    similarity matrix is built (over q-shingles for the string columns, over
    tokens for the token columns) and evaluated with 5 contiguous folds.

    Parameters
    ----------
    model_name : str
        Value written to the 'Model' column.
    dataset : str
        CSV base name; 'clean_spam' is reported as 'SMS', anything else as 'Emails'.
    df_results : pandas.DataFrame
        Existing results; the new rows are added after them (not modified in place).
    seed : int
        Currently unused: folds are consecutive blocks of rows in file order.

    Returns
    -------
    pandas.DataFrame
        ``df_results`` followed by one row per fold and representation, with the
        columns Model, Dataset, Method, F1 and Accuracy.
    """
    new_dataset_name = 'SMS' if dataset == 'clean_spam' else 'Emails'

    data = pd.read_csv(f'../data/{dataset}.csv', encoding='latin')
    data.tokens = data.tokens.apply(literal_eval)
    # whitespace-split version of the raw text (the "raw/BOW" representation)
    data['text_tokenized'] = data['text'].apply(lambda text: text.split(' '))

    look_up_for_look_ups = ['raw/BOW', 'raw/Q', 'prep/BOW', 'prep/Q']
    look_ups = ['text_tokenized', 'text', 'tokens', 'str_tokens']

    q = 5              # shingle length
    kfold = 5
    k_neighbours = 5
    # integer division avoids float round-off in the fold size;
    # rows beyond kfold * test_size are never used as test rows
    test_size = len(data) // kfold

    result_frames = [df_results]
    for idx, look_up in enumerate(look_ups):
        if look_up in ['text', 'str_tokens']:
            # data is in strings: compare documents by their q-shingles.
            # Stored as string arrays (not sets) so the numpy set operations in
            # jaccard see the individual shingles, also for empty documents.
            curr_data = {key: np.array(sorted(shingles(text, q=q)), dtype=str)
                         for key, text in data[look_up].items()}
        else:
            # data is in tokens
            curr_data = data[look_up]

        sim_matrix = similarity(curr_data)
        f1_scores = []
        accuracies = []
        for fold in range(kfold):
            low = fold * test_size
            high = low + test_size
            y_test = data['binary'].iloc[low:high]
            y_idx = y_test.index
            # every row outside the current test fold is training data
            mask = np.ones(len(data), bool)
            mask[y_idx] = False
            y_train = data['binary'][mask]
            y_pred = weighted_knn(sim_matrix, y_train, y_idx, low, high, k_neighbours)
            f1_scores.append(f1_score(y_test, y_pred))
            accuracies.append(accuracy_score(y_test, y_pred))

        # one row per fold; sized from the scores rather than a notebook global
        n_rows = len(f1_scores)
        result_frames.append(pd.DataFrame({'Model': [model_name] * n_rows,
                                           'Dataset': [new_dataset_name] * n_rows,
                                           'Method': [look_up_for_look_ups[idx]] * n_rows,
                                           'F1': f1_scores,
                                           'Accuracy': accuracies,
                                           }))

    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    return pd.concat(result_frames)
    
    
    

In [68]:
# loading the results written to results.csv by the previous sections
df_results = pd.read_csv(f'results.csv')

## Evaluating the J KNN model

In [94]:
model_name = 'J KNN'
# running model on each dataset and for different data processing methods
datasets = ['clean_spam', 'clean_completeSpamAssassin']
for dataset in datasets:
    model_df_results = jknn_train(model_name, dataset, df_template, seed)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
    df_results = pd.concat([df_results, model_df_results])

100it [00:00, 626.34it/s]
100it [00:00, 986.86it/s]
100it [00:00, 796.98it/s]
100it [00:00, 1008.90it/s]
100it [00:01, 54.02it/s]
100it [00:00, 684.10it/s]
100it [00:00, 202.27it/s]
100it [00:00, 823.71it/s]


### Saving the results

In [None]:
# Saving the results collected so far; index=False leaves the row index out of the file
df_results.to_csv('results.csv', index=False)

## Analysing the results

Mean F1 and accuracy are reported with 95% confidence intervals.

In [57]:
# NOTE: this name shadows the built-in round() for the rest of the notebook;
# it is passed on below as round=round
round = 3 # number of decimals in the reported numbers
# all saved results, one row per model/dataset/method/fold
data = pd.read_csv(f'results.csv')

In [58]:

def mean_confidence_interval(data, round=3, confidence=0.95):
    """Mean of ``data`` with a two-sided Student-t confidence interval.

    Parameters
    ----------
    data : sequence of numbers
    round : number of decimals kept in the returned values
    confidence : confidence level of the interval

    Returns
    -------
    tuple (mean, lower bound, upper bound), each rounded to ``round`` decimals.
    """
    values = 1.0 * np.array(data)
    dof = len(values) - 1
    mean = np.mean(values)
    std_err = scipy.stats.sem(values)
    # half-width of the interval: standard error times the t critical value
    half_width = std_err * scipy.stats.t.ppf((1 + confidence) / 2., dof)
    bounds = (mean, mean - half_width, mean + half_width)
    return tuple(np.round(value, round) for value in bounds)


# unique() keeps first-appearance order, so the report order is the same on
# every run (iterating a set of strings is not stable across kernel sessions)
models = list(data.Model.unique())
datasets = list(data.Dataset.unique())
# force order
methods = ['raw/Q', 'prep/Q', 'raw/BOW', 'prep/BOW']

for model in models:
    print(model)
    for dataset in datasets:
        print(f'  {dataset}')
        for method in methods:
            # the per-fold scores of this model/dataset/method combination
            current_data = data[(data.Model==model) & (data.Dataset==dataset) & (data.Method==method)]
            f1, f1_l, f1_h = mean_confidence_interval(list(current_data.F1), round=round)
            acc, acc_l, acc_h = mean_confidence_interval(list(current_data.Accuracy), round=round)
            print(f'    {method}: f1: {f1}, [{f1_l},{f1_h}] | acc: {acc}, [{acc_l},{acc_h}]')
    print('\n')


TF-IDF NB
  Emails
    raw/Q: f1: 0.895, [0.874,0.917] | acc: 0.946, [0.939,0.954]
    prep/Q: f1: 0.959, [0.95,0.968] | acc: 0.977, [0.972,0.983]
    raw/BOW: f1: 0.944, [0.931,0.956] | acc: 0.97, [0.963,0.976]
    prep/BOW: f1: 0.953, [0.937,0.97] | acc: 0.973, [0.964,0.982]
  SMS
    raw/Q: f1: 0.908, [0.871,0.944] | acc: 0.977, [0.968,0.986]
    prep/Q: f1: 0.842, [0.801,0.882] | acc: 0.955, [0.945,0.965]
    raw/BOW: f1: 0.882, [0.848,0.917] | acc: 0.97, [0.96,0.98]
    prep/BOW: f1: 0.841, [0.814,0.867] | acc: 0.953, [0.948,0.958]


TF-IDF KNN
  Emails
    raw/Q: f1: 0.86, [0.812,0.907] | acc: 0.914, [0.887,0.941]
    prep/Q: f1: 0.938, [0.919,0.957] | acc: 0.965, [0.953,0.976]
    raw/BOW: f1: 0.897, [0.883,0.911] | acc: 0.94, [0.935,0.945]
    prep/BOW: f1: 0.933, [0.923,0.943] | acc: 0.962, [0.955,0.968]
  SMS
    raw/Q: f1: 0.901, [0.871,0.932] | acc: 0.975, [0.968,0.983]
    prep/Q: f1: 0.85, [0.804,0.897] | acc: 0.963, [0.95,0.975]
    raw/BOW: f1: 0.852, [0.813,0.89] | acc

## resetting

In [None]:
# Clear the kernel namespace now that the analysis is finished
%reset -f