# Spam detection: Apriori Algorithm together with naive bayes





In [3]:
# importing modules

import pandas as pd
import numpy as np
import math
from sklearn.feature_extraction.text import CountVectorizer
from mlxtend.frequent_patterns import apriori
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold 
from ast import literal_eval
from tqdm.notebook import tqdm
import mmh3
from itertools import product


seed = 42

In [52]:
pwd

'/Users/arond.jacobsen/Documents/GitHub/02807-project/src'

## Formatting dataset

In [3]:
df = pd.read_csv('../data/clean_spam.csv')#,encoding='ISO-8859-1')
df.head()

Unnamed: 0,label,text,tokens
0,ham,"Go until jurong point, crazy.. Available only ...","['go', 'jurong', 'point', 'crazi', 'avail', 'b..."
1,ham,Ok lar... Joking wif u oni...,"['ok', 'lar', 'joke', 'wif', 'u', 'oni']"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"['free', 'entri', 'wkli', 'comp', 'win', 'fa',..."
3,ham,U dun say so early hor... U c already then say...,"['u', 'dun', 'say', 'earli', 'hor', 'u', 'c', ..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","['nah', 'think', 'goe', 'usf', 'live', 'around..."


In [7]:

# formatting
df.tokens = df.tokens.apply(literal_eval)



In [4]:
# create shingle function
def shingles(string:str,q:int):
    output = set()
    for i in range(len(string)+1):
        if i < q:
            pass
        else:
            output.add(' '.join(string[i-q:i]))
    return output

#create listhash function
def listhash(l,seeds):
    vals = []

    for e in l:
        val = 0
        for seed in seeds:
            val = val ^ mmh3.hash(e, seed)
        vals.append(str(val))
    return vals

#create signatures function
def signatures(docs, q=9, k=20):
    sign = {}
    for key, value in docs.items():
        sign[key] = listhash(shingles(value,q=q),np.arange(k))
    return sign

    
#create document sim function
def similarity(docs:dict):
    output = np.zeros((len(docs.keys()),len(docs.keys())))
    for key1, value1 in docs.items():
        for key2, value2 in docs.items():
            if key1 == key2:
                pass
            else:
                jac_value = jaccard(value1,value2)
                output[key1,key2] = jac_value
    return output
            

In [None]:
#determine q and k
q = 5 #number of characters in each shingle
k = 20 #number of hashes per shingle


#create signatures for emails (we keep count based on index in emails)
df_signatures = {df.index[i]: [listhash(shingles(df['text'][i], q=q), np.arange(k))] for i in df.index}

df['signatures'] = pd.DataFrame.from_dict(df_signatures, orient='index')

# formatting data

df.to_csv('../data/SMS.csv', index=False) 

data = pd.read_csv(f'../data/SMS.csv')
data.columns

## The Apriori Naive Bayes algorithm

In [26]:


def apriori_for_binary(df, labels, minimum_support, look_up):
    word_freq_dict = {}

    # CHOOSE support and confidence

    #minimum_support = [0.005, 0.03] # for 'ham' and 'spam' respectivly

    # looping labels
    for idx, y in enumerate(labels):
        # filtering
        filtered_df = df[df.label == y]
        # generating corpus for vectorizer
        corpus = []
        for row in filtered_df[look_up]:
            corpus.append(' '.join(row))
        # vectorizing words
        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(corpus)
        column_values = vectorizer.get_feature_names_out()
        # generating dataframe
        transformed_df = pd.DataFrame(data = X.toarray(), columns = column_values)
        # performing a-priori
        word_freqs = apriori(transformed_df.astype(bool), 
                            min_support=minimum_support[idx], 
                            use_colnames=True)
        word_freqs.itemsets = word_freqs.itemsets.apply(lambda x: set(x)).astype("unicode").apply(literal_eval)

        word_freq_dict[y] = word_freqs
    # word frequency dictionaries based on apriori for the two labels
    return word_freq_dict


# using naive bayes
def NB_classify(x, word_freq_dict, labels):

    #x = df.iloc[2]

    # looping labels
    output = [0]*len(labels) # output probabilities
    
    if len(labels) == 2:
        combined = list(word_freq_dict[labels[0]].itemsets) + list(word_freq_dict[labels[1]].itemsets)
    else:
        # not implemented for non-binary
        raise NotImplemented
    
    for idx, y in enumerate(labels):
    #for idx, y in enumerate(['ham']):  
        #prior = len(df[df.label == y])/len(df)
        prior = len(word_freq_dict[y].itemsets)/len(combined)

        # prior is added
        #prob = np.log(prior)
        prob=np.log(prior) 

        denominator = word_freq_dict[y].support.sum()

        # Laplace estimator to avoid the zero
        add = len(combined)

        #for (index, word_freq) in word_freq_dict[y].iterrows():
        # looping each wordset
        for word_set in combined:
            # if word set is a subset all the supported
            if set(word_set).issubset(x):
                # if word set not supported for current label
                if word_set not in list(word_freq_dict[y].itemsets):
                    # not squared to make it a negative contribution
                    prob += np.log(1/ (denominator + add) ) # itemset not found, laplace
                # supported by current label
                else:
                    row = word_freq_dict[y].loc[word_freq_dict[y]['itemsets'] == word_set]
                    prob += np.log( ( (float(row.support)+1) / (denominator + add) ) )
        output[idx] = prob
    
    prediction = labels[np.argmax(output)]
    # predicted label
    return prediction




# Complete algorithm

def apriori_NB(train, test, method, hparams):
    
    if method == 'terms':
        look_up = 'signatures'
    elif method == 'shingles':
        look_up = 'tokens'
        
    minimum_support = list(hparams)
    
    # hyperparameters
    labels = list(np.unique(train.label)) # currently repeated
    
    word_freq_dict =  apriori_for_binary(train, labels, minimum_support, look_up)
    
    
    # check size of sets based om support values
    print(f'{minimum_support}: [{len(word_freq_dict[labels[0]])},{len(word_freq_dict[labels[1]])}]')
    
    # classifying using the naive bayes
    correct = 0
    for idx, x in test.iterrows():
        # by x.tokens we remove the 'label' data
        pred = NB_classify(x[look_up], word_freq_dict, labels)
        
        correct += int(pred==x.label)
    
    accuracy = correct / (len(test))
    
    # TODO: f1_score
    
    return accuracy

## Training functions

In [30]:

# the two level cross validation

def two_level_cv(data, algorithm, method, combinations, seed, outer_folds, inner_folds):
    # creating splits
    outer_kf = RepeatedKFold(n_splits=outer_folds, n_repeats=1, random_state=seed) 
    inner_kf = RepeatedKFold(n_splits=inner_folds, n_repeats=1, random_state=seed) 

    outer_performance = []
    outer_hparams = []
    
    # outer loop
    print('outer loop')
    for outer_train_idx, outer_test_idx in tqdm(outer_kf.split(data)):
        df_outer_train = data.loc[outer_train_idx]
        df_outer_test = data.loc[outer_test_idx]
        
        # inner loop
        # saving performance per hyperparameter combination and fold
        inner_performance = np.zeros([len(combinations), inner_folds])
        inner_fold = 0
        
        inner_data = df_outer_train.reset_index(drop=True)
        print('inner loop')
        for inner_train_idx, inner_test_idx in tqdm(inner_kf.split(inner_data)):
            df_inner_train = inner_data.loc[inner_train_idx]
            df_inner_test = inner_data.loc[inner_test_idx]
            # looping each 
            for idx, hparams in enumerate(combinations):
                out = algorithm(train = df_inner_train, test=df_inner_test, 
                                method = method, hparams = hparams)
                # saving performance
                inner_performance[idx, inner_fold] = out
            
            inner_fold += 1
        best_hparams_idx = np.argmax(inner_performance.mean(axis=1))
        best_hparams = combinations[best_hparams_idx]
        outer_hparams.append(best_hparams)
        # evaluating on outer loop with best performing parameters
        out = algorithm(train = df_outer_train, test=df_outer_test, 
                        method = method, hparams = best_hparams)
        
        outer_performance.append(out)
        
    return (outer_performance, outer_hparams)


## Running commands

In [31]:

# algorithm -> dataset -> method -> hparams
#    - NOTE: hparams must have the correct order as their input in algorithm
run_commands = {
    # dataset
    'aprioir_NB': 
        # algorithm
        {'SMS': 
             # method
             {'terms': 
                  # hparams
                  {'min_supp_ham': np.linspace(0.02, 0.03, 2),
                   'min_supp_spam': np.linspace(0.06, 0.07, 2)
                  },
              # method
             'shingles': 
                  # hparams
                  {'min_supp_ham': np.linspace(0.015, 0.035, 2),
                   'min_supp_spam': np.linspace(0.06, 0.08, 2)
                  }
             }
                           
        }
    
}


algo_dict =  {'aprioir_NB': apriori_NB}



## Hyperparameter grid search

In [32]:

results = {}

# two-level CV
outer_folds = 2
inner_folds = 2

seed = 42


for algorithm in run_commands.keys():
    results[algorithm] = {}
    print(algorithm)
    for dataset in run_commands[algorithm].keys():
        print(dataset)
        results[algorithm][dataset] = {}
                
        # loading dataset
        data = pd.read_csv(f'../data/{dataset}.csv')
        data.tokens = data.tokens.apply(literal_eval)
        data.signatures = data.signatures.apply(literal_eval)
        # i.e. terms or shingles
        for method in run_commands[algorithm][dataset].keys():
            print(method)
            results[algorithm][dataset][method] = {}
            
            # list of all hparams
            hparams = []
            for param in run_commands[algorithm][dataset][method].values():
                hparams.append(list(param))
            # running combination of hparams
            combinations = list(product(*hparams))
            
            
            output = two_level_cv(data = data,
                                  algorithm = algo_dict[algorithm],
                                  method = method,
                                  combinations=combinations,
                                  seed=seed,
                                  outer_folds = outer_folds,
                                  inner_folds = inner_folds)
            outer_performance, outper_hparams = output
            
            # saving results
            results[algorithm][dataset][method]['performance'] = outer_performance
            results[algorithm][dataset][method]['hparams'] = outper_hparams
            
            print(np.mean(outer_performance))
        
        # open and save results to a file here iteratively
        # i.e. after finishing a dataset


            

aprioir_NB
SMS
terms
outer loop


0it [00:00, ?it/s]

inner loop


0it [00:00, ?it/s]

[0.02, 0.06]: [710,916]
[0.02, 0.07]: [710,364]
[0.03, 0.06]: [251,916]
[0.03, 0.07]: [251,364]
[0.02, 0.06]: [816,715]
[0.02, 0.07]: [816,346]
[0.03, 0.06]: [447,715]
[0.03, 0.07]: [447,346]
[0.02, 0.07]: [811,223]
inner loop


0it [00:00, ?it/s]

[0.02, 0.06]: [684,957]
[0.02, 0.07]: [684,457]
[0.03, 0.06]: [382,957]
[0.03, 0.07]: [382,457]
[0.02, 0.06]: [793,1125]
[0.02, 0.07]: [793,518]
[0.03, 0.06]: [381,1125]
[0.03, 0.07]: [381,518]
[0.02, 0.07]: [708,303]
0.895010768126346
shingles
outer loop


0it [00:00, ?it/s]

inner loop


0it [00:00, ?it/s]

[0.015, 0.06]: [83,35]
[0.015, 0.08]: [83,18]
[0.035, 0.06]: [20,35]
[0.035, 0.08]: [20,18]
[0.015, 0.06]: [76,50]
[0.015, 0.08]: [76,26]
[0.035, 0.06]: [20,50]
[0.035, 0.08]: [20,26]
[0.015, 0.06]: [80,37]
inner loop


0it [00:00, ?it/s]

[0.015, 0.06]: [74,40]
[0.015, 0.08]: [74,29]
[0.035, 0.06]: [19,40]
[0.035, 0.08]: [19,29]
[0.015, 0.06]: [87,60]
[0.015, 0.08]: [87,31]
[0.035, 0.06]: [17,60]
[0.035, 0.08]: [17,31]
[0.015, 0.06]: [78,46]
0.9100861450107681


In [33]:
results[algorithm]

{'SMS': {'terms': {'performance': [0.9048815506101938, 0.8851399856424982],
   'hparams': [(0.02, 0.07), (0.02, 0.07)]},
  'shingles': {'performance': [0.9055994256999282, 0.914572864321608],
   'hparams': [(0.015, 0.06), (0.015, 0.06)]}}}

## Other: testing training functions and hyperparameters

### terms

In [None]:

# terms

# 20 % test
kf = RepeatedKFold(n_splits=5, n_repeats=1, random_state=seed) 

# hyperparameters
labels = list(np.unique(df.label)) # currently repeated

minimum_support = [0.005, 0.03]

scores = [] # to store accuracies

for train_index, test_index in tqdm(kf.split(df)):
    #X_train, X_test = df.tokens[train_index], df.tokens[test_index] 
    #y_train, y_test = df.label[train_index], df.label[test_index]
    #pd.concat([X_train, y_train],axis=1)
    
    df_train = df.loc[train_index]
    
    df_test = df.loc[test_index]
    
    # training?
    word_freq_dict =  apriori_for_binary(df_train, labels, minimum_support, 'tokens')
    
    #for 
    # classifying using the naive bayes
    correct = 0
    for idx, x in df_test.iterrows():
        # by x.tokens we remove the 'label' data
        pred = NB_classify(x.tokens, word_freq_dict, labels)
        correct += int(pred==x.label)
    
    scores.append(correct / (len(df_test)))

print(scores)

### shingles

In [None]:
# SHINGLES


# 20 % test
kf = RepeatedKFold(n_splits=5, n_repeats=1, random_state=seed) 

# hyperparameters
labels = list(np.unique(df.label)) # currently repeated

minimum_support = [0.025, 0.07] # for ham and spam respectivly

scores = [] # to store accuracies

for train_index, test_index in tqdm(kf.split(df)):
    
    df_train = df.loc[train_index]
    
    df_test = df.loc[test_index]
    
    # training?
    word_freq_dict =  apriori_for_binary(df_train, labels, minimum_support, 'signatures')
    
    #for 
    # classifying using the naive bayes
    correct = 0
    for idx, x in df_test.iterrows():
        # by x.tokens we remove the 'label' data
        pred = NB_classify(x.signatures, word_freq_dict, labels)
        
        correct += int(pred==x.label)
    
    scores.append(correct / (len(df_test)))

print(scores)


