# Spam detection: Apriori Algorithm together with naive bayes





In [95]:
# importing modules

import pandas as pd
import numpy as np
import math
from sklearn.feature_extraction.text import CountVectorizer
from mlxtend.frequent_patterns import apriori
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold 
from ast import literal_eval
from tqdm import tqdm
import mmh3





seed = 42

In [85]:
df = pd.read_csv('data/clean_spam.csv',encoding='ISO-8859-1')
df.head()

Unnamed: 0,label,text,tokens
0,ham,"Go until jurong point, crazy.. Available only ...","['go', 'jurong', 'point', 'crazi', 'avail', 'b..."
1,ham,Ok lar... Joking wif u oni...,"['ok', 'lar', 'joke', 'wif', 'u', 'oni']"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"['free', 'entri', 'wkli', 'comp', 'win', 'fa',..."
3,ham,U dun say so early hor... U c already then say...,"['u', 'dun', 'say', 'earli', 'hor', 'u', 'c', ..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","['nah', 'think', 'goe', 'usf', 'live', 'around..."


In [86]:

# formatting
df.tokens = df.tokens.apply(literal_eval)



In [160]:

def apriori_for_binary(df, labels, minimum_support, look_up):
    word_freq_dict = {}

    # CHOOSE support and confidence

    #minimum_support = [0.005, 0.03] # for 'ham' and 'spam' respectivly

    # looping labels
    for idx, y in enumerate(labels):
        # filtering
        filtered_df = df[df.label == y]
        # generating corpus for vectorizer
        corpus = []
        for row in filtered_df[look_up]:
            corpus.append(' '.join(row))
        # vectorizing words
        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(corpus)
        column_values = vectorizer.get_feature_names_out()
        # generating dataframe
        transformed_df = pd.DataFrame(data = X.toarray(), columns = column_values)

        # performing a-priori
        word_freqs = apriori(transformed_df.astype(bool), 
                            min_support=minimum_support[idx], 
                            use_colnames=True)
        word_freqs.itemsets = word_freqs.itemsets.apply(lambda x: set(x)).astype("unicode").apply(literal_eval)

        word_freq_dict[y] = word_freqs
    # word frequency dictionaries based on apriori for the two labels
    return word_freq_dict

In [88]:



# using naive bayes
def classify(x, word_freq_dict, labels):

    #x = df.iloc[2]

    # looping labels
    output = [0]*len(labels) # output probabilities
    
    if len(labels) == 2:
        combined = list(word_freq_dict[labels[0]].itemsets) + list(word_freq_dict[labels[1]].itemsets)
    else:
        # not implemented for non-binary
        raise NotImplemented
    
    for idx, y in enumerate(labels):
    #for idx, y in enumerate(['ham']):  
        #prior = len(df[df.label == y])/len(df)
        prior = len(word_freq_dict[y].itemsets)/len(combined)

        # prior is added
        #prob = np.log(prior)
        prob=np.log(prior) 

        denominator = word_freq_dict[y].support.sum()

        # Laplace estimator to avoid the zero
        add = len(combined)

        #for (index, word_freq) in word_freq_dict[y].iterrows():
        # looping each wordset
        for word_set in combined:
            # if word set is a subset all the supported
            if set(word_set).issubset(x):
                # if word set not supported for current label
                if word_set not in list(word_freq_dict[y].itemsets):
                    # not squared to make it a negative contribution
                    prob += np.log(1/ (denominator + add) ) # itemset not found, laplace
                # supported by current label
                else:
                    row = word_freq_dict[y].loc[word_freq_dict[y]['itemsets'] == word_set]
                    prob += np.log( ( (float(row.support)+1) / (denominator + add) ) )
        output[idx] = prob
    
    prediction = labels[np.argmax(output)]
    # predicted label
    return prediction

In [81]:

# 20 % test
kf = RepeatedKFold(n_splits=5, n_repeats=1, random_state=seed) 

# hyperparameters
labels = list(np.unique(df.label)) # currently repeated

minimum_support = [0.005, 0.03]

scores = [] # to store accuracies

for train_index, test_index in tqdm(kf.split(df)):
    #X_train, X_test = df.tokens[train_index], df.tokens[test_index] 
    #y_train, y_test = df.label[train_index], df.label[test_index]
    #pd.concat([X_train, y_train],axis=1)
    
    df_train = df.loc[train_index]
    
    df_test = df.loc[test_index]
    
    # training?
    word_freq_dict =  apriori_for_binary(df_train, labels, minimum_support, 'tokens')
    
    #for 
    # classifying using the naive bayes
    correct = 0
    for idx, x in df_test.iterrows():
        # by x.tokens we remove the 'label' data
        pred = classify(x.tokens, word_freq_dict, labels)
        correct += int(pred==x.label)
    
    scores.append(correct / (len(df_test)))

print(scores)

5it [00:13,  2.62s/it]

[0.9354260089686098, 0.9417040358744395, 0.9434470377019749, 0.9326750448833034, 0.926391382405745]





# Shingles

In [174]:
# create shingle function
def shingles(string:str,q:int):
    output = set()
    for i in range(len(string)+1):
        if i < q:
            pass
        else:
            output.add(' '.join(string[i-q:i]))
    return output

#create listhash function
def listhash(l,seeds):
    vals = []

    for e in l:
        val = 0
        for seed in seeds:
            val = val ^ mmh3.hash(e, seed)
        vals.append(str(val))
    return vals

#create signatures function
def signatures(docs, q=9, k=20):
    sign = {}
    for key, value in docs.items():
        sign[key] = listhash(shingles(value,q=q),np.arange(k))
    return sign

    
#create document sim function
def similarity(docs:dict):
    output = np.zeros((len(docs.keys()),len(docs.keys())))
    for key1, value1 in docs.items():
        for key2, value2 in docs.items():
            if key1 == key2:
                pass
            else:
                jac_value = jaccard(value1,value2)
                output[key1,key2] = jac_value
    return output
            

In [175]:
#determine q and k
q = 5 #number of characters in each shingle
k = 20 #number of hashes per shingle


#create signatures for emails (we keep count based on index in emails)
df_signatures = {df.index[i]: listhash(shingles(df['text'][i], q=q), np.arange(k)) for i in df.index}


In [225]:

# 20 % test
kf = RepeatedKFold(n_splits=5, n_repeats=1, random_state=seed) 

# hyperparameters
labels = list(np.unique(df.label)) # currently repeated

minimum_support = [0.025, 0.07]

scores = [] # to store accuracies

for train_index, test_index in tqdm(kf.split(df)):
    #X_train, X_test = df.tokens[train_index], df.tokens[test_index] 
    #y_train, y_test = df.label[train_index], df.label[test_index]
    #pd.concat([X_train, y_train],axis=1)
    
    df_train = df.loc[train_index]
    
    df_test = df.loc[test_index]
    
    # training?
    word_freq_dict =  apriori_for_binary(df_train, labels, minimum_support, 'signatures')
    
    #for 
    # classifying using the naive bayes
    correct = 0
    for idx, x in df_test.iterrows():
        # by x.tokens we remove the 'label' data
        pred = classify(x.tokens, word_freq_dict, labels)
        
        correct += int(pred==x.label)
    
    scores.append(correct / (len(df_test)))

print(scores)

5it [00:27,  5.55s/it]

[0.8654708520179372, 0.8663677130044843, 0.8680430879712747, 0.8707360861759426, 0.8590664272890485]





comments
- since the support needs to be lower, it could indicate that hashing finds more distinct, and therefore i could be necessary to include more in the frequent itemsets.