# Dedicated to finding:
## - somewhat optimal hyperparameters in the Apriori Naive Bayes model

---

## importing modules

In [1]:
# importing modules

import pandas as pd
import numpy as np
import math
from sklearn.feature_extraction.text import CountVectorizer
from mlxtend.frequent_patterns import apriori
from sklearn.model_selection import RepeatedKFold, KFold, train_test_split
from sklearn.metrics import f1_score, accuracy_score
import time
from mlxtend.preprocessing import TransactionEncoder

from ast import literal_eval
from tqdm.notebook import tqdm
import mmh3
from itertools import product


# Random seed for the notebook.
# NOTE(review): not referenced by any cell visible here; the train/test split
# below passes random_state=0 instead - confirm which one is intended.
seed = 42




---

## Defining the model

In [56]:

# character q-gram (shingle) extraction
def shingles(term_list:list,q:int):
    """Return the distinct length-``q`` character windows of the joined terms.

    The terms are joined with single spaces, a window of width ``q`` is slid
    over the resulting string one character at a time, and every distinct
    window is collected.

    Parameters
    ----------
    term_list : list
        Strings to join with spaces before windowing.
    q : int
        Window width in characters.

    Returns
    -------
    list
        The unique shingles, in arbitrary (set) order. Empty when the joined
        string is shorter than ``q``.
    """
    text = ' '.join(term_list)
    # a window ending at position `end` covers text[end - q:end]; the first
    # complete window ends at q and the last one at len(text)
    window_ends = range(max(q, 0), len(text) + 1)
    return list({text[end - q:end] for end in window_ends})


# OLD
def apriori_for_binary(df, labels, minimum_support, look_up):
    """Legacy miner: per-label frequent itemsets at a fixed minimum support.

    For every label, the rows of ``df`` carrying that label are selected,
    their ``look_up`` entries are space-joined into documents, vectorised
    with ``CountVectorizer``, cast to booleans and handed to ``apriori``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose a ``label`` column and the ``look_up`` column.
    labels : sequence
        Labels to mine, one result per label.
    minimum_support : sequence of float
        ``min_support`` passed to ``apriori``; entry ``i`` belongs to
        ``labels[i]``.
    look_up : str
        Name of the column whose rows are iterables of strings.

    Returns
    -------
    dict
        ``label -> apriori result frame`` whose ``itemsets`` column has been
        converted to plain ``set`` objects.
    """
    word_freq_dict = {}
    for position, label in enumerate(labels):
        label_rows = df[df.label == label]

        # one space-joined document per row, as expected by the vectorizer
        documents = [' '.join(tokens) for tokens in label_rows[look_up]]

        vectorizer = CountVectorizer()
        term_counts = vectorizer.fit_transform(documents)
        feature_names = vectorizer.get_feature_names_out()
        count_frame = pd.DataFrame(data=term_counts.toarray(), columns=feature_names)

        # apriori works on presence/absence, hence the boolean cast
        mined = apriori(
            count_frame.astype(bool),
            min_support=minimum_support[position],
            use_colnames=True,
        )
        # itemsets -> set -> text -> set again via literal_eval
        as_sets = mined.itemsets.apply(lambda x: set(x))
        mined.itemsets = as_sets.astype("unicode").apply(literal_eval)

        word_freq_dict[label] = mined

    # one frequent-itemset frame per label
    return word_freq_dict


def get_freq_itemset(data, labels, look_up, num_itemsets):
    """Mine the most supported frequent itemsets for every label.

    For each label the minimum support starts at 0.1 and is lowered by 5% per
    attempt until ``apriori`` returns more than the requested number of
    itemsets; the best supported ones are then kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Must expose a ``label`` column and the ``look_up`` column, whose rows
        are the transactions (iterables of items).
    labels : sequence
        Labels to mine, one result per label.
    look_up : str
        Name of the transaction column.
    num_itemsets : sequence of numbers
        Number of itemsets to keep; entry ``i`` belongs to ``labels[i]``.
        Values are cast to ``int`` so float or numpy grid values are accepted.

    Returns
    -------
    dict
        ``label -> DataFrame`` sorted by descending ``support`` with at most
        the requested number of rows. It carries apriori's ``support`` and
        ``itemsets`` columns plus the ``index`` column added by
        ``reset_index()``. Fewer rows are returned when the transactions
        cannot produce that many distinct itemsets.
    """
    freq_itemsets_dict = {}
    for idx, label in enumerate(labels):
        wanted = int(num_itemsets[idx])

        # The one-hot transaction matrix does not depend on the support
        # threshold, so it is built once per label, not once per attempt.
        transactions = list(data[data.label == label][look_up].values)
        te = TransactionEncoder()
        te_ary = te.fit(transactions).transform(transactions)
        df = pd.DataFrame(te_ary, columns=te.columns_)

        # An itemset that occurs at all has support >= 1/n, so lowering the
        # threshold below that can never reveal anything new. Stopping there
        # guarantees termination when fewer than `wanted` itemsets exist.
        lowest_useful_support = 1.0 / max(len(transactions), 1)

        support = 0.1
        while True:
            itemsets = apriori(df, min_support=support, use_colnames=True)
            if len(itemsets) > wanted or support <= lowest_useful_support:
                break
            support -= support / 20

        # finding top supported
        formatting = (
            itemsets.sort_values(by='support', ascending=False)
            .iloc[:wanted]
            .reset_index()
        )
        # todo: normalizing
        #formatting.support = formatting.support/formatting.support.sum()
        freq_itemsets_dict[label] = formatting

    return freq_itemsets_dict


# using naive bayes
def NB_classify(x, freq_itemsets_dict, labels, priors):
    """Predict the label of one document from per-label frequent itemsets.

    Every label starts at ``log(prior)``. For each frequent itemset (from any
    label) that is fully contained in ``x``, every label's score receives
    ``log(support + smoothing) * count``. ``support`` is the itemset's
    support under that label (0 when the label does not list it),
    ``smoothing`` is half of that label's smallest support (this keeps the
    logarithm finite), and ``count`` is the smallest number of occurrences
    in ``x`` among the itemset's items.

    Parameters
    ----------
    x : list
        Items (tokens or shingles) of the document to classify.
    freq_itemsets_dict : dict
        ``label -> DataFrame`` with ``itemsets`` (hashable collections of
        items) and ``support`` columns.
    labels : sequence
        Candidate labels; any number of labels is supported.
    priors : dict
        ``label -> prior probability``.

    Returns
    -------
    The element of ``labels`` with the highest score (the first one on ties).

    Raises
    ------
    ValueError
        If a label has no frequent itemsets, since its smoothing term is
        then undefined.
    """
    # Per-label state, built once: itemset -> support lookup, smoothing
    # term and running log-score. The dict lookup replaces a scan of the
    # itemsets column for every candidate itemset.
    support_by_label = []
    smoothing = []
    prob = []
    for label in labels:
        freq = freq_itemsets_dict[label]
        support_by_label.append(dict(zip(freq['itemsets'], freq['support'])))
        smoothing.append(min(freq['support']) / 2)
        prob.append(np.log(priors[label]))

    # every itemset that is frequent for at least one label
    combined = set().union(*support_by_label)

    items_in_x = set(x)
    for word_set in combined:
        # only itemsets completely contained in the document contribute
        if not items_in_x.issuperset(word_set):
            continue
        # an itemset occurs as often as its rarest member
        count = min(x.count(item) for item in word_set)
        for idx, supports in enumerate(support_by_label):
            # labels that do not list the itemset fall back to smoothing only
            prob[idx] += np.log(supports.get(word_set, 0.0) + smoothing[idx]) * count

    # predicted label
    return labels[np.argmax(prob)]


def count_occurences(word_set, x):
    """Count how often the items of ``word_set`` occur in ``x``.

    A single-item set yields that item's ``x.count``; for larger sets the
    result is the smallest ``x.count`` among the items.

    Parameters
    ----------
    word_set : collection
        Items to look for.
    x : list or str
        Object providing ``count``.

    Returns
    -------
    int
        The (minimum) occurrence count.
    """
    items = list(word_set)
    if len(items) == 1:
        (only_item,) = items
        return x.count(only_item)
    # several items: the rarest one determines the count
    return min([x.count(item) for item in items])


# Complete algorithm

def apriori_NB(train, test, method, hparams, pos_label='spam'):
    """Fit the apriori-based Naive Bayes model on ``train`` and score it on ``test``.

    Frequent itemsets are mined per label from the training frame, priors are
    taken from the training label frequencies, every test row is classified,
    and the F1 score and accuracy are printed and returned.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a ``label`` column and the column selected by ``method``.
    method : {'terms', 'shingles'}
        ``'terms'`` reads the ``tokens`` column, ``'shingles'`` the
        ``shingles`` column.
    hparams : iterable of numbers
        Number of frequent itemsets to keep per label, in the order of the
        sorted unique training labels.
    pos_label : optional
        Positive class passed to ``f1_score``. Defaults to ``'spam'``.

    Returns
    -------
    tuple
        ``(f1, acc)``.

    Raises
    ------
    NotImplementedError
        If ``method`` is neither ``'terms'`` nor ``'shingles'``.
    """
    if method == 'terms':
        look_up = 'tokens'
    elif method == 'shingles':
        look_up = 'shingles'
    else:
        raise NotImplementedError(
            f"unknown method {method!r}; expected 'terms' or 'shingles'")

    # native ints: grid values may arrive as numpy scalars or floats
    num_itemsets = [int(n) for n in hparams]

    # np.unique returns the labels sorted, which fixes the order of hparams
    labels = list(np.unique(train.label))

    # getting frequent itemsets
    freq_itemsets_dict = get_freq_itemset(train, labels, look_up, num_itemsets)

    # prior of a label = its share of the training rows
    priors = {label: (train.label == label).sum() / len(train) for label in labels}

    # classifying every test document with the naive bayes rule
    y_test = list(test.label)
    y_pred = [NB_classify(doc, freq_itemsets_dict, labels, priors)
              for doc in test[look_up]]

    f1 = f1_score(y_test, y_pred, pos_label=pos_label)
    acc = accuracy_score(y_test, y_pred)

    print(f'Performance: f1={np.round(f1, 5)}  |  acc={np.round(acc, 5)}')
    # requested set sizes next to the sizes actually obtained, per label
    obtained = ','.join(str(len(freq_itemsets_dict[label])) for label in labels)
    print(f'   sets={num_itemsets}: [{obtained}]\n')

    return (f1, acc)



---

# Pilot for the SMS dataset

In [61]:
# loading dataset; the tokens column is stored as text and parsed back
# into Python objects
data = pd.read_csv('../data/clean_spam.csv')
data['tokens'] = data['tokens'].apply(literal_eval)

# splitting dataset (67% train / 33% test); each part gets a fresh 0..n-1
# index, with the previous index kept as an `index` column
train, test = (
    part.reset_index()
    for part in train_test_split(data, test_size=0.33, random_state=0)
)



### Shingles

In [62]:


# Candidate numbers of frequent itemsets per class: 100, 200, ..., 2000.
# .tolist() yields native Python ints, so the values print as plain numbers
# instead of numpy scalars.
itemsets_ham = np.linspace(100, 2000, 20).astype(int).tolist()

itemsets_spam = np.linspace(100, 2000, 20).astype(int).tolist()

#q_shingles = np.arange(2,9)

# list of all hparams
hparams = [itemsets_ham, itemsets_spam]#, list(q_shingles)]
# every (ham, spam) pair of the grid
combinations = list(product(*hparams))


# shingle width in characters
q=5
method = 'shingles'


# character shingles of width q for every message, one list per row
train['shingles'] = train['tokens'].apply(lambda tokens: shingles(tokens, q=q))
test['shingles'] = test['tokens'].apply(lambda tokens: shingles(tokens, q=q))




In [21]:
# number of hyperparameter combinations in the current grid
len(combinations)

16

In [63]:

# grid search: keep every score so the best setting can be picked afterwards
# instead of only being printed
shingle_results = []
for hparam in tqdm(combinations):
    out = apriori_NB(train=train, test=test, method=method, hparams=hparam)
    f1, acc = out
    shingle_results.append({'method': method, 'hparams': hparam, 'f1': f1, 'acc': acc})
    

  0%|          | 0/400 [00:00<?, ?it/s]

Performance: f1=0.66667  |  acc=0.87058
   sets=[100, 100]: [100,100]

Performance: f1=0.64411  |  acc=0.85699
   sets=[100, 200]: [100,200]

Performance: f1=0.64578  |  acc=0.85862
   sets=[100, 300]: [100,300]

Performance: f1=0.6539  |  acc=0.86243
   sets=[100, 400]: [100,400]

Performance: f1=0.64096  |  acc=0.85318
   sets=[100, 500]: [100,500]



KeyboardInterrupt: 

In [67]:
# timing a single configuration; perf_counter is a monotonic clock meant for
# measuring elapsed time
start_time = time.perf_counter()
out = apriori_NB(train=train, test=test, method=method, hparams=(2000, 600))
print("--- %s seconds ---" % (time.perf_counter() - start_time))


KeyboardInterrupt: 

Based on this pilot run, it seems like...

### Terms

In [49]:



# Candidate numbers of frequent itemsets per class: 100, 200, ..., 1000.
# .tolist() yields native Python ints, so the values print as plain numbers
# instead of numpy scalars.
itemsets_ham = np.linspace(100, 1000, 10).astype(int).tolist()

itemsets_spam = np.linspace(100, 1000, 10).astype(int).tolist()

#q_shingles = np.arange(2,9)

# list of all hparams
hparams = [itemsets_ham, itemsets_spam]#, list(q_shingles)]
# every (ham, spam) pair of the grid
combinations = list(product(*hparams))


method = 'terms'




In [44]:
# inspect the raw grid values (linspace returns floats)
np.linspace(100,1000, 10)

array([ 100.,  200.,  300.,  400.,  500.,  600.,  700.,  800.,  900.,
       1000.])

In [24]:
# number of hyperparameter combinations in the current grid
len(combinations)

9

In [46]:
# display every hyperparameter pair of the grid
combinations

[(100.0, 100.0),
 (100.0, 200.0),
 (100.0, 300.0),
 (100.0, 400.0),
 (100.0, 500.0),
 (100.0, 600.0),
 (100.0, 700.0),
 (100.0, 800.0),
 (100.0, 900.0),
 (100.0, 1000.0),
 (200.0, 100.0),
 (200.0, 200.0),
 (200.0, 300.0),
 (200.0, 400.0),
 (200.0, 500.0),
 (200.0, 600.0),
 (200.0, 700.0),
 (200.0, 800.0),
 (200.0, 900.0),
 (200.0, 1000.0),
 (300.0, 100.0),
 (300.0, 200.0),
 (300.0, 300.0),
 (300.0, 400.0),
 (300.0, 500.0),
 (300.0, 600.0),
 (300.0, 700.0),
 (300.0, 800.0),
 (300.0, 900.0),
 (300.0, 1000.0),
 (400.0, 100.0),
 (400.0, 200.0),
 (400.0, 300.0),
 (400.0, 400.0),
 (400.0, 500.0),
 (400.0, 600.0),
 (400.0, 700.0),
 (400.0, 800.0),
 (400.0, 900.0),
 (400.0, 1000.0),
 (500.0, 100.0),
 (500.0, 200.0),
 (500.0, 300.0),
 (500.0, 400.0),
 (500.0, 500.0),
 (500.0, 600.0),
 (500.0, 700.0),
 (500.0, 800.0),
 (500.0, 900.0),
 (500.0, 1000.0),
 (600.0, 100.0),
 (600.0, 200.0),
 (600.0, 300.0),
 (600.0, 400.0),
 (600.0, 500.0),
 (600.0, 600.0),
 (600.0, 700.0),
 (600.0, 800.0),
 (600.0, 

In [50]:

# grid search: keep every score so the best setting can be picked afterwards
# instead of only being printed
term_results = []
for hparam in tqdm(combinations):
    out = apriori_NB(train=train, test=test, method=method, hparams=hparam)
    f1, acc = out
    term_results.append({'method': method, 'hparams': hparam, 'f1': f1, 'acc': acc})

  0%|          | 0/100 [00:00<?, ?it/s]

Performance: f1=0.7031  |  acc=0.8907
   sets=[100, 100]: [100,100]

Performance: f1=0.73273  |  acc=0.90321
   sets=[100, 200]: [100,200]

Performance: f1=0.74242  |  acc=0.90756
   sets=[100, 300]: [100,300]

Performance: f1=0.74085  |  acc=0.90756
   sets=[100, 400]: [100,400]

Performance: f1=0.73574  |  acc=0.9043
   sets=[100, 500]: [100,500]

Performance: f1=0.73134  |  acc=0.90212
   sets=[100, 600]: [100,600]

Performance: f1=0.73134  |  acc=0.90212
   sets=[100, 700]: [100,700]

Performance: f1=0.73134  |  acc=0.90212
   sets=[100, 800]: [100,800]

Performance: f1=0.73134  |  acc=0.90212
   sets=[100, 900]: [100,900]

Performance: f1=0.72059  |  acc=0.89668
   sets=[100, 1000]: [100,1000]

Performance: f1=0.65569  |  acc=0.86351
   sets=[200, 100]: [200,100]

Performance: f1=0.73314  |  acc=0.90103
   sets=[200, 200]: [200,200]

Performance: f1=0.74363  |  acc=0.90701
   sets=[200, 300]: [200,300]

Performance: f1=0.74735  |  acc=0.90919
   sets=[200, 400]: [200,400]

Perform

In [17]:
# fraction of training rows labelled 'ham'
sum(train.label=='ham')/len(train)

0.8698098044468257

In [31]:
# testing 
# perf_counter is a monotonic clock meant for measuring elapsed time
start_time = time.perf_counter()
out = apriori_NB(train=train, test=test, method=method, hparams=(3000, 1000))
print("--- %s seconds ---" % (time.perf_counter() - start_time))



KeyboardInterrupt: 

# Discussion


- a minimum support threshold is hard to tune for each new dataset (so we chose the number of frequent itemsets instead)
- support varies per frequent itemset; normalize with the denominator
- the ham class might contain typical stopwords, which are sometimes represented in both classes
- not good with imbalanced datasets?
- to avoid log(0) we add min(support)/2 as a smoothing term (cf. Laplace smoothing): https://towardsdatascience.com/laplace-smoothing-in-naïve-bayes-algorithm-9c237a8bdece
- 


---

# Pilot for the _ dataset

### Shingles

### Terms