In [1]:
from apriori import Apriori
from improved_apriori import Improved_Apriori
import pandas as pd
import numpy as np
import csv
import json
from efficient_apriori import apriori
from apyori import apriori as apyori_apriori

In [2]:
# Test Dataset
# Load the transaction table from disk. Later cells treat each row as one
# transaction and each non-null column label as an item contained in it.
data = pd.read_csv('test_dataset/test1.csv')

# Own Algorithm vs Efficient Apriori

In [3]:
def test_apriori(frequent_itemset_1, frequent_itemset_2):
    if(len(frequent_itemset_1)!=len(frequent_itemset_2)):
        return False
    
    for i in range(len(frequent_itemset_1)):
        sorted_dict1 = {tuple(sorted(k)): v for k, v in frequent_itemset_1[i+1].items()}
        sorted_dict2 = {tuple(sorted(k)): v for k, v in frequent_itemset_1[i+1].items()}
        if(sorted_dict1!=sorted_dict2):
            return False
        
    return True


In [4]:
# Data Processing: reshape the table into the two transaction layouts used below.
# For our own implementation: a dictionary keyed by row index whose value is
# the list of column labels that are non-null in that row.
data_dict = {
    row_label: row.dropna().index.tolist()
    for row_label, row in data.iterrows()
}

# For the Efficient-Apriori implementation: the same per-row lists, as a plain
# list in row order.
data_list = [row.dropna().index.tolist() for _row_label, row in data.iterrows()]

In [5]:
# Sweep min_support in steps of 0.0001 starting at 0.0001 (upper bound 0.1) and
# collect every threshold at which the two implementations disagree.
diff_result = []
for min_support in np.arange(0.0001, 0.1, 0.0001):
    # Our own implementation.
    improved_apriori = Improved_Apriori(data_dict, min_support=min_support, min_confidence=1)
    frequent_itemset = improved_apriori.apriori()

    # Re-key the levels from 0-based positions to 1-based level numbers, the
    # layout test_apriori indexes into.
    frequent_itemset_dict = {
        level + 1: frequent_itemset[level]
        for level in range(len(frequent_itemset))
    }

    # Reference implementation; its second return value is not needed here.
    frequent_itemset_dict_2, _ = apriori(data_list, min_support=min_support, min_confidence=1)

    results_match = test_apriori(frequent_itemset_dict, frequent_itemset_dict_2)
    if results_match:
        continue
    # Keep both results and the threshold so the mismatch can be inspected later.
    diff_result.append([frequent_itemset_dict, frequent_itemset_dict_2, min_support])




100%|██████████| 10/10 [00:00<00:00, 84733.41it/s]
100%|██████████| 10/10 [00:00<00:00, 153637.51it/s]
100%|██████████| 5/5 [00:00<00:00, 115864.75it/s]
100%|██████████| 1/1 [00:00<00:00, 9157.87it/s]
100%|██████████| 10/10 [00:00<00:00, 128266.18it/s]
100%|██████████| 10/10 [00:00<00:00, 173318.35it/s]
100%|██████████| 5/5 [00:00<00:00, 119156.36it/s]
100%|██████████| 1/1 [00:00<00:00, 8683.86it/s]
100%|██████████| 10/10 [00:00<00:00, 164482.51it/s]
100%|██████████| 10/10 [00:00<00:00, 37752.51it/s]
100%|██████████| 5/5 [00:00<00:00, 50901.75it/s]
100%|██████████| 1/1 [00:00<00:00, 12633.45it/s]
100%|██████████| 10/10 [00:00<00:00, 161319.38it/s]
100%|██████████| 10/10 [00:00<00:00, 90982.73it/s]
100%|██████████| 5/5 [00:00<00:00, 94466.31it/s]
100%|██████████| 1/1 [00:00<00:00, 15827.56it/s]
100%|██████████| 10/10 [00:00<00:00, 158875.15it/s]
100%|██████████| 10/10 [00:00<00:00, 175493.89it/s]
100%|██████████| 5/5 [00:00<00:00, 125577.96it/s]
100%|██████████| 1/1 [00:00<00:00, 27962.


100%|██████████| 10/10 [00:00<00:00, 128266.18it/s]
100%|██████████| 10/10 [00:00<00:00, 161942.24it/s]
100%|██████████| 5/5 [00:00<00:00, 103819.41it/s]
100%|██████████| 1/1 [00:00<00:00, 27776.85it/s]
100%|██████████| 10/10 [00:00<00:00, 147686.76it/s]
100%|██████████| 10/10 [00:00<00:00, 68089.35it/s]
100%|██████████| 5/5 [00:00<00:00, 82241.25it/s]
100%|██████████| 1/1 [00:00<00:00, 26214.40it/s]
100%|██████████| 10/10 [00:00<00:00, 142663.40it/s]
100%|██████████| 10/10 [00:00<00:00, 161942.24it/s]
100%|██████████| 5/5 [00:00<00:00, 26051.58it/s]
100%|██████████| 1/1 [00:00<00:00, 24966.10it/s]
100%|██████████| 10/10 [00:00<00:00, 137068.76it/s]
100%|██████████| 10/10 [00:00<00:00, 160701.30it/s]
100%|██████████| 5/5 [00:00<00:00, 101803.50it/s]
100%|██████████| 1/1 [00:00<00:00, 8004.40it/s]
100%|██████████| 10/10 [00:00<00:00, 140748.46it/s]
100%|██████████| 10/10 [00:00<00:00, 151418.92it/s]
100%|██████████| 5/5 [00:00<00:00, 103819.41it/s]
100%|██████████| 1/1 [00:00<00:00, 24

In [6]:
# Report every min_support at which the two results disagreed.
# Each diff_result entry is [own result, reference result, min_support].
for mismatch in diff_result:
    print("The results are different for these min_support values:", mismatch[2])

# Efficient Apriori

In [7]:
# data processing
# One list per row, holding the column labels that are non-null in that row.
data_list = [row.dropna().index.tolist() for _row_label, row in data.iterrows()]
data_list

[['handphone', 'laptop'],
 ['handphone', 'laptop', 'charger'],
 ['handphone', 'laptop', 'charger', 'powerbank'],
 ['handphone', 'laptop', 'tablet'],
 ['handphone', 'charger', 'tablet'],
 ['powerbank', 'tablet'],
 ['handphone', 'laptop', 'charger', 'tablet'],
 ['handphone', 'charger'],
 ['handphone', 'powerbank'],
 ['laptop', 'charger', 'powerbank']]

In [8]:
# Mine the frequent itemsets with efficient_apriori; the second return value
# is discarded.
itemsets, _ = apriori(
    data_list,
    min_confidence=1,
    min_support=0.001,
)
# Display the level-1 entries (single-item itemsets).
itemsets[1]

{('handphone',): 8,
 ('laptop',): 6,
 ('charger',): 6,
 ('powerbank',): 4,
 ('tablet',): 4}

In [9]:
# Compare the level-1 itemsets of our implementation with efficient_apriori's.
# NOTE(review): frequent_itemset_dict is whatever the last iteration of the
# min_support sweep loop left behind, whereas itemsets was mined with
# min_support=0.001, so the two sides were produced with different thresholds.
frequent_itemset_dict[1] == itemsets[1]


True