In [1]:
from improved_apriori import Improved_Apriori
from efficient_apriori import apriori
import pandas as pd
import numpy as np

In [2]:
# Small hand-made transaction set (transaction id -> list of items) used to
# compare the two Apriori implementations step by step.
data = {
    "T1": ["I1", "I2", "I5", "I3"],
    "T2": ["I2", "I4", "I3"],
    "T3": ["I2", "I4"],
    "T4": ["I1", "I2", "I4"],
    "T5": ["I1", "I3"],
    "T6": ["I2", "I3"],
    "T7": ["I1", "I3"],
    "T8": ["I1", "I2", "I3", "I5"],
    "T9": ["I1", "I2", "I3"],
}

###  In this test, we check our own Apriori implementation against Efficient Apriori to ensure that our algorithm is correct

In [3]:
"""
To keep things simple, we start with a small, known dataset and compare the
output of each step between the two implementations:

1. Pruning step
2. Candidate generation step
3. The resulting frequent itemsets
"""
print("\n Own Apriori: \n")

# Run our own implementation with verbose logging so every level
# (candidates and frequent itemsets) is printed for inspection.
improved_apriori = Improved_Apriori(
    data,
    min_support=0.3,
    min_confidence=0.1,
    verbose=2,
)
improved_apriori_itemset = improved_apriori.apriori()

# Plain list of the transactions (dict values only), handed to
# Efficient-Apriori in the following cell.
l = list(data.values())




 Own Apriori: 

Found 5 candidate itemsets from 1st Level
Found 4 frequent itemsets from 1th item candidate sets
Found 6 candidates for 2th itemsets
Candidate Sets for 2th itemset : [('I1', 'I2'), ('I1', 'I3'), ('I1', 'I4'), ('I2', 'I3'), ('I2', 'I4'), ('I3', 'I4')]
Time taken to find 2th item candidate sets: 9.036064147949219e-05


100%|██████████| 6/6 [00:00<00:00, 71697.50it/s]


Found 4 frequent itemsets from candidate 2th-itemsets
Frequent Itemsets for 2th itemset : {('I1', 'I2'): 4, ('I1', 'I3'): 5, ('I2', 'I3'): 5, ('I2', 'I4'): 3}
Found 1 candidates for 3th itemsets
Candidate Sets for 3th itemset : [('I1', 'I2', 'I3')]
Time taken to find 3th item candidate sets: 4.9114227294921875e-05


100%|██████████| 1/1 [00:00<00:00, 5562.74it/s]


Found 1 frequent itemsets from candidate 3th-itemsets
Frequent Itemsets for 3th itemset : {('I1', 'I2', 'I3'): 3}
Found 0 candidates for 4th itemsets
Candidate Sets for 4th itemset : []
Time taken to find 4th item candidate sets: 3.695487976074219e-05


0it [00:00, ?it/s]

Found 0 frequent itemsets from candidate 4th-itemsets
Frequent Itemsets for 4th itemset : {}





In [4]:
print("\n Efficient Apriori: \n")

# Reference run on the same transactions. The second return value (the
# generated rules) is discarded; only the frequent itemsets are compared.
efficient_apriori_itemsets, _ = apriori(
    l,
    min_support=0.3,
    min_confidence=1,
    verbosity=2,
)


 Efficient Apriori: 

Generating itemsets.
 Counting itemsets of length 1.
  Found 5 candidate itemsets of length 1.
  Found 4 large itemsets of length 1.
    [('I1',), ('I2',), ('I3',), ('I4',)]
 Counting itemsets of length 2.
  Found 6 candidate itemsets of length 2.
   [('I1', 'I2'), ('I1', 'I3'), ('I1', 'I4'), ('I2', 'I3'), ('I2', 'I4'), ('I3', 'I4')]
    Iterating over transactions.
  Found 4 large itemsets of length 2.
   [('I1', 'I2'), ('I1', 'I3'), ('I2', 'I3'), ('I2', 'I4')]
 Counting itemsets of length 3.
  Found 1 candidate itemsets of length 3.
   [('I1', 'I2', 'I3')]
    Iterating over transactions.
  Found 1 large itemsets of length 3.
   [('I1', 'I2', 'I3')]
 Counting itemsets of length 4.
  Found 0 candidate itemsets of length 4.
   []
Itemset generation terminated.

Generating rules from itemsets.
 Generating rules of size 2.
 Generating rules of size 3.
Rule generation terminated.



In [5]:
# Frequent itemsets found by our own implementation on the toy dataset.
improved_apriori_itemset 

{1: {('I1',): 6, ('I2',): 7, ('I3',): 7, ('I4',): 3},
 2: {('I1', 'I2'): 4, ('I1', 'I3'): 5, ('I2', 'I3'): 5, ('I2', 'I4'): 3},
 3: {('I1', 'I2', 'I3'): 3}}

In [6]:
# Frequent itemsets found by Efficient-Apriori on the same toy dataset.
efficient_apriori_itemsets

{1: {('I1',): 6, ('I2',): 7, ('I3',): 7, ('I4',): 3},
 2: {('I1', 'I2'): 4, ('I1', 'I3'): 5, ('I2', 'I3'): 5, ('I2', 'I4'): 3},
 3: {('I1', 'I2', 'I3'): 3}}

In [7]:
# Next, load a slightly larger dataset so the two implementations can be
# compared across a range of support thresholds. Exact duplicate rows are
# dropped right after reading.
data = pd.read_csv('test_dataset/bread basket.csv').drop_duplicates()

data.head(10)

Unnamed: 0,Transaction,Item,date_time,period_day,weekday_weekend
0,1,Bread,30-10-2016 09:58,morning,weekend
1,2,Scandinavian,30-10-2016 10:05,morning,weekend
3,3,Hot chocolate,30-10-2016 10:07,morning,weekend
4,3,Jam,30-10-2016 10:07,morning,weekend
5,3,Cookies,30-10-2016 10:07,morning,weekend
6,4,Muffin,30-10-2016 10:08,morning,weekend
7,5,Coffee,30-10-2016 10:13,morning,weekend
8,5,Pastry,30-10-2016 10:13,morning,weekend
9,5,Bread,30-10-2016 10:13,morning,weekend
10,6,Medialuna,30-10-2016 10:16,morning,weekend


In [8]:
# Collapse the long-format rows into one list of items per transaction id.
data = (
    data
    .groupby('Transaction')['Item']
    .apply(list)
)

# Mapping of transaction id -> list of items, the input shape used for
# Improved_Apriori below.
data_dict = data.to_dict()

In [9]:
# Data processing: `data_dict` (transaction id -> list of items) is the
# structure fed to our own implementation.

# Efficient-Apriori is given only the item lists, without the transaction ids.
data_list = [*data_dict.values()]

In [10]:
# Compare the two itemsets
def test_apriori(frequent_itemset_1, frequent_itemset_2):
    if(len(frequent_itemset_1)!=len(frequent_itemset_2)):
        return False
    
    for i in range(len(frequent_itemset_1)):
        sorted_dict1 = {tuple(sorted(k)): v for k, v in frequent_itemset_1[i+1].items()}
        sorted_dict2 = {tuple(sorted(k)): v for k, v in frequent_itemset_2[i+1].items()}
        if(sorted_dict1!=sorted_dict2):
            return False
        
    return True


In [11]:
# Collect every case in which the two implementations disagree. Each entry is
# [our result, Efficient-Apriori result, min_support].
diff_result = []

# NOTE(review): np.arange with a non-integer step can produce an off-by-one
# number of points because of floating-point rounding; np.linspace may be a
# more predictable choice here - confirm before changing.
for min_support in np.arange(0.001, 0.3, 0.0001):
    # Own implementation
    improved_apriori = Improved_Apriori(data_dict, min_support=min_support)
    frequent_itemset_dict = improved_apriori.apriori()

    # Efficient Apriori (the generated rules are discarded)
    frequent_itemset_dict_2, _ = apriori(
        data_list,
        min_support=min_support,
        max_length=12,
        min_confidence=1,
    )

    # Remember both results and the threshold whenever they do not match.
    if not test_apriori(frequent_itemset_dict, frequent_itemset_dict_2):
        diff_result.append([frequent_itemset_dict, frequent_itemset_dict_2, min_support])

100%|██████████| 1596/1596 [00:00<00:00, 46246.32it/s]
100%|██████████| 840/840 [00:00<00:00, 9834.21it/s]
100%|██████████| 59/59 [00:00<00:00, 4019.36it/s]
0it [00:00, ?it/s]
100%|██████████| 1540/1540 [00:00<00:00, 47586.70it/s]
100%|██████████| 786/786 [00:00<00:00, 9786.95it/s]
100%|██████████| 43/43 [00:00<00:00, 3696.71it/s]
0it [00:00, ?it/s]
100%|██████████| 1378/1378 [00:00<00:00, 41729.25it/s]
100%|██████████| 697/697 [00:00<00:00, 9203.48it/s]
100%|██████████| 36/36 [00:00<00:00, 3638.52it/s]
0it [00:00, ?it/s]
100%|██████████| 1225/1225 [00:00<00:00, 39249.11it/s]
100%|██████████| 602/602 [00:00<00:00, 8806.68it/s]
100%|██████████| 31/31 [00:00<00:00, 3636.21it/s]
0it [00:00, ?it/s]
100%|██████████| 1128/1128 [00:00<00:00, 40403.90it/s]
100%|██████████| 568/568 [00:00<00:00, 9114.60it/s]
100%|██████████| 26/26 [00:00<00:00, 3694.05it/s]
100%|██████████| 1081/1081 [00:00<00:00, 38852.79it/s]
100%|██████████| 527/527 [00:00<00:00, 8951.59it/s]
100%|██████████| 19/19 [00:00<00

In [12]:
# Print the min_support values where results differ.
#
# An explicit if/else is used on purpose: the `else` clause of a `for` loop
# runs whenever the loop finishes without `break`, so a `for ... else` here
# would print the "no difference" message even when mismatches were recorded.
if diff_result:
    for mismatch in diff_result:
        # Each entry is [own result, Efficient-Apriori result, min_support].
        print("The results are different for these min_support values:", mismatch[2])
else:
    print("There is no difference in results betweeen the two algorithm")

There is no difference in results betweeen the two algorithm
