In [9]:
import pandas as pd
import numpy as np
import math
from itertools import combinations

In [3]:
transaction_df = pd.read_csv('./GroceryStoreDataSet.csv')
transaction_df

Unnamed: 0,"MILK,BREAD,BISCUIT"
0,"BREAD,MILK,BISCUIT,CORNFLAKES"
1,"BREAD,TEA,BOURNVITA"
2,"JAM,MAGGI,BREAD,MILK"
3,"MAGGI,TEA,BISCUIT"
4,"BREAD,TEA,BOURNVITA"
5,"MAGGI,TEA,CORNFLAKES"
6,"MAGGI,BREAD,TEA,BISCUIT"
7,"JAM,MAGGI,BREAD,TEA"
8,"BREAD,MILK"
9,"COFFEE,COCK,BISCUIT,CORNFLAKES"


In [7]:
transaction_df.index.rename('TID', inplace=True)
transaction_df.rename(columns={'MILK,BREAD,BISCUIT' : 'item_list'}, inplace=True)
transaction_df

Unnamed: 0_level_0,item_list
TID,Unnamed: 1_level_1
0,"BREAD,MILK,BISCUIT,CORNFLAKES"
1,"BREAD,TEA,BOURNVITA"
2,"JAM,MAGGI,BREAD,MILK"
3,"MAGGI,TEA,BISCUIT"
4,"BREAD,TEA,BOURNVITA"
5,"MAGGI,TEA,CORNFLAKES"
6,"MAGGI,BREAD,TEA,BISCUIT"
7,"JAM,MAGGI,BREAD,TEA"
8,"BREAD,MILK"
9,"COFFEE,COCK,BISCUIT,CORNFLAKES"


In [8]:
trans_df = transaction_df.item_list.str.split(',')
trans_df

TID
0      [BREAD, MILK, BISCUIT, CORNFLAKES]
1                 [BREAD, TEA, BOURNVITA]
2               [JAM, MAGGI, BREAD, MILK]
3                   [MAGGI, TEA, BISCUIT]
4                 [BREAD, TEA, BOURNVITA]
5                [MAGGI, TEA, CORNFLAKES]
6            [MAGGI, BREAD, TEA, BISCUIT]
7                [JAM, MAGGI, BREAD, TEA]
8                           [BREAD, MILK]
9     [COFFEE, COCK, BISCUIT, CORNFLAKES]
10    [COFFEE, COCK, BISCUIT, CORNFLAKES]
11             [COFFEE, SUGER, BOURNVITA]
12                  [BREAD, COFFEE, COCK]
13                [BREAD, SUGER, BISCUIT]
14            [COFFEE, SUGER, CORNFLAKES]
15              [BREAD, SUGER, BOURNVITA]
16                 [BREAD, COFFEE, SUGER]
17                 [BREAD, COFFEE, SUGER]
18        [TEA, MILK, COFFEE, CORNFLAKES]
Name: item_list, dtype: object

In [30]:
def get_item_support(transactions, itemset):
    """Calculate support for an itemset"""
    count = sum(1 for transaction in transactions if itemset.issubset(transaction))
    return count / len(transactions)

def get_frequent_itemsets(transactions, candidates, min_support):
    """Filter itemsets by minimum support"""
    frequent_itemsets = {}
    for itemset in candidates:
        support = get_item_support(transactions, itemset)
        if (support > min_support):
            frequent_itemsets[frozenset(itemset)] = support
    
    return frequent_itemsets

def apriori(transactions, min_support, min_confidence):
    """Apriori algorithm for association rule mining"""
    distinct_trans = [set(transaction) for transaction in transactions]
    distinct_items = {item for transaction in transactions for item in transaction}
    # Start with 1-item candidate
    candidates = [{item} for item in distinct_items]
    frequent_itemsets = {}

    k = 1
    while candidates:
        frequent_k = get_frequent_itemsets(distinct_trans, candidates, min_support)
        frequent_itemsets.update(frequent_k)

        k += 1
        candidates = [set1.union(set2) for set1 in frequent_k for set2 in frequent_k if len(set1.union(set2)) == k]
        candidates = list(map(frozenset, candidates))

    # Get only the itemsets with len > 1
    filtered_itemsets = {itemset: support for itemset, support in frequent_itemsets.items() if len(itemset) > 1}
    associate_rules = []
    
    for itemset, support in filtered_itemsets.items():
        for consequent in map(frozenset, combinations(itemset, 1)):
            antecedent = itemset - consequent
            if antecedent in filtered_itemsets:
                confidence = support / filtered_itemsets[antecedent]
                if confidence >= min_confidence:
                    associate_rules.append((antecedent, consequent, support, confidence))

    return filtered_itemsets, associate_rules



In [39]:
if __name__ == "__main__":
    transactions = trans_df.tolist() 
    print(transactions)
    min_support = 0.2
    min_confidence = 0.7

    frequent_itemsets, association_rules = apriori(transactions, min_support, min_confidence)
    print(frequent_itemsets, association_rules)

    for antecedent, consequent, support, confidence in association_rules:
        print(f"{set(antecedent)} => {set(consequent)} (support: {support:.2f}, confidence: {confidence:.2f})")

[['BREAD', 'MILK', 'BISCUIT', 'CORNFLAKES'], ['BREAD', 'TEA', 'BOURNVITA'], ['JAM', 'MAGGI', 'BREAD', 'MILK'], ['MAGGI', 'TEA', 'BISCUIT'], ['BREAD', 'TEA', 'BOURNVITA'], ['MAGGI', 'TEA', 'CORNFLAKES'], ['MAGGI', 'BREAD', 'TEA', 'BISCUIT'], ['JAM', 'MAGGI', 'BREAD', 'TEA'], ['BREAD', 'MILK'], ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'], ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'], ['COFFEE', 'SUGER', 'BOURNVITA'], ['BREAD', 'COFFEE', 'COCK'], ['BREAD', 'SUGER', 'BISCUIT'], ['COFFEE', 'SUGER', 'CORNFLAKES'], ['BREAD', 'SUGER', 'BOURNVITA'], ['BREAD', 'COFFEE', 'SUGER'], ['BREAD', 'COFFEE', 'SUGER'], ['TEA', 'MILK', 'COFFEE', 'CORNFLAKES']]
{frozenset({'CORNFLAKES', 'COFFEE'}): 0.21052631578947367, frozenset({'SUGER', 'COFFEE'}): 0.21052631578947367, frozenset({'MAGGI', 'TEA'}): 0.21052631578947367, frozenset({'BREAD', 'TEA'}): 0.21052631578947367, frozenset({'SUGER', 'BREAD'}): 0.21052631578947367} []
