# Association Rule Mining

In [1]:
import pandas as pd
import itertools
import math

## Step 0
- You will get two TSV files from us. Each row is a transaction listing the purchased items. Load them in your language/environment.
- Use the smaller file (items.tsv, minsupp of 70%) for development and the larger file (retail.tsv, minsupp of 10%) for evaluation.

In [28]:
import csv
# Read the transactions: one purchase per row, item ids separated by single spaces.
# newline='' is what the csv module asks for so it can handle line endings itself.
with open('retail.tsv', 'r', newline='') as f:
    reader = csv.reader(f, delimiter=" ")
    # Skip empty fields (e.g. produced by a trailing space) before converting to int.
    dataset = [[int(item) for item in purchase if item] for purchase in reader]

# Sanity check: number of transactions and size of the largest one.
print(len(dataset))
print(max((len(transaction) for transaction in dataset), default=0))

88162
76


In [40]:
# Minimum support as a fraction of all transactions (10% for retail.tsv, see Step 0).
minsupp = 0.1

## Step 1
- Implement the Apriori algorithm*.
- Using your implementation, extract frequent item sets from the given datasets.

In [12]:
def apriori(dataset, minsupp, debug=True):
    """Extract all frequent item sets from ``dataset`` level by level.

    Parameters
    ----------
    dataset : sequence of iterables
        One iterable of items per transaction. Items must be hashable and
        mutually orderable. Item order inside a transaction does not matter
        and a repeated item counts once per transaction.
    minsupp : float
        Minimum support as a fraction of the number of transactions; an item
        set is frequent when ``count / len(dataset) >= minsupp``.
    debug : bool, default True
        When true, print the current level ``k``, a progress counter and the
        number of frequent item sets found per level. When false, nothing is
        printed.

    Returns
    -------
    list of tuple
        Frequent item sets as sorted tuples, ordered by size first and
        lexicographically within each size.
    """
    total_transactions = len(dataset)
    # Normalise once: candidates are sorted tuples, and combinations() keeps
    # input order, so each transaction must be sorted and duplicate-free for
    # its combinations to match the candidates.
    transactions = [sorted(set(transaction)) for transaction in dataset]
    # Items that can still be part of a candidate at the current level.
    relevant_items = {item for transaction in transactions for item in transaction}
    # A set gives constant-time membership tests in the counting loop.
    candidates = {(item,) for item in relevant_items}
    k = 1
    debug_threshold = math.ceil(total_transactions / 100)

    result = []
    while candidates:
        candidate_counts = {}
        if debug:
            print(f'k={k}')
        for i, transaction in enumerate(transactions):
            if debug and i % debug_threshold == 0:
                print(f'{i}/{total_transactions}', end='\r')
            # Items outside relevant_items cannot occur in any candidate, so
            # dropping them first shrinks the number of combinations to test.
            pruned = [item for item in transaction if item in relevant_items]
            for c_t in itertools.combinations(pruned, r=k):
                if c_t in candidates:
                    candidate_counts[c_t] = candidate_counts.get(c_t, 0) + 1

        if debug:
            print(f'{total_transactions}/{total_transactions}')

        frequent_items = sorted(
            items
            for items, count in candidate_counts.items()
            if count / total_transactions >= minsupp
        )

        if debug:
            print(f'Frequent items fount: {len(frequent_items)}')

        result.extend(frequent_items)

        # Next level: every (k+1)-combination of the items that still occur in
        # some frequent k-item set. This is a superset of the classic Apriori
        # join/prune candidates, so no frequent item set is missed.
        relevant_items = {item for items in frequent_items for item in items}
        k += 1
        candidates = set(itertools.combinations(sorted(relevant_items), r=k))

    return result

In [41]:
# Mine the frequent item sets of the loaded transactions at the configured minimum support.
print(apriori(dataset, minsupp))

k=1
88162/88162
Frequent items fount: 5
k=2
88162/88162
Frequent items fount: 4
k=3
88162/88162
Frequent items fount: 0
[(32,), (38,), (39,), (41,), (48,), (38, 39), (39, 41), (39, 48), (41, 48)]
