In [1]:
SUPPORT_THRESHOLD = 5.9 # To avoid the mistake of using > instead of >=
CONFIDENCE_THRESHOLD = 0.749999

dataset = """1 A,B,C,D
2 B,C,D,E,G
3 A,C,G,H,K
4 B,C,D,E,K
5 D,E,F,H,L
6 A,B,C,D,E,L
7 A,D,E,F,L
8 B,I,K,L
9 C,D,F,L
10 A,B,D,E,K
11 C,D,H,I,K
12 B,C,E,K
13 B,C,D,F
14 A,B,C,D
15 C,H,I,J
16 A,E,F,H,L
17 H,K,L
18 A,B,D,H,K
19 D,E,K
20 B,C,D,E,H"""

transactions = []
items = {}

from collections import namedtuple
Itemset = namedtuple('Itemset', 'items transactions hv support')

def symbol(transaction):
    return [item if item in transaction else '?' for item in items.keys()]
    

for line in dataset.split("\n"):
    i, stuff = line.split()
    transaction = set(stuff.split(','))
    for item in transaction:
        l = items.get(item,[])
        l.append(str(i))
        items[item] = l
        
    transactions.append(transaction)
 
# Create CSV
print ",".join(items.keys())
for t in transactions:
    print ",".join(symbol(t))


A,C,B,E,D,G,F,I,H,K,J,L
A,C,B,?,D,?,?,?,?,?,?,?
?,C,B,E,D,G,?,?,?,?,?,?
A,C,?,?,?,G,?,?,H,K,?,?
?,C,B,E,D,?,?,?,?,K,?,?
?,?,?,E,D,?,F,?,H,?,?,L
A,C,B,E,D,?,?,?,?,?,?,L
A,?,?,E,D,?,F,?,?,?,?,L
?,?,B,?,?,?,?,I,?,K,?,L
?,C,?,?,D,?,F,?,?,?,?,L
A,?,B,E,D,?,?,?,?,K,?,?
?,C,?,?,D,?,?,I,H,K,?,?
?,C,B,E,?,?,?,?,?,K,?,?
?,C,B,?,D,?,F,?,?,?,?,?
A,C,B,?,D,?,?,?,?,?,?,?
?,C,?,?,?,?,?,I,H,?,J,?
A,?,?,E,?,?,F,?,H,?,?,L
?,?,?,?,?,?,?,?,H,K,?,L
A,?,B,?,D,?,?,?,H,K,?,?
?,?,?,E,D,?,?,?,?,K,?,?
?,C,B,E,D,?,?,?,H,?,?,?


Create candidates, 

In [2]:
def join_items(A, B):
    diff = A.items.symmetric_difference(B.items)
    
    if not len(diff) == 2: return None
    
    transactions = A.transactions & B.transactions
    
    if not len(transactions) >= SUPPORT_THRESHOLD: return None
    
    items = A.items | B.items
    
    values = list(items)
    values.sort()
    
    return Itemset(items = items, transactions = transactions, hv = "".join(values), support=len(transactions) )

In [3]:
all_supported_itemsets = {}
current_itemsets = {}
all_single_itemsets = {}

for item, trans in items.items():
    
    hv = "".join(item) # sets are unhashable, could make custom set
    support = len(trans) # cheaper to store than call len(transactions?)
    itemset = Itemset(items = {item}, transactions = set(trans), hv=hv, support=support)
    
    all_single_itemsets[hv] = itemset
    if len(trans) >= SUPPORT_THRESHOLD:
        current_itemsets[hv] = itemset
    
all_supported_itemsets.update(current_itemsets)  

values = sorted(all_single_itemsets.values(), key=lambda itemset: -itemset.support)

ordered = {list(k.items)[-1] :v for v,k in enumerate(values)}

print ordered

        
while current_itemsets:
    from itertools import combinations
    combs = combinations(current_itemsets.values(), 2) # Self Join
    
    current_itemsets = {} # clear
    
    for c,d in combs: # for each candidate
        itemset = join_items(c,d)
        if itemset:   
            current_itemsets[itemset.hv] = itemset
    
    print "\n\nNEW ITEMSETS\n=============\n"
    for cd in current_itemsets:
        print cd
    all_supported_itemsets.update(current_itemsets)

{'A': 5, 'C': 1, 'B': 2, 'E': 3, 'D': 0, 'G': 10, 'F': 8, 'I': 9, 'H': 6, 'K': 4, 'J': 11, 'L': 7}


NEW ITEMSETS

BD
BE
AD
BC
DE
CD


NEW ITEMSETS

BCD


NEW ITEMSETS



In [8]:
def write_line(ev, it):
    consequent = list(ev.items - it.items)
    consequent.sort()
    confidence = ev.support*1.0/all_supported_itemsets["".join(consequent)].support
    support = len(ev.items)*1.0/it.support
    if confidence > CONFIDENCE_THRESHOLD:
        print ", ".join(consequent),"=> ", it.hv, "confidence =", confidence, "support =",support 

for ev in all_supported_itemsets.values():
    if len(ev.items) == 1:
        continue 
        
    for ite in ev.items:
        it = all_supported_itemsets[ite]
        write_line(ev,it)

B =>  D confidence = 0.818181818182 support = 0.142857142857
B, D =>  C confidence = 0.777777777778 support = 0.25
C, D =>  B confidence = 0.777777777778 support = 0.272727272727
B, C =>  D confidence = 0.875 support = 0.214285714286
E =>  D confidence = 0.8 support = 0.142857142857
C =>  D confidence = 0.75 support = 0.142857142857
A =>  D confidence = 0.75 support = 0.142857142857


In [5]:
Node = namedtuple('Node', 'item support children')
head = Node(item=None, support={}, children = {})

for t in transactions:
    ordered_transaction = sorted(t, key=lambda x: ordered[x])
    print ordered_transaction
    parent = head
    
    
    for item in ordered_transaction:
        
        child = parent.children.get(item, None)
        
        if child:
            child.support['s'] += 1
        else:
            new_item = Node(item = item, support = {'s':1}, children = {})
            parent.children[item] = new_item
            child = new_item
            
        parent = child

    print "\n\n\n\n", head

['D', 'C', 'B', 'A']




Node(item=None, support={}, children={'D': Node(item='D', support={'s': 1}, children={'C': Node(item='C', support={'s': 1}, children={'B': Node(item='B', support={'s': 1}, children={'A': Node(item='A', support={'s': 1}, children={})})})})})
['D', 'C', 'B', 'E', 'G']




Node(item=None, support={}, children={'D': Node(item='D', support={'s': 2}, children={'C': Node(item='C', support={'s': 2}, children={'B': Node(item='B', support={'s': 2}, children={'A': Node(item='A', support={'s': 1}, children={}), 'E': Node(item='E', support={'s': 1}, children={'G': Node(item='G', support={'s': 1}, children={})})})})})})
['C', 'K', 'A', 'H', 'G']




Node(item=None, support={}, children={'C': Node(item='C', support={'s': 1}, children={'K': Node(item='K', support={'s': 1}, children={'A': Node(item='A', support={'s': 1}, children={'H': Node(item='H', support={'s': 1}, children={'G': Node(item='G', support={'s': 1}, children={})})})})}), 'D': Node(item='D', support={'s': 2}, chi

In [6]:
#Depth-First Traversal
nodes = []

for n in head.children.values():
        nodes.append([n])

while nodes:
    node_list = nodes.pop()
    print "-> ".join([x.item for x in node_list]), "Support =", node_list[-1].support['s']
    node = node_list[-1]
    for n in node.children.values():
        nodes.append(node_list + [n])

D Support = 14
D-> E Support = 3
D-> E-> K Support = 1
D-> E-> H Support = 1
D-> E-> H-> L Support = 1
D-> E-> H-> L-> F Support = 1
D-> E-> A Support = 1
D-> E-> A-> L Support = 1
D-> E-> A-> L-> F Support = 1
D-> B Support = 2
D-> B-> E Support = 1
D-> B-> E-> K Support = 1
D-> B-> E-> K-> A Support = 1
D-> B-> K Support = 1
D-> B-> K-> A Support = 1
D-> B-> K-> A-> H Support = 1
D-> C Support = 9
D-> C-> L Support = 1
D-> C-> L-> F Support = 1
D-> C-> B Support = 7
D-> C-> B-> F Support = 1
D-> C-> B-> E Support = 4
D-> C-> B-> E-> G Support = 1
D-> C-> B-> E-> K Support = 1
D-> C-> B-> E-> H Support = 1
D-> C-> B-> E-> A Support = 1
D-> C-> B-> E-> A-> L Support = 1
D-> C-> B-> A Support = 2
D-> C-> K Support = 1
D-> C-> K-> H Support = 1
D-> C-> K-> H-> I Support = 1
E Support = 1
E-> A Support = 1
E-> A-> H Support = 1
E-> A-> H-> L Support = 1
E-> A-> H-> L-> F Support = 1
B Support = 1
B-> K Support = 1
B-> K-> L Support = 1
B-> K-> L-> I Support = 1
C Support = 3
C-> B Support