## Load Data

In [1]:
import numpy as np
import pandas as pd
from random import sample
from libs.utils import *

In [3]:
path_to_data = "C:/Users/Admin/DA_Projects/online_retail.txt"
min_support = 0.05
min_confidence = 0.6
path_to_order = "C:/Users/Admin/DA_Projects/order.txt"

In [63]:
order = load_order(path_to_order)
transactions = load_transactions(path_to_data, order)
sampled_transactions = transactions[:int(len(transactions) * 0.01)] 
filtered_transactions = [t for t in sampled_transactions if len(t) < 10]
if len(filtered_transactions) > 50:
    filtered_transactions = sample(filtered_transactions, 50)
flattened_items = [item for sublist in filtered_transactions for item in sublist]
order = sorted(set(flattened_items))
print(order)

['3 PIECE SPACEBOY COOKIE CUTTER SET', 'ALARM CLOCK BAKELIKE CHOCOLATE', 'ALARM CLOCK BAKELIKE GREEN', 'ALARM CLOCK BAKELIKE IVORY', 'ALARM CLOCK BAKELIKE ORANGE', 'ALARM CLOCK BAKELIKE PINK', 'ALARM CLOCK BAKELIKE RED', 'ANTIQUE GLASS DRESSING TABLE POT', 'ANTIQUE TALL SWIRLGLASS TRINKET POT', 'ASSORTED BOTTLE TOP  MAGNETS', 'ASSORTED COLOUR BIRD ORNAMENT', 'ASSORTED COLOUR MINI CASES', 'BAG 125g SWIRLY MARBLES', 'BAKING SET SPACEBOY DESIGN', 'BATH BUILDING BLOCK WORD', 'BLACK HEART CARD HOLDER', 'BLACK KITCHEN SCALES', 'BLACK RECORD COVER FRAME', 'BLUE BIRDHOUSE DECORATION', 'BLUE COAT RACK PARIS FASHION', 'BLUE DRAWER KNOB ACRYLIC EDWARDIAN', 'BREAD BIN DINER STYLE PINK', 'BREAD BIN DINER STYLE RED', 'CANDLEHOLDER PINK HANGING HEART', 'CERAMIC CAKE STAND + HANGING CAKES', 'CERAMIC HEART FAIRY CAKE MONEY BANK', 'CERAMIC STRAWBERRY CAKE MONEY BANK', 'CERAMIC STRAWBERRY DESIGN MUG', 'CHEST OF DRAWERS GINGHAM HEART', 'CHILDS BREAKFAST SET SPACEBOY', 'CHINESE DRAGON PAPER LANTERNS', 'CHO

In [35]:
sum = 0
for sublist in filtered_transactions:
    sum += len(sublist)
print(sum)

179


In [37]:
len(filtered_transactions)

50

In [39]:
len(order)

142

## Initialization 

In [41]:
C = {}
L = {}
itemset_size = 1
discarded = {itemset_size: []}
C.update({itemset_size: [[f] for f in order]})

In [43]:
C

{1: [['3 TRADITIONAL COOKIE CUTTERS  SET'],
  ['3 TRADITIONAl BISCUIT CUTTERS  SET'],
  ['ALARM CLOCK BAKELIKE CHOCOLATE'],
  ['ALARM CLOCK BAKELIKE GREEN'],
  ['ALARM CLOCK BAKELIKE IVORY'],
  ['ALARM CLOCK BAKELIKE ORANGE'],
  ['ALARM CLOCK BAKELIKE PINK'],
  ['ALARM CLOCK BAKELIKE RED'],
  ['ANT WHITE WIRE HEART SPIRAL'],
  ['ASSORTED BOTTLE TOP  MAGNETS'],
  ['ASSORTED COLOUR BIRD ORNAMENT'],
  ['ASSORTED COLOUR MINI CASES'],
  ['BAG 125g SWIRLY MARBLES'],
  ['BATH BUILDING BLOCK WORD'],
  ['BIRTHDAY PARTY CORDON BARRIER TAPE'],
  ['BLUE BIRDHOUSE DECORATION'],
  ['BLUE DRAWER KNOB ACRYLIC EDWARDIAN'],
  ['BREAD BIN DINER STYLE PINK'],
  ['BREAD BIN DINER STYLE RED'],
  ['CAKE PLATE LOVEBIRD WHITE'],
  ['CANDLEHOLDER PINK HANGING HEART'],
  ['CERAMIC CAKE STAND + HANGING CAKES'],
  ['CHEST OF DRAWERS GINGHAM HEART'],
  ['CHILDRENS DOLLY GIRL MUG'],
  ['CHILDRENS SPACEBOY MUG'],
  ['CHILDS GARDEN BRUSH PINK'],
  ['CHILLI LIGHTS'],
  ['CHINESE DRAGON PAPER LANTERNS'],
  ['CHOCOLATE H

__Create L1__

In [65]:
supp_count_L = {}
f, sup, new_discarded = get_frequent(C[itemset_size], filtered_transactions, min_support, discarded)
discarded.update({itemset_size : new_discarded})
L.update({itemset_size : f})
supp_count_L.update({itemset_size : sup})

In [67]:
print_table(L[1], supp_count_L[1])

Itemset | Frequency
['ASSORTED COLOUR BIRD ORNAMENT']  :  13
['CHOCOLATE HOT WATER BOTTLE']  :  11
['CREAM CUPID HEARTS COAT HANGER']  :  15
['GLASS STAR FROSTED T-LIGHT HOLDER']  :  15
['HAND WARMER BIRD DESIGN']  :  20
['HAND WARMER OWL DESIGN']  :  17
['HAND WARMER RED POLKA DOT']  :  15
['HAND WARMER SCOTTY DOG DESIGN']  :  20
['HAND WARMER UNION JACK']  :  29
['HOMEMADE JAM SCENTED CANDLES']  :  13
['HOT WATER BOTTLE BABUSHKA']  :  13
['HOT WATER BOTTLE TEA AND SYMPATHY']  :  14
['JAM MAKING SET PRINTED']  :  24
['JAM MAKING SET WITH JARS']  :  13
['JUMBO BAG RED RETROSPOT']  :  15
['KNITTED UNION FLAG HOT WATER BOTTLE']  :  21
["PAPER CHAIN KIT 50'S CHRISTMAS"]  :  17
['PAPER CHAIN KIT VINTAGE CHRISTMAS']  :  16
['RED WOOLLY HOTTIE WHITE HEART.']  :  26
['REGENCY CAKESTAND 3 TIER']  :  17
['SET 7 BABUSHKA NESTING BOXES']  :  21
['VINTAGE HEADS AND TAILS CARD GAME']  :  11
['WHITE HANGING HEART T-LIGHT HOLDER']  :  29
['WHITE METAL LANTERN']  :  16
['WOODEN FRAME ANTIQUE WHITE']  

## Apriori algorithm

In [69]:
k = itemset_size + 1
convergence = False
while not convergence:
    C.update({ k : join_set_itemsets(L[k-1], order)})
    f, sup, new_discarded = get_frequent(C[k], filtered_transactions, min_support, discarded) 
    discarded.update({k : new_discarded})
    L.update({k : f})
    supp_count_L.update({k : sup})
    if len(L[k]) == 0:
        convergence = True
    else:
        print("Table L{}: \n".format(k))
        print_table(L[k], supp_count_L[k])
    k += 1

Table L2: 

Itemset | Frequency
['CREAM CUPID HEARTS COAT HANGER', 'GLASS STAR FROSTED T-LIGHT HOLDER']  :  13
['CREAM CUPID HEARTS COAT HANGER', 'KNITTED UNION FLAG HOT WATER BOTTLE']  :  13
['CREAM CUPID HEARTS COAT HANGER', 'RED WOOLLY HOTTIE WHITE HEART.']  :  13
['CREAM CUPID HEARTS COAT HANGER', 'SET 7 BABUSHKA NESTING BOXES']  :  12
['CREAM CUPID HEARTS COAT HANGER', 'WHITE HANGING HEART T-LIGHT HOLDER']  :  13
['CREAM CUPID HEARTS COAT HANGER', 'WHITE METAL LANTERN']  :  13
['CREAM CUPID HEARTS COAT HANGER', 'WOODEN FRAME ANTIQUE WHITE']  :  12
['GLASS STAR FROSTED T-LIGHT HOLDER', 'KNITTED UNION FLAG HOT WATER BOTTLE']  :  15
['GLASS STAR FROSTED T-LIGHT HOLDER', 'RED WOOLLY HOTTIE WHITE HEART.']  :  15
['GLASS STAR FROSTED T-LIGHT HOLDER', 'SET 7 BABUSHKA NESTING BOXES']  :  14
['GLASS STAR FROSTED T-LIGHT HOLDER', 'WHITE HANGING HEART T-LIGHT HOLDER']  :  15
['GLASS STAR FROSTED T-LIGHT HOLDER', 'WHITE METAL LANTERN']  :  15
['GLASS STAR FROSTED T-LIGHT HOLDER', 'WOODEN FRAM

## Generating the Association rules

In [71]:
assoc_rules_str = ""
num_trans = len(filtered_transactions)
for i in range(1, len(L)):
    for j in range(len(L[i])):
        s = powerset(L[i][j])
        for z in s:
            S = set(z)
            X = set(L[i][j])
            X_S = set(X-S)
            sup_x = supp_count_L[i][j]
            sup_x_s = count_occurences(X_S, filtered_transactions)
            conf = sup_x / count_occurences(S, filtered_transactions)
            lift = conf / (sup_x_s / num_trans)
            if conf >= min_confidence:
                assoc_rules_str += write_rules(X, X_S, S, conf, sup_x, lift, num_trans)

In [57]:
print(assoc_rules_str)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [73]:
with open('apriori_result.txt', 'w') as f:
    f.write(assoc_rules_str)