# Exercise 2: Implement the A-Priori algorithm

Implement a version of the A-Priori algorithm on your own. You may assume your data is given as a list of baskets.

In [1]:
import pandas as pd
import numpy as np

In [21]:
### read a csv file and extract the shopping items list

df = pd.read_csv('data/Groceries_dataset.csv')
baskets = [list(set(a[1]['itemDescription'].tolist())) for a in list(df.groupby('Member_number'))]
baskets

### total size of baskets

# len(list(np.concatenate(baskets).flat))

[['salty snack',
  'misc. beverages',
  'soda',
  'pickled vegetables',
  'semi-finished bread',
  'hygiene articles',
  'yogurt',
  'pastry',
  'sausage',
  'canned beer',
  'whole milk'],
 ['beef',
  'white bread',
  'whipped/sour cream',
  'soda',
  'rolls/buns',
  'frankfurter',
  'curd',
  'sausage',
  'whole milk'],
 ['other vegetables',
  'butter',
  'tropical fruit',
  'sugar',
  'butter milk',
  'specialty chocolate',
  'frozen vegetables',
  'whole milk'],
 ['rolls/buns',
  'sausage',
  'frozen meals',
  'detergent',
  'root vegetables',
  'dental care'],
 ['chocolate',
  'other vegetables',
  'pip fruit',
  'shopping bags',
  'packaged fruit/vegetables',
  'rolls/buns',
  'hygiene articles',
  'pastry',
  'dish cleaner',
  'tropical fruit',
  'frozen fish',
  'cling film/bags',
  'canned beer',
  'root vegetables',
  'red/blush wine',
  'whole milk'],
 ['margarine', 'whipped/sour cream', 'rolls/buns'],
 ['chocolate',
  'skin care',
  'shopping bags',
  'chicken',
  'flour',


In [22]:
### unique #items

items = set(list(np.concatenate(baskets).flat))
len(items)

167

In [23]:
### hash all singletons
df_item_hash = pd.DataFrame(range(len(items)), index = list(items), columns =['hashcode'], dtype=int)
df_item_hash

Unnamed: 0,hashcode
popcorn,0
ham,1
decalcifier,2
softener,3
syrup,4
...,...
pastry,162
sausage,163
bags,164
domestic eggs,165


In [24]:
### count the items, store the count into the hashed array index

# item_count = pd.DataFrame(np.zeros((len(items),1)), index = list(items), columns =['count'], dtype=int)
item_count_arr = np.zeros((len(items),1))

for b in baskets:
    for item in b:
            idx = df_item_hash.loc[item,'hashcode']
            item_count_arr[idx] += 1
            


In [49]:
item_count_arr.shape

(167, 1)

In [46]:
### find frequent items with support > s1 (here s1 = 0.02), and hash back from array index to items           
freq_items  = [df_item_hash[df_item_hash['hashcode']==x].index[0] for x in np.where(item_count_arr > 0.02*len(baskets))[0]] 
freq_items

# item_count_arr[item_count['count']>0.02*len(baskets)]
#freq_items['hashcode'] = list(range(1,len(freq_items)+1))

['ham',
 'seasonal products',
 'other vegetables',
 'semi-finished bread',
 'roll products ',
 'bottled water',
 'chocolate',
 'waffles',
 'bottled beer',
 'hamburger meat',
 'flour',
 'frozen dessert',
 'cream cheese ',
 'citrus fruit',
 'turkey',
 'meat',
 'pip fruit',
 'liquor',
 'canned fish',
 'pot plants',
 'specialty chocolate',
 'whipped/sour cream',
 'cat food',
 'dishes',
 'spread cheese',
 'packaged fruit/vegetables',
 'butter',
 'pickled vegetables',
 'hygiene articles',
 'rolls/buns',
 'tropical fruit',
 'frozen meals',
 'root vegetables',
 'pet care',
 'canned vegetables',
 'UHT-milk',
 'herbs',
 'mustard',
 'detergent',
 'red/blush wine',
 'white bread',
 'ice cream',
 'canned beer',
 'frozen vegetables',
 'frozen fish',
 'onions',
 'yogurt',
 'frankfurter',
 'specialty bar',
 'salt',
 'grapes',
 'candy',
 'chicken',
 'curd',
 'soft cheese',
 'butter milk',
 'sliced cheese',
 'sugar',
 'condensed milk',
 'pasta',
 'dessert',
 'shopping bags',
 'baking powder',
 'salty sn

In [48]:
### hash the frequent items (starting from 1)

df_freq_item_hash = pd.DataFrame(range(1,len(freq_items)+1), index=freq_items, columns=['hashcode'])
df_freq_item_hash

Unnamed: 0,hashcode
ham,1
seasonal products,2
other vegetables,3
semi-finished bread,4
roll products,5
...,...
hard cheese,83
pastry,84
sausage,85
domestic eggs,86


In [7]:
### triangular array encode function, (not used)
# def triangular_encode(i,j,n):
#     return int((i-1)*(n-i/2)+j-i)

In [50]:
### count the pairs using only frequent items, store the count into the (triangular) matrix.

# pair_mat = pd.DataFrame(np.zeros((len(freq_items.index),len(freq_items.index))), 
#                         columns=freq_items.index, index=freq_items.index,
#                        dtype=int)

pair_mat_hashed = np.zeros((len(freq_items)+1,len(freq_items)+1))
# n = len(freq_items)
# triangular_arr = np.zeros((n*n,))


for b in baskets:
    cand_list = [item for item in b if item in freq_items]
    if len(cand_list)<2:
        continue
    for idx, item1 in enumerate(cand_list):
        for item2 in cand_list[idx+1:]:
            i = df_freq_item_hash.loc[item1,'hashcode'] 
            j = df_freq_item_hash.loc[item2,'hashcode'] 
            #triangular_arr[triangular_encode(i,j,n)] +=1
            #pair_mat.loc[item1, item2] += 1
            pair_mat_hashed[max(i,j),min(i,j)]+=1

# pair_mat
pair_mat_hashed

array([[  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  0.,   7.,   0., ...,   0.,   0.,   0.],
       ...,
       [  0.,  57.,  19., ...,   0.,   0.,   0.],
       [  0.,  43.,  15., ..., 123.,   0.,   0.],
       [  0.,  28.,  12., ...,  99.,  82.,   0.]])

In [51]:
### extract frequent pairs that exceed support s2 (assume s2 = 0.02), and hash back.

freq_pairs = [[df_freq_item_hash[df_freq_item_hash['hashcode']==x].index[0], df_freq_item_hash[df_freq_item_hash['hashcode']==y].index[0]] for x, y in zip(*np.where(pair_mat_hashed > 0.02*len(baskets)))]
freq_pairs
# freq_pairs = [[freq_itemset[x], freq_itemset[y]] for x, y in zip(*np.where(pair_mat.values > 0.02*len(baskets)))]

[['other vegetables', 'ham'],
 ['bottled water', 'other vegetables'],
 ['chocolate', 'other vegetables'],
 ['chocolate', 'bottled water'],
 ['waffles', 'other vegetables'],
 ['bottled beer', 'other vegetables'],
 ['bottled beer', 'bottled water'],
 ['hamburger meat', 'other vegetables'],
 ['hamburger meat', 'bottled water'],
 ['cream cheese ', 'other vegetables'],
 ['cream cheese ', 'bottled water'],
 ['citrus fruit', 'other vegetables'],
 ['citrus fruit', 'bottled water'],
 ['citrus fruit', 'bottled beer'],
 ['meat', 'other vegetables'],
 ['pip fruit', 'other vegetables'],
 ['pip fruit', 'bottled water'],
 ['pip fruit', 'bottled beer'],
 ['pip fruit', 'citrus fruit'],
 ['specialty chocolate', 'other vegetables'],
 ['whipped/sour cream', 'other vegetables'],
 ['whipped/sour cream', 'bottled water'],
 ['whipped/sour cream', 'bottled beer'],
 ['whipped/sour cream', 'citrus fruit'],
 ['whipped/sour cream', 'pip fruit'],
 ['cat food', 'other vegetables'],
 ['butter', 'other vegetables'],
 

In [52]:
len(freq_pairs)

499

# Exercise 3: Use built in tools
Use/import the following Python packages: Pandas and MLxtend.  
Especially, have a look at apriori and association rules from mlxtend.frequent patterns.  
For documentation see: http://rasbt.github.io/mlxtend/

If helpful / desirable you might also use TransactionEncoder from mlxtend.preprocessing to clean / prepare your data.

The task: determine:
1. the frequent pairs of items.
2. the association rules of high confidence with or w/o high lift.
3. (optional) the association rules of high confidence with or w/o high interest. (optional)

In [11]:
# ! pip install mlxtend

In [53]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

**Solutions**  
1. The frequent pairs of items.  
**I'm mainly refering to this documentation examples:** [reference](https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/#apriori-frequent-itemsets-via-the-apriori-algorithm)

In [54]:
te = TransactionEncoder()
te_ary = te.fit(baskets).transform(baskets)
df_one_hot = pd.DataFrame(te_ary, columns=te.columns_)
df_one_hot

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
1,False,False,False,False,False,False,False,False,True,False,...,False,False,False,True,False,True,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3893,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3894,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,False,False
3895,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3896,False,False,False,False,False,False,False,False,False,True,...,False,False,False,True,False,False,False,False,True,False


In [56]:
frq_items = apriori(df_one_hot, min_support = 0.02, use_colnames = True)
frq_items['length'] = frq_items['itemsets'].apply(lambda x: len(x))
frq_items

Unnamed: 0,support,itemsets,length
0,0.078502,(UHT-milk),1
1,0.031042,(baking powder),1
2,0.119548,(beef),1
3,0.079785,(berries),1
4,0.062083,(beverages),1
...,...,...,...
889,0.027963,"(soda, other vegetables, yogurt, whole milk)",4
890,0.021293,"(tropical fruit, other vegetables, yogurt, who...",4
891,0.021036,"(soda, sausage, rolls/buns, whole milk)",4
892,0.022832,"(yogurt, rolls/buns, sausage, whole milk)",4


In [59]:
### reformat a little, to put the frozenset into lists

ml_frq_items = []
for i in frq_items[frq_items['length']==1].itemsets.values:
    ml_frq_items.extend(list(i))
    
ml_frq_pairs = []
for i in frq_items[frq_items['length']==2].itemsets.values:
    ml_frq_pairs.append(list(i))

### check if the frequent itemsets found by ourselves and mlxtend are the same
for i in ml_frq_items:
    if i not in freq_items:
        print(i)
        
len(ml_frq_pairs)==len(freq_pairs)

True

2. the association rules of high confidence with or w/o high lift.  
[doc example](https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/#association_rules-association-rules-generation-from-frequent-itemsets)

In [60]:
association_rules(frq_items, metric="confidence", min_threshold=0.6)

TypeError: association_rules() missing 1 required positional argument: 'num_itemsets'

In [61]:
association_rules(frq_items, metric="lift", min_threshold=1.2)

TypeError: association_rules() missing 1 required positional argument: 'num_itemsets'