# Chap 14 - Ex1: apply ECLAT for store_data.csv

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sbn

np.set_printoptions(suppress = True)
pd.set_option('display.width', 1000)

In [5]:
store_data = pd.read_csv('Data/store_data.csv')

store_data

Unnamed: 0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
0,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
1,chutney,,,,,,,,,,,,,,,,,,,
2,turkey,avocado,,,,,,,,,,,,,,,,,,
3,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,
4,low fat yogurt,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,butter,light mayo,fresh bread,,,,,,,,,,,,,,,,,
7496,burgers,frozen vegetables,eggs,french fries,magazines,green tea,,,,,,,,,,,,,,
7497,chicken,,,,,,,,,,,,,,,,,,,
7498,escalope,green tea,,,,,,,,,,,,,,,,,,


## <span style = 'color:yellow'> Convert store_data df into transaction dataframe

In [8]:
transaction_dict = {}

for row in range(store_data.shape[0]):
    transaction_id = f'T{row}'
    item_lst = [col for col in store_data.columns if pd.notna(store_data.loc[row, col])]
    transaction_dict[transaction_id] = item_lst

df_transaction = pd.DataFrame.from_dict(data=transaction_dict, orient = 'index')

df_transaction

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
T0,shrimp,almonds,avocado,,,,,,,,,,,,,,,,
T1,shrimp,,,,,,,,,,,,,,,,,,
T2,shrimp,almonds,,,,,,,,,,,,,,,,,
T3,shrimp,almonds,avocado,vegetables mix,green grapes,,,,,,,,,,,,,,
T4,shrimp,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
T7495,shrimp,almonds,avocado,,,,,,,,,,,,,,,,
T7496,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,,,,,,,,,,,,,
T7497,shrimp,,,,,,,,,,,,,,,,,,
T7498,shrimp,almonds,,,,,,,,,,,,,,,,,


## <span style = 'color:yellow'> Use pyECLAT to create transaction_id_set from transaction dataframe

In [11]:
from pyECLAT import ECLAT

eclat_instance = ECLAT(data = df_transaction.reset_index(drop=True), verbose = True)

transaction_id_set = eclat_instance.fit(min_support=0.3, min_combination=2, max_combination=3, separator=' & ')

#print(transaction_id_set)

100%|█████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 122.44it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 1114.13it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 3857.01it/s]


Combination 2 by 2


10it [00:00, 104.02it/s]


Combination 3 by 3


10it [00:00, 100.85it/s]


## <span style = 'color:yellow'> Create frequent_itemsets from transaction_id_set and eclat_instance.support

In [14]:
individual_item = eclat_instance.support()
del individual_item[None] #use this if there is None in individual_item dictionary

# Fix individual items to be singleton lists
itemset_lst = [[item] for item in individual_item.keys()] + [itemset.split(' & ') for itemset in transaction_id_set[1].keys()]
support_lst = list(individual_item.values()) + list(transaction_id_set[1].values())

df_tid = pd.DataFrame({
    'support': support_lst,
    'itemsets': itemset_lst
})

# Now safely convert to frozenset
df_tid['itemsets'] = df_tid['itemsets'].apply(frozenset)

df_tid

Unnamed: 0,support,itemsets
0,0.0004,(antioxydant juice)
1,0.1824,(yams)
2,0.0032,(mineral water)
3,0.585067,(avocado)
4,0.130667,(cottage cheese)
5,0.034,(low fat yogurt)
6,0.000267,(spinach)
7,0.445867,(vegetables mix)
8,0.052533,(tomato juice)
9,0.0004,(frozen smoothie)


## <span style = 'color:yellow'> Use mlxtend.frequent_patterns.association_rules to find association rules

In [19]:
from mlxtend.frequent_patterns import association_rules

## Find association rules within itemsets
asc_rules = association_rules(df_tid, metric='confidence', min_threshold=0.3,
                             return_metrics=['support', 'confidence', 'lift', 'leverage', 'conviction'])
asc_rules

Unnamed: 0,antecedents,consequents,support,confidence,lift,leverage,conviction
0,(avocado),(vegetables mix),0.445867,0.762078,1.709207,0.185005,2.329058
1,(vegetables mix),(avocado),0.445867,1.000000,1.709207,0.185005,inf
2,(avocado),(green grapes),0.337067,0.576117,1.709207,0.139860,1.563952
3,(green grapes),(avocado),0.337067,1.000000,1.709207,0.139860,inf
4,(almonds),(avocado),0.585067,0.763662,1.305256,0.136828,1.755675
...,...,...,...,...,...,...,...
75,"(almonds, shrimp)",(green grapes),0.337067,0.439958,1.305256,0.078829,1.183721
76,"(shrimp, green grapes)",(almonds),0.337067,1.000000,1.305256,0.078829,inf
77,(almonds),"(shrimp, green grapes)",0.337067,0.439958,1.305256,0.078829,1.183721
78,(green grapes),"(almonds, shrimp)",0.337067,1.000000,1.305256,0.078829,inf
