# Chap 14 - Ex2: apply ECLAT for dataset_group.csv

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sbn

# Display settings: have NumPy print floats in fixed-point notation rather
# than scientific notation, and let pandas use up to 1000 characters per line.
np.set_printoptions(suppress=True)
pd.options.display.width = 1000

In [14]:
# Load the raw order lines. The file is read with header=None, so the
# columns are labelled explicitly afterwards.
column_labels = ['Date', 'Order', 'Items']
dataset_group = pd.read_csv('Data/dataset_group.csv', header=None).set_axis(column_labels, axis=1)

dataset_group

Unnamed: 0,Date,Order,Items
0,2000-01-01,1,yogurt
1,2000-01-01,1,pork
2,2000-01-01,1,sandwich bags
3,2000-01-01,1,lunch meat
4,2000-01-01,1,all- purpose
...,...,...,...
22338,2002-02-26,1139,soda
22339,2002-02-26,1139,laundry detergent
22340,2002-02-26,1139,vegetables
22341,2002-02-26,1139,shampoo


## <span style = 'color:yellow'> Convert dataset_group df into transaction dataframe </span>

In [39]:
# Preview: one row per order, with that order's items gathered into a list.
(
    dataset_group
    .groupby('Order')['Items']
    .agg(list)
    .reset_index()
)

Unnamed: 0,Order,Items
0,1,"[yogurt, pork, sandwich bags, lunch meat, all-..."
1,2,"[toilet paper, shampoo, hand soap, waffles, ve..."
2,3,"[soda, pork, soap, ice cream, toilet paper, di..."
3,4,"[cereals, juice, lunch meat, soda, toilet pape..."
4,5,"[sandwich loaves, pasta, tortillas, mixes, han..."
...,...,...
1134,1135,"[sugar, beef, sandwich bags, hand soap, paper ..."
1135,1136,"[coffee/tea, dinner rolls, lunch meat, spaghet..."
1136,1137,"[beef, lunch meat, eggs, poultry, vegetables, ..."
1137,1138,"[sandwich bags, ketchup, milk, poultry, cheese..."


In [64]:
# Gather each order's items into one list. The aggregation is computed once
# and reused for both the order ids and the item lists, instead of re-running
# the same groupby pipeline for each of them.
items_by_order = dataset_group.groupby('Order')['Items'].agg(list)

transaction_id = items_by_order.index.to_list()  # order ids, in group order
item_lst = items_by_order.to_list()              # item list of each order

# Label every transaction as 'T<order id>'.
transaction_dict = {f'T{tid}': items for tid, items in zip(transaction_id, item_lst)}

# orient='index' gives one row per transaction and one column per item
# position; orders with fewer items are padded with missing values.
df_transaction = pd.DataFrame.from_dict(transaction_dict, orient='index')
df_transaction

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
T1,yogurt,pork,sandwich bags,lunch meat,all- purpose,flour,soda,butter,vegetables,beef,...,,,,,,,,,,
T2,toilet paper,shampoo,hand soap,waffles,vegetables,cheeses,mixes,milk,sandwich bags,laundry detergent,...,,,,,,,,,,
T3,soda,pork,soap,ice cream,toilet paper,dinner rolls,hand soap,spaghetti sauce,milk,ketchup,...,spaghetti sauce,pork,vegetables,cheeses,eggs,vegetables,vegetables,,,
T4,cereals,juice,lunch meat,soda,toilet paper,all- purpose,,,,,...,,,,,,,,,,
T5,sandwich loaves,pasta,tortillas,mixes,hand soap,toilet paper,vegetables,vegetables,paper towels,vegetables,...,all- purpose,soda,yogurt,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
T1135,sugar,beef,sandwich bags,hand soap,paper towels,paper towels,all- purpose,beef,fruits,coffee/tea,...,beef,cereals,juice,poultry,sugar,soap,,,,
T1136,coffee/tea,dinner rolls,lunch meat,spaghetti sauce,pasta,vegetables,cereals,dinner rolls,soap,milk,...,,,,,,,,,,
T1137,beef,lunch meat,eggs,poultry,vegetables,tortillas,beef,beef,individual meals,dishwashing liquid/detergent,...,vegetables,pork,,,,,,,,
T1138,sandwich bags,ketchup,milk,poultry,cheeses,soap,toilet paper,yogurt,beef,waffles,...,,,,,,,,,,


## <span style = 'color:yellow'> Use pyECLAT to create transaction_id_set from transaction dataframe </span>

In [73]:
from pyECLAT import ECLAT

# The 'T<order id>' row labels are dropped before the frame is handed to
# ECLAT (NOTE(review): presumably pyECLAT expects a default integer index -- confirm).
eclat_input = df_transaction.reset_index(drop=True)
eclat_instance = ECLAT(data=eclat_input, verbose=True)

# Mine itemsets of size 2 to 3 with support >= 0.3; items inside a combined
# itemset key are joined with ' & '.
fit_options = dict(
    min_support=0.3,
    min_combination=2,
    max_combination=3,
    separator=' & ',
)
# The result is indexed later: element [1] is read as a mapping from itemset
# key to support.
transaction_id_set = eclat_instance.fit(**fit_options)

100%|█████████████████████████████████████████████████████████████████████████████████| 39/39 [00:00<00:00, 122.96it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 39/39 [00:00<00:00, 3894.90it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 39/39 [00:00<00:00, 3844.10it/s]


Combination 2 by 2


703it [00:04, 153.25it/s]


Combination 3 by 3


8436it [00:56, 148.20it/s]


## <span style = 'color:yellow'> Create frequent_itemsets from transaction_id_set and eclat_instance.support </span>

In [78]:
# Support of every single item, as reported by the fitted ECLAT instance.
individual_item = eclat_instance.support()

# Padding cells of the transaction frame can show up as a None "item".
# pop() with a default removes it when present and is a no-op otherwise,
# whereas `del individual_item[None]` raises KeyError if there is no such key.
individual_item.pop(None, None)

# Supports of the multi-item itemsets; keys are item names joined by ' & '.
combined_supports = transaction_id_set[1]

# Single items become one-element lists so that every entry is a list of
# item names. The ' & ' used for splitting must match the `separator`
# passed to eclat_instance.fit().
itemset_lst = (
    [[item] for item in individual_item]
    + [itemset.split(' & ') for itemset in combined_supports]
)
support_lst = [*individual_item.values(), *combined_supports.values()]

df_tid = pd.DataFrame({
    'support': support_lst,
    'itemsets': itemset_lst
})

# Store itemsets as frozensets (hashable), the form in which the frame is
# later passed to mlxtend's association_rules.
df_tid['itemsets'] = df_tid['itemsets'].apply(frozenset)

df_tid

Unnamed: 0,support,itemsets
0,0.380158,(milk)
1,0.3705,(fruits)
2,0.394205,(waffles)
3,0.352941,(flour)
4,0.345917,(hand soap)
5,0.389816,(eggs)
6,0.378402,(toilet paper)
7,0.395961,(cereals)
8,0.375768,(mixes)
9,0.385426,(bagels)


## <span style = 'color:yellow'> Use mlxtend.frequent_patterns.association_rules to find association rules </span>

In [83]:
from mlxtend.frequent_patterns import association_rules

# Derive rules from the frequent itemsets, keeping those whose lift is at
# least 1.1 and reporting only the listed metrics.
reported_metrics = ['support', 'confidence', 'lift', 'leverage', 'conviction']
asc_rules = association_rules(
    df_tid,
    metric='lift',
    min_threshold=1.1,
    return_metrics=reported_metrics,
)
asc_rules

Unnamed: 0,antecedents,consequents,support,confidence,lift,leverage,conviction
0,(eggs),(vegetables),0.326602,0.837838,1.13337,0.038433,1.607989
1,(vegetables),(eggs),0.326602,0.441805,1.13337,0.038433,1.093139
2,(vegetables),(yogurt),0.319579,0.432304,1.124188,0.035304,1.084123
3,(yogurt),(vegetables),0.319579,0.83105,1.124188,0.035304,1.543388
4,(laundry detergent),(vegetables),0.309043,0.816705,1.104783,0.029311,1.4226
5,(vegetables),(laundry detergent),0.309043,0.418052,1.104783,0.029311,1.068134
