In [12]:
import pandas as pd
from collections import defaultdict
from itertools import combinations


In [13]:
# Step 1: Load dataset
df = pd.read_csv("Groceries_dataset.csv")

In [14]:
# Step 2: Group items by transaction (same customer & date = one transaction)
df['itemDescription'] = df['itemDescription'].astype(str)
grouped = df.groupby(['Member_number', 'Date'])['itemDescription'].apply(list)
transactions = grouped.tolist()
transactions

[['sausage', 'whole milk', 'semi-finished bread', 'yogurt'],
 ['whole milk', 'pastry', 'salty snack'],
 ['canned beer', 'misc. beverages'],
 ['sausage', 'hygiene articles'],
 ['soda', 'pickled vegetables'],
 ['frankfurter', 'curd'],
 ['sausage', 'whole milk', 'rolls/buns'],
 ['whole milk', 'soda'],
 ['beef', 'white bread'],
 ['frankfurter', 'soda', 'whipped/sour cream'],
 ['frozen vegetables', 'other vegetables'],
 ['butter', 'whole milk'],
 ['tropical fruit', 'sugar'],
 ['butter milk', 'specialty chocolate'],
 ['sausage', 'rolls/buns'],
 ['root vegetables', 'detergent'],
 ['frozen meals', 'dental care'],
 ['rolls/buns', 'rolls/buns'],
 ['dish cleaner', 'cling film/bags'],
 ['canned beer', 'frozen fish'],
 ['other vegetables', 'hygiene articles'],
 ['pip fruit', 'whole milk', 'tropical fruit'],
 ['rolls/buns', 'red/blush wine', 'chocolate'],
 ['other vegetables', 'shopping bags'],
 ['whole milk', 'chocolate', 'packaged fruit/vegetables', 'rolls/buns'],
 ['root vegetables', 'whole milk'

ðŸ”¹ vertical_db = defaultdict(set)
Kya ho raha hai?

vertical_db : ek dictionary banayi gayi hai jisme har key (item) ka value ek set hai.

defaultdict(set) ka matlab hai: agar koi key exist nahi karti, to by default wo key ek empty set se initialize ho jaayegi.

ðŸ“Œ Why set?

Ek item ka multiple transaction IDs store karna hai, without duplicates.

In [15]:
# Step 3: Create TID-list (Vertical format)
vertical_db = defaultdict(set)
for tid, items in enumerate(transactions):
    for item in items:
        vertical_db[item].add(tid)

In [16]:
print("ðŸ“¦ Items and their Transaction IDs:")
for item, tids in vertical_db.items():
    print(f"Item: {item} => Transaction IDs: {sorted(list(tids))}")


ðŸ“¦ Items and their Transaction IDs:
Item: sausage => Transaction IDs: [0, 3, 6, 14, 58, 85, 93, 98, 124, 131, 141, 215, 220, 272, 275, 311, 315, 322, 327, 329, 330, 344, 412, 455, 483, 488, 547, 570, 571, 597, 664, 672, 673, 718, 725, 748, 750, 759, 788, 800, 809, 831, 851, 872, 876, 884, 888, 922, 926, 938, 942, 954, 988, 995, 997, 1006, 1007, 1057, 1058, 1076, 1084, 1095, 1109, 1118, 1127, 1149, 1180, 1190, 1191, 1195, 1202, 1209, 1212, 1229, 1313, 1330, 1333, 1353, 1363, 1424, 1438, 1441, 1444, 1447, 1455, 1462, 1468, 1469, 1481, 1483, 1502, 1503, 1505, 1591, 1601, 1679, 1682, 1683, 1687, 1690, 1699, 1739, 1740, 1746, 1747, 1762, 1764, 1774, 1789, 1816, 1823, 1837, 1845, 1850, 1865, 1876, 1888, 1905, 1921, 1930, 1936, 1945, 1951, 1971, 1972, 1984, 2022, 2032, 2048, 2077, 2120, 2126, 2144, 2157, 2163, 2182, 2184, 2190, 2204, 2206, 2231, 2236, 2250, 2299, 2373, 2413, 2423, 2456, 2462, 2468, 2471, 2517, 2523, 2551, 2580, 2586, 2662, 2663, 2669, 2678, 2695, 2701, 2714, 2716, 2751, 276

In [17]:
# Step 4: Minimum support threshold (e.g., 0.2% of total transactions)
min_support = int(0.002 * len(transactions)) #kitni baar ek itemset aana chahiye minimum, tabhi hum usse "frequent" maanein.

In [18]:
# Step 5: Find frequent 1-itemsets
frequent_itemsets = {}
for item, tids in vertical_db.items():
    if len(tids) >= min_support:
        frequent_itemsets[frozenset([item])] = tids 

print(frequent_itemsets)

{frozenset({'sausage'}): {0, 2048, 12290, 3, 12291, 6, 4102, 12295, 10250, 14, 12305, 12307, 6164, 4117, 6166, 10263, 8218, 2077, 6177, 14372, 12331, 6188, 10284, 4147, 6196, 6201, 58, 10298, 12345, 4157, 6205, 14396, 8256, 10307, 14406, 2120, 2126, 6226, 14420, 85, 12379, 93, 10333, 2144, 98, 6244, 6245, 12388, 4199, 8295, 4203, 10348, 2157, 6253, 14446, 14447, 8306, 2163, 4214, 6264, 8312, 12409, 124, 4220, 131, 12419, 12421, 2182, 6279, 2184, 6280, 141, 2190, 6288, 10386, 12436, 4249, 8345, 2204, 2206, 14500, 6310, 14503, 10410, 14509, 6318, 4271, 12463, 14510, 2231, 14522, 2236, 12477, 12482, 10436, 8390, 12489, 2250, 4306, 6358, 215, 10454, 14552, 14554, 220, 8419, 12520, 10481, 8434, 6394, 2299, 4347, 8444, 10490, 12541, 12543, 6402, 6406, 8458, 4366, 8462, 272, 14609, 8466, 275, 12562, 8471, 8472, 14621, 8486, 4393, 12586, 10539, 14635, 8493, 14638, 311, 6455, 8505, 10553, 315, 12602, 322, 12610, 2373, 4421, 327, 4423, 329, 330, 14667, 8526, 12627, 8532, 344, 12280, 14685, 8543,

In [19]:
#Jo 1-item wale frequent sets mile the, unka copy bana rahe hain.
final_itemsets = frequent_itemsets.copy()
#Ab hum 2-item combinations dhundhna start karenge.
k = 2
#Loop jab tak chalti rahegi jab tak naye combos milte rahenge.
while True:
    #Abhi tak ke saare frequent itemsets list me le lo (1-item, phir 2-item...).
    itemsets = list(frequent_itemsets.keys())
    next_itemsets = {}

    #Har itemset ko doosre itemset ke saath combine kar rahe hain.
    for i in range(len(itemsets)):
        for j in range(i + 1, len(itemsets)):
            # Do itemsets ko union kar rahe hain.
            combined = itemsets[i] | itemsets[j]
            #Sirf wahi combos lo jinke items ka size abhi ke k ke barabar ho.
            if len(combined) == k:
                #Common customers (TIDs) dekh rahe hain jinhone dono items ek saath kharide.
                tids = frequent_itemsets[itemsets[i]] & frequent_itemsets[itemsets[j]]
                #Agar ye combo bhi minimum support ko satisfy karta hai,
                if len(tids) >= min_support:
                    #Store kar lo next_itemsets me.
                    next_itemsets[combined] = tids


    #Agar naye combos nahi mile, toh stop kar do loop.
    if not next_itemsets:
        break  # Stop if no more frequent itemsets found
    
    #Ab naye mile combos ko base bana lo, next iteration ke liye.
    frequent_itemsets = next_itemsets
    #Final list me naye frequent combos add kar do.
    final_itemsets.update(next_itemsets)
    #Next round me ek item aur badhao (3-itemset banane ke liye).
    k += 1


In [20]:
#Ek khaali list bana rahe hain jisme hum final results store karenge.
results = []
#Har itemset (jaise milk, bread) aur uska TID list (kin logon ne kharida) utha rahe hain.
for items, tids in final_itemsets.items():
    results.append({
        #Itemset ko readable string mein convert kar rahe hain. Example: {bread, milk} â†’ "bread, milk"
        'Itemset': ', '.join(sorted(list(items))),
        'Support': round(len(tids) / len(transactions), 4)
    })

df_results = pd.DataFrame(results).sort_values(by='Support', ascending=False)

In [21]:
# Step 8: Display top frequent itemsets
print("Top Frequent Itemsets using ECLAT Algorithm:")
print(df_results)

Top Frequent Itemsets using ECLAT Algorithm:
                      Itemset  Support
1                  whole milk   0.1579
18           other vegetables   0.1221
13                 rolls/buns   0.1100
9                        soda   0.0971
3                      yogurt   0.0859
..                        ...      ...
126               light bulbs   0.0019
116             specialty fat   0.0019
113          artif. sweetener   0.0019
85                       fish   0.0019
164  frozen meals, whole milk   0.0019

[347 rows x 2 columns]
