In [28]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [29]:
# Step 1: Read the grocery CSV into a DataFrame (swap in your own file name)
df = pd.read_csv(
    "Groceries_dataset.csv",
)

In [30]:
# Step 2: Treat every (Member_number, Date) pair as one basket and gather
# the itemDescription values of that basket into a Python list.
basket_items = df.groupby(['Member_number', 'Date'])['itemDescription']
df_grouped = basket_items.apply(list)
df_grouped = df_grouped.reset_index()  # turn the group keys back into columns

In [31]:
# Step 3: Pull the per-basket item lists out as a plain list of lists
transactions = list(df_grouped['itemDescription'])

In [33]:
# Step 4: One-hot encode the transactions
te = TransactionEncoder()
te.fit(transactions)                      # learn the item vocabulary
te_array = te.transform(transactions)     # one row per transaction, one column per item
df_encoded = pd.DataFrame(data=te_array, columns=te.columns_)

In [34]:
# Step 5: Mine frequent itemsets from the one-hot encoded transactions
frequent_itemsets = apriori(
    df_encoded,
    min_support=0.002,    # support threshold passed to apriori
    use_colnames=True,    # report item names rather than column indices
)

In [35]:
# Step 6: Derive association rules, filtering on confidence >= 0.1
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.1,
)

In [36]:
# Step 7: Keep only the reporting columns and rank rules by descending lift
rule_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
rules_sorted = rules[rule_columns].sort_values(by='lift', ascending=False)

In [37]:
# Step 8: Print a heading, a blank line, then the ten highest-lift rules
print("Top 10 Association Rules:\n", rules_sorted.head(10), sep="\n")

Top 10 Association Rules:

                antecedents         consequents   support  confidence  \
23            (frankfurter)  (other vegetables)  0.005146    0.136283   
14              (chocolate)        (rolls/buns)  0.002807    0.118980   
25           (frozen meals)  (other vegetables)  0.002139    0.127490   
35                   (meat)  (other vegetables)  0.002139    0.126984   
30                    (ham)        (whole milk)  0.002740    0.160156   
3                 (berries)  (other vegetables)  0.002673    0.122699   
28  (fruit/vegetable juice)        (rolls/buns)  0.003743    0.110020   
6            (bottled beer)        (whole milk)  0.007151    0.157817   
42            (salty snack)  (other vegetables)  0.002205    0.117438   
10                  (candy)        (whole milk)  0.002139    0.148837   

        lift  
23  1.116150  
14  1.081592  
25  1.044134  
35  1.039991  
30  1.014142  
3   1.004899  
28  1.000136  
6   0.999330  
42  0.961807  
10  0.942468  


- Support tells us how frequently an item or itemset appears in the dataset.
- Confidence tells us how likely it is that a customer will buy item B, given that they have already bought item A.
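- Lift tells us how much more likely item B is purchased when A is purchased, compared to how often B is purchased overall.
It compares actual confidence to expected confidence (the support of B): a lift above 1 indicates a positive association, a lift near 1 means A and B are roughly independent, and a lift below 1 indicates a negative association.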