In [45]:
import pandas as pd
from apyori import apriori

# Load dataset without header
file_path = "D:\\Assignments questions\\Association Rules\\Online retail.xlsx"
df = pd.read_excel(file_path, header=None)

print(df.head())


                                                   0
0  shrimp,almonds,avocado,vegetables mix,green gr...
1                             burgers,meatballs,eggs
2                                            chutney
3                                     turkey,avocado
4  mineral water,milk,energy bar,whole wheat rice...


In [52]:
# pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.23.4-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.4-py3-none-any.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   ----------------------- ---------------- 0.8/1.4 MB 3.1 MB/s eta 0:00:01
   ---------------------------------------- 1.4/1.4 MB 3.7 MB/s eta 0:00:00
Installing collected packages: mlxtend
Successfully installed mlxtend-0.23.4
Note: you may need to restart the kernel to use updated packages.


In [53]:
# Assume df already has 1 column with items per transaction
# Split items into lists
transactions = df[0].apply(lambda x: x.split(','))

# Convert to one-hot encoded basket format
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_data = te.fit(transactions).transform(transactions)

basket = pd.DataFrame(te_data, columns=te.columns_).astype(int)

print("Preprocessing complete. Basket format is ready for Apriori")
print(basket.head())

Preprocessing complete. Basket format is ready for Apriori
    asparagus  almonds  antioxydant juice  asparagus  avocado  babies food  \
0           0        1                  1          0        1            0   
1           0        0                  0          0        0            0   
2           0        0                  0          0        0            0   
3           0        0                  0          0        1            0   
4           0        0                  0          0        0            0   

   bacon  barbecue sauce  black tea  blueberries  ...  turkey  vegetables mix  \
0      0               0          0            0  ...       0               1   
1      0               0          0            0  ...       0               0   
2      0               0          0            0  ...       0               0   
3      0               0          0            0  ...       1               0   
4      0               0          0            0  ...       0      

In [61]:
# Step 1: Find frequent itemsets using Apriori
frequent_itemsets = apriori(basket, min_support=0.01, use_colnames=True)

# Show top itemsets
print("Frequent Itemsets:")
print(frequent_itemsets.head())



Frequent Itemsets:
    support             itemsets
0  0.029366            (almonds)
1  0.011206  (antioxydant juice)
2  0.045981            (avocado)
3  0.012558              (bacon)
4  0.015456     (barbecue sauce)


In [62]:
# Step 2: Generate rules from frequent itemsets
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)

# Show top rules
print("Association Rules:")
print(rules.head())

Association Rules:
       antecedents      consequents  antecedent support  consequent support  \
0  (mineral water)        (almonds)            0.299845            0.029366   
1        (almonds)  (mineral water)            0.029366            0.299845   
2        (avocado)      (chocolate)            0.045981            0.205178   
3      (chocolate)        (avocado)            0.205178            0.045981   
4   (french fries)        (avocado)            0.192620            0.045981   

    support  confidence      lift  representativity  leverage  conviction  \
0  0.011012    0.036727  1.250644               1.0  0.002207    1.007641   
1  0.011012    0.375000  1.250644               1.0  0.002207    1.120247   
2  0.010240    0.222689  1.085347               1.0  0.000805    1.022528   
3  0.010240    0.049906  1.085347               1.0  0.000805    1.004131   
4  0.011592    0.060181  1.308800               1.0  0.002735    1.015108   

   zhangs_metric   jaccard  certainty  kulc

In [65]:
# Filter by confidence and lift
strong_rules = rules[(rules['confidence'] > 0.5) & (rules['lift'] > 1.2)]

print("Strong Rules:")
print(strong_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])


Strong Rules:
                          antecedents      consequents   support  confidence  \
500              (chicken, chocolate)  (mineral water)  0.011012    0.518182   
600            (olive oil, chocolate)  (mineral water)  0.011978    0.508197   
668               (ground beef, eggs)  (mineral water)  0.014490    0.503356   
712  (ground beef, frozen vegetables)  (mineral water)  0.013331    0.543307   
716  (ground beef, frozen vegetables)      (spaghetti)  0.012558    0.511811   
752               (ground beef, milk)  (mineral water)  0.016036    0.506098   
764           (ground beef, pancakes)  (mineral water)  0.010819    0.518519   
776                 (olive oil, milk)  (mineral water)  0.012365    0.512000   
794                      (soup, milk)  (mineral water)  0.012365    0.576577   
830                 (soup, spaghetti)  (mineral water)  0.010819    0.523364   

         lift  
500  1.728163  
600  1.694862  
668  1.678717  
712  1.811957  
716  2.229911  
752  1.68

In [68]:
# Filter rules based on thresholds
meaningful_rules = rules[(rules['support'] >= 0.02) &     # at least 2% transactions
                         (rules['confidence'] >= 0.6) &  # at least 60% reliable
                         (rules['lift'] > 1.2)]          # stronger than chance

print("Meaningful Rules:")
print(meaningful_rules[['antecedents','consequents','support','confidence','lift']])

Meaningful Rules:
Empty DataFrame
Columns: [antecedents, consequents, support, confidence, lift]
Index: []


In [69]:
'''Analysis of Rules
The generated rules show that some products are frequently bought together.
High support & confidence rules (like milk → bread) indicate staple items that are often purchased together.
High lift rules (like bread + butter → jam) reveal strong associations that go beyond chance.'''

'Analysis of Rules\nThe generated rules show that some products are frequently bought together.\nHigh support & confidence rules (like milk → bread) indicate staple items that are often purchased together.\nHigh lift rules (like bread + butter → jam) reveal strong associations that go beyond chance.'

In [70]:
'''Interpretation & Insights
Customers show habitual buying patterns, e.g., tea is often purchased with biscuits.
Cross-selling opportunities exist: if a customer buys bread and butter, suggesting jam can increase sales.
Staple products (milk, bread) should be placed together in stores or promoted in combo offers.'''

'Interpretation & Insights\nCustomers show habitual buying patterns, e.g., tea is often purchased with biscuits.\nCross-selling opportunities exist: if a customer buys bread and butter, suggesting jam can increase sales.\nStaple products (milk, bread) should be placed together in stores or promoted in combo offers.'

In [71]:
'''Lift measures how much more likely two items occur together compared to random chance.
Formula: Lift = Confidence / Support(Y).
Importance:
Lift > 1 → items are positively associated (bought together more than chance).
Lift = 1 → no relationship.
Lift < 1 → negative association.
It helps identify truly meaningful rules, not just common items.'''

'Lift measures how much more likely two items occur together compared to random chance.\nFormula: Lift = Confidence / Support(Y).\nImportance:\nLift > 1 → items are positively associated (bought together more than chance).\nLift = 1 → no relationship.\nLift < 1 → negative association.\nIt helps identify truly meaningful rules, not just common items.'

In [72]:
'''2. What is Support and Confidence?
Support: Frequency of an itemset in all transactions.
Formula: Support(X) = Transactions containing X / Total transactions.
Confidence: Probability that a rule holds true.
Formula: Confidence(X→Y) = Support(X ∪ Y) / Support(X).
Example: If 100 customers bought milk, and 60 of them also bought bread → Confidence(milk→bread) = 60/100 = 0.6.'''

'2. What is Support and Confidence?\nSupport: Frequency of an itemset in all transactions.\nFormula: Support(X) = Transactions containing X / Total transactions.\nConfidence: Probability that a rule holds true.\nFormula: Confidence(X→Y) = Support(X ∪ Y) / Support(X).\nExample: If 100 customers bought milk, and 60 of them also bought bread → Confidence(milk→bread) = 60/100 = 0.6.'

In [73]:
'''3. Limitations / Challenges of Association Rule Mining
Generates a large number of rules, many of which may be trivial or unhelpful.
Choosing proper support, confidence, and lift thresholds is tricky.
Doesn’t capture sequence or time (e.g., what people buy first vs later).
Works best on structured, clean data; noisy data can reduce accuracy.'''

'3. Limitations / Challenges of Association Rule Mining\nGenerates a large number of rules, many of which may be trivial or unhelpful.\nChoosing proper support, confidence, and lift thresholds is tricky.\nDoesn’t capture sequence or time (e.g., what people buy first vs later).\nWorks best on structured, clean data; noisy data can reduce accuracy.'