In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Load the raw basket spreadsheet. header=None keeps the first spreadsheet row
# as data, so the single column ends up labelled with the integer 0.
df = pd.read_excel(
    'Online retail.xlsx',
    header=None,
)
df

Unnamed: 0,0
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."
...,...
7496,"butter,light mayo,fresh bread"
7497,"burgers,frozen vegetables,eggs,french fries,ma..."
7498,chicken
7499,"escalope,green tea"


In [None]:
# Report the (rows, columns) size of the frame before any cleaning.
print(f"Initial data shape: {df.shape}")

Initial data shape: (7501, 1)


In [None]:
# Count missing cells per column (isna is the canonical spelling of isnull).
df.isna().sum()

Unnamed: 0,0
0,0


In [None]:
# Show the column labels of the loaded frame.
column_labels = df.columns
print(column_labels)


Index([0], dtype='int64')


In [None]:
# Remove rows that exactly repeat an earlier basket. Rebinding the name instead
# of passing inplace=True avoids mutating the frame object that earlier cells
# displayed and keeps the step chainable.
# NOTE(review): identical baskets may be genuine separate purchases; dropping
# them changes every support value computed downstream -- confirm this is
# intended for the analysis.
df = df.drop_duplicates()


In [None]:
pip install pandas mlxtend openpyxl




In [None]:
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder


In [None]:
def _split_basket(cell):
    """Split one comma-separated basket string into a list of clean item names.

    Surrounding whitespace is stripped from every item so that, for example,
    " asparagus" and "asparagus" count as the same product, and empty
    fragments (e.g. from a trailing comma) are discarded.
    """
    return [item.strip() for item in cell.split(',') if item.strip()]


# One list of item names per transaction. Missing rows are skipped, and the
# column is cast to str so a non-text cell cannot break the split.
transactions = df.iloc[:, 0].dropna().astype(str).apply(_split_basket)

In [None]:
# One-hot encode the transactions: one row per basket, one boolean column per
# distinct item seen while fitting.
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)
basket = pd.DataFrame(data=te_ary, columns=te.columns_)

In [None]:
# Mine itemsets with a minimum support of 0.02 (2% of the encoded baskets),
# labelled with item names (use_colnames=True) rather than column positions.
frequent_itemsets = apriori(
    basket,
    min_support=0.02,
    use_colnames=True,
)

print("Frequent Itemsets:", frequent_itemsets, sep="\n")

Frequent Itemsets:
      support                                 itemsets
0    0.029366                                (almonds)
1    0.045981                                (avocado)
2    0.020479                              (black tea)
3    0.045015                               (brownies)
4    0.113794                                (burgers)
..        ...                                      ...
169  0.020093         (chocolate, mineral water, milk)
170  0.022991    (chocolate, spaghetti, mineral water)
171  0.020672         (eggs, spaghetti, mineral water)
172  0.024730  (spaghetti, mineral water, ground beef)
173  0.022604         (spaghetti, mineral water, milk)

[174 rows x 2 columns]


In [None]:
# Derive candidate rules from the frequent itemsets, keeping those whose lift
# is at least 1.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1,
)


In [None]:
# Keep only rules with confidence of at least 0.5 and lift of at least 1.2.
confident_enough = rules['confidence'].ge(0.5)
lifted_enough = rules['lift'].ge(1.2)
rules = rules.loc[confident_enough & lifted_enough]


In [None]:
# Print just the headline metrics for each surviving rule.
rule_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
print("\nAssociation Rules:")
print(rules[rule_columns])


Association Rules:
Empty DataFrame
Columns: [antecedents, consequents, support, confidence, lift]
Index: []


In [None]:
# Order the rules from highest to lowest lift.
rules = rules.sort_values(by='lift', ascending=False)


In [None]:
# Second pass with looser thresholds: 1% minimum support for the itemsets,
# 40% minimum confidence for the rules, and lift of at least 1.
frequent_itemsets = apriori(
    basket,
    min_support=0.01,
    use_colnames=True,
)
rules = (
    association_rules(frequent_itemsets, metric="confidence", min_threshold=0.4)
    .loc[lambda candidates: candidates['lift'] >= 1.0]
)


In [None]:
# Size of the rule table, followed by its first five rows.
print(f"Rules shape: {rules.shape}")
print(rules.head(5))


Rules shape: (48, 14)
     antecedents      consequents  antecedent support  consequent support  \
0  (cooking oil)  (mineral water)            0.071870            0.299845   
1  (ground beef)  (mineral water)            0.135819            0.299845   
2  (ground beef)      (spaghetti)            0.135819            0.229521   
3  (light cream)  (mineral water)            0.022411            0.299845   
4    (olive oil)  (mineral water)            0.087713            0.299845   

    support  confidence      lift  representativity  leverage  conviction  \
0  0.028980    0.403226  1.344779               1.0  0.007430    1.173232   
1  0.058733    0.432432  1.442184               1.0  0.018008    1.233606   
2  0.055835    0.411095  1.791102               1.0  0.024661    1.308326   
3  0.010626    0.474138  1.581274               1.0  0.003906    1.331441   
4  0.038640    0.440529  1.469186               1.0  0.012340    1.251457   

   zhangs_metric   jaccard  certainty  kulczynski  


In [None]:
# Rank rules by lift (descending) and show the ten highest with key metrics.
print("Top 10 Strongest Rules (by Lift):")
ranked_by_lift = rules.sort_values(by='lift', ascending=False)
top_rules = ranked_by_lift.iloc[:10]
summary_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
print(top_rules[summary_columns])


Top 10 Strongest Rules (by Lift):
                         antecedents      consequents   support  confidence  \
28  (frozen vegetables, ground beef)      (spaghetti)  0.012558    0.511811   
24               (eggs, ground beef)      (spaghetti)  0.012944    0.449664   
40                      (soup, milk)  (mineral water)  0.012365    0.576577   
33               (milk, ground beef)      (spaghetti)  0.013910    0.439024   
20            (olive oil, chocolate)      (spaghetti)  0.010240    0.434426   
42                 (olive oil, milk)      (spaghetti)  0.010240    0.424000   
36      (mineral water, ground beef)      (spaghetti)  0.024730    0.421053   
27  (frozen vegetables, ground beef)  (mineral water)  0.013331    0.543307   
2                      (ground beef)      (spaghetti)  0.055835    0.411095   
14          (chocolate, ground beef)      (spaghetti)  0.013331    0.401163   

        lift  
28  2.229911  
24  1.959144  
40  1.922913  
33  1.912786  
20  1.892753  
42  1.

In [None]:
from collections import Counter

# Flatten every antecedent itemset into a single list of product names, then
# tally how many rules each product appears in on the left-hand side.
antecedents_list = [
    item
    for itemset in rules['antecedents']
    for item in itemset
]
antecedent_counts = Counter(antecedents_list)

print("Most Common Products in Antecedents:")
for name, n_rules in antecedent_counts.most_common(10):
    print(f"{name}: {n_rules} times")

print(f"\n{'-' * 60}\n")

Most Common Products in Antecedents:
ground beef: 13 times
spaghetti: 12 times
chocolate: 11 times
milk: 11 times
frozen vegetables: 7 times
olive oil: 6 times
eggs: 6 times
pancakes: 4 times
shrimp: 4 times
soup: 3 times

------------------------------------------------------------



In [None]:
# Flatten every consequent itemset into a single list of product names, then
# tally how many rules each product appears in on the right-hand side.
consequents_list = [
    item
    for itemset in rules['consequents']
    for item in itemset
]
consequent_counts = Counter(consequents_list)

print("Most Common Products in Consequents:")
for name, n_rules in consequent_counts.most_common(10):
    print(f"{name}: {n_rules} times")

print(f"\n{'-' * 60}\n")


Most Common Products in Consequents:
mineral water: 40 times
spaghetti: 8 times

------------------------------------------------------------



In [None]:
# Subsets of especially strong rules: lift above 2, and confidence above 0.7.
# NOTE(review): the saved output of the following cell reports 0 rules with
# lift > 2, yet the saved top-10 table lists a rule with lift 2.229911 -- the
# saved outputs look stale (possibly produced while `rules` was still the
# empty frame from the first filter). Re-run the notebook top to bottom to
# confirm.
lift_above_two = rules['lift'].gt(2)
confidence_above_70pct = rules['confidence'].gt(0.7)
high_lift_rules = rules[lift_above_two]
high_confidence_rules = rules[confidence_above_70pct]

In [None]:
# Report how many rules fall into each of the two "strong" subsets.
n_high_lift = len(high_lift_rules)
n_high_confidence = len(high_confidence_rules)
print(f"Number of rules with Lift > 2: {n_high_lift}")
print(f"Number of rules with Confidence > 70%: {n_high_confidence}")

Number of rules with Lift > 2: 0
Number of rules with Confidence > 70%: 0


In [None]:
print("\nInterpretation Insights:")

# Insights that are only printed when the matching subset of rules is non-empty.
conditional_insights = [
    (
        high_lift_rules,
        "- High Lift rules suggest very strong associations; customers who buy items in the antecedents are very likely to buy the consequents too.",
    ),
    (
        high_confidence_rules,
        "- High Confidence rules indicate that when a customer buys antecedent items, there is a high chance (>70%) they will also buy the consequent items.",
    ),
]
for rule_subset, insight in conditional_insights:
    if not rule_subset.empty:
        print(insight)

# Insights that are always printed.
general_insights = (
    "- Frequent items in Antecedents mean they are 'starter' products that lead to other purchases.",
    "- Frequent items in Consequents mean they are 'add-on' products often purchased together.",
)
for insight in general_insights:
    print(insight)



Interpretation Insights:
- Frequent items in Antecedents mean they are 'starter' products that lead to other purchases.
- Frequent items in Consequents mean they are 'add-on' products often purchased together.


Top 10 strongest association rules with lift, support, and confidence.

Top 10 most common products that appear on the left side (antecedents).

Top 10 most common products that appear on the right side (consequents).

Counts of very strong rules (lift > 2) and highly confident rules (confidence > 0.7).

**What is lift and why is it important in association rules?**

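Lift measures how much more likely two items are to be bought together compared to being bought independently: Lift(A → B) = Confidence(A → B) ÷ Support(B).
It tells you if two products are truly associated — or if they just happen to appear together by random chance. A lift above 1 means the items appear together more often than independence would predict, a lift of exactly 1 means no association, and a lift below 1 means they tend not to be bought together.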

It filters out useless rules. (High support/confidence alone can be misleading.)

It detects true relationships, not random co-occurrences.

It prioritizes strong rules that have real business value (like bundling promotions).



**What are support and confidence, and how do you calculate them?**

Support measures how frequently an itemset appears in the dataset.

🔵 Support(A) =

(Number of transactions containing A)
÷
(Total number of transactions)

Confidence measures how often B is purchased when A is purchased.
It’s the conditional probability that B is bought given A is already bought.

🔵 Confidence(A → B) =

(Support of A and B together)
÷
(Support of A)


**What are some limitations or challenges of association rule mining?**

Too many rules — it is hard to find the important ones.

Missing rare rules — rare but important patterns fall below the minimum support and are lost.

Hard to interpret — an association does not imply causation.

Scalability — the algorithms can be very slow.

Handling numbers — numeric data needs extra pre-processing before it can be mined.

Redundant rules — they create confusion and clutter.