In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori,association_rules
from mlxtend.preprocessing import TransactionEncoder

# Data Preprocessing

In [2]:
data = pd.read_excel("Online retail.xlsx")
data.head()

Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil"
0,"burgers,meatballs,eggs"
1,chutney
2,"turkey,avocado"
3,"mineral water,milk,energy bar,whole wheat rice..."
4,low fat yogurt


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 1 columns):
 #   Column                                                                                                                                                                                                                           Non-Null Count  Dtype 
---  ------                                                                                                                                                                                                                           --------------  ----- 
 0   shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil  7500 non-null   object
dtypes: object(1)
memory usage: 58.7+ KB


In [4]:
data.dropna(inplace=True)

In [5]:
data[data.duplicated()]

Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil"
33,cookies
41,spaghetti
59,spaghetti
63,"turkey,eggs"
64,french fries
...,...
7490,herb & pepper
7491,"chocolate,escalope"
7494,"pancakes,light mayo"
7497,chicken


In [6]:
data.drop_duplicates(inplace=True)

In [7]:
data[data.duplicated()]

Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil"


In [8]:
data = data.iloc[:,0].str.split(',')

In [9]:
data.head()

0                           [burgers, meatballs, eggs]
1                                            [chutney]
2                                    [turkey, avocado]
3    [mineral water, milk, energy bar, whole wheat ...
4                                     [low fat yogurt]
Name: shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil, dtype: object

# One-Hot Encoding

In [10]:
te = TransactionEncoder()

In [11]:
te_arr = te.fit_transform(data)

In [12]:
basket = pd.DataFrame(te_arr, columns=te.columns_)
basket.head()

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


#  Apply Apriori Algorithm

In [13]:
frequent_itemsets = apriori(basket,min_support=0.01,use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.029179,(almonds)
1,0.011014,(antioxydant juice)
2,0.045797,(avocado)
3,0.012560,(bacon)
4,0.015459,(barbecue sauce)
...,...,...
431,0.014686,"(mineral water, olive oil, spaghetti)"
432,0.016618,"(mineral water, pancakes, spaghetti)"
433,0.012367,"(mineral water, shrimp, spaghetti)"
434,0.010821,"(mineral water, soup, spaghetti)"


#  Generate Association Rules 

In [14]:
rules = association_rules(frequent_itemsets,metric='lift',min_threshold=1.1)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(mineral water),(almonds),0.29971,0.029179,0.010821,0.036106,1.237399,0.002076,1.007186,0.273962
1,(almonds),(mineral water),0.029179,0.29971,0.010821,0.370861,1.237399,0.002076,1.113092,0.197619
2,(avocado),(french fries),0.045797,0.192657,0.011594,0.253165,1.314069,0.002771,1.081019,0.250476
3,(french fries),(avocado),0.192657,0.045797,0.011594,0.060181,1.314069,0.002771,1.015304,0.296039
4,(avocado),(milk),0.045797,0.170048,0.010821,0.236287,1.389528,0.003034,1.086732,0.293786


# Analysis and Interpretation:

# (mineral water) → (almonds)
Support: 0.0108 (1.08% of transactions include both mineral water and almonds).
1. Confidence: 0.0361 (When customers buy mineral water, there is a 3.61% chance they will also buy almonds).
2. Lift: 1.237 (Customers who buy mineral water are 1.237 times more likely to buy almonds than customers who do not).
3. Interpretation: This rule suggests a weak association between mineral water and almonds, as the confidence is low. However, the lift is slightly greater than 1, indicating a modest positive relationship. This could suggest a small group of health-conscious customers who purchase both items.
# (almonds) → (mineral water)
1. Support: 0.0108 (Same as Rule 1).
2. Confidence: 0.3709 (When customers buy almonds, there is a 37.09% chance they will also buy mineral water).
3. Lift: 1.237
4. Interpretation: This rule indicates that customers who buy almonds are much more likely to also buy mineral water (37% confidence), suggesting that customers purchasing almonds might prioritize hydration and healthy eating. While the relationship isn't very strong, the higher confidence implies that pairing these items could be effective in promotions or cross-selling.
# (avocado) → (chocolate)
1. Support: 0.0102 (1.02% of transactions include both avocado and chocolate).
2. Confidence: 0.2236 (22.36% chance that when avocados are purchased, chocolate is also purchased).
3. Lift: 1.0897 (The likelihood of buying chocolate is 1.0897 times higher if avocados are purchased).
4. Interpretation: While this rule has a moderate confidence level, the lift is only slightly above 1, indicating a weak positive correlation. This may suggest that customers who buy avocados could be interested in both healthy and indulgent products, though the relationship is not very strong.
# (chocolate) → (avocado)
1. Support: 0.0102.
2. Confidence: 0.0499 (4.99% chance that when chocolate is purchased, avocado is also purchased).
3. Lift: 1.0897 .
4. Interpretation: This rule has low confidence (less than 5%), indicating that only a small fraction of chocolate buyers also purchase avocados. While there is a weak positive association (lift > 1), the low confidence means this relationship is not significant.
# (avocado) → (french fries)
1. Support: 0.0116 (1.16% of transactions include both avocado and french fries).
2. Confidence: 0.2531 (25.31% chance that when avocados are purchased, french fries are also purchased).
3. Lift: 1.314 (The likelihood of buying french fries is 1.314 times higher if avocados are purchased).
4. Interpretation: This rule has a relatively higher confidence (25.31%) and lift (1.314), indicating a stronger association between avocados and french fries. This might suggest that customers who buy avocados, possibly health-conscious, may also indulge in comfort food items like french fries. Retailers could use this to create combo offers or promotional strategies for these items.

In [15]:
rules.sort_values('lift',ascending=False).head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
348,(whole wheat pasta),(olive oil),0.04058,0.087536,0.011014,0.271429,3.100757,0.007462,1.252401,0.706154
349,(olive oil),(whole wheat pasta),0.087536,0.04058,0.011014,0.125828,3.100757,0.007462,1.097519,0.742493
704,(soup),"(mineral water, milk)",0.070918,0.067826,0.012367,0.174387,2.571089,0.007557,1.129069,0.657703
701,"(mineral water, milk)",(soup),0.067826,0.070918,0.012367,0.182336,2.571089,0.007557,1.136264,0.655521
236,(ground beef),(herb & pepper),0.135845,0.066473,0.022802,0.167852,2.5251,0.013772,1.121828,0.698921
237,(herb & pepper),(ground beef),0.066473,0.135845,0.022802,0.343023,2.5251,0.013772,1.31535,0.646983
645,"(mineral water, shrimp)",(frozen vegetables),0.03343,0.129855,0.010435,0.312139,2.403747,0.006094,1.265001,0.604181
648,(frozen vegetables),"(mineral water, shrimp)",0.129855,0.03343,0.010435,0.080357,2.403747,0.006094,1.051028,0.671133
630,(ground beef),"(frozen vegetables, spaghetti)",0.135845,0.039034,0.01256,0.092461,2.368738,0.007258,1.05887,0.66867
627,"(frozen vegetables, spaghetti)",(ground beef),0.039034,0.135845,0.01256,0.321782,2.368738,0.007258,1.274155,0.601306


In [16]:
rules = rules[(rules['support'] >= 0.01) & (rules['confidence'] >= 0.5) & (rules['lift'] >= 1.1)]

In [17]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
420,"(chicken, chocolate)",(mineral water),0.021256,0.29971,0.011014,0.518182,1.728943,0.004644,1.453432,0.430768
513,"(olive oil, chocolate)",(mineral water),0.023575,0.29971,0.011981,0.508197,1.695627,0.004915,1.423923,0.420153
578,"(eggs, ground beef)",(mineral water),0.028792,0.29971,0.014493,0.503356,1.679475,0.005863,1.410043,0.41657
622,"(frozen vegetables, ground beef)",(mineral water),0.024541,0.29971,0.013333,0.543307,1.812775,0.005978,1.533393,0.45964
626,"(frozen vegetables, ground beef)",(spaghetti),0.024541,0.229565,0.01256,0.511811,2.22948,0.006927,1.578149,0.565339
660,"(milk, ground beef)",(mineral water),0.031691,0.29971,0.016039,0.506098,1.688623,0.006541,1.417871,0.421148
671,"(pancakes, ground beef)",(mineral water),0.02087,0.29971,0.010821,0.518519,1.730067,0.004566,1.454448,0.430982
684,"(milk, olive oil)",(mineral water),0.024155,0.29971,0.012367,0.512,1.708317,0.005128,1.43502,0.424892
702,"(soup, milk)",(mineral water),0.021449,0.29971,0.012367,0.576577,1.923781,0.005939,1.653876,0.490716
738,"(soup, spaghetti)",(mineral water),0.020676,0.29971,0.010821,0.523364,1.746235,0.004624,1.469236,0.436362


# Interview Questions:

# 1.What is lift and why is it important in Association rules?
Lift is a measure that compares the likelihood of the co-occurrence of items (e.g., product A and product B) relative to their individual occurrences.
                 Lift=Confidence(A→B)/support(B)
Lift tells you how much more likely products A and B are to be purchased together compared to random chance. A lift greater than 1 indicates a positive association between the items, whereas a value less than 1 suggests no significant association.

# 2.What is support and Confidence. How do you calculate them?
Lift is a measure that compares the likelihood of the co-occurrence of items (e.g., product A and product B) relative to their individual occurrences. It is calculated as:
      Support(A→B)=Number of transactions with both A and B/Total Transactions
​Confidence is the likelihood that a customer who bought product A also bought product B. It is calculated as:
Confidence(A→B)= Support(A→B)/Support(A)

# 3.What are some limitations or challenges of Association rules mining?
1. Scalability: Association rule mining can be computationally expensive for large datasets with many transactions and items.
2. Choosing Appropriate Thresholds: Selecting suitable support and confidence thresholds can be challenging, as too high values may miss important rules, while too low values may generate irrelevant rules.
3. Interpretation of Rules: While many rules can be generated, interpreting them meaningfully to derive actionable insights can be challenging.
4. Sparsity: Transactional data is often sparse, which can limit the number of useful rules generated.