## Association Rule

In [1]:
# !pip install mlxtend  Install mlxtend library.

In [10]:
# Import Libraries
from mlxtend.frequent_patterns import apriori,association_rules
from mlxtend.preprocessing import TransactionEncoder #for encoding data
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [11]:
data = pd.read_excel("Online retail.xlsx")
data.head()

Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil"
0,"burgers,meatballs,eggs"
1,chutney
2,"turkey,avocado"
3,"mineral water,milk,energy bar,whole wheat rice..."
4,low fat yogurt


In [12]:
data.shape

(7500, 1)

In [14]:
data = data.rename(columns={'shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil': 'Transactions'})
data.head()

Unnamed: 0,Transactions
0,"burgers,meatballs,eggs"
1,chutney
2,"turkey,avocado"
3,"mineral water,milk,energy bar,whole wheat rice..."
4,low fat yogurt


In [15]:
data.isnull().sum() #checking for any missing values

Unnamed: 0,0
Transactions,0


## Data Preprocessing

In [18]:
data['items'] = data['Transactions'].apply(lambda x: str(x).split(','))

In [20]:
all_items = set()
for transaction in data['items']:
    all_items.update(transaction)

In [21]:
all_items = sorted(list(all_items))

In [22]:
binary_matrix = []

In [24]:
for transaction in data['items']:
    row = {item: 1 if item in transaction else 0 for item in all_items}
    binary_matrix.append(row)

In [25]:
binary_data = pd.DataFrame(binary_matrix, columns=all_items)

In [28]:
data.shape , binary_data.shape

((7500, 2), (7500, 120))

In [32]:
# converting our data set to 120 columns and 7500 rows.
binary_data.head()

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Apriori algorithm

In [33]:
min_support = 0.01    #Filters out rare items, ensures frequent ones are considered.
min_confidence = 0.5  #Ensures reliability of the rule (at least 50% accuracy).
min_lift = 1.2        #Ensures a meaningful association (20% stronger than chance).

In [35]:
frequent_itemsets = apriori(binary_data, min_support=min_support, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.020267,(almonds)
1,0.033200,(avocado)
2,0.010800,(barbecue sauce)
3,0.014267,(black tea)
4,0.011467,(body spray)
...,...,...
254,0.011067,"(milk, ground beef, mineral water)"
255,0.017067,"(ground beef, spaghetti, mineral water)"
256,0.015733,"(milk, spaghetti, mineral water)"
257,0.010267,"(olive oil, spaghetti, mineral water)"


# Association Rules

In [36]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=min_lift)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(avocado),(mineral water),0.033200,0.238267,0.011467,0.345382,1.449559,1.0,0.003556,1.163629,0.320785,0.044103,0.140620,0.196753
1,(mineral water),(avocado),0.238267,0.033200,0.011467,0.048125,1.449559,1.0,0.003556,1.015680,0.407144,0.044103,0.015438,0.196753
2,(cake),(burgers),0.081067,0.087200,0.011467,0.141447,1.622103,1.0,0.004398,1.063185,0.417349,0.073129,0.059430,0.136473
3,(burgers),(cake),0.087200,0.081067,0.011467,0.131498,1.622103,1.0,0.004398,1.058068,0.420154,0.073129,0.054881,0.136473
4,(eggs),(burgers),0.179733,0.087200,0.028800,0.160237,1.837585,1.0,0.013127,1.086974,0.555682,0.120941,0.080015,0.245256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,"(pancakes, mineral water)",(spaghetti),0.033733,0.174133,0.011467,0.339921,1.952073,1.0,0.005593,1.251163,0.504751,0.058384,0.200744,0.202885
346,"(spaghetti, mineral water)",(pancakes),0.059733,0.095067,0.011467,0.191964,2.019260,1.0,0.005788,1.119917,0.536836,0.080000,0.107077,0.156291
347,(pancakes),"(spaghetti, mineral water)",0.095067,0.059733,0.011467,0.120617,2.019260,1.0,0.005788,1.069235,0.557797,0.080000,0.064752,0.156291
348,(spaghetti),"(pancakes, mineral water)",0.174133,0.033733,0.011467,0.065850,1.952073,1.0,0.005593,1.034381,0.590560,0.058384,0.033238,0.202885


In [38]:
fill_rules = rules[(rules['confidence'] >= 0.4) & (rules['lift'] >= 1.2)]

In [39]:
fill_rules[["antecedents","consequents","support","confidence","lift"]]

Unnamed: 0,antecedents,consequents,support,confidence,lift
164,(ground beef),(mineral water),0.040933,0.416554,1.748266
206,(olive oil),(mineral water),0.027467,0.41785,1.753707
216,(soup),(mineral water),0.023067,0.456464,1.915771
250,"(eggs, chocolate)",(mineral water),0.013467,0.405622,1.702389
262,"(ground beef, chocolate)",(mineral water),0.010933,0.473988,1.989319
268,"(milk, chocolate)",(mineral water),0.014,0.435685,1.828559
280,"(spaghetti, chocolate)",(mineral water),0.015867,0.404762,1.698777
286,"(ground beef, eggs)",(mineral water),0.010133,0.506667,2.126469
292,"(milk, eggs)",(mineral water),0.013067,0.424242,1.780536
308,"(milk, frozen vegetables)",(mineral water),0.011067,0.468927,1.968075


In [40]:
fill_rules.shape

(17, 14)

# Analysis of Generated Rules:

### Frequent Combinations Involving "Mineral Water":

All the filtered rules involve "mineral water" as either the antecedent or the consequent. This indicates that "mineral water" is frequently purchased alongside other products in the dataset.

Some key combinations include:
- **Ground beef → Mineral water** (Lift: 1.748)
- **Olive oil → Mineral water** (Lift: 1.753)
- **Chocolate, eggs → Mineral water** (Lift: 1.702)
- **Eggs, ground beef → Mineral water** (Lift: 2.126)

### Strong Associations (Lift > 1.7):

The rules with the highest lift values, such as:
- **Eggs, ground beef → Mineral water** (Lift: 2.126)
- **Chocolate, ground beef → Mineral water** (Lift: 1.989)
- **Spaghetti, ground beef → Mineral water** (Lift: 1.827)

These combinations suggest that there are certain product pairs that have a notably high likelihood of being purchased together with mineral water, indicating a preference for these combinations.

### Confidence:

The confidence values range between 40% and 50%, meaning that, in these rules, there’s a 40%–50% chance that if a customer buys one of the products (such as "ground beef," "eggs," or "chocolate"), they will also buy "mineral water."

Confidence values above 0.4 suggest that these rules are strong enough to indicate a potential trend in customer behavior.

### Product Combinations:

- Multiple combinations involving **chocolate** appear, such as "chocolate, eggs" and "chocolate, milk," which indicates that customers who purchase chocolate are likely to buy mineral water as well.
- **Spaghetti** appears in several combinations, such as "spaghetti, frozen vegetables" and "spaghetti, ground beef," showing that spaghetti is often purchased with products like mineral water.
---

# Insights into Customer Purchasing Behavior:

### Mineral Water as a Common Purchase:

The fact that mineral water consistently appears in the rules suggests it is a staple product for many customers. It is likely to be purchased regularly alongside other food items.

### Pairing Preferences:

Customers seem to pair mineral water with more substantial items like **ground beef**, **eggs**, and **chocolate**. This could imply that when customers purchase more substantial or frequent grocery items (e.g., meat or snacks), they are also buying a refreshing beverage like mineral water.

### Combinations for Meal Preparation:

The presence of **spaghetti**, **chocolate**, and **eggs** in combinations suggests that customers might be shopping for complete meal solutions (spaghetti with other items) while also purchasing mineral water as an accompanying beverage.

### Promotional Opportunities:

Retailers could create bundle promotions around these frequent co-purchases, such as offering discounts when purchasing **mineral water** with **ground beef**, **eggs**, or **spaghetti**. This might increase sales by encouraging customers to buy these frequently-paired items together.

### Behavioral Segmentation:

The rules show that certain product types (e.g., chocolate, eggs, ground beef) are consistently purchased together with mineral water, which can help in segmenting customers based on their preferences. For instance, a segment of customers who regularly buy **chocolate** and **eggs** could be targeted with personalized promotions involving mineral water.


# Interview Questions:

**1.What is lift and why is it important in Association rules?**

Lift is a measure used in Association Rule Mining to evaluate the strength of a rule. It helps determine whether the presence of an item in a transaction increases or decreases the likelihood of another item appearing in the same transaction.

## Lift Formula

The lift of an association rule \( A \Rightarrow B \) is calculated as:

$$
\text{Lift}(A \Rightarrow B) = \frac{P(A \cap B)}{P(A) \times P(B)}
$$

Where:
- \( P(A \cap B) \) is the probability of both A and B occurring together.
- \( P(A) \) and \( P(B) \) are the individual probabilities of A and B.


**2.	What is support and Confidence. How do you calculate them?**

Support measures how frequently an itemset appears in the dataset. It helps determine how popular or relevant a rule is :
## Support Formula

Support measures how frequently an itemset appears in transactions.

$$
\text{Support}(A) = \frac{\text{Transactions containing } A}{\text{Total transactions}}
$$

For an association rule \( A \Rightarrow B \):

$$
\text{Support}(A \Rightarrow B) = \frac{\text{Transactions containing } (A \cap B)}{\text{Total transactions}}
$$
## Confidence Formula

Confidence indicates the likelihood that item **B** appears when item **A** is present.

$$
\text{Confidence}(A \Rightarrow B) = \frac{\text{Support}(A \cap B)}{\text{Support}(A)}
$$


**3.	What are some limitations or challenges of Association rules mining?**


 **1.Computational Complexity** – Frequent itemset generation is expensive for large datasets.  

**2.Rare Item Problem** – Important but rare patterns may be ignored due to low support.  

**3.Too Many Rules** – Generates excessive, often redundant rules, making analysis difficult.  

**4.Difficult Interpretation** – High confidence doesn’t always mean a useful rule; lift is needed.  

**5.Continuous Data Handling** – Works best with categorical data; numerical data needs discretization.  

**6.Data Quality Issues** – Noisy or incomplete data can lead to misleading rules.  
