In [1]:
!pip install pandas mlxtend openpyxl



In [2]:
import pandas as pd

# Dataset

In [3]:
# Load the dataset
df = pd.read_excel('Online retail.xlsx',header=None)
df.head()

Unnamed: 0,0
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."


In [4]:
# Assuming data is comma-separated
df_split = df[0].str.split(',', expand=True)

# Display the separated data
df_split.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


# Data Preprocessing
### Remove missing values
### Remove duplicates
### Convert to the required format for Association Rules

In [5]:
# Remove missing values
df_split.dropna(inplace=True)

In [6]:
# Remove duplicates
df_split.drop_duplicates(inplace=True)

In [7]:
# Convert to a transaction-based format
transactions = df_split.values.tolist()
transactions

[['shrimp',
  'almonds',
  'avocado',
  'vegetables mix',
  'green grapes',
  'whole weat flour',
  'yams',
  'cottage cheese',
  'energy drink',
  'tomato juice',
  'low fat yogurt',
  'green tea',
  'honey',
  'salad',
  'mineral water',
  'salmon',
  'antioxydant juice',
  'frozen smoothie',
  'spinach',
  'olive oil']]

# Association Rule Mining
### Apply Apriori Algorithm

In [8]:
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [9]:
# Convert data to one-hot encoded format
te = TransactionEncoder()
te

In [10]:
te_ary = te.fit(transactions).transform(transactions)
te_ary

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True]])

In [11]:
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)
df_encoded

Unnamed: 0,almonds,antioxydant juice,avocado,cottage cheese,energy drink,frozen smoothie,green grapes,green tea,honey,low fat yogurt,mineral water,olive oil,salad,salmon,shrimp,spinach,tomato juice,vegetables mix,whole weat flour,yams
0,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True


In [12]:
# Apply Apriori with lower min_support
frequent_itemsets = apriori(df_encoded, min_support=0.001, use_colnames=True)

# Check the frequent itemsets
frequent_itemsets


Unnamed: 0,support,itemsets
0,1.0,(almonds)
1,1.0,(antioxydant juice)
2,1.0,(avocado)
3,1.0,(cottage cheese)
4,1.0,(energy drink)
...,...,...
1048570,1.0,"(salad, honey, whole weat flour, antioxydant j..."
1048571,1.0,"(salad, honey, whole weat flour, antioxydant j..."
1048572,1.0,"(salad, honey, whole weat flour, frozen smooth..."
1048573,1.0,"(salad, honey, whole weat flour, antioxydant j..."


In [None]:
# Apply Apriori with higher support to reduce combinations
frequent_itemsets = apriori(df_encoded, min_support=0.05, use_colnames=True)

# Generate association rules with higher lift threshold
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.5)

rules.head()


In [None]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)
rules.head()

# Analysis and Interpretation:

# Analysis
All the items in the dataset have a support of 1.0, meaning they appear in all the transactions. This could be due to the dataset structure or lack of diversity in the data.

# Interpretation
The generated rules show that:

##### {Low Fat Yogurt} → {Whole Wheat Flour}

Support: 1.0 → Appears in 100% of transactions.

Confidence: 1.0 → Every time Low Fat Yogurt is bought, Whole Wheat Flour is also bought.

Lift: 1.0 → No added advantage compared to random chance.

##### {Vegetables Mix} → {Green Tea}

Support: 1.0

Confidence: 1.0

Lift: 1.0

##### {Cottage Cheese} → {Salmon}

Support: 1.0

Confidence: 1.0

Lift: 1.0

##### Insights into Customer Behavior:

Since all items have a support of 1.0, it indicates that the same set of items appears in every transaction.

This could be due to a data quality issue, such as duplicate rows or incomplete data.

In a real-world scenario, we would either filter the data or adjust the support threshold to find meaningful patterns.


# Interview Questions

### 1. What is Lift and Why is it Important in Association Rules?
Lift measures how much more likely a product is bought when another product is already bought, compared to random chance.  

Formula: Lift = Confidence (A → B) / Support (B)  

Importance:
- Lift > 1: Strong positive correlation between two products.  
- Lift = 1: No correlation (random purchase).  
- Lift < 1: Negative correlation (buying one product reduces the chance of buying another).  

### 2. What is Support and Confidence? How Do You Calculate Them?
Support: The frequency of an item or itemset appearing in the dataset.  

Formula: Support (A → B) = (Transactions with A and B) / (Total Transactions)  

Confidence: The probability of buying B when A is already bought.  

Formula: Confidence (A → B) = Support (A → B) / Support (A)  

### 3. What are Some Limitations or Challenges of Association Rule Mining?
- High computational cost for large datasets.  
- Difficulty in handling sparse data.  
- Choosing the right support, confidence, and lift thresholds is challenging.  
- Generation of redundant and irrelevant rules.  
- High confidence rules can be misleading if lift is low.  
- Data quality issues like missing values and noise affect the accuracy of the results.  