In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder

!pip install mlxtend==0.23.1

Collecting mlxtend==0.23.1
  Downloading mlxtend-0.23.1-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.1-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mlxtend
  Attempting uninstall: mlxtend
    Found existing installation: mlxtend 0.23.3
    Uninstalling mlxtend-0.23.3:
      Successfully uninstalled mlxtend-0.23.3
Successfully installed mlxtend-0.23.1


# Association Rule for Store Dataset

In this case study, we will explore how association rule can be used to analyze the items that are usualy purcased together.

you can refer to this article to find out about apriori and association rule:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use the dataset of the transaction in a certain store. You can get the dataset here:
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [3]:
# load the data set ans show the first five transaction
data = pd.read_csv('https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv')
data.head()

Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


# Get the set of product that has been purchased


In [9]:
purchased = data.unstack().unique()
print(purchased)

['Bread' 'Cheese' 'Meat' 'Eggs' 'Wine' 'Bagel' 'Pencil' 'Diaper' 'Milk'
 nan]


## Preprocess Data

In this step, we will transform our dataset so that we will have a one hot encoding based on the purchased products.

In [10]:
# Create an itemset based on unique products
items = {item: 0 for item in purchased}

# encoding the feature
for item in data.iloc[0]:
    if item in items:
        items[item] = 1

items

{'Bread': 1,
 'Cheese': 1,
 'Meat': 1,
 'Eggs': 1,
 'Wine': 1,
 'Bagel': 0,
 'Pencil': 1,
 'Diaper': 1,
 'Milk': 0,
 nan: 0}

In [19]:
# Ganti nilai NaN dengan nilai spesifik, misalnya 'No Product'
data_with_placeholder = data.fillna('No Product')

# Ubah data menjadi array datar, tetapi pertahankan 'No Product' sebagai kategori
flattened_data = data_with_placeholder.values.flatten()

# Langkah 1: Gunakan OneHotEncoder, termasuk 'No Product' (merepresentasikan nilai yang hilang)
# Pastikan untuk mengganti 'sparse' dengan 'sparse_output'
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')  # Abaikan nilai yang tidak dikenal
one_hot_encoded_data = encoder.fit_transform(flattened_data.reshape(-1, 1))

# Langkah 2: Buat DataFrame hasil one-hot encoding
one_hot_encoded_df = pd.DataFrame(one_hot_encoded_data, columns=encoder.categories_[0])

# Langkah 3: Buat DataFrame dengan nilai 0 dan 1, menunjukkan apakah item dibeli
final_product_df = pd.DataFrame(0, index=data.index, columns=encoder.categories_[0])

# Tandai kehadiran produk untuk setiap baris, termasuk 'No Product'
for idx, row in data_with_placeholder.iterrows():
    for product in row:
        final_product_df.loc[idx, product] = 1

# Tampilkan DataFrame akhir
print("Dataset one-hot encoding dengan 'No Product':")
final_product_df.head()


  and should_run_async(code)


Dataset one-hot encoding dengan 'No Product':


Unnamed: 0,Bagel,Bread,Cheese,Diaper,Eggs,Meat,Milk,No Product,Pencil,Wine
0,0,1,1,1,1,1,0,0,1,1
1,0,1,1,1,0,1,1,0,1,1
2,0,0,1,0,1,1,1,1,0,1
3,0,0,1,0,1,1,1,1,0,1
4,0,0,0,0,0,1,0,1,1,1


In [20]:
# Karena DataFrame hasil encoding memiliki kolom kosong, kita akan menghapus kolom NaN.
# Alternatifnya, bisa menggunakan indeks kolom.

if 'NaN' in final_product_df.columns:
    final_product_df.drop(columns=['NaN'], inplace=True)

# Tampilkan DataFrame setelah kolom NaN dihapus
final_product_df.head()


  and should_run_async(code)


Unnamed: 0,Bagel,Bread,Cheese,Diaper,Eggs,Meat,Milk,No Product,Pencil,Wine
0,0,1,1,1,1,1,0,0,1,1
1,0,1,1,1,0,1,1,0,1,1
2,0,0,1,0,1,1,1,1,0,1
3,0,0,1,0,1,1,1,1,0,1
4,0,0,0,0,0,1,0,1,1,1


Since, the encoded dataframe consist of the empty column. We will drop the NaN column or select all columns other than the first column.

## Apriori Algorithm

We will use appriori algorithm to determine the frequently purchased products.
For this case study, we will min_support=0.2

In [21]:
# Tentukan nilai ambang batas (threshold) untuk digunakan dalam penghitungan support
from mlxtend.frequent_patterns import apriori, association_rules

threshold_support = 0.2  # Nilai minimum support
frequent_itemsets = apriori(final_product_df, min_support=threshold_support, use_colnames=True)

# Tampilkan itemset yang sering muncul
frequent_itemsets


  and should_run_async(code)


Unnamed: 0,support,itemsets
0,0.425397,(Bagel)
1,0.504762,(Bread)
2,0.501587,(Cheese)
3,0.406349,(Diaper)
4,0.438095,(Eggs)
5,0.47619,(Meat)
6,0.501587,(Milk)
7,0.869841,(No Product)
8,0.361905,(Pencil)
9,0.438095,(Wine)


The we will generate association rule of the frequent itemset based on confidence level with the threshold=0.6

In [22]:
# Tentukan nilai ambang batas (threshold) untuk metrik confidence
confidence_threshold = 0.6  # Nilai minimum untuk confidence

# Buat aturan asosiasi berdasarkan frequent itemsets dan ambang batas confidence
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=confidence_threshold)

# Hapus kolom 'zhangs_metric' jika tidak diperlukan
rules.drop(columns=['zhangs_metric'], inplace=True)

# Tampilkan aturan asosiasi yang dihasilkan
rules


  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265
1,(Bagel),(No Product),0.425397,0.869841,0.336508,0.791045,0.909413,-0.03352,0.622902
2,(Bread),(No Product),0.504762,0.869841,0.396825,0.786164,0.903801,-0.042237,0.608683
3,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203
4,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754
5,(Cheese),(Meat),0.501587,0.47619,0.32381,0.64557,1.355696,0.084958,1.477891
6,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
7,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
8,(Cheese),(No Product),0.501587,0.869841,0.393651,0.78481,0.902245,-0.042651,0.604855
9,(Wine),(Cheese),0.438095,0.501587,0.269841,0.615942,1.227986,0.050098,1.297754


Provide explanation about __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__, __conviction__, __conviction__ and the interpretation from the case above (please use text section)


Here’s an explanation of each term and its interpretation in association rule mining. These metrics evaluate the strength and significance of relationships between items in a dataset.

1. Antecedent Support
Definition: The proportion of transactions in the dataset that contain the antecedent (the "if" part of the rule).
Formula:
Antecedent Support
=
Transactions containing Antecedent
Total Transactions
Antecedent Support=
Total Transactions
Transactions containing Antecedent
​

Interpretation: Represents how often the antecedent appears in the dataset.
2. Consequent Support
Definition: The proportion of transactions in the dataset that contain the consequent (the "then" part of the rule).
Formula:
Consequent Support
=
Transactions containing Consequent
Total Transactions
Consequent Support=
Total Transactions
Transactions containing Consequent
​

Interpretation: Shows how often the consequent occurs in the dataset.
3. Support
Definition: The proportion of transactions that contain both the antecedent and the consequent (the whole rule).
Formula:
Support
=
Transactions containing both Antecedent and Consequent
Total Transactions
Support=
Total Transactions
Transactions containing both Antecedent and Consequent
​

Interpretation: Indicates the popularity or relevance of the rule in the dataset.
4. Confidence
Definition: The likelihood of the consequent occurring, given that the antecedent has occurred.
Formula:
Confidence
=
Support (Antecedent and Consequent)
Antecedent Support
Confidence=
Antecedent Support
Support (Antecedent and Consequent)
​

Interpretation: Measures the strength of the association; a higher value indicates a stronger rule.
5. Lift
Definition: The ratio of the observed support of the rule to the expected support if the antecedent and consequent were independent.
Formula:
Lift
=
Confidence
Consequent Support
Lift=
Consequent Support
Confidence
​

Interpretation:
Lift > 1: Positive association; items occur together more often than expected by chance.
Lift = 1: No association; items occur together as expected by chance.
Lift < 1: Negative association; items occur together less often than expected.
6. Leverage
Definition: Measures the difference between the observed support and the expected support if antecedent and consequent were independent.
Formula:
Leverage
=
Support
−
(
Antecedent Support
×
Consequent Support
)
Leverage=Support−(Antecedent Support×Consequent Support)
Interpretation:
Positive leverage indicates a stronger association than expected.
Zero means no association.
Negative leverage suggests a weaker association.
7. Conviction
Definition: Measures the degree to which the presence of the antecedent increases the likelihood of the consequent not occurring.
Formula:
Conviction
=
1
−
Consequent Support
1
−
Confidence
Conviction=
1−Confidence
1−Consequent Support
​

Interpretation: A higher conviction value indicates a stronger implication of the rule.
Interpretation in Practice
High Support: Suggests the rule applies to a large portion of the dataset.
High Confidence: Indicates the rule is reliable when the antecedent occurs.
High Lift: Reflects a meaningful and non-random relationship between antecedent and consequent.
Positive Leverage: Confirms that the items occur together more frequently than by random chance.
High Conviction: Shows robustness in the relationship, especially when the rule rarely fails.
Let me know if you’d like these metrics applied to a specific example!

Excellent explanation! You've provided a clear and concise overview of key association rule mining metrics.

To further solidify understanding, let's consider a practical example:

Scenario: A grocery store analyzes customer purchase data to identify patterns.

Rule: If a customer buys bread, they are likely to also buy butter.

Metrics Interpretation:

Support:

Measures the popularity of the rule.
High support indicates that many customers buy both bread and butter together.

Confidence:

Measures the reliability of the rule.
High confidence means that whenever a customer buys bread, they are likely to buy butter.
Lift:

Measures the strength of the association between bread and butter.
High lift indicates that the two items are more likely to be bought together than expected by chance.
Leverage:

Measures the unexpectedness of the rule.
Positive leverage means the rule is more significant than expected.
Conviction:

Measures the decrease in the probability of the consequent not occurring given the antecedent.
High conviction indicates a strong relationship between the two items.
Visual Representation:
Opens in a new window
www.investopedia.com
Venn diagram illustrating the relationships between antecedent, consequent, and their intersection

Additional Considerations:

Thresholds: Setting appropriate thresholds for support, confidence, and lift helps filter out weak or irrelevant rules.
Rule Interestingness: While these metrics are useful, it's important to consider the practical implications of a rule. A rule with high support and confidence might not be interesting if it's obvious or doesn't provide actionable insights.
Domain Knowledge: Understanding the context and domain knowledge can help interpret the results more effectively.

https://chatgpt.com/c/67482d6c-0be8-8006-9e49-7645a7a1e29b

https://gemini.google.com/app/2128204ce3eabd8d