In [15]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

! pip install mlxtend
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules



# Association Rule for Store Dataset

In this case study, we will explore how association rule can be used to analyze the items that are usualy purcased together.

you can refer to this article to find out about apriori and association rule:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use the dataset of the transaction in a certain store. You can get the dataset here: 
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [2]:
df = pd.read_csv('https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv')
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


# Get the set of product that has been purchased


Get the unique product that has been purchased

In [3]:
unique_products = set(pd.unique(df.values.ravel()))

print(unique_products)

{'Wine', 'Eggs', 'Cheese', 'Bread', 'Bagel', 'Diaper', 'Milk', 'Pencil', 'Meat', nan}


## Preprocess Data

In this step, we will transform our dataset so that we will have a one hot encoding based on the purchased products.

In [9]:
print("Tampilan Awal Dataset:")
print(df)

# Mengonversi semua nilai ke dalam string
df = df.applymap(lambda x: str(x))

# Membuat representasi itemset
te = TransactionEncoder()
te_ary = te.fit_transform(df.values.tolist())

# Mengonversi array itemset menjadi DataFrame
df_encoded = pd.DataFrame(te_ary.astype(int), columns=te.columns_)

Tampilan Awal Dataset:
          0       1       2       3       4       5       6
0     Bread    Wine    Eggs    Meat  Cheese  Pencil  Diaper
1     Bread  Cheese    Meat  Diaper    Wine    Milk  Pencil
2    Cheese    Meat    Eggs    Milk    Wine     nan     nan
3    Cheese    Meat    Eggs    Milk    Wine     nan     nan
4      Meat  Pencil    Wine     nan     nan     nan     nan
..      ...     ...     ...     ...     ...     ...     ...
310   Bread    Eggs  Cheese     nan     nan     nan     nan
311    Meat    Milk  Pencil     nan     nan     nan     nan
312   Bread  Cheese    Eggs    Meat  Pencil  Diaper    Wine
313    Meat  Cheese     nan     nan     nan     nan     nan
314    Eggs    Wine   Bagel   Bread    Meat     nan     nan

[315 rows x 7 columns]


In [13]:
df_combined = pd.concat([df_encoded.loc[:, 'Bagel':'nan']], axis=1)

print("\nDataset Setelah Encoding:")
print(df_combined.loc[:, 'Bagel':'nan'])


Dataset Setelah Encoding:
     Bagel  Bread  Cheese  Diaper  Eggs  Meat  Milk  Pencil  Wine  nan
0        0      1       1       1     1     1     0       1     1    0
1        0      1       1       1     0     1     1       1     1    0
2        0      0       1       0     1     1     1       0     1    1
3        0      0       1       0     1     1     1       0     1    1
4        0      0       0       0     0     1     0       1     1    1
..     ...    ...     ...     ...   ...   ...   ...     ...   ...  ...
310      0      1       1       0     1     0     0       0     0    1
311      0      0       0       0     0     1     1       1     0    1
312      0      1       1       1     1     1     0       1     1    0
313      0      0       1       0     0     1     0       0     0    1
314      1      1       0       0     1     1     0       0     1    1

[315 rows x 10 columns]


Since, the encoded dataframe consist of the empty column. We will drop the NaN column or select all columns other than the first column.

In [14]:
# Menghapus kolom yang hanya berisi nilai NaN atau 'nan'
df_combined_cleaned = df_combined.drop(['nan'], axis=1)

# Menampilkan beberapa baris pertama dari dataset yang sudah dibersihkan
print("\nDataset Setelah Menghapus Kolom 'nan':")
print(df_combined_cleaned)


Dataset Setelah Menghapus Kolom 'nan':
     Bagel  Bread  Cheese  Diaper  Eggs  Meat  Milk  Pencil  Wine
0        0      1       1       1     1     1     0       1     1
1        0      1       1       1     0     1     1       1     1
2        0      0       1       0     1     1     1       0     1
3        0      0       1       0     1     1     1       0     1
4        0      0       0       0     0     1     0       1     1
..     ...    ...     ...     ...   ...   ...   ...     ...   ...
310      0      1       1       0     1     0     0       0     0
311      0      0       0       0     0     1     1       1     0
312      0      1       1       1     1     1     0       1     1
313      0      0       1       0     0     1     0       0     0
314      1      1       0       0     1     1     0       0     1

[315 rows x 9 columns]


## Apriori Algorithm

We will use appriori algorithm to determine the frequently purchased products. 
For this case study, we will min_support=0.2

In [26]:
frequent_itemsets = apriori(df_combined, min_support=0.2, use_colnames=True)

print("\nFrequently Purchased Products:")
frequent_itemsets


Frequently Purchased Products:




Unnamed: 0,support,itemsets
0,0.425397,(Bagel)
1,0.504762,(Bread)
2,0.501587,(Cheese)
3,0.406349,(Diaper)
4,0.438095,(Eggs)
5,0.47619,(Meat)
6,0.501587,(Milk)
7,0.361905,(Pencil)
8,0.438095,(Wine)
9,0.869841,(nan)


Then, we will generate association rule of the frequent itemset based on confidence level with the threshold=0.6

In [27]:
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)

print("\nAssociation Rules based on Confidence:")
rules[['antecedents', 'consequents', 'antecedent support', 'consequent support', 'support', 'confidence', 'lift', 'leverage', 'conviction']]


Association Rules based on Confidence:


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265
1,(Bagel),(nan),0.425397,0.869841,0.336508,0.791045,0.909413,-0.03352,0.622902
2,(Bread),(nan),0.504762,0.869841,0.396825,0.786164,0.903801,-0.042237,0.608683
3,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203
4,(Cheese),(Meat),0.501587,0.47619,0.32381,0.64557,1.355696,0.084958,1.477891
5,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754
6,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
7,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
8,(Wine),(Cheese),0.438095,0.501587,0.269841,0.615942,1.227986,0.050098,1.297754
9,(Cheese),(nan),0.501587,0.869841,0.393651,0.78481,0.902245,-0.042651,0.604855


Provide explanation about __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__ and __conviction__

Antecedent Support: Persentase transaksi yang memiliki itemset pada bagian sebelum panah (antecedent).

Consequent Support: Persentase transaksi yang memiliki itemset pada bagian setelah panah (consequent).

Support: Persentase transaksi yang memiliki itemset secara keseluruhan (antecedent dan consequent).

Confidence: Probabilitas bahwa consequent muncul jika antecedent sudah ada dalam transaksi.

Lift: Seberapa banyak peningkatan dalam kemunculan consequent jika sudah diketahui bahwa antecedent ada, dibandingkan dengan kemunculan acak.

Leverage: Seberapa sering itemset muncul bersama-sama dibandingkan dengan prediksi acak.

Conviction: Seberapa kuat dependensi antara antecedent dan consequent, dengan nilai lebih dari 1 menunjukkan dependensi yang kuat.