# Juan Sebastian Prasetyo
### 0706022210011

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder

!pip install mlxtend==0.23.1

Collecting mlxtend==0.23.1
  Downloading mlxtend-0.23.1-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.1-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mlxtend
  Attempting uninstall: mlxtend
    Found existing installation: mlxtend 0.23.3
    Uninstalling mlxtend-0.23.3:
      Successfully uninstalled mlxtend-0.23.3
Successfully installed mlxtend-0.23.1


# Association Rule for Store Dataset

In this case study, we will explore how association rule can be used to analyze the items that are usualy purcased together.

you can refer to this article to find out about apriori and association rule:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use the dataset of the transaction in a certain store. You can get the dataset here:
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [8]:
# load the data set ans show the first five transaction
# Load the dataset
df = pd.read_csv('https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv')

# Display the first five transactions
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


# Get the set of product that has been purchased


In [21]:
# Mengambil semua nilai unik dari DataFrame 'df' dan mengubahnya menjadi set.
# df.values mengembalikan semua nilai dalam DataFrame sebagai array NumPy 2D.
# flatten() meratakan array 2D menjadi array 1D.
# set() membuat set dari elemen-elemen unik dalam array yang diratakan.
purchased_products = set(df.values.flatten())

# Mencetak set produk yang telah dibeli.
# Ini menampilkan semua produk unik yang ada di DataFrame.
print(purchased_products)

{'Wine', 'Milk', nan, 'Meat', 'Pencil', 'Bread', 'Eggs', 'Bagel', 'Cheese', 'Diaper'}


## Preprocess Data

In this step, we will transform our dataset so that we will have a one hot encoding based on the purchased products.

In [22]:
#create an itemset based on the products
itemset = {item: 0 for item in purchased_products}

# encoding the feature
for item in df.iloc[0]:
    if item in itemset:
        itemset[item] = 1

itemset

{'Wine': 1,
 'Milk': 0,
 nan: 0,
 'Meat': 1,
 'Pencil': 1,
 'Bread': 1,
 'Eggs': 1,
 'Bagel': 0,
 'Cheese': 1,
 'Diaper': 1}

In [23]:
# create new dataframe from the encoded features
encoded_df = pd.DataFrame(0, index=range(len(df)), columns=itemset)

for i, row in df.iterrows():
    for item in row:
        encoded_df.loc[i, item] = 1
# show the new dataframe
encoded_df.head()

Unnamed: 0,Wine,Milk,NaN,Meat,Pencil,Bread,Eggs,Bagel,Cheese,Diaper
0,1,0,0,1,1,1,1,0,1,1
1,1,1,0,1,1,1,0,0,1,1
2,1,1,1,1,0,0,1,0,1,0
3,1,1,1,1,0,0,1,0,1,0
4,1,0,1,1,1,0,0,0,0,0


In [24]:
# Since, the encoded dataframe consist of the empty column. We will drop the NaN column or u can use the index.
encoded_df = encoded_df.iloc[:, 1:]
encoded_df.head()

Unnamed: 0,Milk,NaN,Meat,Pencil,Bread,Eggs,Bagel,Cheese,Diaper
0,0,0,1,1,1,1,0,1,1
1,1,0,1,1,1,0,0,1,1
2,1,1,1,0,0,1,0,1,0
3,1,1,1,0,0,1,0,1,0
4,0,1,1,1,0,0,0,0,0


Since, the encoded dataframe consist of the empty column. We will drop the NaN column or select all columns other than the first column.

## Apriori Algorithm

We will use appriori algorithm to determine the frequently purchased products.
For this case study, we will min_support=0.2

In [25]:
#Set threshold value untuk digunakan dalam penghitungan support
from mlxtend.frequent_patterns import apriori, association_rules

# Apply Apriori algorithm to find frequent itemsets with min_support=0.2
frequent_itemsets = apriori(encoded_df, min_support=0.2, use_colnames=True)

# Show the result
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.501587,(Milk)
1,0.869841,(nan)
2,0.47619,(Meat)
3,0.361905,(Pencil)
4,0.504762,(Bread)
5,0.438095,(Eggs)
6,0.425397,(Bagel)
7,0.501587,(Cheese)
8,0.406349,(Diaper)
9,0.409524,"(Milk, nan)"


The we will generate association rule of the frequent itemset based on confidence level with the threshold=0.6

In [26]:
associationrules_df = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)
associationrules_df

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Milk),(nan),0.501587,0.869841,0.409524,0.816456,0.938626,-0.026778,0.709141,-0.115976
1,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053
2,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053
3,(Meat),(nan),0.47619,0.869841,0.368254,0.773333,0.889051,-0.045956,0.57423,-0.192405
4,(Pencil),(nan),0.361905,0.869841,0.266667,0.736842,0.8471,-0.048133,0.494603,-0.220499
5,(Bread),(nan),0.504762,0.869841,0.396825,0.786164,0.903801,-0.042237,0.608683,-0.176903
6,(Eggs),(nan),0.438095,0.869841,0.336508,0.768116,0.883053,-0.044565,0.56131,-0.190735
7,(Bagel),(nan),0.425397,0.869841,0.336508,0.791045,0.909413,-0.03352,0.622902,-0.147743
8,(Cheese),(nan),0.501587,0.869841,0.393651,0.78481,0.902245,-0.042651,0.604855,-0.178565
9,(Diaper),(nan),0.406349,0.869841,0.31746,0.78125,0.898152,-0.035999,0.595011,-0.160381


Provide explanation about __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__, __conviction__, __conviction__ and the interpretation from the case above (please use text section)

# Metrik dalam Analisis Hubungan Data
#Antecedent Support (Dukungan Awal):
###Frekuensi kemunculan item sebelum tanda panah (sebagai antecedent) dalam dataset.

#1. Consequent Support (Dukungan Akhir):
###Frekuensi kemunculan item setelah tanda panah (sebagai consequent) dalam dataset.

#2. Support (Dukungan):
###Mengukur seberapa sering antecedent dan consequent muncul bersamaan dalam dataset.

#3. Confidence (Kepercayaan):
###menunjukkan kekuatan aturan; semakin tinggi nilainya, semakin mungkin aturan tersebut benar.

#4. Lift (Korelasi):
###Nilai lift > 1 menunjukkan korelasi positif (item sering muncul bersama). Lift < 1 berarti korelasi negatif.

#5. Leverage (Pengaruh):
###Nilai leverage mendekati 0 berarti antecedent dan consequent independen. Nilai positif menunjukkan hubungan kuat, negatif berarti hubungan lemah.

#6.Conviction (Keyakinan):
###Nilai conviction > 1 menunjukkan hubungan erat antara antecedent dan consequent.

#7. Zhang's Metric:
###Metrik ini bernilai antara 0 dan 1, dengan nilai lebih tinggi menunjukkan hubungan yang lebih kuat.

#Interpretasi Hasil
#Kepercayaan Tinggi:
###Contohnya, aturan Eggs → Meat memiliki confidence 0.608696, artinya jika "Eggs" dibeli, ada kemungkinan 60,87% bahwa "Meat" juga dibeli.

#Lift > 1:
###Lift 1.278261 pada aturan yang sama menunjukkan pembelian "Eggs" meningkatkan peluang pembelian "Meat" lebih tinggi dari sekadar kebetulan.

#Leverage Positif:
###Leverage 0.058050 menunjukkan bahwa "Eggs" dan "Meat" lebih sering dibeli bersamaan daripada secara acak.

#Conviction Tinggi:
###Nilai conviction 1.338624 memperkuat bahwa Eggs → Meat memiliki hubungan positif.

#Zhang's Metric:
###Nilai 0.387409 mengindikasikan kekuatan hubungan yang konsisten antara "Eggs" dan "Meat."

#Kesimpulan
###Metrik-metrik ini membantu mengidentifikasi hubungan kuat antara produk dalam analisis keranjang belanja. Hasilnya dapat dimanfaatkan untuk strategi bisnis, seperti promosi atau penjualan silang berdasarkan kombinasi produk yang sering dibeli bersama.
