In [2]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules

import matplotlib.pyplot as plt


df = pd.read_csv('retail_dataset.csv', sep=',')

## Print top 5 rows
# print(df.head(5))


#Each row of the dataset represents items that were purchased together on the same day at the same store.
# The dataset is a sparse dataset as relatively high percentage of data is NA or NaN or equivalent.
#These NaNs make it hard to read the table.
# Let’s find out how many unique items are actually there in the table.

items = (df['0'].unique())
print("Unique items:", items)

#There are only 9 items in total that make up the entire dataset.


#############Data Preprocessing

#To make use of the apriori module given by mlxtend library, we need to convert the dataset according to
# it’s liking. apriori module requires a dataframe that has either 0 and 1 or True and False as data.
# The data we have is all string (name of items), we need to One Hot Encode the data.


itemset = set(items)
encoded_vals = []
for index, row in df.iterrows():
    rowset = set(row)
    labels = {}
    uncommons = list(itemset - rowset)
    commons = list(itemset.intersection(rowset))
    for uc in uncommons:
        labels[uc] = 0
    for com in commons:
        labels[com] = 1
    encoded_vals.append(labels)

print(encoded_vals[0])
ohe_df = pd.DataFrame(encoded_vals)




####################### Applying Apriori

freq_items = apriori(ohe_df, min_support=0.2, use_colnames=True, verbose=1)
print("frequent Items:")
print(freq_items.head(7))


##########################  Mining Association Rules

rules = association_rules(freq_items, metric="confidence", min_threshold=0.6)
print("Rules: ")
print(rules.head())



        0       1     2       3       4       5       6
0   Bread    Wine  Eggs    Meat  Cheese  Pencil  Diaper
1   Bread  Cheese  Meat  Diaper    Wine    Milk  Pencil
2  Cheese    Meat  Eggs    Milk    Wine     NaN     NaN
3  Cheese    Meat  Eggs    Milk    Wine     NaN     NaN
4    Meat  Pencil  Wine     NaN     NaN     NaN     NaN
Unique items: ['Bread' 'Cheese' 'Meat' 'Eggs' 'Wine' 'Bagel' 'Pencil' 'Diaper' 'Milk']
{'Bagel': 0, 'Milk': 0, 'Bread': 1, 'Meat': 1, 'Pencil': 1, 'Cheese': 1, 'Eggs': 1, 'Diaper': 1, 'Wine': 1}
Processing 4 combinations | Sampling itemset size 4 3
frequent Items:
    support  itemsets
0  0.425397   (Bagel)
1  0.501587    (Milk)
2  0.504762   (Bread)
3  0.476190    (Meat)
4  0.361905  (Pencil)
5  0.501587  (Cheese)
6  0.438095    (Eggs)
Rules: 
  antecedents consequents  antecedent support  consequent support   support  \
0     (Bagel)     (Bread)            0.425397            0.504762  0.279365   
1    (Cheese)      (Milk)            0.501587            

