# Objective:

Imagine 10,000 receipts sitting on your table. Each receipt represents a transaction and lists the items that were purchased. A receipt is a record of what went into a customer’s basket, hence the name ‘Market Basket Analysis’.

That is exactly what the Groceries Data Set contains: a collection of receipts, with each line representing one receipt and the items purchased. Each line is called a transaction, and each column in a row represents an item. The data set is attached.

Your assignment is to use Python to mine the data for association rules. You should report support, confidence, and lift, along with your top 10 rules by lift.

Extra credit: do a simple cluster analysis on the data as well.  Use whichever packages you like.  

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

In [50]:
# The file has no header row: every line is one receipt, so let pandas
# number the item columns instead of consuming the first receipt as names.
data = pd.read_csv("GroceryDataSet.csv", header=None)

# Preview the first ten receipts.
data.head(n=10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,citrus fruit,semi-finished bread,margarine,ready soups,,,,,,,...,,,,,,,,,,
1,tropical fruit,yogurt,coffee,,,,,,,,...,,,,,,,,,,
2,whole milk,,,,,,,,,,...,,,,,,,,,,
3,pip fruit,yogurt,cream cheese,meat spreads,,,,,,,...,,,,,,,,,,
4,other vegetables,whole milk,condensed milk,long life bakery product,,,,,,,...,,,,,,,,,,
5,whole milk,butter,yogurt,rice,abrasive cleaner,,,,,,...,,,,,,,,,,
6,rolls/buns,,,,,,,,,,...,,,,,,,,,,
7,other vegetables,UHT-milk,rolls/buns,bottled beer,liquor (appetizer),,,,,,...,,,,,,,,,,
8,pot plants,,,,,,,,,,...,,,,,,,,,,
9,whole milk,cereals,,,,,,,,,...,,,,,,,,,,


In [16]:
# Convert the frame into a plain list of rows (one inner list per receipt).
transactions = data.to_numpy().tolist()

In [20]:
# Drop the missing-value entries from every row so each basket holds only
# the values that are actually present.
transactions = [
    [product for product in basket if not pd.isna(product)]
    for basket in transactions
]

Reference — mlxtend `TransactionEncoder` user guide: https://rasbt.github.io/mlxtend/user_guide/preprocessing/TransactionEncoder/

In [115]:
from mlxtend.preprocessing import TransactionEncoder

# Encode the baskets as a boolean matrix: one row per transaction,
# one column per distinct item, True where the item is in the basket.
te = TransactionEncoder()
te.fit(transactions)                 # learn the item vocabulary (te.columns_)
te_data = te.transform(transactions)

# Wrap the matrix in a DataFrame labelled with the item names.
df = pd.DataFrame(data=te_data, columns=te.columns_)
df

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,baby food,bags,baking powder,bathroom cleaner,beef,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9830,False,False,False,False,False,False,False,False,False,True,...,False,False,False,True,False,False,False,True,False,False
9831,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9832,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
9833,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [123]:
from mlxtend.frequent_patterns import apriori, association_rules

# Keep only itemsets whose support (fraction of transactions containing
# them) is at least 0.01; use_colnames reports item names, not column indices.
frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)

# Derive rules and filter them on lift. min_threshold=0.8 is the library
# default, spelled out here because it applies to lift: rules with a lift
# below 1 (down to 0.8) are still retained.
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=0.8)

rules.head()


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(beef),(other vegetables),0.052466,0.193493,0.019725,0.375969,1.943066,0.009574,1.292416,0.512224
1,(other vegetables),(beef),0.193493,0.052466,0.019725,0.101944,1.943066,0.009574,1.055095,0.601792
2,(rolls/buns),(beef),0.183935,0.052466,0.013625,0.074074,1.411858,0.003975,1.023337,0.357463
3,(beef),(rolls/buns),0.052466,0.183935,0.013625,0.25969,1.411858,0.003975,1.102329,0.307866
4,(root vegetables),(beef),0.108998,0.052466,0.017387,0.159515,3.040367,0.011668,1.127366,0.753189


In [136]:
# Report the ten strongest rules by lift, showing each association once.
#
# Lift is symmetric, so every rule A -> B has a mirror B -> A with the same
# lift. The earlier version removed mirrors by taking every other row of the
# sorted top 20 (`[::2]`). That only works when each mirror pair happens to
# occupy adjacent rows starting at an even offset; a lift tie between two
# different pairs, or a change in the upstream filter, silently drops real
# rules or keeps both mirrors. Mirrors are now identified by content.
report_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
top_n = 10

# Stable sort: rows with equal lift keep their order from `rules`, so the
# direction kept for a mirrored pair is reproducible across runs.
rules_by_lift = rules.sort_values(by='lift', ascending=False, kind='stable')

# Kept under its original name and shape (top 20 rows, reporting columns)
# in case other cells read it.
rules_sorted = rules_by_lift.head(2 * top_n)[report_columns]

# Direction-independent key: the unordered pair {antecedents, consequents}.
# Both sides are frozensets, so a frozenset of the two is hashable and is
# identical for A -> B and B -> A.
mirror_key = pd.Series(
    [
        frozenset((lhs, rhs))
        for lhs, rhs in zip(rules_by_lift['antecedents'], rules_by_lift['consequents'])
    ],
    index=rules_by_lift.index,
    dtype=object,
)

# Keep the first (highest-lift) occurrence of each association, then take
# the top N and renumber the rows 0..N-1.
top_10_rules = (
    rules_by_lift.loc[~mirror_key.duplicated(), report_columns]
    .head(top_n)
    .reset_index(drop=True)
)

top_10_rules

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(curd),"(whole milk, yogurt)",0.010066,0.188931,3.372304
1,(root vegetables),"(citrus fruit, other vegetables)",0.010371,0.095149,3.295045
2,"(other vegetables, yogurt)",(whipped/sour cream),0.010168,0.234192,3.267062
3,"(tropical fruit, other vegetables)",(root vegetables),0.012303,0.342776,3.14478
4,(root vegetables),(beef),0.017387,0.159515,3.040367
5,"(root vegetables, citrus fruit)",(other vegetables),0.010371,0.586207,3.029608
6,(other vegetables),"(tropical fruit, root vegetables)",0.012303,0.063584,3.020999
7,"(whole milk, other vegetables)",(root vegetables),0.023183,0.309783,2.842082
8,(butter),"(whole milk, other vegetables)",0.01149,0.207339,2.77063
9,(yogurt),"(curd, whole milk)",0.010066,0.072157,2.761356
