In this notebook we build frequent itemsets and association rules for the Kaggle Instacart dataset. Reference: [this R notebook](https://www.kaggle.com/msp48731/frequent-itemsets-and-association-rules).

In [4]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Read the three Instacart CSV extracts in one pass; the targets on the left
# line up positionally with the paths below.
raw_orders, raw_order_lines, raw_prods = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/all/orders.csv',
        '../Data/all/order_products__prior.csv',
        '../Data/all/products.csv',
    )
)

In [5]:
# Peek at the first five order lines (one row per product within an order).
raw_order_lines.head(n=5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [6]:
# Peek at the first five order headers.
raw_orders.head(n=5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [7]:
# Peek at the first five rows of the product catalogue.
raw_prods.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [8]:
# Number of non-null values in each product column.
raw_prods.notna().sum()

product_id       49688
product_name     49688
aisle_id         49688
department_id    49688
dtype: int64

In [9]:
# Reduce the number of items being worked with: there are too many in the full
# set, so keep only the order lines that belong to the first N_ORDERS rows of
# raw_orders.
N_ORDERS = 1000
orderlist = raw_orders['order_id'].head(N_ORDERS).tolist()

# .assign() returns a new frame, so the quantity column is not written into a
# slice of raw_order_lines (which would raise SettingWithCopyWarning).
# Every order line gets quantity 1; this is the value summed when the basket
# matrix is built.
order_lines = (
    raw_order_lines
    .loc[raw_order_lines['order_id'].isin(orderlist)]
    .assign(quantity=1)
)
order_lines.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,quantity
0,2,33120,1,1,1
1,2,28985,2,1,1
2,2,9327,3,0,1
3,2,45918,4,1,1
4,2,30035,5,0,1


In [None]:
# Pivot the order lines into a basket matrix: one row per order_id, one column
# per product_id, True where the order contains the product. apriori accepts
# 0/1 or boolean input; the boolean form also tolerates summed quantities
# greater than 1.
basket = (order_lines
          .groupby(['order_id', 'product_id'])['quantity']
          .sum()
          .unstack(fill_value=0)
          .gt(0)
         )

# Build the product_id -> product_name lookup once instead of filtering
# raw_prods for every column. drop_duplicates keeps the first name per id.
product_names = (raw_prods
                 .drop_duplicates('product_id')
                 .set_index('product_id')['product_name']
                 .to_dict()
                )

# index=str converts the order_id labels to strings; columns are relabelled with
# product names. An id missing from raw_prods keeps its numeric label rather
# than raising.
basket = basket.rename(index=str, columns=product_names)


In [None]:
# Peek at the first five rows of the basket matrix.
basket.head(n=5)

In [87]:
# Mine frequent itemsets: keep every product combination whose support is at
# least 0.01, reported by column (product) name.
frequent_itemsets = apriori(
    df=basket,
    use_colnames=True,
    min_support=0.01,
)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.013733,(Cantaloupe)
1,0.011236,(Vanilla Almond Breeze Almond Milk)
2,0.011236,(Organic Broccoli Crowns)
3,0.013733,(Whole Milk)
4,0.019975,(Yellow Onions)
5,0.013733,(Shredded Parmesan)
6,0.021223,(Seedless Red Grapes)
7,0.018727,(100% Whole Wheat Bread)
8,0.022472,(Small Hass Avocado)
9,0.018727,(Organic Lemon)


In [90]:
# Derive association rules from the frequent itemsets, keeping only rules
# whose confidence is at least 0.05, then preview the first five.
rules = association_rules(
    frequent_itemsets,
    min_threshold=0.05,
    metric="confidence",
)
rules.head(n=5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Bag of Organic Bananas),(Original Hummus),0.127341,0.018727,0.011236,0.088235,4.711765,0.008851,1.076235
1,(Original Hummus),(Bag of Organic Bananas),0.018727,0.127341,0.011236,0.6,4.711765,0.008851,2.181648
2,(Strawberries),(Banana),0.047441,0.118602,0.014981,0.315789,2.662604,0.009355,1.288197
3,(Banana),(Strawberries),0.118602,0.047441,0.014981,0.126316,2.662604,0.009355,1.090279
4,(Organic Avocado),(Limes),0.058677,0.046192,0.011236,0.191489,4.145486,0.008526,1.17971
