<a href="https://colab.research.google.com/github/IshanKapadia-Data/Instacart-Market-Basket-Analysis/blob/main/Market_Basket_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Load the six raw tables from the working directory, in the same order as
# the names on the left-hand side.
aisles, departments, products, orders, prior, train = (
    pd.read_csv(csv_path)
    for csv_path in (
        'aisles.csv',
        'departments.csv',
        'products.csv',
        'orders.csv',
        'order_products__prior.csv',
        'order_products__train.csv',
    )
)

In [None]:
# Stack the prior and train line items into a single frame with a fresh
# 0..n-1 index, then show its (rows, columns) shape.
order_products = pd.concat(objs=[prior, train], axis=0, ignore_index=True)
order_products.shape


(31922519, 4)

In [None]:
# Peek at the first five combined line items.
order_products.head(n=5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1.0,1.0
1,2,28985,2.0,1.0
2,2,9327,3.0,0.0
3,2,45918,4.0,1.0
4,2,30035,5.0,0.0


In [None]:
# Number of distinct products that appear in at least one line item.
order_products['product_id'].nunique()

49678

In [None]:
# Count line items per product, keep the 100 most frequent products and
# attach the product metadata (name, aisle, department) to each of them.
product_counts = (
    order_products
    .groupby('product_id')['order_id']
    .count()
    .rename('frequency')
    .reset_index()
    .sort_values('frequency', ascending=False)
    .head(100)
    .reset_index(drop=True)
    .merge(products, on='product_id', how='left')
)
product_counts.head(10)

Unnamed: 0,product_id,frequency,product_name,aisle_id,department_id
0,24852,463973,Banana,24,4
1,13176,372774,Bag of Organic Bananas,24,4
2,21137,260050,Organic Strawberries,24,4
3,21903,237625,Organic Baby Spinach,123,4
4,47209,208415,Organic Hass Avocado,24,4
5,47766,173998,Organic Avocado,24,4
6,47626,151965,Large Lemon,24,4
7,16797,141093,Strawberries,24,4
8,26209,138324,Limes,24,4
9,27845,134734,Organic Whole Milk,84,16


In [None]:
# Plain Python list of the retained product ids, most frequent first.
freq_products = product_counts['product_id'].tolist()
# NOTE(review): this preview starts at index 1, so the most frequent product
# (index 0) is not shown — confirm whether [:10] was intended.
freq_products[1:10]

[13176, 21137, 21903, 47209, 47766, 47626, 16797, 26209, 27845]

In [None]:
# Sanity check: how many product ids were retained for the basket analysis.
len(freq_products)

100

In [None]:
# Keep only the line items whose product is in the retained id list, then
# show how many rows survive the filter.
is_frequent = order_products['product_id'].isin(freq_products)
order_products = order_products.loc[is_frequent]
del is_frequent  # temporary mask; keep the notebook namespace unchanged
order_products.shape

(7357549, 4)

In [None]:
# Number of distinct orders left after the product filter; each of these
# becomes one row of the basket matrix.
order_products['order_id'].nunique()

2303311

In [None]:
# Attach the product metadata to every line item via a left join on
# product_id, so each row carries its product_name.
# NOTE(review): this overwrites order_products, so running the cell a second
# time joins the metadata again and produces suffixed duplicate columns.
order_products = pd.merge(order_products, products, how='left', on='product_id')
order_products.head(n=5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,2,28985,2.0,1.0,Michigan Organic Kale,83,4
1,2,17794,6.0,1.0,Carrots,83,4
2,3,24838,2.0,1.0,Unsweetened Almondmilk,91,16
3,3,21903,4.0,1.0,Organic Baby Spinach,123,4
4,3,46667,6.0,1.0,Organic Ginger Root,83,4


In [None]:
# Pivot the line items into an order x product matrix of purchase counts:
# one row per order_id, one column per product_name, 0 where the product is
# not in the order.
#
# The rows are counted with .size() rather than ['reordered'].count():
# count() only tallies non-null values, so a line item whose `reordered`
# field is missing would be recorded as "not purchased". size() counts the
# row itself and does not depend on any particular column being populated.
#
# unstack(fill_value=0) fills the absent order/product pairs directly, which
# keeps the matrix integer-typed and avoids the reset_index / fillna /
# set_index round trip over the full matrix.
basket = (
    order_products
    .groupby(['order_id', 'product_name'])
    .size()
    .unstack(fill_value=0)
)
basket.head()

product_name,100% Raw Coconut Water,100% Whole Wheat Bread,2% Reduced Fat Milk,Apple Honeycrisp Organic,Asparagus,Bag of Organic Bananas,Banana,Bartlett Pears,Blueberries,Boneless Skinless Chicken Breasts,...,Sparkling Natural Mineral Water,Sparkling Water Grapefruit,Spring Water,Strawberries,Uncured Genoa Salami,Unsalted Butter,Unsweetened Almondmilk,Unsweetened Original Almond Breeze Almond Milk,Whole Milk,Yellow Onions
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Drop the listed intermediate frames from the notebook namespace so their
# memory can be reclaimed.
# The names are popped straight from globals(): the existence check and the
# removal therefore always look at the same namespace, and a name that is
# already gone (for example because the cell was run twice) is skipped.
for var in ['product_counts', 'products', 'order_products', 'prior', 'train']:
    globals().pop(var, None)


In [None]:
# Binary encoding for the basket matrix: 1 if the product appears in the
# order at least once, 0 otherwise.
# The comparison is vectorized over the whole frame instead of calling a
# Python lambda once per cell, and it does not rely on DataFrame.map, which
# only exists in recent pandas releases. The result is the same 0/1 int64
# matrix.
basket = (basket >= 1).astype('int64')
basket.head()


product_name,100% Raw Coconut Water,100% Whole Wheat Bread,2% Reduced Fat Milk,Apple Honeycrisp Organic,Asparagus,Bag of Organic Bananas,Banana,Bartlett Pears,Blueberries,Boneless Skinless Chicken Breasts,...,Sparkling Natural Mineral Water,Sparkling Water Grapefruit,Spring Water,Strawberries,Uncured Genoa Salami,Unsalted Butter,Unsweetened Almondmilk,Unsweetened Original Almond Breeze Almond Milk,Whole Milk,Yellow Onions
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Total number of cells (rows x columns) in the basket matrix.
basket.size

230331100

In [None]:
# (number of orders, number of products) in the basket matrix.
basket.shape

(2303311, 100)

In [None]:
# Cast the 0/1 matrix to booleans, then mine every itemset that occurs in at
# least 1% of the orders (min_support=0.01). use_colnames=True reports the
# itemsets by product name instead of column position.
basket = basket.astype('bool')
frequent_items = apriori(
    basket,
    min_support=0.01,
    use_colnames=True,
    low_memory=True,
)
frequent_items.head(n=5)


Unnamed: 0,support,itemsets
0,0.016052,(100% Raw Coconut Water)
1,0.025837,(100% Whole Wheat Bread)
2,0.015807,(2% Reduced Fat Milk)
3,0.035673,(Apple Honeycrisp Organic)
4,0.02905,(Asparagus)


In [None]:
# Last five mined itemsets.
frequent_items.tail(n=5)

Unnamed: 0,support,itemsets
125,0.010237,"(Organic Blueberries, Organic Strawberries)"
126,0.01097,"(Organic Hass Avocado, Organic Raspberries)"
127,0.017302,"(Organic Hass Avocado, Organic Strawberries)"
128,0.014499,"(Organic Raspberries, Organic Strawberries)"
129,0.010141,"(Organic Whole Milk, Organic Strawberries)"


In [None]:
# (number of frequent itemsets, number of columns) returned by apriori.
frequent_items.shape

(130, 2)

In [None]:
# Derive association rules from the frequent itemsets, keeping only rules
# whose lift is at least 1, and list them from highest to lowest lift.
rules = association_rules(frequent_items, min_threshold=1, metric="lift")
rules.sort_values(by='lift', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
37,(Large Lemon),(Limes),0.065851,0.059934,0.011853,0.17999,3.003131,1.0,0.007906,1.146408,0.714034,0.104031,0.12771,0.188874
36,(Limes),(Large Lemon),0.059934,0.065851,0.011853,0.197759,3.003131,1.0,0.007906,1.164424,0.70954,0.104031,0.141206,0.188874
54,(Organic Raspberries),(Organic Strawberries),0.058278,0.112704,0.014499,0.248791,2.20747,1.0,0.007931,1.181157,0.580843,0.092656,0.153373,0.188719
55,(Organic Strawberries),(Organic Raspberries),0.112704,0.058278,0.014499,0.128648,2.20747,1.0,0.007931,1.080759,0.616472,0.092656,0.074724,0.188719
39,(Large Lemon),(Organic Avocado),0.065851,0.075404,0.010529,0.159895,2.120515,1.0,0.005564,1.100572,0.565666,0.080544,0.091381,0.149766
38,(Organic Avocado),(Large Lemon),0.075404,0.065851,0.010529,0.139638,2.120515,1.0,0.005564,1.085763,0.57151,0.080544,0.078988,0.149766
48,(Organic Blueberries),(Organic Strawberries),0.042963,0.112704,0.010237,0.238275,2.114163,1.0,0.005395,1.164851,0.550657,0.070391,0.141521,0.164553
49,(Organic Strawberries),(Organic Blueberries),0.112704,0.042963,0.010237,0.090831,2.114163,1.0,0.005395,1.05265,0.593939,0.070391,0.050017,0.164553
51,(Organic Raspberries),(Organic Hass Avocado),0.058278,0.090314,0.01097,0.188232,2.084192,1.0,0.005706,1.120623,0.55239,0.07971,0.107639,0.154848
50,(Organic Hass Avocado),(Organic Raspberries),0.090314,0.058278,0.01097,0.121463,2.084192,1.0,0.005706,1.071921,0.571843,0.07971,0.067095,0.154848
