In [1]:
# Import Packages

# Pandas for rading CSV files
import pandas as pd 

# Numpy for arrays, array is a much more efficient data structure than builtin python list for our purpose
import numpy as np 


# mlxtend is a machine learing library that contations alot of ml tools
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Import DataSets
order = pd.read_csv("../data/orders.csv")
product = pd.read_csv("../data/products.csv")
prior = pd.read_csv("../data/order_products__prior.csv")
train = pd.read_csv("../data/order_products__train.csv")

# Now we are merging prior and train datasets to get the complete order dataset.
train = train.append(prior,ignore_index = True)

train['reordered'] = 1 

In [3]:
productCount = train.groupby("product_id",as_index = False)["order_id"].count()

In [4]:
# Top 100 most frequently purchased products
freq_product = 100

productCount = productCount.sort_values("order_id",ascending = False)
topProduct = productCount.iloc[0:freq_product,:]
topProduct = topProduct.merge(product,on = "product_id")
productId= topProduct.loc[:,["product_id"]]



# Here order_id is the count, so we need to sort the data frame with repect to order_id

df = train[0:0]
for i in range(0,99):
    pId = productId.iloc[i]['product_id'] 
    stDf = train[train.product_id == pId ]
    df = df.append(stDf,ignore_index = False)

In [5]:
# Here we are defining the basket to analysis
basket = df.groupby(['order_id', 'product_id'])['reordered'].sum().unstack().reset_index().fillna(0).set_index('order_id')

In [6]:
# This function is used to convert database into a boolean database as seen below

def encode(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1    

In [7]:
# Here we encoding the basket df with our encode funtion
basket_sets = basket.applymap(encode)

In [17]:
# This is the size of the basket_sets
basket_sets.size

241667217

In [8]:
# Get frequent itemsets using apriori
frequent_itemsets = apriori(basket_sets, min_support=0.001, use_colnames=True, low_memory = True)

In [9]:
# Print frequnt items sets
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.015279,(196)
1,0.016088,(3957)
2,0.015144,(4210)
3,0.031514,(4605)
4,0.015439,(4799)
...,...,...
2518,0.001043,"(40706, 47626, 47766)"
2519,0.001013,"(47626, 47766, 45007)"
2520,0.001462,"(47626, 49683, 47766)"
2521,0.001439,"(13176, 47209, 21137, 21903)"


The final step is to generate the rules with their corresponding support, confidence and lift:

In [10]:
# Create the rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# Print rules
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(5876),(3957),0.037381,0.016088,0.001157,0.030948,1.923701,0.000555,1.015335
1,(3957),(5876),0.016088,0.037381,0.001157,0.071911,1.923701,0.000555,1.037205
2,(13176),(3957),0.161785,0.016088,0.003762,0.023252,1.445357,0.001159,1.007335
3,(3957),(13176),0.016088,0.161785,0.003762,0.233837,1.445357,0.001159,1.094043
4,(21137),(3957),0.112891,0.016088,0.002259,0.020013,1.243979,0.000443,1.004005
...,...,...,...,...,...,...,...,...,...
6417,"(21137, 27966)","(13176, 47209)",0.014556,0.026530,0.001662,0.114147,4.302641,0.001275,1.098908
6418,(13176),"(47209, 21137, 27966)",0.161785,0.003389,0.001662,0.010270,3.030382,0.001113,1.006953
6419,(47209),"(13176, 21137, 27966)",0.090483,0.005011,0.001662,0.018363,3.664351,0.001208,1.013602
6420,(21137),"(13176, 47209, 27966)",0.112891,0.004891,0.001662,0.014718,3.009076,0.001109,1.009974


In [16]:
# We can filter the dataframe using standard pandas code
rules[ (rules['lift'] >= 2) & (rules['confidence'] >= 0.4) ]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
3319,"(4605, 16797)",(24852),0.002171,0.201259,0.001021,0.470377,2.337169,0.000584,1.508131
3403,"(4920, 28204)",(24852),0.002436,0.201259,0.001143,0.469055,2.330598,0.000652,1.504375
3408,"(4920, 45066)",(24852),0.002709,0.201259,0.001174,0.433238,2.152632,0.000628,1.409304
3421,"(4920, 47766)",(24852),0.003384,0.201259,0.001409,0.416414,2.069043,0.000728,1.368678
3426,"(4920, 49683)",(24852),0.002387,0.201259,0.001051,0.440268,2.187563,0.00057,1.427005
3446,"(5876, 8277)",(13176),0.002505,0.161785,0.001007,0.40206,2.485155,0.000602,1.401839
3482,"(5876, 27966)",(13176),0.004337,0.161785,0.001762,0.406309,2.511417,0.001061,1.411871
3572,"(21137, 8174)",(13176),0.003079,0.161785,0.001345,0.436876,2.700356,0.000847,1.48851
3583,"(27966, 8174)",(13176),0.002255,0.161785,0.001119,0.496094,3.066386,0.000754,1.663437
3590,"(47209, 8174)",(13176),0.003491,0.161785,0.001628,0.466385,2.882751,0.001064,1.570824
