In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy
import os
import zipfile
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

## Load the datasets

In [2]:
order_products = pd.read_csv('order_products__train.csv')

In [3]:
products = pd.read_csv('products.csv')

In [4]:
aisles = pd.read_csv('aisles.csv')

In [5]:
dept = pd.read_csv('departments.csv')

In [6]:
orders = pd.read_csv('orders.csv')

In [7]:
df1 = order_products

In [8]:
df2 = df1.merge(products, on='product_id')

In [9]:
df3 = df2.merge(aisles, on='aisle_id')

In [10]:
df4 = df3.merge(orders, on='order_id')

In [11]:
df5 = df4.merge(dept, on='department_id')

In [12]:
data = df5

In [13]:
data.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,department
0,1,49302,1,1,Bulgarian Yogurt,120,16,yogurt,112108,train,4,4,10,9.0,dairy eggs
1,1,11109,2,1,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16,other creams cheeses,112108,train,4,4,10,9.0,dairy eggs
2,1,22035,8,1,Organic Whole String Cheese,21,16,packaged cheese,112108,train,4,4,10,9.0,dairy eggs
3,816049,49302,7,1,Bulgarian Yogurt,120,16,yogurt,47901,train,14,4,6,16.0,dairy eggs
4,816049,35176,5,1,Cream Cheese Cream Cheese Spread,108,16,other creams cheeses,47901,train,14,4,6,16.0,dairy eggs


## Sampling data using 100 most frequent products

In [14]:
product_counts = data.groupby('product_id')['order_id'].count().reset_index().rename(columns = {'order_id':'frequency'})
product_counts = product_counts.sort_values('frequency', ascending=False)[0:100].reset_index(drop=True)
product_counts.head(10)

Unnamed: 0,product_id,frequency
0,24852,18726
1,13176,15480
2,21137,10894
3,21903,9784
4,47626,8135
5,47766,7409
6,47209,7293
7,16797,6494
8,26209,6033
9,27966,5546


In [15]:
freq_products = list(product_counts.product_id)
freq_products[1:10]


[13176, 21137, 21903, 47626, 47766, 47209, 16797, 26209, 27966]

In [16]:
order_products = data[data.product_id.isin(freq_products)]
order_products.shape

(314227, 15)

In [17]:
basket = order_products.pivot_table(columns='product_name', values='reordered', index='order_id').reset_index().fillna(0).set_index('order_id')

In [18]:
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1
    
basket = basket.applymap(encode_units)
basket.head()

product_name,100% Whole Wheat Bread,2% Reduced Fat Milk,Apple Honeycrisp Organic,Asparagus,Bag of Organic Bananas,Banana,Blueberries,Boneless Skinless Chicken Breasts,Broccoli Crown,Bunched Cilantro,...,Sparkling Lemon Water,Sparkling Natural Mineral Water,Sparkling Water Grapefruit,Spring Water,Strawberries,Uncured Genoa Salami,Unsalted Butter,Unsweetened Almondmilk,Unsweetened Original Almond Breeze Almond Milk,Yellow Onions
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
38,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [19]:
shortbasket = basket[:100000]


## Frequent Itemsets Mining


In [20]:
import warnings
warnings.filterwarnings("ignore")

In [21]:
frequent_items = apriori(shortbasket, min_support=0.01, use_colnames=True)
frequent_items.head(10)

Unnamed: 0,support,itemsets
0,0.018668,(100% Whole Wheat Bread)
1,0.013298,(2% Reduced Fat Milk)
2,0.017304,(Apple Honeycrisp Organic)
3,0.02618,(Asparagus)
4,0.142376,(Bag of Organic Bananas)
5,0.17642,(Banana)
6,0.014992,(Blueberries)
7,0.01609,(Boneless Skinless Chicken Breasts)
8,0.017805,(Broccoli Crown)
9,0.012168,(Bunched Cilantro)


## Association Rules Mining

In [22]:
rules = association_rules(frequent_items, metric='lift', min_threshold=1)
rules.sort_values('lift', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
19,(Limes),(Large Lemon),0.045115,0.063111,0.010325,0.228862,3.626315,0.007478,1.214942,0.758455
18,(Large Lemon),(Limes),0.063111,0.045115,0.010325,0.1636,3.626315,0.007478,1.141661,0.773025
30,(Organic Strawberries),(Organic Raspberries),0.091668,0.045594,0.013074,0.142625,3.128143,0.008895,1.113172,0.748979
31,(Organic Raspberries),(Organic Strawberries),0.045594,0.091668,0.013074,0.286749,3.128143,0.008895,1.273511,0.712822
21,(Organic Avocado),(Large Lemon),0.06634,0.063111,0.010293,0.155156,2.458445,0.006106,1.108948,0.635391
20,(Large Lemon),(Organic Avocado),0.063111,0.06634,0.010293,0.163093,2.458445,0.006106,1.115608,0.633201
3,(Organic Hass Avocado),(Bag of Organic Bananas),0.064379,0.142376,0.021449,0.333168,2.340054,0.012283,1.286117,0.612064
2,(Bag of Organic Bananas),(Organic Hass Avocado),0.142376,0.064379,0.021449,0.150651,2.340054,0.012283,1.101574,0.667728
4,(Bag of Organic Bananas),(Organic Raspberries),0.142376,0.045594,0.014811,0.104026,2.281578,0.008319,1.065217,0.654957
5,(Organic Raspberries),(Bag of Organic Bananas),0.045594,0.142376,0.014811,0.324842,2.281578,0.008319,1.270257,0.588541


### Filtering the reults based on threshold values for confidence and lift

In [23]:
rules[(rules['confidence']>0.2) & (rules['lift']>1)]


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Organic Baby Spinach),(Bag of Organic Bananas),0.085828,0.142376,0.018498,0.215518,1.513725,0.006278,1.093236,0.371241
3,(Organic Hass Avocado),(Bag of Organic Bananas),0.064379,0.142376,0.021449,0.333168,2.340054,0.012283,1.286117,0.612064
5,(Organic Raspberries),(Bag of Organic Bananas),0.045594,0.142376,0.014811,0.324842,2.281578,0.008319,1.270257,0.588541
6,(Organic Strawberries),(Bag of Organic Bananas),0.091668,0.142376,0.025456,0.277694,1.950424,0.012404,1.187341,0.536468
8,(Large Lemon),(Banana),0.063111,0.17642,0.016537,0.262029,1.48526,0.005403,1.116007,0.348726
11,(Organic Avocado),(Banana),0.06634,0.17642,0.019318,0.291198,1.650598,0.007614,1.161933,0.422165
16,(Strawberries),(Banana),0.050996,0.17642,0.015248,0.298997,1.694804,0.006251,1.17486,0.431991
19,(Limes),(Large Lemon),0.045115,0.063111,0.010325,0.228862,3.626315,0.007478,1.214942,0.758455
31,(Organic Raspberries),(Organic Strawberries),0.045594,0.091668,0.013074,0.286749,3.128143,0.008895,1.273511,0.712822


## Recommendations:

1. Utilize the association rules to recommend complementary or frequently co-purchased items. For example, when a customer purchases "Organic Hass Avocado," recommend them to also buy "Bag of Organic Bananas." This strategy can increase the average order value and encourage customers to explore additional products, leading to increased sales and customer satisfaction. We could alos try placing the frequently co-purchased items nearby in the store.

2. By analyzing frequent item associations and understanding customers' purchasing patterns, businesses can identify distinct customer segments. With this knowledge, they can create targeted marketing messages and offers that emphasize the relationships between items. For example, a campaign could be designed to promote "Limes" and "Large Lemon" together, specifically targeting customers who have previously purchased either of these items. This personalized approach has the potential to boost customer engagement, foster repeat purchases, and cultivate stronger brand loyalty.