In [2]:
import pandas as pd
import numpy as np

from apyori import apriori
from mlxtend.frequent_patterns import association_rules

In [3]:
# Load the pickled shopping carts and the cleaned product inventory (CSV).
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
carts_df = pd.read_pickle("../../data/tesco/tesco_carts.pickle")
inventory_df = pd.read_csv("../../data/tesco/tesco_inventory_clean.csv")

In [3]:
# Pull the "cart" column out as a plain Python list; this list is what gets
# passed to apriori as the collection of transactions.
carts = carts_df["cart"].tolist()

# Triplets

    I chose min_support=0.001, meaning an itemset must occur in at least 0.1% of carts; this was a rough guess of how often triplets occur in the data.
    I chose min_confidence=0.5 to filter out sets that occur together by chance and to return mostly triplets.
    I chose min_lift=5 because the marketing department is interested in strong associations.

In [5]:
# Mine the carts with the thresholds motivated above and materialise the
# result into a list so it can be displayed and iterated more than once.
# NOTE(review): this rebinds `association_rules`, which was imported from
# mlxtend at the top of the notebook; the mlxtend function is unreachable
# under that name from here on.
association_rules = list(
    apriori(
        carts,
        min_support=0.001,
        min_confidence=0.5,
        min_lift=5,
    )
)

In [6]:
# Show the raw result: each RelationRecord holds the itemset, its support, and
# the ordered statistics (base items, added items, confidence, lift).
association_rules

[RelationRecord(items=frozenset({50501610, 50502269}), support=0.00189411050410001, ordered_statistics=[OrderedStatistic(items_base=frozenset({50501610}), items_add=frozenset({50502269}), confidence=0.5247680728163837, lift=5.917749115988009)]),
 RelationRecord(items=frozenset({50623419, 50502269}), support=0.0016116997651631506, ordered_statistics=[OrderedStatistic(items_base=frozenset({50623419}), items_add=frozenset({50502269}), confidence=0.5310158201498751, lift=5.988204243071593)]),
 RelationRecord(items=frozenset({50502436, 50502269, 50652534}), support=0.0038084383317928154, ordered_statistics=[OrderedStatistic(items_base=frozenset({50502269, 50652534}), items_add=frozenset({50502436}), confidence=0.538022134951803, lift=9.873986587325048)]),
 RelationRecord(items=frozenset({50503441, 50502436, 50652534}), support=0.002493048715536571, ordered_statistics=[OrderedStatistic(items_base=frozenset({50503441, 50652534}), items_add=frozenset({50502436}), confidence=0.5772381509654768,

In [7]:
# Translate every mined itemset with more than two products into
# human-readable product descriptions, print them, and keep them in
# `association_sets` for later use.

# Build the product_id -> description mapping once, so each lookup is a dict
# access instead of a full boolean-mask scan of inventory_df per product.
# drop_duplicates keeps the first row per product_id, which matches the
# previous `.values[0]` behaviour when an id appears more than once.
description_by_id = (
    inventory_df.drop_duplicates(subset="product_id")
    .set_index("product_id")["description"]
    .to_dict()
)

association_sets = []
for record in association_rules:
    # record[0] is the `items` frozenset of the RelationRecord.
    cur_set = list(record[0])
    if len(cur_set) > 2:
        # Use a placeholder instead of raising IndexError when a product id
        # found in the carts is missing from the inventory table.
        items = [
            description_by_id.get(val, f"<unknown product {val}>")
            for val in cur_set
        ]
        association_sets.append(items)
        print(items)
        print()

['Tesco Whole Cucumber Each', 'Tesco Bananas Loose', 'Tesco Iceberg Lettuce Each']

['Tesco White Potatoes 2.5Kg', 'Tesco Whole Cucumber Each', 'Tesco Iceberg Lettuce Each']

['Tesco Cauliflower Each', 'Tesco Whole Cucumber Each', 'Tesco Iceberg Lettuce Each']

['Tesco Mixed Peppers 3 Pack', 'Tesco Whole Cucumber Each', 'Tesco Iceberg Lettuce Each']

['Tesco British Whole Milk 2.272L/4 Pints', 'Tesco Whole Cucumber Each', 'Tesco Iceberg Lettuce Each']

['Tesco British Semi Skimmed Milk 2.272Ltr 4 Pints', 'Tesco Whole Cucumber Each', 'Tesco Iceberg Lettuce Each']

['Tesco Broccoli 335G', 'Tesco Whole Cucumber Each', 'Tesco Iceberg Lettuce Each']

['Tesco Bunched Spring Onions 100G', 'Tesco Whole Cucumber Each', 'Tesco Iceberg Lettuce Each']

['Tesco Whole Cucumber Each', 'Tesco Iceberg Lettuce Each', 'Tesco Cherry Tomatoes 330G']

['Tesco Mixed Peppers 3 Pack', 'Tesco Whole Cucumber Each', 'Tesco Cherry Tomatoes 330G']

['Tesco Bunched Spring Onions 100G', 'Tesco Whole Cucumber Each', '

# Pairs

    I chose min_support=0.01 because the marketing department is interested in sets occurring in at least 1% of carts.
    I chose min_lift=2 to filter out sets of len=1, i.e. standalone purchases.

In [None]:
# Mine itemsets with a minimum support of 1% and a minimum lift of 2, and
# materialise the result into a list so it can be displayed and reused.
frequent_itemsets = list(apriori(carts, min_support=0.01, min_lift=2))

In [12]:
# Show the frequent pairs together with their support, confidence and lift.
frequent_itemsets

[RelationRecord(items=frozenset({50502436, 50502269}), support=0.01637540031880192, ordered_statistics=[OrderedStatistic(items_base=frozenset({50502269}), items_add=frozenset({50502436}), confidence=0.18466350332720616, lift=3.3890147571430838), OrderedStatistic(items_base=frozenset({50502436}), items_add=frozenset({50502269}), confidence=0.30052756681546755, lift=3.3890147571430833)]),
 RelationRecord(items=frozenset({50502436, 50652534}), support=0.01153777385786337, ordered_statistics=[OrderedStatistic(items_base=frozenset({50502436}), items_add=frozenset({50652534}), confidence=0.21174560844106904, lift=8.542365315444068), OrderedStatistic(items_base=frozenset({50652534}), items_add=frozenset({50502436}), confidence=0.46546362848549727, lift=8.542365315444068)])]

#### 1st frequent pair

In [13]:
# Inventory row(s) for product 50502436, one half of the 1st frequent pair.
inventory_df[inventory_df["product_id"] == 50502436]

Unnamed: 0,product_id,category,description,ingredients,energy,fat,saturates,salt,sugars,protein,carbohydrate,fibre,avg_price
186,50502436,fruit_veg,Tesco Whole Cucumber Each,no_ingredients,11.0,0.1,0.1,0.01,1.4,0.7,1.5,0.6,0.74


In [14]:
# Inventory row(s) for product 50502269, the other half of the 1st frequent pair.
inventory_df[inventory_df["product_id"] == 50502269]

Unnamed: 0,product_id,category,description,ingredients,energy,fat,saturates,salt,sugars,protein,carbohydrate,fibre,avg_price
8197,50502269,fruit_veg,Tesco Bananas Loose,no_ingredients,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.155263


#### 2nd frequent pair

In [15]:
# Inventory row(s) for product 50502436, which also appears in the 2nd frequent
# pair (same product as in the 1st pair).
inventory_df[inventory_df["product_id"] == 50502436]

Unnamed: 0,product_id,category,description,ingredients,energy,fat,saturates,salt,sugars,protein,carbohydrate,fibre,avg_price
186,50502436,fruit_veg,Tesco Whole Cucumber Each,no_ingredients,11.0,0.1,0.1,0.01,1.4,0.7,1.5,0.6,0.74


In [16]:
# Inventory row(s) for product 50652534, the other half of the 2nd frequent pair.
inventory_df[inventory_df["product_id"] == 50652534]

Unnamed: 0,product_id,category,description,ingredients,energy,fat,saturates,salt,sugars,protein,carbohydrate,fibre,avg_price
2793,50652534,fruit_veg,Tesco Iceberg Lettuce Each,no_ingredients,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.99
