# Imports

In [26]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import random

# Load Data

In [27]:
# Read the raw purchase records from a path relative to the notebook's folder.
# Later cells read the "DateTime", "Transaction" and "Item" columns of this frame.
df = pd.read_csv("../datasets/BreadBasket_DMS_DT.csv")

# Business Problem

## Non-modified dataset

We limit the transactions to those made on Saturdays (the most common day). Our idea: to offer discounts on common pairs or triples of items in order to encourage cross-selling.

In [28]:
# Parse the timestamp column as timezone-aware (UTC) datetimes, overwriting it in place.
df["DateTime"] = pd.to_datetime(df["DateTime"], utc=True)

# Keep only the rows whose weekday name is "Saturday".
# NOTE(review): the transaction-building cell below iterates over `df`, not `df2`.
is_saturday = df["DateTime"].dt.day_name().eq("Saturday")
df2 = df.loc[is_saturday]

We obtain a list of transactions. We iterate over the unique transaction ids in our dataframe, creating a list for each one and adding to it the items purchased under that transaction id.
Once the list is ready, we convert it into a set to get rid of repeated items and then cast it back to a list.

In [29]:
# Build one de-duplicated item list per transaction id.
#
# A single groupby pass replaces the previous approach of re-filtering the whole
# frame once per id (cost proportional to ids x rows).
# sort=False keeps the ids in first-appearance order, matching Series.unique().
# Rows with a missing transaction id are skipped by groupby; the old loop
# produced an empty basket for them instead.
#
# NOTE(review): this reads `df` (all days), not the Saturday-only `df2` built
# above; confirm which one the analysis is meant to use.
transactions = [
    list(set(items))
    for _, items in df.groupby("Transaction", sort=False)["Item"]
]

We convert this list into a 0/1 array (we could also leave it as a True/False array, since mlxtend's `apriori` works with both).

In [30]:
# One-hot encode the baskets: one row per transaction, one column per item.
te = TransactionEncoder()
encodedData = te.fit(transactions).transform(transactions)

# Cast the boolean matrix to 0/1 integers explicitly. This avoids the in-place
# `replace({False: 0, True: 1})`, which depends on pandas inferring a new dtype
# after the replacement (a behaviour newer pandas versions warn about).
basketData = pd.DataFrame(encodedData, columns=te.columns_).astype(int)
basketData

Unnamed: 0,Adjustment,Afternoon with the baker,Alfajores,Argentina Night,Art Tray,Bacon,Baguette,Bakewell,Bare Popcorn,Basket,...,The BART,The Nomad,Tiffin,Toast,Truffles,Tshirt,Valentine's card,Vegan Feast,Vegan mincepie,Victorian Sponge
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9526,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9527,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
9528,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9529,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Mine itemsets whose support is at least 0.01 and keep them ordered from the
# most to the least frequent.
freq_items = (
    apriori(basketData, min_support=0.01, use_colnames=True, verbose=1)
    .sort_values(by="support", ascending=False)
)
freq_items

Processing 720 combinations | Sampling itemset size 3


Unnamed: 0,support,itemsets
6,0.475081,(Coffee)
2,0.324940,(Bread)
27,0.141643,(Tea)
4,0.103137,(Cake)
35,0.089393,"(Coffee, Bread)"
...,...,...
57,0.010807,"(Coffee, Spanish Brunch)"
33,0.010702,"(Bread, Brownie)"
11,0.010492,(Hearty & Seasonal)
21,0.010387,(Salad)


In [32]:
# Derive every rule (confidence threshold 0 keeps them all), ordered by
# descending confidence.
df_ar = (
    association_rules(freq_items, metric="confidence", min_threshold=0)
    .sort_values(by="confidence", ascending=False)
)

# Show only rules with support > 0.15, confidence > 0.5 and lift > 1.
df_ar.loc[
    lambda rules: (rules["support"] > 0.15)
    & (rules["confidence"] > 0.5)
    & (rules["lift"] > 1)
]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction


In [33]:
# Display the complete rule table (already sorted by descending confidence in
# the previous cell), without the support/confidence/lift filter.
df_ar

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
24,(Toast),(Coffee),0.033365,0.475081,0.023502,0.704403,1.482699,0.007651,1.775789
63,(Spanish Brunch),(Coffee),0.018046,0.475081,0.010807,0.598837,1.260494,0.002233,1.308493
13,(Medialuna),(Coffee),0.061379,0.475081,0.034939,0.569231,1.198175,0.005779,1.218561
7,(Pastry),(Coffee),0.085510,0.475081,0.047214,0.552147,1.162216,0.006590,1.172079
35,(Alfajores),(Coffee),0.036093,0.475081,0.019515,0.540698,1.138116,0.002368,1.142861
...,...,...,...,...,...,...,...,...,...
46,(Coffee),(Soup),0.475081,0.034204,0.015738,0.033127,0.968514,-0.000512,0.998886
64,(Bread),(Brownie),0.324940,0.039765,0.010702,0.032935,0.828244,-0.002219,0.992938
66,(Bread),(Alfajores),0.324940,0.036093,0.010282,0.031644,0.876728,-0.001446,0.995405
59,(Coffee),"(Bread, Pastry)",0.475081,0.028958,0.011122,0.023410,0.808405,-0.002636,0.994319


Average number of products per transaction

In [34]:
# Mean basket size: total number of items across all baskets divided by the
# number of baskets.
lenSums = sum(len(basket) for basket in transactions)
averageLen = lenSums / len(transactions)
averageLen

2.0606442136187177

Modify the dataset to make the problem work better

In [35]:
# Synthetically enrich the baskets:
#   * "Tea" is added to every even-indexed transaction,
#   * "Cake" is added when the random draw is below 90,
#   * "Muffin" is added when the random draw is above 10.
#
# Seed the generator so that Restart & Run All yields the same baskets.
random.seed(42)

for count, basket in enumerate(transactions):
    rand = random.randint(0, 100)

    # Work on a set so repeated items collapse automatically.
    items = set(basket)
    if count % 2 == 0:
        items.add("Tea")
    if rand < 90:
        items.add("Cake")
    if rand > 10:
        items.add("Muffin")

    # Write the result back by index. The previous code rebound the loop
    # variable (`i = list(set(i))`), so every append after the first rebinding
    # went to a temporary copy and never reached `transactions`.
    transactions[count] = list(items)

In [36]:
# Re-encode the modified baskets: one row per transaction, one column per item.
te = TransactionEncoder()
encodedData = te.fit(transactions).transform(transactions)

# Cast the boolean matrix to 0/1 integers explicitly. This avoids the in-place
# `replace({False: 0, True: 1})`, which depends on pandas inferring a new dtype
# after the replacement (a behaviour newer pandas versions warn about).
basketData = pd.DataFrame(encodedData, columns=te.columns_).astype(int)
basketData

Unnamed: 0,Adjustment,Afternoon with the baker,Alfajores,Argentina Night,Art Tray,Bacon,Baguette,Bakewell,Bare Popcorn,Basket,...,The BART,The Nomad,Tiffin,Toast,Truffles,Tshirt,Valentine's card,Vegan Feast,Vegan mincepie,Victorian Sponge
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9526,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9527,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
9528,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9529,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Replace the freshly encoded `basketData` from the previous cell with a matrix
# read from disk.
# NOTE(review): no visible cell writes "basketDataFinal.csv"; the only export
# (next cell) is commented out and targets "basketData.csv". A fresh
# Restart & Run All therefore needs this file to already exist.
basketData = pd.read_csv("basketDataFinal.csv")
basketData

Unnamed: 0,Adjustment,Afternoon with the baker,Alfajores,Argentina Night,Art Tray,Bacon,Baguette,Bakewell,Bare Popcorn,Basket,...,The BART,The Nomad,Tiffin,Toast,Truffles,Tshirt,Valentine's card,Vegan Feast,Vegan mincepie,Victorian Sponge
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9526,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9527,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
9528,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9529,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Disabled export of the encoded basket matrix. Note that the target file name
# ("basketData.csv") differs from the file read in the previous cell
# ("basketDataFinal.csv").
#basketData.to_csv("basketData.csv",index=False)

In [39]:
# Mine itemsets with support of at least 0.01 from the loaded basket matrix.
freq_items = apriori(
    basketData,
    min_support=0.01,
    use_colnames=True,
    verbose=1,
)

# Display them from most to least frequent; the sorted copy is only shown,
# `freq_items` itself keeps its original order.
freq_items.sort_values(by="support", ascending=False)

Processing 644 combinations | Sampling itemset size 43


Unnamed: 0,support,itemsets
4,0.898122,(Cake)
6,0.475081,(Coffee)
49,0.425769,"(Coffee, Cake)"
2,0.324940,(Bread)
36,0.293883,"(Cake, Bread)"
...,...,...
35,0.010702,"(Bread, Brownie)"
11,0.010492,(Hearty & Seasonal)
21,0.010387,(Salad)
85,0.010282,"(Muffin, Hot chocolate)"


In [40]:
# Derive every rule from the frequent itemsets (confidence threshold 0 keeps them all).
df_ar = association_rules(freq_items, metric="confidence", min_threshold=0)

# Keep rules with support > 0.15, confidence > 0.5 and lift > 1, and show them
# ordered by descending confidence.
df_ar.loc[
    lambda rules: (rules["support"] > 0.15)
    & (rules["confidence"] > 0.5)
    & (rules["lift"] > 1)
].sort_values(by="confidence", ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
11,(Bread),(Cake),0.32494,0.898122,0.293883,0.904424,1.007017,0.002048,1.065934


In [41]:
# Pricing inputs for the discount scenario.
# NOTE(review): presumably placeholder unit prices and candidate discount
# rates; none of them is used by the cells visible here.
coffeeCost, cakeCost = 1, 1
discounts = [
    0.1,
    0.15,
    0.2,
    0.25,
]

In [49]:
# Row masks for the three purchase patterns of interest.
coffeeOnlyMask = (basketData["Coffee"] == 1) & (basketData["Cake"] == 0)
cakeOnlyMask = (basketData["Cake"] == 1) & (basketData["Coffee"] == 0)
coffeeAndCakeMask = (basketData["Coffee"] == 1) & (basketData["Cake"] == 1)

# Summing the 0/1 indicator over the selected rows counts those transactions.
numCoffee = basketData.loc[coffeeOnlyMask, "Coffee"].sum()
numCake = basketData.loc[cakeOnlyMask, "Cake"].sum()
numCoffeeCake = basketData.loc[coffeeAndCakeMask, "Coffee"].sum()

# NOTE(review): the recorded output reports 4528 and 8560 for the "without"
# counts, which match the overall Coffee and Cake supports shown earlier; the
# saved output looks stale relative to this code. Re-run to confirm.
for label, count in (
    ("Times coffee is purchased with cake:", numCoffeeCake),
    ("Times coffee is purchased without cake:", numCoffee),
    ("Times cake is purchased without coffee:", numCake),
):
    print(label, count, "which makes up for a", 100 * count/len(basketData), "% of the total transactions")

Times coffee is purchased with cake: 4058 which makes up for a 42.576854474871475 % of the total transactions
Times coffee is purchased without cake: 4528 which makes up for a 47.50813136082258 % of the total transactions
Times cake is purchased without coffee: 8560 which makes up for a 89.81219179519462 % of the total transactions
