# Imports

In [1]:
import random
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Load Data

In [2]:
df = pd.read_csv("datasets/BreadBasket_DMS_DT.csv")

# Business Problem

Our idea: To offer discounts on common pairs or triples to make cross-selling

## Original dataset

We limit the transactions to those done on Saturdays (most common day)

In [3]:
df["DateTime"] = pd.to_datetime(df["DateTime"], utc=True)
df2 = df[df.DateTime.dt.day_name() == "Saturday"]

We obtain a list of transactions. We iterate over unique transaction ids on our dataframe, creating a list for each one of them and adding onto it the purchases with that transaction id.
Once the list is ready, we convert it onto a set to get rid of the repeated items and then cast it back to a list.

In [4]:
transactions=[]
for item in df['Transaction'].unique():
    itemList = list(set(df[df['Transaction']==item]['Item']))
    transactions.append(itemList)

We convert this list into a 0/1 array (we could also leave it as a True/False array as mlxtend_apriori works with both of them)

In [5]:
te = TransactionEncoder()
encodedData = te.fit(transactions).transform(transactions)
basketData = pd.DataFrame(encodedData, columns=te.columns_)
basketData.replace({False: 0, True: 1}, inplace=True)
basketData

Unnamed: 0,Adjustment,Afternoon with the baker,Alfajores,Argentina Night,Art Tray,Bacon,Baguette,Bakewell,Bare Popcorn,Basket,...,The BART,The Nomad,Tiffin,Toast,Truffles,Tshirt,Valentine's card,Vegan Feast,Vegan mincepie,Victorian Sponge
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9526,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9527,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
9528,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9529,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
freq_items = apriori(basketData, min_support = 0.01, use_colnames = True, verbose = 1)
freq_items = freq_items.sort_values("support", ascending = False)
freq_items

Processing 720 combinations | Sampling itemset size 3


Unnamed: 0,support,itemsets
6,0.475081,(Coffee)
2,0.324940,(Bread)
27,0.141643,(Tea)
4,0.103137,(Cake)
35,0.089393,"(Coffee, Bread)"
...,...,...
57,0.010807,"(Spanish Brunch, Coffee)"
33,0.010702,"(Bread, Brownie)"
11,0.010492,(Hearty & Seasonal)
21,0.010387,(Salad)


In [7]:
df_ar = association_rules(freq_items, metric = "confidence", min_threshold = 0)
df_ar[(df_ar.support > 0.15) & (df_ar.confidence > 0.5) & (df_ar.lift > 1)].sort_values("confidence", ascending = False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction


In [8]:
df_ar

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Coffee),(Bread),0.475081,0.324940,0.089393,0.188163,0.579069,-0.064980,0.831522
1,(Bread),(Coffee),0.324940,0.475081,0.089393,0.275105,0.579069,-0.064980,0.724131
2,(Cake),(Coffee),0.103137,0.475081,0.054349,0.526958,1.109196,0.005350,1.109667
3,(Coffee),(Cake),0.475081,0.103137,0.054349,0.114399,1.109196,0.005350,1.012717
4,(Tea),(Coffee),0.141643,0.475081,0.049523,0.349630,0.735936,-0.017769,0.807107
...,...,...,...,...,...,...,...,...,...
63,(Coffee),(Spanish Brunch),0.475081,0.018046,0.010807,0.022747,1.260494,0.002233,1.004810
64,(Bread),(Brownie),0.324940,0.039765,0.010702,0.032935,0.828244,-0.002219,0.992938
65,(Brownie),(Bread),0.039765,0.324940,0.010702,0.269129,0.828244,-0.002219,0.923639
66,(Bread),(Alfajores),0.324940,0.036093,0.010282,0.031644,0.876728,-0.001446,0.995405


Average product per transaction

In [9]:
lenSums = 0
for i in transactions:
    lenSums += len(i)
averageLen = lenSums/len(transactions)
averageLen

2.0606442136187177

## Modified dataset

We modify the dataset by randomly adding some items onto transactions to make our association rules stronger

In [10]:
for count,i in enumerate(transactions):
    rand = random.randint(0, 100)
    if count % 2 == 0:
        i.append("Tea")
        i = list(set(i))
    if rand < 90:
        i.append("Cake")
        i = list(set(i))
    if rand > 10:
        i.append("Muffin")
        i = list(set(i))

We can see the average product number per transaction has been raised by 1

In [11]:
lenSums = 0
for i in transactions:
    lenSums += len(i)
averageLen = lenSums/len(transactions)
print("Average number of products per transactions", "{:.2f}".format(averageLen))

Average number of products per transactions 3.06


In [12]:
te = TransactionEncoder()
encodedData = te.fit(transactions).transform(transactions)
basketData = pd.DataFrame(encodedData, columns=te.columns_)
basketData.replace({False: 0, True: 1}, inplace=True)
#basketData.to_csv("basketDataModified.csv", index = False)
basketData = pd.read_csv("basketDataModified.csv")
basketData

Unnamed: 0,Adjustment,Afternoon with the baker,Alfajores,Argentina Night,Art Tray,Bacon,Baguette,Bakewell,Bare Popcorn,Basket,...,The BART,The Nomad,Tiffin,Toast,Truffles,Tshirt,Valentine's card,Vegan Feast,Vegan mincepie,Victorian Sponge
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9526,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9527,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
9528,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9529,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
freq_items = apriori(basketData, min_support = 0.01, use_colnames = True, verbose = 1)
freq_items.sort_values("support", ascending = False)

Processing 440 combinations | Sampling itemset size 43


Unnamed: 0,support,itemsets
27,0.570769,(Tea)
4,0.504144,(Cake)
6,0.475081,(Coffee)
2,0.324940,(Bread)
76,0.262827,"(Tea, Coffee)"
...,...,...
104,0.010282,"(Sandwich, Tea, Bread)"
31,0.010282,"(Bread, Alfajores)"
95,0.010177,"(Cake, Coffee, Alfajores)"
106,0.010072,"(Tea, Coffee, Brownie)"


In [14]:
df_ar = association_rules(freq_items, metric = "confidence", min_threshold = 0)
df_ar[(df_ar.support > 0.15) & (df_ar.confidence > 0.5) & (df_ar.lift > 1)].sort_values("confidence", ascending = False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
37,(Coffee),(Cake),0.475081,0.504144,0.239849,0.504859,1.001417,0.000339,1.001443


In [15]:
numCoffee = basketData[(basketData.Coffee == 1) & (basketData.Cake == 0)].Coffee.sum()
numCake = basketData[(basketData.Cake == 1) & (basketData.Coffee == 0)].Cake.sum()
numCoffeeCake = basketData[(basketData.Coffee == 1) & (basketData.Cake == 1)].Coffee.sum()
print("Times coffee is purchased with cake:", numCoffeeCake, "which makes up for a", "{:.2f}".format(100 * numCoffeeCake/len(basketData)), "% of the total transactions")
print("We could have done cross-selling on ", numCoffee,"transactions, which represent a", "{:.2f}".format(100 * (numCake + numCoffee)/len(basketData)), "% of the total transactions")
print(1.3001417)

Times coffee is purchased with cake: 2286 which makes up for a 23.98 % of the total transactions
We could have done cross-selling on  2242 transactions, which represent a 49.95 % of the total transactions
1.3001417


We make up a nominal price for coffee and cake, and an array of discounts. We want to investigate:
1. How would creating an offer for Coffee + Cake affect income
2. How big is the discount we could offer
3. How much cross-selling we would have to do for this offer to be worth-it for the bakery

In [16]:
discounts = [0.2, 0.15, 0.1]
coffeePrice = 1.25
cakePrice = 4.5

In [17]:
x = np.arange(100)
z = np.ones(100)
fig = go.Figure(data=go.Scatter(x = x, y = coffeePrice*(numCoffee)*(1-x/100) + (coffeePrice+cakePrice)*(1-discounts[0])*(numCoffee)*x/100 + numCoffeeCake*(1-discounts[0])*(coffeePrice+cakePrice) ) )
fig.add_scatter(x = x, y = ((coffeePrice + cakePrice)*numCoffeeCake + coffeePrice*numCoffee)*z)
fig.add_scatter(x = 0.504144 * 100 * np.ones(100), y = coffeePrice*(numCoffee)*(1-x/100) + (coffeePrice+cakePrice)*(1-discounts[0])*(numCoffee)*x/100 + numCoffeeCake*(1-discounts[0])*(coffeePrice+cakePrice))
#fig.update_yaxes(rangemode="tozero")
fig['layout'].update(width=500, height=500, autosize=False)
fig.show()

In [18]:
print("Cada sábado (de media) se realizan ", "{:.0f}".format(numCoffeeCake/28), " transacciones de cafés + tarta y se venden ", "{:.0f}".format(numCoffee/28), " cafés por separado.")
print("Descuento de ", "{:.2f}".format((coffeePrice + cakePrice)*discounts[0]), "euros " )
print("Cada sábado (de media), suponiendo que la oferta atraiga a aproximadamente el 50% de clientes que consumen solo café, se obtendría una mejora en ingresos de", "{:.2f}".format( ( (coffeePrice*(numCoffee)*(1-0.504144) + (coffeePrice+cakePrice)*(1-discounts[0])*(numCoffee)*0.504144 + numCoffeeCake*(1-discounts[0])*(coffeePrice+cakePrice) - ((coffeePrice + cakePrice)*numCoffeeCake + coffeePrice*numCoffee)))/28) )

Cada sábado (de media) se realizan  82  transacciones de cafés + tarta y se venden  80  cafés por separado.
Descuento de  1.15 euros 
Cada sábado (de media), suponiendo que la oferta atraiga a aproximadamente el 50% de clientes que consumen solo café, se obtendría una mejora en ingresos de 41.34


In [19]:
x = np.arange(100)
z = np.ones(100)
fig = go.Figure(data=go.Scatter(x = x, y = coffeePrice*(numCoffee)*(1-x/100) + (coffeePrice+cakePrice)*(1-discounts[1])*(numCoffee)*x/100 + numCoffeeCake*(1-discounts[1])*(coffeePrice+cakePrice) ) )
fig.add_scatter(x = x, y = ((coffeePrice + cakePrice)*numCoffeeCake + coffeePrice*numCoffee)*z)
fig.add_scatter(x = 0.504144 * 100 * np.ones(100), y = coffeePrice*(numCoffee)*(1-x/100) + (coffeePrice+cakePrice)*(1-discounts[1])*(numCoffee)*x/100 + numCoffeeCake*(1-discounts[1])*(coffeePrice+cakePrice))
#fig.update_yaxes(rangemode="tozero")
fig['layout'].update(width=500, height=500, autosize=False)
fig.show()

In [20]:
print("Descuento de ", "{:.2f}".format((coffeePrice + cakePrice)*discounts[1]), "euros " )
print("Cada sábado (de media), suponiendo que la oferta atraiga a aproximadamente el 50% de clientes que consumen solo café, se obtendría una mejora en ingresos de", "{:.2f}".format( ( (coffeePrice*(numCoffee)*(1-0.504144) + (coffeePrice+cakePrice)*(1-discounts[1])*(numCoffee)*0.504144 + numCoffeeCake*(1-discounts[1])*(coffeePrice+cakePrice) - ((coffeePrice + cakePrice)*numCoffeeCake + coffeePrice*numCoffee)))/28) )

Descuento de  0.86 euros 
Cada sábado (de media), suponiendo que la oferta atraiga a aproximadamente el 50% de clientes que consumen solo café, se obtendría una mejora en ingresos de 76.42


In [21]:
x = np.arange(100)
z = np.ones(100)
fig = go.Figure(data=go.Scatter(x = x, y = coffeePrice*(numCoffee)*(1-x/100) + (coffeePrice+cakePrice)*(1-discounts[2])*(numCoffee)*x/100 + numCoffeeCake*(1-discounts[2])*(coffeePrice+cakePrice) ) )
fig.add_scatter(x = x, y = ((coffeePrice + cakePrice)*numCoffeeCake + coffeePrice*numCoffee)*z)
fig.add_scatter(x = 0.504144 * 100 * np.ones(100), y = coffeePrice*(numCoffee)*(1-x/100) + (coffeePrice+cakePrice)*(1-discounts[2])*(numCoffee)*x/100 + numCoffeeCake*(1-discounts[2])*(coffeePrice+cakePrice))
#fig.update_yaxes(rangemode="tozero")
fig['layout'].update(width=500, height=500, autosize=False)
fig.show()

In [22]:
print("Descuento de ", "{:.2f}".format((coffeePrice + cakePrice)*discounts[2]), "euros " )
print("Cada sábado (de media), suponiendo que la oferta atraiga a aproximadamente el 50% de clientes que consumen solo café, se obtendría una mejora en ingresos de", "{:.2f}".format( ( (coffeePrice*(numCoffee)*(1-0.504144) + (coffeePrice+cakePrice)*(1-discounts[2])*(numCoffee)*0.504144 + numCoffeeCake*(1-discounts[2])*(coffeePrice+cakePrice) - ((coffeePrice + cakePrice)*numCoffeeCake + coffeePrice*numCoffee)))/28) )

Descuento de  0.58 euros 
Cada sábado (de media), suponiendo que la oferta atraiga a aproximadamente el 50% de clientes que consumen solo café, se obtendría una mejora en ingresos de 111.50
