In [1]:
import pandas as pd

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
df = pd.read_excel('Online Retail.xlsx')
df.dropna(axis=0, subset=['InvoiceNo'], inplace=True)
df['InvoiceNo'] = df['InvoiceNo'].astype('str')

In [3]:
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


In [6]:
df[df.InvoiceNo.str.contains('C', na=False)].head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
141,C536379,D,Discount,-1,2010-12-01 09:41:00,27.5,14527.0,United Kingdom
154,C536383,35004C,SET OF 3 COLOURED FLYING DUCKS,-1,2010-12-01 09:49:00,4.65,15311.0,United Kingdom
235,C536391,22556,PLASTERS IN TIN CIRCUS PARADE,-12,2010-12-01 10:24:00,1.65,17548.0,United Kingdom
236,C536391,21984,PACK OF 12 PINK PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom
237,C536391,21983,PACK OF 12 BLUE PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom


In [7]:
# Remove all "credit" invoices
df = df[~df['InvoiceNo'].str.contains('C')]

In [8]:
# Group by the columns we want to consider. Only look at UK for this example.
market_basket = df[df['Country'] =="United Kingdom"].groupby(['InvoiceNo', 'Description'])['Quantity']

In [9]:
# Hot encode the data and get 1 transaction per row to prepare to run our mlxtend analysis.
market_basket = market_basket.sum().unstack().reset_index().fillna(0).set_index('InvoiceNo')

In [10]:
market_basket.head()

Description,20713,4 PURPLE FLOCK DINNER CANDLES,50'S CHRISTMAS GIFT BAG LARGE,DOLLY GIRL BEAKER,I LOVE LONDON MINI BACKPACK,NINE DRAWER OFFICE TIDY,OVAL WALL MIRROR DIAMANTE,RED SPOT GIFT BAG LARGE,SET 2 TEA TOWELS I LOVE LONDON,SPACEBOY BABY GIFT SET,...,wrongly coded 20713,wrongly coded 23343,wrongly coded-23343,wrongly marked,wrongly marked 23343,wrongly marked carton 22804,wrongly marked. 23343 in box,wrongly sold (22719) barcode,wrongly sold as sets,wrongly sold sets
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Convert numbers to 0s or 1s
def encode_data(datapoint):
    if datapoint <= 0:
        return 0
    if datapoint >= 1:
        return 1

In [12]:
market_basket = market_basket.applymap(encode_data)

In [13]:
# Minimum support = 3%
itemsets = apriori(market_basket, min_support=0.03, use_colnames=True)

The final step is to build your association rules using the mxltend `association_rules` function. You can set the metric that you are most interested in (either `lift` or `confidence` and set the minimum threshold for the condfidence level (called `min_threshold`). The `min_threshold` can be thought of as the level of confidence percentage that you want to return. For example, if you set `min_threshold` to 1, you will only see rules with 100% confidence. I usually set this to 0.7 to start with.

In [14]:
rules = association_rules(itemsets, metric="lift", min_threshold=0.5)

In [15]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED ),0.046925,0.049818,0.030159,0.642694,12.900874,0.027821,2.659296
1,(ALARM CLOCK BAKELIKE RED ),(ALARM CLOCK BAKELIKE GREEN),0.049818,0.046925,0.030159,0.605376,12.900874,0.027821,2.415149
2,(GREEN REGENCY TEACUP AND SAUCER),(PINK REGENCY TEACUP AND SAUCER),0.050032,0.037658,0.030909,0.617773,16.404818,0.029024,2.517724
3,(PINK REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.037658,0.050032,0.030909,0.820768,16.404818,0.029024,5.300218
4,(ROSES REGENCY TEACUP AND SAUCER ),(GREEN REGENCY TEACUP AND SAUCER),0.051264,0.050032,0.037551,0.732497,14.640537,0.034986,3.551247
5,(GREEN REGENCY TEACUP AND SAUCER),(ROSES REGENCY TEACUP AND SAUCER ),0.050032,0.051264,0.037551,0.750535,14.640537,0.034986,3.803087
6,(JUMBO BAG BAROQUE BLACK WHITE),(JUMBO BAG RED RETROSPOT),0.048747,0.103814,0.030534,0.626374,6.033613,0.025473,2.398615
7,(JUMBO BAG RED RETROSPOT),(JUMBO BAG BAROQUE BLACK WHITE),0.103814,0.048747,0.030534,0.294118,6.033613,0.025473,1.347609
8,(JUMBO BAG PINK POLKADOT),(JUMBO BAG RED RETROSPOT),0.062085,0.103814,0.042051,0.677308,6.524245,0.035605,2.777218
9,(JUMBO BAG RED RETROSPOT),(JUMBO BAG PINK POLKADOT),0.103814,0.062085,0.042051,0.405057,6.524245,0.035605,1.576478


# Alternative method

The line of code will not work on large datasets, so we will go through an alternative method.

In [17]:
# market_basket = market_basket.sum().unstack().reset_index().fillna(0).set_index('InvoiceNo')

In [18]:
from itertools import combinations, groupby
from collections import Counter

In [19]:
df_manual = df[df['Country'] =="United Kingdom"]

In [20]:
orders = df_manual.set_index('InvoiceNo')['StockCode']

In [22]:
orders

InvoiceNo
536365    85123A
536365     71053
536365    84406B
536365    84029G
536365    84029E
           ...  
581585     22466
581586     22061
581586     23275
581586     21217
581586     20685
Name: StockCode, Length: 487622, dtype: object

In [21]:
# Calculate the item frequency and support values.
statistics = orders.value_counts().to_frame("frequency")
statistics['support']  = statistics / len(set(orders.index)) * 100

In [23]:
# Filter out any rows of data that doesn’t have support above our min_support level
min_support=0.03 # same value we used above.

items_above_support = statistics[statistics['support'] >= min_support].index
orders_above_support = orders[orders.isin(items_above_support)]

In [24]:
# Filter out orders that only had 1 items ordered on the invoice, since those items won’t provide any insight into our market basket analysis.
order_counts = orders.index.value_counts()
orders_over_two_index = order_counts[order_counts>=2].index
orders_over_two = orders[orders.index.isin(orders_over_two_index)]

In [25]:
# Calculate our stats dataframe again with this new order data-set.
statistics = orders_over_two.value_counts().to_frame("frequency")
statistics['support']  = statistics / len(set(orders_over_two.index)) * 100

Calculating the itemsets / item pairs: we’ll create a function that will generate our itemsets and then send our new order dataset through the generator. Then, we calculate the frequency of each item with each other (named `frequencyAC`) as well as the support (named `supportAC`). Finally, we filter out the itemsets that are below our `min_support` level

In [26]:
def itemset_generator(orders):
    orders = orders.reset_index().values
    for order_id, order_object in groupby(orders, lambda x: x[0]):
        item_list = [item[1] for item in order_object]
        for item_pair in combinations(item_list, 2):
            yield item_pair

itemsets_gen = itemset_generator(orders_over_two)
itemsets  = pd.Series(Counter(itemsets_gen)).to_frame("frequencyAC")
itemsets['supportAC'] = itemsets['frequencyAC'] / len(orders_over_two_index) * 100
itemsets = itemsets[itemsets['supportAC'] >= min_support]

In [27]:
# Create table of association rules and compute relevant metrics
itemsets = itemsets.reset_index().rename(columns={'level_0': 'antecedents', 'level_1': 'consequents'})

itemsets = (itemsets
     .merge(statistics.rename(columns={'freq': 'freqA', 'support': 'antecedent support'}), left_on='antecedents', right_index=True)
     .merge(statistics.rename(columns={'freq': 'freqC', 'support': 'consequents support'}), left_on='consequents', right_index=True))


itemsets['confidenceAtoC'] = itemsets['supportAC'] / itemsets['antecedent support']
itemsets['confidenceCtoA'] = itemsets['supportAC'] / itemsets['consequents support']
itemsets['lift'] = itemsets['supportAC'] / (itemsets['antecedent support'] * itemsets['consequents support'])

itemsets=itemsets[['antecedents', 'consequents','antecedent support', 'consequents support', 'confidenceAtoC','lift']]

In [28]:
# Finally, let’s look at our final rules. We want to look at only those items that have confidence > 0.5.
rules = itemsets
rules_over_50 = rules[(rules.confidenceAtoC >0.50)]
rules_over_50.set_index('antecedents',inplace=True)
rules_over_50.reset_index(inplace=True)
rules_over_50=rules_over_50.sort_values('lift', ascending=False)

In [29]:
rules_over_50

Unnamed: 0,antecedents,consequents,antecedent support,consequents support,confidenceAtoC,lift
1841,48173c,85099f,0.042296,0.060423,0.714286,11.821429
1881,90082D,90082B,0.036254,0.072508,0.833333,11.493056
1888,20698,20697,0.054381,0.078550,0.888889,11.316239
1281,90129D,90129E,0.042296,0.084592,0.857143,10.132653
1880,37444C,72051S,0.054381,0.084592,0.666667,7.880952
...,...,...,...,...,...,...
1152,35915C,85123A,0.356495,13.021148,0.525424,0.040352
1135,85039A,85123A,0.966767,13.021148,0.518750,0.039839
1145,85093,85123A,0.513595,13.021148,0.517647,0.039754
1151,84638,85123A,0.199396,13.021148,0.515152,0.039563
