In [4]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Download the Online Retail workbook from the UCI Machine Learning Repository
# and read it into a DataFrame.
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "00352/Online%20Retail.xlsx"
)
online_retail_data = pd.read_excel(url)

In [5]:
# Data preprocessing, expressed as one chain:
#   1. trim surrounding whitespace from the product descriptions,
#   2. drop rows that have no invoice number,
#   3. cast the invoice number to text,
#   4. discard cancellations (invoice numbers containing 'C').
online_retail_data = (
    online_retail_data
    .assign(Description=lambda frame: frame['Description'].str.strip())
    .dropna(axis=0, subset=['InvoiceNo'])
    .assign(InvoiceNo=lambda frame: frame['InvoiceNo'].astype('str'))
    .loc[lambda frame: ~frame['InvoiceNo'].str.contains('C')]
)

In [6]:
# Preview 20 random rows; the fixed seed keeps the preview identical across
# "Restart Kernel -> Run All" executions.
online_retail_data.sample(20, random_state=42)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
271261,560686,21754,HOME BUILDING BLOCK WORD,3,2011-07-20 11:50:00,5.95,15632.0,United Kingdom
398011,571216,21232,STRAWBERRY CERAMIC TRINKET BOX,1,2011-10-14 13:06:00,1.25,16081.0,United Kingdom
498835,578541,22627,MINT KITCHEN SCALES,2,2011-11-24 15:01:00,8.5,14277.0,France
421667,573020,23235,STORAGE TIN VINTAGE LEAF,6,2011-10-27 12:37:00,2.89,14842.0,United Kingdom
165055,550765,23051,RECYCLED ACAPULCO MAT BLUE,2,2011-04-20 12:28:00,8.25,16103.0,United Kingdom
70559,542084,20727,LUNCH BAG BLACK SKULL.,50,2011-01-25 12:31:00,1.65,14680.0,United Kingdom
509884,579408,22349,DOG BOWL CHASING BALL DESIGN,4,2011-11-29 12:34:00,3.75,14701.0,United Kingdom
98230,544673,22908,PACK OF 20 NAPKINS RED APPLES,9,2011-02-22 15:46:00,0.85,13421.0,United Kingdom
59321,541282,23231,WRAP DOILEY DESIGN,25,2011-01-17 12:31:00,0.42,17690.0,United Kingdom
161078,550471,21930,JUMBO STORAGE BAG SKULLS,1,2011-04-18 13:52:00,4.13,,United Kingdom


In [7]:
# Pivot the United Kingdom transactions into a basket matrix:
# one row per invoice, one column per product description, each cell holding
# the summed quantity (0 where the product is absent from the invoice).
basket = (
    online_retail_data
    .loc[lambda frame: frame['Country'].eq("United Kingdom")]
    .groupby(['InvoiceNo', 'Description'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)
basket.head()

Description,*Boombox Ipod Classic,*USB Office Mirror Ball,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 DAISY PEGS IN WOOD BOX,12 EGG HOUSE PAINTED WOOD,12 HANGING EGGS HAND PAINTED,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,...,wrongly coded 20713,wrongly coded 23343,wrongly coded-23343,wrongly marked,wrongly marked 23343,wrongly marked carton 22804,wrongly marked. 23343 in box,wrongly sold (22719) barcode,wrongly sold as sets,wrongly sold sets
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Convert the quantities into 0/1 (0: not in the basket, 1: in the basket)
def encode_units(x):
    """Map a basket quantity to a 0/1 presence flag.

    Parameters
    ----------
    x : number
        Summed quantity of one product on one invoice.

    Returns
    -------
    int
        1 when ``x`` is at least 1, otherwise 0.
    """
    # A single total expression: the former two-branch version fell through
    # and returned None for values strictly between 0 and 1 (and for NaN),
    # which would leak non-binary cells into the encoded matrix.
    return 1 if x >= 1 else 0

# Vectorised presence test: True where at least one unit was bought.
# This replaces the deprecated, per-cell DataFrame.applymap call with a single
# comparison, and yields the boolean frame that mlxtend's apriori accepts
# (and recommends over 0/1 integers).
basket_sets = basket >= 1

In [9]:
# Mine frequent itemsets with Apriori (minimum support 0.03), labelling the
# itemsets with product names rather than column positions.
frequent_itemsets = apriori(
    basket_sets,
    use_colnames=True,
    min_support=0.03,
)

# Derive association rules from those itemsets, thresholding on lift >= 1.
rules = association_rules(
    frequent_itemsets,
    min_threshold=1,
    metric="lift",
)



In [10]:
# Show the (rows, columns) dimensions of the basket matrix.
basket.shape

(18667, 4175)

In [11]:
# Preview the first five rows of the basket matrix.
basket.head()

Description,*Boombox Ipod Classic,*USB Office Mirror Ball,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 DAISY PEGS IN WOOD BOX,12 EGG HOUSE PAINTED WOOD,12 HANGING EGGS HAND PAINTED,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,...,wrongly coded 20713,wrongly coded 23343,wrongly coded-23343,wrongly marked,wrongly marked 23343,wrongly marked carton 22804,wrongly marked. 23343 in box,wrongly sold (22719) barcode,wrongly sold as sets,wrongly sold sets
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Display the generated association rules together with their metrics.
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED),0.046928,0.049821,0.03016,0.642694,12.900183,1.0,0.027822,2.659288,0.967903,0.452936,0.62396,0.624035
1,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE GREEN),0.049821,0.046928,0.03016,0.605376,12.900183,1.0,0.027822,2.415142,0.97085,0.452936,0.585946,0.624035
2,(GREEN REGENCY TEACUP AND SAUCER),(PINK REGENCY TEACUP AND SAUCER),0.050035,0.03766,0.03091,0.617773,16.403939,1.0,0.029026,2.517719,0.988498,0.54434,0.602815,0.719271
3,(PINK REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.03766,0.050035,0.03091,0.820768,16.403939,1.0,0.029026,5.300203,0.975787,0.54434,0.811328,0.719271
4,(ROSES REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.051267,0.050035,0.037553,0.732497,14.639752,1.0,0.034988,3.551237,0.982039,0.589076,0.718408,0.741516
5,(GREEN REGENCY TEACUP AND SAUCER),(ROSES REGENCY TEACUP AND SAUCER),0.050035,0.051267,0.037553,0.750535,14.639752,1.0,0.034988,3.803076,0.980765,0.589076,0.737055,0.741516
6,(JUMBO BAG BAROQUE BLACK WHITE),(JUMBO BAG RED RETROSPOT),0.048749,0.10382,0.030535,0.626374,6.03329,1.0,0.025474,2.398601,0.877006,0.250219,0.58309,0.460246
7,(JUMBO BAG RED RETROSPOT),(JUMBO BAG BAROQUE BLACK WHITE),0.10382,0.048749,0.030535,0.294118,6.03329,1.0,0.025474,1.347605,0.930898,0.250219,0.257943,0.460246
8,(JUMBO BAG PINK POLKADOT),(JUMBO BAG RED RETROSPOT),0.062088,0.10382,0.042053,0.677308,6.523895,1.0,0.035607,2.777201,0.902769,0.339533,0.639925,0.541182
9,(JUMBO BAG RED RETROSPOT),(JUMBO BAG PINK POLKADOT),0.10382,0.062088,0.042053,0.405057,6.523895,1.0,0.035607,1.576473,0.944807,0.339533,0.365673,0.541182


In [13]:
# Retain only rules with lift of at least 3.0 and confidence of at least 0.5.
filtered_rules = rules.loc[rules['lift'].ge(3.0) & rules['confidence'].ge(0.5)]

In [14]:
# Show the key columns of the filtered association rules.
# Ending the cell with the expression (rather than print) lets Jupyter render
# the frame as a table instead of plain text wrapped across column chunks.
filtered_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

                            antecedents                        consequents   
0          (ALARM CLOCK BAKELIKE GREEN)         (ALARM CLOCK BAKELIKE RED)  \
1            (ALARM CLOCK BAKELIKE RED)       (ALARM CLOCK BAKELIKE GREEN)   
2     (GREEN REGENCY TEACUP AND SAUCER)   (PINK REGENCY TEACUP AND SAUCER)   
3      (PINK REGENCY TEACUP AND SAUCER)  (GREEN REGENCY TEACUP AND SAUCER)   
4     (ROSES REGENCY TEACUP AND SAUCER)  (GREEN REGENCY TEACUP AND SAUCER)   
5     (GREEN REGENCY TEACUP AND SAUCER)  (ROSES REGENCY TEACUP AND SAUCER)   
6      (JUMBO  BAG BAROQUE BLACK WHITE)          (JUMBO BAG RED RETROSPOT)   
8             (JUMBO BAG PINK POLKADOT)          (JUMBO BAG RED RETROSPOT)   
10  (JUMBO SHOPPER VINTAGE RED PAISLEY)          (JUMBO BAG RED RETROSPOT)   
12             (JUMBO STORAGE BAG SUKI)          (JUMBO BAG RED RETROSPOT)   

     support  confidence       lift  
0   0.030160    0.642694  12.900183  
1   0.030160    0.605376  12.900183  
2   0.030910    0.617773  1

In [16]:
# Re-filter the rules with thresholds of lift >= 2.0 and confidence >= 0.4.
filtered_rules = rules[(rules['lift'] >= 2.0) & (rules['confidence'] >= 0.4)]

# End the cell with the expression (rather than print) so Jupyter renders the
# frame as a table instead of plain text wrapped across column chunks.
filtered_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

                            antecedents                        consequents   
0          (ALARM CLOCK BAKELIKE GREEN)         (ALARM CLOCK BAKELIKE RED)  \
1            (ALARM CLOCK BAKELIKE RED)       (ALARM CLOCK BAKELIKE GREEN)   
2     (GREEN REGENCY TEACUP AND SAUCER)   (PINK REGENCY TEACUP AND SAUCER)   
3      (PINK REGENCY TEACUP AND SAUCER)  (GREEN REGENCY TEACUP AND SAUCER)   
4     (ROSES REGENCY TEACUP AND SAUCER)  (GREEN REGENCY TEACUP AND SAUCER)   
5     (GREEN REGENCY TEACUP AND SAUCER)  (ROSES REGENCY TEACUP AND SAUCER)   
6      (JUMBO  BAG BAROQUE BLACK WHITE)          (JUMBO BAG RED RETROSPOT)   
8             (JUMBO BAG PINK POLKADOT)          (JUMBO BAG RED RETROSPOT)   
9             (JUMBO BAG RED RETROSPOT)          (JUMBO BAG PINK POLKADOT)   
10  (JUMBO SHOPPER VINTAGE RED PAISLEY)          (JUMBO BAG RED RETROSPOT)   
12             (JUMBO STORAGE BAG SUKI)          (JUMBO BAG RED RETROSPOT)   
14            (LUNCH BAG RED RETROSPOT)          (LUNCH BAG  BLA