Use the data from the CSV file adventureworks.csv. Determine which items are often bought together when:

* “together” means “same order”
* “together” means “same customer”

Play around with the values for support and lift to limit the number of rules. 

In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Load the AdventureWorks order lines straight from the course repository.
# The file is semicolon-separated; pandas is already imported as `pd` in the first cell.
url = 'https://raw.githubusercontent.com/HOGENT-ML/course/main/datasets/adventureworks.csv'
df = pd.read_csv(url, sep=';')
df.shape

(32166, 4)

In [3]:
# Dimensions of the loaded data as (rows, columns).
df.shape

(32166, 4)

In [4]:
# Column names, non-null counts and dtypes of the loaded data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32166 entries, 0 to 32165
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   OrderNumber  32166 non-null  object
 1   CustomerKey  32166 non-null  int64 
 2   LineNumber   32166 non-null  int64 
 3   Model        32166 non-null  object
dtypes: int64(2), object(2)
memory usage: 1005.3+ KB


# 1. Same order

In [5]:
# We need to consolidate the items into one transaction per row, with each product one-hot encoded.

In [6]:
# One row per order and one column per product model; every cell counts the
# order lines of that model within the order (0 when the model is absent).
# An alternative formulation:
#   df.groupby(['OrderNumber', 'Model'])['LineNumber'].count().unstack().fillna(0)
basket = df.pivot_table(
    index='OrderNumber',
    columns='Model',
    values='LineNumber',
    aggfunc='count',
    fill_value=0,
)
basket

Model,All-Purpose Bike Stand,Bike Wash,Classic Vest,Cycling Cap,Fender Set - Mountain,HL Mountain Tire,HL Road Tire,Half-Finger Gloves,Hitch Rack - 4-Bike,Hydration Pack,...,Road-750,Short-Sleeve Classic Jersey,Sport-100,Touring Tire,Touring Tire Tube,Touring-1000,Touring-2000,Touring-3000,Water Bottle,Women's Mountain Shorts
OrderNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SO61313,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
SO61314,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
SO61315,0,0,0,1,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
SO61316,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SO61317,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SO75119,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SO75120,0,0,0,1,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
SO75121,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
SO75122,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Orders that contain at least one 'Classic Vest'.
has_classic_vest = basket['Classic Vest'].gt(0)
basket.loc[has_classic_vest]

Model,All-Purpose Bike Stand,Bike Wash,Classic Vest,Cycling Cap,Fender Set - Mountain,HL Mountain Tire,HL Road Tire,Half-Finger Gloves,Hitch Rack - 4-Bike,Hydration Pack,...,Road-750,Short-Sleeve Classic Jersey,Sport-100,Touring Tire,Touring Tire Tube,Touring-1000,Touring-2000,Touring-3000,Water Bottle,Women's Mountain Shorts
OrderNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SO61339,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SO61353,0,0,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
SO61358,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SO61456,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
SO61459,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SO75079,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
SO75080,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
SO75100,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
SO75101,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [8]:
# The table is mostly zeros, but it holds counts rather than flags. Every
# positive value has to become 1 and anything that is zero or less has to
# become 0; this step completes the one-hot encoding of the data.
def encode_units(x):
    """Turn a count into a 0/1 presence flag.

    Parameters
    ----------
    x : number
        A single cell value of the basket table.

    Returns
    -------
    int
        1 when ``x`` is positive, 0 otherwise. The result is always an int:
        fractional values between 0 and 1 count as present and NaN counts as
        absent, so no cell can end up as ``None``.
    """
    # A single comparison covers every input; NaN compares False and maps to 0.
    return 1 if x > 0 else 0

# Apply encode_units to every cell, one column at a time. Series.map is used
# because DataFrame.applymap is deprecated since pandas 2.1 (it was renamed to
# DataFrame.map), while Series.map is available in old and new versions alike.
basket_sets = basket.apply(lambda column: column.map(encode_units))

In [9]:
# With the data in the right shape, mine the frequent itemsets: keep every
# combination of models that occurs in at least 3% of the transactions
# (a threshold picked so that enough useful examples remain).
frequent_itemsets = apriori(
    basket_sets,
    min_support=0.03,
    use_colnames=True,
)



In [10]:
# Last step: derive the rules together with their support, confidence and lift.
# association_rules() takes (1) the metric of interest and (2) its threshold;
# here only rules with a lift of at least 2 are kept.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=2,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Fender Set - Mountain),(Mountain-200),0.094879,0.113025,0.0336,0.354133,3.133233,0.022876,1.373309,0.75221
1,(Mountain-200),(Fender Set - Mountain),0.113025,0.094879,0.0336,0.297279,3.133233,0.022876,1.288023,0.767598
2,(HL Mountain Tire),(Mountain Tire Tube),0.062663,0.136552,0.042365,0.676074,4.951021,0.033808,2.665568,0.851371
3,(Mountain Tire Tube),(HL Mountain Tire),0.136552,0.062663,0.042365,0.310248,4.951021,0.033808,1.358947,0.924227
4,(ML Mountain Tire),(Mountain Tire Tube),0.050592,0.136552,0.033215,0.656535,4.807936,0.026307,2.513932,0.834215
5,(Mountain Tire Tube),(ML Mountain Tire),0.136552,0.050592,0.033215,0.243243,4.807936,0.026307,1.254575,0.917265
6,(Mountain Bottle Cage),(Mountain-200),0.091958,0.113025,0.032985,0.358696,3.173602,0.022591,1.38308,0.754261
7,(Mountain-200),(Mountain Bottle Cage),0.113025,0.091958,0.032985,0.291837,3.173602,0.022591,1.28225,0.772176
8,(Water Bottle),(Mountain Bottle Cage),0.191373,0.091958,0.076349,0.398955,4.338473,0.058751,1.510774,0.951618
9,(Mountain Bottle Cage),(Water Bottle),0.091958,0.191373,0.076349,0.830268,4.338473,0.058751,4.764126,0.847432


In [11]:
# The rules are an ordinary DataFrame, so plain pandas filtering works.
# Keep only the rules with a large lift (>= 6) and a high confidence (>= 0.8).
large_lift = rules['lift'].ge(6)
high_confidence = rules['confidence'].ge(0.8)
rules.loc[large_lift & high_confidence]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
13,(Touring Tire),(Touring Tire Tube),0.044672,0.068814,0.038905,0.870912,12.65596,0.035831,7.213584,0.964052


# 2. Same customer

In [12]:
# One row per customer and one column per product model; every cell counts the
# order lines of that model across all orders of the customer.
# An alternative formulation:
#   df.groupby(['CustomerKey', 'Model'])['LineNumber'].count().unstack().fillna(0)
basket = df.pivot_table(
    index='CustomerKey',
    columns='Model',
    values='LineNumber',
    aggfunc='count',
    fill_value=0,
)

In [13]:
# Customers who bought at least one 'Classic Vest'.
bought_classic_vest = basket['Classic Vest'].gt(0)
basket.loc[bought_classic_vest]

Model,All-Purpose Bike Stand,Bike Wash,Classic Vest,Cycling Cap,Fender Set - Mountain,HL Mountain Tire,HL Road Tire,Half-Finger Gloves,Hitch Rack - 4-Bike,Hydration Pack,...,Road-750,Short-Sleeve Classic Jersey,Sport-100,Touring Tire,Touring Tire Tube,Touring-1000,Touring-2000,Touring-3000,Water Bottle,Women's Mountain Shorts
CustomerKey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11115,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
11116,0,1,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
11142,0,0,1,0,0,1,0,2,1,0,...,0,1,5,0,2,0,0,0,0,1
11164,0,0,1,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
11173,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29213,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
29216,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29238,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
29462,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,1,0


In [14]:
# The table is mostly zeros, but a customer can buy the same model several
# times, so it holds counts rather than flags. Every positive value has to
# become 1 and anything that is zero or less has to become 0; this step
# completes the one-hot encoding of the data.
def encode_units(x):
    """Turn a count into a 0/1 presence flag.

    Parameters
    ----------
    x : number
        A single cell value of the basket table.

    Returns
    -------
    int
        1 when ``x`` is positive, 0 otherwise. The result is always an int:
        fractional values between 0 and 1 count as present and NaN counts as
        absent, so no cell can end up as ``None``.
    """
    # A single comparison covers every input; NaN compares False and maps to 0.
    return 1 if x > 0 else 0

# Apply encode_units to every cell, one column at a time. Series.map is used
# because DataFrame.applymap is deprecated since pandas 2.1 (it was renamed to
# DataFrame.map), while Series.map is available in old and new versions alike.
basket_sets = basket.apply(lambda column: column.map(encode_units))
basket_sets

Model,All-Purpose Bike Stand,Bike Wash,Classic Vest,Cycling Cap,Fender Set - Mountain,HL Mountain Tire,HL Road Tire,Half-Finger Gloves,Hitch Rack - 4-Bike,Hydration Pack,...,Road-750,Short-Sleeve Classic Jersey,Sport-100,Touring Tire,Touring Tire Tube,Touring-1000,Touring-2000,Touring-3000,Water Bottle,Women's Mountain Shorts
CustomerKey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11001,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,1,0
11012,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11013,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
11017,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
11018,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29465,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,1,0
29470,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
29471,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29472,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# With the per-customer data in the right shape, mine the frequent itemsets:
# keep every combination of models bought by at least 3% of the customers
# (a threshold picked so that enough useful examples remain).
frequent_itemsets = apriori(
    basket_sets,
    min_support=0.03,
    use_colnames=True,
)



In [16]:
# Last step: derive the rules together with their support, confidence and lift.
# association_rules() takes (1) the metric of interest and (2) its threshold;
# here only rules with a lift of at least 2 are kept.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=2,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Fender Set - Mountain),(Mountain-200),0.108623,0.129607,0.038529,0.354708,2.7368,0.024451,1.348836,0.711943
1,(Mountain-200),(Fender Set - Mountain),0.129607,0.108623,0.038529,0.297279,2.7368,0.024451,1.268465,0.729107
2,(HL Mountain Tire),(Mountain Tire Tube),0.071857,0.152266,0.050079,0.696933,4.577075,0.039138,2.797179,0.842025
3,(Mountain Tire Tube),(HL Mountain Tire),0.152266,0.071857,0.050079,0.328894,4.577075,0.039138,1.383005,0.921893
4,(ML Mountain Tire),(Mountain Tire Tube),0.058014,0.152266,0.039235,0.676292,4.441518,0.030401,2.618822,0.822573
5,(Mountain Tire Tube),(ML Mountain Tire),0.152266,0.058014,0.039235,0.257672,4.441518,0.030401,1.268962,0.914027
6,(ML Road Tire),(Road Tire Tube),0.046024,0.114177,0.032181,0.699234,6.124099,0.026926,2.945219,0.877077
7,(Road Tire Tube),(ML Road Tire),0.114177,0.046024,0.032181,0.281853,6.124099,0.026926,1.328386,0.944558
8,(Mountain Bottle Cage),(Mountain-200),0.104655,0.129607,0.037824,0.361415,2.788553,0.02426,1.363004,0.716362
9,(Mountain-200),(Mountain Bottle Cage),0.129607,0.104655,0.037824,0.291837,2.788553,0.02426,1.26432,0.736898


In [17]:
# The rules are an ordinary DataFrame, so plain pandas filtering works.
# Keep only the rules with a large lift (>= 6) and a high confidence (>= 0.8).
large_lift = rules['lift'].ge(6)
high_confidence = rules['confidence'].ge(0.8)
rules.loc[large_lift & high_confidence]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
15,(Touring Tire),(Touring Tire Tube),0.050785,0.076882,0.044348,0.873264,11.358439,0.040444,7.283777,0.960751
17,"(Water Bottle, Mountain-200)",(Mountain Bottle Cage),0.035003,0.104655,0.030771,0.879093,8.399895,0.027107,7.405247,0.912905
