# Association Rules - Class Exercise 1

## Metadata (Data Dictionary)

| No.| Variable | Data Type | Description |
|----|----------|-----------|-------------|
| 1  | Member_number | int | ID of the member (customer) who made the purchase |
| 2  | Date | string | Date of the purchase (day/month/year) |
| 3  | itemDescription | string | Name of the item purchased |


## Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules

## Import Data

In [2]:
# Load the grocery purchase records (columns: Member_number, Date, itemDescription)
df = pd.read_csv('Groceries.csv')
df

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21/7/2015,tropical fruit
1,2552,5/1/2015,whole milk
2,2300,19/9/2015,pip fruit
3,1187,12/12/2015,other vegetables
4,3037,1/2/2015,whole milk
...,...,...,...
38760,4471,8/10/2014,sliced cheese
38761,2022,23/2/2014,candy
38762,1097,16/4/2014,cake bar
38763,1510,3/12/2014,fruit/vegetable juice


In [3]:
# "Date" plays no part in this analysis, so remove that column.

df = df.drop(columns=['Date'])

In [4]:
# Keep one row per distinct (Member_number, itemDescription) pair.
# Only whether a member ever bought an item matters here, not how many times.

df = df[~df.duplicated()]
df

Unnamed: 0,Member_number,itemDescription
0,1808,tropical fruit
1,2552,whole milk
2,2300,pip fruit
3,1187,other vegetables
4,3037,whole milk
...,...,...
38760,4471,sliced cheese
38761,2022,candy
38762,1097,cake bar
38763,1510,fruit/vegetable juice


In [5]:
# Build the alphabetically ordered list of distinct item names

item_df = (
    df.loc[:, ['itemDescription']]
    .sort_values(by='itemDescription')
    .drop_duplicates()
)
item_df

Unnamed: 0,itemDescription
23892,Instant food products
37755,UHT-milk
32081,abrasive cleaner
30067,artif. sweetener
18261,baby cosmetics
...,...
34028,white bread
19152,white wine
17356,whole milk
10885,yogurt


In [6]:
# Two index resets in a row:
# the first throws the old row labels away and renumbers the rows from 0,
# the second keeps that fresh numbering as a regular "index" column

item_df = item_df.reset_index(drop=True)
item_df = item_df.reset_index()
item_df

Unnamed: 0,index,itemDescription
0,0,Instant food products
1,1,UHT-milk
2,2,abrasive cleaner
3,3,artif. sweetener
4,4,baby cosmetics
...,...,...
162,162,white bread
163,163,white wine
164,164,whole milk
165,165,yogurt


In [7]:
# Give the "index" column its real name: "itemID"

item_df = item_df.rename({'index': 'itemID'}, axis='columns')
item_df

Unnamed: 0,itemID,itemDescription
0,0,Instant food products
1,1,UHT-milk
2,2,abrasive cleaner
3,3,artif. sweetener
4,4,baby cosmetics
...,...,...
162,162,white bread
163,163,white wine
164,164,whole milk
165,165,yogurt


In [8]:
# Attach the "itemID" of each item to the main DataFrame by matching on "itemDescription"

df = df.merge(item_df, on='itemDescription')
df

Unnamed: 0,Member_number,itemDescription,itemID
0,1808,tropical fruit,156
1,2552,whole milk,164
2,2300,pip fruit,109
3,1187,other vegetables,102
4,3037,whole milk,164
...,...,...,...
34761,4471,sliced cheese,135
34762,2022,candy,19
34763,1097,cake bar,17
34764,1510,fruit/vegetable juice,64


In [9]:
# Keep only the two ID columns ("itemDescription" is no longer needed),
# then order the rows by member and item and renumber the index

df = df[['Member_number', 'itemID']]
df = df.sort_values(by=['Member_number', 'itemID'])
df = df.reset_index(drop=True)
df

Unnamed: 0,Member_number,itemID
0,1000,20
1,1000,73
2,1000,92
3,1000,105
4,1000,108
...,...,...
34761,5000,99
34762,5000,102
34763,5000,123
34764,5000,132


In [10]:
# Build a member-by-item matrix that marks whether each member has bought each item
# Duplicate purchase records were dropped earlier,
# so every cell can only hold 0 or 1

customer_item_matrix = (
    df.groupby(['Member_number', 'itemID'])
    .size()
    .unstack(level='itemID')
    .fillna(value=0)
)
customer_item_matrix

itemID,0,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
Member_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [11]:
# Cast to "int" purely for readability: the cells then display as 0/1 instead of 0.0/1.0

customer_item_matrix = customer_item_matrix.astype(int)
customer_item_matrix

itemID,0,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
Member_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1001,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,1,0,1,0,0
1002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
4998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4999,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,1,0


In [12]:
# Choose the minimum support (a fraction of all members)
# and turn it into the number of members a combination has to exceed

min_support = 0.025
min_count = len(df['Member_number'].unique()) * min_support
min_count

97.45

## Start association rules

In [13]:
from itertools import combinations

In [14]:
# The level is the number of items in each combination being counted; begin with single items
level = 1

In [15]:
# Iterating over a DataFrame yields its column labels, so this collects every itemID in the matrix
item_list = list(customer_item_matrix)

In [16]:
# Build every candidate combination for level 1
# A level-1 combination holds exactly one item,
# so there are as many combinations as there are items

l1_combinations = list(combinations(item_list, level))
l1_combinations

[(0,),
 (1,),
 (2,),
 (3,),
 (4,),
 (5,),
 (6,),
 (7,),
 (8,),
 (9,),
 (10,),
 (11,),
 (12,),
 (13,),
 (14,),
 (15,),
 (16,),
 (17,),
 (18,),
 (19,),
 (20,),
 (21,),
 (22,),
 (23,),
 (24,),
 (25,),
 (26,),
 (27,),
 (28,),
 (29,),
 (30,),
 (31,),
 (32,),
 (33,),
 (34,),
 (35,),
 (36,),
 (37,),
 (38,),
 (39,),
 (40,),
 (41,),
 (42,),
 (43,),
 (44,),
 (45,),
 (46,),
 (47,),
 (48,),
 (49,),
 (50,),
 (51,),
 (52,),
 (53,),
 (54,),
 (55,),
 (56,),
 (57,),
 (58,),
 (59,),
 (60,),
 (61,),
 (62,),
 (63,),
 (64,),
 (65,),
 (66,),
 (67,),
 (68,),
 (69,),
 (70,),
 (71,),
 (72,),
 (73,),
 (74,),
 (75,),
 (76,),
 (77,),
 (78,),
 (79,),
 (80,),
 (81,),
 (82,),
 (83,),
 (84,),
 (85,),
 (86,),
 (87,),
 (88,),
 (89,),
 (90,),
 (91,),
 (92,),
 (93,),
 (94,),
 (95,),
 (96,),
 (97,),
 (98,),
 (99,),
 (100,),
 (101,),
 (102,),
 (103,),
 (104,),
 (105,),
 (106,),
 (107,),
 (108,),
 (109,),
 (110,),
 (111,),
 (112,),
 (113,),
 (114,),
 (115,),
 (116,),
 (117,),
 (118,),
 (119,),
 (120,),
 (121,),
 (122,),
 (1

In [17]:
# Turn every tuple into a list so it can be used directly to select matrix columns

l1_combinations = list(map(list, l1_combinations))
l1_combinations

[[0],
 [1],
 [2],
 [3],
 [4],
 [5],
 [6],
 [7],
 [8],
 [9],
 [10],
 [11],
 [12],
 [13],
 [14],
 [15],
 [16],
 [17],
 [18],
 [19],
 [20],
 [21],
 [22],
 [23],
 [24],
 [25],
 [26],
 [27],
 [28],
 [29],
 [30],
 [31],
 [32],
 [33],
 [34],
 [35],
 [36],
 [37],
 [38],
 [39],
 [40],
 [41],
 [42],
 [43],
 [44],
 [45],
 [46],
 [47],
 [48],
 [49],
 [50],
 [51],
 [52],
 [53],
 [54],
 [55],
 [56],
 [57],
 [58],
 [59],
 [60],
 [61],
 [62],
 [63],
 [64],
 [65],
 [66],
 [67],
 [68],
 [69],
 [70],
 [71],
 [72],
 [73],
 [74],
 [75],
 [76],
 [77],
 [78],
 [79],
 [80],
 [81],
 [82],
 [83],
 [84],
 [85],
 [86],
 [87],
 [88],
 [89],
 [90],
 [91],
 [92],
 [93],
 [94],
 [95],
 [96],
 [97],
 [98],
 [99],
 [100],
 [101],
 [102],
 [103],
 [104],
 [105],
 [106],
 [107],
 [108],
 [109],
 [110],
 [111],
 [112],
 [113],
 [114],
 [115],
 [116],
 [117],
 [118],
 [119],
 [120],
 [121],
 [122],
 [123],
 [124],
 [125],
 [126],
 [127],
 [128],
 [129],
 [130],
 [131],
 [132],
 [133],
 [134],
 [135],
 [136],
 [137],
 [138]

In [18]:
# Select the matrix columns named by one combination (here, the first one)
# At level 1 a combination names a single item, so a single column comes back

customer_item_matrix[l1_combinations[0]]

itemID,0
Member_number,Unnamed: 1_level_1
1000,0
1001,0
1002,0
1003,0
1004,0
...,...
4996,0
4997,0
4998,0
4999,0


In [19]:
# Add up the selected columns row by row, giving one total per member
# At level 1 there is only one column, so the totals simply equal that column

customer_item_matrix[l1_combinations[0]].sum(axis=1)

Member_number
1000    0
1001    0
1002    0
1003    0
1004    0
       ..
4996    0
4997    0
4998    0
4999    0
5000    0
Length: 3898, dtype: int64

In [20]:
# A level-N combination contains N items
# A row total equal to the level therefore means that member bought every item in the combination
# Counting those rows gives the number of members who bought the whole combination

count = customer_item_matrix[l1_combinations[0]].sum(axis=1).eq(level).sum()
count

60

In [21]:
# Repeat the same count for every combination

l1_counts = [
    customer_item_matrix[combination].sum(axis=1).eq(level).sum()
    for combination in l1_combinations
]
l1_counts

[60,
 306,
 22,
 29,
 3,
 4,
 121,
 17,
 466,
 311,
 242,
 619,
 833,
 38,
 530,
 493,
 253,
 89,
 66,
 210,
 644,
 115,
 21,
 80,
 171,
 42,
 174,
 392,
 337,
 60,
 723,
 31,
 73,
 16,
 448,
 93,
 15,
 17,
 12,
 345,
 471,
 46,
 9,
 33,
 337,
 127,
 73,
 133,
 67,
 519,
 40,
 64,
 29,
 142,
 67,
 16,
 536,
 5,
 90,
 101,
 11,
 245,
 72,
 400,
 487,
 214,
 9,
 247,
 313,
 208,
 153,
 13,
 45,
 204,
 220,
 59,
 34,
 32,
 30,
 1,
 28,
 9,
 103,
 67,
 48,
 255,
 5,
 37,
 456,
 74,
 248,
 35,
 230,
 91,
 317,
 545,
 22,
 33,
 217,
 297,
 10,
 22,
 1468,
 124,
 118,
 692,
 85,
 77,
 130,
 665,
 47,
 516,
 116,
 22,
 1,
 150,
 19,
 16,
 15,
 155,
 49,
 80,
 1363,
 899,
 5,
 32,
 6,
 89,
 270,
 44,
 803,
 101,
 141,
 656,
 20,
 202,
 27,
 20,
 1222,
 147,
 41,
 48,
 46,
 205,
 71,
 228,
 29,
 11,
 39,
 98,
 257,
 68,
 21,
 27,
 22,
 5,
 911,
 78,
 50,
 269,
 603,
 8,
 346,
 172,
 1786,
 1103,
 60]

In [22]:
# Pair each combination with its count in one DataFrame (the candidate table)

C_df = pd.DataFrame(dict(combination=l1_combinations, count=l1_counts))
C_df

Unnamed: 0,combination,count
0,[0],60
1,[1],306
2,[2],22
3,[3],29
4,[4],3
...,...,...
162,[162],346
163,[163],172
164,[164],1786
165,[165],1103


In [23]:
# Keep only the candidates whose count is above the minimum count

L_df = C_df.loc[C_df['count'].gt(min_count)]
L_df

Unnamed: 0,combination,count
1,[1],306
6,[6],121
8,[8],466
9,[9],311
10,[10],242
...,...,...
160,[160],603
162,[162],346
163,[163],172
164,[164],1786


In [24]:
# Pull out the "combination" column: these are the combinations that passed level 1

l1_items = L_df['combination']
l1_items

1        [1]
6        [6]
8        [8]
9        [9]
10      [10]
       ...  
160    [160]
162    [162]
163    [163]
164    [164]
165    [165]
Name: combination, Length: 78, dtype: object

In [25]:
# Turn the Series of one-item lists into a NumPy array,
# flatten it to one dimension,
# and keep each item once
# (at level 1 no item can repeat, so nothing is actually removed)

l1_items = np.asarray(l1_items.tolist())
l1_items = np.unique(l1_items.ravel())
l1_items

array([  1,   6,   8,   9,  10,  11,  12,  14,  15,  16,  19,  20,  21,
        24,  26,  27,  28,  30,  34,  39,  40,  44,  45,  47,  49,  53,
        56,  59,  61,  63,  64,  65,  67,  68,  69,  70,  73,  74,  82,
        85,  88,  90,  92,  94,  95,  98,  99, 102, 103, 104, 105, 108,
       109, 111, 112, 115, 119, 122, 123, 128, 130, 131, 132, 133, 135,
       138, 139, 143, 145, 149, 150, 156, 159, 160, 162, 163, 164, 165])

In [26]:
# Take only the columns of the items that passed level 1
customer_item_matrix_copy = customer_item_matrix.loc[:, l1_items]

In [27]:
# Chain all of the steps above together and repeat them level by level

# Level 1 is where the search begins
level = 1

# Prune columns on a copy only, so the original matrix stays intact
customer_item_matrix_copy = customer_item_matrix.copy()

# The number of levels is unknown up front, so loop with "while True" and break out once a level comes back empty
while True:
    # Same steps as demonstrated above, now for the current level:
    # build candidates from the surviving items, count the members who bought
    # every item of a candidate, then apply the minimum count
    item_list = list(customer_item_matrix_copy)
    combin = list(map(list, combinations(item_list, level)))
    counts = [
        customer_item_matrix[combination].sum(axis=1).eq(level).sum()
        for combination in combin
    ]
    C_df = pd.DataFrame(dict(combination=combin, count=counts))
    L_df = C_df.loc[C_df['count'].gt(min_count)]

    # Nothing passed the current level, so the search is over
    if len(L_df) == 0:
        break

    # Flatten the surviving combinations into the distinct items they contain
    itemset = np.unique(np.asarray(L_df['combination'].tolist()).ravel())

    # Only these items may form combinations at the next level
    customer_item_matrix_copy = customer_item_matrix_copy.loc[:, itemset]

    # Report the level, the number of surviving items and the items themselves
    print(level, len(itemset), itemset)

    # Move on to combinations that are one item larger
    level += 1

1 78 [  1   6   8   9  10  11  12  14  15  16  19  20  21  24  26  27  28  30
  34  39  40  44  45  47  49  53  56  59  61  63  64  65  67  68  69  70
  73  74  82  85  88  90  92  94  95  98  99 102 103 104 105 108 109 111
 112 115 119 122 123 128 130 131 132 133 135 138 139 143 145 149 150 156
 159 160 162 163 164 165]
2 58 [  1   8   9  10  11  12  14  15  16  19  20  26  27  28  30  34  39  40
  44  49  56  61  63  64  65  67  68  69  73  74  85  88  90  92  94  95
  98  99 102 105 109 111 122 123 128 130 133 135 138 143 145 150 156 159
 160 162 164 165]
3 28 [  8  11  12  14  15  20  30  34  40  49  56  63  64  88  95 102 105 109
 111 122 123 130 133 138 156 160 164 165]
4 6 [102 122 130 138 164 165]


# Same level-by-level search, this time counting combinations with self-joins of the purchase table
level = 1

while True:
    # One column per position in a combination: item1 .. item<level>
    item_columns = ['item{}'.format(position) for position in range(1, level + 1)]

    # Join the purchase table to itself on the member, once per extra position,
    # so that each row ends up holding `level` items bought by the same member
    combin_df = df.rename(columns={'itemID': 'item1'})
    for i in range(2, level + 1):
        newest = 'item{}'.format(i)
        to_merge = df.rename(columns={'itemID': newest})
        combin_df = combin_df.merge(to_merge, on='Member_number')

        # Keep only rows whose items are in strictly ascending order,
        # which drops repeated items and reordered copies of the same combination
        for j in range(1, i):
            ascending = combin_df['item{}'.format(j)].lt(combin_df[newest])
            combin_df = combin_df.loc[ascending].reset_index(drop=True)

    # Count the rows (members) behind every distinct combination
    C = (
        combin_df[item_columns]
        .assign(count=1)
        .groupby(item_columns)
        .count()
        .reset_index()
    )
    L = C.loc[C['count'].gt(min_count)]

    # Stop as soon as a level produces no surviving combination
    if len(L) == 0:
        break

    # Distinct items that occur in the surviving combinations
    itemset = np.unique(L[item_columns].to_numpy().ravel())
    print(level, len(itemset), itemset)
    level += 1

combin_df

In [28]:
# Items found in the combinations of the last level that still had survivors
itemset

array([102, 122, 130, 138, 164, 165])

In [29]:
# Look up the descriptions of the items in the final itemset
item_df.loc[item_df['itemID'].isin(itemset)]

Unnamed: 0,itemID,itemDescription
102,102,other vegetables
122,122,rolls/buns
130,130,sausage
138,138,soda
164,164,whole milk
165,165,yogurt


In [30]:
# List every (antecedent size, consequent size) pair a rule can have.
# Each side needs at least one item, and the two sides together cannot use
# more items than the itemset contains. The limits come from len(itemset)
# rather than a hard-coded size, so this also works for other itemsets.
for n_antecedents in range(1, len(itemset)):
    for n_consequents in range(1, len(itemset) - n_antecedents + 1):
        print(n_antecedents, n_consequents)

1 1
1 2
1 3
1 4
1 5
2 1
2 2
2 3
2 4
3 1
3 2
3 3
4 1
4 2
5 1


In [31]:
sample_list = itemset

# Collect every group of 1 up to len(sample_list) - 1 items, smallest groups first,
# storing each group as a list
list_combinations = [
    list(subset)
    for size in range(1, len(sample_list))
    for subset in combinations(sample_list, size)
]
print(list_combinations)

[[102], [122], [130], [138], [164], [165], [102, 122], [102, 130], [102, 138], [102, 164], [102, 165], [122, 130], [122, 138], [122, 164], [122, 165], [130, 138], [130, 164], [130, 165], [138, 164], [138, 165], [164, 165], [102, 122, 130], [102, 122, 138], [102, 122, 164], [102, 122, 165], [102, 130, 138], [102, 130, 164], [102, 130, 165], [102, 138, 164], [102, 138, 165], [102, 164, 165], [122, 130, 138], [122, 130, 164], [122, 130, 165], [122, 138, 164], [122, 138, 165], [122, 164, 165], [130, 138, 164], [130, 138, 165], [130, 164, 165], [138, 164, 165], [102, 122, 130, 138], [102, 122, 130, 164], [102, 122, 130, 165], [102, 122, 138, 164], [102, 122, 138, 165], [102, 122, 164, 165], [102, 130, 138, 164], [102, 130, 138, 165], [102, 130, 164, 165], [102, 138, 164, 165], [122, 130, 138, 164], [122, 130, 138, 165], [122, 130, 164, 165], [122, 138, 164, 165], [130, 138, 164, 165], [102, 122, 130, 138, 164], [102, 122, 130, 138, 165], [102, 122, 130, 164, 165], [102, 122, 138, 164, 165],

In [32]:
# Number of item groups built above; for 6 items that is 2**6 - 2 = 62 (neither the empty nor the full set)
len(list_combinations)

62

In [33]:
# Pair every candidate antecedent with every candidate consequent that shares
# no item with it. All candidates are drawn from the same pool of items, so
# two disjoint candidates can never hold more items than the pool; the
# disjointness test is therefore the only condition needed.
# Each side is copied so that the rows do not share list objects with
# list_combinations.
rule_pairs = [
    [list(antecedent), list(consequent)]
    for antecedent in list_combinations
    for consequent in list_combinations
    if set(antecedent).isdisjoint(consequent)
]

AB_df = pd.DataFrame(rule_pairs, columns=['antecedents', 'consequents'])
AB_df

Unnamed: 0,antecedents,consequents
0,[102],[122]
1,[102],[130]
2,[102],[138]
3,[102],[164]
4,[102],[165]
...,...,...
597,"[102, 122, 130, 138, 165]",[164]
598,"[102, 122, 130, 164, 165]",[138]
599,"[102, 122, 138, 164, 165]",[130]
600,"[102, 130, 138, 164, 165]",[122]
