In [3]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [4]:
df = pd.read_csv('Groceries_dataset.csv')
df

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk
...,...,...,...
38760,4471,08-10-2014,sliced cheese
38761,2022,23-02-2014,candy
38762,1097,16-04-2014,cake bar
38763,1510,03-12-2014,fruit/vegetable juice


In [7]:
# Group transactions by Member_number and data
df_grouped = df.groupby(['Member_number','Date'])['itemDescription'].apply(list).reset_index()
df_grouped

Unnamed: 0,Member_number,Date,itemDescription
0,1000,15-03-2015,"[sausage, whole milk, semi-finished bread, yog..."
1,1000,24-06-2014,"[whole milk, pastry, salty snack]"
2,1000,24-07-2015,"[canned beer, misc. beverages]"
3,1000,25-11-2015,"[sausage, hygiene articles]"
4,1000,27-05-2015,"[soda, pickled vegetables]"
...,...,...,...
14958,4999,24-01-2015,"[tropical fruit, berries, other vegetables, yo..."
14959,4999,26-12-2015,"[bottled water, herbs]"
14960,5000,09-03-2014,"[fruit/vegetable juice, onions]"
14961,5000,10-02-2015,"[soda, root vegetables, semi-finished bread]"


In [13]:
transactions = df_grouped['itemDescription'].tolist()

In [14]:
te= TransactionEncoder()
te_array = te.fit(transactions).transform(transactions)
df_encoded = pd.DataFrame(te_array, columns=te.columns_)
df_encoded

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14958,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,True,False
14959,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14960,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14961,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


Support tells us  how frequently an itemset appears in the dataset

In [17]:
frequent_itemsets = apriori(df_encoded, min_support=0.002,use_colnames=True)
print(frequent_itemsets)

      support                          itemsets
0    0.004010           (Instant food products)
1    0.021386                        (UHT-milk)
2    0.008087                   (baking powder)
3    0.033950                            (beef)
4    0.021787                         (berries)
..        ...                               ...
325  0.002606             (waffles, whole milk)
326  0.004611  (whipped/sour cream, whole milk)
327  0.002941      (whipped/sour cream, yogurt)
328  0.003141         (white bread, whole milk)
329  0.011161              (yogurt, whole milk)

[330 rows x 2 columns]


Confidence tells us how likely it is  that a customer will be buy item B, given that they have already bought A

In [18]:
# Generate  associations rule
rules = association_rules(frequent_itemsets, metric='confidence',min_threshold=0.1)

lift tells us how many more likely item B is purchase whem A is purchase, Compared to when A is not purchase

In [19]:
rules_sorted = rules[['antecedents','consequents','support','confidence','lift']]\
    .sort_values(by='lift', ascending=True)

In [20]:
print('Top 10 Association Rules:\n')
print(rules_sorted.head(10))

Top 10 Association Rules:

             antecedents   consequents   support  confidence      lift
21             (dessert)  (whole milk)  0.002406    0.101983  0.645777
4              (berries)  (whole milk)  0.002272    0.104294  0.660414
58  (whipped/sour cream)  (whole milk)  0.004611    0.105505  0.668077
51     (root vegetables)  (whole milk)  0.007552    0.108549  0.687357
37             (napkins)  (whole milk)  0.002406    0.108761  0.688699
7        (bottled water)  (whole milk)  0.007151    0.117841  0.746196
1             (UHT-milk)  (whole milk)  0.002540    0.118750  0.751949
8          (brown bread)  (whole milk)  0.004478    0.119005  0.753566
54                (soda)  (whole milk)  0.011629    0.119752  0.758296
17              (coffee)  (whole milk)  0.003809    0.120507  0.763078
