In [7]:
# pip install mlxtend

In [9]:
# Import libraries
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [11]:
# Load the dataset
file_path = 'data_files/Groceries_dataset.csv'

df = pd.read_csv(file_path)

# Let's inspect the data
print("Data loaded successfully!")
df.head()

Data loaded successfully!


Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


##### Data Preprocessing

In [14]:
# Clean up item descriptions (remove extra spaces)
df['Item'] = df['itemDescription'].str.strip()

In [16]:
# We need to consolidate items into one transaction per row.
# We can use groupby() and apply(list), but a 'crosstab' is more efficient.
# Or, let's use groupby and unstack.
basket = (df.groupby(['Member_number', 'Item'])['Item']
          .count().unstack().reset_index().fillna(0)
          .set_index('Member_number'))

In [22]:
# Now, we need to convert all counts > 0 to 1 (hot encoding)
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1

# Apply the encoding function to the whole dataframe
basket_sets = basket.map(encode_units)


In [26]:
print("\nData preprocessed into one-hot format:")
basket_sets.head()


Data preprocessed into one-hot format:


Item,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
Member_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1001,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,1,0,1,0,0
1002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [28]:
# Remeber Drop the 'Date' column if it's there (it's not an item) This dataset doesn't have it, but it's a common step.



##### Applying the Apriori Algorithm

In [31]:
# Find frequent itemsets with a minimum support of 1%
frequent_itemsets = apriori(basket_sets, min_support=0.01, use_colnames=True)

# Sort by support
frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)

print("\nFrequent itemsets found:")
frequent_itemsets.head(10)




Frequent itemsets found:


Unnamed: 0,support,itemsets
113,0.458184,(whole milk)
69,0.376603,(other vegetables)
84,0.349666,(rolls/buns)
94,0.313494,(soda)
114,0.282966,(yogurt)
106,0.23371,(tropical fruit)
85,0.230631,(root vegetables)
7,0.213699,(bottled water)
89,0.206003,(sausage)
1050,0.19138,"(whole milk, other vegetables)"


##### This output shows the most popular items and combinations with  "whole milk" and "other vegetables"  at the top

##### Generating Association Rules

In [34]:
# Generate the rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

In [36]:
# Sort the rules by lift and confidence
rules = rules.sort_values(by=['lift', 'confidence'], ascending=False)

In [41]:
# Clean up the output for better reading
rules['antecedents'] = rules['antecedents'].apply(lambda x: ', '.join(list(x)))
rules['consequents'] = rules['consequents'].apply(lambda x: ', '.join(list(x)))

print("\nTop association rules found:")
rules.head(10)


Top association rules found:


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
7291,"w, h, o, l, e, , m, i, l, k, ,, , o, t, h, e...","y, o, g, u, r, t, ,, , r, o, l, l, s, /, b, u...",0.050282,0.111339,0.013597,0.270408,2.428689,1.0,0.007998,1.218025,0.6194,0.091854,0.178999,0.196264
7310,"y, o, g, u, r, t, ,, , r, o, l, l, s, /, b, u...","w, h, o, l, e, , m, i, l, k, ,, , o, t, h, e...",0.111339,0.050282,0.013597,0.12212,2.428689,1.0,0.007998,1.081831,0.661957,0.091854,0.075641,0.196264
7296,"y, o, g, u, r, t, ,, , r, o, l, l, s, /, b, u...","w, h, o, l, e, , m, i, l, k, ,, , s, a, u, s...",0.052335,0.106978,0.013597,0.259804,2.428575,1.0,0.007998,1.206467,0.620721,0.09331,0.171133,0.193451
7305,"w, h, o, l, e, , m, i, l, k, ,, , s, a, u, s...","y, o, g, u, r, t, ,, , r, o, l, l, s, /, b, u...",0.106978,0.052335,0.013597,0.127098,2.428575,1.0,0.007998,1.08565,0.658702,0.09331,0.078893,0.193451
14808,"y, o, g, u, r, t, ,, , c, u, r, d","w, h, o, l, e, , m, i, l, k, ,, , s, a, u, s...",0.040277,0.106978,0.010005,0.248408,2.322046,1.0,0.005696,1.188173,0.593239,0.072897,0.158372,0.170966
14809,"w, h, o, l, e, , m, i, l, k, ,, , s, a, u, s...","y, o, g, u, r, t, ,, , c, u, r, d",0.106978,0.040277,0.010005,0.093525,2.322046,1.0,0.005696,1.058742,0.637549,0.072897,0.055483,0.170966
7297,"w, h, o, l, e, , m, i, l, k, ,, , r, o, l, l...","y, o, g, u, r, t, ,, , o, t, h, e, r, , v, e...",0.048743,0.120318,0.013597,0.278947,2.318415,1.0,0.007732,1.219997,0.59781,0.087459,0.180326,0.195977
7304,"y, o, g, u, r, t, ,, , o, t, h, e, r, , v, e...","w, h, o, l, e, , m, i, l, k, ,, , r, o, l, l...",0.120318,0.048743,0.013597,0.113006,2.318415,1.0,0.007732,1.072451,0.646451,0.087459,0.067556,0.195977
7293,"y, o, g, u, r, t, ,, , w, h, o, l, e, , m, i...","r, o, l, l, s, /, b, u, n, s, ,, , s, a, u, s...",0.071832,0.08235,0.013597,0.189286,2.298554,1.0,0.007681,1.131903,0.608665,0.096715,0.116532,0.177197
7308,"r, o, l, l, s, /, b, u, n, s, ,, , s, a, u, s...","y, o, g, u, r, t, ,, , w, h, o, l, e, , m, i...",0.08235,0.071832,0.013597,0.165109,2.298554,1.0,0.007681,1.111724,0.615642,0.096715,0.100496,0.177197


antecedents,         consequents,       support,      confidence,         lift
herbs,             root vegetables,      0.010,         0.400,             3.6
berries,           whipping/sour cream,  0.010,         0.280,             1.6
 

Rule 1 (Example): "If a customer buys herbs (antecedent), they are 40% likely (confidence) to also buy root vegetables (consequent). This combination is 3.6 times more likely (lift) to happen than by random chance."

Rule 2 (Example): "If a customer buys berries, they are 28% likely to also buy whipping/sour cream."

These insights are directly actionable:

Place herbs and root vegetables near each other in the store.

Offer a coupon for whipping/sour cream when a customer buys berries.