In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Mlxtend library
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

# pyECLAT library
from pyECLAT import ECLAT

import ast

In [2]:
# Customer-level cluster assignments; this table supplies the `customer_id` join
# key and the `kmeans_cluster` label used to split the baskets further down.
clusters = pd.read_csv("Clusters.csv")
# One row per invoice: invoice_id, list_of_goods (a stringified Python list of
# item names) and customer_id.
baskets = pd.read_csv("customer_basket.csv")

In [3]:
# Quick schema check: column names, non-null counts and dtypes of the invoices table.
baskets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89952 entries, 0 to 89951
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   invoice_id     89952 non-null  int64 
 1   list_of_goods  89952 non-null  object
 2   customer_id    89952 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 2.1+ MB


In [6]:
# Summary statistics for every column (numeric and object alike).
baskets.describe(include="all")

Unnamed: 0,invoice_id,list_of_goods,customer_id
count,89952.0,89952,89952.0
unique,,88767,
top,,"['babies food', 'cooking oil']",
freq,,25,
mean,6126259.0,,21855.193915
std,3527265.0,,12610.661213
min,20066.0,,1.0
25%,3085110.0,,10814.0
50%,6133909.0,,21904.0
75%,9185876.0,,32771.0


In [4]:
# Number of distinct customers that appear in the invoices table.
baskets["customer_id"].nunique()

28516

In [7]:
# Attach each invoice's customer-level cluster columns. This is an inner join, so
# invoices whose customer_id is missing from `clusters` are dropped.
# NOTE(review): if `clusters` ever held more than one row per customer_id this
# join would duplicate invoices and inflate supports -- consider validating key
# uniqueness (e.g. merge(..., validate="many_to_one")) once confirmed safe.
basket = pd.merge(baskets, clusters, how="inner", on="customer_id")

In [8]:
# Split the merged invoices into one DataFrame per k-means cluster label (0-9).
# The generator yields the filtered frames in label order and they are unpacked
# straight into the ten per-cluster names used by the cells below.
(
    basket_cluster_0,
    basket_cluster_1,
    basket_cluster_2,
    basket_cluster_3,
    basket_cluster_4,
    basket_cluster_5,
    basket_cluster_6,
    basket_cluster_7,
    basket_cluster_8,
    basket_cluster_9,
) = (basket[basket["kmeans_cluster"] == label] for label in range(10))

# Cluster 0 (basket analysis)

In [9]:
# Parse every cluster-0 invoice's stringified item list (e.g.
# "['babies food', 'cooking oil']") into a real Python list of item names.
# The column is selected by name rather than by position and iterated once,
# instead of doing one scalar .iloc lookup per row.
baskets_0 = [
    ast.literal_eval(goods) for goods in basket_cluster_0["list_of_goods"]
]

In [10]:
# Encode the cluster-0 baskets with a fresh TransactionEncoder, then wrap the
# encoded array in a DataFrame labelled with the encoder's learned item columns.
te = TransactionEncoder()
te.fit(baskets_0)
te_fit = te.transform(baskets_0)
basket_items_0 = pd.DataFrame(data=te_fit, columns=te.columns_)

In [11]:
# Frequent itemsets for cluster 0, using a minimum support of 0.01.
# NOTE(review): the support threshold is not the same for every cluster in this
# notebook; presumably hand-tuned per cluster -- confirm and record the rationale.
frequent_itemsets_0 = apriori(basket_items_0, min_support=0.01, use_colnames=True)

# Derive association rules from those itemsets, thresholding confidence at 0.2.
rules_0 = association_rules(
    frequent_itemsets_0,
    metric="confidence",
    min_threshold=0.2,
)

In [12]:
# Rank the cluster-0 rules from highest to lowest lift and show the first 50.
rules_0.sort_values("lift", ascending=False).iloc[:50]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
2695,"(white wine, champagne)","(beer, dessert wine)",0.048802,0.033245,0.011351,0.232586,6.99611,0.009728,1.259756,0.901035
2692,"(beer, dessert wine)","(white wine, champagne)",0.033245,0.048802,0.011351,0.341421,6.99611,0.009728,1.44432,0.886536
2693,"(beer, champagne)","(dessert wine, white wine)",0.023969,0.068564,0.011351,0.473558,6.906779,0.009707,1.769303,0.876216
2694,"(dessert wine, champagne)","(beer, white wine)",0.02829,0.058251,0.011351,0.401222,6.887843,0.009703,1.572785,0.879703
3885,"(white wine, french wine)","(dessert wine, cider)",0.035377,0.050761,0.0121,0.34202,6.737901,0.010304,1.442656,0.882817
3884,"(dessert wine, cider)","(white wine, french wine)",0.050761,0.035377,0.0121,0.238365,6.737901,0.010304,1.266517,0.897124
3886,"(french wine, cider)","(dessert wine, white wine)",0.027195,0.068564,0.0121,0.444915,6.489033,0.010235,1.678007,0.869541
2704,"(dessert wine, cider)","(beer, white wine)",0.050761,0.058251,0.018726,0.368899,6.332948,0.015769,1.492232,0.887127
2701,"(beer, white wine)","(dessert wine, cider)",0.058251,0.050761,0.018726,0.321464,6.332948,0.015769,1.398952,0.894182
2703,"(dessert wine, white wine)","(beer, cider)",0.068564,0.043213,0.018726,0.273109,6.320112,0.015763,1.316274,0.903739


# Cluster 1 (basket analysis)

In [13]:
# Parse every cluster-1 invoice's stringified item list (e.g.
# "['babies food', 'cooking oil']") into a real Python list of item names.
# The column is selected by name rather than by position and iterated once,
# instead of doing one scalar .iloc lookup per row.
baskets_1 = [
    ast.literal_eval(goods) for goods in basket_cluster_1["list_of_goods"]
]

In [14]:
# Encode the cluster-1 baskets with a fresh TransactionEncoder, then wrap the
# encoded array in a DataFrame labelled with the encoder's learned item columns.
te = TransactionEncoder()
te.fit(baskets_1)
te_fit = te.transform(baskets_1)
basket_items_1 = pd.DataFrame(data=te_fit, columns=te.columns_)

In [15]:
# Frequent itemsets for cluster 1, using a minimum support of 0.05.
# NOTE(review): the support threshold is not the same for every cluster in this
# notebook; presumably hand-tuned per cluster -- confirm and record the rationale.
frequent_itemsets_1 = apriori(basket_items_1, min_support=0.05, use_colnames=True)

# Derive association rules from those itemsets, thresholding confidence at 0.2.
rules_1 = association_rules(
    frequent_itemsets_1,
    metric="confidence",
    min_threshold=0.2,
)

In [16]:
# Rank the cluster-1 rules from highest to lowest lift and show the first 50.
rules_1.sort_values("lift", ascending=False).iloc[:50]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
54,(cider),(white wine),0.083806,0.107777,0.056664,0.676128,6.273413,0.047631,2.754865,0.917488
53,(white wine),(cider),0.107777,0.083806,0.056664,0.52575,6.273413,0.047631,1.931879,0.942138
319,"(asparagus, mashed potato)","(melons, tomatoes)",0.217322,0.179079,0.05337,0.24558,1.371348,0.014452,1.088148,0.345979
316,"(melons, tomatoes)","(asparagus, mashed potato)",0.179079,0.217322,0.05337,0.298025,1.371348,0.014452,1.114964,0.329862
279,"(tomatoes, green beans)","(carrots, asparagus)",0.13559,0.347484,0.0638,0.470535,1.354121,0.016685,1.232407,0.302534
320,"(melons, mashed potato)","(tomatoes, asparagus)",0.082403,0.480756,0.05337,0.647668,1.347186,0.013754,1.473736,0.280856
271,"(carrots, asparagus)","(tomatoes, frozen vegetables)",0.347484,0.162733,0.076121,0.219063,1.346152,0.019574,1.072131,0.394077
270,"(tomatoes, frozen vegetables)","(carrots, asparagus)",0.162733,0.347484,0.076121,0.467766,1.346152,0.019574,1.225995,0.30712
302,"(carrots, asparagus)","(melons, tomatoes)",0.347484,0.179079,0.083684,0.240829,1.344817,0.021457,1.081338,0.392947
301,"(melons, tomatoes)","(carrots, asparagus)",0.179079,0.347484,0.083684,0.467302,1.344817,0.021457,1.224928,0.312338


# Cluster 2 (basket analysis)

In [17]:
# Parse every cluster-2 invoice's stringified item list (e.g.
# "['babies food', 'cooking oil']") into a real Python list of item names.
# The column is selected by name rather than by position and iterated once,
# instead of doing one scalar .iloc lookup per row.
baskets_2 = [
    ast.literal_eval(goods) for goods in basket_cluster_2["list_of_goods"]
]

In [18]:
# Encode the cluster-2 baskets with a fresh TransactionEncoder, then wrap the
# encoded array in a DataFrame labelled with the encoder's learned item columns.
te = TransactionEncoder()
te.fit(baskets_2)
te_fit = te.transform(baskets_2)
basket_items_2 = pd.DataFrame(data=te_fit, columns=te.columns_)

In [19]:
# Frequent itemsets for cluster 2, using a minimum support of 0.05.
# NOTE(review): the support threshold is not the same for every cluster in this
# notebook; presumably hand-tuned per cluster -- confirm and record the rationale.
frequent_itemsets_2 = apriori(basket_items_2, min_support=0.05, use_colnames=True)

# Derive association rules from those itemsets, thresholding confidence at 0.2.
rules_2 = association_rules(
    frequent_itemsets_2,
    metric="confidence",
    min_threshold=0.2,
)

In [20]:
# Rank the cluster-2 rules from highest to lowest lift and show the first 50.
rules_2.sort_values("lift", ascending=False).iloc[:50]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
429,"(laptop, samsung galaxy 10)","(airpods, champagne)",0.145216,0.284119,0.055852,0.384615,1.353715,0.014594,1.163307,0.305682
427,"(laptop, airpods)","(champagne, samsung galaxy 10)",0.10102,0.408936,0.055852,0.552885,1.352006,0.014542,1.321949,0.289615
458,"(bluetooth headphones, samsung galaxy 10)","(spaghetti, champagne)",0.213696,0.242351,0.069937,0.327273,1.35041,0.018148,1.126236,0.330005
455,"(spaghetti, champagne)","(bluetooth headphones, samsung galaxy 10)",0.242351,0.213696,0.069937,0.288577,1.35041,0.018148,1.105256,0.342486
403,(turkey),"(champagne, samsung galaxy 10)",0.121418,0.408936,0.066537,0.548,1.340062,0.016885,1.307663,0.288836
430,"(airpods, samsung galaxy 10)","(laptop, champagne)",0.204468,0.207382,0.055852,0.273159,1.317177,0.013449,1.090497,0.302692
428,"(laptop, champagne)","(airpods, samsung galaxy 10)",0.207382,0.204468,0.055852,0.269321,1.317177,0.013449,1.088757,0.303804
182,"(escalope, champagne)",(airpods),0.119475,0.359883,0.056338,0.471545,1.310271,0.013341,1.211297,0.268929
367,"(spaghetti, champagne)",(iphone 8),0.242351,0.233123,0.073822,0.304609,1.306647,0.017325,1.1028,0.30975
371,(iphone 8),"(spaghetti, champagne)",0.233123,0.242351,0.073822,0.316667,1.306647,0.017325,1.108755,0.306023


# Cluster 3 (basket analysis)

There are no transactions in this dataset for customers in cluster 3.

# Cluster 4 (basket analysis)

In [21]:
# Parse every cluster-4 invoice's stringified item list (e.g.
# "['babies food', 'cooking oil']") into a real Python list of item names.
# The column is selected by name rather than by position and iterated once,
# instead of doing one scalar .iloc lookup per row.
baskets_4 = [
    ast.literal_eval(goods) for goods in basket_cluster_4["list_of_goods"]
]

In [22]:
# Encode the cluster-4 baskets with a fresh TransactionEncoder, then wrap the
# encoded array in a DataFrame labelled with the encoder's learned item columns.
te = TransactionEncoder()
te.fit(baskets_4)
te_fit = te.transform(baskets_4)
basket_items_4 = pd.DataFrame(data=te_fit, columns=te.columns_)

In [23]:
# Frequent itemsets for cluster 4, using a minimum support of 0.07.
# NOTE(review): the support threshold is not the same for every cluster in this
# notebook; presumably hand-tuned per cluster -- confirm and record the rationale.
frequent_itemsets_4 = apriori(basket_items_4, min_support=0.07, use_colnames=True)

# Derive association rules from those itemsets, thresholding confidence at 0.2.
rules_4 = association_rules(
    frequent_itemsets_4,
    metric="confidence",
    min_threshold=0.2,
)

In [24]:
# Rank the cluster-4 rules from highest to lowest lift and show the first 50.
rules_4.sort_values("lift", ascending=False).iloc[:50]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
23,(cider),(white wine),0.153892,0.214282,0.109599,0.71218,3.323565,0.076622,2.729891,0.826275
22,(white wine),(cider),0.214282,0.153892,0.109599,0.51147,3.323565,0.076622,1.731946,0.889783
36,(white wine),(dessert wine),0.214282,0.119484,0.081888,0.382153,3.19837,0.056285,1.425136,0.874793
35,(dessert wine),(white wine),0.119484,0.214282,0.081888,0.685353,3.19837,0.056285,2.497139,0.780611
50,"(cake, candy bars)",(cooking oil),0.109491,0.397126,0.070005,0.639369,1.609988,0.026523,1.671717,0.425461
53,"(cake, candy bars)",(oil),0.109491,0.435856,0.076541,0.699063,1.603885,0.028819,1.874623,0.422807
65,"(cooking oil, candy bars)",(oil),0.144493,0.435856,0.10074,0.697196,1.599603,0.037762,1.863069,0.438155
68,(oil),"(cooking oil, candy bars)",0.435856,0.144493,0.10074,0.231131,1.599603,0.037762,1.112683,0.664449
67,(cooking oil),"(oil, candy bars)",0.397126,0.158807,0.10074,0.253672,1.59736,0.037673,1.127109,0.620308
66,"(oil, candy bars)",(cooking oil),0.158807,0.397126,0.10074,0.634354,1.59736,0.037673,1.648789,0.444568


# Cluster 5 (basket analysis)

In [25]:
# Parse every cluster-5 invoice's stringified item list (e.g.
# "['babies food', 'cooking oil']") into a real Python list of item names.
# The column is selected by name rather than by position and iterated once,
# instead of doing one scalar .iloc lookup per row.
baskets_5 = [
    ast.literal_eval(goods) for goods in basket_cluster_5["list_of_goods"]
]

In [26]:
# Encode the cluster-5 baskets with a fresh TransactionEncoder, then wrap the
# encoded array in a DataFrame labelled with the encoder's learned item columns.
te = TransactionEncoder()
te.fit(baskets_5)
te_fit = te.transform(baskets_5)
basket_items_5 = pd.DataFrame(data=te_fit, columns=te.columns_)

In [27]:
# Frequent itemsets for cluster 5, using a minimum support of 0.05.
# NOTE(review): the support threshold is not the same for every cluster in this
# notebook; presumably hand-tuned per cluster -- confirm and record the rationale.
frequent_itemsets_5 = apriori(basket_items_5, min_support=0.05, use_colnames=True)

# Derive association rules from those itemsets, thresholding confidence at 0.2.
rules_5 = association_rules(
    frequent_itemsets_5,
    metric="confidence",
    min_threshold=0.2,
)

In [28]:
# Rank the cluster-5 rules from highest to lowest lift and show the first 50.
rules_5.sort_values("lift", ascending=False).iloc[:50]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
215,"(cottage cheese, samsung galaxy 10)",(airpods),0.13584,0.349046,0.059914,0.44106,1.263613,0.012499,1.164621,0.241412
390,"(airpods, champagne)","(bluetooth headphones, samsung galaxy 10)",0.27204,0.211587,0.071608,0.263228,1.244063,0.014048,1.07009,0.269496
389,"(bluetooth headphones, samsung galaxy 10)","(airpods, champagne)",0.211587,0.27204,0.071608,0.338435,1.244063,0.014048,1.100361,0.248832
387,"(airpods, bluetooth headphones)","(champagne, samsung galaxy 10)",0.148795,0.389888,0.071608,0.481258,1.234347,0.013595,1.176136,0.223042
408,"(bluetooth headphones, samsung galaxy 10)","(laptop, champagne)",0.211587,0.202051,0.052357,0.247449,1.224685,0.009606,1.060325,0.2327
406,"(laptop, champagne)","(bluetooth headphones, samsung galaxy 10)",0.202051,0.211587,0.052357,0.259127,1.224685,0.009606,1.064168,0.229919
405,"(laptop, bluetooth headphones)","(champagne, samsung galaxy 10)",0.111911,0.389888,0.052357,0.467846,1.199947,0.008724,1.146494,0.187628
388,"(champagne, bluetooth headphones)","(airpods, samsung galaxy 10)",0.308384,0.193595,0.071608,0.232205,1.19944,0.011907,1.050288,0.240419
391,"(airpods, samsung galaxy 10)","(champagne, bluetooth headphones)",0.193595,0.308384,0.071608,0.369888,1.19944,0.011907,1.097608,0.206196
392,(airpods),"(champagne, bluetooth headphones, samsung gala...",0.349046,0.171644,0.071608,0.205155,1.19523,0.011697,1.042159,0.250926


# Cluster 6 (basket analysis)

In [29]:
# Parse every cluster-6 invoice's stringified item list (e.g.
# "['babies food', 'cooking oil']") into a real Python list of item names.
# The column is selected by name rather than by position and iterated once,
# instead of doing one scalar .iloc lookup per row.
baskets_6 = [
    ast.literal_eval(goods) for goods in basket_cluster_6["list_of_goods"]
]

In [30]:
# Encode the cluster-6 baskets with a fresh TransactionEncoder, then wrap the
# encoded array in a DataFrame labelled with the encoder's learned item columns.
te = TransactionEncoder()
te.fit(baskets_6)
te_fit = te.transform(baskets_6)
basket_items_6 = pd.DataFrame(data=te_fit, columns=te.columns_)

In [31]:
# Frequent itemsets for cluster 6, using a minimum support of 0.05.
# NOTE(review): the support threshold is not the same for every cluster in this
# notebook; presumably hand-tuned per cluster -- confirm and record the rationale.
frequent_itemsets_6 = apriori(basket_items_6, min_support=0.05, use_colnames=True)

# Derive association rules from those itemsets, thresholding confidence at 0.2.
rules_6 = association_rules(
    frequent_itemsets_6,
    metric="confidence",
    min_threshold=0.2,
)

In [32]:
# Rank the cluster-6 rules from highest to lowest lift and show the first 50.
rules_6.sort_values("lift", ascending=False).iloc[:50]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
170,"(spaghetti, airpods)",(bluetooth headphones),0.107176,0.384111,0.051152,0.477273,1.242539,0.009985,1.178223,0.218628
415,"(laptop, bluetooth headphones)","(champagne, samsung galaxy 10)",0.104366,0.39629,0.050403,0.482944,1.218664,0.009044,1.167592,0.200338
408,"(spaghetti, champagne)","(airpods, samsung galaxy 10)",0.232153,0.182874,0.051714,0.22276,1.218106,0.00926,1.051318,0.233189
410,"(airpods, samsung galaxy 10)","(spaghetti, champagne)",0.182874,0.232153,0.051714,0.282787,1.218106,0.00926,1.070598,0.219126
407,"(spaghetti, airpods)","(champagne, samsung galaxy 10)",0.107176,0.39629,0.051714,0.482517,1.217587,0.009242,1.166629,0.200155
426,"(bluetooth headphones, samsung galaxy 10)","(spaghetti, champagne)",0.207233,0.232153,0.05846,0.282098,1.215137,0.01035,1.06957,0.223329
424,"(spaghetti, champagne)","(bluetooth headphones, samsung galaxy 10)",0.232153,0.207233,0.05846,0.251816,1.215137,0.01035,1.059589,0.230577
171,"(spaghetti, bluetooth headphones)",(airpods),0.123103,0.342702,0.051152,0.415525,1.212497,0.008965,1.124596,0.199859
401,"(airpods, champagne)","(bluetooth headphones, samsung galaxy 10)",0.265692,0.207233,0.066142,0.248942,1.20127,0.011082,1.055535,0.228171
400,"(bluetooth headphones, samsung galaxy 10)","(airpods, champagne)",0.207233,0.265692,0.066142,0.319168,1.20127,0.011082,1.078545,0.211345


# Cluster 7 (basket analysis)

There are no transactions in this dataset for customers in cluster 7.

# Cluster 8 (basket analysis)

In [33]:
# Parse every cluster-8 invoice's stringified item list (e.g.
# "['babies food', 'cooking oil']") into a real Python list of item names.
# The column is selected by name rather than by position and iterated once,
# instead of doing one scalar .iloc lookup per row.
baskets_8 = [
    ast.literal_eval(goods) for goods in basket_cluster_8["list_of_goods"]
]

In [34]:
# Encode the cluster-8 baskets with a fresh TransactionEncoder, then wrap the
# encoded array in a DataFrame labelled with the encoder's learned item columns.
te = TransactionEncoder()
te.fit(baskets_8)
te_fit = te.transform(baskets_8)
basket_items_8 = pd.DataFrame(data=te_fit, columns=te.columns_)

In [35]:
# Frequent itemsets for cluster 8, using a minimum support of 0.01.
# NOTE(review): the support threshold is not the same for every cluster in this
# notebook; presumably hand-tuned per cluster -- confirm and record the rationale.
frequent_itemsets_8 = apriori(basket_items_8, min_support=0.01, use_colnames=True)

# Derive association rules from those itemsets, thresholding confidence at 0.2.
rules_8 = association_rules(
    frequent_itemsets_8,
    metric="confidence",
    min_threshold=0.2,
)

In [36]:
# Rank the cluster-8 rules from highest to lowest lift and show the first 50.
rules_8.sort_values("lift", ascending=False).iloc[:50]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
5551,"(muffins, soup, candy bars)","(cake, oil)",0.020155,0.362209,0.01124,0.557692,1.539696,0.00394,1.441962,0.357731
5554,"(muffins, soup)","(cake, candy bars, oil)",0.052035,0.141957,0.01124,0.216015,1.521689,0.003854,1.094463,0.361654
5703,"(gums, pet food)","(cooking oil, cake, oil)",0.036047,0.223934,0.012209,0.33871,1.512542,0.004137,1.173563,0.351533
5778,"(muffins, soup)","(cooking oil, cake, oil)",0.052035,0.223934,0.017345,0.333333,1.488533,0.005693,1.164099,0.346213
5552,"(muffins, oil, soup)","(cake, candy bars)",0.045833,0.165601,0.01124,0.245243,1.48093,0.00365,1.105521,0.340348
5766,"(muffins, olive oil)","(cooking oil, cake, oil)",0.037403,0.223934,0.012306,0.329016,1.469252,0.00393,1.156608,0.331791
3693,"(whole wheat rice, gums)","(cooking oil, cake)",0.026066,0.265698,0.010174,0.390335,1.469093,0.003249,1.204435,0.327854
5697,"(cooking oil, oil, pet food)","(cake, gums)",0.058915,0.141085,0.012209,0.207237,1.468877,0.003897,1.083444,0.339191
5701,"(oil, gums, pet food)","(cooking oil, cake)",0.031298,0.265698,0.012209,0.390093,1.468183,0.003893,1.203958,0.329189
5581,"(muffins, cooking oil, cake)","(french fries, oil)",0.078585,0.136919,0.015795,0.200986,1.467926,0.005035,1.080184,0.345954


# Cluster 9 (basket analysis)

In [37]:
# Parse every cluster-9 invoice's stringified item list (e.g.
# "['babies food', 'cooking oil']") into a real Python list of item names.
# The column is selected by name rather than by position and iterated once,
# instead of doing one scalar .iloc lookup per row.
baskets_9 = [
    ast.literal_eval(goods) for goods in basket_cluster_9["list_of_goods"]
]

In [38]:
# Encode the cluster-9 baskets with a fresh TransactionEncoder, then wrap the
# encoded array in a DataFrame labelled with the encoder's learned item columns.
te = TransactionEncoder()
te.fit(baskets_9)
te_fit = te.transform(baskets_9)
basket_items_9 = pd.DataFrame(data=te_fit, columns=te.columns_)

In [39]:
# Frequent itemsets for cluster 9, using a minimum support of 0.1.
# NOTE(review): the support threshold is not the same for every cluster in this
# notebook; presumably hand-tuned per cluster -- confirm and record the rationale.
frequent_itemsets_9 = apriori(basket_items_9, min_support=0.1, use_colnames=True)

# Derive association rules from those itemsets, thresholding confidence at 0.2.
rules_9 = association_rules(
    frequent_itemsets_9,
    metric="confidence",
    min_threshold=0.2,
)

In [40]:
# Rank the cluster-9 rules from highest to lowest lift and show the first 50.
rules_9.sort_values("lift", ascending=False).iloc[:50]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
79,(napkins),"(cooking oil, babies food)",0.22159,0.43763,0.115582,0.521603,1.191881,0.018608,1.17553,0.206819
75,"(cooking oil, babies food)",(napkins),0.43763,0.22159,0.115582,0.264109,1.191881,0.018608,1.057779,0.286271
78,(cooking oil),"(babies food, napkins)",0.544332,0.178507,0.115582,0.212337,1.189519,0.018415,1.04295,0.34965
77,"(babies food, napkins)",(cooking oil),0.178507,0.544332,0.115582,0.647493,1.189519,0.018415,1.292651,0.193945
83,(cake),"(cooking oil, candy bars)",0.413834,0.205009,0.100735,0.24342,1.187362,0.015896,1.050769,0.269202
81,"(cooking oil, candy bars)",(cake),0.205009,0.413834,0.100735,0.491371,1.187362,0.015896,1.152443,0.198489
69,(gums),"(cooking oil, babies food)",0.289441,0.43763,0.150132,0.518696,1.185239,0.023464,1.16843,0.219951
64,"(cooking oil, babies food)",(gums),0.43763,0.289441,0.150132,0.343056,1.185239,0.023464,1.081614,0.27791
67,(cooking oil),"(babies food, gums)",0.544332,0.233037,0.150132,0.275809,1.183541,0.023282,1.059062,0.340331
66,"(babies food, gums)",(cooking oil),0.233037,0.544332,0.150132,0.644239,1.183541,0.023282,1.280828,0.202198
