In [1]:
# Question 4: Combine hierarchical clustering with Apriori to analyze clustered data and find frequent patterns within each cluster of a given dataset.

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist

dataset = [
    ['milk', 'bread', 'butter'],
    ['bread', 'butter'],
    ['milk', 'bread'],
    ['milk', 'bread', 'butter', 'eggs'],
    ['bread', 'butter'],
    ['eggs', 'bread'],
    ['milk', 'eggs'],
    ['bread', 'butter', 'eggs'],
    ['milk', 'bread', 'eggs'],
    ['butter', 'eggs']
]

te = TransactionEncoder()
te_array = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_array, columns=te.columns_)

distance_matrix = pdist(df.values, metric='jaccard')
Z = linkage(distance_matrix, method='ward')

cluster_labels = fcluster(Z, t=3, criterion='maxclust')

df['cluster'] = cluster_labels

for cluster_num in sorted(df['cluster'].unique()):
    cluster_data = df[df['cluster'] == cluster_num].drop(columns=['cluster'])
    frequent_itemsets = apriori(cluster_data, min_support=0.5, use_colnames=True)
    print(f"\nCluster {cluster_num} Frequent Itemsets:")
    print(frequent_itemsets)



Cluster 1 Frequent Itemsets:
   support       itemsets
0      0.8        (bread)
1      0.6         (eggs)
2      1.0         (milk)
3      0.8  (milk, bread)
4      0.6   (milk, eggs)

Cluster 2 Frequent Itemsets:
   support         itemsets
0      1.0          (bread)
1      1.0         (butter)
2      1.0  (butter, bread)

Cluster 3 Frequent Itemsets:
    support        itemsets
0  0.666667         (bread)
1  0.666667        (butter)
2  1.000000          (eggs)
3  0.666667   (eggs, bread)
4  0.666667  (eggs, butter)
