In [1]:
# PARAMETERS (for papermill)

# Input
CLEANED_DATA_PATH = "data/processed/cleaned_uk_data.csv"
RULES_INPUT_PATH = "data/processed/rules_apriori_filtered.csv"  # hoặc rules_fpgrowth_filtered.csv

# Feature engineering
TOP_K_RULES = 200
SORT_RULES_BY = "lift"      # lift | confidence | support
WEIGHTING = "lift"          # none | lift | confidence | support | lift_x_conf
MIN_ANTECEDENT_LEN = 1
USE_RFM = True
RFM_SCALE = True
RULE_SCALE = False

# Clustering
K_MIN = 2
K_MAX = 10
N_CLUSTERS = None            # None => chọn theo silhouette, hoặc đặt số cụ thể (vd 5)
RANDOM_STATE = 42

# Output
OUTPUT_CLUSTER_PATH = "data/processed/customer_clusters_from_rules.csv"

# Visual
PROJECTION_METHOD = "pca"   # pca | svd
PLOT_2D = True


In [2]:
# Parameters
CLEANED_DATA_PATH = "data/processed/cleaned_uk_data.csv"
RULES_INPUT_PATH = "data/processed/rules_apriori_filtered.csv"
TOP_K_RULES = 200
SORT_RULES_BY = "lift"
WEIGHTING = "Lift"
MIN_ANTECEDENT_LEN = 2
USE_RFM = True
RFM_SCALE = True
RULE_SCALE = True
K_MIN = 2
K_MAX = 12
N_CLUSTERS = 3
RANDOM_STATE = 42
OUTPUT_CLUSTER_PATH = "data/processed/customer_clusters_from_rules.csv"
PROJECTION_METHOD = "pca"
PLOT_2D = True


In [3]:
%load_ext autoreload
%autoreload 2

import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Determine correct project root
cwd = os.getcwd()
if os.path.basename(cwd) == "notebooks":
    project_root = os.path.abspath("..")
else:
    project_root = cwd

src_path = os.path.join(project_root, "src")
if src_path not in sys.path:
    sys.path.append(src_path)

from cluster_library import RuleBasedCustomerClusterer


In [4]:
df_clean = pd.read_csv(CLEANED_DATA_PATH, parse_dates=["InvoiceDate"])
print(df_clean.shape)
df_clean.head()


(485123, 11)


  df_clean = pd.read_csv(CLEANED_DATA_PATH, parse_dates=["InvoiceDate"])


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPrice,DayOfWeek,HourOfDay
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3,2,8
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2,8
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.0,2,8
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2,8
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2,8


In [5]:
clusterer = RuleBasedCustomerClusterer(df_clean=df_clean)
customer_item_bool = clusterer.build_customer_item_matrix(threshold=1)
print('Customer × Item:', customer_item_bool.shape)

rules_df = clusterer.load_rules(
    rules_csv_path=RULES_INPUT_PATH,
    top_k=TOP_K_RULES,
    sort_by=SORT_RULES_BY,
)
print('Rules used:', rules_df.shape)
rules_df.head()


Customer × Item: (3921, 4007)
Rules used: (200, 17)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski,antecedents_str,consequents_str,rule_str
0,"frozenset({'HERB MARKER ROSEMARY', 'HERB MARKE...",frozenset({'HERB MARKER THYME'}),0.011487,0.012763,0.010932,0.951691,74.567045,1.0,0.010785,20.435808,0.998053,0.820833,0.951066,0.904106,"HERB MARKER PARSLEY, HERB MARKER ROSEMARY",HERB MARKER THYME,"HERB MARKER PARSLEY, HERB MARKER ROSEMARY → HE..."
1,"frozenset({'HERB MARKER THYME', 'HERB MARKER M...",frozenset({'HERB MARKER ROSEMARY'}),0.011098,0.012818,0.010599,0.955,74.502403,1.0,0.010456,21.937369,0.99765,0.795833,0.954416,0.89092,"HERB MARKER MINT, HERB MARKER THYME",HERB MARKER ROSEMARY,"HERB MARKER MINT, HERB MARKER THYME → HERB MAR..."
2,"frozenset({'HERB MARKER THYME', 'HERB MARKER M...",frozenset({'HERB MARKER PARSLEY'}),0.011098,0.012652,0.010432,0.94,74.297105,1.0,0.010292,16.455802,0.997612,0.783333,0.939231,0.882281,"HERB MARKER MINT, HERB MARKER THYME",HERB MARKER PARSLEY,"HERB MARKER MINT, HERB MARKER THYME → HERB MAR..."
3,"frozenset({'HERB MARKER THYME', 'HERB MARKER P...",frozenset({'HERB MARKER ROSEMARY'}),0.011487,0.012818,0.010932,0.951691,74.244244,1.0,0.010784,20.43466,0.997995,0.817427,0.951064,0.902252,"HERB MARKER PARSLEY, HERB MARKER THYME",HERB MARKER ROSEMARY,"HERB MARKER PARSLEY, HERB MARKER THYME → HERB ..."
4,"frozenset({'HERB MARKER THYME', 'HERB MARKER B...",frozenset({'HERB MARKER ROSEMARY'}),0.011265,0.012818,0.01071,0.950739,74.169983,1.0,0.010565,20.039787,0.997757,0.80083,0.950099,0.893118,"HERB MARKER BASIL, HERB MARKER THYME",HERB MARKER ROSEMARY,"HERB MARKER BASIL, HERB MARKER THYME → HERB MA..."


In [6]:
X, meta = clusterer.build_final_features(
    weighting=WEIGHTING,
    use_rfm=USE_RFM,
    rfm_scale=RFM_SCALE,
    rule_scale=RULE_SCALE,
    min_antecedent_len=MIN_ANTECEDENT_LEN,
)
print('X shape:', X.shape)
meta.head()


X shape: (3921, 203)


Unnamed: 0,CustomerID,Recency,Frequency,Monetary
0,000nan,1,1373,1716830.53
1,012346,326,1,77183.6
2,012747,2,11,4196.01
3,012748,1,209,33719.73
4,012749,4,5,4090.88


In [7]:
# =====================================================
# EXPORT CUSTOMER RULE FEATURES WITH RULE NAMES
# =====================================================

print("▶ Exporting customer_rule_features.csv (with rule names)")

n_rules = rules_df.shape[0]
X_rules = X[:, :n_rules]

# -----------------------------------------------------
# 1. Tạo tên rule dễ đọc
# -----------------------------------------------------
def format_rule(row):
    ant = list(row['antecedents'])
    con = list(row['consequents'])
    ant_str = " + ".join(ant)
    con_str = " + ".join(con)
    return f"{ant_str} → {con_str}"

rule_names = rules_df.apply(format_rule, axis=1).tolist()

# -----------------------------------------------------
# 2. DataFrame rule-features
# -----------------------------------------------------
df_rule_features = pd.DataFrame(
    X_rules,
    columns=rule_names
)

df_rule_features.insert(0, "CustomerID", meta["CustomerID"].values)

# -----------------------------------------------------
# 3. Lưu file
# -----------------------------------------------------
output_rule_feature_path = "data/processed/customer_rule_features.csv"
os.makedirs(os.path.dirname(output_rule_feature_path), exist_ok=True)

df_rule_features.to_csv(output_rule_feature_path, index=False)

print(f"✅ Saved: {output_rule_feature_path}")
print("Shape:", df_rule_features.shape)

df_rule_features.iloc[:, :5].head()


▶ Exporting customer_rule_features.csv (with rule names)


✅ Saved: data/processed/customer_rule_features.csv
Shape: (3921, 201)


Unnamed: 0,CustomerID,"f + r + o + z + e + n + s + e + t + ( + { + ' + H + E + R + B + + M + A + R + K + E + R + + R + O + S + E + M + A + R + Y + ' + , + + ' + H + E + R + B + + M + A + R + K + E + R + + P + A + R + S + L + E + Y + ' + } + ) → f + r + o + z + e + n + s + e + t + ( + { + ' + H + E + R + B + + M + A + R + K + E + R + + T + H + Y + M + E + ' + } + )","f + r + o + z + e + n + s + e + t + ( + { + ' + H + E + R + B + + M + A + R + K + E + R + + T + H + Y + M + E + ' + , + + ' + H + E + R + B + + M + A + R + K + E + R + + M + I + N + T + ' + } + ) → f + r + o + z + e + n + s + e + t + ( + { + ' + H + E + R + B + + M + A + R + K + E + R + + R + O + S + E + M + A + R + Y + ' + } + )","f + r + o + z + e + n + s + e + t + ( + { + ' + H + E + R + B + + M + A + R + K + E + R + + T + H + Y + M + E + ' + , + + ' + H + E + R + B + + M + A + R + K + E + R + + M + I + N + T + ' + } + ) → f + r + o + z + e + n + s + e + t + ( + { + ' + H + E + R + B + + M + A + R + K + E + R + + P + A + R + S + L + E + Y + ' + } + )","f + r + o + z + e + n + s + e + t + ( + { + ' + H + E + R + B + + M + A + R + K + E + R + + T + H + Y + M + E + ' + , + + ' + H + E + R + B + + M + A + R + K + E + R + + P + A + R + S + L + E + Y + ' + } + ) → f + r + o + z + e + n + s + e + t + ( + { + ' + H + E + R + B + + M + A + R + K + E + R + + R + O + S + E + M + A + R + Y + ' + } + )"
0,000nan,5.727279,5.727279,5.727279,5.702002
1,012346,-0.174603,-0.174603,-0.174603,-0.175377
2,012747,-0.174603,-0.174603,-0.174603,-0.175377
3,012748,5.727279,5.727279,5.727279,5.702002
4,012749,-0.174603,-0.174603,-0.174603,-0.175377


In [8]:
sil_df = clusterer.choose_k_by_silhouette(
    X,
    k_min=K_MIN,
    k_max=K_MAX,
    random_state=RANDOM_STATE,
)
sil_df


Unnamed: 0,k,silhouette
0,2,0.875162
1,3,0.873354
2,12,0.442162
3,11,0.384972
4,9,0.37113
5,10,0.366702
6,5,0.295026
7,8,0.271498
8,7,0.265041
9,6,0.263301


In [9]:
best_k = int(sil_df.loc[0, 'k'])
k = best_k if N_CLUSTERS is None else int(N_CLUSTERS)
print('Chosen k =', k)


Chosen k = 3


In [10]:
labels = clusterer.fit_kmeans(X, n_clusters=k, random_state=RANDOM_STATE)
meta_out = meta.copy()
meta_out['cluster'] = labels

# Lưu
os.makedirs(os.path.dirname(OUTPUT_CLUSTER_PATH), exist_ok=True)
meta_out.to_csv(OUTPUT_CLUSTER_PATH, index=False)
print('Saved:', OUTPUT_CLUSTER_PATH)
meta_out.head()


Saved: data/processed/customer_clusters_from_rules.csv


Unnamed: 0,CustomerID,Recency,Frequency,Monetary,cluster
0,000nan,1,1373,1716830.53,2
1,012346,326,1,77183.6,0
2,012747,2,11,4196.01,0
3,012748,1,209,33719.73,1
4,012749,4,5,4090.88,0


In [11]:
profile_cols = ['cluster'] + ([c for c in ['Recency','Frequency','Monetary'] if c in meta_out.columns])
summary = meta_out.groupby('cluster').agg({
    'CustomerID': 'count',
    **{c:'mean' for c in profile_cols if c!='cluster'}
}).rename(columns={'CustomerID':'n_customers'}).sort_values('n_customers', ascending=False)
summary


Unnamed: 0_level_0,n_customers,Recency,Frequency,Monetary
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,3797,93.218594,4.050566,1809.822
1,123,61.02439,10.308943,3548.746
2,1,1.0,1373.0,1716831.0


In [12]:
if PLOT_2D:
    
    Z = clusterer.project_2d(X, method=PROJECTION_METHOD, random_state=RANDOM_STATE)

    meta_out = meta.copy()
    meta_out['cluster'] = labels # Nhãn cụm từ thuật toán K-Means
    meta_out['Component 1'] = Z[:, 0] # Tọa độ trục X
    meta_out['Component 2'] = Z[:, 1] # Tọa độ trục Y

    import os
    os.makedirs(os.path.dirname(OUTPUT_CLUSTER_PATH), exist_ok=True)
    meta_out.to_csv(OUTPUT_CLUSTER_PATH, index=False)

    print("✅ Đã lưu file thành công với đầy đủ cột Component 1 và Component 2!")

    print(meta_out[['CustomerID', 'cluster', 'Component 1', 'Component 2']].head())


✅ Đã lưu file thành công với đầy đủ cột Component 1 và Component 2!
  CustomerID  cluster  Component 1  Component 2
0     000nan        2    39.800220   112.813774
1     012346        0    -0.968511     0.436141
2     012747        0    -0.977250    -0.102707
3     012748        1    33.808689    23.632812
4     012749        0    -0.985881    -0.196821


In [13]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score

# 1. Chạy mô hình Agglomerative (Đối trọng của K-Means)
agg_clusterer = AgglomerativeClustering(n_clusters=3)
agg_labels = agg_clusterer.fit_predict(X)

# 2. Tính toán các Metric so sánh
results = {
    "Metric": ["Silhouette Score (Cao là tốt)", "Davies-Bouldin Index (Thấp là tốt)"],
    "K-Means (Current)": [silhouette_score(X, labels), davies_bouldin_score(X, labels)],
    "Agglomerative": [silhouette_score(X, agg_labels), davies_bouldin_score(X, agg_labels)]
}

comparison_df = pd.DataFrame(results)
display(comparison_df)
meta_out['cluster_agg'] = agg_labels 

# Lưu lại file CSV (ghi đè lên file cũ nhưng có thêm cột cluster_agg)
meta_out.to_csv(OUTPUT_CLUSTER_PATH, index=False)
print("✅ Đã cập nhật file CSV với cả nhãn K-Means và Agglomerative!")

Unnamed: 0,Metric,K-Means (Current),Agglomerative
0,Silhouette Score (Cao là tốt),0.873354,0.87078
1,Davies-Bouldin Index (Thấp là tốt),0.287676,0.316192


✅ Đã cập nhật file CSV với cả nhãn K-Means và Agglomerative!


In [14]:
n_rules = rules_df.shape[0]
X_rules_only = X[:, :n_rules] 

# XOAY MA TRẬN: Chuyển từ (Khách x Luật) thành (Luật x Khách)
X_basket = X_rules_only.T 

# PHÂN CỤM LUẬT (Theo yêu cầu 2.3 hướng 2)
from sklearn.cluster import KMeans
kmeans_basket = KMeans(n_clusters=5, random_state=RANDOM_STATE)
basket_labels = kmeans_basket.fit_predict(X_basket)

# Đưa kết quả vào rules_df để Dashboard hiển thị
rules_df['basket_group'] = basket_labels
rules_df.to_csv("data/processed/rules_with_basket_groups.csv", index=False)

  super()._check_params_vs_input(X, default_n_init=10)
