In [None]:
# PARAMETERS (for papermill)
# Every name in this cell is a tunable input that later cells read directly.

# Input
# CSV of cleaned transactions; read with InvoiceDate parsed as a date.
CLEANED_DATA_PATH = "data/processed/cleaned_uk_data.csv"
RULES_INPUT_PATH = "data/processed/rules_apriori_filtered.csv"  # or rules_fpgrowth_filtered.csv

# Feature engineering
# TOP_K_RULES / SORT_RULES_BY are passed to clusterer.load_rules as top_k / sort_by.
TOP_K_RULES = 200
SORT_RULES_BY = "lift"      # lift | confidence | support
# The options below are passed to clusterer.build_final_features.
WEIGHTING = "lift"          # none | lift | confidence | support | lift_x_conf
MIN_ANTECEDENT_LEN = 1
USE_RFM = True
RFM_SCALE = True
RULE_SCALE = False

# Clustering
# K_MIN..K_MAX is the range handed to clusterer.choose_k_by_silhouette.
K_MIN = 2
K_MAX = 10
N_CLUSTERS = None            # None => choose by silhouette, or set a specific number (e.g. 5)
RANDOM_STATE = 42

# Output
# Written after K-Means, then overwritten by later cells that add more columns.
OUTPUT_CLUSTER_PATH = "data/processed/customer_clusters_from_rules.csv"

# Visual
# PROJECTION_METHOD is passed to clusterer.project_2d; PLOT_2D gates that cell.
PROJECTION_METHOD = "pca"   # pca | svd
PLOT_2D = True


In [None]:
%load_ext autoreload
%autoreload 2

import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Resolve the project root: when the kernel was started inside "notebooks/",
# the root is the parent directory; otherwise the working directory is used.
# NOTE(review): the data paths in the parameters cell are relative and are
# resolved against the working directory, not against project_root — confirm
# the notebook is always executed from the project root.
cwd = os.getcwd()
project_root = os.path.abspath("..") if os.path.basename(cwd) == "notebooks" else cwd

# Make <project_root>/src importable, without adding it twice on re-runs.
src_path = os.path.join(project_root, "src")
if not (src_path in sys.path):
    sys.path.append(src_path)

from cluster_library import RuleBasedCustomerClusterer


In [None]:
# Load the cleaned transactions; InvoiceDate is parsed to datetime on read.
df_clean = pd.read_csv(
    CLEANED_DATA_PATH,
    parse_dates=["InvoiceDate"],
)
print(df_clean.shape)
df_clean.head()


In [None]:
# Wrap the cleaned transactions in the clusterer and build the
# customer × item matrix (threshold=1 is forwarded to the library as-is).
clusterer = RuleBasedCustomerClusterer(df_clean=df_clean)
customer_item_bool = clusterer.build_customer_item_matrix(threshold=1)
print("Customer × Item:", customer_item_bool.shape)

# Load the mined rules, keeping the top-k according to the configured metric.
rules_df = clusterer.load_rules(rules_csv_path=RULES_INPUT_PATH, top_k=TOP_K_RULES, sort_by=SORT_RULES_BY)
print("Rules used:", rules_df.shape)
rules_df.head()


In [None]:
# Collect the feature-engineering switches from the parameters cell, then
# build the feature matrix X together with the per-customer metadata frame.
feature_kwargs = dict(
    weighting=WEIGHTING,
    use_rfm=USE_RFM,
    rfm_scale=RFM_SCALE,
    rule_scale=RULE_SCALE,
    min_antecedent_len=MIN_ANTECEDENT_LEN,
)
X, meta = clusterer.build_final_features(**feature_kwargs)
print("X shape:", X.shape)
meta.head()


In [None]:
# =====================================================
# EXPORT CUSTOMER RULE FEATURES WITH RULE NAMES
# =====================================================

print("▶ Exporting customer_rule_features.csv (with rule names)")

# Keep only the leading columns of X, one per loaded rule.
# NOTE(review): assumes build_final_features places the rule columns first and
# emits exactly one column per row of rules_df — TODO confirm in cluster_library.
n_rules = len(rules_df)
X_rules = X[:, :n_rules]

# -----------------------------------------------------
# 1. Build human-readable rule names
# -----------------------------------------------------
def format_rule(row):
    """Return a readable "A + B → C" label for one association rule.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide 'antecedents' and 'consequents'. Each value may be an
        iterable of items (set, frozenset, list, tuple) or a single string.

    Returns
    -------
    str
        Antecedent items joined by " + ", then " → ", then the consequent
        items joined by " + ".
    """
    def _join_items(items):
        # A bare string is a single item; list("abc") would split it into
        # characters and produce "a + b + c".
        if isinstance(items, str):
            return items
        # Sets have no stable iteration order across interpreter runs, so
        # sort them to keep the exported column names reproducible.
        # Ordered sequences (list/tuple) keep their given order.
        if isinstance(items, (set, frozenset)):
            items = sorted(items, key=str)
        # str() lets non-string item identifiers be joined as well.
        return " + ".join(str(item) for item in items)

    ant_str = _join_items(row['antecedents'])
    con_str = _join_items(row['consequents'])
    return f"{ant_str} → {con_str}"

# One label per rule, in the same row order as rules_df.
rule_labels = rules_df.apply(format_rule, axis=1)
rule_names = rule_labels.tolist()

# -----------------------------------------------------
# 2. Rule-feature DataFrame
# -----------------------------------------------------
df_rule_features = pd.DataFrame(X_rules, columns=rule_names)

# Put the customer identifier in the first column.
# NOTE(review): assumes the rows of `meta` are aligned with the rows of X — TODO confirm.
df_rule_features.insert(0, "CustomerID", meta["CustomerID"].values)

# -----------------------------------------------------
# 3. Save the file
# -----------------------------------------------------
output_rule_feature_path = "data/processed/customer_rule_features.csv"
output_rule_feature_dir = os.path.dirname(output_rule_feature_path)
os.makedirs(output_rule_feature_dir, exist_ok=True)

df_rule_features.to_csv(output_rule_feature_path, index=False)

print(f"✅ Saved: {output_rule_feature_path}")
print("Shape:", df_rule_features.shape)

# Preview: CustomerID plus the first four rule columns.
df_rule_features.iloc[:, :5].head()


In [None]:
# Score every candidate k in [K_MIN, K_MAX] on X; the resulting table is shown below.
sil_df = clusterer.choose_k_by_silhouette(X, k_min=K_MIN, k_max=K_MAX, random_state=RANDOM_STATE)
sil_df


In [None]:
# NOTE(review): reading label 0 assumes choose_k_by_silhouette returns its rows
# best-first with a reset index — TODO confirm in cluster_library.
best_k = int(sil_df.loc[0, "k"])

# An explicit N_CLUSTERS from the parameters cell overrides the silhouette pick.
if N_CLUSTERS is None:
    k = best_k
else:
    k = int(N_CLUSTERS)
print("Chosen k =", k)


In [None]:
# Fit K-Means with the chosen k and attach the labels to a copy of the
# customer metadata, leaving `meta` itself untouched.
# NOTE(review): assumes `labels` is aligned row-for-row with `meta` — TODO confirm.
labels = clusterer.fit_kmeans(X, n_clusters=k, random_state=RANDOM_STATE)
meta_out = meta.copy()
meta_out['cluster'] = labels

# Save. OUTPUT_CLUSTER_PATH is a papermill parameter and may be a bare
# filename; os.path.dirname then returns '' and os.makedirs('') raises, so the
# directory is only created when there is one.
output_dir = os.path.dirname(OUTPUT_CLUSTER_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
meta_out.to_csv(OUTPUT_CLUSTER_PATH, index=False)
print('Saved:', OUTPUT_CLUSTER_PATH)
meta_out.head()


In [None]:
# Profile each cluster: customer count plus the mean of whichever RFM columns
# are present in meta_out.
profile_cols = ['cluster'] + [c for c in ('Recency', 'Frequency', 'Monetary') if c in meta_out.columns]

# Count customers, and average every profiling column except the group key.
agg_spec = {'CustomerID': 'count'}
agg_spec.update({c: 'mean' for c in profile_cols if c != 'cluster'})

summary = meta_out.groupby('cluster').agg(agg_spec)
summary = summary.rename(columns={'CustomerID': 'n_customers'})
summary = summary.sort_values('n_customers', ascending=False)
summary


In [None]:
if PLOT_2D:
    # Project the feature matrix onto two components with the configured method.
    Z = clusterer.project_2d(X, method=PROJECTION_METHOD, random_state=RANDOM_STATE)

    # Rebuild the output frame from `meta`, then attach the K-Means labels and
    # the 2-D coordinates.
    meta_out = meta.copy()
    meta_out['cluster'] = labels       # cluster labels from K-Means
    meta_out['Component 1'] = Z[:, 0]  # x-axis coordinate
    meta_out['Component 2'] = Z[:, 1]  # y-axis coordinate

    # `os` is already imported at the top of the notebook, so the redundant
    # in-cell import was dropped. os.path.dirname returns '' for a bare
    # filename and os.makedirs('') raises, so only create a directory when
    # the path actually has one.
    output_dir = os.path.dirname(OUTPUT_CLUSTER_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    meta_out.to_csv(OUTPUT_CLUSTER_PATH, index=False)

    print("✅ Đã lưu file thành công với đầy đủ cột Component 1 và Component 2!")

    print(meta_out[['CustomerID', 'cluster', 'Component 1', 'Component 2']].head())


In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score

# 1. Fit Agglomerative clustering as a counterpart to K-Means.
#    It uses the same number of clusters as K-Means (`k`) so the metrics below
#    compare like with like; the count was previously hard-coded to 3
#    regardless of the k chosen above.
agg_clusterer = AgglomerativeClustering(n_clusters=k)
agg_labels = agg_clusterer.fit_predict(X)

# 2. Compute the comparison metrics
#    (silhouette: higher is better; Davies-Bouldin: lower is better).
results = {
    "Metric": ["Silhouette Score (Cao là tốt)", "Davies-Bouldin Index (Thấp là tốt)"],
    "K-Means (Current)": [silhouette_score(X, labels), davies_bouldin_score(X, labels)],
    "Agglomerative": [silhouette_score(X, agg_labels), davies_bouldin_score(X, agg_labels)]
}

comparison_df = pd.DataFrame(results)
display(comparison_df)

# Add the Agglomerative labels next to the K-Means labels.
meta_out['cluster_agg'] = agg_labels

# Re-save the CSV (overwrites the earlier file, now with the cluster_agg column).
meta_out.to_csv(OUTPUT_CLUSTER_PATH, index=False)
print("✅ Đã cập nhật file CSV với cả nhãn K-Means và Agglomerative!")

In [None]:
# Take the rule-feature columns of X.
# NOTE(review): assumes the first n_rules columns of X are the rule features,
# one per row of rules_df — TODO confirm in cluster_library.
n_rules = rules_df.shape[0]
X_rules_only = X[:, :n_rules]

# Transpose the matrix: from (customers x rules) to (rules x customers),
# so that each rule becomes one sample to cluster.
X_basket = X_rules_only.T

# Cluster the rules (per requirement 2.3, approach 2).
# KMeans raises when n_clusters exceeds the number of samples, so the group
# count is capped at the number of rules actually loaded (at most 5).
from sklearn.cluster import KMeans
n_basket_groups = min(5, X_basket.shape[0])
kmeans_basket = KMeans(n_clusters=n_basket_groups, random_state=RANDOM_STATE)
basket_labels = kmeans_basket.fit_predict(X_basket)

# Attach the result to rules_df so the dashboard can display it.
rules_df['basket_group'] = basket_labels
rules_df.to_csv("data/processed/rules_with_basket_groups.csv", index=False)