In [None]:
# --- 1. Impor Pustaka ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

# Prapemrosesan dan Feature Engineering
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

# Model Clustering
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, SpectralClustering
from sklearn.mixture import GaussianMixture
from scipy.cluster.hierarchy import linkage, dendrogram

# Metrik Evaluasi
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score, silhouette_samples

In [None]:
class DataPipeline:
    """Load, clean, and featurize transaction data for clustering.

    The pipeline reads a CSV of invoice lines, removes unusable rows, derives
    time-of-purchase features, standardizes quantity and unit price, draws a
    random sample, and appends TF-IDF features built from the product
    description.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file (read with ISO-8859-1 encoding).
    sample_size : int, default 10000
        Maximum number of rows to sample for clustering. If the cleaned data
        has fewer rows, all of them are used.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` so the sample is reproducible.
    """
    def __init__(self, file_path, sample_size=10000, random_state=42):
        self.file_path = file_path
        self.sample_size = sample_size
        self.random_state = random_state
        self.scaler = StandardScaler()
        self.tfidf = TfidfVectorizer(max_features=100, stop_words='english')

    def _load_and_clean(self):
        """Read the CSV and return a cleaned copy of the transactions.

        Cleaning steps, in order: drop rows without ``CustomerID``, replace
        missing ``Description`` with ``'Unknown'``, keep only rows with positive
        ``Quantity`` and ``UnitPrice``, parse ``InvoiceDate``, and drop exact
        duplicate rows. The original row index is preserved.

        Returns
        -------
        pandas.DataFrame
            The cleaned transactions.
        """
        data = pd.read_csv(self.file_path, encoding='ISO-8859-1')
        data = data.dropna(subset=['CustomerID'])
        # Assign the filled column back instead of a chained in-place fillna:
        # the chained form is deprecated and has no effect under pandas
        # Copy-on-Write, which would leave NaN descriptions in the data.
        data = data.assign(Description=data['Description'].fillna('Unknown'))
        # .copy() makes the filtered frame independent, so the column assignment
        # below does not write into a slice of the unfiltered frame.
        data = data[(data['Quantity'] > 0) & (data['UnitPrice'] > 0)].copy()
        data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], format='%m/%d/%Y %H:%M')
        data = data.drop_duplicates()
        return data

    def _engineer_features(self, data):
        """Derive clustering features and draw the working sample.

        Parameters
        ----------
        data : pandas.DataFrame
            Cleaned transactions as returned by ``_load_and_clean``.

        Returns
        -------
        tuple of (pandas.DataFrame, pandas.DataFrame)
            ``(data_sample, final_data_for_clustering)``: the sampled rows with
            all original and derived columns, and the numeric feature matrix
            (scaled quantity/price, day of week, hour, TF-IDF columns) sharing
            the same index.

        Raises
        ------
        ValueError
            If ``data`` has no rows.
        """
        if data.empty:
            raise ValueError(
                "Tidak ada baris data yang tersisa setelah pembersihan; "
                "feature engineering tidak dapat dilanjutkan."
            )

        data_fe = data.copy()
        data_fe['DayOfWeek'] = data_fe['InvoiceDate'].dt.dayofweek
        data_fe['Hour'] = data_fe['InvoiceDate'].dt.hour
        # The scaler is fitted on the full cleaned data, before sampling.
        data_fe[['Quantity_scaled', 'UnitPrice_scaled']] = self.scaler.fit_transform(data_fe[['Quantity', 'UnitPrice']])

        # Sample for computational efficiency. DataFrame.sample raises when
        # n exceeds the number of rows, so clamp to what is available.
        n_rows = min(self.sample_size, len(data_fe))
        if n_rows < self.sample_size:
            print(f"   -> Ukuran sampel dikurangi menjadi {n_rows} (jumlah baris data bersih).")
        data_sample = data_fe.sample(n=n_rows, random_state=self.random_state)

        # TF-IDF on the product descriptions of the sampled rows only.
        description_tfidf = self.tfidf.fit_transform(data_sample['Description'])
        tfidf_df = pd.DataFrame(description_tfidf.toarray(), columns=self.tfidf.get_feature_names_out(), index=data_sample.index)

        # Combine all features into the final clustering dataset.
        # NOTE(review): DayOfWeek and Hour are used unscaled next to the
        # standardized columns, so they may dominate distance-based models —
        # confirm this is intended.
        features_to_cluster = ['Quantity_scaled', 'UnitPrice_scaled', 'DayOfWeek', 'Hour']
        final_data_for_clustering = data_sample[features_to_cluster].join(tfidf_df)

        return data_sample, final_data_for_clustering

    def run(self):
        """Run the whole preprocessing pipeline.

        Stores the results on ``self.original_sample`` and
        ``self.final_data_for_clustering`` and prints progress messages.

        Returns
        -------
        tuple of (pandas.DataFrame, pandas.DataFrame)
            ``(final_data_for_clustering, original_sample)`` — note the order is
            the reverse of what ``_engineer_features`` returns.
        """
        print("Tahap 1: Memuat dan membersihkan data...")
        cleaned_data = self._load_and_clean()
        print(f"   -> Data bersih berisi {len(cleaned_data)} baris.")

        print(f"Tahap 2: Melakukan feature engineering pada sampel {self.sample_size} data...")
        self.original_sample, self.final_data_for_clustering = self._engineer_features(cleaned_data)
        print("   -> Pipeline data selesai. Data siap untuk di-cluster.")

        return self.final_data_for_clustering, self.original_sample

In [None]:
class ClusterAnalyzer:
    """Fit several clustering models on one feature matrix, score them, and plot the results.

    Parameters
    ----------
    data_for_clustering : pandas.DataFrame
        Numeric feature matrix; its ``.values`` array is what the models see.
    original_sample_data : pandas.DataFrame
        The rows the features were built from, in the same row order. Used for
        cluster profiling; must contain ``Quantity``, ``UnitPrice``,
        ``DayOfWeek`` and ``Hour``.
    n_clusters : int, default 5
        Number of clusters/components requested from every model.
    random_state : int, default 42
        Seed used for PCA and for every model that accepts one.
    """
    def __init__(self, data_for_clustering, original_sample_data, n_clusters=5, random_state=42):
        self.X = data_for_clustering.values
        self.original_data = original_sample_data
        self.n_clusters = n_clusters
        self.random_state = random_state
        self.models = self._define_models()
        self.results_df = pd.DataFrame()
        self.labels_dict = {}

        # Run PCA once up front; the projection is reused by every scatter plot.
        print("Tahap 3: Melakukan reduksi dimensi PCA untuk visualisasi...")
        self.pca = PCA(n_components=2, random_state=self.random_state)
        self.X_pca = self.pca.fit_transform(self.X)

    def _define_models(self):
        """Return the clustering models to compare, keyed by display name."""
        return {
            'KMeans': KMeans(n_clusters=self.n_clusters, random_state=self.random_state, n_init=10),
            'Agglomerative': AgglomerativeClustering(n_clusters=self.n_clusters),
            'Spectral': SpectralClustering(n_clusters=self.n_clusters, random_state=self.random_state, affinity='nearest_neighbors'),
            'GaussianMixture': GaussianMixture(n_components=self.n_clusters, random_state=self.random_state)
        }

    def _score_labels(self, name, labels):
        """Compute the three evaluation metrics for one labelling.

        The metrics are only defined when the number of distinct labels is
        between 2 and ``n_samples - 1``; outside that range every metric is
        reported as NaN (and a notice is printed) instead of raising.
        """
        n_labels = len(np.unique(labels))
        if not 2 <= n_labels <= len(labels) - 1:
            print(f"   ! {name}: {n_labels} cluster ditemukan; metrik tidak dapat dihitung.")
            return {
                'Silhouette Score': np.nan,
                'Davies-Bouldin Index': np.nan,
                'Calinski-Harabasz Score': np.nan,
            }
        return {
            'Silhouette Score': silhouette_score(self.X, labels),
            'Davies-Bouldin Index': davies_bouldin_score(self.X, labels),
            'Calinski-Harabasz Score': calinski_harabasz_score(self.X, labels),
        }

    def _require_labels(self, model_name):
        """Return the stored labels for ``model_name``.

        Raises
        ------
        KeyError
            If no labels are stored under that name, e.g. because
            ``run_and_evaluate`` has not been called or the name is misspelled.
        """
        if model_name not in self.labels_dict:
            available = ', '.join(self.labels_dict) or '-'
            raise KeyError(
                f"Label untuk model '{model_name}' tidak tersedia; jalankan "
                f"run_and_evaluate() terlebih dahulu. Model tersedia: {available}"
            )
        return self.labels_dict[model_name]

    def run_and_evaluate(self):
        """Fit every model, store its labels, and compute the evaluation metrics.

        Returns
        -------
        pandas.DataFrame
            One row per model with columns ``Model``, ``Silhouette Score``,
            ``Davies-Bouldin Index`` and ``Calinski-Harabasz Score``, sorted by
            silhouette score in descending order. Also stored on
            ``self.results_df``.
        """
        print("Tahap 4: Menjalankan dan mengevaluasi model clustering...")
        results = []
        for name, model in self.models.items():
            print(f"   - Fitting {name}...")
            labels = model.fit_predict(self.X)
            self.labels_dict[name] = labels
            results.append({'Model': name, **self._score_labels(name, labels)})
        self.results_df = pd.DataFrame(results).sort_values('Silhouette Score', ascending=False)
        return self.results_df

    def visualize_all(self, best_model_name):
        """Render every plot: metric comparison, PCA scatter, cluster profiles, dendrogram.

        Parameters
        ----------
        best_model_name : str
            Key of the model whose clusters are profiled.

        Raises
        ------
        KeyError
            If ``best_model_name`` has no stored labels.
        """
        # Fail before drawing anything rather than after two figures are rendered.
        self._require_labels(best_model_name)
        print("\nTahap 5: Membuat visualisasi perbandingan dan analisis...")
        self._plot_metrics_comparison()
        self._plot_pca_clusters()
        self._plot_cluster_profiles(best_model_name)
        self._plot_dendrogram()

    def _plot_metrics_comparison(self):
        """Draw one horizontal bar chart per metric comparing all models."""
        _, axes = plt.subplots(1, 3, figsize=(22, 6))
        metrics = ['Silhouette Score', 'Davies-Bouldin Index', 'Calinski-Harabasz Score']
        pal = "viridis"
        for i, metric in enumerate(metrics):
            # NOTE(review): recent seaborn releases warn when `palette` is given
            # without `hue` — confirm against the installed version.
            sns.barplot(x=metric, y='Model', data=self.results_df.sort_values(metric), ax=axes[i], palette=pal)
            axes[i].set_title(f'Perbandingan {metric}', fontsize=14)
            axes[i].set_xlabel('Skor')
        plt.suptitle('Perbandingan Metrik Evaluasi Clustering', fontsize=18)
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        plt.show()

    def _plot_pca_clusters(self):
        """Draw the 2-D PCA projection once per model, coloured by cluster label."""
        num_models = len(self.models)
        # squeeze=False always yields a 2-D axes array, so indexing also works
        # when only a single model is configured.
        _, axes = plt.subplots(1, num_models, figsize=(8 * num_models, 6), sharey=True, sharex=True, squeeze=False)
        axes = axes.ravel()
        for ax, name in zip(axes, self.models):
            labels = self._require_labels(name)
            sns.scatterplot(x=self.X_pca[:, 0], y=self.X_pca[:, 1], hue=labels, palette='viridis', s=30, ax=ax, legend='full', alpha=0.7)
            ax.set_title(f'Cluster PCA - {name}', fontsize=14)
            ax.set_xlabel('Komponen PCA 1')
        axes[0].set_ylabel('Komponen PCA 2')
        plt.suptitle('Visualisasi Cluster dengan Reduksi Dimensi PCA', fontsize=18)
        plt.show()

    def _plot_cluster_profiles(self, best_model_name):
        """Draw boxen plots of the original features per cluster of the chosen model.

        Raises
        ------
        KeyError
            If ``best_model_name`` has no stored labels.
        """
        labels = self._require_labels(best_model_name)
        print(f"\nAnalisis Profil Cluster dari Model Terbaik: {best_model_name}")
        profile_data = self.original_data.copy()
        # Labels are attached by position, so the rows must be in the same
        # order as the feature matrix the models were fitted on.
        profile_data['Cluster'] = labels

        _, axes = plt.subplots(2, 2, figsize=(18, 14))
        profile_features = ['Quantity', 'UnitPrice', 'DayOfWeek', 'Hour']
        titles = ['Distribusi Kuantitas', 'Distribusi Harga Satuan', 'Distribusi Hari Pembelian', 'Distribusi Jam Pembelian']

        for i, feature in enumerate(profile_features):
            ax = axes[i//2, i%2]
            sns.boxenplot(x='Cluster', y=feature, data=profile_data, ax=ax, palette='viridis')
            ax.set_title(titles[i], fontsize=14)
            if feature in ['Quantity', 'UnitPrice']:
                ax.set_yscale('log')  # log scale keeps outliers from flattening the plot

        plt.suptitle(f'Profil Karakteristik Cluster (Model: {best_model_name})', fontsize=18)
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        plt.show()

    def _plot_dendrogram(self):
        """Draw a Ward-linkage dendrogram from the first 500 rows of the feature matrix."""
        print("\nMembuat Dendrogram untuk Hierarchical Clustering (menggunakan 500 sampel)...")
        plt.figure(figsize=(15, 8))
        dendrogram_data = self.X[:500]  # small subset keeps the dendrogram readable
        linked = linkage(dendrogram_data, method='ward')
        dendrogram(linked, orientation='top', distance_sort='descending', show_leaf_counts=True)
        plt.title('Dendrogram Hierarchical Clustering', fontsize=16)
        plt.xlabel('Indeks Sampel', fontsize=12)
        plt.ylabel('Jarak', fontsize=12)
        plt.show()

In [None]:
# --- 1. Run the data pipeline ---
pipeline = DataPipeline('UTSClustering.csv', sample_size=10000)
data_to_cluster, original_sample = pipeline.run()

# --- 2. Run the clustering analysis ---
analyzer = ClusterAnalyzer(
    data_for_clustering=data_to_cluster,
    original_sample_data=original_sample,
)
results_df = analyzer.run_and_evaluate()

# --- 3. Show the numeric evaluation results ---
print("\n--- Hasil Evaluasi Performa Model ---")
display(results_df)

# --- 4. Render all visualizations ---
# Based on the results, Agglomerative Clustering is a strong candidate for the best model.
best_model_for_profiling = 'Agglomerative'
analyzer.visualize_all(best_model_for_profiling)