In [None]:
# Install library tambahan jika diperlukan
!pip install scikit-learn pandas matplotlib numpy dask

# Import library
import dask.dataframe as dd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.metrics import pairwise_distances

# Upload the CSV file via google.colab's files.upload().
# The keys of `uploaded` are used as file names by the loading step.
from google.colab import files
uploaded = files.upload()

In [None]:
# Read the data from the CSV file using Dask.
# Only the first key of `uploaded` is used; any further uploaded entries are ignored.
# NOTE(review): assumes the uploaded file is readable from the working directory
# under this name — confirm for the runtime in use.
file_name = list(uploaded.keys())[0]
data = dd.read_csv(file_name)

In [None]:
# Columns used as clustering features
target_columns = ['TSS', 'pH', 'EC', 'TDS', 'CHLA']
data = data[target_columns]

# Materialize as pandas before imputing. Dask's DataFrame.median() is only
# exact for single-partition data and raises NotImplementedError otherwise,
# while scikit-learn needs an in-memory frame anyway. The hasattr guard keeps
# this cell re-runnable after `data` has already been converted to pandas.
if hasattr(data, 'compute'):
    data = data.compute()

# Count the missing values per column before imputation
null_counts = data.isna().sum()
print("Jumlah nilai null awal pada setiap kolom:")
print(null_counts)
print("-" * 50)

# Fill NaN values with the exact per-column median
data = data.fillna(data.median())

# Confirm that no missing values remain after imputation
null_counts_after = data.isna().sum()
print("Jumlah nilai null setelah pengisian:")
print(null_counts_after)
print("-" * 50)

# Standardize the features (zero mean, unit variance per column)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

In [None]:
from sklearn.cluster import KMeans
import numpy as np

def gap_statistic(X, n_refs=10, n_clusters=5, random_state=42):
    """Compute the gap statistic of a K-Means clustering of ``X``.

    The gap is the mean log within-cluster sum of squares (WSS) over
    ``n_refs`` reference datasets minus the log WSS of ``X`` itself. Each
    reference dataset has the same shape as ``X`` and is drawn uniformly
    from the axis-aligned bounding box of ``X``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster.
    n_refs : int, default 10
        Number of uniform reference datasets to generate.
    n_clusters : int, default 5
        Number of clusters fitted on both the reference data and ``X``.
    random_state : int, numpy.random.Generator or None, default 42
        Seed for sampling the reference datasets, so that repeated calls give
        the same gap value. Pass ``None`` for a fresh random draw on each call.
        The K-Means fits themselves always use ``random_state=42``.

    Returns
    -------
    float
        ``mean(log(WSS_reference)) - log(WSS_X)``.
    """
    X = np.asarray(X, dtype=float)
    lower = X.min(axis=0)
    span = X.max(axis=0) - lower

    # A local generator keeps the result reproducible without touching the
    # global NumPy random state.
    rng = np.random.default_rng(random_state)

    # Fit K-Means on each uniform reference dataset and collect its log WSS
    reference_log_wss = []
    for _ in range(n_refs):
        # Scale uniform [0, 1) samples element-wise into the bounding box of X
        reference = rng.random(X.shape) * span + lower
        reference_model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        reference_model.fit(reference)
        reference_log_wss.append(np.log(reference_model.inertia_))

    # WSS of the observed data
    observed_model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    observed_model.fit(X)

    return np.mean(reference_log_wss) - np.log(observed_model.inertia_)

# Score K-Means for every candidate number of clusters (2 to 5)
for n_clusters in range(2, 6):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = kmeans.fit_predict(scaled_data)
    data['Cluster'] = labels

    # Internal validation metrics computed on the standardized features
    silhouette_avg = silhouette_score(scaled_data, labels)
    dbi = davies_bouldin_score(scaled_data, labels)
    calinski_harabasz = calinski_harabasz_score(scaled_data, labels)
    wss = kmeans.inertia_
    gap = gap_statistic(scaled_data, n_refs=10, n_clusters=n_clusters)

    # Report the metrics in a fixed order, five decimals each
    report = (
        ("Davies-Bouldin Index", dbi),
        ("Silhouette Score", silhouette_avg),
        ("Calinski-Harabasz Index", calinski_harabasz),
        ("Within-Cluster Sum of Squares (WSS / Cohesion)", wss),
        ("Gap Statistic", gap),
    )
    print(f"Evaluasi untuk {n_clusters} Klaster:")
    for metric_name, metric_value in report:
        print(f"  {metric_name}: {metric_value:.5f}")
    print("-" * 50)


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.metrics import pairwise_distances
import numpy as np
import pandas as pd

# Report the cluster sizes for every candidate number of clusters.
# The model is fitted on `scaled_data` (the original referenced an undefined
# `equal_weighted_data`, which raised NameError on a fresh run) with the same
# n_init as the evaluation cell, so the sizes belong to the evaluated clustering.
for n_clusters in [2, 3, 4, 5]:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    data['Cluster'] = kmeans.fit_predict(scaled_data)

    # Number of rows in each cluster, ordered by cluster label
    cluster_counts = data['Cluster'].value_counts().sort_index()
    print(f"\nJumlah data per klaster untuk {n_clusters} Klaster:")
    for cluster, count in cluster_counts.items():
        # Labels are 0-based; display them 1-based
        print(f"  Cluster {cluster + 1}: {count}")

    print("-" * 50)

In [None]:
# Step 2: Cluster with K-Means using 5 clusters.
# n_init=10 matches the evaluation cell, so the model profiled here is the same
# one whose metrics were reported (the scikit-learn default for n_init differs
# between versions).
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
kmeans.fit(scaled_data)
data['Cluster'] = kmeans.labels_

# Step 3: Mean of every standardized variable per cluster
cluster_means = pd.DataFrame(scaled_data, columns=target_columns)
cluster_means['Cluster'] = kmeans.labels_
cluster_means = cluster_means.groupby('Cluster').mean()

print("Rata-rata tiap variabel setelah standarisasi per klaster:")
print(cluster_means)

# Step 4: Draw one radar plot per cluster
def create_spider_plot_per_cluster(cluster_means):
    """Draw a separate radar (spider) chart for every row of ``cluster_means``.

    Parameters
    ----------
    cluster_means : pandas.DataFrame
        One row per cluster and one column per variable. The index values are
        the cluster labels and are shown 1-based in the titles, so they must
        support ``label + 1``.

    Returns
    -------
    None
        Each figure is shown with ``plt.show()`` and then closed.
    """
    labels = cluster_means.columns.tolist()
    num_vars = len(labels)
    angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
    # Repeat the first angle so the polygon closes
    angles += angles[:1]

    colors = ['#1f77b4', 'orange', 'green', 'red', 'purple']

    for position, (cluster, row) in enumerate(cluster_means.iterrows()):
        values = row.tolist()
        values += values[:1]

        # Pick the colour by row position and cycle the palette, so more than
        # five clusters or non-0-based labels cannot raise IndexError.
        color = colors[position % len(colors)]

        fig, ax = plt.subplots(figsize=(7, 7), subplot_kw=dict(polar=True))
        ax.plot(angles, values, marker='o', linewidth=2, color=color)
        ax.fill(angles, values, alpha=0.25, color=color)

        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(labels, fontsize=10, rotation=30, ha="right")

        ax.set_title(f'Radar Plot Klaster {cluster+1}', size=14, y=1.08)
        # Hide the radial tick labels
        ax.set_yticklabels([])
        plt.tight_layout()
        plt.show()
        # Release the figure; one is created per cluster
        plt.close(fig)

# Call the function to draw one radar plot per cluster
create_spider_plot_per_cluster(cluster_means)


In [None]:
def create_combined_spider_plot(cluster_means):
    """Draw all clusters of ``cluster_means`` on a single radar (spider) chart.

    Parameters
    ----------
    cluster_means : pandas.DataFrame
        One row per cluster and one column per variable. The index values are
        the cluster labels and are shown 1-based in the legend, so they must
        support ``label + 1``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    labels = cluster_means.columns.tolist()
    num_vars = len(labels)
    angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
    # Repeat the first angle so every polygon closes
    angles += angles[:1]

    colors = ['#1f77b4', 'orange', 'green', 'red', 'purple']

    # Single shared figure for all clusters
    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))

    for position, (cluster, row) in enumerate(cluster_means.iterrows()):
        values = row.tolist()
        values += values[:1]

        # Pick the colour by row position and cycle the palette, so more than
        # five clusters or non-0-based labels cannot raise IndexError.
        color = colors[position % len(colors)]

        ax.plot(angles, values, marker='o', linewidth=2, label=f'Klaster {cluster+1}', color=color)
        ax.fill(angles, values, alpha=0.15, color=color)

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(labels, fontsize=10, rotation=30, ha="right")

    ax.set_title('Radar Plot Tiap Klaster', size=16, y=1.1)
    # Hide the radial tick labels
    ax.set_yticklabels([])

    # Place the legend outside the plot area
    ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
    plt.tight_layout()
    plt.show()

# Call the function to draw the combined radar plot
create_combined_spider_plot(cluster_means)


In [None]:
# Add one 'Cluster_<k>' column for every candidate number of clusters.
# The models are fitted on `scaled_data` (the original referenced an undefined
# `equal_weighted_data`, which raised NameError on a fresh run) with the same
# n_init as the evaluation cell, so the exported labels match the evaluated models.
for n_clusters in [2, 3, 4, 5]:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    data[f'Cluster_{n_clusters}'] = kmeans.fit_predict(scaled_data)

# Export the data to an Excel file (without the index column)
output_file = 'hasil_klasterisasi_kmeans_bobotsama_revisi1.xlsx'
data.to_excel(output_file, index=False)

# Tell the user where the file was saved
print(f"File hasil klasterisasi telah disimpan di {output_file}")

In [None]:
# Offer the exported file for download
files.download(output_file)