In [2]:
# Membaca data CSV secara manual
def read_csv(filename):
    data = []
    with open(filename, 'r') as f:
        header = f.readline().strip().split(',')
        for line in f:
            row = line.strip().split(',')
            # Ambil fitur numerik dan encoding yang relevan
            # Misal: Age, Annual Income (k$), Spending Score (1-100), Gender_Encoded
            # Ubah ke float/int sesuai kebutuhan
            data.append([
                int(row[5]),     # Gender_Encoded (pastikan urutan kolom benar!)
                float(row[2]),  # Age
                float(row[3]),  # Annual Income (k$)
                float(row[4])  # Spending Score (1-100)
            ])
    return data



In [3]:
# Fungsi akar kuadrat manual
def sqrt(x):
    if x == 0:
        return 0
    guess = x
    for _ in range(20):
        guess = (guess + x / guess) / 2.0
    return guess



In [4]:
# Fungsi jarak Euclidean manual
def euclidean_distance(a, b):
    total = 0
    for i in range(len(a)):
        total += (a[i] - b[i]) ** 2
    return sqrt(total)



In [5]:
# Inisialisasi centroid secara acak (ambil k data pertama)
def init_centroids(data, k):
    centroids = []
    for i in range(k):
        centroids.append(data[i][:])
    return centroids



In [6]:
# Assign data ke cluster terdekat
def assign_clusters(data, centroids):
    clusters = []
    for _ in centroids:
        clusters.append([])
    for point in data:
        min_dist = None
        min_idx = None
        for idx, centroid in enumerate(centroids):
            dist = euclidean_distance(point, centroid)
            if (min_dist is None) or (dist < min_dist):
                min_dist = dist
                min_idx = idx
        clusters[min_idx].append(point)
    return clusters



In [7]:
# Update centroid
def update_centroids(clusters):
    new_centroids = []
    for cluster in clusters:
        if len(cluster) == 0:
            new_centroids.append([0]*len(clusters[0][0]))
        else:
            centroid = []
            for i in range(len(cluster[0])):
                s = 0
                for point in cluster:
                    s += point[i]
                centroid.append(s / float(len(cluster)))
            new_centroids.append(centroid)
    return new_centroids



In [8]:
# Cek konvergensi
def is_converged(old, new, threshold=1e-4):
    for i in range(len(old)):
        if euclidean_distance(old[i], new[i]) >= threshold:
            return False
    return True



In [9]:
# Hitung WCSS (Within-Cluster Sum of Squares)
def calculate_wcss(clusters, centroids):
    wcss = 0
    for idx, cluster in enumerate(clusters):
        for point in cluster:
            dist = euclidean_distance(point, centroids[idx])
            wcss += dist ** 2
    return wcss



In [10]:
# K-Means utama
def kmeans(data, k, max_iter=100):
    centroids = init_centroids(data, k)
    for iteration in range(max_iter):
        clusters = assign_clusters(data, centroids)
        new_centroids = update_centroids(clusters)
        if is_converged(centroids, new_centroids):
            break
        centroids = new_centroids
    return clusters, centroids



In [11]:
# Analisis hasil
def analyze_clusters(clusters, centroids):
    for idx, cluster in enumerate(clusters):
        print("Cluster", idx+1, ":", len(cluster), "anggota")
        print("  Centroid:", centroids[idx])
        if len(cluster) > 0:
            avg = []
            for i in range(len(cluster[0])):
                s = 0
                for point in cluster:
                    s += point[i]
                avg.append(s / float(len(cluster)))
            print("  Rata-rata fitur:", avg)
        print()



In [12]:
# Proses training untuk mencari k terbaik
def train_best_k(data, k_min, k_max):
    best_k = k_min
    best_wcss = None
    wcss_list = []
    for k in range(k_min, k_max+1):
        clusters, centroids = kmeans(data, k)
        wcss = calculate_wcss(clusters, centroids)
        wcss_list.append((k, wcss))
        print("k =", k, "-> WCSS:", wcss)
        if (best_wcss is None) or (wcss < best_wcss):
            best_wcss = wcss
            best_k = k
    print("\nRekomendasi jumlah kluster terbaik (elbow):", best_k)
    return best_k, wcss_list



In [14]:
# Main
filename = 'Mall_Customers_Processed.csv'
data = read_csv("Mall_Customers_Processed.csv")

# Proses training: cari k terbaik dari 2 sampai 6
k_min = 2
k_max = 6
best_k, wcss_list = train_best_k(data, k_min, k_max)

# Lakukan clustering akhir dengan k terbaik
print("\nClustering akhir dengan k =", best_k)
clusters, centroids = kmeans(data, best_k)
analyze_clusters(clusters, centroids)

k = 2 -> WCSS: 438.5224115512283
k = 3 -> WCSS: 345.9488316593269
k = 4 -> WCSS: 254.40989634276002
k = 5 -> WCSS: 245.943822927785
k = 6 -> WCSS: 207.81672282184576

Rekomendasi jumlah kluster terbaik (elbow): 6

Clustering akhir dengan k = 6
Cluster 1 : 46 anggota
  Centroid: [0.34782608695652173, -0.8005106189565216, -0.4188348058043839, -0.22719052487192107]
  Rata-rata fitur: [0.34782608695652173, -0.8005106189565216, -0.4188348058043839, -0.22719052487192107]

Cluster 2 : 16 anggota
  Centroid: [1.0, -0.3794663716653858, 0.8016534361865959, 1.2806262339642072]
  Rata-rata fitur: [1.0, -0.3794663716653858, 0.8016534361865959, 1.2806262339642072]

Cluster 3 : 34 anggota
  Centroid: [0.5882352941176471, 0.17329544769644722, 1.0664538510653871, -1.2980103004705885]
  Rata-rata fitur: [0.5882352941176471, 0.17329544769644722, 1.0664538510653871, -1.2980103004705885]

Cluster 4 : 22 anggota
  Centroid: [0.4090909090909091, -0.9743959184545455, -1.3295453165, 1.1321778796599242]
  Rata-