In [1]:
import numpy as np
import pandas as pd
from PIL import Image
from tensorflow.keras.preprocessing.image import img_to_array
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, silhouette_score
import matplotlib.pyplot as plt
from os.path import join

In [2]:
# Dataset location: the folder holding the image files and the CSV index
# whose 'image' and 'label' columns are read by the cells below.
folder_path = 'D:/Users/natan/Skripsi/Dataset/Images_dataset/'
csv_path = 'D:/Users/natan/Skripsi/Dataset/Images_dataset/english.csv'

# Read the index into a DataFrame.
df = pd.read_csv(filepath_or_buffer=csv_path)

In [3]:
# Load images and labels
# Every file listed in the CSV is opened, resized to 150x150 and converted to
# an array. The file is opened in a `with` block so its handle is released as
# soon as the pixels have been copied, instead of lingering until garbage
# collection.
images = []
for filename in df['image']:
    with Image.open(join(folder_path, filename)) as img:
        # resize() returns a new image object, so the array built from it does
        # not depend on the source file staying open.
        img_array = img_to_array(img.resize((150, 150)))
    images.append(img_array)

# NOTE(review): images with differing modes (e.g. grayscale vs. RGB) would not
# stack into one regular array here — confirm the dataset is uniform, or add an
# explicit img.convert(...) before resizing.
# Dividing by 255.0 presumably rescales 8-bit pixel values to [0, 1] — confirm.
images = np.array(images) / 255.0
labels = df['label'].values

In [4]:
# Extract features using PCA
def extract_features(images, n_components=50, random_state=42):
    """Flatten, standardize and PCA-reduce a batch of images.

    Parameters
    ----------
    images : array-like
        Batch whose first axis indexes samples; every sample is flattened to
        a single feature vector.
    n_components : int, default=50
        Number of principal components to keep (passed to ``PCA``).
    random_state : int or None, default=42
        Seed forwarded to ``PCA``. scikit-learn may pick a randomized SVD
        solver for large inputs; seeding it makes the features repeatable
        across runs, in line with the seeded KMeans step of this notebook.

    Returns
    -------
    data_pca : ndarray of shape (n_samples, n_components)
        The reduced features.
    scaler : StandardScaler
        The scaler fitted on the flattened images.
    pca : PCA
        The fitted PCA model.
    """
    images = np.asarray(images)  # also accept plain lists of arrays
    n_samples = len(images)
    # One row per image, all remaining axes collapsed into columns.
    data = images.reshape((n_samples, -1))
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data)
    pca = PCA(n_components=n_components, random_state=random_state)
    data_pca = pca.fit_transform(data_scaled)
    return data_pca, scaler, pca

features_pca, scaler, pca = extract_features(images)

In [7]:
# K-Means clustering: one cluster per distinct ground-truth label.
k = np.unique(labels).size
kmeans = KMeans(n_clusters=k, random_state=42).fit(features_pca)
# Cluster index assigned to each sample by the fitted model.
cluster_labels = kmeans.labels_


In [8]:
# Map clusters to labels
def map_clusters_to_labels(cluster_labels, true_labels):
    """Assign each cluster the most frequent true label among its members.

    Parameters
    ----------
    cluster_labels : array-like
        Cluster id of every sample.
    true_labels : array-like
        Ground-truth label of every sample, aligned with ``cluster_labels``.
        Any sortable label type is accepted (integers of either sign,
        strings, ...), not only non-negative integers.

    Returns
    -------
    dict
        ``{cluster_id: majority_label}`` with one entry per distinct cluster
        id. When several labels tie for the majority, the smallest one in
        sorted order wins.
    """
    cluster_labels = np.asarray(cluster_labels)
    true_labels = np.asarray(true_labels)

    cluster_to_labels = {}
    for cluster in np.unique(cluster_labels):
        members = true_labels[cluster_labels == cluster]
        # np.unique returns the candidates sorted, and argmax picks the first
        # maximum, so ties resolve to the smallest label.
        candidates, counts = np.unique(members, return_counts=True)
        cluster_to_labels[cluster] = candidates[counts.argmax()]
    return cluster_to_labels

In [9]:
# Encode the ground-truth labels as integers, then build the
# cluster -> majority-label lookup from the K-Means assignments.
label_encoder = LabelEncoder().fit(labels)
labels_numeric = label_encoder.transform(labels)
cluster_to_labels = map_clusters_to_labels(
    cluster_labels=cluster_labels,
    true_labels=labels_numeric,
)

In [10]:
# Evaluate clustering accuracy
# Each sample receives the label its cluster was mapped to; the share of
# samples whose mapped label equals the encoded true label is then reported.
# The mapping was derived from these same samples, so this is an in-sample figure.
predicted_labels = np.array(list(map(cluster_to_labels.__getitem__, cluster_labels)))
accuracy = accuracy_score(y_true=labels_numeric, y_pred=predicted_labels)
print(f"Akurasi K-Means clustering: {accuracy * 100:.2f}%")

Akurasi K-Means clustering: 11.03%


In [11]:
# Silhouette score of the K-Means assignment, computed on the PCA features.
silhouette_avg = silhouette_score(X=features_pca, labels=kmeans.labels_)
print("Silhouette Score:", silhouette_avg)

Silhouette Score: 0.046357278
