### ***Problem02***
---

In [None]:
### Import Library & Data 
# default
import numpy as np
import pandas as pd
from numpy.random import uniform 
import matplotlib.pyplot as plt
import seaborn as sns 
import random
# sklearn, tf
import tensorflow as tf
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.datasets import mnist
from sklearn import metrics

In [None]:
### Build K_Means
# Temporary dataset (MNIST) used to try out the hand-written k-means below
(X_train, y_train), (X_test, y_test) = mnist.load_data()
# Flatten every sample into a single feature row (one row per image)
n_train_samples = X_train.shape[0]
X_train = X_train.reshape((n_train_samples, -1))
print("X_train: ", X_train.shape)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
X_train:  (60000, 784)


In [None]:
### K_Means Model 
# Similarity measure: Euclidean distance
def euclidean(point, data):
    """Return the Euclidean distance from `point` to every row of `data`.

    Parameters
    ----------
    point : array-like of shape (n_features,)
        The reference point.
    data : array-like of shape (n_samples, n_features)
        Rows to measure against `point` (an ndarray or a list of 1-D arrays).

    Returns
    -------
    np.ndarray of shape (n_samples,)
        Distance between `point` and each row of `data`, as float64.
    """
    # Compute the difference directly in float64. With unsigned integer inputs
    # (e.g. uint8 pixel values) a plain `point - data` and the following square
    # wrap around modulo 256 and silently produce wrong distances.
    diff = np.subtract(point, data, dtype=np.float64)
    return np.sqrt(np.sum(diff * diff, axis=1))

# K_Means
class kmeans:
    """Minimal k-means clustering with distance-weighted centroid seeding.

    Parameters
    ----------
    n_clusters : int, default=10
        Number of centroids to fit.
    max_iter : int, default=50
        Upper bound on the number of assignment/update rounds.

    Attributes
    ----------
    centroids : list of np.ndarray
        Set by `fit`; one 1-D float64 array per cluster.
    """

    def __init__(self, n_clusters=10, max_iter=50):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    def fit(self, X_train):
        """Fit the centroids to `X_train` and return them.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
            Training samples, one per row.

        Returns
        -------
        list of np.ndarray
            The fitted centroids (also stored in `self.centroids`).

        Raises
        ------
        ValueError
            If `X_train` is empty or not two-dimensional.
        """
        # Work in float64: integer image data (uint8) would wrap around when
        # subtracted, corrupting the distances used for seeding.
        X = np.asarray(X_train, dtype=np.float64)
        if X.ndim != 2 or X.shape[0] == 0:
            raise ValueError(
                "X_train must be a non-empty 2-D array of shape (n_samples, n_features)"
            )

        # --- Initialisation: first centroid uniformly at random, the others
        # drawn with probability proportional to the summed distance to the
        # centroids chosen so far (far-away points are more likely).
        # Index sampling is used instead of random.choice(ndarray), which
        # relies on sequence truthiness on some Python versions.
        self.centroids = [X[random.randrange(len(X))]]
        for _ in range(self.n_clusters - 1):
            weights = np.sum([euclidean(c, X) for c in self.centroids], axis=0)
            total = weights.sum()
            # If every point coincides with the chosen centroids the weights
            # are all zero; fall back to a uniform draw instead of dividing by 0.
            probs = weights / total if total > 0 else None
            new_centroid_idx = np.random.choice(len(X), p=probs)
            self.centroids.append(X[new_centroid_idx])

        # --- Iterate until the centroids stop moving or max_iter is reached.
        iteration = 0
        prev_centroids = None
        while iteration < self.max_iter and (
            prev_centroids is None
            or not np.array_equal(self.centroids, prev_centroids)
        ):
            # Assignment step: label every point with its nearest centroid.
            # The centroid list is stacked once per round, not once per point.
            centers = np.asarray(self.centroids)
            labels = np.array([np.argmin(euclidean(x, centers)) for x in X])

            # Update step: each centroid becomes the mean of its members.
            # A cluster that lost all its points keeps its previous centroid.
            prev_centroids = self.centroids
            self.centroids = [
                X[labels == k].mean(axis=0) if np.any(labels == k) else prev_centroids[k]
                for k in range(self.n_clusters)
            ]
            iteration += 1
        return self.centroids

In [None]:
### Result of the hand-written k-means
# Bind the fitted model to its own name: `kmeans = kmeans()` would overwrite the
# class with an instance, so re-running this cell would fail with
# "'kmeans' object is not callable".
custom_kmeans = kmeans()
custom_centroids = custom_kmeans.fit(X_train)
# Show only the shape (n_clusters, n_features) instead of dumping every centroid.
np.shape(custom_centroids)

[array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.000000

In [None]:
### Import A (MNIST), B (Fashion-MNIST), C (20 Newsgroups) data sets
# A: MNIST, every image flattened into one feature row
(X_train_mnist, y_train_mnist), (X_test_mnist, y_test_mnist) = mnist.load_data()
X_train_mnist = X_train_mnist.reshape(len(X_train_mnist), -1)
X_test_mnist = X_test_mnist.reshape(len(X_test_mnist), -1)
print("X_train_mnist: ", X_train_mnist.shape)
# B: Fashion-MNIST, flattened the same way
(X_train_fashion, y_train_fashion), (X_test_fashion, y_test_fashion) = fashion_mnist.load_data()
X_train_fashion = X_train_fashion.reshape(len(X_train_fashion), -1)
X_test_fashion = X_test_fashion.reshape(len(X_test_fashion), -1)
# Label fixed: this line reports the Fashion-MNIST shape, not MNIST.
print("X_train_fashion: ", X_train_fashion.shape)
# C: 20 Newsgroups restricted to four categories, encoded as TF-IDF vectors
categories = ['alt.atheism', 'talk.religion.misc','comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train',categories=categories)
y_20ng = newsgroups_train.target
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
# Reuse the vocabulary fitted on the training subset for the test subset.
vectors_test = vectorizer.transform(newsgroups_test.data)
# train_test_split returns (X_train, X_test, y_train, y_test) in that order;
# unpacking it as (X_train, y_train, X_test, y_test) swaps features and labels.
X_train_20ng, X_test_20ng, y_train_20ng, y_test_20ng = train_test_split(vectors, y_20ng, test_size=0.3, random_state=22)

X_train_mnist:  (60000, 784)
X_train_mnist:  (60000, 784)


In [None]:
### Fit K-means (scikit-learn) on each data set
# A mnist
kmeans_mnist = KMeans(n_clusters=10, random_state=22, n_init="auto").fit(X_train_mnist)
# B fashion
kmeans_fashion = KMeans(n_clusters=10, random_state=22, n_init="auto").fit(X_train_fashion)
# C 20ng
# K-means is unsupervised: `fit` ignores any label argument, so none is passed,
# which keeps this call consistent with the A and B fits above.
# NOTE(review): K=20 is used although the data-loading cell keeps only four
# newsgroup categories - confirm this over-clustering is intended.
kmeans_20ng = KMeans(n_clusters=20, random_state=22, n_init="auto").fit(X_train_20ng)

In [None]:
### Fit K-means with a higher and a lower K per data set
# Each pair is fitted in (high K, low K) order with the same seed and n_init.
# The `kmneas_low_*` spelling is kept as-is so existing references keep working.
# A mnist: K = 20 / 5
kmeans_high_mnist, kmneas_low_mnist = (
    KMeans(n_clusters=k, random_state=22, n_init="auto").fit(X_train_mnist)
    for k in (20, 5)
)
# B fashion: K = 20 / 5
kmeans_high_fashion, kmneas_low_fashion = (
    KMeans(n_clusters=k, random_state=22, n_init="auto").fit(X_train_fashion)
    for k in (20, 5)
)
# C 20ng: K = 40 / 10
kmeans_high_20ng, kmneas_low_20ng = (
    KMeans(n_clusters=k, random_state=22, n_init="auto").fit(X_train_20ng)
    for k in (40, 10)
)

array([7, 4, 6, ..., 7, 3, 0], dtype=int32)

In [None]:
# Evaluate Models
# K-means cluster ids are arbitrary integers, so comparing them directly with
# class labels is meaningless. Each cluster is first mapped to the most frequent
# true label among its training members; the mapped predictions are then scored.
def majority_label_map(cluster_ids, true_labels, n_clusters):
    """Map each cluster id to the most frequent true label among its members.

    Parameters
    ----------
    cluster_ids : array-like of int, shape (n_samples,)
        Cluster index assigned to every sample.
    true_labels : array-like of non-negative int, shape (n_samples,)
        Ground-truth class of every sample.
    n_clusters : int
        Number of clusters (length of the returned lookup table).

    Returns
    -------
    np.ndarray of shape (n_clusters,)
        `mapping[k]` is the majority label of cluster `k`; a cluster without
        members falls back to the overall most frequent label.
    """
    cluster_ids = np.asarray(cluster_ids)
    true_labels = np.asarray(true_labels).astype(np.intp)
    fallback = np.bincount(true_labels).argmax()
    mapping = np.full(n_clusters, fallback, dtype=np.intp)
    for k in range(n_clusters):
        members = true_labels[cluster_ids == k]
        if members.size:
            mapping[k] = np.bincount(members).argmax()
    return mapping

# A mnist
map_mnist = majority_label_map(kmeans_mnist.labels_, y_train_mnist, kmeans_mnist.n_clusters)
score_mnist = accuracy_score(y_test_mnist, map_mnist[kmeans_mnist.predict(X_test_mnist)])
print("mnist:", score_mnist)
# B fashion
map_fashion = majority_label_map(kmeans_fashion.labels_, y_train_fashion, kmeans_fashion.n_clusters)
score_fashion = accuracy_score(y_test_fashion, map_fashion[kmeans_fashion.predict(X_test_fashion)])
print("fashion:", score_fashion)
# C 20ng
# The mapping is learned from the full training subset (`vectors` / `y_20ng`),
# then applied to the held-out test subset.
map_20ng = majority_label_map(kmeans_20ng.predict(vectors), y_20ng, kmeans_20ng.n_clusters)
pred_20ng = map_20ng[kmeans_20ng.predict(vectors_test)]
score_20ng = metrics.f1_score(newsgroups_test.target, pred_20ng, average='macro')
print("20ng:", score_20ng)

mnist: 0.0238
fashion: 0.1053
20ng: 0.009175870585418685
