# **Problem 1: Breast Cancer Prevention using K-Means Algorithm**

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import scipy.io

from sklearn.cluster import AgglomerativeClustering
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# A.

In [2]:
X = pd.read_csv('breast_data.csv')
y = pd.read_csv('breast_labels.csv')
init_mu = scipy.io.loadmat('init_mu.mat')

In [3]:
class KMeansCluster:
    def __init__(self, k, tol, max_iter):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter
        self.centers = None

    def fit(self, X: np.ndarray, mu: np.ndarray):
        # Initialize cluster centers with the provided mu
        self.centers = mu.copy()

        for _ in range(self.max_iter):
            # Assign each data point to the nearest cluster
            labels = self.predict(X)

            # Update cluster centers
            new_centers = []
            for i in range(self.k):
                new_center_of_cluster_k = X[labels == i].mean(axis=0)
                new_centers.append(new_center_of_cluster_k)
            new_centers = np.array(new_centers)

            # Check for convergence
            if np.linalg.norm(new_centers - self.centers) < self.tol:
                break

            # Update cluster centers for the next iteration
            self.centers = new_centers

    def accuracy(self, y: np.ndarray):
        # Check the accuracy of the clustering
        n_data = y.shape[0]
        correct = 0
        for i in range(n_data):
            if y[i] == self.labels[i]:
                correct += 1

        acc = correct / n_data
        if acc < 0.5:
          acc = 1 - acc
        return acc

    def predict(self, X: np.ndarray):
        # Assign each data point to the nearest cluster
        distances = np.linalg.norm(X[:, np.newaxis] - self.centers, axis=2)
        labels = np.argmin(distances, axis=1)
        self.labels = labels
        return labels

    def sse(self):
        if self.centers is None:
            raise ValueError("Call the 'fit' method before evaluating SSE.")

        # Calculate SSE
        sse = 0.0
        for i in range(self.k):
            cluster_points = X[self.labels == i]
            sse += np.sum(np.linalg.norm(cluster_points -
                          self.centers[i], axis=1)**2)

        return sse

# B.

In [4]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
y_numpy = y.to_numpy()

In [5]:
for i in range(5):
    n_clusters = 2
    num_features = X_scaled.shape[1]
    random_centers = np.random.rand(n_clusters, num_features)
    
    kmeans = KMeansCluster(k=2, tol=0.5, max_iter=1000)
    kmeans.fit(X_scaled, random_centers)

    print(f"Test {i}: ")
    print("Accuracy: ", kmeans.accuracy(y_numpy))
    print("SSE: ", kmeans.sse())
    print("")

Test 0: 
Accuracy:  0.9242957746478874
SSE:  949275498.5759952

Test 1: 
Accuracy:  0.8961267605633803
SSE:  949391266.6407444

Test 2: 
Accuracy:  0.9119718309859155
SSE:  949279404.4161131

Test 3: 
Accuracy:  0.9242957746478874
SSE:  949268615.750155

Test 4: 
Accuracy:  0.903169014084507
SSE:  949183487.3352125



# C.

In [6]:
init_centers = init_mu['mu_init'].T
init_centers.shape

(2, 30)

In [7]:
# max_iter is set to 1
n_clusters = 2
num_features = X_scaled.shape[1]

kmeans_init_mu = KMeansCluster(k=2, tol=0.001, max_iter=1)
kmeans_init_mu.fit(X_scaled, init_centers)

print("Accuracy: ", kmeans_init_mu.accuracy(y_numpy))
print("SSE: ", kmeans_init_mu.sse())
print("")

Accuracy:  0.5774647887323944
SSE:  949913991.5424824



In [8]:
n_clusters = 2
num_features = X_scaled.shape[1]

kmeans_init_mu = KMeansCluster(k=2, tol=0.001, max_iter=1000)
kmeans_init_mu.fit(X_scaled, init_centers)

print("Accuracy: ", kmeans_init_mu.accuracy(y_numpy))
print("SSE: ", kmeans_init_mu.sse())
print("")

Accuracy:  0.9066901408450704
SSE:  949375956.5257089



# D.

In [9]:
true_centers = kmeans_init_mu.centers
true_centers.shape

(2, 30)

In [10]:
kmeans_true_centers = KMeansCluster(k=2, tol=0.001, max_iter=1)
kmeans_true_centers.fit(X_scaled, true_centers)

print("Accuracy: ", kmeans_true_centers.accuracy(y_numpy))
print("SSE: ", kmeans_true_centers.sse())
print("")

Accuracy:  0.9066901408450704
SSE:  949375956.5257089



In [11]:
kmeans_true_centers = KMeansCluster(k=2, tol=0.001, max_iter=1000)
kmeans_true_centers.fit(X_scaled, true_centers)

print("Accuracy: ", kmeans_true_centers.accuracy(y_numpy))
print("SSE: ", kmeans_true_centers.sse())
print("")

Accuracy:  0.9066901408450704
SSE:  949375956.5257089



# E.

### Unsupervised

In [12]:
cluster = AgglomerativeClustering()
labels = cluster.fit_predict(X_scaled)
print("Accuracy: ", 1 - accuracy_score(y, labels))

Accuracy:  0.9014084507042254


### Supervised

In [13]:
y_numpy = y_numpy.ravel() 
svm = SVC()

labels = svm.fit(X_scaled, y_numpy).predict(X_scaled)
print("Accuracy: ", accuracy_score(y_numpy, labels))

Accuracy:  0.9876760563380281
