In [53]:
# ============================
# Cell 1: Imports & Setup
# ============================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ============================
# Cell 2: Data Loading & Preprocessing
# ============================
def load_and_preprocess(path):
    df = pd.read_csv(path)
    y = (df['diagnosis'] == 'M').astype(int).values
    X = df.drop(columns=['id','diagnosis'], errors='ignore').values.astype(float)
    X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)

    # Standardization  
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    std[std == 0] = 1
    X = (X - mean) / std

    return X, y

# Load dataset
X, y = load_and_preprocess("breast_cancer.csv")
print("Data shape:", X.shape, "Labels shape:", y.shape)


Data shape: (569, 31) Labels shape: (569,)


In [54]:
# ============================
# Cell 3: PCA Implementation
# ============================
class PCA:
    def __init__(self, n_components):
        self.n = n_components

    def fit(self, X):
        cov = np.cov(X, rowvar=False)
        vals, vecs = np.linalg.eigh(cov)
        idx = np.argsort(vals)[::-1]
        self.vals = vals[idx]
        self.vecs = vecs[:, idx]
        self.components = self.vecs[:, :self.n]
        return self

    def transform(self, X):
        return X @ self.components

    def inverse_transform(self, Z):
        return Z @ self.components.T

    def reconstruction_error(self, X):
        Z = self.transform(X)
        X_hat = self.inverse_transform(Z)
        return np.mean((X - X_hat) ** 2)



In [55]:
# ============================
# Cell 4: K-Means Implementation
# ============================
class KMeans:
    def __init__(self, k, iters=100, init='kmeans++'):
        self.k = k
        self.iters = iters
        self.init = init

    def _init_centroids(self,X):
        n = X.shape[0]
        if self.init=='random':
            return X[np.random.choice(n,self.k,replace=False)]
            
        elif self.init=='kmeans++':
            centroids = [X[np.random.randint(0,n)]]
            for _ in range(1,self.k):
                dist = np.min([np.sum((X-c)**2,axis=1) for c in centroids],axis=0)
                
                # to avoid NaN
                if np.all(dist==0) or np.isnan(dist).any():
                    prob = np.ones(n)/n
                    
                else:
                    prob = dist/dist.sum()
                centroids.append(X[np.random.choice(n,p=prob)])
                
            return np.array(centroids)

    def fit(self,X):
        n = X.shape[0]
        self.centroids = self._init_centroids(X)
        self.inertia_history = []

        for _ in range(self.iters):
            dists = np.linalg.norm(X[:,None]-self.centroids,axis=2)
            self.labels = np.argmin(dists,axis=1)

            new_centroids = np.array([X[self.labels==i].mean(axis=0) for i in range(self.k)])
            if np.allclose(self.centroids,new_centroids, atol=1e-6):
                break
            self.centroids = new_centroids

        self.inertia = np.sum((X-self.centroids[self.labels])**2)
        return self



In [56]:
# ============================
# Cell 5: GMM Implementation
# ============================
class GMM:
    def __init__(self, k, iters=100, reg_covar=1e-6):
        self.k = k
        self.iters = iters
        self.reg_covar = reg_covar

    def fit(self, X):
        n, d = X.shape
        self.means = X[np.random.choice(n, self.k, False)]
        self.covs = [np.eye(d) for _ in range(self.k)]
        self.weights = np.ones(self.k)/self.k
        self.log_likelihoods = []

        for _ in range(self.iters):
            resp = np.zeros((n, self.k))
            for i in range(self.k):
                diff = X - self.means[i]
                cov = self.covs[i] + self.reg_covar * np.eye(d)  # <-- regularization
                inv = np.linalg.inv(cov)
                exp = np.exp(-0.5 * np.sum(diff @ inv * diff, axis=1))
                denom = np.sqrt((2*np.pi)**d * np.linalg.det(cov))
                resp[:,i] = self.weights[i] * exp / denom

            resp_sum = resp.sum(axis=1, keepdims=True)
            resp = resp / resp_sum

            Nk = resp.sum(axis=0)
            for i in range(self.k):
                self.means[i] = (resp[:,i][:,None] * X).sum(axis=0) / Nk[i]
                diff = X - self.means[i]
                self.covs[i] = (resp[:,i][:,None] * diff).T @ diff / Nk[i] + self.reg_covar*np.eye(d)
                self.weights[i] = Nk[i]/n

            ll = np.sum(np.log(resp_sum))
            self.log_likelihoods.append(ll)

        return self




In [57]:
# ============================
# Cell 6: Autoencoder Implementation
# ============================
# ============================
# Fixed Autoencoder
# ============================
class Autoencoder:
    def __init__(self, layers, lr=0.001, iters=200, l2_reg=1e-4):
        self.layers = layers
        self.lr = lr
        self.iters = iters
        self.l2_reg = l2_reg
        self.W, self.b = [], []
        for i in range(len(layers)-1):
            # Small weights init
            self.W.append(np.random.randn(layers[i], layers[i+1])*0.01)
            self.b.append(np.zeros(layers[i+1]))
        self.losses = []

    def relu(self,x): 
        return np.maximum(0, np.minimum(x, 1e6))  # clip to avoid overflow

    def drelu(self,x): 
        return (x>0).astype(float)

    def fit(self,X):
        for _ in range(self.iters):
            A = [X]
            for W,b in zip(self.W,self.b):
                A.append(self.relu(A[-1]@W + b))
            loss = np.mean((A[-1]-X)**2) + self.l2_reg*sum(np.sum(W**2) for W in self.W)
            self.losses.append(loss)

            grad = 2*(A[-1]-X)/len(X)
            for i in reversed(range(len(self.W))):
                dW = A[i].T @ grad + self.l2_reg*2*self.W[i]
                db = grad.sum(axis=0)
                grad = grad @ self.W[i].T * self.drelu(A[i])
                self.W[i] -= self.lr*dW
                self.b[i] -= self.lr*db
        return self

    def encode(self,X):
        A = X
        mid = len(self.W)//2
        for i in range(mid):
            A = self.relu(A @ self.W[i] + self.b[i])
        return A



In [58]:
# ============================
# Cell 7: Internal Metrics
# ============================
def silhouette_score(X, labels):
    s = []
    for i in range(len(X)):
        same = X[labels==labels[i]]
        other = [X[labels==l] for l in set(labels) if l!=labels[i]]
        a = np.mean(np.linalg.norm(same-X[i],axis=1))
        b = min(np.mean(np.linalg.norm(o-X[i],axis=1)) for o in other)
        s.append((b-a)/max(a,b))
    return np.mean(s)



In [59]:
# ============================
# Cell 8: External Metrics
# ============================
def purity(y, labels):
    total = 0
    for c in set(labels):
        vals = y[labels==c]
        total += np.max(np.bincount(vals))
    return total/len(y)



In [60]:
# ============================
# Cell 9: Example Experiment – KMeans on Original
# ============================
for k in range(2,11):
    km = KMeans(k, iters=50, init='kmeans++').fit(X)
    print(f"k={k}, Inertia={km.inertia}")



k=2, Inertia=11595.526607115786
k=3, Inertia=10061.797818243696
k=4, Inertia=9511.763892320432
k=5, Inertia=8651.889204378089
k=6, Inertia=7981.501461252855
k=7, Inertia=7911.50519203572
k=8, Inertia=7543.749832285338
k=9, Inertia=7122.781035775732
k=10, Inertia=6773.684326509476


In [61]:
# ============================
# Cell 10: Example Experiment – GMM on Original
# ============================
for k in range(2,11):
    gmm = GMM(k, iters=50).fit(X)
    print(f"k={k}, LogLikelihood={gmm.log_likelihoods[-1]}")



k=2, LogLikelihood=3583.1451934537254
k=3, LogLikelihood=4462.013447480359
k=4, LogLikelihood=5929.654364797127
k=5, LogLikelihood=6879.855615217721
k=6, LogLikelihood=7962.157025721021
k=7, LogLikelihood=7964.914281709543
k=8, LogLikelihood=9861.5687881802
k=9, LogLikelihood=9920.612658385773
k=10, LogLikelihood=12214.883945982916


In [62]:
# ============================
# Cell 11: PCA + KMeans Example
# ============================
for d in [2,5,10,15,20]:
    Z = PCA(d).fit(X).transform(X)
    km = KMeans(2, iters=50, init='kmeans++').fit(Z)
    print(f"Dims={d}, Inertia={km.inertia}")



Dims=2, Inertia=5332.982035478693
Dims=5, Inertia=8992.130462630226
Dims=10, Inertia=10770.705565729244
Dims=15, Inertia=11365.570468457012
Dims=20, Inertia=11520.165852974756


In [63]:
# ============================
# Cell 12: Autoencoder + KMeans Example (fixed)
# ============================
input_dim = X.shape[1]  # 31
for d in [2,5,10,15,20]:
    Z = Autoencoder([input_dim, 20, d, 20, input_dim], iters=100).fit(X).encode(X)
    km = KMeans(2, iters=50, init='kmeans++').fit(Z)
    print(f"Dims={d}, Inertia={km.inertia}")


Dims=2, Inertia=0.0002956722547536264
Dims=5, Inertia=0.003221066920350642
Dims=10, Inertia=0.004389517968890002
Dims=15, Inertia=0.00470846464899286
Dims=20, Inertia=0.007166805334063794
