In [None]:
# Final Exam Spring 2025 - Adrian Halgas

**Problem 1 Clustering Noisy Images**

DATATSET 8358 sampled images with not-uniform label distribution. Each row is an image: first column is the label (digit), then the other 784 columns are pixel values. These images have noise in them; to achieve a better result, you need to work on the features first.

Task: Run a clustering algorithm on the given data set, extract k=10 clusters, and report entropy statistics using the given evaluation function (or write your own). You will have to decide the data preprocessing (if any) and the clustering algorithm. You can use scientific computing libraries (e.g. NumPy / SciPy) for both processing (for example PCA) and clustering (for example KMeans), and you can use any functions you developed in your previous homeworks.

Labels are not to be used during the algorithm/clustering/preprocessing, but only for evaluation: print a confusion matrix of counts, calculate entropy on each row and column, and compute weighted_by_count average entropy for rows (labels) and columns (clusters). Make sure to include all datapoints into the K=10 clusters.

In [28]:
import numpy as np
import pandas as pd

In [29]:
# Loading the data from the file

# Path to the dataset file
file_path = '/content/pb1data_XW_8358.txt'

data = pd.read_csv(file_path, header=None)
print(f"dataset shape: {data.shape}")
data.head()

dataset shape: (8358, 785)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
0,0,53,40,119,95,66,4,56,101,52,...,13,86,136,9,70,83,60,53,17,88
1,0,42,100,31,126,79,135,84,22,69,...,13,68,35,41,121,51,122,38,36,78
2,0,60,10,16,115,73,124,1,25,62,...,19,77,25,82,18,116,82,76,61,28
3,0,125,76,87,48,101,48,74,11,51,...,121,108,25,97,30,32,131,34,50,125
4,0,24,48,136,84,141,79,82,28,89,...,88,69,141,129,48,37,67,81,108,122


In [42]:
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
import random

# Set random seeds for reproducibility
np.random.seed(42)
random.seed(42)

X = data.values.astype(np.float32) / 255.0  # Scale pixel values to [0, 1]

# First column of the original DataFrame contains the labels:
y = pd.read_csv(file_path, header=None)[0].values.astype(int)  # Extract labels

# Run KMeans clustering (using 10 clusters for the 10 digits)
kmeans = KMeans(n_clusters=10, random_state=42)
cluster_ids = kmeans.fit_predict(X)
print("KMeans clustering complete.")

pca = PCA(n_components=20)
X_pca = pca.fit_transform(X)
print("PCA complete.")
#print("Explained variance ratio (top 20 components):")
#print(pca.explained_variance_ratio_)

# Run KMeans clustering after PCA (using 10 clusters for the 10 digits)
kmeansPCA = KMeans(n_clusters=10, random_state=42)
cluster_idsPCA = kmeans.fit_predict(X_pca)
print("KMeans clustering after PCA complete.")

# Trying to run Kernel PCA like we did in HW3A in order to work better with noisy images
kernelPCA = KernelPCA(n_components=20, kernel='rbf')
X_kernPCA = kernelPCA.fit_transform(X)
print("Kernel PCA complete.")

# Run KMEANS After KPCA
kmeansKPCA = KMeans(n_clusters=10, random_state=42)
cluster_idsKPCA = kmeans.fit_predict(X_kernPCA)
print("KMeans clustering after KPCA complete.")

KMeans clustering complete.
PCA complete.
KMeans clustering after PCA complete.
Kernel PCA complete.
KMeans clustering after KPCA complete.


KMEANS on 8358 sampled images given evaluation function

In [43]:
def evaluate(true_labels: np.ndarray, pred_labels: np.ndarray) -> tuple:
  """Entropy-based evaluation of a label assignment.

  Parameters:
    true_labels: the ground-truth class labels on the input data.
    pred_labels: the predicted class labels on the input data.

  Returns:
    a tuple (CM, (cs_e, cr_e, we)) containing the confusion matrix `CM`, the class entropies `cs_e`,
    the cluster entropies `cr_e`, and the averaged weighted entropies `we`.
  """
  from scipy.stats import entropy

  assert len(true_labels) == len(pred_labels), "Label predictions don't match"

  ## Map the labels to index set {0, 1, ..., k - 1 }
  t_classes, t_labels = np.unique(true_labels, return_inverse=True)
  p_classes, p_labels = np.unique(pred_labels, return_inverse=True)
  assert np.all(np.isin(p_classes, t_classes)), "Predicted class outside of labels given"

  ## Accumulate the counts
  n_classes = len(t_classes)
  CM = np.zeros(shape=(n_classes, n_classes), dtype=np.uint32)
  ind = np.ravel_multi_index([t_labels, p_labels], CM.shape)
  np.add.at(CM.ravel(), ind, 1)

  ## Compute the entropy of the empirical row/column distributions
  empirical_dist = lambda x: x / np.sum(x)
  cluster_entropy = np.apply_along_axis(lambda x: entropy(empirical_dist(x), base=2), 0, CM)
  class_entropy = np.apply_along_axis(lambda x: entropy(empirical_dist(x), base=2), 1, CM)

  ## Average w/ count weights
  w_cluster_entropy = np.sum(cluster_entropy * CM.sum(axis=0)) / len(y)
  w_class_entropy = np.sum(class_entropy * CM.sum(axis=1)) / len(y)
  w_entropies = np.array([w_class_entropy, w_cluster_entropy])

  with np.printoptions(precision=3):
    print(f"Class Entropies: {class_entropy}")
    print(f"Cluster Entropies: {cluster_entropy}")
    print(f"Weighted average entropies: {w_entropies}, (avg: {np.mean(w_entropies):.3f})")
  return CM, (w_class_entropy, w_cluster_entropy, w_entropies)

In [44]:
evaluate(y, cluster_ids)

Class Entropies: [1.377 1.093 2.443 1.758 2.062 2.176 1.563 1.586 2.636 1.744]
Cluster Entropies: [1.897 2.04  1.109 2.179 1.945 1.156 0.972 2.174 0.573 2.238]
Weighted average entropies: [1.761 1.685], (avg: 1.723)


(array([[ 29,   1,   0,   9,  25,  25,   0,  78, 500,   5],
        [  1,   1, 853,   0,   7,   1, 710,   4,   0,   3],
        [ 23,  10,  22, 237,  79,  34,  47,  21,   8,   9],
        [334,   7,  42,   6, 541,  12,   6,  28,   2,  40],
        [  7, 274,   9, 215,   0,  16,  20,  44,   0, 376],
        [356,  13,  17,   3, 261,  10,  11, 438,  14,  90],
        [  4,   1,  22, 139,   5, 476,   4,  67,   9,   1],
        [  1, 291,  23,   6,   0,   1,  13,  11,   2, 167],
        [103,  50,  14,  16, 170,   9,  18,  83,   3,  29],
        [  7, 264,  17,  57,   8,   0,   3,   6,   3, 321]], dtype=uint32),
 (np.float64(1.7605494229029806),
  np.float64(1.6847190353687482),
  array([1.76054942, 1.68471904])))

In [45]:
evaluate(y, cluster_idsPCA)

Class Entropies: [1.336 1.646 2.569 1.838 1.726 2.33  1.368 1.592 2.529 1.537]
Cluster Entropies: [1.821 0.756 2.471 2.123 1.537 2.103 1.264 1.607 0.715 2.034]
Weighted average entropies: [1.823 1.757], (avg: 1.790)


(array([[ 30,   0,  89,   6,   0,   1,   0,  20, 496,  30],
        [  1, 540,   2,   1, 596,   2, 431,   1,   0,   6],
        [ 35,  17,  74,  19,  17,  10,  40, 220,   9,  49],
        [453,   7,  35,  29,  50,   7,   6,  15,   4, 412],
        [  2,  10,  53, 492,  22, 334,  14,  34,   0,   0],
        [348,   2, 356,  67,  26,  52,  14,   9,  21, 318],
        [  7,   3, 177,   8,  24,   0,   2, 492,   8,   7],
        [  0,  17,   8, 143,  23, 309,  11,   2,   2,   0],
        [102,   2,  57,  23,  15,  48,  25,  16,   6, 201],
        [  4,   5,   6, 306,  22, 323,   2,   1,   5,  12]], dtype=uint32),
 (np.float64(1.8229705534554745),
  np.float64(1.7565756588868129),
  array([1.82297055, 1.75657566])))

In [46]:
evaluate(y, cluster_idsKPCA)

Class Entropies: [1.334 1.077 2.352 1.811 1.972 2.212 1.499 1.53  2.649 1.682]
Cluster Entropies: [1.87  1.169 1.251 2.193 1.693 2.201 1.963 1.816 0.674 1.3  ]
Weighted average entropies: [1.737 1.654], (avg: 1.696)


(array([[ 24,   0,   0,   5,   7,  16,   0,  87, 500,  33],
        [  1, 854, 712,   3,   7,   0,   1,   1,   0,   1],
        [ 19,  24,  41,  11,  96, 243,  11,  13,   8,  24],
        [329,  43,   8,  41, 533,   6,   6,  37,   3,  12],
        [  6,  10,  27, 390,   0, 192, 303,   9,   0,  24],
        [360,  21,  52,  62, 140,   5,  11, 521,  22,  19],
        [  5,  26,   8,   1,   1, 198,   0,  25,   9, 455],
        [  0,  25,  14, 175,   0,   6, 288,   3,   3,   1],
        [112,  14,  25,  29,  81,  16,  36, 168,   4,  10],
        [  8,  17,   4, 334,   5,  45, 265,   5,   3,   0]], dtype=uint32),
 (np.float64(1.7368066919099285),
  np.float64(1.6542645105614964),
  array([1.73680669, 1.65426451])))

**Problem 2**

An auction house decides each morning, randomly based on internal rules, what class of products will be auctioned: Cars, Jewelry, Paintings, or Houses.
Each class has it own bidders who are called to place bids on matching days, characterized by a bidding_rate parameter λ, and assumed that bidding intervals overall follows negative exponential distribution . That is, probability for a bid to not occur decreases exponentially with length of time. For each day we record the number of bids, which theory dictates must follow E[#bids] = λ, E[bidding_interval] = 1 / λ

Part A (25 points). Given that exponential distribution assumption on bidding intervals, figure out the proper distribution for the #bids/day, parametrized by λ. You can use online resources to do so.

Part B (75 points) The file contains counts of auction bids for 10000 days, without specifying which class was auctioned per day. Estimate the rate_bidding parameter for each class (4 λ values) and also estimate how many days each class was auctioned (4 counts out of 10000).
Hint: use EM on a mixture of 4 distributions found in part A. You can use libraries for distribution computation (pdf), but the EM steps have to be your own implementation. Here is a possible result
Estimated λ-s: [ 6.13 15.22 1.97 22.34]
Estimated #days : [3087 1272 1953 3687 ]

**Part A Answer**

The proper distribution to use for the #bids/day is the Poisson distribution. This is because when the time between events like for our bids, follows an exponential distribution. This tells us that the events occur randomly and independently over time. This is the defining characteristic of a Poisson process.



In [47]:
# Problem 2 Part B below
# Loading the data from the file

# Path to the dataset file
file_path2 = '/content/pb2data_D.txt'

data2 = pd.read_csv(file_path, header=None)
print(f"dataset shape: {data2.shape}")
data2.head()

dataset shape: (8358, 785)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
0,0,53,40,119,95,66,4,56,101,52,...,13,86,136,9,70,83,60,53,17,88
1,0,42,100,31,126,79,135,84,22,69,...,13,68,35,41,121,51,122,38,36,78
2,0,60,10,16,115,73,124,1,25,62,...,19,77,25,82,18,116,82,76,61,28
3,0,125,76,87,48,101,48,74,11,51,...,121,108,25,97,30,32,131,34,50,125
4,0,24,48,136,84,141,79,82,28,89,...,88,69,141,129,48,37,67,81,108,122


In order to help implement my solution, I am using this as a resource:

https://towardsdatascience.com/implementing-expectation-maximisation-algorithm-from-scratch-with-python-9ccb2c8521b3/