In [1]:
#Clustering Tests by Daniel Mcdonough
import matplotlib.pyplot as plt
from copy import deepcopy
import random
from sklearn.datasets import load_digits
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import AgglomerativeClustering
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn import metrics

# The rep matrix is more or less the same as the confusion matrix.
def get_rep_matrix(labels,pred_labels,k=10):
    """Build a k-by-k co-occurrence count matrix and print each row's argmax.

    Entry ``[y][x]`` counts the positions ``i`` for which ``labels[i] == y``
    and ``pred_labels[i] == x``. Rows are therefore indexed by the values in
    ``labels`` and columns by the values in ``pred_labels``.

    Args:
        labels: Indexable sequence of integer row indices; it is read at
            every position ``0 .. len(pred_labels) - 1``.
        pred_labels: Sequence of integer column indices; its length decides
            how many pairs are counted.
        k: Side length of the square matrix. Values in either sequence must
            be valid indices into a length-``k`` axis.

    Returns:
        The ``(k, k)`` float array of counts.

    Side effects:
        Prints one "Best representative of cluster" line per matrix row,
        reporting the column index with the largest count in that row.
        Note that the printed "cluster" number is the row index, i.e. a
        value from ``labels``.
    """
    counts = np.zeros((k,k))
    # Walk the predictions and look the matching label up by position, so the
    # number of counted pairs is governed by len(pred_labels).
    for position, column in enumerate(pred_labels):
        row = labels[position]
        counts[row, column] += 1
    for row_index, row_counts in enumerate(counts):
        print("Best representative of cluster ",row_index," is ",np.argmax(row_counts))
    return counts

def main():
    """Cluster the scikit-learn digits data two ways and print an evaluation.

    First fits ``AffinityPropagation`` on the first 10 samples and predicts
    cluster ids for the remaining samples; then fits
    ``AgglomerativeClustering`` with 10 clusters on the full data set. For
    each result it prints the representative table from ``get_rep_matrix``,
    the confusion matrix of true digit labels against cluster ids, and the
    Fowlkes-Mallows score (printed under the heading "Accuracy").
    """

    def report(true_labels, cluster_ids):
        # Shared evaluation for both algorithms; the call order matters
        # because get_rep_matrix prints before the matrix and score lines.
        get_rep_matrix(true_labels, cluster_ids, 10)
        matrix = confusion_matrix(true_labels, cluster_ids)
        score = metrics.fowlkes_mallows_score(true_labels, cluster_ids)

        print("Confusion Matrix:\n", matrix)
        print("Accuracy: ", score)

    dataset = load_digits()

    print("K-means Representation is in Problem 1. \n")

    features = dataset.data
    # Not used below; retained from the original unpacking of the data shape.
    sample_count, feature_count = features.shape
    targets = dataset.target

    # Number of leading samples used to fit affinity propagation; the
    # remaining samples (and their labels) are held out for evaluation.
    # The original note reads: "ratio for affinity is n = clusters".
    split = 10
    fit_samples = features[:split]
    held_out_samples = features[split:]

    # Affinity propagation is fitted on the leading samples only, then asked
    # to assign the held-out samples to the clusters it found.
    affinity_model = AffinityPropagation(preference=10).fit(fit_samples)
    print("\nAffinity Propagation: \n")

    held_out_targets = targets[split:]
    report(held_out_targets, affinity_model.predict(held_out_samples))

    # Agglomerative clustering is fitted on, and evaluated against, every
    # sample, using the cluster ids stored on the fitted estimator.
    agglomerative_model = AgglomerativeClustering(n_clusters=10).fit(features)
    print("\nAgglomerative Clustering: \n")

    report(targets, agglomerative_model.labels_)

# Run the clustering comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

K-means Representation is in Problem 1. 


Affinity Propagation: 

Best representative of cluster  0  is  0
Best representative of cluster  1  is  1
Best representative of cluster  2  is  3
Best representative of cluster  3  is  3
Best representative of cluster  4  is  4
Best representative of cluster  5  is  3
Best representative of cluster  6  is  6
Best representative of cluster  7  is  7
Best representative of cluster  8  is  8
Best representative of cluster  9  is  5
Confusion Matrix:
 [[173   0   0   0   0   0   4   0   0   0]
 [  1 122   5  10   5  15  22   0   1   0]
 [  6  10  22 109   0   1   7   1  20   0]
 [  3   6   3 147   0   8   0   7   8   0]
 [ 13  17   0   0 118   0  19   8   0   5]
 [ 33  16   0  49   0  21  21  28   1  12]
 [  4   0   1   0   0   0 174   0   1   0]
 [ 11   6   1   2   1   0   0 156   1   0]
 [  3  26  20   8   0   2   4   5 104   1]
 [ 29   4   0  27   2  73   0  11   5  28]]
Accuracy:  0.48394843088161105

Agglomerative Clustering: 

Best represen