In [1]:
#!pip install munkres

In [2]:
import pandas as pd
from sklearn import preprocessing

from sklearn.preprocessing import StandardScaler

In [3]:
# Load the NLM1 table into a DataFrame for the exploratory cells below.
# NOTE(review): the path is absolute and user-specific, so it presumably only
# resolves on the original author's machine — confirm before re-running.
dataset = pd.read_csv('/Users/madhuri/Desktop/MasterThesis/Part1/NLM_sets_top_10_clean/NLM1.csv')

In [4]:
# Number of rows for each distinct value of the 'tissue_name' column.
dataset['tissue_name'].value_counts()

Leukemia patient sample                                          2096
Breast Tumor                                                     1789
HumanAorticEndothelialCells                                      1014
Lymphoblastoid cell line                                          817
pre-treatment bone marrow                                         817
PBMC                                                              550
Blasts and mononuclear cells, AML patient                         525
primary colorectal adenocarcinoma                                 519
airway epithelial cells obtained by bronchoscopy and brushing     333
colon cancer tissue                                               331
Name: tissue_name, dtype: int64

In [8]:
'''
If we use (external) classification evaluation measures like F1 or 
accuracy for clustering evaluation, problems may arise: cluster ids are 
arbitrary, so cluster 0 need not correspond to class 0. 

One way to fix this is to perform label matching.

Here we perform k-means clustering on the NLM1 tissue dataset and then use 
the Hungarian (Munkres) algorithm to correct the mismatched labeling. 
'''

import sys
import numpy as np
from sklearn.cluster import KMeans
from sklearn import datasets
from sklearn.metrics import confusion_matrix

from munkres import Munkres

def make_cost_matrix(c1, c2):
    """Build the assignment cost matrix between two labelings of the same items.

    Entry ``[i, j]`` is minus the number of positions whose label in ``c1`` is
    the i-th unique value of ``c1`` and whose label in ``c2`` is the j-th
    unique value of ``c2``. Minimising the total cost of a one-to-one
    assignment therefore maximises the overlap between the two labelings.

    Parameters
    ----------
    c1, c2 : array-like
        Label sequences of equal length (e.g. cluster ids and class ids).
        Both must use exactly the same set of label values.

    Returns
    -------
    numpy.ndarray
        Float matrix of shape ``(k, k)`` with ``k`` the number of unique
        labels; every entry is ``<= 0``.

    Raises
    ------
    ValueError
        If ``c1`` and ``c2`` do not contain the same number of items.
    AssertionError
        If the two labelings do not share the same set of unique values.
    """
    # Coerce so plain Python lists work too (list == scalar is not elementwise).
    c1 = np.asarray(c1).ravel()
    c2 = np.asarray(c2).ravel()
    if c1.size != c2.size:
        raise ValueError(
            "c1 and c2 must label the same items: got %d and %d labels"
            % (c1.size, c2.size))

    # The inverse arrays give, per item, the row/column index of its label.
    uc1, rows = np.unique(c1, return_inverse=True)
    uc2, cols = np.unique(c2, return_inverse=True)
    l1 = uc1.size
    l2 = uc2.size
    assert(l1 == l2 and np.all(uc1 == uc2))

    # Accumulate the contingency table in one pass instead of intersecting
    # index sets for every (i, j) pair.
    counts = np.zeros((l1, l2), dtype=np.int64)
    np.add.at(counts, (rows, cols), 1)

    # Negate while still integer so empty cells become 0.0 rather than -0.0.
    return (-counts).astype(float)

def translate_clustering(clt, mapper):
    """Relabel every entry of ``clt`` through ``mapper`` and return an array.

    ``mapper`` is indexed with each label of ``clt`` in order; the looked-up
    values are collected into a new NumPy array.
    """
    translated = []
    for label in clt:
        translated.append(mapper[label])
    return np.array(translated)

def accuracy(cm):
    """Return the accuracy implied by confusion matrix ``cm``.

    Accuracy is the sum of the main diagonal (taken as float) divided by the
    sum of all entries.
    """
    diagonal_total = np.trace(cm, dtype=float)
    grand_total = np.sum(cm)
    return diagonal_total / grand_total

def main(csv_path='/Users/madhuri/Desktop/MasterThesis/Part1/NLM_sets_top_10_clean/NLM1.csv',
         random_state=0):
    """Cluster the tissue dataset with k-means and score it after label matching.

    Steps: read the CSV, label-encode the ``tissue_name`` column, standardise
    the remaining columns, run k-means with one cluster per encoded class,
    match cluster ids to class ids with the Hungarian (Munkres) algorithm, and
    print the resulting confusion matrix and accuracy.

    Parameters
    ----------
    csv_path : str
        CSV file with a ``tissue_name`` column; every other column is used as
        a feature. Defaults to the path originally hard-coded here.
    random_state : int
        Seed passed to ``KMeans``.

    Returns
    -------
    None
        All results are printed.
    """
    dataset = pd.read_csv(csv_path)
    unscaled_data = dataset.drop(['tissue_name'], axis=1)
    classes_actual = dataset['tissue_name']

    # Encode tissue names as integer class ids and show the name -> id mapping.
    le = preprocessing.LabelEncoder()
    classes = le.fit_transform(classes_actual)
    le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    print(le_name_mapping)

    num_labels = len(np.unique(classes))

    # Standardise features before clustering.
    sc = StandardScaler()
    data = sc.fit_transform(unscaled_data)

    # make_cost_matrix requires the cluster ids and class ids to be the same
    # set, so the cluster count is taken from the data rather than hard-coded.
    algo = KMeans(n_clusters=num_labels, random_state=random_state)
    labels = algo.fit(data).labels_

    cost_matrix = make_cost_matrix(labels, classes)

    # Each (row, col) pair maps a cluster id to its matched class id.
    m = Munkres()
    indexes = m.compute(cost_matrix)
    print("indexes:", indexes)
    mapper = {old: new for (old, new) in indexes}

    new_labels = translate_clustering(labels, mapper)
    print(classes[:-200])
    print(new_labels[:-200])

    new_cm = confusion_matrix(classes, new_labels, labels=range(num_labels))
    print("---------------------\nHungarian method confusion matrix:\n\n"
          " %s\naccuracy: %.2f" % (str(new_cm), accuracy(new_cm)))
   


# if __name__ == "__main__":
#     main()
#main()


{'Blasts and mononuclear cells, AML patient': 0, 'Breast Tumor': 1, 'HumanAorticEndothelialCells': 2, 'Leukemia patient sample': 3, 'Lymphoblastoid cell line': 4, 'PBMC': 5, 'airway epithelial cells obtained by bronchoscopy and brushing': 6, 'colon cancer tissue': 7, 'pre-treatment bone marrow': 8, 'primary colorectal adenocarcinoma': 9}
indexes: [(0, 7), (1, 9), (2, 2), (3, 4), (4, 3), (5, 5), (6, 0), (7, 6), (8, 8), (9, 1)]
[6 6 6 ... 1 1 1]
[6 6 6 ... 1 1 1]
---------------------
Hungarian method confusion matrix:

 [[ 522    0    0    3    0    0    0    0    0    0]
 [   3 1239    0    0    0    2    0    1    0  544]
 [   0    0 1014    0    0    0    0    0    0    0]
 [   3    0    0 1514    0    0    0  579    0    0]
 [   0    0    0    0  816    1    0    0    0    0]
 [ 512    0    0    0    0   21    0   17    0    0]
 [   0    0    0    0    0    0  333    0    0    0]
 [   0   44    0    0    0    0    0    0    0  287]
 [   0    1    0    1    0    0    0    3  812    0

In [None]:
def kmeans_assignment(centroids, points):
    """Assign every point to its nearest centroid (squared Euclidean distance).

    Parameters
    ----------
    centroids : array-like, shape (num_centroids, dim)
    points : array-like, shape (num_points, dim)

    Returns
    -------
    numpy.ndarray
        Integer array of length ``num_points``; entry ``p`` is the row index
        of the centroid closest to ``points[p]``. Ties go to the lowest index.

    Raises
    ------
    ValueError
        If either input is not 2-D, or the two inputs disagree on ``dim``.
    """
    centroids = np.asarray(centroids)
    points = np.asarray(points)
    if centroids.ndim != 2 or points.ndim != 2:
        raise ValueError("centroids and points must both be 2-D arrays")
    if centroids.shape[1] != points.shape[1]:
        raise ValueError(
            "dimension mismatch: centroids have %d columns, points have %d"
            % (centroids.shape[1], points.shape[1]))

    # Broadcasting (1, k, dim) against (n, 1, dim) yields the (n, k, dim)
    # differences directly, without materialising tiled copies of the inputs.
    diffs = centroids[np.newaxis, :, :] - points[:, np.newaxis, :]

    # Squared distance of every point to every centroid, then the closest one.
    distances = np.sum(np.square(diffs), axis=2)
    return np.argmin(distances, axis=1)