In [1]:
#!pip install munkres

In [2]:
import pandas as pd
from sklearn import preprocessing

In [3]:
# Load the NLM3 CSV into a DataFrame.
# NOTE(review): absolute, machine-specific path -- the recorded run of this
# cell raised FileNotFoundError; confirm the file location before re-running.
dataset = pd.read_csv('/Users/madhuri/Desktop/MasterThesis/Part1/NNLM_sets_top_10_clean/NLM3.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/Users/madhuri/Desktop/MasterThesis/Part1/NNLM_sets_top_10_clean/NLM3.csv'

In [5]:
# Show how many rows carry each distinct `tissue_name` value.
dataset['tissue_name'].value_counts()

Leukemia patient sample                                          2096
breast tumor                                                     1365
HumanAorticEndothelialCells                                      1014
Lymphoblastoid_cell_line                                          817
pre-treatment bone marrow                                         817
PBMC                                                              550
Blasts and mononuclear cells, AML patient                         525
primary colorectal adenocarcinoma                                 519
airway epithelial cells obtained by bronchoscopy and brushing     333
colon cancer tissue                                               331
Name: tissue_name, dtype: int64

In [7]:
'''
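If we use (external) classification evaluation measures like F1 or
accuracy for clustering evaluation, problems may arise, because the
cluster ids produced by the algorithm are arbitrary and need not
coincide with the ids of the true classes.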

One way to fix is to perform label matching.

Here we perform k-means clustering on the NLM3 tissue dataset and then use
the Hungarian (Munkres) algorithm to correct the mismatched labeling.
'''

import sys
import numpy as np
from sklearn.cluster import KMeans
from sklearn import datasets
from sklearn.metrics import confusion_matrix

from munkres import Munkres

def make_cost_matrix(c1, c2):
    """Build the assignment cost matrix between two labelings of the same samples.

    Entry ``[i, j]`` is minus the number of samples that carry the i-th
    distinct label of ``c1`` and the j-th distinct label of ``c2`` (distinct
    labels in sorted order, as returned by ``np.unique``).  Overlaps are
    negated so that a minimum-cost assignment maximises the total agreement.

    Parameters
    ----------
    c1, c2 : 1-D array-like
        Label of every sample under the first / second labeling.

    Returns
    -------
    numpy.ndarray
        Float matrix of shape ``(k, k)`` where ``k`` is the number of
        distinct labels.

    Raises
    ------
    ValueError
        If the two labelings do not describe the same number of samples.
    AssertionError
        If the two labelings do not use exactly the same set of label values
        (kept as an assertion for backward compatibility).
    """
    c1 = np.asarray(c1)
    c2 = np.asarray(c2)
    if c1.shape != c2.shape:
        raise ValueError(
            "labelings must have the same shape, got %s and %s"
            % (c1.shape, c2.shape)
        )

    # `pos1[k]` / `pos2[k]` is the position of sample k's label inside uc1 / uc2.
    uc1, pos1 = np.unique(c1, return_inverse=True)
    uc2, pos2 = np.unique(c2, return_inverse=True)
    l1 = uc1.size
    l2 = uc2.size
    assert l1 == l2 and np.all(uc1 == uc2), \
        "both labelings must use the same set of label values"

    # Encode every (row, column) pair as one integer and count all pairs in a
    # single pass instead of intersecting index sets for each of the k*k cells.
    pair_codes = pos1.reshape(-1) * l2 + pos2.reshape(-1)
    overlap = np.bincount(pair_codes, minlength=l1 * l2).reshape(l1, l2)

    # Negate while still integer so empty cells become 0.0 rather than -0.0.
    return (-overlap).astype(float)

def translate_clustering(clt, mapper):
    """Relabel a clustering by looking up every label of ``clt`` in ``mapper``.

    ``clt`` is iterated in order and each label ``x`` is replaced by
    ``mapper[x]``; the relabelled sequence is returned as a new NumPy array.
    """
    relabelled = []
    for label in clt:
        relabelled.append(mapper[label])
    return np.array(relabelled)

def accuracy(cm):
    """Return the accuracy encoded in confusion matrix ``cm``.

    Accuracy is the sum of the diagonal entries (accumulated as float)
    divided by the sum of all entries.
    """
    diagonal_total = np.trace(cm, dtype=float)
    grand_total = np.sum(cm)
    return diagonal_total / grand_total

def main(csv_path='/Users/madhuri/Desktop/MasterThesis/Part1/NNLM_sets_top_10_clean/NLM3.csv',
         label_column='tissue_name', random_state=0):
    """Cluster a labelled table with k-means and score it after label matching.

    Steps: read the CSV, integer-encode the true classes, run k-means with one
    cluster per class, match cluster ids to class ids with the Hungarian
    (Munkres) algorithm, then print the matched confusion matrix and accuracy.

    Parameters
    ----------
    csv_path : str
        CSV file to load.  The default is the original author's local path,
        kept so that ``main()`` behaves as before; pass your own location.
    label_column : str
        Column holding the true class of each row; every other column is
        passed to k-means as a feature.
    random_state : int
        Seed handed to ``KMeans``.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    dataset = pd.read_csv(csv_path)  # loads the dataset
    data = dataset.drop([label_column], axis=1)
    classes_actual = dataset[label_column]

    le = preprocessing.LabelEncoder()
    classes = le.fit_transform(classes_actual)
    num_labels = len(np.unique(classes))

    # One cluster per true class: make_cost_matrix requires the cluster ids
    # and the encoded class ids to be the same set of values.
    algo = KMeans(n_clusters=num_labels, random_state=random_state)
    labels = algo.fit(data).labels_  # performs the algo and gets the predicted labels

    # Rows of the cost matrix follow the cluster ids, columns the class ids.
    cost_matrix = make_cost_matrix(labels, classes)

    m = Munkres()
    indexes = m.compute(cost_matrix)
    # Each (row, column) pair of the assignment maps a cluster id to a class id.
    mapper = {old: new for (old, new) in indexes}

    new_labels = translate_clustering(labels, mapper)
    new_cm = confusion_matrix(classes, new_labels, labels=range(num_labels))
    print("---------------------\nHungarian method confusion matrix:\n\n"
          " %s\naccuracy: %.2f" % (str(new_cm), accuracy(new_cm)))


# Run the pipeline only when this code is executed as the top-level program.
if __name__ == "__main__":
    main()


---------------------
Hungarian method confusion matrix:

 [[ 500    0    4    0   21    0    0    0    0    0]
 [   0 1014    0    0    0    0    0    0    0    0]
 [ 885    0  754    1   11    0    0  445    0    0]
 [   0    0    0  816    0    0    0    0    1    0]
 [   0    0    0    0  530    0   15    0    5    0]
 [   0    0    0    0    0  333    0    0    0    0]
 [   0    0    0    0    0    0 1362    1    0    2]
 [   0    0    0    0    0    0    2    0    0  329]
 [   1    0    0    0    0    0    0    0  816    0]
 [   0    0    0    0    0    0    6    0    0  513]]
accuracy: 0.79
