In [1]:
# Load IPython's autoreload extension and set it to mode 2, so edits to the
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [15]:
import itertools

import numpy as np

from clusim.sim import *
from clusim.clustering import *
from clusim.clugen import *
from clusim.plotutils import *
from clusim.clusimelement import *

In [3]:
# Show the names of the similarity measures made available by the imports.
# Single-argument print(...) produces the same text on Python 2 and Python 3.
print(available_similarity_measures)

# Row templates for the result tables printed by later cells: two or three
# right-aligned columns, each 25 characters wide.
row_format2 = "{:>25}" * 2
row_format3 = "{:>25}" * 3

['jaccard_index', 'rand_index', 'fowlkes_mallows_index', 'rogers_tanimoto_index', 'southwood_index', 'fmeasure', 'nmi', 'vi', 'geometric_accuracy', 'overlap_quality', 'nmi_lfk', 'omega_index']


In [4]:
# Build two clusterings of the same 10 elements, one with 2 clusters and one
# with 5 clusters.
# NOTE(review): no random seed is set here, so the clusterings (and every
# number printed below) presumably change from run to run - confirm which RNG
# make_random_clustering uses and seed it if reproducible output is wanted.
c1 = make_random_clustering(n_elements=10, n_clusters=2)
c2 = make_random_clustering(n_elements=10, n_clusters=5)

print_clustering(c1)
print_clustering(c2)

# Score the pair with every available similarity measure, one table row each.
# The measure name is resolved to its function through the notebook namespace
# instead of assembling a source string and passing it to eval().
for simfunc in available_similarity_measures:
    similarity_function = globals()[simfunc]
    print(row_format2.format(simfunc, similarity_function(c1, c2)))

0|35|87|14|926
847|6|3|15|092
            jaccard_index           0.181818181818
               rand_index                      0.8
    fowlkes_mallows_index           0.308606699924
    rogers_tanimoto_index           0.666666666667
          southwood_index           0.222222222222
                 fmeasure           0.307692307692
                      nmi           0.677079390702
                       vi            1.42646625065
       geometric_accuracy                      0.7
          overlap_quality                     -0.0
                  nmi_lfk           0.547850223247
              omega_index           0.191616766467


In [5]:
# Tabulate adjrand_index(c1, c2) once per available random model.
# print(...) with a single argument behaves the same on Python 2 and 3.
for rdm in available_random_models:
    adjusted_score = adjrand_index(c1, c2, random_model=rdm)
    print(row_format2.format(rdm, adjusted_score))

                     perm        0.191616766467066
                    perm1        0.191616766467066
                      num        0.268677971300877
                     num1        0.254174169258472
                      all        0.329276680430622
                     all1        0.288682156384636


In [6]:
# Tabulate adj_mi(c1, c2) once per available random model.
# print(...) with a single argument behaves the same on Python 2 and 3.
for rdm in available_random_models:
    adjusted_score = adj_mi(c1, c2, random_model=rdm)
    print(row_format2.format(rdm, adjusted_score))

                     perm        0.215408520829728
                    perm1        0.215408520829728
                      num          0.2676906312963
                     num1        0.264686107121247
                      all         0.35706933098181
                     all1        0.310546011977348


In [10]:
# Compare the analytical expected Rand index with the mean Rand index taken
# over the clustering ensembles produced by clustering_ensemble_generator,
# once for every available random model.
N = 5

c1 = make_equal_clustering(n_elements=N, n_clusters=2)
c2 = make_random_clustering(n_elements=N, n_clusters=3, random_model='num')

print_clustering(c1)
print_clustering(c2)

for rdm in available_random_models:
    if rdm.endswith('1'):
        # Model names with a trailing '1' ('perm1', 'num1', 'all1'): only c1
        # is expanded into an ensemble, using the model name without the
        # suffix, while c2 is kept as the single fixed clustering.
        random_model_ensemble1 = list(clustering_ensemble_generator(c1, random_model=rdm[:-1]))
        random_model_ensemble2 = [c2]
    else:
        # Otherwise both clusterings are expanded under the same model.
        random_model_ensemble1 = list(clustering_ensemble_generator(c1, random_model=rdm))
        random_model_ensemble2 = list(clustering_ensemble_generator(c2, random_model=rdm))

    # Rand index for every pairing of a member of ensemble 1 with a member of
    # ensemble 2.
    all_pairwise_rand = [
        rand_index(rc1, rc2)
        for rc1, rc2 in itertools.product(random_model_ensemble1, random_model_ensemble2)
    ]

    analytical_exp_rand = expected_rand_index(
        n_elements=c1.n_elements,
        n_clusters1=c1.n_clusters,
        n_clusters2=c2.n_clusters,
        clus_size_seq1=c1.clus_size_seq,
        clus_size_seq2=c2.clus_size_seq,
        random_model=rdm,
    )

    # Columns: random model, analytical expectation, ensemble mean.
    print(row_format3.format(rdm, analytical_exp_rand, np.mean(all_pairwise_rand)))

024|13
0|13|24
                     perm                     0.56                     0.56
                    perm1                     0.56                     0.56
                      num        0.517333333333333           0.517333333333
                     num1                     0.52                     0.52
                      all        0.589497041420118            0.58949704142
                     all1        0.626923076923077           0.626923076923


In [13]:
# Compare the analytical expected mutual information with the mean
# un-normalized mutual information (nmi with norm_type='none') taken over the
# clustering ensembles produced by clustering_ensemble_generator, once for
# every available random model.
N = 5

c1 = make_random_clustering(n_elements=N, random_model='all')
c2 = make_random_clustering(n_elements=N, n_clusters=3, random_model='num')

print_clustering(c1)
print_clustering(c2)

for rdm in available_random_models:
    if rdm.endswith('1'):
        # Model names with a trailing '1' ('perm1', 'num1', 'all1'): only c1
        # is expanded into an ensemble, using the model name without the
        # suffix, while c2 is kept as the single fixed clustering.
        random_model_ensemble1 = list(clustering_ensemble_generator(c1, random_model=rdm[:-1]))
        random_model_ensemble2 = [c2]
    else:
        # Otherwise both clusterings are expanded under the same model.
        random_model_ensemble1 = list(clustering_ensemble_generator(c1, random_model=rdm))
        random_model_ensemble2 = list(clustering_ensemble_generator(c2, random_model=rdm))

    # Mutual information for every pairing of a member of ensemble 1 with a
    # member of ensemble 2.
    all_pairwise_mi = [
        nmi(rc1, rc2, norm_type='none')
        for rc1, rc2 in itertools.product(random_model_ensemble1, random_model_ensemble2)
    ]

    analytical_exp_mi = expected_mi(
        n_elements=c1.n_elements,
        n_clusters1=c1.n_clusters,
        n_clusters2=c2.n_clusters,
        clus_size_seq1=c1.clus_size_seq,
        clus_size_seq2=c2.clus_size_seq,
        random_model=rdm,
    )

    # Columns: random model, analytical expectation, ensemble mean.
    print(row_format3.format(rdm, analytical_exp_mi, np.mean(all_pairwise_mi)))

0|12|4|3
02|14|3
                     perm         1.20192809488736            1.20192809489
                    perm1         1.20192809488736            1.20192809489
                      num         1.15753709471429            1.15753709471
                     num1         1.20192809488736            1.20192809489
                      all         0.73538377366121           0.735383773661
                     all1        0.803822746742339           0.803822746742


In [31]:
# Element-centric Clustering Similarity

# Value passed as `alpha` to every element-centric call in this cell.
# NOTE(review): the meaning of alpha is defined by clusim, not shown here -
# confirm against the element_sim documentation before tuning it.
ALPHA = 0.9

# First consider the case of two partitions (no overlap or hierarchy).
clustering1 = Clustering(elm2clus_dict={0: [0], 1: [0], 2: [1], 3: [1], 4: [2], 5: [2]})
clustering2 = Clustering(elm2clus_dict={0: [0], 1: [1], 2: [1], 3: [1], 4: [1], 5: [2]})

print_clustering(clustering1)
print_clustering(clustering2)
# Messages are built with str.format so that print(...) receives one argument
# and emits the same text on Python 2 and Python 3.
print("Element-centric Similarity: {}".format(element_sim(clustering1, clustering2, alpha=ALPHA)))
print("Element-centric Similarity for each element:")
nodeScores, relabeled_elements = element_sim_elscore(clustering1, clustering2, alpha=ALPHA)
print(nodeScores)

# Blank separator line between the two examples.
print("")

# Now let's consider an example with overlap: element 2 belongs to clusters
# 0 and 1 at the same time.
clustering3 = Clustering(elm2clus_dict={0: [0], 1: [0], 2: [0, 1], 3: [1], 4: [2], 5: [2]})
print("Element-centric Similarity: {}".format(element_sim(clustering1, clustering3, alpha=ALPHA)))
print("Element-centric Similarity for each element:")
nodeScores, relabeled_elements = element_sim_elscore(clustering1, clustering3, alpha=ALPHA)
print(nodeScores)

01|23|45
0|1234|5
Element-centric Similarity: 0.416666666667
Element-centric Similarity for each element:
[ 0.5   0.25  0.5   0.5   0.25  0.5 ]

Element-centric Similarity: 0.701754385965
Element-centric Similarity for each element:
[ 0.45614035  0.45614035  0.61403509  0.68421053  1.          1.        ]
