In [77]:
import pandas as pd
import numpy as np
from sklearn.metrics.cluster import adjusted_rand_score

# Original data and Poisson results

In [78]:
# Load the simulated data table; the first CSV column becomes the row index.
data = pd.read_csv('./human_sim_data/sim_data_1.csv', index_col=0)

# Keep only the rows whose largest entry is positive, as an independent copy.
row_has_positive = data.values.max(axis=1) > 0
data_no_zeros = data.loc[row_has_positive].copy()

data.head()

Unnamed: 0,V1,V2,V3,V4
1,37,0,27,0
2,10,17,11,20
3,25,14,21,13
4,16,0,13,0
5,109,63,72,46


## Zero replacement
Replace every zero with a value drawn uniformly from [0, 1). This may help the log-transformed datasets and the non-Poisson algorithms.

In [79]:
# Replace every zero in `data` with a random draw from [0, min_element),
# working on a copy so that `data` itself is left untouched.
data_cero_replace = data.copy()
for col_name in data.columns:
    # NOTE(review): the message below says "row" and "minimum element", but
    # col_name is a column label and the bound is fixed at 1, not computed.
    min_element = 1
    print('Minimum element different than cero in row {}\t: {}'.format(col_name,min_element))
    zero_mask = (data_cero_replace[col_name] == 0).values
    n_zeros = int(zero_mask.sum())
    if n_zeros == 0:
        continue  # nothing to replace; the column keeps its original dtype
    # Cast to float explicitly before writing: the draws are fractional, and
    # writing them cell by cell into an integer column relies on implicit
    # upcasting, which recent pandas versions warn about.
    filled = data_cero_replace[col_name].values.astype(float)
    # One vectorised draw per column instead of one scalar .iloc write per cell.
    filled[zero_mask] = np.random.uniform(0, min_element, size=n_zeros)
    data_cero_replace[col_name] = filled

Minimum element different than cero in row V1	: 1
Minimum element different than cero in row V2	: 1
Minimum element different than cero in row V3	: 1
Minimum element different than cero in row V4	: 1


In [80]:
# Ground-truth labels; the first CSV column becomes the row index.
labels = pd.read_csv('./human_sim_data/sim_data_1_labels.csv', index_col=0)

# Keep the labels of the rows of `data` whose largest entry is positive.
keep_row = data.values.max(axis=1) > 0
labels_no_zeros = labels.loc[keep_row]

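# Save the databases
Save the label and data tables as CSV files. Note: the log-transformed variants are not written by the cell below; they are computed in memory later, when the list of datasets is assembled.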

In [81]:
# Write each table to ./human_sim_data/ as CSV, without the row index.
csv_exports = [
    ('./human_sim_data/labels_complete.csv', labels),
    ('./human_sim_data/labels_no_zeros.csv', labels_no_zeros),
    ('./human_sim_data/data_cero_replace.csv', data_cero_replace),
    ('./human_sim_data/data.csv', data),
    ('./human_sim_data/data_no_zeros.csv', data_no_zeros),
]
for csv_path, frame in csv_exports:
    frame.to_csv(csv_path, index=False)

# Get ARI results

In [82]:
# Clustering output read from sim_data_poisson_clustering_14.csv; its `x`
# column is scored against the `x` column of the no-zeros labels.
poisson_results = pd.read_csv('./human_sim_data/sim_data_poisson_clustering_14.csv',index_col=0)

# Build the results table directly from its first record. DataFrame.append
# was removed in pandas 2.0, so the table is no longer created empty and
# then appended to.
results = pd.DataFrame(
    [{
        'method': 'poisson_mix',
        'ARI': adjusted_rand_score(labels_no_zeros.x.values, poisson_results.x.values),
        'database': 'no_zeros',
    }],
    columns=['method', 'ARI', 'database'],
)
results

Unnamed: 0,method,ARI,database
0,poisson_mix,0.507364,no_zeros


In [83]:
# Each variant is a (name, feature table, ground-truth labels) triple.
raw_variants = [
    ('complete', data, labels),
    ('cero_replace', data_cero_replace, labels),
    ('no_zeros', data_no_zeros, labels_no_zeros),
]
# The log variants apply log(x + 1) to the same tables and reuse the same labels.
log_variants = [
    (variant_name + '_log', np.log(frame + 1), ground_truth)
    for variant_name, frame, ground_truth in raw_variants
]
# Unzip into the three parallel lists used by the clustering cells.
datasets_names, datasets, datasets_gt = map(list, zip(*(raw_variants + log_variants)))

# K-means results

In [84]:
from sklearn.cluster import KMeans

# Fit k-means with 15 clusters on every dataset variant and score the
# resulting labels against that variant's ground truth with the ARI.
kmeans_rows = []
for name, dataset, ground_truth in zip(datasets_names, datasets, datasets_gt):
    X = dataset.values
    # Fixed seed: k-means initialisation is random, so without it the ARI
    # values in the results table change on every run.
    kmeans = KMeans(n_clusters = 15, random_state = 0).fit(X)
    kmeans_rows.append({
        'method':'k-means',
        'ARI':adjusted_rand_score(ground_truth.x.values, kmeans.labels_),
        'database':name
    })

# DataFrame.append was removed in pandas 2.0: collect the rows and concatenate
# once. Rows from a previous run of this cell are dropped first so that
# re-running it does not duplicate the k-means entries.
results = pd.concat(
    [results[results['method'] != 'k-means'], pd.DataFrame(kmeans_rows)],
    ignore_index = True,
)

In [85]:
%%time
from sklearn.cluster import SpectralClustering

# Fit spectral clustering (15 clusters, nearest-neighbours affinity) on every
# dataset variant and score the labels against that variant's ground truth.
spectral_rows = []
for name, dataset, ground_truth in zip(datasets_names, datasets, datasets_gt):
    X = dataset.values
    # Fixed seed so that the reported ARI values are the same on every run.
    clustering = SpectralClustering(
        n_clusters=15,
        assign_labels='discretize',
        n_jobs=-1,
        affinity='nearest_neighbors',
        random_state=0,
    ).fit(X)
    spectral_rows.append({
        'method':'spectral_clustering',
        'ARI':adjusted_rand_score(ground_truth.x.values, clustering.labels_),
        'database':name
    })

# DataFrame.append was removed in pandas 2.0: collect the rows and concatenate
# once. Rows from a previous run of this cell are dropped first so that
# re-running it does not duplicate the spectral-clustering entries.
results = pd.concat(
    [results[results['method'] != 'spectral_clustering'], pd.DataFrame(spectral_rows)],
    ignore_index = True,
)

CPU times: user 4.32 s, sys: 0 ns, total: 4.32 s
Wall time: 3.43 s


# npMSL results

In [86]:
# Table read from the npMSL posteriors file; the first CSV column is the index.
npMSL = pd.read_csv('human_sim_data/npMSL/posteriors.csv', index_col=0)

# Hard assignment: for each row, the position of its largest column value.
npMSL_hard_labels = npMSL.values.argmax(axis=1)
adjusted_rand_score(labels.x.values, npMSL_hard_labels)

0.15074350997294636

In [88]:
# Display the ARI table accumulated by the cells above.
results

Unnamed: 0,method,ARI,database
0,poisson_mix,0.507364,no_zeros
1,k-means,0.007275,complete
2,k-means,0.005097,cero_replace
3,k-means,0.006929,no_zeros
4,k-means,0.143557,complete_log
5,k-means,0.145019,cero_replace_log
6,k-means,0.14321,no_zeros_log
7,spectral_clustering,0.111167,complete
8,spectral_clustering,0.11162,cero_replace
9,spectral_clustering,0.110905,no_zeros
