In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics.cluster import adjusted_rand_score

# Original data and Poisson results

In [2]:
# Load the simulated count table; the first CSV column is used as the row index.
data = pd.read_csv('./results/sim_data_1.csv', index_col=0)

# Keep only the rows whose largest entry is strictly positive (i.e. drop all-zero rows).
row_max = data.values.max(axis=1)
data_no_zeros = data.loc[row_max > 0].copy()

data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12
1,2927,1044,2335,3388,4852,11406,15518,12748,9922,8330,3505,7150
2,1952,2164,8922,9126,9842,12343,21543,15797,10793,8293,3380,7053
3,2536,1851,5193,4373,4492,6039,7176,3887,2723,1971,973,1937
4,27665,10160,14679,11232,12541,18152,23555,18284,15177,13343,7562,15900
5,1729,447,557,283,277,303,394,299,227,240,148,292


## Zero replacement
Replace every zero with a value drawn uniformly between 0 and 1; this might help the log-transformed datasets and the non-Poisson algorithms.

In [3]:
# Build a copy of `data` in which every exact zero is replaced by a draw from
# Uniform[0, min_element). Columns without zeros are left untouched (dtype included).
min_element = 1  # upper bound of the replacement draw; the same for every column
zero_rng = np.random.RandomState(0)  # local, seeded generator so the saved table is reproducible

data_cero_replace = data.copy()
for col_name in data.columns:
    # Message text is kept verbatim so it matches the recorded cell output
    # (note: despite saying "row", col_name is a column label).
    print('Minimum element different than cero in row {}\t: {}'.format(col_name,min_element))
    zero_mask = (data[col_name] == 0).values
    if not zero_mask.any():
        continue
    # Cast to float first: the replacement values are fractional.
    column_values = data[col_name].values.astype(float)
    # One vectorised draw per column instead of a Python-level loop over every cell.
    column_values[zero_mask] = zero_rng.uniform(0, min_element, size=int(zero_mask.sum()))
    data_cero_replace[col_name] = column_values

Minimum element different than cero in row V1	: 1
Minimum element different than cero in row V2	: 1
Minimum element different than cero in row V3	: 1
Minimum element different than cero in row V4	: 1
Minimum element different than cero in row V5	: 1
Minimum element different than cero in row V6	: 1
Minimum element different than cero in row V7	: 1
Minimum element different than cero in row V8	: 1
Minimum element different than cero in row V9	: 1
Minimum element different than cero in row V10	: 1
Minimum element different than cero in row V11	: 1
Minimum element different than cero in row V12	: 1


In [4]:
# Labels used as ground truth for the ARI scores; first CSV column is the index.
# Assumed to be in the same row order as `data` — TODO confirm.
labels = pd.read_csv('./results/sim_data_1_labels.csv', index_col=0)

# Apply the same "row has at least one positive entry" filter used for data_no_zeros.
keep_row = data.values.max(axis=1) > 0
labels_no_zeros = labels.loc[keep_row]

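# Save the databases
Save each variant as a CSV file. Only the untransformed tables are written here; the log-transformed versions are computed in memory further below.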

In [5]:
# Write every table to ./sim_databases as CSV.
# The directory is created first: to_csv does not create missing parent
# directories, so a fresh checkout would otherwise fail here.
os.makedirs('./sim_databases', exist_ok=True)

# index=False: the row index is not written, so rows are identified by position only.
for frame, csv_path in [
    (labels, './sim_databases/labels_complete.csv'),
    (labels_no_zeros, './sim_databases/labels_no_zeros.csv'),
    (data_cero_replace, './sim_databases/data_cero_replace.csv'),
    (data, './sim_databases/data.csv'),
    (data_no_zeros, './sim_databases/data_no_zeros.csv'),
]:
    frame.to_csv(csv_path, index = False)

In [6]:
# Poisson clustering output (scored as 'poisson_mix' below); first CSV column is the index.
poisson_results = pd.read_csv(
    './results/sim_data_poisson_clustering_29.csv',
    index_col=0,
)

# Get ARI results

In [7]:
# Start the results table with the Poisson-mixture score.
# Built directly from a list of records: DataFrame.append (used previously) is
# deprecated since pandas 1.4 and removed in pandas 2.0.
results = pd.DataFrame(
    [{
        'method':'poisson_mix',
        # ARI between the labels of the non-all-zero rows and the Poisson assignments
        'ARI':adjusted_rand_score(labels_no_zeros.x.values, poisson_results.x.values),
        'database':'no_zeros'
    }],
    columns = ['method', 'ARI', 'database'],
)

In [8]:
# One (name, feature table, ground-truth labels) triple per dataset variant.
raw_variants = [
    ('complete', data, labels),
    ('cero_replace', data_cero_replace, labels),
    ('no_zeros', data_no_zeros, labels_no_zeros),
]
# Log-transformed counterparts: log(x + 1), evaluated against the same labels.
log_variants = [
    (variant_name + '_log', np.log(frame + 1), truth)
    for variant_name, frame, truth in raw_variants
]

# Unzip into the three parallel lists used by the clustering cells.
datasets_names, datasets, datasets_gt = (
    list(field) for field in zip(*(raw_variants + log_variants))
)

# K-means results

In [9]:
from sklearn.cluster import KMeans

# Run k-means on every dataset variant and score it against its ground truth.
# 29 clusters: the same number that appears in the Poisson results file name.
kmeans_records = []
for name, dataset, truth in zip(datasets_names, datasets, datasets_gt):
    # Fixed random_state so the ARI values are reproducible across runs.
    kmeans = KMeans(n_clusters = 29, random_state = 0).fit(dataset.values)
    kmeans_records.append({
        'method':'k-means',
        'ARI':adjusted_rand_score(truth.x.values, kmeans.labels_),
        'database':name
    })

# Drop any k-means rows left by an earlier run of this cell (so re-running does
# not duplicate them), then add the new rows with a single concat;
# DataFrame.append is deprecated since pandas 1.4 and removed in pandas 2.0.
results = pd.concat(
    [
        results[results['method'] != 'k-means'],
        pd.DataFrame(kmeans_records, columns = ['method', 'ARI', 'database']),
    ],
    ignore_index = True,
)

In [10]:
%%time
from sklearn.cluster import SpectralClustering
for idx, dataset in enumerate(datasets):
    X = dataset.values
    clustering = SpectralClustering(n_clusters=29,assign_labels='discretize', n_jobs=-1, affinity='nearest_neighbors').fit(X)
    results = results.append({
        'method':'spectral_clustering',
        'ARI':adjusted_rand_score(datasets_gt[idx].x.values, clustering.labels_),
        'database':datasets_names[idx]
    }, ignore_index = True)    

CPU times: user 1min 1s, sys: 84.7 ms, total: 1min 1s
Wall time: 54.8 s


In [12]:
# Display the accumulated ARI table (one row per method / dataset combination).
results

Unnamed: 0,method,ARI,database
0,poisson_mix,0.725597,no_zeros
1,k-means,0.010214,complete
2,k-means,0.012913,cero_replace
3,k-means,0.013186,no_zeros
4,k-means,0.088843,complete_log
5,k-means,0.091425,cero_replace_log
6,k-means,0.091615,no_zeros_log
7,spectral_clustering,0.233935,complete
8,spectral_clustering,0.233728,cero_replace
9,spectral_clustering,0.242071,no_zeros
