In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns 
from sklearn.cluster import KMeans
from scipy import stats
from sklearn.decomposition import PCA

In [2]:
# Load the three per-suffix tables; the first CSV column is used as the row index.
data_points_1, data_points_2, data_points_3 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        './data/1_5_CPMcutoff_suffix_1_log_cero_replacement.csv',
        './data/1_5_CPMcutoff_suffix_2_log_cero_replacement.csv',
        './data/1_5_CPMcutoff_suffix_3_log_cero_replacement.csv',
    )
)

# Join the tables side by side on their shared index (pandas' default inner
# join): first table 1 with table 2, then that result with table 3.
data_points = (
    data_points_1
    .merge(data_points_2, left_index=True, right_index=True)
    .merge(data_points_3, left_index=True, right_index=True)
)

In [28]:
class Random_K_Means():
    '''
    K-means-driven generator of random starting values (the original note
    calls this "K means for random SME initialization").

    The rows of ``data`` are clustered with KMeans, either on the raw values
    or on a PCA projection of them. Every cluster is summarised by a single
    mean and a single standard deviation, and one random value per cluster is
    then drawn from an evenly spaced grid spanning mean +/- 2 standard
    deviations.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose rows are clustered.
    cluster_size : int, default 20
        Number of clusters requested from KMeans (passed as ``n_clusters``).
    pca_dim : int, default 4
        Number of principal components kept in ``self.data_pca``.
    random_state : None, int or numpy.random.RandomState, default None
        Source of randomness for both KMeans and the grid draw. ``None``
        keeps the previous behaviour (NumPy's global random state). An int
        seeds a private ``RandomState`` that is created once and shared by
        every later call, so repeated calls still differ from each other but
        the whole sequence is reproducible.
    '''

    # The sampling window is mean +/- this many standard deviations ...
    _HALF_WIDTH_IN_STDS = 2
    # ... split into this many equal intervals (i.e. 11 candidate grid points).
    _GRID_INTERVALS = 10
    # Class-level default so that methods also work on an instance that was
    # created without going through __init__.
    _rng = None

    def __init__(self, data, cluster_size = 20, pca_dim = 4, random_state = None):
        '''
        Store the settings and immediately compute the PCA projection.

        data is a pandas dataframe
        '''
        self.data = data
        self.cluster_size = cluster_size
        self.pca_dim = pca_dim
        if random_state is None or isinstance(random_state, np.random.RandomState):
            self._rng = random_state
        else:
            self._rng = np.random.RandomState(random_state)
        self.pca()

    def k_means_labels(self, pca = False):
        '''
        Returns K means labels with the data and the specified cluster size.

        When ``pca`` is true the PCA projection (``self.data_pca``) is
        clustered instead of the raw data.
        '''
        source = self.data_pca if pca else self.data
        model = KMeans(
            n_clusters=self.cluster_size,
            tol=1e-8,
            max_iter=1000,
            # None (the default) makes KMeans use NumPy's global random state.
            random_state=self._rng,
        )
        return model.fit(source.values).labels_

    def pca(self):
        '''
        Fits a PCA with ``pca_dim`` components on the data and stores the
        projected values as a new dataframe in ``self.data_pca``.
        '''
        pca = PCA(n_components=self.pca_dim, svd_solver='full')
        self.data_pca = pd.DataFrame(pca.fit_transform(self.data.values))

    def get_random_starts_limits(
        self,
        labels,
        mean_geom= False,
        std_geom = False
    ):
        '''
        Summarise every cluster by one mean and one standard deviation.

        For each distinct label (in sorted order) the per-column means and
        per-column standard deviations of the rows carrying that label are
        computed and then collapsed to a single number each: the geometric
        mean when the matching ``*_geom`` flag is set, the arithmetic mean
        otherwise.

        ``labels`` may be any array-like with one entry per row of the data.

        Returns a tuple ``(means, stds)`` of 1-D numpy arrays with one entry
        per distinct label.

        NOTE: pandas' default sample standard deviation is NaN for a cluster
        that contains a single row; that NaN is passed through unchanged.
        '''
        # A plain list compared with ``==`` yields one bool rather than a
        # row mask, so coerce to an array first.
        labels = np.asarray(labels)
        means, stds = [], []
        for label in np.unique(labels):
            sub_data = self.data[labels == label]
            column_means = sub_data.mean(axis = 0)
            column_stds = sub_data.std()
            means.append(
                stats.gmean(column_means) if mean_geom else column_means.mean()
            )
            stds.append(
                stats.gmean(column_stds) if std_geom else column_stds.mean()
            )
        return np.array(means), np.array(stds)

    def get_center_sample_from_data(
        self,
        pca = False,
        mean_geom = False,
        std_geom = False
    ):
        '''
        Run KMeans once and draw one random value per cluster.

        Each value is picked uniformly from the 11 evenly spaced grid points
        between ``mean - 2 * std`` and ``mean + 2 * std`` of its cluster.

        Returns a tuple ``(samples, stds)`` of 1-D numpy arrays with one
        entry per cluster.
        '''
        means, stds = self.get_random_starts_limits(
            self.k_means_labels(pca = pca),
            mean_geom = mean_geom,
            std_geom = std_geom,
        )
        rng = self._rng if self._rng is not None else np.random
        # One integer step in [0, _GRID_INTERVALS] per cluster.
        steps = rng.randint(self._GRID_INTERVALS + 1, size = means.size)
        lower = means - self._HALF_WIDTH_IN_STDS * stds
        step_width = stds * (2 * self._HALF_WIDTH_IN_STDS) / self._GRID_INTERVALS
        return lower + steps * step_width, stds
        


In [30]:

def get_mean_std(iterations = 10, cluster_sizes = None, data = None):
    '''
    Collect random center samples over a sweep of cluster sizes.

    For every cluster size one ``Random_K_Means`` object is built, and in each
    iteration ``get_center_sample_from_data`` is called four times: with and
    without PCA, each with the geometric and the arithmetic aggregation of
    the standard deviations (``mean_geom`` is always False).

    Parameters
    ----------
    iterations : int, default 10
        Number of repetitions per cluster size.
    cluster_sizes : iterable of int, optional
        Cluster sizes to sweep. Defaults to ``range(3, 31)``.
    data : pandas.DataFrame, optional
        Table to cluster. Defaults to the notebook-level ``data_points``.

    Returns
    -------
    (means, stds) : tuple of lists
        One entry per call, in call order; each entry is whatever
        ``get_center_sample_from_data`` returned for that call.
    '''
    if cluster_sizes is None:
        cluster_sizes = range(3, 31)
    if data is None:
        data = data_points

    # Same order as nested loops with PCA outermost: (T,T), (T,F), (F,T), (F,F).
    flag_grid = [
        (pca_bool, std_bool)
        for pca_bool in (True, False)
        for std_bool in (True, False)
    ]

    means, stds = [], []
    for cluster_size in cluster_sizes:
        random_centers = Random_K_Means(data = data, cluster_size = cluster_size)
        for i in range(iterations):
            # Progress marker: "<cluster size> - <iteration>,"
            print (cluster_size, '-' , i, end=',')
            for pca_bool, std_bool in flag_grid:
                mean, std = random_centers.get_center_sample_from_data(
                    pca = pca_bool,
                    mean_geom = False,
                    std_geom = std_bool
                )
                means.append(mean)
                stds.append(std)
    return means, stds


In [31]:
%%time
# Run the sweep with 4 iterations per cluster size and keep the sampled
# means / stds for the cells below (the recorded output reports ~11 min wall time).
means, stds = get_mean_std(iterations=4)

3 - 0,3 - 1,3 - 2,3 - 3,4 - 0,4 - 1,4 - 2,4 - 3,5 - 0,5 - 1,5 - 2,5 - 3,6 - 0,6 - 1,6 - 2,6 - 3,7 - 0,7 - 1,7 - 2,7 - 3,8 - 0,8 - 1,8 - 2,8 - 3,9 - 0,9 - 1,9 - 2,9 - 3,10 - 0,10 - 1,10 - 2,10 - 3,11 - 0,11 - 1,11 - 2,11 - 3,12 - 0,12 - 1,12 - 2,12 - 3,13 - 0,13 - 1,13 - 2,13 - 3,14 - 0,14 - 1,14 - 2,14 - 3,15 - 0,15 - 1,15 - 2,15 - 3,16 - 0,16 - 1,16 - 2,16 - 3,17 - 0,17 - 1,17 - 2,17 - 3,18 - 0,18 - 1,18 - 2,18 - 3,19 - 0,19 - 1,19 - 2,19 - 3,20 - 0,20 - 1,20 - 2,20 - 3,21 - 0,21 - 1,21 - 2,21 - 3,22 - 0,22 - 1,22 - 2,22 - 3,23 - 0,23 - 1,23 - 2,23 - 3,24 - 0,24 - 1,24 - 2,24 - 3,25 - 0,25 - 1,25 - 2,25 - 3,26 - 0,26 - 1,26 - 2,26 - 3,27 - 0,27 - 1,27 - 2,27 - 3,28 - 0,28 - 1,28 - 2,28 - 3,29 - 0,29 - 1,29 - 2,29 - 3,30 - 0,30 - 1,30 - 2,30 - 3,CPU times: user 11min 18s, sys: 79 ms, total: 11min 18s
Wall time: 11min 18s


In [33]:
# Append every sampled mean / std vector as one comma-separated line.
# Both files are opened once around the loop instead of being reopened on
# every iteration. Mode 'a' is kept, so re-running this cell appends the
# same lines again.
with open('../../cluster_code/data/sem/mean.txt','a') as mean_file, \
     open('../../cluster_code/data/sem/std.txt','a') as std_file:
    # Index loop kept on purpose: `i` stays defined afterwards and a later
    # cell evaluates means[i].
    for i in range(len(means)):
        mean_file.write(','.join(means[i].astype(str))+'\n')
        std_file.write(','.join(stds[i].astype(str))+'\n')

In [18]:
# Preview of one serialized line. `i` is not assigned in this cell, so this
# relies on a loop variable left over from another cell.
','.join(means[i].astype(str))

'0.8453492911321658,3.822710533474045,1.91919132461764,4.708717631834883,2.7176542177044767,4.708074747938298,5.577464737285457,4.457913057543891,3.187995461289483,6.037493625306631,1.3846300142664534,4.333611892911507,4.54090029342451,2.5944899463326827,3.801257513110568,1.4479737110629811,3.8841149399691783,0.6923066090508357,5.94546159021206,2.050983397184033'

In [22]:
# Display the merged table (the recorded output below is truncated with "..." rows).
data_points

Unnamed: 0_level_0,C42_1,C42B_1,LNCAP_1,MR49F_1,C42_2,C42B_2,LNCAP_2,MR49F_2,C42_3,C42B_3,LNCAP_3,MR49F_3
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSG00000000003,2.020308,5.802458,3.463763,3.090618,3.112967,3.613909,6.038525,4.856630,4.462864,5.196959,3.020616,2.818450
ENSG00000000419,4.223350,7.149923,5.323355,2.755359,1.740524,2.633272,7.286943,5.089189,3.583093,5.046861,2.032127,3.712340
ENSG00000000457,2.869919,1.596058,3.677716,4.178899,2.559274,1.156747,4.129601,2.305354,3.979699,2.912101,1.965240,1.173922
ENSG00000000460,1.751904,3.421740,3.052585,3.070865,2.288179,2.349831,5.859453,3.197595,4.507209,1.407528,2.044142,1.739958
ENSG00000001036,3.481197,5.683603,5.241517,4.872132,3.873432,3.554372,4.548365,4.504407,6.349452,3.415789,2.861295,3.250109
...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000283577,0.000669,0.668517,0.001541,0.089811,0.010203,1.013959,0.003394,0.023367,0.006603,0.683460,0.000346,1.100471
ENSG00000283633,0.000142,0.378588,0.002325,0.000000,0.001276,2.312889,0.002761,0.000000,0.000734,2.672558,0.000581,0.000000
ENSG00000283667,0.080286,0.202083,2.258987,1.889044,0.891543,0.645639,0.162713,0.525394,2.111469,0.690646,0.169281,3.102266
ENSG00000283674,0.503064,1.205395,2.181762,1.286498,3.560078,2.148792,0.831979,0.403138,4.192234,2.678159,1.423840,3.155403
