In [1]:
import sys
sys.path.insert(0, '../../')
import ccal
%matplotlib inline
%config InlineBackend.figure_formats = {'svg',}

import numpy as np
import pandas as pd

from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import linkage, fcluster, cophenet
from scipy.spatial.distance import pdist

<20:14:41> Checking dependencies ...
<20:14:41> Using the following packages:
<20:14:41> 	matplotlib (v1.5.1)
<20:14:41> 	numpy (v1.10.4)
<20:14:41> 	pandas (v0.18.0)
<20:14:41> 	rpy2 (v2.7.9)
<20:14:41> 	scikit-learn (v0.17.1)
<20:14:41> 	scipy (v0.17.0)
<20:14:41> 	seaborn (v0.7.0)


In [2]:
# Cluster counts (k) to evaluate: every value in range(kmin, kmax + 1, kincrement).
kmin= 1
kmax = 5
kincrement = 1
# Number of clustering runs performed for each k before the runs are combined
# into a co-occurrence matrix.
nclustering = 3
# Matrix read from a GCT file; its row index names the items whose cluster
# labels are recorded per k.
# presumably the H matrix of a k=9 factorization (inferred from the file name) -- TODO confirm
h_matrix = ccal.support.read_gct('../../../hematopoietic_cancer/result/ccle_hema/k9/ccle_hema_k_9_H.gct')

In [None]:
def compare_matrices(matrix1, matrix2, is_distance=False, plot=False, figure_filename=None):
    """
    Compare every row of `matrix1` with every row of `matrix2`.

    Each cell of the result is `information_coefficient(row1, row2)`; when
    `is_distance` is true the result is converted to `1 - association`.

    NOTE(review): this function uses `print_log`, `information_coefficient`,
    `sns`, `plt`, `CMAP_CONTINUOUS` and `establish_path`, none of which are
    defined or imported in the visible notebook cells -- they must be in scope
    before this function is called.

    :param matrix1: pandas DataFrame; its rows become the rows of the result
    :param matrix2: pandas DataFrame; its rows become the columns of the result
    :param is_distance: bool; return 1 - association instead of association
    :param plot: bool; draw a clustermap of the resulting matrix
    :param figure_filename: str or None; if set (and `plot` is true), the
        clustermap figure is saved to this path
    :return: pandas DataFrame; (matrix1.index x matrix2.index) of floats
    """
    compared_matrix = pd.DataFrame(index=matrix1.index, columns=matrix2.index, dtype=float)
    n_rows1 = matrix1.shape[0]

    for position, (label1, row1) in enumerate(matrix1.iterrows(), start=1):
        print_log('Features 1 {} ({}/{}) vs. features 2 ...'.format(label1, position, n_rows1))
        for label2, row2 in matrix2.iterrows():
            # .loc is the label-based accessor; .ix is deprecated/removed in newer pandas.
            compared_matrix.loc[label1, label2] = information_coefficient(row1, row2)

    if is_distance:
        print_log('Converting association to is_distance (is_distance = 1 - association) ...')
        compared_matrix = 1 - compared_matrix

    if plot:
        print_log('Plotting the resulting matrix ...')
        # sns.clustermap returns a ClusterGrid (not a matplotlib Axes).
        clustergrid = sns.clustermap(compared_matrix, cmap=CMAP_CONTINUOUS)
        plt.setp(clustergrid.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
        plt.setp(clustergrid.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
        if figure_filename:
            establish_path(figure_filename)
            # Save the figure itself; the original wrote the matrix as a TSV
            # to the figure path while logging that a figure had been saved.
            clustergrid.savefig(figure_filename)
            print_log('Saved the resulting figure as {}.'.format(figure_filename))

    return compared_matrix

In [None]:
# For each k in kmin..kmax: cluster the self-compared matrix `nclustering`
# times, turn the runs into a co-occurrence matrix, then derive final labels
# and a cophenetic correlation from a hierarchical clustering of
# (1 - co-occurrence). One row of `results` per k.
results = pd.DataFrame(index=range(kmin, kmax + 1, kincrement),
                       columns=list(h_matrix.index) + ['cophenetic_correlation'])

# Standardize (ccal.support.standardize_pandas_object), clip to [-3, 3], and
# compare the matrix with itself row by row.
standardized_h_matrix = ccal.support.standardize_pandas_object(h_matrix)
standardized_and_clipped_h_matrix = standardized_h_matrix.clip(-3, 3)
compared_matrix = ccal.analyze.compare_matrices(standardized_and_clipped_h_matrix, standardized_and_clipped_h_matrix)

n_items = len(compared_matrix.index)

for k in range(kmin, kmax + 1, kincrement):
    print(k)

    # One row of cluster labels per clustering run.
    # NOTE(review): AgglomerativeClustering is fitted on identical input each
    # time with no resampling, so every run is expected to yield the same
    # labels -- confirm whether per-run subsampling was intended.
    run_labels = np.empty((nclustering, n_items), dtype=int)
    for run in range(nclustering):
        model = AgglomerativeClustering(n_clusters=k)
        model.fit(compared_matrix)
        run_labels[run] = model.labels_

    # Fraction of runs in which each pair of items shares a cluster. The
    # broadcast comparison counts every (i, j) pair at once, including the
    # diagonal, replacing the per-cell Python loop.
    cooccurrence_counts = np.zeros((n_items, n_items))
    for labels_of_run in run_labels:
        cooccurrence_counts += labels_of_run[:, None] == labels_of_run[None, :]
    cooccurences = pd.DataFrame(cooccurrence_counts / nclustering,
                                index=compared_matrix.index,
                                columns=compared_matrix.columns)

    distances = 1 - cooccurences
    # NOTE(review): linkage() and pdist() receive the square matrix, so its
    # rows are treated as observation vectors rather than as precomputed
    # pairwise distances. Kept as in the original; confirm this is intended.
    # NOTE(review): for k == 1 all distances are 0, so the cophenetic
    # correlation is presumably undefined (NaN) -- verify.
    link = linkage(distances.values, method='ward')
    final_labels = fcluster(link, k, criterion='maxclust')
    cophenetic_correlation = cophenet(link, pdist(distances.values))[0]

    # .loc is the label-based accessor; .ix is deprecated/removed in newer pandas.
    results.loc[k, h_matrix.index] = final_labels
    results.loc[k, 'cophenetic_correlation'] = cophenetic_correlation

results