# Figure 1 Visualisation of the main workflow
### Kernel-PRECISE: Generating non-linear subspace representations to transfer predictors of response from pre-clinical models to human tumor
Code for reproducing Figure 1. Figure 1 was made for visualisation purposes; it nevertheless relies on real data.

In [1]:
import os, sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import uuid
import scipy
# Global seaborn styling, set once here before any figure is drawn.
sns.set_style("whitegrid")
sns.set_context('paper')

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import kernel_metrics
from sklearn.decomposition import KernelPCA

sys.path.insert(0, '../read_data/')
from read_data import read_data
from reformat_df import reformat_df
import library_size_normalization

from transact.pv_computation import PVComputation
from transact.interpolation import Interpolation
from transact.matrix_operations import _center_kernel, _right_center_kernel, _left_center_kernel
from transact.kernel_computer import KernelComputer
from transact import TRANSACT

In [None]:
# Experiment configuration.
# da: domain adaptation -- `source` is the domain transferred from,
# `target` the domain transferred to.
source, target = 'GDSC', 'TCGA'
data_sources = [source, target]

# Per-dataset tissue and project selection passed to the data reader.
tissues = {name: ['All'] for name in ('TCGA', 'GDSC')}
projects = dict.fromkeys(('TCGA', 'GDSC'))

# Data modality and the normalization label associated with it.
data_types = ['rnaseq']
data_normalization = 'library_size'

## Read data

In [None]:
# Load the data for every configured data source. `read_data` receives
# its own copy of the `data_types` list.
data_df = read_data(
    tissues=tissues,
    data_types=list(data_types),
    projects=projects,
    data_sources=data_sources,
    folder_basis='../data/',
)

# `reformat_df` hands back the two keys used below to address the source
# and target entries of `data_df`.
source_data_key, target_data_key = reformat_df(data_df, source, target)

In [None]:
# Remove healthy samples for TCGA: keep every target row whose barcode
# does not contain the substring '11A'.
# NOTE(review): the test matches '11A' anywhere in the barcode, not only in
# a dedicated sample-type field -- confirm this cannot drop tumor samples.
target_barcodes = data_df[target_data_key].index
target_unhealthy_index = list(
    filter(lambda barcode: '11A' not in barcode, target_barcodes)
)
data_df[target_data_key] = data_df[target_data_key].loc[target_unhealthy_index]

In [None]:
# Library size normalization, applied to every dataset in `data_df`:
# TMM normalization, rescaling to a common average depth, then log(x + 1).
average_depth_global = 10**5
for ds in list(data_df.keys()):
    raw_values = data_df[ds].values.astype(float)
    GE_normalized = np.array(
        library_size_normalization.TMM_normalization(raw_values)
    )

    # Mean of the axis-1 sums (presumably one sum per sample -- confirm the
    # orientation); after rescaling this mean equals `average_depth_global`.
    average_depths = np.mean(np.sum(GE_normalized, 1))
    GE_normalized = GE_normalized / average_depths * average_depth_global
    GE_normalized = np.log(GE_normalized + 1)

    # Put the original row and column labels back on the normalized values.
    data_df[ds] = pd.DataFrame(
        GE_normalized,
        columns=data_df[ds].columns,
        index=data_df[ds].index,
    )

In [None]:
# Normalization: each dataset is standardized independently, with its own
# freshly fitted scaler. The result of `fit_transform` replaces the
# DataFrame, so `normalized_data_df` holds plain arrays.
with_mean = True
with_std = True

normalized_data_df = {}
for dataset_name, expression_df in data_df.items():
    scaler = StandardScaler(with_mean=with_mean, with_std=with_std)
    normalized_data_df[dataset_name] = scaler.fit_transform(expression_df)

In [None]:
# Make figure folder: every run writes its figures into a new
# ./figures/<uuid> directory, so earlier outputs are never overwritten.
figure_uid = str(uuid.uuid4())
# Draw again in the (very unlikely) case the identifier is already taken.
while os.path.exists('./figures/%s' % (figure_uid)):
    figure_uid = str(uuid.uuid4())
# makedirs also creates ./figures itself when it does not exist yet, so the
# notebook can run from a fresh checkout.
os.makedirs('./figures/%s' % (figure_uid))

In [None]:
# Read tissue information: attach a TCGA project label to every target
# sample and keep the samples belonging to the selected projects.
TCGA_biospecimen_file = '../data/TCGA/pancancer_sample_spec.csv'
TCGA_tissues = ['BRCA', 'SKCM', 'COAD', 'HNSC']
TCGA_tissues = ['TCGA-' + e for e in TCGA_tissues]

# Compute barcode of samples: the expression index is expected to use one
# single barcode length, which is then used to truncate the annotation
# barcodes so that both tables share the same key.
len_barcode_TCGA = [len(e) for e in data_df[target_data_key].index]
len_barcode_TCGA = np.unique(len_barcode_TCGA)
if len_barcode_TCGA.shape[0] != 1:
    print('WARNING: PB WITH BARCODE')
len_barcode_TCGA = len_barcode_TCGA[0]

TCGA_sample_tissues = pd.read_csv(TCGA_biospecimen_file, sep=',')
# Explicit copy: the 'barcode' column is overwritten just below, and
# assigning into a column-subset slice triggers SettingWithCopyWarning.
TCGA_sample_tissues = TCGA_sample_tissues[['barcode', 'project']].copy()
TCGA_sample_tissues['barcode'] = TCGA_sample_tissues['barcode'].str[:len_barcode_TCGA]
TCGA_sample_tissues = TCGA_sample_tissues.drop_duplicates().set_index('barcode')
# Right join: one row per expression sample, in the expression order;
# samples without annotation get a missing project.
TCGA_sample_tissues = TCGA_sample_tissues.merge(data_df[target_data_key],
                                                left_index=True,
                                                right_index=True,
                                                how='right')
TCGA_sample_tissues = TCGA_sample_tissues[['project']]
TCGA_sample_tissues['project'] = TCGA_sample_tissues['project'].astype(str)
# Guards against a barcode matching several projects, which would duplicate
# rows in the join and break the alignment with the expression data.
pd.testing.assert_index_equal(TCGA_sample_tissues.index, data_df[target_data_key].index)

# Select corresponding samples
# NOTE(review): only `data_df` is filtered here. `normalized_data_df` was
# built before this cell and is what the rest of the notebook uses, so this
# selection does not reach the figures -- confirm whether that is intended.
data_df[target_data_key] = data_df[target_data_key][TCGA_sample_tissues['project'].isin(TCGA_tissues)]

## Select a few samples for visualisation purposes

In [None]:
# Number of samples kept on each side for the visualisation.
n_target_samples = 50
n_source_samples = 50

# Draw distinct row positions without replacement. The target draw comes
# first, then the source draw, so the global NumPy random stream is
# consumed in the same order as before.
target_pool = np.arange(normalized_data_df[target_data_key].shape[0])
target_index = np.random.choice(target_pool, n_target_samples, replace=False)
normalized_data_df[target_data_key] = normalized_data_df[target_data_key][target_index]

source_pool = np.arange(normalized_data_df[source_data_key].shape[0])
source_index = np.random.choice(source_pool, n_source_samples, replace=False)
normalized_data_df[source_data_key] = normalized_data_df[source_data_key][source_index]

## Similarity values
### Hyperparameter of the kernel
We here use a smaller dataset that does not completely recapitulate the (pan-cancer) diversity of the experiments. For this reason, we do not use the same hyper-parameters as in the experiments.

In [None]:
# Kernel used for all similarity computations: scikit-learn's pairwise
# kernel looked up by name, plus the keyword arguments passed to it.
kernel_name = 'rbf'
kernel_param = dict(gamma=0.00005)
kernel = kernel_metrics()[kernel_name]

# Label describing the kernel set-up; only used inside figure file names.
kernel_surname = 'rbf_gamma_0_00005_centered_standardized'

# Number of kernel principal components kept on each side.
number_pc = dict(source=10, target=10)

In [None]:
# Pairwise kernel values within and between the two sample sets.
source_samples = normalized_data_df[source_data_key]
target_samples = normalized_data_df[target_data_key]

k_s = kernel(source_samples, **kernel_param)
k_t = kernel(target_samples, **kernel_param)
k_st = kernel(source_samples, target_samples, **kernel_param)
k_ts = k_st.T

# Centered blocks, evaluated in the same order as they appear in the
# final matrix (top-left, top-right, bottom-left, bottom-right).
centered_ss = _center_kernel(k_s)
centered_st = _left_center_kernel(_right_center_kernel(k_st))
centered_ts = _left_center_kernel(_right_center_kernel(k_ts))
centered_tt = _center_kernel(k_t)

# Full (source + target) x (source + target) matrix, source block first.
kernel_matrix = np.block([
    [centered_ss, centered_st],
    [centered_ts, centered_tt],
])

n_source, n_target = k_s.shape[0], k_t.shape[0]

In [None]:
# Cluster each (uncentered) kernel matrix separately; the clustermaps are
# only used for their row ordering, so their figures are cleared right away.
g_s = sns.clustermap(k_s, cmap='seismic_r')
plt.clf()

g_t = sns.clustermap(k_t, cmap='seismic_r')
plt.clf()

plt.figure(figsize=(8, 8))

# Source rows come first in `kernel_matrix`, so target positions are
# shifted by the number of source samples.
source_order = np.array(g_s.dendrogram_row.reordered_ind)
target_order = n_source + np.array(g_t.dendrogram_row.reordered_ind)
ordered_ind = np.concatenate([source_order, target_order])

# Apply the same ordering to rows and columns of the centered matrix.
reordered_kernel_matrix = kernel_matrix[ordered_ind][:, ordered_ind]
sns.heatmap(reordered_kernel_matrix,
            cmap='seismic_r',
            cbar=False,
            center=0,
            vmin=-0.15,
            vmax=.15)
plt.xticks([])
plt.yticks([])
plt.tight_layout()

centered_kernel_figure = './figures/%s/centered_kernel_matrix_source_%s_target_%s_%s_n_target_%s.png'%(
    figure_uid,
    '_'.join(tissues[source]),
    '_'.join(tissues[target]),
    kernel_surname,
    n_target_samples)
plt.savefig(centered_kernel_figure, dpi=300)

## PRECISE+ show-case
### NLPC (Kernel principal components) coefficients
#### Source

In [None]:
# Kernel PCA on the source samples; the number of components comes from
# the shared `number_pc` setting instead of a hard-coded value.
source_clf = KernelPCA(kernel=kernel_name,
                       n_components=number_pc['source'],
                       **kernel_param)
source_clf.fit(normalized_data_df[source_data_key])

# scikit-learn 1.0 renamed `alphas_` / `lambdas_` to `eigenvectors_` /
# `eigenvalues_` and removed the old names in 1.2; support both.
if hasattr(source_clf, 'eigenvectors_'):
    source_eigenvectors = source_clf.eigenvectors_
    source_eigenvalues = source_clf.eigenvalues_
else:
    source_eigenvectors = source_clf.alphas_
    source_eigenvalues = source_clf.lambdas_

# Sample coefficients of each component, scaled by 1 / sqrt(eigenvalue).
source_coef = source_eigenvectors / np.sqrt(source_eigenvalues)

# One row per component: component order is reversed, then five are shown.
sns.clustermap(source_coef.T[::-1][:5],
               cmap='seismic_r',
               row_cluster=False,
               figsize=(20, 5),
               center=0)
plt.savefig('./figures/%s/source_PC_sample_coef_kernel_matrix_source_%s_target_%s_%s.png'%(
    figure_uid,
    '_'.join(tissues[source]),
    '_'.join(tissues[target]),
    kernel_surname),
            dpi=300)
plt.show()

#### Target

In [None]:
# Kernel PCA on the target samples; the number of components comes from
# the shared `number_pc` setting instead of a hard-coded value.
target_clf = KernelPCA(kernel=kernel_name,
                       n_components=number_pc['target'],
                       **kernel_param)
target_clf.fit(normalized_data_df[target_data_key])

# scikit-learn 1.0 renamed `alphas_` / `lambdas_` to `eigenvectors_` /
# `eigenvalues_` and removed the old names in 1.2; support both.
if hasattr(target_clf, 'eigenvectors_'):
    target_eigenvectors = target_clf.eigenvectors_
    target_eigenvalues = target_clf.eigenvalues_
else:
    target_eigenvectors = target_clf.alphas_
    target_eigenvalues = target_clf.lambdas_

# Sample coefficients of each component, scaled by 1 / sqrt(eigenvalue).
target_coef = target_eigenvectors / np.sqrt(target_eigenvalues)

# One row per component: component order is reversed, then five are shown.
sns.clustermap(target_coef.T[::-1][:5],
               cmap='seismic_r',
               row_cluster=False,
               figsize=(20, 5),
               center=0)
plt.savefig('./figures/%s/target_PC_sample_coef_kernel_matrix_source_%s_target_%s_%s.png'%(
    figure_uid,
    '_'.join(tissues[source]),
    '_'.join(tissues[target]),
    kernel_surname),
            dpi=300)
plt.show()

### Principal Vectors

In [None]:
# Compute principal vectors
# `KernelPRECISE` is never imported in this notebook and raised a
# NameError; the estimator imported from the `transact` package is
# `TRANSACT`. The variable keeps its name because later cells use
# `PRECISE_clf`.
PRECISE_clf = TRANSACT(kernel=kernel_name,
                       kernel_params=kernel_param,
                       n_components=number_pc,
                       n_jobs=20,
                       verbose=10)

# First fit without interpolation: the following cells only read
# `PRECISE_clf.principal_vectors_`.
PRECISE_clf.fit(normalized_data_df[source_data_key],
                normalized_data_df[target_data_key],
                with_interpolation=False)

#### Cosine similarity

In [None]:
# Heatmap of the absolute values of the cosine similarity matrix stored
# on the fitted principal vectors.
abs_cosine_similarity = np.abs(PRECISE_clf.principal_vectors_.cosine_similarity_)

plt.figure(figsize=(4, 3))
sns.heatmap(abs_cosine_similarity, cmap='seismic_r', center=0, vmax=1, cbar=False)
plt.xticks([])
plt.yticks([])

cosine_similarity_figure = './figures/%s/cosine_similarity_matrix_%s_target_%s_%s.png'%(
    figure_uid,
    '_'.join(tissues[source]),
    '_'.join(tissues[target]),
    kernel_surname)
plt.savefig(cosine_similarity_figure, dpi=300)

#### Alignment

In [None]:
# Diagonal heatmap of the cosines of the canonical angles, each value
# also written next to its diagonal cell.
pv_cosines = np.cos(PRECISE_clf.principal_vectors_.canonical_angles)

plt.figure(figsize=(5, 3))
sns.heatmap(np.diag(pv_cosines), cmap='seismic_r', center=0, vmax=1, cbar=False)

# The label of cell (i, i) is placed at a fixed offset from the cell.
for i in range(PRECISE_clf.principal_vectors_.n_pv):
    plt.text(i + 1.4, i + .8, '%1.2f' % pv_cosines[i], fontsize=25, color='black')

plt.xticks([])
plt.yticks([])
plt.tight_layout()

alignment_figure = './figures/%s/cosine_similarity_matrix_pv_%s_target_%s_%s.png'%(
    figure_uid,
    '_'.join(tissues[source]),
    '_'.join(tissues[target]),
    kernel_surname)
plt.savefig(alignment_figure, dpi=300)

### Interpolation

In [None]:
# Refit the same estimator on the same data, now with interpolation
# enabled and five principal vectors requested.
interpolation_fit_options = dict(n_pv=5, with_interpolation=True)
PRECISE_clf.fit(
    normalized_data_df[source_data_key],
    normalized_data_df[target_data_key],
    **interpolation_fit_options
)

### Projection

In [None]:
# Project both sample sets with the fitted estimator (source first, then
# target) and stack the two results, source rows on top.
source_samples_to_project = normalized_data_df[source_data_key]
target_samples_to_project = normalized_data_df[target_data_key]

source_consensus_features = PRECISE_clf.transform(source_samples_to_project)
target_consensus_features = PRECISE_clf.transform(target_samples_to_project)

all_consensus_features = np.concatenate(
    [source_consensus_features, target_consensus_features]
)

In [None]:
# Scatter plot of the first two consensus features for both sample sets.
# `x` and `y` are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, and older releases accept the keyword form as well.
sns.scatterplot(x=source_consensus_features[:n_source_samples, 0],
                y=source_consensus_features[:n_source_samples, 1],
                label='pre-clinical', marker='v', s=100, color='red')
sns.scatterplot(x=target_consensus_features[:, 0],
                y=target_consensus_features[:, 1],
                label='tumors', marker='o', s=100, color='blue')
plt.legend(fontsize=15)
plt.xlabel('1st Consensus Feature', fontsize=20, color='black')
plt.ylabel('2nd Consensus\n Feature', fontsize=20, color='black')
plt.tight_layout()
plt.savefig('./figures/%s/consensus_features_scatterplot.png'%(figure_uid), dpi=300)
plt.show()