# Fig 3A and Fig 3C: Proportion of non-linearities in consensus features and UMAP plot for the GDSC-TCGA analysis
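This notebook generates panels 3A and 3C of Figure 3: the proportion of offset, linear, interaction and higher-order contributions in each consensus feature (3A), and the UMAP plot of GDSC and TCGA samples projected on the consensus features (3C).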

In [2]:
import os, sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from joblib import Parallel, delayed
import scipy
from datetime import date
import umap
import pylab
sns.set_style("whitegrid")
sns.set_context('paper')

from sklearn.preprocessing import StandardScaler

sys.path.insert(0, '../read_data/')
from read_data import read_data
from read_GDSC_response import read_GDSC_response
from reformat_df import reformat_df
import library_size_normalization

from transact.pv_computation import PVComputation
from transact.interpolation import Interpolation
from transact.matrix_operations import _center_kernel, _right_center_kernel, _left_center_kernel
from transact.kernel_computer import KernelComputer
from transact.TRANSACT import TRANSACT

from compute_proportion import compute_proportion

In [5]:
# Data sources
# One entry per dataset; these dictionaries are passed as-is to `read_data`.
data_sources = ['GDSC', 'TCGA']
tissues = dict(TCGA=['All'], GDSC=['All'])
projects = dict(TCGA=[None], GDSC=None)

data_types = ['rnaseq']
genes_filtering = 'mini'
data_normalization = 'library_size'

# Direction of the alignment: GDSC is the source, TCGA the target.
source, target = 'GDSC', 'TCGA'

# TRANSACT analysis
# Kernel definition (`kernel_surname` is a label for this kernel setting).
kernel_surname = 'rbf_gamma_0_0005'
kernel_name = 'rbf'
kernel_param = dict(gamma=0.0005)

# Number of principal components per dataset, passed to TRANSACT as `n_components`.
number_pc = dict(source=70, target=150)
n_pv = 30               # passed as `n_pv` to TRANSACT.fit
n_interpolation = 100   # passed as `step` to TRANSACT.fit
n_jobs = 20             # passed as `n_jobs` to TRANSACT

## Read data

In [6]:
# Load the datasets described in the configuration cell.
read_data_kwargs = dict(
    tissues=tissues,
    data_types=list(data_types),  # pass a copy of the configured list
    projects=projects,
    data_sources=data_sources,
    folder_basis='../data/',
)
data_df = read_data(**read_data_kwargs)

# Keys under which the source and target datasets are accessed in `data_df`.
source_data_key, target_data_key = reformat_df(data_df, source, target)

DATA UPLOADED
GENE HARMONIZED ACROSS VIEWS
CHECKED FOR GOOD HARMONIZATION


In [7]:
# Removing healthy samples from the target dataset.
# The alternation uses a non-capturing group `(?:...)`: with a capturing group,
# `str.contains` emits "UserWarning: This pattern has match groups" although
# the groups are never used. The set of matched barcodes is unchanged.
healthy_sample_pattern = r'-(?:10A|11A)-'
healthy_samples_index = data_df[target_data_key].index.str.contains(healthy_sample_pattern)
data_df[target_data_key] = data_df[target_data_key].loc[~healthy_samples_index]

  return func(self, *args, **kwargs)


In [8]:
# Library size normalization
# Every dataset is TMM-normalized, rescaled so that the mean per-row total equals
# `average_depth_global`, and finally log(x + 1)-transformed.
average_depth_global = 10**5
for ds in list(data_df.keys()):
    raw_expression = data_df[ds]
    GE_normalized = np.array(
        library_size_normalization.TMM_normalization(raw_expression.values.astype(float))
    )
    # Mean over rows of the per-row totals
    average_depths = np.sum(GE_normalized, 1).mean()
    GE_normalized = np.log(GE_normalized / average_depths * average_depth_global + 1)
    # Restore the original row and column labels
    data_df[ds] = pd.DataFrame(GE_normalized,
                               index=raw_expression.index,
                               columns=raw_expression.columns)

In [9]:
# Normalization
# Each dataset gets its own StandardScaler, fitted on that dataset only.
with_mean = True
with_std = True

normalized_data_df = {}
for dataset_key, dataset_values in data_df.items():
    dataset_scaler = StandardScaler(with_mean=with_mean, with_std=with_std)
    normalized_data_df[dataset_key] = dataset_scaler.fit_transform(dataset_values)

In [None]:
# Map every target (TCGA) sample to its project.

# Barcodes of the expression data are expected to share a single length; that
# length is used below to truncate the biospecimen barcodes before matching.
len_barcode_TCGA = np.unique([len(e) for e in data_df[target_data_key].index])
if len_barcode_TCGA.shape[0] != 1:
    # Best effort: only warn, then go on with the first (smallest) length found.
    print('WARNING: PB WITH BARCODE')
len_barcode_TCGA = len_barcode_TCGA[0]

TCGA_biospecimen_file = '../data/TCGA/pancancer_sample_spec.csv'
# Built as a single chain with `assign`/`astype` instead of assigning columns on
# a frame obtained by column selection, which raises SettingWithCopyWarning.
TCGA_sample_tissues = (
    pd.read_csv(TCGA_biospecimen_file, sep=',')[['barcode', 'project']]
    .assign(barcode=lambda df: df['barcode'].str[:len_barcode_TCGA])
    .drop_duplicates()
    .set_index('barcode')
    # Right join: one row per expression sample, NaN project when unmatched
    .merge(data_df[target_data_key], left_index=True, right_index=True, how='right')
    [['project']]
    .astype({'project': str})
)
# Guard: the annotation must be aligned row by row with the expression data.
pd.testing.assert_index_equal(TCGA_sample_tissues.index, data_df[target_data_key].index)

## TRANSACT alignment

In [4]:
# Compute principal vectors
# Settings come from the configuration cell.
transact_settings = dict(
    kernel=kernel_name,
    kernel_params=kernel_param,
    n_components=number_pc,
    n_jobs=n_jobs,
    verbose=10,
)
TRANSACT_clf = TRANSACT(**transact_settings)

# Fit on the standardized source and target data, with interpolation enabled.
TRANSACT_clf.fit(
    normalized_data_df[source_data_key],
    normalized_data_df[target_data_key],
    n_pv=n_pv,
    step=n_interpolation,
    with_interpolation=True,
)

NameError: name 'normalized_data_df' is not defined

In [None]:
# Project data
# Both datasets go through the fitted TRANSACT model; each result is wrapped in a
# DataFrame that carries the sample index of the corresponding dataset.
source_consensus_features, target_consensus_features = (
    pd.DataFrame(TRANSACT_clf.transform(normalized_data_df[dataset_key]),
                 index=data_df[dataset_key].index)
    for dataset_key in (source_data_key, target_data_key)
)

## Proportion of consensus features

In [None]:
# Only the second element returned by `compute_proportion` is used below.
proportion_kwargs = dict(
    n_pc=number_pc,
    n_pv=n_pv,
    normalized_data_df=normalized_data_df,
    source_data_key=source_data_key,
    target_data_key=target_data_key,
    clf=TRANSACT_clf,
)
_, consensus_feature_contribution = compute_proportion(kernel_param['gamma'],
                                                       **proportion_kwargs)

In [None]:
# Gather the 'consensus' entry of every contribution and relabel the three columns.
contribution_df = pd.DataFrame({
    term: values['consensus']
    for term, values in consensus_feature_contribution.items()
})
contribution_df.columns = ['offset', 'linear', 'interaction']
# Remainder once the three contributions above have been subtracted from 1
contribution_df['higher order'] = 1 - contribution_df.sum(axis=1)

# Stacked bars, one bar per row of `contribution_df`
contribution_df.plot.bar(stacked=True, figsize=(10,6.5), width=0.7)
plt.legend(bbox_to_anchor=(0.9, 1.05), loc=4, borderaxespad=0., fontsize=15, ncol=2)

# y axis: percentages from 0% to 100%
yticks = np.linspace(0,1,6)
ytick_labels = ['%s%%'%(int(100*y)) for y in yticks]
plt.yticks(yticks, ytick_labels, fontsize=25, color='black')
plt.ylabel('Geometric proportion \n of different contributions', fontsize=25, color='black')
plt.ylim(0,1.001)

# x axis: bars are labelled starting at 1
xtick_positions = np.arange(n_pv)
xtick_labels = (xtick_positions + 1).astype(str)
plt.xticks(xtick_positions, xtick_labels, fontsize=20, color='black')
plt.xlabel('Consensus feature number', fontsize=30, color='black')

plt.tight_layout()
plt.savefig('./figures/Fig_3A', dpi=300)

## UMAP plot

In [None]:
# Groups of labels that are plotted with the same color. The first entry of each
# group is the name displayed in the legend; the remaining entries are the other
# labels (cell-line tissues and tumor project identifiers) belonging to the group.
tissue_correspondance = [
    ['Breast', 'TCGA-BRCA'],
    ['Skin', 'TCGA-SKCM'],
    ['Large Intestine', 'Small Intestine', 'TCGA-COAD', 'TCGA-READ'],
    ['Lung', 'TCGA-MESO', 'TCGA-LUAD', 'TCGA-LUSC'],
    ['Head and Neck', 'TCGA-HNSC'],
    ['Prostate', 'TCGA-PRAD'],
    ['Pancreas', 'TCGA-PAAD'],
    ['Brain', 'Central Nervous System', 'TCGA-GBM', 'TCGA-LGG'],
    ['Blood', 'Haematopoietic and Lymphoid', 'TARGET-AML', 'TARGET-ALL-P2', 'TCGA-DLBC', 'TCGA-LAML'],
    ['Kidney', 'TCGA-KICH', 'TCGA-KIRC', 'TCGA-KIRP'],
    ['Ovary', 'TCGA-OV'],
    ['Uterus', 'TCGA-UCS', 'TCGA-UCEC'],
    ['Liver', 'TCGA-CHOL', 'TCGA-LIHC'],
    ['Thyroid', 'TCGA-THCA'],
    ['Stomach', 'TCGA-STAD'],
    ['Bladder', 'TCGA-BLCA'],
    ['Cervix', 'Vulva', 'TCGA-CESC'],
    ['Esophagus', 'TCGA-ESCA'],
    ['Peripheral Nervous System', 'Adrenal Gland', 'TCGA-PCPG', 'TCGA-ACC'],
    ['Bone', 'Soft Tissue', 'TCGA-SARC'],
    ['Testis', 'TCGA-TGCT'],
    ['Thymus', 'TCGA-THYM'],
    ['Eye', 'TCGA-UVM'],
    ['Endometrium']
]

# One color per group, matched by position with `tissue_correspondance`.
colors = ['green',
          'red',
          'darksalmon',
          'orchid',
          'gold',
          'olivedrab',
          'deepskyblue',
          'navy',
          'm',
          'sandybrown',
          'slategray',
          'chocolate',
          'silver',
          'coral',
          'magenta',
          'orange',
          'violet',
          'yellowgreen',
          'pink',
          'teal',  # was a second 'orchid', identical to the color of the 'Lung' group
          'darkblue',
          'plum',
          'khaki',
          'sienna',
          'tomato']
assert len(colors) >= len(tissue_correspondance)
# Two groups sharing a color cannot be told apart on the plot.
assert len(set(colors)) == len(colors), 'duplicated color in `colors`'

### UMAP computation

In [None]:
# UMAP hyper-parameters
n_neighbors = 100
n_epochs = 2000
metric = 'cosine'
# UMAP is stochastic: without a fixed seed the embedding, and therefore the saved
# figures, change on every run.
# NOTE(review): umap-learn may restrict parallelism when a seed is set - confirm
# the run time is still acceptable.
umap_random_state = 0
umap_embedding = umap.UMAP(n_neighbors=n_neighbors,
                           n_epochs=n_epochs,
                           metric=metric,
                           random_state=umap_random_state,
                           verbose=True)

# Source samples are stacked first, target samples second; the plotting cells rely
# on this order to split `X_projected_umap` back into the two datasets.
X = np.concatenate([np.array(source_consensus_features),
                    np.array(target_consensus_features)])
X_projected_umap = umap_embedding.fit_transform(X)

### Cell lines vs TCGA

In [None]:
# The first `n_source_samples` rows of the embedding are the source (GDSC)
# samples, the remaining rows are the target (TCGA) samples.
n_source_samples = source_consensus_features.shape[0]
dataset_layers = [
    # (legend label, rows of the embedding, marker, alpha) - TCGA is plotted first
    ('TCGA', slice(n_source_samples, None), 'x', 0.7),
    ('GDSC', slice(None, n_source_samples), 'v', 0.9),
]

plt.figure(figsize=(15,10))
for layer_label, layer_rows, layer_marker, layer_alpha in dataset_layers:
    sns.scatterplot(x=X_projected_umap[layer_rows, 0],
                    y=X_projected_umap[layer_rows, 1],
                    label=layer_label, marker=layer_marker, alpha=layer_alpha)

plt.xticks(fontsize=12)
plt.xlabel('UMAP direction 1', fontsize=20, color='black')
plt.ylabel('UMAP direction 2', fontsize=20, color='black')
# Legend anchored outside the axes, on the right
plt.legend(fontsize=12, ncol=1, bbox_to_anchor=(1.25, 1.))
plt.tight_layout()
plt.savefig('figures/GDSC_vs_TCGA.png', dpi=300)
plt.show()

### UMAP colored by tissue

In [None]:
# Two figures: `fig` holds the scatter plot, `figlegend` only holds the legend.
fig = pylab.figure(figsize=(20,14))
figlegend = pylab.figure(figsize=(20,14))
ax = fig.add_subplot(111)

# Loop-invariant lookups, hoisted out of the tissue loop.
n_source_samples = source_consensus_features.shape[0]
source_tissues = data_df[source_data_key].index.get_level_values(1)
target_projects = TCGA_sample_tissues['project']

for c, t in zip(colors, tissue_correspondance):
    X_source_index = np.where(np.isin(source_tissues, t))[0]
    # Offset: target rows come after the source rows in `X_projected_umap`
    X_target_index = np.where(np.isin(target_projects, t))[0] + n_source_samples

    sns.scatterplot(x=X_projected_umap[X_target_index,0],
                    y=X_projected_umap[X_target_index,1],
                    label='Tumor - %s'%(t[0]),
                    marker='x',
                    alpha=0.7,
                    color=c,
                    s=100, ax=ax)
    sns.scatterplot(x=X_projected_umap[X_source_index,0],
                    y=X_projected_umap[X_source_index,1],
                    label='Cell line - %s'%(t[0]), marker='o',
                    alpha=1, color=c, s=100, ax=ax)

# Tissue type not indicated: source samples whose label is in none of the groups
X_source_index = np.where(~np.isin(source_tissues, np.concatenate(tissue_correspondance)))[0]
sns.scatterplot(x=X_projected_umap[X_source_index,0],
                y=X_projected_umap[X_source_index,1],
                label='GDSC - rest', marker='v', alpha=0.7, color='black', ax=ax)

ax.tick_params(axis='both', labelsize=30)
ax.set_xlabel('UMAP direction 1', fontsize=40, color='black')
ax.set_ylabel('UMAP direction 2', fontsize=40, color='black')

# Draw the legend on `figlegend` explicitly instead of relying on which figure
# happens to be the current one for pyplot.
figlegend.legend(*ax.get_legend_handles_labels(), loc='upper left', ncol=2, fontsize=20)
figlegend.tight_layout()
figlegend.savefig('./figures/GDSC_to_TCGA_UMAP_legend.png',
                  dpi=300)

# Remove the legend from the scatter plot; `ax.legend([])` would create a new,
# empty legend rather than remove the existing one.
scatter_legend = ax.get_legend()
if scatter_legend is not None:
    scatter_legend.remove()
# `plt.tight_layout()` acts on the current figure, which is `figlegend` (created
# last), so the layout of `fig` has to be requested on `fig` itself.
fig.tight_layout()
fig.savefig('./figures/GDSC_to_TCGA_UMAP.png',
            dpi=300)
plt.show()