In [1]:
from sklearn import manifold
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

from bokeh.models import Select, TextInput, ColumnDataSource
from bokeh.models import HoverTool, Legend
from bokeh.layouts import row,column,widgetbox
from bokeh.application.handlers import FunctionHandler
from bokeh.application import Application
from bokeh.plotting import figure,show,save
from bokeh.io import output_notebook
output_notebook()

Load the TPM gene expression dataset together with the clinical metadata, from which the disease and site of each sample are obtained.

In [None]:
# Gene-expression table: drop repeated column labels (first occurrence wins),
# index the rows by the "Gene" column, then transpose into `ex`.
genelev = (
    pd.read_csv(
        'https://xena.treehouse.gi.ucsc.edu:443/download/TumorCompendium_v10_PolyA_hugo_log2tpm_58581genes_2019-07-25.tsv',
        sep='\t',
    )
    .pipe(lambda frame: frame.loc[:, ~frame.columns.duplicated()])
    .set_index("Gene")
)
ex = genelev.transpose()

# Clinical table: keep only the sample id, disease and site columns,
# drop repeated column labels and index the rows by sample id.
clinical = (
    pd.read_csv(
        'https://xena.treehouse.gi.ucsc.edu:443/download/clinical_TumorCompendium_v10_PolyA_2019-07-25.tsv',
        sep="\t",
    )
    .loc[:, ['th_sampleid', 'disease', 'site_id']]
    .pipe(lambda frame: frame.loc[:, ~frame.columns.duplicated()])
    .set_index("th_sampleid")
)

Filter the dataset to remove the TCGA samples (9,806) and the TARGET samples (784), keeping only the samples from TH sites (sample IDs starting with `TH`).

In [None]:
# Keep only the rows whose index label starts with "TH" in both tables.
clinical_th = clinical.filter(regex='^TH', axis=0)
ex_th = ex.filter(regex='^TH', axis=0)

# Index-on-index merge (inner join by default) appends the clinical columns
# to the right of the expression columns.
ex_th = ex_th.merge(clinical_th, left_index=True, right_index=True)

# The last two columns are the ones contributed by the clinical table;
# everything before them is the expression part.
feature_labels = ex_th.columns[:-2]
annotation_labels = ex_th.columns[-2:]
x = ex_th.loc[:, feature_labels]
y = ex_th.loc[:, annotation_labels]

In [None]:
# Two-dimensional t-SNE embedding with a fixed random_state.
tsne = manifold.TSNE(
    n_components=2,
    perplexity=5,
    init="pca",
    random_state=0,
)

# The expression values are already log2(TPM + 1) normalized, so they are
# passed to t-SNE as they are.
Y = tsne.fit_transform(x)

# Label the embedding with the sample index and attach the clinical columns.
X = pd.DataFrame(Y, index=x.index, columns=["tsne1", "tsne2"]).merge(
    y, left_index=True, right_index=True
)

Two columns are added that give each sample the color assigned to its disease and the color assigned to its site.

In [None]:
def _random_hex_colors(count, rng):
    """Return ``count`` colour strings of the form '#RRGGBB' drawn from ``rng``.

    Parameters
    ----------
    count : int
        Number of colours to generate.
    rng : random.Random
        Generator used for every digit, so a seeded generator yields a
        reproducible palette.
    """
    hex_digits = '0123456789ABCDEF'
    return ["#" + ''.join(rng.choice(hex_digits) for _ in range(6))
            for _ in range(count)]


# Private, seeded generator: the palettes are identical after every
# "Restart & Run All" and the global `random` state is left untouched.
_palette_rng = random.Random(0)

X_bokeh = X.copy()

# Distinct labels in order of first appearance. Missing labels are excluded
# here and receive the 'Other' placeholder below.
site_ids = X_bokeh['site_id'].dropna().unique()
disease_names = X_bokeh['disease'].dropna().unique()

# One colour per distinct label, sized from the data rather than hard-coded,
# so the palettes always line up with the labels they colour.
palette_site = _random_hex_colors(len(site_ids), _palette_rng)
palette_disease = _random_hex_colors(len(disease_names), _palette_rng)

# Map every sample to the colour of its site / disease.
X_bokeh['color_site'] = (
    X_bokeh['site_id'].map(dict(zip(site_ids, palette_site))).fillna('Other')
)
X_bokeh['color_disease'] = (
    X_bokeh['disease'].map(dict(zip(disease_names, palette_disease))).fillna('Other')
)

# Expose the sample id (the index) as a regular column.
X_bokeh['Sample'] = X_bokeh.index

Plot the interactive t-SNE map.

In [None]:
def plot(doc):
    """Build the interactive t-SNE explorer and add it to a Bokeh document.

    The layout is a column of filter widgets next to a scatter plot of
    ``tsne1`` vs ``tsne2`` taken from the module-level ``X_bokeh`` frame.
    Samples matching every active filter are drawn opaque (alpha 1); all other
    samples are drawn faded (alpha 0.1). Rows of ``X_bokeh`` containing any
    missing value are never shown.

    Filters
    -------
    * "Sample", "Site", "Disease" text boxes: substring / regular-expression
      match (case-sensitive). Text that is not a valid regular expression is
      matched literally instead of raising.
    * "Site", "Disease" drop-downs: exact match on the chosen value
      ("All" disables the filter).
    * "Color by": colour points by site or by disease.

    Parameters
    ----------
    doc : bokeh Document
        Document supplied by the Bokeh application handler; the layout is
        attached to it with ``doc.add_root``.
    """
    import re  # local import: only needed to validate user-typed patterns

    # Rows with missing values are excluded everywhere below, so drop them once
    # instead of on every widget callback.
    plottable = X_bokeh.dropna()

    sample = TextInput(value="", title="Sample :")
    site = TextInput(value="", title="Site :")
    disease = TextInput(value="", title="Disease :")
    site_list = Select(title="Site", value="All",
                       options=['All'] + sorted(list(plottable.site_id.unique())))
    disease_list = Select(title="Disease", value="All",
                          options=['All'] + sorted(list(plottable.disease.unique())))
    color_by = Select(title="Color by", value='Site',
                      options=['Site', 'Disease'])

    source = ColumnDataSource(data=dict(tsne1=[], tsne2=[], color=[], disease=[],
                                        site_id=[], alpha=[], Sample=[]))

    TOOLTIPS = [('Sample', '@Sample'), ('site_id', '@site_id'),
                ('disease', '@disease')]

    p = figure(title="t-SNE of TH sites", plot_width=1000, plot_height=800,
               tooltips=TOOLTIPS, sizing_mode="scale_both")
    p.circle(x="tsne1", y="tsne2", source=source, size=4, color="color", alpha='alpha')

    def _text_match(series, pattern):
        """Boolean mask of the entries of ``series`` that contain ``pattern``.

        ``pattern`` is used as a regular expression when it compiles; otherwise
        it is searched for literally, so characters such as an unbalanced "("
        typed in a text box cannot make the callback fail.
        """
        try:
            re.compile(pattern)
        except re.error:
            return series.str.contains(pattern, regex=False, na=False)
        return series.str.contains(pattern, na=False)

    def select_patient():
        """Split the plottable samples according to the current widget values.

        Returns
        -------
        (selected, unselected, color_column)
            Two independent DataFrames (rows passing / failing the filters, in
            their original order) and the name of the colour column to use.
        """
        color_by_val = 'color_site' if color_by.value == 'Site' else 'color_disease'

        sample_val = sample.value.strip()
        site_val = site.value.strip()
        disease_val = disease.value.strip()
        site_list_val = site_list.value
        disease_list_val = disease_list.value

        # Start from "everything selected" and AND in each active filter.
        keep = pd.Series(True, index=plottable.index)
        if sample_val != "":
            keep &= _text_match(plottable.Sample, sample_val)
        # Drop-down values are exact category labels: compare for equality so
        # that picking one label never pulls in other labels that merely
        # contain it, and regex metacharacters in a label are harmless.
        if disease_list_val != "All":
            keep &= plottable.disease == disease_list_val
        if site_list_val != "All":
            keep &= plottable.site_id == site_list_val
        if disease_val != "":
            keep &= _text_match(plottable.disease, disease_val)
        if site_val != "":
            keep &= _text_match(plottable.site_id, site_val)

        selected = plottable[keep].copy()
        unselected = plottable[~keep].copy()
        return selected, unselected, color_by_val

    def update():
        """Refresh the plot title and the data source from the widget state."""
        df, df_n, col = select_patient()
        p.title.text = "{} Patients selected, {} Patients non selected, {} Disease(s), {} Site(s)".format(len(df),len(df_n),len(df.disease.unique()),len(df.site_id.unique()))

        # Selected rows first, then the faded unselected rows.
        # pd.concat is used because Series.append was removed in pandas 2.0.
        combined = pd.concat([df.assign(alpha=1.0), df_n.assign(alpha=0.1)])
        source.data = dict(
            tsne1=combined["tsne1"],
            tsne2=combined["tsne2"],
            color=combined[col],
            disease=combined["disease"],
            site_id=combined["site_id"],
            alpha=combined["alpha"],
            Sample=combined["Sample"],
        )

    controls = [color_by, sample, site_list, site, disease_list, disease]
    for control in controls:
        control.on_change('value', lambda attr, old, new: update())

    inputs = column(*controls, width=100, height=1000)
    inputs.sizing_mode = "fixed"
    layout = row(inputs, p)
    update()  # initial load of the data
    doc.add_root(layout)
        
# Wrap the document-building function `plot` in a handler and build a Bokeh
# application from it; `plot` is called with the document to populate.
handler = FunctionHandler(plot)
app = Application(handler)

# Display the application. output_notebook() was called in the import cell,
# so the output is presumably rendered inline in the notebook.
show(app)