In [None]:
#%%appyter init
# Bring in appyter's notebook magic support.
from appyter import magic
# Pass appyter a zero-argument callable that returns this notebook's global
# namespace (the default argument binds the builtin `globals`, which is then
# called). Presumably this is what lets the %%appyter cell magics evaluate
# templates against notebook variables -- confirm against the appyter docs.
magic.init(lambda _=globals: _())

In [None]:
%%appyter markdown

<center>
    <h1 id = "top-of-app"> 
        <div style="font-size:3rem;font-weight:500"> <img src="{{ url_for('static', filename='cluster-icon.svg') }}" style="height:45px;padding:0 5px;display:inline"/> Patient Cohorts RNA-Seq Viewer</div>
    </h1>
    <br>
    <div style="font-size:2rem;font-weight:500">An appyter for the visualization and analysis of intra-cancer patient clusters based on RNA-Seq profiles and clinical data</div>
</center>

[The Cancer Genome Atlas (TCGA)](https://www.cancer.gov/about-nci/organization/ccg/research/structural-genomics/tcga) dataset contains multiomics profiling and clinical data from over 10,000 tumors collected from patients spanning several cancer types. Specifically, TCGA has bulk RNA-sequencing (RNA-Seq) profiling of tumors, which can provide insights into mechanisms and classify tumors by subtype.

By default, this appyter provides analysis and visualization of TCGA datasets. Users can optionally upload their own datasets.

The appyter provides analysis for RNA-Seq TCGA data for cancers with over 150 cases. The report automatically identifies clusters of patients and determines which clinical features and genes are most associated with each cluster.

For the TCGA data, each column in the RNA-Seq dataset corresponds to a row in the clinical dataset; both are referenced by the same identifier (here the case_id as provided by TCGA).

The RNA-Seq data loaded from TCGA is in the form of raw counts mapped to genes with the [htseq-count](https://htseq.readthedocs.io/en/release_0.9.0/count.html) analysis package; the same format should be followed for user-uploaded files. The analysis filters out lowly expressed genes, identifies the most variable genes, normalizes the counts, and reduces the dimensionality of the dataset further with PCA and UMAP.

To determine the ideal number of clusters, the analysis tests a range of possible K clusters, and selects the optimal number based on a modified silhouette score that prioritizes more clusters to avoid missing small clusters.

The appyter also identifies the top genes for each cluster, using these for enrichment analysis and to suggest drugs and small molecules that mimic or reverse the signatures obtained for each cluster. Such drug suggestions are based on the L1000 dataset, using the L1000FWD API. It should be noted that these are speculative predictions and should not be applied to patients before being carefully tested in cell-based assays and animal models.

In [None]:
import os
import numpy as np
import pandas as pd
import requests
import time
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
from umap import UMAP
from scipy.stats import zscore
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm
from maayanlab_bioinformatics.dge import characteristic_direction
from maayanlab_bioinformatics.normalization import log2_normalize, filter_by_var, zscore_normalize
import qnorm
from maayanlab_bioinformatics.utils import merge
import plotly.express as px
import plotly
import math
from collections import OrderedDict
import json
from jupyter_d3 import scatter_plot
from IPython.display import display, IFrame, Markdown, HTML
from textwrap import wrap
from lifelines import KaplanMeierFitter
from lifelines.statistics import multivariate_logrank_test, pairwise_logrank_test
import plotly.graph_objects as go

In [None]:
%%appyter hide_code_exec

{% do SectionField(
    name="DATASET",
    title="Dataset selection",
    subtitle='If using TCGA data, leave both file upload fields blank and simply select the desired cancer type.',
    img = "tcga-logo.png"
) %}


{% do SectionField(
    name='CONFIG',
    title='Analysis parameters',
    subtitle='Select various parameters for dimensionality reduction and dataset size. Defaults for n_neighbors and min_cluster_dist are based on the defaults used by the Seurat R Package for single cell genomics analysis.',
    img = "parameters-icon.svg"

) %}

{% do SectionField(
    name="ENRICHR_LIBS",
    title="Libraries to include in the Enrichr search",
    img = "enrichr-logo.png"
) %}


{% do SectionField(
    name="SURVIVAL",
    title="Survival analysis parameters",
    subtitle="If using TCGA data or not interested in survival analysis, leave any (or all) fields blank.",
    img = "survival-icon.svg"
) %}

{% do SectionField(
    name="L1000FWD",
    title="L1000FWD search parameters",
    img = "l1000fwd.png"
) %}

{% set data_filename = FileField(
    name='data_filename', 
    label='RNA-seq data file (.csv)', 
    description='Upload RNA-seq dataset in csv format. The index of the dataset are genes, the columns are samples.', 
    default='',
    examples={'Papillary adenocarcinoma, NOS_data.csv': 'https://tcga-enrichr-viewer.s3.us-east-2.amazonaws.com/Papillary+adenocarcinoma%2C+NOS_data.csv'},
    section='DATASET'
) %}

{% set clinical_data_filename = FileField(
    name='clinical_data_filename', 
    label='Clinical data file (.csv)', 
    description='Upload clinical data in csv format. The first column should contain patient IDs corresponding to those in the RNA-seq file. Each subsequent column is a clinical data field.', 
    default='',
    examples={'Papillary adenocarcinoma, NOS_clinical_data.csv': 'https://tcga-enrichr-viewer.s3.us-east-2.amazonaws.com/Papillary+adenocarcinoma%2C+NOS_clinical_data.csv'},
    section='DATASET'
) %}

In [None]:
%%appyter code_eval

data_filename = {{ data_filename }}
clinical_data_filename = {{ clinical_data_filename }}

cancer = '''{{ ChoiceField(
        name = "cancer",
        label = "Cancer type",
        description="The value provided as the primary diagnosis on cases in TCGA.",
        choices=["Infiltrating duct carcinoma, NOS","Squamous cell carcinoma, NOS","Lobular carcinoma, NOS","Acute myeloid leukemia, NOS","Hepatocellular carcinoma, NOS","Serous cystadenocarcinoma, NOS","Endometrioid adenocarcinoma, NOS","Adenocarcinoma, NOS","Clear cell adenocarcinoma, NOS","Glioblastoma","Mucinous adenocarcinoma","Transitional cell carcinoma","Malignant melanoma, NOS","Papillary adenocarcinoma, NOS"],
        section="DATASET",
        default="Papillary adenocarcinoma, NOS" 
) }}'''
    
n_pca_components = {{ IntField(
    name='n_pca_components',
    label='Number of PCA components to use for UMAP',
    default=10,
    min=0,
    max=100,
    section='CONFIG',
) }}

n_neighbors = {{ IntField(
    name='n_neighbors',
    label='Number of neighbors for each projected datapoint computed by UMAP',
    description='Smaller values preserve local manifold structure in the dataset as opposed to overall global structure',
    default=40,
    min=2,
    max=200,
    section='CONFIG',
) }}

min_cluster_dist = {{ FloatField(
    name='min_cluster_dist',
    label='Minimum distance between UMAP-projected points',
    description='How tightly packed points produced by dimensionality reduction with UMAP are permitted to be.',
    default=0.3,
    min=0.1,
    max=1,
    section='CONFIG',
) }}

top_n_genes = {{ IntField(
    name='top_n_genes',
    label='Number of top most variable genes to analyze',
    description="The number of top most variable genes to use for analysis",
    default=2500,
    min=100,
    max=19000,
    section='CONFIG',
) }}

max_clusters_calculation = {{ RadioField(
    name="max_clusters_calculation",
    label="Method for calculating the maximum possible number of clusters",
    description = "n is the sample size (number of cases)",
    choices = [
         {
            "label": '''
                <math xmlns="http://www.w3.org/1998/Math/MathML">
                  <mstyle displaystyle="true">
                    <mfrac>
                      <msqrt>
                        <mrow>
                          <mi>n</mi>
                        </mrow>
                      </msqrt>
                      <mn>2</mn>
                    </mfrac>
                  </mstyle>
                </math>
            ''', 
            "value":"root/2"
        },
        {
            "label": '''
            <math xmlns="http://www.w3.org/1998/Math/MathML">
                <mstyle displaystyle="true">
                <msqrt>
                  <mrow>
                    <mi>n</mi>
                  </mrow>
                </msqrt>
              </mstyle>
            </math>
            ''', 
            "value":"root"
        }
    ],
    default = "root/2",
    section = 'CONFIG'
)}}

use_weighted_silhouette_score = {{BoolField(
    name="use_weighted_silhouette_score",
    label="Use a weighted silhouette score to determine ideal number of clusters",
    description = "A weighted score is a weighted combination of the original score and the value of k, therefore encouraging more clusters.",
    default = True,
    section = 'CONFIG'
)}}

use_second_deriv_method = {{BoolField(
    name="use_second_deriv_method",
    label="Use the most concave down local maxima of the silhouette score to determine k",
    description = "This method uses the unweighted silhouette score regardless of the choice above. If no maxima are found, we resort to k as determined by the chosen silhouette score method. ",
    default = True,
    section = 'CONFIG'
)}}

top_n_genes_enrichment = {{ IntField(
    name='top_n_genes_enrichment',
    label='Number of top genes to use for enrichment analysis',
    description='The number of \'top\' genes to use for enrichment analysis',
    default=250,
    min=100,
    max=1000,
    section='CONFIG',
) }}

# Number of up/down genes per cluster shown in the heatmap.
# Note: the form field id is 'cluster_top_n' while the value is stored in heatmap_top_n.
heatmap_top_n = {{ IntField(
    name='cluster_top_n',
    label='Number of up and down genes per cluster for heatmap visualization',
    default=100,
    min=100,
    max=1000,
    section='CONFIG',
) }}

top_n_results = {{ IntField(
    name='top_n_results',
    label='Number of top enrichment results',
    default=5,
    min=1,
    max=100,
    section='CONFIG',
) }}

use_default_libraries = {{BoolField(
    name="use_default_libraries",
    label="Use default Enrichr libraries?",
    description = "The default libraries include:\nKEGG_2019_Human,\nKEGG_2019_Mouse,\nGO_Biological_Process_2018,\nMGI_Mammalian_Phenotype_Level_4_2019,\nGWAS_Catalog_2019,\nENCODE_TF_ChIP-seq_2015.",
    default = False,
    section = 'ENRICHR_LIBS'
)}}


transcription_libraries = {{MultiChoiceField(name='transcription_libraries', 
                                            label='Transcription',
                                            default=[], 
                                            section = 'ENRICHR_LIBS',
                                            choices=[
                                                'ARCHS4_TFs_Coexp',
                                                'ChEA_2016',
                                                'ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X',
                                                'ENCODE_Histone_Modifications_2015',
                                                'ENCODE_TF_ChIP-seq_2015',
                                                'Epigenomics_Roadmap_HM_ChIP-seq',
                                                'Enrichr_Submissions_TF-Gene_Coocurrence',
                                                'Genome_Browser_PWMs',
                                                'lncHUB_lncRNA_Co-Expression',
                                                'miRTarBase_2017',
                                                'TargetScan_microRNA_2017',
                                                'TF-LOF_Expression_from_GEO',
                                                'TF_Perturbations_Followed_by_Expression',
                                                'Transcription_Factor_PPIs',
                                                'TRANSFAC_and_JASPAR_PWMs',
                                                'TRRUST_Transcription_Factors_2019',
                                            ])  }}

pathways_libraries = {{ MultiChoiceField(name='pathways_libraries', 
                                         label='Pathways', 
                                         default=[], 
                                         section='ENRICHR_LIBS',
                                         choices=[
                                                    'ARCHS4_Kinases_Coexp',
                                                    'BioCarta_2016',
                                                    'BioPlanet_2019',
                                                    'BioPlex_2017',
                                                    'CORUM',
                                                    'Elsevier_Pathway_Collection',
                                                    'HMS_LINCS_KinomeScan',
                                                    'HumanCyc_2016',
                                                    'huMAP',
                                                    'KEA_2015',
                                                    'KEGG_2019_Human',
                                                    'KEGG_2019_Mouse',
                                                    'Kinase_Perturbations_from_GEO_down',
                                                    'Kinase_Perturbations_from_GEO_up',
                                                    'L1000_Kinase_and_GPCR_Perturbations_down',
                                                    'L1000_Kinase_and_GPCR_Perturbations_up',
                                                    'NCI-Nature_2016',
                                                    'NURSA_Human_Endogenous_Complexome',
                                                    'Panther_2016',
                                                    'Phosphatase_Substrates_from_DEPOD',
                                                    'PPI_Hub_Proteins',
                                                    'Reactome_2016',
                                                    'SILAC_Phosphoproteomics',
                                                    'SubCell_BarCode',
                                                    'Virus-Host_PPI_P-HIPSTer_2020',
                                                    'WikiPathways_2019_Human',
                                                    'WikiPathways_2019_Mouse']) }}

ontologies_libraries = {{ MultiChoiceField(name='ontologies_libraries', 
                                           label='Ontologies', 
                                           default=[], 
                                           section = 'ENRICHR_LIBS',
                                           choices=[
                                                'GO_Biological_Process_2018',
                                                'GO_Cellular_Component_2018',
                                                'GO_Molecular_Function_2018',
                                                'Human_Phenotype_Ontology',
                                                'Jensen_COMPARTMENTS',
                                                'Jensen_DISEASES',
                                                'Jensen_TISSUES',
                                                'MGI_Mammalian_Phenotype_Level_4_2019'
                                           ])  }}
    
diseases_drugs_libraries = {{ MultiChoiceField(name='diseases_drugs_libraries', 
                                               label='Diseases/Drugs', 
                                               default=[], 
                                               section = 'ENRICHR_LIBS',
                                               choices=[    
                                                    'Achilles_fitness_decrease',
                                                    'Achilles_fitness_increase',
                                                    'ARCHS4_IDG_Coexp',
                                                    'ClinVar_2019',
                                                    'dbGaP',
                                                    'DepMap_WG_CRISPR_Screens_Broad_CellLines_2019',
                                                    'DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019',
                                                    'DisGeNET',
                                                    'DrugMatrix',
                                                    'DSigDB',
                                                    'GeneSigDB',
                                                    'GWAS_Catalog_2019',
                                                    'LINCS_L1000_Chem_Pert_down',
                                                    'LINCS_L1000_Chem_Pert_up',
                                                    'LINCS_L1000_Ligand_Perturbations_down',
                                                    'LINCS_L1000_Ligand_Perturbations_up',
                                                    'MSigDB_Computational',
                                                    'MSigDB_Oncogenic_Signatures',
                                                    'Old_CMAP_down',
                                                    'Old_CMAP_up',
                                                    'OMIM_Disease',
                                                    'OMIM_Expanded',
                                                    'PheWeb_2019',
                                                    'Rare_Diseases_AutoRIF_ARCHS4_Predictions',
                                                    'Rare_Diseases_AutoRIF_Gene_Lists',
                                                    'Rare_Diseases_GeneRIF_ARCHS4_Predictions',
                                                    'Rare_Diseases_GeneRIF_Gene_Lists',
                                                    'UK_Biobank_GWAS_v1',
                                                    'Virus_Perturbations_from_GEO_down',
                                                    'Virus_Perturbations_from_GEO_up',
                                                    'VirusMINT']) }}
    
cell_types_libraries = {{ MultiChoiceField(name='cell_types_libraries', 
                                           label='Cell Types', 
                                           default=[], 
                                           section = 'ENRICHR_LIBS',
                                           choices=[        
                                                'Allen_Brain_Atlas_down',
                                                'Allen_Brain_Atlas_up',
                                                'ARCHS4_Cell-lines',
                                                'ARCHS4_Tissues',
                                                'Cancer_Cell_Line_Encyclopedia',
                                                'CCLE_Proteomics_2020',
                                                'ESCAPE',
                                                'GTEx_Tissue_Sample_Gene_Expression_Profiles_down',
                                                'GTEx_Tissue_Sample_Gene_Expression_Profiles_up',
                                                'Human_Gene_Atlas',
                                                'Mouse_Gene_Atlas',
                                                'NCI-60_Cancer_Cell_Lines',
                                                'ProteomicsDB_2020',
                                                'Tissue_Protein_Expression_from_Human_Proteome_Map']) }}
    
    
miscellaneous_libraries = {{ MultiChoiceField(name='miscellaneous_libraries', 
                                              label='Miscellaneous', 
                                              default=[], 
                                              section = 'ENRICHR_LIBS',
                                              choices=[            
                                                    'Chromosome_Location_hg19',
                                                    'Data_Acquisition_Method_Most_Popular_Genes',
                                                    'Enrichr_Libraries_Most_Popular_Genes',
                                                    'Genes_Associated_with_NIH_Grants',
                                                    'HMDB_Metabolites',
                                                    'HomoloGene',
                                                    'InterPro_Domains_2019',
                                                    'NIH_Funded_PIs_2017_AutoRIF_ARCHS4_Predictions',
                                                    'NIH_Funded_PIs_2017_GeneRIF_ARCHS4_Predictions',
                                                    'NIH_Funded_PIs_2017_Human_AutoRIF',
                                                    'NIH_Funded_PIs_2017_Human_GeneRIF',
                                                    'Pfam_Domains_2019',
                                                    'Pfam_InterPro_Domains',
                                                    'Table_Mining_of_CRISPR_Studies'])  }}
    
    
legacy_libraries = {{ MultiChoiceField(name='legacy_libraries', 
                                       label='Legacy', 
                                       default=[], 
                                       section = 'ENRICHR_LIBS',
                                       choices=[                
                                            'BioCarta_2013',
                                            'BioCarta_2015',
                                            'ChEA_2013',
                                            'ChEA_2015',
                                            'Chromosome_Location',
                                            'Disease_Signatures_from_GEO_down_2014',
                                            'Disease_Signatures_from_GEO_up_2014',
                                            'Drug_Perturbations_from_GEO_2014',
                                            'ENCODE_Histone_Modifications_2013',
                                            'ENCODE_TF_ChIP-seq_2014',
                                            'GO_Biological_Process_2013',
                                            'GO_Biological_Process_2015',
                                            'GO_Biological_Process_2017',
                                            'GO_Biological_Process_2017b',
                                            'GO_Cellular_Component_2013',
                                            'GO_Cellular_Component_2015',
                                            'GO_Cellular_Component_2017',
                                            'GO_Cellular_Component_2017b',
                                            'GO_Molecular_Function_2013',
                                            'GO_Molecular_Function_2015',
                                            'GO_Molecular_Function_2017',
                                            'GO_Molecular_Function_2017b',
                                            'HumanCyc_2015',
                                            'KEA_2013',
                                            'KEGG_2013',
                                            'KEGG_2015',
                                            'KEGG_2016',
                                            'MGI_Mammalian_Phenotype_2013',
                                            'MGI_Mammalian_Phenotype_2017',
                                            'MGI_Mammalian_Phenotype_Level_3',
                                            'MGI_Mammalian_Phenotype_Level_4',
                                            'NCI-Nature_2015',
                                            'Panther_2015',
                                            'Reactome_2013',
                                            'Reactome_2015',
                                            'TargetScan_microRNA',
                                            'Tissue_Protein_Expression_from_ProteomicsDB',
                                            'WikiPathways_2013',
                                            'WikiPathways_2015',
                                            'WikiPathways_2016']) }}

crowd_libraries = {{ MultiChoiceField(name='crowd_libraries', 
                                      label='Crowd', 
                                      default=[],
                                      section = 'ENRICHR_LIBS',
                                      choices=[                
                                            'Aging_Perturbations_from_GEO_down',
                                            'Aging_Perturbations_from_GEO_up',
                                            'Disease_Perturbations_from_GEO_down',
                                            'Disease_Perturbations_from_GEO_up',
                                            'Drug_Perturbations_from_GEO_down',
                                            'Drug_Perturbations_from_GEO_up',
                                            'Gene_Perturbations_from_GEO_down',
                                            'Gene_Perturbations_from_GEO_up',
                                            'Ligand_Perturbations_from_GEO_down',
                                            'Ligand_Perturbations_from_GEO_up',
                                            'MCF7_Perturbations_from_GEO_down',
                                            'MCF7_Perturbations_from_GEO_up',
                                            'Microbe_Perturbations_from_GEO_down',
                                            'Microbe_Perturbations_from_GEO_up',
                                            'RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO',
                                            'SysMyo_Muscle_Gene_Sets']) }}



top_n_drugs = {{IntField(
    name='top_n_drugs',
    label='Number of top drug treatments to suggest for each cluster',
    default=5,
    min=0,
    max=50,
    section='L1000FWD',
) }}

# Enrichr libraries used when the user asks for the defaults (or selects
# nothing), grouped by category in display order.
default_libraries = OrderedDict()
default_libraries['Diseases/Drugs'] = ['GWAS_Catalog_2019']
default_libraries['Ontologies'] = [
    'GO_Biological_Process_2018',
    'MGI_Mammalian_Phenotype_Level_4_2019',
]
default_libraries['Pathways'] = [
    'KEGG_2019_Human',
    'KEGG_2019_Mouse',
]
default_libraries['Transcription'] = ['ENCODE_TF_ChIP-seq_2015']

time_to_death_col = {{ StringField(
    name="time_to_death_col",
    label="`Time to death` column name",
    section = 'SURVIVAL',
    default="",
)}}

time_to_last_followup_col = {{ StringField(
    name="time_to_last_followup_col",
    label="`Time to last follow-up` column name",
    section = 'SURVIVAL',
    default="",
)}}

vital_status_col = {{ StringField(
    name="vital_status_col",
    label="`Vital status` column name",
    section = 'SURVIVAL',
    default="",
)}}

alive_val = {{ StringField(
    name="alive_val",
    label="Value in vital status column corresponding  to 'alive'",
    section = 'SURVIVAL',
    default="",
)}}

dead_val = {{ StringField(
    name="dead_val",
    label="Value in vital status column corresponding  to 'dead'",
    section = 'SURVIVAL',
    default="",
)}}

survival_time_unit = {{ StringField(
    name="survival_time_unit",
    label="Unit of time used for survival analysis columns",
    description="Used for labeling the survival plot x-axis.",
    section = 'SURVIVAL',
    default="days",
)}}


# Decide which Enrichr libraries to query, grouped by category.
if use_default_libraries:
    enrichr_libraries = default_libraries
else:
    enrichr_libraries = OrderedDict([
        ('Diseases/Drugs', diseases_drugs_libraries),
        ('Ontologies', ontologies_libraries),
        ('Cell Type', cell_types_libraries),
        ('Pathways', pathways_libraries),
        ('Transcription', transcription_libraries),
        # The Miscellaneous selection is collected by the form, so it must be
        # included here or the user's choice would be silently dropped.
        ('Miscellaneous', miscellaneous_libraries),
        ('Legacy', legacy_libraries),
        ('Crowd', crowd_libraries)
    ])

    # If the user did not pick a single library in any category, fall back
    # to the default set rather than running an empty enrichment analysis.
    all_empty = not any(len(libs) > 0 for libs in enrichr_libraries.values())
    if all_empty:
        enrichr_libraries = default_libraries


# 1. Import dataset <a class="anchor" id="import"></a>

If using user-uploaded data, we access those files and set the necessary index names.

Otherwise, we first download the data containing RNA-seq profiles for the selected cancer type from TCGA and the corresponding clinical data for those cases.

The RNA-seq data that is provided has already been processed using the HTSeq Python package, which calculates the number of mapped reads to each gene.

In [None]:
# Notebook display util functions

def download_button(content, label, filename):
    '''Display a button that lets the user download `content` as a text file.

    Two HTML fragments are displayed: a hidden <textarea> holding the content
    together with a button and a hidden download link, and a <script> that, on
    click, wraps the textarea value in a Blob and triggers the download link.

    Parameters:
        content: text to offer for download (converted with str()).
        label: caption of the button; inserted as-is, so it may contain markup.
        filename: name suggested for the downloaded file. The part before the
            first '.' is used to build the element ids.
    '''
    import html  # stdlib; imported locally because it is only needed here

    outname = filename.split('.')[0]
    fields = {
        'outname': outname,
        'label': label,
        # Escape the payload so that '&', '<' or a literal '</textarea>' in the
        # data cannot break out of the textarea; the browser decodes the
        # entities again, so the downloaded text equals the original content.
        'content': html.escape(str(content), quote=False),
        # The filename sits inside a double-quoted attribute.
        'filename': html.escape(filename, quote=True),
    }
    display(HTML('<textarea id="textbox_{outname}" style="display: none;">{content}</textarea> <button style="margin:10px 0;" id="create_{outname}">{label}</button> <a download="{filename}" id="downloadlink_{outname}" style="display: none">Download</a>'.format(**fields)))
    display(HTML('<script type="text/javascript">!function(){{var e=null,t=document.getElementById("create_{outname}"),n=document.getElementById("textbox_{outname}");t.addEventListener("click",function(){{var t,l,c=document.getElementById("downloadlink_{outname}");c.href=(t=n.value,l=new Blob([t],{{type:"text/plain"}}),null!==e&&window.URL.revokeObjectURL(e),e=window.URL.createObjectURL(l)),c.click()}},!1)}}();</script>'.format(**fields)))

def make_clickable(link):
    '''Return an HTML anchor that opens `link` in a new tab, using the link itself as text.'''
    anchor_template = '<a target="_blank" href="{link}">{link}</a>'
    return anchor_template.format(link=link)

def figure_header(label,title):
    '''Display a large "<label>: <title>" heading above a figure or table.'''
    heading = "<div style='font-size:2rem; padding:1rem 0;'><b>{label}</b>: {title}</div>".format(
        label=label,
        title=title,
    )
    display(HTML(heading))
    
def figure_legend(label,title,content=""):
    '''Display a caption below a figure or table: bold label, italic title, optional free text.'''
    caption = "<div><b>{label}</b>: <i>{title}</i>. {content} </div>".format(
        label=label,
        title=title,
        content=content,
    )
    display(HTML(caption))

In [None]:
def load_dataframe(file):
    ''' Read a delimited text file into a DataFrame, using its first column as the index.

    The delimiter is chosen from the file extension, matched case-insensitively:
    `.tsv` / `.txt` are read as tab-separated, `.csv` as comma-separated.

    Parameters:
        file: path (or anything `pd.read_csv` accepts) whose name ends in one
            of the supported extensions.

    Returns:
        DataFrame whose index and column labels have been cast to `str`.

    Raises:
        ValueError: if the extension is not one of the supported ones.
    '''
    ext = os.path.splitext(file)[1]
    # Compare in lower case so that e.g. "counts.CSV" is accepted as well.
    ext_key = ext.lower()
    if ext_key in {'.tsv', '.txt'}:
        df = pd.read_csv(file, sep='\t', index_col=0)
    elif ext_key == '.csv':
        df = pd.read_csv(file, index_col=0)
    else:
        # ValueError is a subclass of Exception, so callers that catch
        # Exception keep working.
        raise ValueError('Unrecognized file format', ext)

    # Fix any type coercion on identifiers (e.g. numeric ids parsed as ints)
    df.index = df.index.astype(str)
    df.columns = df.columns.astype(str)

    return df

In [None]:
# User-uploaded data is only used when BOTH the expression file and the
# clinical file were provided; otherwise the TCGA dataset is loaded.
user_data = bool(data_filename != "" and clinical_data_filename != "")

In [None]:
# Index names shared by both data sources: genes for the expression matrix,
# case identifiers for the clinical table.
data_index = "symbol"
clinical_index = "case_id"

if not user_data:
    print(f"Loading data for patients diagnosed with {cancer}...")
    # Fetch the RNA-seq and clinical tables that contain only the entries
    # for the selected cancer type.
    aws_url = f'https://appyters.maayanlab.cloud/storage/Patient_Cohorts_RNASeq_Viewer/{cancer.replace(" ", "%20")}'

    df_data = pd.read_csv(f'{aws_url}_data.csv').set_index(data_index)

    # "not reported" entries are treated as missing values.
    df_clinical = (
        pd.read_csv(f'{aws_url}_clinical_data.csv')
        .set_index(clinical_index)
        .replace("not reported", np.nan)
    )
else:
    print("Loading user-uploaded data...")
    df_data = load_dataframe(data_filename).sort_index()
    df_clinical = load_dataframe(clinical_data_filename).sort_index()

    # Give the uploaded tables the same index names as the TCGA tables.
    df_data.index.name = data_index
    df_clinical.index.name = clinical_index

print("Data loaded\n\n")

In [None]:
# Preview of the expression matrix, framed by a heading and a caption.
figure_header(label="Table 1", title="RNA-Seq data")
display(df_data.head())
figure_legend(
    label="Table 1",
    title="RNA-Seq data",
    content="The RNA-Seq data contains a row per gene and a column per case. The column indices are case_ids (from TCGA or the user-uploaded dataset) while the row indices are Entrez gene symbols.",
)

In [None]:
# Get a list of the clinical features with multiple unique values
def has_unique_values(feature, frame=None):
    '''Tell whether a clinical feature takes more than one distinct value.

    Missing values are ignored, so a column that is entirely missing or that
    holds a single constant value is reported as not informative.

    Parameters:
        feature: column name to inspect.
        frame: DataFrame holding the column; defaults to the notebook-level
            `df_clinical` when omitted.

    Returns:
        True if the column has at least two distinct non-missing values.
    '''
    if frame is None:
        frame = df_clinical
    unique = frame[feature].dropna().unique()
    # "Multiple" means strictly more than one distinct value.
    return len(unique) > 1

features = df_clinical.columns.values
if not user_data:
    # Bookkeeping columns of the built-in clinical tables are excluded up front
    excluded_features = {"primary_diagnosis", "submitter_id", "updated_datetime",
                         "created_datetime", "diagnosis_id", "demographic_id"}
    features = [f for f in features if f not in excluded_features and has_unique_values(f)]

    # .copy() so the column added below is written to an independent frame
    # rather than to a slice of the original (avoids SettingWithCopyWarning)
    df_clinical = df_clinical[features].copy()

    if "icd_10_code" in df_clinical.columns:

        # retrieve the file mapping codes to descriptions
        aws_url = 'https://tcga-enrichr-viewer.s3.amazonaws.com/icd10cm_order_2020.txt'

        # The file has no tabs, so each line is read whole into column 0
        codes = pd.read_csv(aws_url,sep="\t",header=None)
        # The code is the second space-separated token of each line
        indeces = [ val.split(" ")[1] for val in codes[0] ]

        # The description of each row starts at character 77 in this file
        def get_description(val):
            return val[77:]

        descriptions = [get_description(val) for val in codes[0]]
        df_icd_codes = pd.DataFrame(descriptions, index=indeces)

        # Add descriptions to the clinical df
        codes = df_clinical["icd_10_code"]

        def get_desc_by_code(code):
            ''' Map an ICD-10 code to its description, or to itself when unknown. '''
            # Missing codes are NaN (a float) and have no .replace(); pass them through
            if not isinstance(code, str):
                return code
            # The lookup table stores codes without the dot
            code_ind = code.replace(".", "")
            # Index membership is a hash lookup, unlike scanning index.values
            if code_ind in df_icd_codes.index:
                return df_icd_codes.loc[code_ind,0]
            return code # map the code to itself if missing in the table

        df_clinical["icd_10_code_desc"] = [ get_desc_by_code(x) for x in codes]
        features = ["icd_10_code_desc", *features]

    # drop rows for case_ids already represented (i.e. don't have more than one clinical entry per patient)
    df_clinical = df_clinical.loc[~df_clinical.index.duplicated(keep='first')]
else:
    features = [f for f in features if has_unique_values(f)]
    df_clinical = df_clinical[features]


print(f"{df_data.shape[1]} cases, {len(features)} clinical features\n\n")

figure_header("Table 2","Clinical metadata")
display(df_clinical)
figure_legend("Table 2","Clinical metadata", "The column indices of the clinical dataset are clinical features and the row indices are case_ids corresponding to the column indices of the RNA-Seq dataset.")

In [None]:
print("Clinical features loaded:\n")
# One feature name per line; print() returns None, so print_features holds
# one None per feature
print_features = list(map(print, features))

# 2. Process data and compute clusters <a class="anchor" id="processing"></a>

Next, we'll need to normalize the RNA-seq data, reduce its dimensionality using PCA and UMAP, and compute clusters of RNA-seq profiles.

In [None]:
# Data size stats

# Per-case read totals, computed once and reused for both read-count columns
reads_per_case = df_data.sum()

df_library_size = pd.DataFrame(
    {
        # number of genes with a count above zero in each case
        'n_expressed_genes': (df_data > 0).sum(),
        'log_n_reads': np.log2(reads_per_case + 1),
        'n_reads': reads_per_case,
    }).sort_values('n_reads', ascending=False)

df_library_size.index.name = "case_id"

table_label, table_title = "Table 3", "Library size"
figure_header(table_label, table_title)
display(df_library_size.head())
figure_legend(table_label, table_title, "By default, the first five entries are shown. A gene read is counted toward n_reads for a single patient if its value is greater than 0.")

figure_label, figure_title = "Figure 1", "Library size distribution"
figure_header(figure_label, figure_title)
sns.distplot(df_library_size["n_reads"])
plt.show()
figure_legend(figure_label, figure_title)

We normalize two versions of the dataset: one with just the `top_n_genes` most variable genes and one with all genes. This is because we need the former to compute clusters in the first place (after dimensionality reduction), and the latter to compute the characteristic direction of each gene in the dataset for each computed cluster.

In [None]:
# Normalization

# take top_n_genes most variable rows (selected on the raw counts); this has
# to happen before normalizing so the steps below act on the filtered matrix
# instead of being overwritten by it
df_data_norm = filter_by_var(df_data,top_n = top_n_genes)

# maintain the unfiltered dataset to compute characteristic directions later
df_data_norm_all_genes = df_data.copy()

# compute log normalization of matrix
df_data_norm = log2_normalize(df_data_norm, offset=1)
df_data_norm_all_genes = log2_normalize(df_data_norm_all_genes, offset=1)

#quantile normalize
df_data_norm = qnorm.quantile_normalize(df_data_norm, axis=1)
df_data_norm_all_genes = qnorm.quantile_normalize(df_data_norm_all_genes, axis=1)

# convert to zscores
df_data_norm = zscore_normalize(df_data_norm.T).T
df_data_norm_all_genes = zscore_normalize(df_data_norm_all_genes.T).T

In [None]:
# Report the shape of both normalized matrices
for description, frame in (
    ("Normalized dataset with all genes has shape ", df_data_norm_all_genes),
    (f"Normalized dataset with {top_n_genes} most variable genes has shape ", df_data_norm),
):
    print(description, frame.shape)

In [None]:
# Show the normalized, variance-filtered matrix; header and legend share one label/title pair
table_label, table_title = "Table 4", "Normalized RNA-Seq data"
figure_header(table_label, table_title)
display(df_data_norm)
figure_legend(table_label, table_title, "Counts are filtered for the <i>top_n_genes</i> most variable genes. A log transform and normalization is performed on the resultitng dataset, which is then converted to z-scores.")

In [None]:
# Figures 2 and 3: distribution of the first and of the last row (gene) of
# the filtered, normalized matrix
gene_panels = (
    ("Figure 2", 0, "In this dataset, {gene} is the most variably expressed across all samples."),
    ("Figure 3", -1, "In this dataset, {gene} is the least variably expressed across all samples among the filtered (most variably expressed) genes."),
)
for figure_label, row_position, legend_template in gene_panels:
    gene = df_data_norm.index.values[row_position]
    figure_title = f"Sample gene expression distibution for {gene}"
    figure_header(figure_label, figure_title)
    sns.distplot(df_data_norm.iloc[row_position, :])
    plt.show()
    figure_legend(figure_label, figure_title, legend_template.format(gene=gene))


# Figure 4: distribution of a single RNA-seq profile (the first column)
profile_title = "Sample individual RNA-Seq profile distribution"
figure_header("Figure 4", profile_title)
sns.distplot(df_data_norm.iloc[:, 0])
plt.show()
figure_legend("Figure 4", profile_title)


Now let's visualize expression of the most variable genes across the dataset with hierarchical clustering and a heatmap.

In [None]:
# Initial heatmap input: the 800 most variable rows, log-normalized and then
# converted to z-scores (steps applied innermost first).
# The "_i" suffix stands for "initial", since a later heatmap selects its
# genes with a different method.
df_data_norm_heatmap_i = zscore_normalize(
    log2_normalize(
        filter_by_var(df_data, top_n=800)
    )
)

# Plot heatmap
heatmap_title = "Heatmap of normalized expression for top 800 most variable genes"
figure_header("Figure 5", heatmap_title)
sns.clustermap(df_data_norm_heatmap_i, xticklabels=False)
plt.show()
figure_legend("Figure 5", heatmap_title, "This initial heatmap considers only the top most variable genes across the entire dataset, rather than the most differentially expressed genes among potential clusters. As such, there may seem to be a lot of noise preventing us from seeing clear clusters. We will ammend this in a later heatmap.")


We use PCA to initially reduce the dimensionality of the dataset before clustering while still maintaining most of the variability.

In [None]:
# PCA
data_norm_pca = PCA(random_state=42)

# PCA is fitted on the transposed matrix, so each column of df_data_norm is one observation
pca_input = df_data_norm.values.T
data_norm_pca.fit(pca_input)

df_data_norm_pca = pd.DataFrame(
    data_norm_pca.transform(pca_input),
    index=df_data_norm.T.index,
)

# Rename the positional component columns 0, 1, ... to PCA-0, PCA-1, ...
df_data_norm_pca.columns = [f'PCA-{c}' for c in df_data_norm_pca.columns]

df_data_norm_pca.index.name = "case_id"

table_label, table_title = "Table 5", "Principle components of RNA-Seq data"
figure_header(table_label, table_title)
display(df_data_norm_pca.head())
figure_legend(table_label, table_title, "The top principle components are the projections of each datapoint onto the axes along which there is the most variation in the dataset.")

In the two plots below (PCA and UMAP projections), datapoints are color-coded by the selected feature. The size of each point represents the number of reads for that sample.

In [None]:
# Combine the first two principal components with library size and clinical data
pca_data = merge(df_data_norm_pca[["PCA-0", "PCA-1"]], df_library_size, df_clinical)

# Axis limits: each extreme of the plotted component scaled by 1.1
axes = {
    axis_name: [min(df_data_norm_pca[component])*1.1, max(df_data_norm_pca[component])*1.1]
    for axis_name, component in (("x", "PCA-0"), ("y", "PCA-1"))
}

figure_label = "Figure 6"
figure_title = "Projection of RNA-Seq data onto first two principle directions"
figure_header(figure_label, figure_title)
scatter_plot(pca_data, "PCA-0", "PCA-1", axes, features)
figure_legend(figure_label, figure_title, "The size of datapoints corresponds to the number of gene reads. Points can be color-coded by any of the clinical features using the dropdown menu.")

We further reduce the dimensionality of the dataset using the UMAP (Uniform Manifold Approximation and Projection) technique on the data projected onto the first `n_pca_components` PCA components.

In [None]:
# UMAP
umap_settings = dict(
  random_state=42,
  n_components=2,
  n_neighbors=n_neighbors,
  metric='cosine',
  min_dist=min_cluster_dist,
)
data_norm_umap = UMAP(**umap_settings)

# use top n_pca_components components of PCA (capped at the number available)
n_pca_components = min(n_pca_components,df_data_norm_pca.shape[1])
umap_input = df_data_norm_pca.iloc[:, :n_pca_components].values
data_norm_umap.fit(umap_input)

df_data_norm_umap = pd.DataFrame(
  data_norm_umap.transform(umap_input),
  columns=['UMAP-0', 'UMAP-1'],
  index=df_data_norm_pca.index,
)

In [None]:
# project data onto its first 2 UMAP components for visualization
umap_data = merge(
        df_data_norm_umap[["UMAP-0", "UMAP-1"]],
        df_library_size,
        df_clinical
      )

axes = {
    "x": [min(df_data_norm_umap["UMAP-0"])*1.1, max(df_data_norm_umap["UMAP-0"])*1.1],
    "y": [min(df_data_norm_umap["UMAP-1"])*1.1, max(df_data_norm_umap["UMAP-1"])*1.1]
}

figure_header("Figure 7","First two UMAP components of RNA-Seq data")
scatter_plot(umap_data,"UMAP-0", "UMAP-1",axes,features)
figure_legend("Figure 7","First two UMAP components of RNA-Seq data", "The size of datapoints corresponds to the number of gene reads. Points can be color-coded by any of the clinical features using the dropdown menu.")

We continue using the first 2 UMAP components.

To compute clusters, we use the Kmeans method, which requires us to define a total number of clusters. We test a range for the number of total clusters. 

For each number of clusters, we compute silhouette scores, which are a measure of how similar an entry is to its own cluster compared to other clusters. We want to maximize similarity within a cluster and differences between clusters, so the ideal number of clusters is that which produces the highest silhouette score.

Here, we modify the selection of $k$ slightly to preserve nuances in the data by preferring more clusters over fewer. We also calculate a *modified* silhouette score, which takes into account the number of clusters, $k$; each original score is bumped by a factor linear with respect to $k$. 

In [None]:
def plot_silhouette_analysis(n_clusters, data, cluster_labels, avg_score, sample_values, centers):
    ''' Draw a two-panel silhouette diagnostic for one clustering.

    The left panel stacks the sorted per-sample silhouette values of every
    cluster and marks `avg_score` with a dashed red line. The right panel
    scatters the first two columns of `data`, coloured by cluster, with the
    cluster centers drawn on top.

    Parameters
    ----------
    n_clusters : int
        Number of clusters; labels 0 .. n_clusters-1 are plotted.
    data : array
        Clustered points, indexed as ``data[:, 0]`` / ``data[:, 1]``.
    cluster_labels : array
        Integer cluster label of each row of `data`.
    avg_score : float
        Average silhouette score, drawn as the vertical reference line.
    sample_values : array
        Silhouette value of each row of `data`.
    centers : array
        Cluster centers, indexed as ``centers[:, 0]`` / ``centers[:, 1]``.

    The figure is built on the current pyplot state; nothing is returned and
    ``plt.show()`` is left to the caller.
    '''
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    # Use the `data` argument rather than a notebook-level variable
    ax1.set_ylim([0, len(data) + (n_clusters + 1) * 10])

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    # (taken from the `avg_score` argument, not from notebook state)
    ax1.axvline(x=avg_score, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(data[:, 0], data[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='k')

    # Draw white circles at cluster centers
    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                c="white", alpha=1, s=200, edgecolor='k')

    # Write each cluster number inside its center marker
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                    s=50, edgecolor='k')

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')

In [None]:
silhouette_scores = []

# Set max clusters as a function of the sample size and the user-selected option
max_clusters = math.ceil(df_data_norm_umap.shape[0]**0.5)
if (max_clusters_calculation == "root/2"):
    max_clusters = int(math.ceil(max_clusters/2))


def calc_weighted_score(silhouette_score, k, max_k):
    ''' Blend a silhouette score with a bonus that grows linearly in k.

    Returns 0.7 * silhouette_score + 0.3 * k / max_k. The value is computed
    from the `silhouette_score` argument itself rather than from a
    notebook-level variable. Inside this function the parameter name shadows
    the function of the same name that the loop below calls.
    '''
    return silhouette_score*0.7 + k/max_k*0.3

cluster_range = range(2, max_clusters)

# The embedding does not change between iterations, so take it once
X = df_data_norm_umap.values
for n in cluster_range:
    clusterer = KMeans(n_clusters=n, random_state=42).fit(X)
    y_pred = clusterer.predict(X)
    
    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X, y_pred, metric='cosine')
    
    # Compute a weighted score that rewards higher numbers of clusters
    weighted_score = calc_weighted_score(silhouette_avg, n, max_clusters)
    
    # Compute the silhouette scores for each sample
    # NOTE(review): no metric is passed here while the average above uses
    # metric='cosine'; confirm which one is intended before relying on the plot below
    sample_silhouette_values = silhouette_samples(X, y_pred)

    silhouette_scores.append({
        "N Clusters": n,
        "Silhouette Score": silhouette_avg,
        "Weighted Score": weighted_score
    })
    
    # Labeling the clusters
    centers = clusterer.cluster_centers_
    
    # Plot the analysis for this number of clusters
    #plot_silhouette_analysis(n, X, y_pred, silhouette_avg, sample_silhouette_values, centers)

If using the "second derivative method" for ideal $k$ calculation, we also compute estimates of the second derivatives at each possible point to see if there are local maxima that may present better values of $k$ than the silhouette scores suggest. If such maxima exist, we take the value of $k$ that presents the most negative approximated second derivative (meaning the plot of scores is most concave *down* at this value of $k$). 

To calculate the second derivatives, we use the following approximation:

$$ f''(x) \approx \frac{f(x+h) - 2f(x) + f(x-h)}{h^2}$$

where, in our case, $h = 1$.

In [None]:
points = {}
threshold = 0.3
score_type = "Silhouette Score"
if use_weighted_silhouette_score:
    score_type = "Weighted Score"
    
# Map each tested number of clusters to its score of the selected type
for s in silhouette_scores:
    points[s["N Clusters"]] = s[score_type]

# k stays None when no local maximum qualifies; a later cell then falls back
# to the best-score method
k = None

if (use_second_deriv_method):
    # Collect all local maxima. Only interior values of k are candidates,
    # because both neighbours n-1 and n+1 need a score. Slicing the range
    # covers every interior value, including the second-to-last one, and is
    # simply empty when fewer than three values of k were tested.
    local_maxima = [
        n for n in cluster_range[1:-1]
        if points[n] > threshold and points[n-1] < points[n] and points[n+1] < points[n]
    ]
    
    if len(local_maxima)>0:
        print("K values for local maxima and positive silhouette score: ", local_maxima)
    else:
        print("No local maxima with positive silhouette scores found")

    # Among the local maxima, pick the one that is most concave down
    # (i.e. has the most negative second derivative)
    most_negative_deriv = 0
    for maxima in local_maxima:
        # approximate the second derivative, with step size = 1
        second_deriv = points[maxima + 1] -  2*points[maxima] + points[maxima-1]
        if second_deriv < most_negative_deriv:
            most_negative_deriv = second_deriv
            k = maxima

In [None]:
# From here on silhouette_scores is a DataFrame with one row per tested k
silhouette_scores = pd.DataFrame(silhouette_scores)

table_label, table_title = "Table 6", "Silhouette scores by number of clusters"
figure_header(table_label, table_title)
ranked_scores = silhouette_scores.sort_values(["Silhouette Score", "Weighted Score"], ascending=False)
display(ranked_scores.reset_index().head())
figure_legend(table_label, table_title, "Values are sorted by the highest Silhouette Score.")


# For each score type, keep the row with the highest value (the last row
# after an ascending sort) as a plain dict
best = {
    metric: silhouette_scores.sort_values(metric).iloc[-1].to_dict()
    for metric in ("Silhouette Score", "Weighted Score")
}
best_score = best["Silhouette Score"]
best_weighted = best["Weighted Score"]

In [None]:
# No k was chosen by the second-derivative method: use the k with the best
# score of the selected type instead
if not k:
    selected_score = "Weighted Score" if use_weighted_silhouette_score else "Silhouette Score"
    k = int(best[selected_score]["N Clusters"])

print(f"Ideal k: {k} clusters")

In [None]:
# plot both the unweighted and weighted scores as a function of # of clusters
colors = {"Silhouette Score": "#7C88FB", "Weighted Score": "#00CC96"}

for score_type, line_color in colors.items():
    best_row = best[score_type]
    # score curve, plus a marker on its best-scoring number of clusters
    plt.plot(silhouette_scores['N Clusters'], silhouette_scores[score_type], label=score_type, color=line_color)
    plt.scatter(
        [best_row['N Clusters']],
        [best_row[score_type]],
        label=f"Best {score_type}: {int(best_row['N Clusters'])} clusters",
        color=line_color,
    )

# dashed vertical line at the k that is actually used downstream
plt.axvline(k, label = f"Ideal k: {k} clusters", color ="#EF553B", alpha=0.8,dashes=(3,3))
plt.legend()
plt.ylabel('Score')
plt.xlabel('Number of Clusters')

figure_label, figure_title = "Figure 8", "Cluster size selection"
figure_header(figure_label, figure_title)
plt.show()
figure_legend(figure_label, figure_title, "The dotted line indicates the value of the 'ideal' <i>k</i> as chosen by the selected scoring method. This value will be used in subsequent clustering.")

In [None]:
# Compute the Kmeans dataframe using the ideal number of clusters
km = KMeans(n_clusters=k, random_state=42)
km_clusters = km.fit_predict(df_data_norm_umap.values)

# Cluster labels are stored as strings, indexed like the UMAP frame
df_data_norm_km = pd.DataFrame(
    {'Cluster': list(map(str, km_clusters))},
    index=df_data_norm_umap.index,
)


print(f'Computed {df_data_norm_km["Cluster"].nunique()} clusters')
df_data_norm_km.head()

In [None]:
# Map each cluster to a color for later plots
clusters = df_data_norm_km["Cluster"].unique()
#plotly_colors = px.colors.qualitative.Plotly
plotly_colors = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A', '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52']

# The palette is reused from the start when there are more clusters than colors
cluster_colors = {
    label: plotly_colors[position % len(plotly_colors)]
    for position, label in enumerate(clusters)
}

def cluster_heading(cluster):
    ''' Display a full-width banner reading "Cluster <cluster>", tinted with
    that cluster's color.

    '98' is appended to the hex color before it is used as the background
    (presumably an alpha channel - TODO confirm).
    '''
    banner_color = cluster_colors[cluster] + '98'
    banner_html = f'''
    <center>
    <div style='background-color:{banner_color};
        width:100%;height:3rem;display:flex;align-items:center;
        justify-content:center;color:white;font-size:2rem'>
        <center>Cluster {cluster}</center>
    </div>
    </center>'''
    display(HTML(banner_html))
    

Next, we compute differential expression for each cluster. We use the <a href="http://www.maayanlab.net/CD/">Characteristic Direction method</a> for identifying differentially expressed genes among the different clusters.

In [None]:
# Get differential expression for each cluster, using the dataset containing all genes
diff_expr = {}
for cluster, samples in df_data_norm_km.groupby('Cluster'):
    cases_in_cluster = samples.index
    cases_outside_cluster = df_data_norm_all_genes.columns.difference(cases_in_cluster)
    cd_result = characteristic_direction(
        # expression outside of this cluster
        df_data_norm_all_genes.loc[:, cases_outside_cluster],
        # expression in this cluster
        df_data_norm_all_genes.loc[:, cases_in_cluster],
    )
    diff_expr[f"Cluster {cluster} CD"] = cd_result['CD-coefficient']

# One column per cluster, rows ordered by the cluster 0 coefficient
df_diff_expr = pd.DataFrame(diff_expr).sort_values(by='Cluster 0 CD', ascending=True)
df_diff_expr['Symbol'] = df_diff_expr.index.values

table_label, table_title = "Table 7", "Differential expression of genes by cluster"
figure_header(table_label, table_title)
display(df_diff_expr.head())
figure_legend(table_label, table_title, "By default, the top 5 most differentially expressed genes are shown, along with the corresponding characteristic directions for each cluster.")

We fit a logistic regression on each clinical feature to see which most accurately predict the cluster each data point falls into. For each cluster, we also plot ROC curves for the two features that resulted in the highest AUC scores.

In [None]:
# LR
aucs = {}
num_rocs_plotted = 0
rocs = {}

for cluster, samples in df_data_norm_km.groupby('Cluster'): 
    aucs[cluster] = {}
    rocs[cluster] = []

    for feature in features:
        lr = LogisticRegression()

        # Pair each case's value of this feature with its cluster label
        feature_frame = pd.merge(df_clinical[feature], df_data_norm_km, left_index = True, right_index = True)

        # drop NAs, and move on if dataset is empty. The replace result has
        # to be kept, otherwise "not reported" entries survive the dropna.
        feature_frame = feature_frame.replace("not reported", np.nan).dropna()
        if (feature_frame.shape[0] == 0): continue

        cluster_data = feature_frame["Cluster"]
        feature_frame = feature_frame.drop(columns= ["Cluster"])

        # one-hot encode non numerical data. The first value is read by
        # position (.iloc) because the index holds case ids, and np.number is
        # included because numpy integer scalars are not instances of int.
        first_value = feature_frame[feature].iloc[0]
        if (not isinstance(first_value, (int, float, complex, np.number))):
            feature_frame = pd.get_dummies(feature_frame[feature], prefix=feature)

        # Binary target: does the case belong to the current cluster?
        y_true = (cluster_data == cluster)
        
        if (len(y_true.unique()) < 2): # if we only have one class in the dataset
            print(f"Not enough data to classify cluster {cluster} based on feature {feature}")
            aucs[cluster][feature] = np.nan
            continue 
                  
        lr.fit(feature_frame, y_true)

        # AUC is computed on the same data the model was fitted on
        y_score = lr.predict_proba(feature_frame)[:, 1]
        auc_score = roc_auc_score(y_true, y_score)
        aucs[cluster][feature] = auc_score
        
        # save the ROCs
        rocs[cluster].append({"auc":auc_score, "lr": lr, "X": feature_frame, "y_true":y_true, "title": f'Predictions of cluster {cluster} by feature {feature}'})
        
df_cluster_aucs = pd.DataFrame(aucs)
df_cluster_aucs.index.name="Feature"

# sort features by avg AUC across all clusters
df_cluster_aucs["avg"] = df_cluster_aucs.mean(axis=1)
df_cluster_aucs = df_cluster_aucs.sort_values(by = "avg", ascending=False)
df_cluster_aucs = df_cluster_aucs.drop(columns = "avg")

# Group the per-cluster columns under a common "Cluster" header
cols = [('Cluster', col) for col in df_cluster_aucs.columns ]
df_cluster_aucs.columns = pd.MultiIndex.from_tuples(cols)

figure_header("Table 8", "Average AUC scores for top-predicting clinical features, by cluster")
display(df_cluster_aucs.head(10))
figure_legend("Table 8", "Average AUC scores for top-predicting clinical features, by cluster", "Scores for the top 10 clinical features, as determined by the average AUC score across all clusters, are shown. Higher AUC scores correspond to better classifiers for distinguishing whether or not a datapoint belongs to a certain cluster.")

In [None]:
# plot top 2 (or however many exist) ROCs for each cluster
matplotlib.rc('font', size=16)
figure_header("Figure 9", "ROCs for top cluster-predicting clinical features")

for cluster, plots in rocs.items():
    plots.sort(reverse=True, key=lambda x: x["auc"])
    cluster_heading(cluster)
    
    # Slicing already copes with fewer than two entries
    best_rocs = plots[:2]
    if not best_rocs:
        # No feature produced a ROC for this cluster, and plt.subplots cannot
        # build a grid with zero rows
        print(f"No ROC curves available for cluster {cluster}")
        continue

    num_plots = len(best_rocs)
    figure,axes = plt.subplots(int(math.ceil(num_plots / 2.)), 2, figsize=(15,(len(best_rocs)*3.5)))
    
    axes = axes.flatten()
    for i in range(len(axes)):
        if i >= len(best_rocs):
            # unused grid cell
            axes[i].remove()
        else:
            plot = best_rocs[i]
            # plot_roc_curve was removed from newer scikit-learn releases in
            # favour of RocCurveDisplay.from_estimator; use whichever exists
            roc_display = getattr(metrics, "RocCurveDisplay", None)
            if roc_display is not None and hasattr(roc_display, "from_estimator"):
                roc_display.from_estimator(plot["lr"], plot["X"], plot["y_true"], ax=axes[i])
            else:
                metrics.plot_roc_curve(plot["lr"], plot["X"], plot["y_true"], ax=axes[i])

            axes[i].set_title('\n'.join(wrap(plot["title"], 40)))

    figure.tight_layout(pad=2)
    plt.show()
    
figure_legend("Figure 9", "ROCs for top cluster-predicting clinical features")

# Restore the default matplotlib settings changed at the top of this cell
matplotlib.rcdefaults()

Next we find the top most up and downregulated genes for each cluster, both to select which data to display in the heatmap below and for Enrichment analysis.

In [None]:
# Merge data: attach the cluster label to the UMAP and to the PCA coordinates,
# matching rows on case_id
case_id_keys = dict(left_on="case_id", right_on="case_id")
df_clustered_umap = pd.merge(df_data_norm_km, df_data_norm_umap, **case_id_keys)
df_clustered_pca = pd.merge(df_data_norm_km, df_data_norm_pca, **case_id_keys)

In [None]:
# Get top Genes for each cluster
top_genes = {}
all_top_genes = []  # flat list of the genes shown in the heatmap
# The heatmap cannot show more genes per direction than are selected for enrichment
heatmap_top_n = min(heatmap_top_n, top_n_genes_enrichment)
for cluster in df_clustered_umap['Cluster'].unique():
    cd_col = f'Cluster {cluster} CD'
    if cd_col not in df_diff_expr.columns:
        raise Exception('Cant find col for cluster')

    coefficients = df_diff_expr[cd_col]
    # top up genes: largest coefficients first
    up_index = coefficients.sort_values(ascending=False).iloc[:top_n_genes_enrichment].index
    up_genes = df_diff_expr.loc[up_index, 'Symbol'].values
    # top down genes: smallest coefficients first
    dn_index = coefficients.sort_values(ascending=True).iloc[:top_n_genes_enrichment].index
    dn_genes = df_diff_expr.loc[dn_index, 'Symbol'].values

    all_top_genes.extend(up_genes[:heatmap_top_n])
    all_top_genes.extend(dn_genes[:heatmap_top_n])
    # save results
    top_genes[cluster] = (up_genes, dn_genes)

We select data corresponding to only the `heatmap_top_n` up and downregulated genes for each cluster. We log-transform and normalize as before, plotting a heatmap for the results.

In [None]:
# Rows for the selected genes, log-normalized and then converted to z-scores
# (steps applied innermost first)
df_data_norm_heatmap_f = zscore_normalize(
    log2_normalize(
        df_data.loc[all_top_genes, :]
    )
)

# Plot heatmap, with one color per column taken from that case's cluster
cases = df_data_norm_heatmap_f.columns
heatmap_cluster_colors = [
    cluster_colors[label] for label in df_clustered_umap.loc[cases, "Cluster"]
]

figure_label, figure_title = "Figure 10", "Heatmap of top most differentially expressed genes"
figure_header(figure_label, figure_title)
sns.clustermap(df_data_norm_heatmap_f, xticklabels=False, col_colors=heatmap_cluster_colors)
plt.show()
figure_legend(figure_label, figure_title, "Color coding along the top edge indicates cluster designation of the corresponding case.")

# 3. Perform survival analysis <a class="anchor" id="survival"></a>

In [None]:
if not user_data:
    # Column names and status values used for the built-in datasets
    time_to_death_col = "days_to_death"
    time_to_last_followup_col = "days_to_last_follow_up"
    vital_status_col = "vital_status"
    alive_val = "Alive"
    dead_val = "Dead"
    survival_time_unit = "days"
    
    
# Every survival setting must be non-empty before the analysis is attempted
if "" in [time_to_death_col, 
              time_to_last_followup_col, 
              vital_status_col, 
              alive_val, 
              dead_val, 
              survival_time_unit]:
    survival_cols_valid = False
else:
    survival_cols_valid = True
    
    try:
        df_lifeline = df_clinical[[time_to_death_col,vital_status_col,time_to_last_followup_col]]
        df_lifeline = pd.merge(left=df_lifeline,left_on="case_id", right=df_data_norm_km, right_on="case_id")

        # map alive and dead values in vital_status_col to 0 and 1, respectively
        df_lifeline[vital_status_col] = df_lifeline[vital_status_col].replace({alive_val: 0, dead_val: 1 })
    
    except KeyError:
        # one of the configured columns is not present in the clinical data
        survival_cols_valid = False
        
    def map_to_time(entry):
        ''' Pick the survival time of one row: time to death for dead cases,
        time to last follow-up for alive (right-censored) cases, NaN otherwise.
        '''
        vital_status = entry[vital_status_col]
        if vital_status == 1: # dead
            return entry[time_to_death_col]
        elif vital_status == 0: # alive
            return entry[time_to_last_followup_col]
        else:
            return np.nan
    
    if survival_cols_valid:
        df_lifeline["time"] = df_lifeline.apply(map_to_time, axis=1)

        df_lifeline = df_lifeline[["Cluster","time",vital_status_col]]
        df_lifeline = df_lifeline.sort_values(by = "time", ascending = True)
        df_lifeline = df_lifeline.dropna() # if NaNs remain after all this, drop them (missing vital status)

        figure_header("Table 9", "Dataframe constructed for survival analysis")
        display(df_lifeline)
        figure_legend("Table 9", "Dataframe constructed for survival analysis", "For patients listed as living, the time used for survival analysis is the time to their last followup. For deceased patients, the time used is the time to death. Both of these values are counted from the date of diagnosis. Living patients are included as right-censored datapoints.")

        # Add the KM plot for a cluster to the existing figure
        def plot_km(cluster):
            ''' Fit a Kaplan-Meier curve for `cluster` and add it to `fig`,
            together with its 95% confidence band and markers for the
            right-censored cases.

            Reads `df_lifeline`, `fig` and `cluster_colors` from the notebook
            scope; the cluster is taken from the argument, not from the
            surrounding loop variable.
            '''
            kmf = KaplanMeierFitter()
            cluster_rows = df_lifeline[df_lifeline["Cluster"] == cluster]
            T = cluster_rows["time"].apply(lambda x: float(x))
            C = cluster_rows[vital_status_col].apply(lambda x: float(x))

            # Times of the right-censored (alive) cases. They are kept as
            # floats, like the fitted timeline, so fractional times still
            # match a timeline entry.
            censored_points = cluster_rows[cluster_rows[vital_status_col] == 0]["time"].values
            censored_times = censored_points.astype(float)

            curve_name = f'Cluster {cluster}'
            kmf.fit(T, event_observed=C, label = curve_name)
            data = pd.DataFrame(kmf.survival_function_).reset_index()

            # Add the plot curve
            fig.add_trace(go.Scatter(x=data["timeline"], 
                                     y=data[curve_name], 
                                     name=curve_name,
                                     line_shape='hv', 
                                     line_color=cluster_colors[cluster],
                                    ))


            # Display 95% confidence intervals as a closed band: upper bound
            # left to right, then lower bound right to left
            ci = kmf.confidence_interval_
            ci_time = ci.index.values
            ci_lower = ci[f"Cluster {cluster}_lower_0.95"]
            ci_upper = ci[f"Cluster {cluster}_upper_0.95"]
            fig.add_trace(go.Scatter(x=np.concatenate((ci_time,ci_time[::-1])), 
                                     y=np.concatenate((ci_upper,ci_lower[::-1])),  
                                     fill='toself', 
                                     line_shape='hv', 
                                     fillcolor=cluster_colors[cluster],
                                     line_color=cluster_colors[cluster],
                                     opacity=0.2,
                                     showlegend=False,
                                     name=curve_name))

            # Add markers to indicate the right-censored datapoints
            data = data.set_index("timeline")
            censored_ests = data.loc[censored_times, curve_name]

            fig.add_trace(go.Scatter(x=censored_points, 
                                     y=censored_ests,
                                     mode="markers",
                                     marker_symbol="line-ns-open",
                                     showlegend=False,
                                     marker_size = 8, 
                                     marker_color=cluster_colors[cluster],
                                     name="Censored point"))


        clusters = df_lifeline["Cluster"].unique()
        fig = go.Figure()

        for c in clusters:
            plot_km(c)

        fig.update_layout(
            xaxis_title=f"Time ({survival_time_unit})",
            yaxis_title="Probability of survival",
        )

        figure_header("Figure 11","Kaplan-Meier survival plots")
        fig.show()
        figure_legend("Figure 11", "Kaplan-Meier survival plots", "Right-censored datapoints (patients with a vital status listed as alive as of their last followup) and indicated by the vertical lines. The 95% confidence interval for each curve is shown.")

In [None]:
if survival_cols_valid:
    # Both tests take the same inputs: durations, group labels, event indicator
    durations = df_lifeline['time']
    groups = df_lifeline['Cluster']
    events = df_lifeline[vital_status_col]
    args = [durations, groups, events]

    figure_header("Table 10","Multivariate log-rank test")
    result_multi = multivariate_logrank_test(*args)
    display(result_multi.summary.round({"test_statistic":3}))
    figure_legend("Table 10","Multivariate log-rank test", "Comparison of all clusters' survival functions.")

    figure_header("Table 11","Pairwise log-rank test")
    result_pairwise = pairwise_logrank_test(*args)
    display(result_pairwise.summary.round({"test_statistic":3}))
    figure_legend("Table 11","Pairwise log-rank test", "Pairwise comparisons of any two clusters' survival functions.")
else:
    print("The proper column names and values necessary for survival analysis were not provided, so K-M curves cannot be generated.")



#  4. Load Enrichr Data  <a class="anchor" id="enrichr"></a>

We query the Enrichr API for enrichment analysis of the top most upregulated and downregulated genes for each cluster calculated above.

In [None]:
# Util functions
def enrichr_link_from_genes(genes, description='', enrichr_link='https://maayanlab.cloud/Enrichr'):
    ''' Submit a gene list to the Enrichr `addList` endpoint.

    Args:
        genes: iterable of gene-symbol strings; they are joined with newlines.
        description: free-text label sent along with the list.
        enrichr_link: base URL of the Enrichr service.

    Returns:
        dict: the JSON body returned by Enrichr, extended with a `link` key
        pointing at the enrichment page for the submitted list.

    Raises:
        Exception: if Enrichr answers with a status code other than 200.
    '''
    # Pause before the request.
    time.sleep(1)
    form_fields = {
        'list': (None, '\n'.join(genes)),
        'description': (None, description),
    }
    response = requests.post(enrichr_link + '/addList', files=form_fields)
    if response.status_code != 200:
        raise Exception('Enrichr failed with status {}: {}'.format(
            response.status_code,
            response.text,
        ))
    # wait a tinybit before returning link (backoff)
    time.sleep(3)
    submission = response.json()
    results_page = enrichr_link + '/enrich?dataset=' + submission['shortId']
    return dict(submission, link=results_page)

def enrichr_get_top_results(userListId, bg, enrichr_link='https://maayanlab.cloud/Enrichr'):
    ''' Fetch the enrichment results of a submitted gene list for one library.

    Args:
        userListId: identifier of a previously submitted list.
        bg: gene-set library name; also the key under which the rows are
            returned in the JSON response.
        enrichr_link: base URL of the Enrichr service.

    Returns:
        pandas.DataFrame with one row per enriched term.

    Raises:
        Exception: if Enrichr answers with a status code other than 200.
    '''
    # Column labels applied to each result row; the last two are left unnamed.
    result_columns = [
        'rank', 'term', 'pvalue', 'zscore', 'combinedscore',
        'overlapping_genes', 'adjusted_pvalue', '', '',
    ]
    time.sleep(1)
    query_url = enrichr_link + '/enrich?userListId={}&backgroundType={}'.format(userListId, bg)
    response = requests.get(query_url)
    if response.status_code != 200:
        raise Exception('Enrichr failed with status {}: {}'.format(
            response.status_code,
            response.text,
        ))
    # Pause again after the response.
    time.sleep(3)
    rows = response.json()[bg]
    return pd.DataFrame(rows, columns=result_columns)

def save_enrichr_data(cancer_type,df_diff_expr,df_clustered_umap,df_all_results,df_cluster_aucs):
    ''' Write the four result tables under `appyter_data/<cancer_type>/`.

    Every table is written tab-separated and without its index. Note that
    `cluster_aucs.csv` is tab-separated as well, despite its extension.

    Args:
        cancer_type: name of the sub-directory created for this run.
        df_diff_expr: table written to `df.tsv`.
        df_clustered_umap: table written to `df_umap.tsv`.
        df_all_results: table written to `df_enrich.tsv`.
        df_cluster_aucs: table written to `cluster_aucs.csv`.
    '''
    output = f"appyter_data/{cancer_type}"
    os.makedirs(output, exist_ok=True)

    # (file name, table) pairs, written in this order.
    exports = (
        ('df.tsv', df_diff_expr),
        ('df_umap.tsv', df_clustered_umap),
        ('df_enrich.tsv', df_all_results),
        ('cluster_aucs.csv', df_cluster_aucs),
    )
    for file_name, table in exports:
        table.to_csv(f'{output}/{file_name}', sep='\t', index=None)

In [None]:
# Get Enrichr links for each cluster
# enrichr_links maps cluster -> (up_link, dn_link); an entry is None when the
# gene list was empty or the submission failed.
enrichr_links = {}

for cluster, (up_genes, dn_genes) in top_genes.items():
    up_link, dn_link = None, None
    if up_genes.size:
        try:
            up_link = enrichr_link_from_genes(up_genes, f'cluster {cluster} up')
        # Catch Exception (not a bare except) so a kernel interrupt still stops
        # the loop, and report why the submission failed.
        except Exception as err:
            print(f'Enrichr failed for cluster {cluster} up genes: {err}')
    else:
        print(f'cluster {cluster} up: empty')
    if dn_genes.size:
        try:
            dn_link = enrichr_link_from_genes(dn_genes, f'cluster {cluster} down')
        except Exception as err:
            print(f'Enrichr failed for cluster {cluster} down genes: {err}')
    else:
        print(f'cluster {cluster} down: empty')
    enrichr_links[cluster] = (up_link, dn_link)

# Grab top results for each cluster
all_enrichr_results = []
for cluster, (up_link, dn_link) in enrichr_links.items():
    for link_type, link in [('up', up_link), ('down', dn_link)]:
        # No link means the gene list was empty or its submission failed.
        if link is None:
            continue
        for category, libraries in enrichr_libraries.items():
            for library in libraries:
                try:
                    # Keep the top_n_results rows with the smallest p-values.
                    # .copy() detaches the slice so the column assignments
                    # below do not raise SettingWithCopyWarning.
                    results = (
                        enrichr_get_top_results(link['userListId'], library)
                        .sort_values('pvalue')
                        .iloc[:top_n_results]
                        .copy()
                    )
                    results['link'] = link['link']
                    results['library'] = library
                    results['category'] = category
                    results['direction'] = link_type
                    results['cluster'] = cluster
                    all_enrichr_results.append(results)
                # Catch Exception (not a bare except) so a kernel interrupt is
                # not swallowed, and include the failure reason.
                except Exception as err:
                    print('{}: {} {} {} cluster {} failed, continuing ({})'.format(link, library, category, link_type, cluster, err))

if all_enrichr_results:
    df_enrichr_results = pd.concat(all_enrichr_results).reset_index()
else:
    # pd.concat raises ValueError on an empty list; fall back to an empty table
    # that still has the columns the following cells read.
    print('No Enrichr results could be retrieved; the enrichment tables and plots below will be empty.')
    df_enrichr_results = pd.DataFrame(columns=[
        'index', 'rank', 'term', 'pvalue', 'zscore', 'combinedscore',
        'overlapping_genes', 'adjusted_pvalue',
        'link', 'library', 'category', 'direction', 'cluster',
    ])

In [None]:
# Show the Enrichr results as a scrollable HTML table whose links are clickable
figure_header("Table 12", "Enrichment analysis results from Enrichr")
# Only the displayed copy gets HTML anchors; the download below uses the plain table.
df_clickable = df_enrichr_results.assign(
    link=df_enrichr_results["link"].apply(make_clickable)
)
# escape=False keeps the anchors as markup instead of literal text.
table_html = df_clickable.to_html(escape=False)
display(HTML(
    '<div style="max-height: 250px; overflow-y: auto; margin-bottom: 25px;">'
    + table_html
    + '</div>'
))
download_button(df_enrichr_results.to_csv(), 'Download Enrichr results', 'Enrichr results.csv')
figure_legend("Table 12","Enrichment analysis results from Enrichr", "Results are grouped by  expression direction (up/down) and gene set library. Within groups, results are sorted by lowest p-value (highest rank) first.")

In [None]:
# Scatter plot of the first two UMAP components, coloured by cluster.
figure_header("Figure 12", "First two UMAP components of RNA-Seq data")
fig = px.scatter(
    df_clustered_umap,
    x=df_clustered_umap['UMAP-0'],
    y=df_clustered_umap['UMAP-1'],
    color="Cluster",
)

# Marker size, opacity and a thin white outline, applied to every trace.
fig.update_traces(
    marker={
        "size": 12,
        "opacity": 0.8,
        "line": {"color": 'white', "width": 1},
    }
)
fig.show()
figure_legend("Figure 12", "First two UMAP components of RNA-Seq data", "Datapoints are color-coded by cluster for reference in the following tables.")

figure_header("Figure 13", "Enrichment results, by cluster")

# Make horizontal barplots to visualize top Enrichr results:
# one matplotlib figure per cluster, one subplot per (library, direction) pair.
clusters = df_enrichr_results["cluster"].unique()
cmap = plt.get_cmap('cool')
for cluster in clusters:
    cluster_results = df_enrichr_results.loc[df_enrichr_results["cluster"] == cluster, :]
    libraries = cluster_results["library"].unique()
    num_rows = len(libraries)

    count = 1 # keep track of which subplot we're on
    fig = plt.figure(figsize=(15,5*num_rows))

    for library in libraries:
        library_results = cluster_results.loc[cluster_results["library"] == library, :]
        for direction in library_results["direction"].unique():
            # Build the mask from library_results itself; a mask taken from
            # cluster_results only works through implicit index alignment.
            plot_results = library_results.loc[library_results["direction"] == direction, :]
            plot_results = plot_results.sort_values("pvalue",ascending=False)
            labels = [ '\n'.join(wrap(l, 20)) for l in plot_results["term"] ]
            values = -np.log(plot_results["pvalue"])

            # normalize values to map from 0.3-1 -> color, with opacity also based on normalized pvalue
            min_value, max_value = values.min(), values.max()
            value_range = max_value - min_value
            if np.isfinite(value_range) and value_range > 0:
                norm_values = [ 0.3 + (x - min_value)/value_range*0.7 for x in values ]
            else:
                # A single bar or identical p-values would divide by zero;
                # draw every bar at full intensity instead.
                norm_values = [ 1.0 ] * len(values)
            colors = [ [*cmap(val)[:3], 0.4  + 0.2*val] for val in norm_values]

            # plot result
            ax = fig.add_subplot(num_rows,2,count)
            ax.barh(labels,values,color = colors)
            ax.set_title(f'{library}\n{direction} genes')
            ax.set_xlabel(' – log(pvalue)')
            count += 1

    cluster_heading(cluster)
    fig.tight_layout(pad=3, w_pad=2, h_pad=6)
    plt.show()
    # Release the figure so one open figure per cluster does not accumulate.
    plt.close(fig)
    display(HTML("<br><br>"))

figure_legend("Figure 13", "Enrichment results, by cluster", "Bar plots indicate the negative log of the p-value for the specified term. One plot is presented per cluster, per gene-set library, per expression direction (up/down).")



# 5. Load L1000 Data  <a class="anchor" id="l1000"></a>

We query the <a href="https://amp.pharm.mssm.edu/L1000FWD/" target="_blank">L1000 Fireworks Display</a> (L1000FWD) API to find the most similar and most dissimilar gene expression signatures from the L1000 database for each cluster based on the up and down genes. We also provide links to the L1000FWD interactive projection of those results for each cluster.

In [None]:
# Util functions
def l1000fwd_results_from_genes(up_genes, down_genes, description='', l100fwd_link='http://maayanlab.cloud/L1000FWD/'):
    ''' Functional access to L1000FWD API

    Submits the up/down gene sets to the `sig_search` endpoint and, when a
    result id is returned, fetches the top signatures for it.

    Args:
        up_genes: iterable of up-regulated genes.
        down_genes: iterable of down-regulated genes.
        description: accepted for interface compatibility; not sent to the API.
        l100fwd_link: base URL of the L1000FWD service (with trailing slash).

    Returns:
        dict with `result_url` always present. When the search succeeded it
        also holds `result_id` and `signatures`; when the response text
        contains 'KeyError', `result_url` is None and the other keys are absent.

    Raises:
        Exception: if either request returns a status code other than 200.
    '''
    import time
    time.sleep(1)
    response = requests.post(l100fwd_link + 'sig_search', json={
        'up_genes': list(up_genes),
        'down_genes': list(down_genes),
    })
    if response.status_code != 200:
        raise Exception('L1000FWD failed with status {}: {}'.format(
            response.status_code,
            response.text,
        ))

    l1000fwd_results = {}
    # NOTE(review): a body containing 'KeyError' is treated as "no result";
    # presumably the service reports unrecognised input this way -- confirm.
    if 'KeyError' in response.text:
        l1000fwd_results['result_url'] = None
    else:
        # Get ID and URL
        result_id = response.json()['result_id']
        l1000fwd_results['result_url'] = 'https://maayanlab.cloud/l1000fwd/vanilla/result/'+result_id
        l1000fwd_results['result_id'] = result_id

        # Get Top; check the status here too so a failed lookup raises the same
        # kind of error as the search instead of failing later on bad JSON.
        top_response = requests.get(l100fwd_link + 'result/topn/' + result_id)
        if top_response.status_code != 200:
            raise Exception('L1000FWD failed with status {}: {}'.format(
                top_response.status_code,
                top_response.text,
            ))
        l1000fwd_results['signatures'] = top_response.json()

    # wait a tinybit before returning link (backoff)
    time.sleep(1)
    return l1000fwd_results

def l1000fwd_sig_link(sig_id):
    ''' Return the signature page URL for the given signature id string.
    '''
    sig_page_base = 'https://maayanlab.cloud/dmoa/sig/'
    return sig_page_base + sig_id

def get_signature_by_id(sig_id):
    ''' Fetch one L1000FWD signature record by its id.

    Returns the decoded JSON body; raises Exception when the service answers
    with a status code other than 200.
    '''
    response = requests.get("http://maayanlab.cloud/L1000FWD/sig/" + sig_id)
    if response.status_code == 200:
        return response.json()
    raise Exception('L1000FWD signature query  failed with status {}: {}'.format(
        response.status_code,
        response.text,
    ))

In [None]:
def display_l1000fwd_results(l1000fwd_results, plot_counter,cluster_id,nr_drugs=7, height=300):
    ''' Render the L1000FWD results of one cluster.

    Shows a cluster heading, a link to the L1000FWD result page, and one
    scrollable table plus download button per signature direction. When
    `result_url` is falsy, a "no results" notice is shown instead.

    Args:
        l1000fwd_results: dict with `result_url` and, when results exist,
            `signatures` (mapping direction -> list of signature records).
        plot_counter: accepted for interface compatibility; unused.
        cluster_id: cluster label used in the heading, link text and file names.
        nr_drugs: accepted for interface compatibility; unused.
        height: accepted for interface compatibility; unused.
    '''
    # Display error
    if not l1000fwd_results['result_url']:
        display(Markdown('### No results were found.\n This is likely due to the fact that the gene identifiers were not recognized by L1000FWD. Please note that L1000FWD currently only supports HGNC gene symbols (https://www.genenames.org/). If your dataset uses other gene identifier systems, such as Ensembl IDs or Entrez IDs, consider converting them to HGNC. Automated gene identifier conversion is currently under development.'))
        return

    # Display cluster title. Use the cluster_id parameter: the previous code read
    # a global named `cluster`, which only worked when the caller's loop
    # variable happened to have that name.
    display(HTML('<br><br>'))
    cluster_heading(cluster_id)

    # Display link to the interactive result page
    display(HTML(f"<a href='{l1000fwd_results['result_url']}' target='_blank'> View L1000FWD for cluster {cluster_id}</a>"))

    # Raw API field name -> column header shown to the user
    rename_dict = {'sig_id': 'Signature ID', 'pvals': 'P-value', 'qvals': 'FDR', 'zscores': 'Z-score', 'combined_scores': 'Combined Score'}

    # Display tables
    for direction, signature_list in l1000fwd_results['signatures'].items():

        # Fix dataframe: keep the known fields, rename them, sort by p-value
        # and number the rows from 1.
        signature_dataframe = pd.DataFrame(signature_list)[list(rename_dict.keys())].rename(columns=rename_dict).sort_values('P-value').rename_axis('Rank')
        signature_dataframe.index = list(range(1, len(signature_dataframe.index) + 1))
        # Take the CSV before the ids are replaced by HTML anchors.
        signature_csv = signature_dataframe.to_csv(sep=",")

        # Display table. The column-width option is left set globally on purpose
        # (as before), so the long anchor strings are not truncated.
        pd.set_option('max.colwidth', None)
        # Quote the href value so the attribute is well-formed HTML.
        signature_dataframe['Signature ID'] = [f'<a href="{l1000fwd_sig_link(x)}" target="_blank">{x}</a>' for x in signature_dataframe['Signature ID']]
        table_html = signature_dataframe.to_html(escape=False, classes='w-100')
        display(HTML(f'<h3>{direction.title()} Signatures: </h3>'))
        display(HTML(f'<style>.w-100{{width: 100% !important;}}</style><div style="max-height: 250px; overflow-y: auto; margin-bottom: 25px;">{table_html}</div>'))

        # Display download button
        download_button(signature_csv, f'Download {direction.title()} Signatures', f'Cluster {cluster_id} L1000FWD {direction.title()} signatures.csv')

    # Link
    display(HTML('Full results available at: <a href="{result_url}" target="_blank">{result_url}</a>.'.format(**l1000fwd_results)))


In [None]:
# Query L1000FWD for every cluster and display the results as they arrive.
plot_counter = 0
# cluster -> raw L1000FWD result dict, reused when deriving drug suggestions
all_l1000fwd_results = {}
figure_header("Figure 14", "Most similar and opposite L1000 signatures, by cluster")
for cluster, (up_genes, dn_genes) in top_genes.items():
    try:
        results = l1000fwd_results_from_genes(up_genes,dn_genes)
        all_l1000fwd_results[cluster] = results
        display_l1000fwd_results(results,plot_counter,cluster)
        plot_counter += 1
    # Catch Exception (not a bare except) so a kernel interrupt still stops the
    # loop, and report why the cluster failed.
    except Exception as err:
        print(f'L1000FWD API failed for cluster {cluster}, continuing ({err})')


figure_legend("Figure 14", "Most similar and opposite L1000 signatures, by cluster", "Results are sorted by smallest p-value.")

Based on the signatures most "opposite" to the profile of a given cluster (i.e. up and down genes are reversed), we can obtain a set of drugs that may "perturb" that cluster into the healthy direction by upregulating its downregulated genes and downregulating the upregulated genes. These may present effective treatments for patients belonging to that cluster.

In [None]:
# Drug metadata table published by L1000FWD; rows are matched on `pert_id` below.
df_drugs = pd.read_csv("https://maayanlab.cloud/l1000fwd/download/Drugs_metadata.csv")

# Load top drug suggestions for each cluster based on the drugs used to produce the top_n_drugs opposite signatures
drug_results = {}
for cluster, results in all_l1000fwd_results.items():
    # A cluster for which L1000FWD found nothing has no "signatures" entry;
    # treat it as having no opposite signatures instead of raising KeyError.
    opposite_sigs = (results.get("signatures") or {}).get("opposite", [])[:top_n_drugs]
    sig_ids = [sig["sig_id"] for sig in opposite_sigs]
    pert_ids = []
    for sig_id in sig_ids:
        try:
            signature = get_signature_by_id(sig_id)
            pert_ids.append(signature["pert_id"])
        # Catch Exception (not a bare except) so a kernel interrupt is not
        # swallowed, and include the failure reason.
        except Exception as err:
            print(f'L1000FWD API failed for cluster {cluster}, sig_id {sig_id}, continuing ({err})')

    df_cluster_drugs = df_drugs[df_drugs["pert_id"].isin(pert_ids)].copy()
    df_cluster_drugs["cluster"] = cluster
    # Move the "cluster" column to the front.
    df_cluster_drugs = df_cluster_drugs[["cluster", *[col for col in df_cluster_drugs.columns if col != "cluster"]]]
    drug_results[cluster] = df_cluster_drugs

if drug_results:
    df_all_drugs = pd.concat(drug_results).reset_index()
else:
    # pd.concat raises ValueError on an empty mapping; fall back to an empty
    # table that keeps the metadata columns.
    print('No L1000FWD results were available, so no drug suggestions could be derived.')
    df_all_drugs = pd.DataFrame(columns=["cluster", *df_drugs.columns])

In [None]:
# Show the drug suggestions as a scrollable HTML table with clickable L1000FWD links
figure_header("Table 13", "Drugs used to produce most opposite signatures for each cluster")
# Only the displayed copy gets HTML anchors; the download below uses the plain table.
df_clickable = df_all_drugs.assign(
    pert_url=df_all_drugs["pert_url"].apply(make_clickable)
)
# escape=False keeps the anchors as markup instead of literal text.
table_html = df_clickable.to_html(escape=False)
display(HTML(
    '<div style="max-height: 250px; overflow-y: auto; margin-bottom: 25px;">'
    + table_html
    + '</div>'
))
download_button(df_all_drugs.to_csv(), 'Download L1000FWD drug results', 'L1000FWD drugs.csv')
figure_legend("Table 13", "Drugs used to produce most opposite signatures for each cluster", "Each entry is a drug/chemical used for perturbation in the L1000 experiments that resulted in a gene-expression signature most opposite to that of the specified cluster.")

In [None]:
%%appyter hide_code
# Parameter values for this run of the appyter.
# NOTE(review): these look like rendered defaults of the appyter input form -- confirm.

# Input files; empty strings here. Presumably empty means "use the TCGA data" -- TODO confirm.
data_filename = ''
clinical_data_filename = ''
# Cancer type to analyse (reassigned from the `cancers` list in a later cell).
cancer = '''Papillary adenocarcinoma, NOS'''
# Dimensionality-reduction settings; n_neighbors and min_cluster_dist are
# presumably UMAP parameters -- verify where they are consumed.
n_pca_components = 10
n_neighbors = 40
min_cluster_dist = 0.3
# presumably the number of most variable genes kept -- TODO confirm
top_n_genes = 2500
# Cluster-number selection options.
max_clusters_calculation = 'root/2'
use_weighted_silhouette_score = True
use_second_deriv_method = True
# Gene counts used for the enrichment queries and the heatmap (by name; verify at use site).
top_n_genes_enrichment = 250
heatmap_top_n = 100
# Number of lowest-p-value Enrichr terms kept per library, direction and cluster.
top_n_results = 5
# Enrichr library choices, one list per category; all empty here.
use_default_libraries = False
transcription_libraries = []
pathways_libraries = []
ontologies_libraries = []
diseases_drugs_libraries = []
cell_types_libraries = []
miscellaneous_libraries = []
legacy_libraries = []
crowd_libraries = []
# Number of most-opposite L1000FWD signatures used when collecting drug suggestions.
top_n_drugs = 5
# Libraries used when defaults are requested or no library was chosen at all.
default_libraries = OrderedDict([
    ('Diseases/Drugs', ['GWAS_Catalog_2019']), 
    ('Ontologies', ['GO_Biological_Process_2018','MGI_Mammalian_Phenotype_Level_4_2019']),
    ('Pathways', ['KEGG_2019_Human','KEGG_2019_Mouse']),
    ('Transcription', ['ENCODE_TF_ChIP-seq_2015']),
])
# Clinical-data column names and values for the survival analysis; empty here.
time_to_death_col = ''
time_to_last_followup_col = ''
vital_status_col = ''
alive_val = ''
dead_val = ''
# Unit shown on the x-axis of the survival plot.
survival_time_unit = 'days'
# Decide which Enrichr libraries to query, grouped by category.
if use_default_libraries:
    enrichr_libraries = default_libraries
else:
    enrichr_libraries = OrderedDict([
        ('Diseases/Drugs', diseases_drugs_libraries),
        ('Ontologies', ontologies_libraries),
        ('Cell Type', cell_types_libraries),
        ('Pathways', pathways_libraries),
        ('Transcription', transcription_libraries),
        # Previously omitted, so selected miscellaneous libraries were silently ignored.
        ('Miscellaneous', miscellaneous_libraries),
        ('Legacy', legacy_libraries),
        ('Crowd', crowd_libraries)
    ])
    # Fall back to the defaults when no library was selected in any category.
    all_empty = not any(len(libs) > 0 for libs in enrichr_libraries.values())
    if all_empty:
        enrichr_libraries = default_libraries

In [None]:
%%appyter hide_code

# Cancer type names; `cancer` is set to the final entry of the list.
cancers = [
    "Infiltrating duct carcinoma, NOS",
    "Squamous cell carcinoma, NOS",
    "Lobular carcinoma, NOS",
    "Acute myeloid leukemia, NOS",
    "Hepatocellular carcinoma, NOS",
    "Serous cystadenocarcinoma, NOS",
    "Endometrioid adenocarcinoma, NOS",
    "Adenocarcinoma, NOS",
    "Clear cell adenocarcinoma, NOS",
    "Glioblastoma",
    "Mucinous adenocarcinoma",
    "Transitional cell carcinoma",
    "Malignant melanoma, NOS",
    "Papillary adenocarcinoma, NOS",
]
cancer = cancers[-1]