In [None]:
#%%appyter init
from appyter import magic
# Initialise appyter's notebook magics. The lambda binds the built-in `globals`
# as a default argument and calls it, so appyter receives a callable that
# returns this notebook's global namespace.
magic.init(lambda _=globals: _())

# RNA-seq Data and Metadata Analysis Appyter

This notebook template provides a pipeline for the visualization and analysis of RNA-seq gene read counts. 

### Analysis Overview 

The RNA-seq data first undergoes normalization and dimensionality reduction via Principal Component Analysis (PCA) and Uniform Manifold Approximation and Projection (UMAP). Samples are then clustered based on their most-associated highly-variable genes and metadata features. The number of clusters is determined based on a modified silhouette score which prioritizes having more clusters over having larger clusters. Clusters are visualized using the [React-Scatter-Board](https://github.com/MaayanLab/react-scatter-board) package. 

The most up-regulated and down-regulated genes are also identified for each cluster. These genes are used to perform enrichment analysis via the [Enrichr](https://maayanlab.cloud/Enrichr/) API. The enrichment results are visualized with the [React-GSEA](https://github.com/MaayanLab/react-GSEA/tree/simplified) package. 

Finally, similar and opposite drug/small molecule signatures are queried using the [L1000FWD](https://maayanlab.cloud/L1000FWD/) API. 

*Note: If using GTEx data or other healthy tissue sample data for which querying drug signatures is not relevant, please use the GTEx Tissue-Specific RNA-seq Analysis Appyter instead. If using GEO data, please use the [Bulk RNA-seq Analysis Appyter](https://appyters.maayanlab.cloud/Bulk_RNA_seq/).*

## 0. Notebook Setup
Import packages and set appropriate file names.

In [None]:
import os
import numpy as np
import pandas as pd
import requests
import time
from matplotlib import pyplot as plt 
import seaborn as sns
from umap import UMAP
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, silhouette_samples, silhouette_score, plot_roc_curve
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import matplotlib.cm as cm
from maayanlab_bioinformatics.dge import characteristic_direction
from maayanlab_bioinformatics.normalization import log2_normalize, filter_by_var, zscore_normalize
from maayanlab_bioinformatics.utils import merge
import math
from collections import OrderedDict
import json
from react_scatter_board.jupyter_compat import ScatterBoard
from IPython.display import display, IFrame, Markdown, HTML
from textwrap import wrap
from react_gsea import ReactGSEA, dataFromResult
from react_gsea.jupyter_compat import ReactGSEA

In [None]:
# Notebook display util functions
def download_button(content, label, filename):
    """Display a button that lets the reader download ``content`` as a file.

    Two HTML fragments are displayed: a hidden textarea holding the content
    together with the button and a hidden download link, then a script that,
    on click, wraps the textarea value in a Blob and triggers the link.

    Args:
        content: Text offered for download.
        label: Caption of the button; inserted as-is, so it may contain markup.
        filename: Name suggested for the downloaded file. The part before the
            first '.' is also used to build the element ids.
    """
    import html  # local import keeps the fix self-contained in this cell

    # Add download button
    outname = filename.split('.')[0]
    # A textarea's body is parsed as text in which character references are
    # decoded, so unescaped content would be altered ('&amp;' -> '&') and a
    # literal '</textarea>' would end the element early. Escaping here makes
    # the browser hand the original text back through `.value`.
    safe_content = html.escape(str(content), quote=False)
    display(HTML('<textarea id="textbox_{outname}" style="display: none;">{content}</textarea> <button style="margin:10px 0;" id="create_{outname}">{label}</button> <a download="{filename}" id="downloadlink_{outname}" style="display: none">Download</a>'.format(outname=outname, content=safe_content, label=label, filename=filename)))
    display(HTML('<script type="text/javascript">!function(){{var e=null,t=document.getElementById("create_{outname}"),n=document.getElementById("textbox_{outname}");t.addEventListener("click",function(){{var t,l,c=document.getElementById("downloadlink_{outname}");c.href=(t=n.value,l=new Blob([t],{{type:"text/plain"}}),null!==e&&window.URL.revokeObjectURL(e),e=window.URL.createObjectURL(l)),c.click()}},!1)}}();</script>'.format(outname=outname)))

def make_clickable(link):
    """Return an HTML anchor for ``link`` that opens in a new browser tab.

    The URL is used both as the target and as the visible text.
    """
    return '<a target="_blank" href="{0}">{0}</a>'.format(link)

def figure_header(label,title):
    """Display a large heading with ``label`` in bold followed by ``title``."""
    heading_html = "<div style='font-size:2rem; padding:1rem 0;'><b>{}</b>: {}</div>".format(label, title)
    display(HTML(heading_html))
    
def figure_legend(label,title,content=""):
    """Display a caption: bold ``label``, italic ``title``, then optional ``content``."""
    legend_html = "<div style='font-size:1.5rem;'><b>{}</b>: <i>{}</i>. {} </div>".format(label, title, content)
    display(HTML(legend_html))

In [None]:
%%appyter hide
{# Form sections. Each field declared further down names one of these sections in its `section` argument. #}
{% do SectionField(
    name = 'DATASETS',
    title = 'Dataset Selection',
    subtitle = 'Upload datasets for visualization and analysis. Both file uploads are required to run the analysis.'
) %}

{% do SectionField(
    name = 'PARAMETERS',
    title = 'Analysis Parameters',
    subtitle = 'Set parameters for analysis.'
) %}

{% do SectionField(
    name = "ENRICHR_LIBS",
    title = "Enrichment Analysis Library Selection",
    subtitle = "Choose Enrichr geneset libraries for comparison against input genes. Multiple libraries can be selected from each section. If nothing is selected, default libraries will be used."
) %}

{# Upload fields of the DATASETS section. Both default to '' and offer the GSE159266 example files. #}
{% set data_filename = FileField(
    name='data_filename',
    label='RNA-seq data file',
    description='TSV or CSV file containing RNA-seq read counts. Index should be Entrez gene symbols, and columns should be individual samples.',
    default='',
    examples = {
        'GSE159266 Data': 'https://appyters.maayanlab.cloud/storage/RNAseq_Data_Metadata_Analysis/GSE159266_data_cleaned.txt'
    },
    section='DATASETS'
) %}

{% set metadata_filename = FileField(
    name='metadata_filename',
    label='Sample metadata file',
    description='TSV or CSV file containing sample metadata. Index should be sample IDs corresponding to columns of RNA-seq data file, and columns should be different sample attributes.',
    default='',
    examples = {
        'GSE159266 Metadata': 'https://appyters.maayanlab.cloud/storage/RNAseq_Data_Metadata_Analysis/GSE159266_metadata_cleaned.txt'
    },
    section='DATASETS'
) %}

{# Analysis parameters shown in the PARAMETERS section. #}

{# UMAP neighbourhood size, bounded to 2..200. #}
{% set n_neighbors = IntField(
    name = 'n_neighbors',
    label = 'Number of neighbors to use for UMAP calculations',
    description = 'Smaller values preserve local structure, while larger values emphasize global structure.',
    default = 40,
    min = 2,
    max = 200,
    section = 'PARAMETERS'
) %}

{# UMAP min_dist, bounded to 0.1..1. #}
{% set min_cluster_dist = FloatField(
    name = 'min_cluster_dist',
    label = 'Minimum distance between UMAP-projected points',
    description = 'Determines how close/distant points belonging to different clusters are from each other.',
    default = 0.3,
    min = 0.1,
    max = 1,
    section = 'PARAMETERS'
) %}

{# A lower bound of 1 rejects zero or negative gene counts, which would leave nothing to analyse. #}
{% set top_n_genes = IntField(
    name = 'top_n_genes',
    label = 'Number of genes to analyze',
    description = 'Number of top variable genes to use in analysis.',
    default = 2500,
    min = 1,
    section = 'PARAMETERS'
) %}

{# Same lower bound as top_n_genes; the "less than top_n_genes" rule in the description is not enforced here. #}
{% set top_n_genes_enrichment = IntField(
    name = 'top_n_genes_enrichment',
    label = 'Number of genes to use for enrichment analysis',
    description = 'Number of top variable genes to use for enrichment analysis; must be less than top_n_genes.',
    default = 250,
    min = 1,
    section = 'PARAMETERS'
) %}

{# Toggle for the L1000FWD signature query; on by default. #}
{% set do_l1000 = BoolField(
    name = 'do_l1000',
    label = 'Query L1000 signatures?',
    description = 'Option to query opposite and similar L1000 signatures to input data using L1000FWD.',
    default = True,
    section = 'PARAMETERS'
) %}

{# Transcription-related Enrichr libraries; ENCODE_TF_ChIP-seq_2015 is pre-selected. #}
{% set transcription_libraries = MultiChoiceField(
    name = 'transcription_libraries',
    label = 'Transcription Libraries',
    description = 'Default library is ENCODE_TF_ChIP-seq_2015',
    choices = [
        'ARCHS4_TFs_Coexp',
        'ChEA_2016',
        'ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X',
        'ENCODE_Histone_Modifications_2015',
        'ENCODE_TF_ChIP-seq_2015',
        'Epigenomics_Roadmap_HM_ChIP-seq',
        'Enrichr_Submissions_TF-Gene_Coocurrence',
        'Genome_Browser_PWMs',
        'lncHUB_lncRNA_Co-Expression',
        'miRTarBase_2017',
        'TargetScan_microRNA_2017',
        'TF-LOF_Expression_from_GEO',
        'TF_Perturbations_Followed_by_Expression',
        'Transcription_Factor_PPIs',
        'TRANSFAC_and_JASPAR_PWMs',
        'TRRUST_Transcription_Factors_2019'
    ],
    default = [
        'ENCODE_TF_ChIP-seq_2015'
    ],
    section = 'ENRICHR_LIBS'
) %}

{# Pathway Enrichr libraries; KEGG_2019_Human and KEGG_2019_Mouse are pre-selected. #}
{% set pathway_libraries = MultiChoiceField(
    name = "pathway_libraries",
    label = "Pathway Libraries",
    description = 'Default libraries are KEGG_2019_Human and KEGG_2019_Mouse',
    choices = [
        'ARCHS4_Kinases_Coexp',
        'BioCarta_2016',
        'BioPlanet_2019',
        'BioPlex_2017',
        'CORUM',
        'Elsevier_Pathway_Collection',
        'HMS_LINCS_KinomeScan',
        'HumanCyc_2016',
        'huMAP',
        'KEA_2015',
        'KEGG_2019_Human',
        'KEGG_2019_Mouse',
        'Kinase_Perturbations_from_GEO_down',
        'Kinase_Perturbations_from_GEO_up',
        'L1000_Kinase_and_GPCR_Perturbations_down',
        'L1000_Kinase_and_GPCR_Perturbations_up',
        'NCI-Nature_2016',
        'NURSA_Human_Endogenous_Complexome',
    ],
    default = [
        'KEGG_2019_Human', 
        'KEGG_2019_Mouse'
    ],
    section = 'ENRICHR_LIBS'
) %}

{# Ontology Enrichr libraries; GO_Biological_Process_2018 and MGI_Mammalian_Phenotype_Level_4_2019 are pre-selected. #}
{% set ontology_libraries = MultiChoiceField(
    name = 'ontology_libraries',
    label = 'Ontology Libraries',
    description = 'Default libraries are GO_Biological_Process_2018 and MGI_Mammalian_Phenotype_Level_4_2019',
    choices = [
        'GO_Biological_Process_2018',
        'GO_Cellular_Component_2018',
        'GO_Molecular_Function_2018',
        'Human_Phenotype_Ontology',
        'Jensen_COMPARTMENTS',
        'Jensen_DISEASES',
        'Jensen_TISSUES',
        'MGI_Mammalian_Phenotype_Level_4_2019'
    ],
    default = [
        'GO_Biological_Process_2018', 
        'MGI_Mammalian_Phenotype_Level_4_2019'],
    section = 'ENRICHR_LIBS'
) %}

{# Disease and drug Enrichr libraries; GWAS_Catalog_2019 is pre-selected. #}
{% set disease_drug_libraries = MultiChoiceField(
    name = 'disease_drug_libraries',
    label = 'Disease Drug Libraries',
    description = 'Default library is GWAS_Catalog_2019',
    choices = [
        'Achilles_fitness_decrease',
        'Achilles_fitness_increase',
        'ARCHS4_IDG_Coexp',
        'ClinVar_2019',
        'dbGaP',
        'DepMap_WG_CRISPR_Screens_Broad_CellLines_2019',
        'DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019',
        'DisGeNET',
        'DrugMatrix',
        'DSigDB',
        'GeneSigDB',
        'GWAS_Catalog_2019',
        'LINCS_L1000_Chem_Pert_down',
        'LINCS_L1000_Chem_Pert_up',
        'LINCS_L1000_Ligand_Perturbations_down',
        'LINCS_L1000_Ligand_Perturbations_up',
        'MSigDB_Computational',
        'MSigDB_Oncogenic_Signatures',
        'Old_CMAP_down',
        'Old_CMAP_up',
        'OMIM_Disease',
        'OMIM_Expanded',
        'PheWeb_2019',
        'Rare_Diseases_AutoRIF_ARCHS4_Predictions',
        'Rare_Diseases_AutoRIF_Gene_Lists',
        'Rare_Diseases_GeneRIF_ARCHS4_Predictions',
        'Rare_Diseases_GeneRIF_Gene_Lists',
        'UK_Biobank_GWAS_v1',
        'Virus_Perturbations_from_GEO_down',
        'Virus_Perturbations_from_GEO_up',
        'VirusMINT'
    ],
    default = [
        'GWAS_Catalog_2019'
    ],
    section = 'ENRICHR_LIBS'
) %}

{# Cell type Enrichr libraries; nothing is pre-selected. #}
{% set cell_type_libraries = MultiChoiceField(
    name = 'cell_type_libraries',
    label = 'Cell Type Libraries',
    description = 'No libraries selected by default',
    choices = [
        'Allen_Brain_Atlas_down',
        'Allen_Brain_Atlas_up',
        'ARCHS4_Cell-lines',
        'ARCHS4_Tissues',
        'Cancer_Cell_Line_Encyclopedia',
        'CCLE_Proteomics_2020',
        'ESCAPE',
        'GTEx_Tissue_Sample_Gene_Expression_Profiles_down',
        'GTEx_Tissue_Sample_Gene_Expression_Profiles_up',
        'Human_Gene_Atlas',
        'Mouse_Gene_Atlas',
        'NCI-60_Cancer_Cell_Lines',
        'ProteomicsDB_2020',
        'Tissue_Protein_Expression_from_Human_Proteome_Map'
    ],
    default = [],
    section = 'ENRICHR_LIBS'
) %}

{# Miscellaneous Enrichr libraries; nothing is pre-selected. #}
{% set misc_libraries = MultiChoiceField(
    name = 'misc_libraries',
    label = 'Miscellaneous Libraries',
    description = 'No libraries selected by default',
    choices = [
        'Chromosome_Location_hg19',
        'Data_Acquisition_Method_Most_Popular_Genes',
        'Enrichr_Libraries_Most_Popular_Genes',
        'Genes_Associated_with_NIH_Grants',
        'HMDB_Metabolites',
        'HomoloGene',
        'InterPro_Domains_2019',
        'NIH_Funded_PIs_2017_AutoRIF_ARCHS4_Predictions',
        'NIH_Funded_PIs_2017_GeneRIF_ARCHS4_Predictions',
        'NIH_Funded_PIs_2017_Human_AutoRIF',
        'NIH_Funded_PIs_2017_Human_GeneRIF',
        'Pfam_Domains_2019',
        'Pfam_InterPro_Domains',
        'Table_Mining_of_CRISPR_Studies'
    ],
    default = [],
    section = 'ENRICHR_LIBS'
) %}

{# Legacy (older-version) Enrichr libraries; nothing is pre-selected. #}
{% set legacy_libraries = MultiChoiceField(
    name = 'legacy_libraries',
    label = 'Legacy Libraries',
    description = 'No libraries selected by default',
    choices = [
        'BioCarta_2013',
        'BioCarta_2015',
        'ChEA_2013',
        'ChEA_2015',
        'Chromosome_Location',
        'Disease_Signatures_from_GEO_down_2014',
        'Disease_Signatures_from_GEO_up_2014',
        'Drug_Perturbations_from_GEO_2014',
        'ENCODE_Histone_Modifications_2013',
        'ENCODE_TF_ChIP-seq_2014',
        'GO_Biological_Process_2013',
        'GO_Biological_Process_2015',
        'GO_Biological_Process_2017',
        'GO_Biological_Process_2017b',
        'GO_Cellular_Component_2013',
        'GO_Cellular_Component_2015',
        'GO_Cellular_Component_2017',
        'GO_Cellular_Component_2017b',
        'GO_Molecular_Function_2013',
        'GO_Molecular_Function_2015',
        'GO_Molecular_Function_2017',
        'GO_Molecular_Function_2017b',
        'HumanCyc_2015',
        'KEA_2013',
        'KEGG_2013',
        'KEGG_2015',
        'KEGG_2016',
        'MGI_Mammalian_Phenotype_2013',
        'MGI_Mammalian_Phenotype_2017',
        'MGI_Mammalian_Phenotype_Level_3',
        'MGI_Mammalian_Phenotype_Level_4',
        'NCI-Nature_2015',
        'Panther_2015',
        'Reactome_2013',
        'Reactome_2015',
        'TargetScan_microRNA',
        'Tissue_Protein_Expression_from_ProteomicsDB',
        'WikiPathways_2013',
        'WikiPathways_2015',
        'WikiPathways_2016'
    ],
    default = [],
    section = 'ENRICHR_LIBS'
) %}

{# Crowd-sourced Enrichr libraries; nothing is pre-selected. #}
{% set crowd_libraries = MultiChoiceField(
    name = 'crowd_libraries',
    label = 'Crowd Libraries',
    description = 'No libraries selected by default',
    choices = [
        'Aging_Perturbations_from_GEO_down',
        'Aging_Perturbations_from_GEO_up',
        'Disease_Perturbations_from_GEO_down',
        'Disease_Perturbations_from_GEO_up',
        'Drug_Perturbations_from_GEO_down',
        'Drug_Perturbations_from_GEO_up',
        'Gene_Perturbations_from_GEO_down',
        'Gene_Perturbations_from_GEO_up',
        'Ligand_Perturbations_from_GEO_down',
        'Ligand_Perturbations_from_GEO_up',
        'MCF7_Perturbations_from_GEO_down',
        'MCF7_Perturbations_from_GEO_up',
        'Microbe_Perturbations_from_GEO_down',
        'Microbe_Perturbations_from_GEO_up',
        'RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO',
        'SysMyo_Muscle_Gene_Sets'
    ],
    default = [],
    section = 'ENRICHR_LIBS'
) %}

In [None]:
%%appyter code_exec

# Each {{ ... }} expression is replaced by the value of the matching form
# field, turning the user's selections into plain Python variables.

# Input files
data_filename = {{ data_filename }}
metadata_filename = {{ metadata_filename }}

# Analysis parameters
n_neighbors = {{ n_neighbors }}
min_cluster_dist = {{ min_cluster_dist }}
top_n_genes = {{ top_n_genes }}
top_n_genes_enrichment = {{ top_n_genes_enrichment }}
do_l1000 = {{ do_l1000 }}

# Enrichr libraries selected in each category
transcription_libraries = {{ transcription_libraries }}
pathway_libraries = {{ pathway_libraries }}
ontology_libraries = {{ ontology_libraries }}
disease_drug_libraries = {{ disease_drug_libraries }}
cell_type_libraries = {{ cell_type_libraries }}
misc_libraries = {{ misc_libraries }}
legacy_libraries = {{ legacy_libraries }}
crowd_libraries = {{ crowd_libraries }}

In [None]:
# If either upload is missing, replace BOTH with the GSE159266 example files so
# that the count matrix and the metadata always describe the same samples.
if '' in (data_filename, metadata_filename):
    print("One or both user-uploaded files missing, use example GEO data.")
    data_filename = 'https://appyters.maayanlab.cloud/storage/RNAseq_Data_Metadata_Analysis/GSE159266_data_cleaned.txt'
    metadata_filename = 'https://appyters.maayanlab.cloud/storage/RNAseq_Data_Metadata_Analysis/GSE159266_metadata_cleaned.txt'
    print(data_filename, metadata_filename, sep='\n')

## 1. Import Datasets
Load RNA-seq gene read counts and associated sample metadata into dataframes.

In [None]:
def load_dataframe(file):
    ''' Load a delimited table into a DataFrame, using the first column as index.

    The delimiter is chosen from the file extension, compared
    case-insensitively: `.tsv` and `.txt` are read as tab-separated, `.csv` as
    comma-separated. `file` is passed straight to `pandas.read_csv`.

    Args:
        file: path (or any location `pandas.read_csv` accepts) of the table.

    Returns:
        DataFrame whose index and column labels are all strings.

    Raises:
        ValueError: if the extension is not one of the supported ones.
    '''
    # Lower-case the extension so that e.g. 'counts.CSV' is accepted too.
    ext = os.path.splitext(file)[1].lower()
    if ext in {'.tsv', '.txt'}:
        df = pd.read_csv(file, sep='\t', index_col=0)
    elif ext == '.csv':
        df = pd.read_csv(file, index_col=0)
    else:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError('Unrecognized file format', ext)

    # Undo any type coercion on identifiers (e.g. numeric-looking sample ids)
    df.index = df.index.astype(str)
    df.columns = df.columns.astype(str)

    return df

In [None]:
# Names given to the index of the expression table and of the metadata table.
data_index = "symbol"
metadata_index = "sample_id"

print("Loading user-uploaded data...")

# Load each table, sort its rows by index label, then name the index.
df_data = load_dataframe(data_filename).sort_index().rename_axis(data_index)
df_metadata = load_dataframe(metadata_filename).sort_index().rename_axis(metadata_index)

print("Data loaded!")

### 1a. RNA-seq Data

In [None]:
# Caption first, then the first five rows of the count matrix.
figure_legend(
    "Table 1",
    "RNA-seq data",
    "The RNA-seq data contains a row per gene and a column per sample.",
)
display(df_data.head(5))

### 1b. Metadata

In [None]:
# Caption first, then the first five rows of the metadata table.
figure_legend(
    "Table 2",
    "Metadata",
    "The column indices are sample metadata attributes, while the row indices are sample IDs corresponding to the columns of the RNA-seq data.",
)
display(df_metadata.head(5))

Listed below are all the metadata categories. These categories will be used to cluster samples later in the analysis.

In [None]:
# Metadata column names as an array; reused later as the list of features.
features = df_metadata.columns.to_numpy()
print(features)

## 2. Normalize Data
Given the highly variable nature of expression level between different genes, it is necessary to normalize the read counts before proceeding.

In [None]:
# Per-sample statistics (one row per column of df_data), sorted by library size.
# The previous version had the two labels swapped: 'n_reads' held the number of
# genes with a non-zero count and 'n_expressed_genes' held the sum of counts.
reads_per_sample = df_data.sum()
df_library_size = pd.DataFrame(
    {
        # total of all counts in the sample
        'n_reads': reads_per_sample,
        # +1 keeps the logarithm defined for a sample with no reads
        'log_n_reads': np.log2(reads_per_sample + 1),
        # number of genes whose count is greater than 0
        'n_expressed_genes': df_data[df_data > 0].count(),
    }).sort_values('n_reads', ascending=False)

df_library_size.index.name = "sample_id"

In [None]:
# Caption first, then the five largest libraries (the table is sorted by n_reads).
figure_legend(
    "Table 3",
    "Library size",
    "By default, the first five entries are shown. A gene read is counted toward n_reads for a single sample if its value is greater than 0.",
)
display(df_library_size.head(5))

Below, the overall library distribution is shown.

In [None]:
# Distribution of n_reads across samples.
sns.distplot(df_library_size["n_reads"])
plt.show()
figure_legend("Figure 1","Library size distribution")

Two versions of the dataset are normalized: one with just the `top_n_genes` most variable genes and one with all genes. The former will be used to compute clusters after dimensionality reduction, and the latter to compute the characteristic direction (up or down) of each gene in a cluster. 

In [None]:
# copy full dataset for computing characteristic directions later
df_data_norm_all_genes = df_data.copy()

# take top_n_genes most variable rows
df_data_norm = filter_by_var(df_data,top_n = top_n_genes)

# compute log normalization of matrix
df_data_norm = log2_normalize(df_data_norm)
df_data_norm_all_genes = log2_normalize(df_data_norm_all_genes)

# convert to zscores
df_data_norm = zscore_normalize(df_data_norm)
df_data_norm_all_genes = zscore_normalize(df_data_norm_all_genes)

In [None]:
# Caption first, then the first five rows of the normalized variable-gene matrix.
figure_legend(
    "Table 4",
    "Normalized RNA-seq data for most variable genes",
    "Counts are filtered for the most variable genes. The resulting dataset is log transformed and normalized, then converted to z-scores.",
)
display(df_data_norm.head(5))

In [None]:
# plot the first gene distribution
gene = df_data_norm.index.values[0]
sns.distplot(df_data_norm.iloc[0, :]); plt.show()
figure_legend("Figure 2",f"Sample gene expression distibution for {gene}", f"In this dataset, {gene} is the most variably expressed across all samples.")

# plot the last gene distribution
gene = df_data_norm.index.values[-1]
sns.distplot(df_data_norm.iloc[-1, :]); plt.show()
figure_legend("Figure 3",f"Sample gene expression distibution for {gene}", f"In this dataset, {gene} is the least variably expressed across all samples among the most variably expressed genes.")

# plot a single RNA-seq sample distribution
sns.distplot(df_data_norm.iloc[:, 0]); plt.show()
figure_legend("Figure 4",f"RNA-seq profile distribution for sample {df_data_norm.columns[0]}")

## 3. Reduce Data Dimensionality

Now that the data has been loaded and normalized, the most variable genes across the dataset can be identified and visualized with hierarchical clustering and heatmaps. Dimensionality reduction facilitates the differentiation of the data in a more efficient manner by reducing the number of attributes to be considered. 

### 3a. Principal Component Analysis 
PCA is used first to reduce the dimensionality of the dataset, while still maintaining most of the variability. In PCA, a large number of dimensions -- in this case, the expression values of the most variable genes measured for each sample -- can be reduced to a few new dimensions that capture the relevant information of the original attributes. 

First, all data values are scaled to the range [0, 1].

In [None]:
# Rescale the values in place, keeping the same gene index and sample columns.
pca_scaler = MinMaxScaler()
sample_columns = df_data_norm.columns.tolist()
df_data_norm[sample_columns] = pca_scaler.fit_transform(df_data_norm[sample_columns])
df_data_norm.head()

Instead of manually setting the number of PCA components, the number of components is chosen automatically as the smallest number needed to explain more than 95% of the variance.

In [None]:
# PCA 
data_norm_pca = PCA(
  random_state=42,
  n_components=0.95
)

data_norm_pca.fit(df_data_norm.values.T)

df_data_norm_pca = pd.DataFrame(
    data_norm_pca.transform(df_data_norm.values.T),
    index=df_data_norm.T.index
)

df_data_norm_pca.columns = [
    f'PCA-{c}' # ({r:.3f})'
    for c, r in zip(df_data_norm_pca.columns, data_norm_pca.explained_variance_ratio_)
]

df_data_norm_pca.index.name = "sample_id"

In [None]:
figure_legend("Table 5","Principle components of RNA-seq data", "The top principle components are the projections of each datapoint onto the axes along which there is the most variation in the dataset.")
display(df_data_norm_pca.head())

The data can now be plotted with the [React-Scatter-Board](https://github.com/MaayanLab/react-scatter-board) package. The points can be shaped and colored by various metadata categories, with the default being the first two metadata columns. They can also be individually searched by sample_id.

In [None]:
# combine metadata with RNA-seq data; note this will fail if sample_ids are
# not exactly matched between both datasets
# Join the first two components with the library-size statistics and the
# metadata, and rename the components to the plotting axes 'x' and 'y'.
pca_data = merge(
    df_data_norm_pca[["PCA-0", "PCA-1"]],
    df_library_size,
    df_metadata,
).rename(columns={'PCA-0': 'x', 'PCA-1': 'y'})
pca_data['sample_id'] = pca_data.index

# Linearly rescale each axis onto [pca_min, pca_max].
pca_min, pca_max = -10, 10

pca_x_min = pca_data['x'].min()
pca_x_max = pca_data['x'].max()
pca_y_min = pca_data['y'].min()
pca_y_max = pca_data['y'].max()

pca_data['x'] = (pca_data['x'] - pca_x_min) / (pca_x_max - pca_x_min) * (pca_max - pca_min) + pca_min
pca_data['y'] = (pca_data['y'] - pca_y_min) / (pca_y_max - pca_y_min) * (pca_max - pca_min) + pca_min

In [None]:
# One dict per sample, as expected by the `data` argument below.
pca_scatter_data = pca_data.to_dict('records')

# Default encodings: colour by the first metadata column and shape by the
# second. When the metadata has a single column, reuse it for the shape
# instead of failing with an IndexError.
metadata_columns = df_metadata.columns.values
color_def = metadata_columns[0]
shape_def = metadata_columns[1] if len(metadata_columns) > 1 else color_def

ScatterBoard(
    id='pca-scatterboard',
    is3d=False,
    data=pca_scatter_data,
    shapeKey=shape_def,
    colorKey=color_def,
    labelKeys=['sample_id'],
    searchKeys=['sample_id'],
    width=600,
    height=600
)


**Figure 5:** *First two PCA components of RNA-seq data.* Points are labeled by Sample ID and can be color- or shape-coded by any of the metadata categories using the dropdown menus. Points can also be isolated by searching by sample ID. Scroll to zoom, drag to move around.

### 3b. Uniform Manifold Approximation and Projection

The dimensionality of the dataset is further reduced by performing UMAP on the PCA components. Parameters such as `n_neighbors` and `min_dist` are taken from the user-selected analysis parameters; their defaults are based on those used by the Seurat R Package for single cell genomics analysis.

In [None]:
# Two-dimensional UMAP; n_neighbors and min_cluster_dist come from the
# analysis parameters set at the top of the notebook.
data_norm_umap = UMAP(
  n_components=2,
  n_neighbors=n_neighbors,
  min_dist=min_cluster_dist,
  metric='cosine',
  random_state=42,
)

# Feed UMAP at most the first 10 principal components.
n_pca_components = min(10,df_data_norm_pca.shape[1])
umap_input = df_data_norm_pca.iloc[:, :n_pca_components].values
data_norm_umap.fit(umap_input)

# Embed the same samples and keep their sample_id index.
df_data_norm_umap = pd.DataFrame(
  data_norm_umap.transform(umap_input),
  columns=['UMAP-0', 'UMAP-1'],
  index=df_data_norm_pca.index,
)

In [None]:
# project data onto first two UMAP components for visualization
# Join the two UMAP components with the library-size statistics and the
# metadata, and rename the components to the plotting axes 'x' and 'y'.
umap_data = merge(
    df_data_norm_umap[["UMAP-0", "UMAP-1"]],
    df_library_size,
    df_metadata,
).rename(columns={'UMAP-0': 'x', 'UMAP-1': 'y'})
umap_data['sample_id'] = umap_data.index

# Linearly rescale each axis onto [umap_min, umap_max].
umap_min, umap_max = -10, 10

umap_x_min = umap_data['x'].min()
umap_x_max = umap_data['x'].max()
umap_y_min = umap_data['y'].min()
umap_y_max = umap_data['y'].max()

umap_data['x'] = (umap_data['x'] - umap_x_min) / (umap_x_max - umap_x_min) * (umap_max - umap_min) + umap_min
umap_data['y'] = (umap_data['y'] - umap_y_min) / (umap_y_max - umap_y_min) * (umap_max - umap_min) + umap_min

In [None]:
# One dict per sample, as expected by the `data` argument below.
umap_scatter_data = umap_data.to_dict('records')

# Default encodings: colour by the first metadata column and shape by the
# second. When the metadata has a single column, reuse it for the shape
# instead of failing with an IndexError.
metadata_columns = df_metadata.columns.values
color_def = metadata_columns[0]
shape_def = metadata_columns[1] if len(metadata_columns) > 1 else color_def

ScatterBoard(
    id='umap-scatterboard',
    is3d=False,
    data=umap_scatter_data,
    shapeKey=shape_def,
    colorKey=color_def,
    labelKeys=['sample_id'],
    searchKeys=['sample_id'],
    width=600,
    height=600
)

**Figure 6:** *First two UMAP components of RNA-seq data.* The datapoints are again labeled by sample ID, and can be color- or shape-coded by any of the metadata categories using the dropdown menu. Points can also be isolated by searching by sample ID. Scroll to zoom, drag to move around.

## 4. Clustering

The first two UMAP components will be used from here on out. 

To compute sample clusters, the k-means method is used. The total number of clusters must be determined first: a range of candidate cluster counts is tested, and for each one a silhouette score is computed, which is a measure of how similar an entry is to its own cluster versus other clusters. The goal is to maximize both the similarity within a cluster and the differences between clusters, so the ideal number of clusters is that which produces the highest silhouette score.

In [None]:
silhouette_scores = []

# Upper bound for k: ceil(sqrt(number of samples)), halved and rounded up again.
n_samples = df_data_norm_umap.shape[0]
max_clusters = math.ceil(n_samples**0.5)
max_clusters = int(math.ceil(max_clusters/2))

# range() excludes its stop value, so k runs from 2 up to
# max(max_clusters, 3) - 1; at least k = 2 is always tried.
cluster_range = range(2, (max(max_clusters, 3)))

# The UMAP coordinates are the same for every k.
X = df_data_norm_umap.values

for n in cluster_range:
    # k-means with n clusters, then the label assigned to each sample
    clusterer = KMeans(n_clusters=n, random_state=42)
    clusterer.fit(X)
    y_pred = clusterer.predict(X)

    # mean silhouette value over all samples
    silhouette_avg = silhouette_score(X, y_pred, metric='cosine')

    silhouette_scores.append(
        {"N Clusters": n, "Silhouette Score": silhouette_avg}
    )

    # cluster centres of the most recent fit
    centers = clusterer.cluster_centers_

In [None]:
# use weighted scores
# Silhouette score keyed by number of clusters.
points = {s["N Clusters"]: s["Silhouette Score"] for s in silhouette_scores}
threshold = 0.3

silhouette_scores = pd.DataFrame(silhouette_scores)

# Sort BEFORE taking the head: the previous order (head, then sort) could only
# ever show the first five cluster counts tried, not the five best-scoring ones.
# The caption is corrected as well, since no weighted score is computed.
figure_legend("Table 6", "Silhouette scores by number of clusters", "Values are sorted by the highest silhouette score.")
display(silhouette_scores.sort_values(["Silhouette Score"], ascending=False).head().reset_index())

In [None]:
# The cluster count in the top row after sorting by score (descending) is used as k.
ranked_scores = silhouette_scores.sort_values(["Silhouette Score"], ascending=False)
k = int(ranked_scores['N Clusters'].iloc[0])

print(f"Ideal k: {k} clusters")

In [None]:
# plot the silhouette score as a function of # of clusters
plt.plot(silhouette_scores['N Clusters'], silhouette_scores['Silhouette Score'], label='Silhouette Score', color='#7C88FB')
plt.scatter(silhouette_scores['N Clusters'], silhouette_scores['Silhouette Score'], color='#7C88FB')
plt.axvline(k, label = f"Ideal k: {k} clusters", color ="#EF553B", alpha=0.8,dashes=(3,3))
plt.legend()
plt.ylabel('Score')
plt.xlabel('Number of Clusters')
plt.show()
figure_legend("Figure 7", "Cluster size selection", "The dotted line indicates the value of the 'ideal' <i>k</i>. This value will be used in subsequent clustering.")

In [None]:
# Compute the k-means dataframe using the ideal number of clusters
km = KMeans(n_clusters=k, random_state=42)
km_clusters = km.fit_predict(df_data_norm_umap.values)

df_data_norm_km = pd.DataFrame({
'Cluster': [
    str(c)
    for c in km_clusters
]}, index=df_data_norm_umap.index)

print(f'Computed {len(df_data_norm_km["Cluster"].unique())} clusters')

In [None]:
# Map each cluster to a color for later plots
# Assign each cluster a colour for later plots, in order of first appearance;
# the palette is reused cyclically if there are more clusters than colours.
clusters = df_data_norm_km["Cluster"].unique()
plotly_colors = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A', '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52']
cluster_colors = {
    c: plotly_colors[i % len(plotly_colors)]
    for i, c in enumerate(clusters)
}

def cluster_heading(cluster):
    """Display a full-width HTML banner reading "Cluster <cluster>".

    The banner background is the cluster's entry in `cluster_colors` with the
    hex suffix '98' appended.
    """
    banner_color = cluster_colors[cluster] + '98'
    banner_html = f'''
    <center>
    <div style='background-color:{banner_color};
        width:100%;height:3rem;display:flex;align-items:center;
        justify-content:center;color:white;font-size:2rem'>
        <center>Cluster {cluster}</center>
    </div>
    </center>'''
    display(HTML(banner_html))
    

## 5. Differential Expression

Next, the differential expression for each cluster is computed. The <a href="http://www.maayanlab.net/CD/">Characteristic Direction method</a> is used for identifying differentially expressed genes among the different clusters.

In [None]:
# Characteristic direction of every cluster against all remaining samples,
# computed on the dataset containing all genes
all_samples = df_data_norm_all_genes.columns
diff_expr = {
    f"Cluster {cluster} CD": characteristic_direction(
        # expression outside of this cluster
        df_data_norm_all_genes.loc[:, all_samples.difference(samples.index)],
        # expression in this cluster
        df_data_norm_all_genes.loc[:, samples.index],
    )['CD-coefficient']
    for cluster, samples in df_data_norm_km.groupby('Cluster')
}

# One column per cluster, rows ordered by the coefficient of cluster 0
df_diff_expr = pd.DataFrame(diff_expr).sort_values(by='Cluster 0 CD', ascending=True)
df_diff_expr['Symbol'] = df_diff_expr.index.values

In [None]:
figure_legend("Table 7", "Differential expression of genes by cluster", "By default, the top 5 most differentially expressed genes are shown, along with the corresponding characteristic directions for each cluster.")
# first five rows of the table sorted by the 'Cluster 0 CD' column
display(df_diff_expr.head(5))

Logistic regression is performed for each metadata category to determine which categories most accurately predict cluster designations for each data point. ROC curves are also plotted for categories with the top two highest AUC scores.

In [None]:
# Logistic regression: for every (cluster, metadata category) pair, fit a
# classifier that predicts membership of that cluster from the category alone,
# and record its AUC (computed on the same data the model was fit on).
aucs = {}
rocs = {}

for cluster, samples in df_data_norm_km.groupby('Cluster'):
    aucs[cluster] = {}
    rocs[cluster] = []

    for feature in features:
        lr = LogisticRegression()
        # pair every sample's metadata value with its cluster label
        X = pd.merge(df_metadata[feature], df_data_norm_km, left_index=True, right_index=True)

        # Treat "not reported" as missing, drop missing rows, and move on if
        # nothing is left. The result of replace() must be kept: it returns a
        # new object and does not modify X in place.
        X = X.replace("not reported", np.nan).dropna()
        if X.shape[0] == 0:
            continue

        cluster_data = X["Cluster"]
        X = X.drop(columns=["Cluster"])

        # One-hot encode non numerical data. The first value is read by
        # position (.iloc), since the index holds sample labels, and NumPy
        # numeric scalars count as numerical as well.
        first_value = X[feature].iloc[0]
        if not isinstance(first_value, (int, float, complex, np.number)):
            X = pd.get_dummies(X[feature], prefix=feature)

        y_true = (cluster_data == cluster)

        if len(y_true.unique()) < 2:  # if there is only one class in the dataset
            print(f"Not enough data to classify cluster {cluster} based on category {feature}")
            aucs[cluster][feature] = np.nan
            continue

        lr.fit(X, y_true)

        y_score = lr.predict_proba(X)[:, 1]
        auc_score = roc_auc_score(y_true, y_score)
        aucs[cluster][feature] = auc_score

        # save everything needed to draw the ROC curve later
        rocs[cluster].append({"auc": auc_score, "lr": lr, "X": X, "y_true": y_true, "title": f'Predictions of cluster {cluster} by category {feature}'})

df_cluster_aucs = pd.DataFrame(aucs)
df_cluster_aucs.index.name = "Category"

# sort features by avg AUC across all clusters (missing AUCs are skipped)
df_cluster_aucs["avg"] = df_cluster_aucs.mean(axis=1)
df_cluster_aucs = df_cluster_aucs.sort_values(by="avg", ascending=False)
df_cluster_aucs = df_cluster_aucs.drop(columns="avg")

# group all cluster columns under a common 'Cluster' header
cols = [('Cluster', col) for col in df_cluster_aucs.columns]
df_cluster_aucs.columns = pd.MultiIndex.from_tuples(cols)

In [None]:
figure_legend("Table 8", "Average AUC scores for top predictive metadata categories, by cluster", "Scores for the top 5 metadata categories for predicting clusters, as determined by the average AUC score across all clusters, are shown. Higher AUC scores correspond to better classifiers for distinguishing whether or not a datapoint belongs to a certain cluster.")
# rows are already ordered by average AUC, so the first five are the top five
top_categories = df_cluster_aucs.head(5)
display(top_categories)

In [None]:
# plot top 2 ROCs for each cluster
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2 in favour of
# RocCurveDisplay.from_estimator; switching also requires changing the import
# cell, so it is left as is here.
plt.rc('font', size=16)

for cluster, plots in rocs.items():
    cluster_heading(cluster)

    # keep the (at most) two classifiers with the highest AUC
    plots.sort(reverse=True, key=lambda x: x["auc"])
    best_rocs = plots[:2]
    num_plots = len(best_rocs)

    if num_plots == 0:
        # no classifier could be fit for this cluster; a subplot grid with
        # zero rows cannot be created
        print(f"No ROC curves available for cluster {cluster}")
        continue

    figure, axes = plt.subplots(int(math.ceil(num_plots / 2.)), 2, figsize=(15, (num_plots * 3.5)))

    axes = axes.flatten()
    for i, ax in enumerate(axes):
        if i >= num_plots:
            # drop the unused panel of the grid
            ax.remove()
        else:
            plot = best_rocs[i]
            plot_roc_curve(plot["lr"], plot["X"], plot["y_true"], ax=ax)
            ax.set_title('\n'.join(wrap(plot["title"], 40)))

    figure.tight_layout(pad=2)
    plt.show()

figure_legend("Figure 8", "ROCs for top cluster-predicting metadata categories")

# restore the default matplotlib settings changed above
plt.rcdefaults()

## 6. Identify Up- and Down-Regulated Genes
Find the most up- and down-regulated genes for each cluster, both for visualization in a heatmap and for enrichment analysis. 

In [None]:
# Combine the cluster assignments with the UMAP data, matching on sample_id
df_clustered_umap = df_data_norm_km.merge(
    right=df_data_norm_umap,
    left_on="sample_id",
    right_on="sample_id",
)

In [None]:
# Collect the most up- and down-regulated genes of every cluster
top_genes = {}       # cluster -> (up genes, down genes) used for enrichment
all_top_genes = []   # flat list of the genes shown in the heatmap
heatmap_top_n = 100
for cluster in df_clustered_umap['Cluster'].unique():
    cd_col = f'Cluster {cluster} CD'
    if cd_col not in df_diff_expr.columns:
        raise Exception('Cant find col for cluster')

    cd_descending = df_diff_expr[cd_col].sort_values(ascending=False)
    cd_ascending = df_diff_expr[cd_col].sort_values(ascending=True)
    # top up genes
    up_genes = df_diff_expr.loc[cd_descending.iloc[:top_n_genes_enrichment].index, 'Symbol'].values
    # top down genes
    dn_genes = df_diff_expr.loc[cd_ascending.iloc[:top_n_genes_enrichment].index, 'Symbol'].values

    # only the first heatmap_top_n genes per direction go into the heatmap
    all_top_genes.extend(up_genes[:heatmap_top_n])
    all_top_genes.extend(dn_genes[:heatmap_top_n])
    # save results
    top_genes[cluster] = (up_genes, dn_genes)

Data corresponding to only the top 100 up- and down-regulated genes for each cluster is selected for visualization in a heatmap, with log-transformation and normalization proceeding as before. 

In [None]:
# Heatmap matrix: rows of the selected top genes, passed through the log2
# normalization and then the z-score normalization helpers
df_data_norm_heatmap_f = zscore_normalize(
    log2_normalize(df_data.loc[all_top_genes, :])
)

# One color per heatmap column, looked up from the column's cluster label
cases = df_data_norm_heatmap_f.columns
sample_clusters = df_clustered_umap.loc[cases, "Cluster"]
heatmap_cluster_colors = [cluster_colors[label] for label in sample_clusters]

In [None]:
# Clustered heatmap; the color bar along the top marks each column's cluster
sns.clustermap(
    df_data_norm_heatmap_f,
    xticklabels=False,
    col_colors=heatmap_cluster_colors,
)
plt.show()
figure_legend("Figure 9", "Heatmap of most differentially expressed genes", "Color coding along the top edge indicates cluster designation of the corresponding sample.")

##  7. Enrichment Analysis with Enrichr

Perform enrichment analysis for each cluster by querying the [Enrichr](https://maayanlab.cloud/Enrichr/) API. The background libraries are the default libraries from Enrichr. A link is provided to download the results. 

In [None]:
# Gene-set libraries to query, grouped by category
enrichr_libraries = OrderedDict([
    ('Diseases/Drugs', disease_drug_libraries),
    ('Ontologies', ontology_libraries),
    ('Cell Type', cell_type_libraries),
    ('Pathways', pathway_libraries),
    ('Transcription', transcription_libraries),
    ('Legacy', legacy_libraries),
    ('Crowd', crowd_libraries)
])

# When no library was selected in any category, use a default selection
all_empty = all(len(libs) == 0 for libs in enrichr_libraries.values())

if all_empty:
    enrichr_libraries = OrderedDict([
        ('Diseases/Drugs', ['GWAS_Catalog_2019']),
        ('Ontologies', ['GO_Biological_Process_2018', 'MGI_Mammalian_Phenotype_Level_4_2019']),
        ('Pathways', ['KEGG_2019_Human', 'KEGG_2019_Mouse']),
        ('Transcription', ['ENCODE_TF_ChIP-seq_2015'])
    ])

In [None]:
# Util functions
def enrichr_link_from_genes(genes, description='', enrichr_link='https://amp.pharm.mssm.edu/Enrichr'):
    ''' Submit a gene list to the Enrichr `addList` endpoint.

    genes: iterable of gene symbols (strings), sent as a newline-separated list.
    description: free-text description stored with the list.
    enrichr_link: base URL of the Enrichr instance.

    Returns the JSON response as a dict, with an extra `link` entry pointing to
    the enrichment page of the submitted list.
    Raises Exception when the response status is not 200.
    '''
    time.sleep(1)
    payload = {
        'list': (None, '\n'.join(genes)),
        'description': (None, description),
    }
    resp = requests.post(enrichr_link + '/addList', files=payload)
    if resp.status_code != 200:
        raise Exception('Enrichr failed with status {}: {}'.format(
          resp.status_code,
          resp.text,
        ))
    # pause briefly before handing back the link (backoff)
    time.sleep(3)
    result = resp.json()
    dataset_link = enrichr_link + '/enrich?dataset=' + result['shortId']
    return dict(result, link=dataset_link)

def enrichr_get_top_results(userListId, bg, enrichr_link='https://amp.pharm.mssm.edu/Enrichr'):
    ''' Fetch the enrichment results of a submitted gene list for one library.

    userListId: id of a previously submitted gene list.
    bg: name of the gene-set library, also the key of the JSON response.
    enrichr_link: base URL of the Enrichr instance.

    Returns a DataFrame with one row per term.
    Raises Exception when the response status is not 200.
    '''
    result_columns = ['rank', 'term', 'pvalue', 'zscore', 'combinedscore', 'overlapping_genes', 'adjusted_pvalue', '', '']
    time.sleep(1)
    query_url = enrichr_link + '/enrich?userListId={}&backgroundType={}'.format(userListId, bg)
    resp = requests.get(query_url)
    if resp.status_code != 200:
        raise Exception('Enrichr failed with status {}: {}'.format(
          resp.status_code,
          resp.text,
        ))
    # pause briefly between consecutive queries
    time.sleep(3)
    return pd.DataFrame(resp.json()[bg], columns=result_columns)

In [None]:
# Get Enrichr links for each cluster: cluster -> (up link, down link), where a
# link is None when the gene list was empty or the submission failed
enrichr_links = {}

for cluster, (up_genes, dn_genes) in top_genes.items():
    cluster_links = []
    for direction, genes in [('up', up_genes), ('down', dn_genes)]:
        link = None
        if len(genes):
            try:
                link = enrichr_link_from_genes(genes, f'cluster {cluster} {direction}')
            except Exception as e:
                # best effort: report the cause and carry on with the rest
                print(f'Enrichr failed for cluster {cluster} {direction} genes: {e}')
        else:
            print(f'cluster {cluster} {direction}: empty')
        cluster_links.append(link)
    enrichr_links[cluster] = tuple(cluster_links)

# Grab top results for each cluster: the five lowest p-values per library
all_enrichr_results = []
for cluster, (up_link, dn_link) in enrichr_links.items():
    for link_type, link in [('up', up_link), ('down', dn_link)]:
        if link is None:
            continue
        for category, libraries in enrichr_libraries.items():
            for library in libraries:
                try:
                    # assign() returns a new frame, so the annotations are not
                    # written onto a slice of the query result
                    results = (
                        enrichr_get_top_results(link['userListId'], library)
                        .sort_values('pvalue')
                        .iloc[:5]
                        .assign(
                            link=link['link'],
                            library=library,
                            category=category,
                            direction=link_type,
                            cluster=cluster,
                        )
                    )
                    all_enrichr_results.append(results)
                except Exception as e:
                    print('{}: {} {} {} cluster {} failed ({}), continuing'.format(link, library, category, link_type, cluster, e))

if all_enrichr_results:
    df_enrichr_results = pd.concat(all_enrichr_results).reset_index()
else:
    # pd.concat raises on an empty list; keep an empty table with the columns
    # the following cells read, so they render nothing instead of failing
    print('No Enrichr results could be retrieved')
    df_enrichr_results = pd.DataFrame(columns=[
        'rank', 'term', 'pvalue', 'zscore', 'combinedscore', 'overlapping_genes',
        'adjusted_pvalue', 'link', 'library', 'category', 'direction', 'cluster',
    ])

In [None]:
# Show the Enrichr results in a scrollable table whose links are clickable
figure_legend("Table 10","Enrichment analysis results from Enrichr", "Results are grouped by expression direction (up/down) and gene set library. Within groups, results are sorted by lowest p-value (highest rank) first.")
clickable_links = df_enrichr_results["link"].apply(make_clickable)
df_clickable = df_enrichr_results.assign(link=clickable_links)
table_html = df_clickable.to_html(escape=False)
scroll_box = f'<div style="max-height: 250px; overflow-y: auto; margin-bottom: 25px;">{table_html}</div>'
display(HTML(scroll_box))
# the download contains the original (non-HTML) links
download_button(df_enrichr_results.to_csv(), 'Download Enrichr results', 'Enrichr results.csv')

### 7a. Barplots
Horizontal barplots are used to display the top Enrichr results for each cluster, by library and characteristic expression direction.

In [None]:
# Make horizontal barplots to visualize top Enrichr results:
# one figure per cluster, one row per library, one panel per direction
clusters = df_enrichr_results["cluster"].unique()
cmap = plt.get_cmap('cool')

for cluster in clusters:
    cluster_results = df_enrichr_results.loc[df_enrichr_results["cluster"] == cluster, :]
    libraries = cluster_results["library"].unique()
    num_rows = len(libraries)

    count = 1 # keep track of which subplot we're on
    fig = plt.figure(figsize=(15, 5*num_rows))

    for library in libraries:
        library_results = cluster_results.loc[cluster_results["library"] == library, :]
        for direction in library_results["direction"].unique():
            # build the mask from the frame being filtered, so both share
            # exactly the same index
            plot_results = library_results.loc[library_results["direction"] == direction, :]
            plot_results = plot_results.sort_values("pvalue", ascending=False)
            labels = ['\n'.join(wrap(l, 20)) for l in plot_results["term"]]
            values = -np.log(plot_results["pvalue"])

            # normalize values to map from 0-1 -> color, with opacity also based on normalized pvalue
            lowest, highest = values.min(), values.max()
            spread = highest - lowest
            if np.isfinite(spread) and spread > 0:
                norm_values = [0.3 + (x - lowest) / spread * 0.7 for x in values]
            else:
                # a single term or identical p-values: nothing to scale, and
                # dividing by the zero spread would produce NaN colors
                norm_values = [1.0] * len(values)
            colors = [[*cmap(val)[:3], 0.4 + 0.2*val] for val in norm_values]

            # plot result
            ax = fig.add_subplot(num_rows, 2, count)
            ax.barh(labels, values, color=colors)
            ax.set_title(f'{library}\n{direction} genes')
            ax.set_xlabel(' – log(pvalue)')
            count += 1

    cluster_heading(cluster)
    fig.tight_layout(pad=3, w_pad=2, h_pad=6)
    plt.show()
    display(HTML("<br><br>"))

figure_legend("Figure 11", "Enrichment results by cluster", "Bar plots indicate the negative log of the p-value for the specified term. One plot is presented per cluster, per gene-set library, per expression direction (up/down).")

### 7b. Running Sum Visualizations
While the above barplots display the top enriched terms for each cluster in each direction, individual enriched terms can also be compared to the tissue data using a random walk [GSEA running sum visualization](https://github.com/MaayanLab/react-GSEA/tree/master).

First, each of the four default background libraries from Enrichr can be queried and saved as a JSON object which maps terms to their complete genesets.

In [None]:
# Download, for every library that produced results, its 'terms' mapping
libresp = {}
for lib in df_enrichr_results['library'].unique():
    resp = requests.get('https://maayanlab.cloud/Enrichr/geneSetLibrary?mode=json&libraryName=' + lib)
    if resp.status_code != 200:
        print(f"Failed to access library {lib}, continuing")
        continue
    libresp[lib] = resp.json()[lib]['terms']

For each cluster, the most enriched term for that cluster from each library can then be compared against the most up-regulated genes in the cluster. Below, GSEA plots display the overlap between the genes from each cluster and their most enriched genesets. 

In [None]:
# iterate through each cluster
for cluster in clusters:
    cluster_heading(cluster)

    # genes of this cluster ranked by decreasing characteristic direction;
    # only the upper half of the ranking is passed to the plot
    ranked_genes = (
        df_diff_expr[f'Cluster {cluster} CD']
        .sort_values(ascending=False)
        .iloc[:math.ceil(df_diff_expr.shape[0] / 2)]
        .index.tolist()
    )

    # iterate through each library for each cluster
    for lib in libresp.keys():

        # obtain the most enriched library term for the cluster in the up direction
        up_df = df_enrichr_results[
            (df_enrichr_results.direction == 'up')
            & (df_enrichr_results.cluster == cluster)
            & (df_enrichr_results.library == lib)
        ]
        if up_df.empty:
            # the Enrichr query for this cluster/library failed or returned nothing
            print(f"No up-regulated enrichment results for cluster {cluster} in {lib}, skipping")
            continue
        # the rank-1 term when it is present, otherwise the best-ranked term available
        top_up_term = up_df.sort_values('rank')['term'].iloc[0]

        if top_up_term not in libresp[lib]:
            print(f"Term {top_up_term} not found in library {lib}, skipping")
            continue

        # store the geneset for the most enriched term
        top_up_set = list(libresp[lib][top_up_term])

        display(HTML(f"<div style='font-size:1.25rem;'><b>Comparison of up-regulated genes in Cluster {cluster} to most enriched {lib} term</b> </div>"))
        print(f"Most enriched {lib} geneset for up-regulated genes:", top_up_term)

        # display the GSEA plot comparing the enriched genes and the top up-regulated cluster genes
        display(ReactGSEA(
            data=dataFromResult(
                input_set=top_up_set,
                ranked_entities=ranked_genes
            )
        ))

## 8. L1000 Analysis

If selected during user input, the most up- and down-regulated genes from each cluster, as identified above, are submitted to the [L1000FWD](https://amp.pharm.mssm.edu/L1000FWD/) API, which returns the most similar and opposite gene expression signatures from the L1000 database. Links are provided to the interactive L1000FWD projections for each set of results. 

In [None]:
def l1000fwd_results_from_genes(up_genes, down_genes, description='', l100fwd_link='http://amp.pharm.mssm.edu/L1000FWD/'):
    ''' Functional access to L1000FWD API

    up_genes / down_genes: iterables of gene symbols sent to `sig_search`.
    description: accepted for interface compatibility; not sent to the API.
    l100fwd_link: base URL of the L1000FWD instance (with trailing slash).

    Returns a dict. When the search response text contains 'KeyError', the
    dict only holds `result_url = None`. Otherwise it holds `result_url`,
    `result_id` and `signatures` (the JSON of the top-N endpoint).
    Raises Exception when either request returns a status other than 200.
    '''
    time.sleep(1)
    response = requests.post(l100fwd_link + 'sig_search', json={
        'up_genes': list(up_genes),
        'down_genes': list(down_genes),
    })
    if response.status_code != 200:
        raise Exception('L1000FWD failed with status {}: {}'.format(
          response.status_code,
          response.text,
        ))

    l1000fwd_results = {}
    if 'KeyError' in response.text:
        l1000fwd_results['result_url'] = None
    else:
        # Get ID and URL
        result_id = response.json()['result_id']
        l1000fwd_results['result_url'] = 'https://amp.pharm.mssm.edu/l1000fwd/vanilla/result/'+result_id
        l1000fwd_results['result_id'] = result_id

        # Get Top; check the status so that an error page is reported as such
        # instead of surfacing as a JSON decoding failure
        topn_response = requests.get(l100fwd_link + 'result/topn/' + result_id)
        if topn_response.status_code != 200:
            raise Exception('L1000FWD failed with status {}: {}'.format(
              topn_response.status_code,
              topn_response.text,
            ))
        l1000fwd_results['signatures'] = topn_response.json()

    # wait a tinybit before returning link (backoff)
    time.sleep(1)
    return l1000fwd_results

def l1000fwd_sig_link(sig_id):
    """Return the URL of a signature page, built from its signature id."""
    base_url = 'https://amp.pharm.mssm.edu/dmoa/sig/'
    return base_url + sig_id

def get_signature_by_id(sig_id):
    """Fetch one signature from the L1000FWD `sig` endpoint.

    Returns the decoded JSON response.
    Raises Exception when the response status is not 200.
    """
    response = requests.get("http://amp.pharm.mssm.edu/L1000FWD/sig/" + sig_id)
    if response.status_code == 200:
        return response.json()
    raise Exception('L1000FWD signature query  failed with status {}: {}'.format(
      response.status_code,
      response.text,
    ))

In [None]:
def display_l1000fwd_results(l1000fwd_results, plot_counter,cluster_id,nr_drugs=7, height=300):
    ''' Render the L1000FWD results of one cluster.

    l1000fwd_results: dict with a `result_url` entry and, when that is set,
        a `signatures` entry mapping a direction to a list of signature records.
    cluster_id: cluster the results belong to; used for the heading, the link
        text and the download file names.
    plot_counter, nr_drugs, height: accepted for interface compatibility;
        not used by this function.

    Displays a heading, a link to the L1000FWD result, and per direction a
    scrollable table of signatures with a CSV download button. When
    `result_url` is empty, an explanatory message is displayed instead.
    '''
    # Check if results
    if l1000fwd_results['result_url']:

        # Display cluster title (for the cluster passed in, not whichever
        # global `cluster` variable happens to exist)
        display(HTML('<br><br>'))
        cluster_heading(cluster_id)

        # Display link to the L1000FWD result
        display(HTML(f"<a href='{l1000fwd_results['result_url']}' target='_blank'> View L1000FWD for cluster {cluster_id}</a>"))

        # show full cell contents, so the signature links are not truncated
        # (note: this changes the pandas option globally)
        pd.set_option('display.max_colwidth', None)
        rename_dict = {'sig_id': 'Signature ID', 'pvals': 'P-value', 'qvals': 'FDR', 'zscores': 'Z-score', 'combined_scores': 'Combined Score'}

        # Display tables
        for direction, signature_list in l1000fwd_results['signatures'].items():

            # Fix dataframe: keep and rename the known columns, sort by p-value
            signature_dataframe = (
                pd.DataFrame(signature_list)[list(rename_dict.keys())]
                .rename(columns=rename_dict)
                .sort_values('P-value')
            )
            # 1-based rank as index; the name is set on the new index so it
            # appears in the table and in the CSV
            signature_dataframe.index = pd.RangeIndex(1, len(signature_dataframe) + 1, name='Rank')
            # export before the ids are replaced by HTML links
            signature_csv = signature_dataframe.to_csv(sep=",")

            # Display table
            signature_dataframe['Signature ID'] = [f'<a href="{l1000fwd_sig_link(x)}" target="_blank">{x}</a>' for x in signature_dataframe['Signature ID']]
            table_html = signature_dataframe.to_html(escape=False, classes='w-100')
            display(HTML(f'<h3>{direction.title()} Signatures: </h3>'))
            display(HTML(f'<style>.w-100{{width: 100% !important;}}</style><div style="max-height: 250px; overflow-y: auto; margin-bottom: 25px;">{table_html}</div>'))

            # Display download button
            download_button(signature_csv, f'Download {direction.title()} Signatures', f'Cluster {cluster_id} L1000FWD {direction.title()} signatures.csv')
        # Link
        display(HTML('Full results available at: <a href="{result_url}" target="_blank">{result_url}</a>.'.format(**l1000fwd_results)))

    # Display error
    else:
        display(Markdown('### No results were found.\n This is likely due to the fact that the gene identifiers were not recognized by L1000FWD. Please note that L1000FWD currently only supports HGNC gene symbols (https://www.genenames.org/). If your dataset uses other gene identifier systems, such as Ensembl IDs or Entrez IDs, consider converting them to HGNC. Automated gene identifier conversion is currently under development.'))

In [None]:
if do_l1000:
    # Query L1000FWD with the top up/down genes of every cluster and display
    # the returned signatures; results are kept per cluster for later cells
    plot_counter = 0
    all_l1000fwd_results = {}
    figure_header("Figure 14", "Most similar and opposite L1000 signatures, by cluster")
    for cluster, (up_genes, dn_genes) in top_genes.items():
        try:
            results = l1000fwd_results_from_genes(up_genes,dn_genes)
            all_l1000fwd_results[cluster] = results
            display_l1000fwd_results(results,plot_counter,cluster)
            plot_counter += 1
        except Exception as e:
            # best effort: report the cause and continue with the next cluster
            # (KeyboardInterrupt is no longer swallowed)
            print(f'L1000FWD API failed for cluster {cluster}, continuing ({e})')

    figure_legend("Figure 14", "Most similar and opposite L1000 signatures, by cluster", "Results are sorted by smallest p-value.")

In the case of disease state RNA-seq data, the reverse signatures provide a potential set of drugs that could perturb the cells/tissues towards a "healthy" direction. These may present novel treatments for patients whose samples belong to a certain cluster.

In [None]:
if do_l1000:
    df_drugs = pd.read_csv("https://amp.pharm.mssm.edu/l1000fwd/download/Drugs_metadata.csv")

    # Load top drug suggestions for each cluster based on the drugs used to produce the top five opposite signatures
    drug_results = {}
    for cluster, results in all_l1000fwd_results.items():
        # results without L1000FWD matches carry no 'signatures' entry
        opposite_sigs = results.get("signatures", {}).get("opposite", [])[:5]
        sig_ids = [sig["sig_id"] for sig in opposite_sigs]
        pert_ids = []
        for sig_id in sig_ids:
            try:
                signature = get_signature_by_id(sig_id)
                pert_ids.append(signature["pert_id"])
            except Exception as e:
                # best effort: report the cause and continue with the next signature
                print(f'L1000FWD API failed for cluster {cluster}, sig_id {sig_id}, continuing ({e})')

        # drug metadata rows of the perturbagens behind those signatures,
        # with the cluster as first column
        df_cluster_drugs = df_drugs[df_drugs["pert_id"].isin(pert_ids)].copy()
        df_cluster_drugs["cluster"] = cluster
        df_cluster_drugs = df_cluster_drugs[["cluster", *list(filter(lambda x: x!="cluster", df_cluster_drugs.columns))]]
        drug_results[cluster] = df_cluster_drugs

    if drug_results:
        df_all_drugs = pd.concat(drug_results).reset_index()
    else:
        # pd.concat raises on an empty mapping; keep an empty table with the
        # drug metadata columns instead
        df_all_drugs = pd.DataFrame(columns=["cluster", *df_drugs.columns])

In [None]:
if do_l1000:
    # Show the drug table in a scrollable box whose perturbagen links are clickable
    figure_header("Table 13", "Drugs used to produce most opposite signatures for each cluster")
    clickable_urls = df_all_drugs["pert_url"].apply(make_clickable)
    df_clickable = df_all_drugs.assign(pert_url=clickable_urls)
    table_html = df_clickable.to_html(escape=False)
    scroll_box = f'<div style="max-height: 250px; overflow-y: auto; margin-bottom: 25px;">{table_html}</div>'
    display(HTML(scroll_box))
    # the download contains the original (non-HTML) URLs
    download_button(df_all_drugs.to_csv(), 'Download L1000FWD drug results', 'L1000FWD drugs.csv')
    figure_legend("Table 13", "Drugs used to produce most opposite signatures for each cluster", "Each entry is a drug/chemical used for perturbation in the L1000 experiments that resulted in a gene-expression signature most opposite to that of the specified cluster.")