In [None]:
#%%appyter init
# Bootstrap Appyter: magic.init receives a callable that returns this notebook's globals()
# (the lambda binds the built-in `globals` as a default argument and calls it on demand).
# Presumably this registers the %%appyter cell magics used by the cells below - confirm against Appyter docs.
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
# Basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.decomposition import PCA
import multiprocessing as mp
from MulticoreTSNE import MulticoreTSNE as TSNE
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from IPython.display import HTML, display, Markdown
import umap

from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.palettes import Category20
from bokeh.transform import factor_cmap

from react_scatter_board import ReactScatterBoard

output_notebook() # let Bokeh plots be shown inline in the notebook

In [None]:
%%appyter hide_code_exec
{# Form layout: each SectionField declares a titled group; input fields attach to a group through their `section` argument. #}
{% do SectionField(
    name='Data_Section',
    title='Load your Data',
    subtitle='Load your metadata and expression data in comma/tab separated formats. Genes should be in rows and samples should be in columns. Example files are downloadable here: <a href="https://chanzuckerberg.github.io/scRNA-python-workshop/preprocessing/00-tabula-muris.html"> link </a>',
    img='load.png'
) %}


{% do SectionField(
    name='Visualization_Section',
    title='Select Visualization Parameters',
    subtitle='',
    img='select.png'
) %}

In [None]:
%%appyter code_exec
{# Input fields of the 'Data_Section' form group: two uploaded files plus the metadata column that holds the class labels. #}
{% set meta_data_filename = FileField(
    name='meta_data_filename', 
    label='Meta data file (.csv or .txt)', 
    default='mouse_brain_cells_metadata.csv',

    examples={'mouse_brain_cells_metadata.csv': "https://appyters.maayanlab.cloud/storage/dimensionality_reduction_visualization/mouse_brain_cells_metadata.csv"}, 
    description='Upload metadata as two-column comma separated or tab separated format. One column contains sample ids and the other column contains sample labels', 
    section='Data_Section')

%}
{% set rnaseq_data_filename = FileField(
    name='rnaseq_data_filename', 
    label='RNA-seq data file (.csv or .txt)', 
    default='mouse_brain_cells_gene_counts.csv',
    examples={'mouse_brain_cells_gene_counts.csv': "https://appyters.maayanlab.cloud/storage/dimensionality_reduction_visualization/mouse_brain_cells_gene_counts.csv"}, 
    description='Upload RNA-seq expression data as comma separated or tab separated format. The index of the dataset are genes, the columns are samples.', 
    section='Data_Section')

%}
{% set meta_class_column_name = StringField(
    name='meta_class_column_name', 
    label='Class column name in metadata', 
    default='cell_ontology_class', 
    description='class column name of metadata', 
    section='Data_Section')
%}


In [None]:
%%appyter code_exec
{# Input fields of the 'Visualization_Section' form group: plot dimensionality and reduction method. #}
{# NOTE(review): the '3D and 2D' label maps to the value '3D'; the plotting cells in this notebook branch on == "2D" / == "3D", so that choice appears to render only the 3D plots - confirm this is intended. #}
{% set plot_3D = ChoiceField(
    name='3D_plot', 
    label='3D plots?', 
    choices = {'2D': '2D', '3D and 2D': '3D'},
    default='3D and 2D', 
    description='Check if User wants 3D plots', 
    section='Visualization_Section')
%}

{# 'All' makes every method-specific plotting cell render; the other values select a single method. #}
{% set visualization_method = ChoiceField(
    name='visualization_method', 
    label='Visualization Methods', 
    choices = {'All': 'All','PCA': 'PCA', 'UMAP': 'UMAP', 't-SNE': 't-SNE'},
    default='All', 
    description='Select a visualization method', 
    section='Visualization_Section')
%}


In [None]:
%%appyter code_exec
# Values substituted by Appyter from the form fields declared in the cells above.
rnaseq_data_filename = {{rnaseq_data_filename}}
meta_data_filename = {{meta_data_filename}}
meta_class_column_name = {{meta_class_column_name}}

# "2D" or "3D"; the model_*_feature helpers use it to choose 2 or 3 output dimensions.
plot_3D = "{{plot_3D}}"
# Number of principal components fitted for the 2D PCA cell (passed to PCA as n_components).
clustering_topk = 50
method = "{{visualization_method}}"


# PCA, t-SNE, and UMAP Appyter

This Appyter [1] loads data in a matrix format, applies dimensionality reduction algorithms to the data, and then visualizes the dimensionality-reduced data with static and interactive 2D and 3D PCA [2], t-SNE [3], and UMAP [4] plots.

## PCA
Principal Component Analysis (PCA) [2] is a statistical technique used to identify global patterns in high-dimensional datasets. It is commonly used to explore the similarity of biological samples in omics datasets. To perform PCA, variable values are transformed into Principal Components (PCs), a set of linearly orthogonal features which represent the most relevant sources of variance in the data. The top PCs are subsequently visualized using a scatter plot for each variable to estimate similarities and differences between the variables in higher dimensions.


## t-SNE
t-distributed stochastic neighbor embedding (t-SNE) [3] is a statistical method used to identify global patterns in high-dimensional datasets. It is a non-linear dimensionality reduction technique that embeds points from a higher-dimensional space into a lower-dimensional one while trying to preserve the neighborhood of each point. It then assigns each data point a location in a two- or three-dimensional space, which is visualized using a scatter plot.

## UMAP
Uniform Manifold Approximation and Projection (UMAP) [4] is a statistical method used to identify global patterns in high-dimensional datasets. It creates a graph that accurately reflects the topology of the true high dimensional graph. It then calculates the weight for edges of this graph and builds the low dimensional graph.

## Analysis Overview
The normalized input data matrix first undergoes dimensionality reduction using PCA [2], t-SNE [3], and UMAP [4] via the scikit-learn, MulticoreTSNE, and umap-learn Python libraries. Columns are then clustered based on their most-associated highly-variable variables and metadata features. Clusters are visualized using the Bokeh package [5] for the 2D plots, and the React-Scatter-Board package [6] developed by the Ma'ayan lab for interactive visualization of the 3D plots.



# Read Data

In [None]:
def load_file(filename, **kwargs):
    """Read a delimited text file into a DataFrame sorted by its index.

    Parameters
    ----------
    filename : str or path-like
        Path to the file. The extension (matched case-insensitively) selects
        the delimiter: ``.csv`` uses the pandas default (comma), ``.txt`` and
        ``.tsv`` use a tab.
    **kwargs
        Forwarded unchanged to ``pandas.read_csv`` (e.g. ``index_col=0``).

    Returns
    -------
    pandas.DataFrame
        The parsed table after ``sort_index()``.

    Raises
    ------
    ValueError
        If the extension is not one of csv, txt or tsv.
    """
    # Compare on a lower-cased string so "DATA.CSV" and pathlib.Path inputs are accepted too.
    lowered = str(filename).lower()
    if lowered.endswith(".csv"):
        temp_df = pd.read_csv(filename, **kwargs)
    elif lowered.endswith((".txt", ".tsv")):
        temp_df = pd.read_csv(filename, sep="\t", **kwargs)
    else:
        # ValueError is still an Exception, so existing `except Exception` handlers keep working.
        raise ValueError('Error! Please load file in txt, tsv or csv format')

    return temp_df.sort_index()

# Expression matrix: the first file column becomes the index, and load_file sorts the rows by it.
expr_df = load_file(rnaseq_data_filename, index_col=0)
# Metadata is read without index_col, so it keeps pandas' default integer index.
# NOTE(review): expr_df rows are re-ordered by sort_index() while meta_df rows stay in file order,
# and the plotting helpers pair labels with reduced rows by position - confirm both files list
# the samples in the same (sorted) order.
meta_df = load_file(meta_data_filename)

# Fail early if the user-selected class column is missing from the metadata.
assert meta_class_column_name in meta_df.columns, f"Error! Column '{meta_class_column_name}' is not in metadata"

In [None]:
# Preview the first rows of the metadata table.
meta_df.head()

In [None]:
# Preview the first rows of the expression matrix.
expr_df.head()

# Visualize Samples

In [None]:
def diplay_caption(counter, caption):
    """Show an italic Markdown caption reading "Figure <counter>. <caption>" below a plot."""
    caption_markup = "*Figure {}. {}*".format(counter, caption)
    rendered = Markdown(caption_markup)
    display(rendered)
    
def create_df(transformed_data):
    """Build the 2D plotting table from reduced coordinates.

    The result has a ``y`` column holding the class labels taken from
    ``meta_df[meta_class_column_name]`` and ``pc1`` / ``pc2`` columns holding
    the first two columns of ``transformed_data`` (assigned by position).
    """
    class_labels = meta_df[meta_class_column_name]
    plot_columns = (
        ("y", class_labels),
        ("pc1", transformed_data[:, 0]),
        ("pc2", transformed_data[:, 1]),
    )
    table = pd.DataFrame()
    for column_name, column_values in plot_columns:
        table[column_name] = column_values
    return table
    
def create_df_3D(transformed_data):
    """Convert reduced coordinates into the record list consumed by ReactScatterBoard.

    The coordinates are standardised per column with ``StandardScaler`` and the
    first three columns become ``x``, ``y`` and ``z``. Each record also carries
    the class label from ``meta_df[meta_class_column_name]`` under ``Type`` and a
    fixed ``opacity`` of 0.7.

    Parameters
    ----------
    transformed_data : array-like with at least three columns
        Output of one of the dimensionality-reduction models.

    Returns
    -------
    list of dict
        One ``{x, y, z, Type, opacity}`` dict per row.

    Raises
    ------
    ValueError
        If the number of rows differs from the number of metadata labels.
    IndexError
        If fewer than three columns are supplied.
    """
    X_scaled = StandardScaler().fit_transform(transformed_data)
    labels = meta_df[meta_class_column_name].tolist()

    # zip() would silently truncate, so keep the explicit failure on mismatched lengths.
    if len(labels) != len(X_scaled):
        raise ValueError(
            "Length of values ({}) does not match length of index ({})".format(len(X_scaled), len(labels))
        )

    # Column-wise .tolist() yields plain Python floats and leaves the labels untouched;
    # DataFrame.iterrows() is slow and coerces every row to a single dtype
    # (e.g. integer class labels would become floats).
    xs, ys, zs = (X_scaled[:, axis].tolist() for axis in range(3))
    return [
        dict(x=x, y=y, z=z, Type=label, opacity=0.7)
        for x, y, z, label in zip(xs, ys, zs, labels)
    ]


In [None]:
def display_PCA_variance(pca):
    """Build a Bokeh line plot of cumulative explained variance for a fitted PCA.

    Parameters
    ----------
    pca : fitted estimator exposing ``explained_variance_ratio_``

    Returns
    -------
    bokeh figure
        x is the number of principal components (starting at 1) and y is the
        cumulative proportion of variance explained by that many components.
    """
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    # Entry i of the cumulative sum covers i + 1 components, so the x axis starts at 1, not 0.
    component_counts = np.arange(1, len(cumulative_variance) + 1)

    source = ColumnDataSource(data=dict(
        x=component_counts,
        y=cumulative_variance,
    ))

    TOOLTIPS = [
        ("index", "$index"),
        ("(x,y)", "($x, $y)"),
    ]

    p = figure(width=500, height=400, tooltips=TOOLTIPS, x_axis_label='principal components', 
               y_axis_label='cumulative proportion of variance')
    p.line('x', 'y', source=source)
    p.output_backend = "svg"
    return p

def model_PCA_feature():
    """Fit PCA on ``expr_df`` and return the projected coordinates.

    Three components are kept when ``plot_3D`` equals "3D", otherwise two.
    """
    n_dims = 2
    if plot_3D == "3D":
        n_dims = 3
    return PCA(n_components=n_dims).fit_transform(expr_df)

def model_t_SNE_feature():
    """Fit multicore t-SNE on ``expr_df`` and return the embedded coordinates.

    The embedding has three dimensions when ``plot_3D`` equals "3D", otherwise
    two; ``n_jobs`` is set to the value reported by ``mp.cpu_count()``.
    """
    n_dims = 2
    if plot_3D == "3D":
        n_dims = 3
    embedder = TSNE(n_jobs=mp.cpu_count(), n_components=n_dims)
    return embedder.fit_transform(expr_df)

def model_Umap_feature():
    """Fit UMAP on ``expr_df`` and return the embedded coordinates.

    The embedding has three dimensions when ``plot_3D`` equals "3D", otherwise two.
    """
    n_dims = 2
    if plot_3D == "3D":
        n_dims = 3
    reducer = umap.UMAP(n_components=n_dims)
    return reducer.fit_transform(expr_df)

def interactive_circle_plot(input_df, x_lab, y_lab):
    """Build an interactive Bokeh scatter plot coloured by class label.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain ``pc1`` and ``pc2`` (coordinates) and ``y`` (class label).
    x_lab, y_lab : str
        Axis labels.

    Returns
    -------
    bokeh figure
        Scatter plot with hover tooltips and a legend placed to the right.
    """
    # Bokeh categorical colour mapping works on string factors, so normalise the labels once
    # and use the same string values for both the data column and the factor list.
    labels = input_df["y"].astype(str)
    factors = labels.unique().tolist()

    source = ColumnDataSource(data=dict(
        x=input_df["pc1"],
        y=input_df["pc2"],
        column=labels,
    ))

    TOOLTIPS = [
        ("index", "$index"),
        ("(x,y)", "($x, $y)"),
        ("column", "@column"),
    ]

    p = figure(width=800, height=400, tooltips=TOOLTIPS,x_axis_label=x_lab, y_axis_label=y_lab)

    # Category20 only has 20 colours; Bokeh maps factors beyond the palette length to the
    # mapper's nan_color, so repeat the palette cyclically when there are more classes.
    base_palette = Category20[20]
    if len(factors) > len(base_palette):
        palette = [base_palette[i % len(base_palette)] for i in range(len(factors))]
    else:
        palette = base_palette

    color = factor_cmap('column', palette=palette, factors=factors)
    p.circle('x', 'y', size=3, source=source, legend_group="column",
             fill_color= color, line_color=color)

    # Move the legend out of the plotting area so it does not cover points.
    p.add_layout(p.legend[0], 'right')
    p.output_backend = "svg"
    return p


In [None]:
%%appyter code_exec
counter = 1

{% if plot_3D.value == "2D" and (visualization_method.value == "All" or visualization_method.value == "PCA") %}
pca = PCA(n_components=clustering_topk)
transformed_pca = pca.fit_transform(expr_df)

p1 = display_PCA_variance(pca)    
p2 = interactive_circle_plot(create_df(transformed_pca), "PC-1", "PC-2")

show(p1)
caption = "Cumulative proportion of variance explained as a function of the number of principal components. If the cumulative proportion of variance equal to 1, it explains 100% of the variance within the data."
diplay_caption(counter, caption)

counter += 1

show(p2)
caption = "PCA 2D scatter plot when the select feature to highlight is {}. Each point presents a column from the input data matrix.".format(meta_class_column_name)
diplay_caption(counter, caption)
{% endif %}

{% if plot_3D.value == "2D" and (visualization_method.value == "All" or visualization_method.value == "t-SNE") %}
transformed_df = model_t_SNE_feature()
p = interactive_circle_plot(create_df(transformed_df), "t-SNE-1", "t-SNE-2")
counter += 1
show(p)
caption = "t-SNE 2D scatter plot when the select feature to highlight is {}. Each point presents a column from the input data matrix.".format(meta_class_column_name)
diplay_caption(counter, caption)
{% endif %}

{% if plot_3D.value == "2D" and (visualization_method.value == "All" or visualization_method.value == "UMAP") %}
transformed_df = model_Umap_feature()
p = interactive_circle_plot(create_df(transformed_df),"UMAP-1","UMAP-2")
counter += 1
show(p)
caption = "UMAP 2D scatter plot when the select feature to highlight is {}. Each point presents a column from the input data matrix.".format(meta_class_column_name)
diplay_caption(counter, caption)
{% endif %}

In [None]:
%%appyter code_exec
{% if plot_3D.value == "3D" and (visualization_method.value == "All" or visualization_method.value == "PCA") %}
# 3D PCA view: model_PCA_feature() picks the component count from plot_3D, and
# create_df_3D() turns the coordinates into records whose 'Type' key drives the colouring.
transformed_df = model_PCA_feature()

display(ReactScatterBoard(
  is3d=True,
  data=create_df_3D(transformed_df),
  colorKey="Type",
  scale=5
))

# This branch captions with the current counter value and does not advance it.
caption = "PCA 3D scatter plot when the select feature to highlight is {}. Each point presents a column from the input data matrix. Scroll to zoom, drag to move around.".format(meta_class_column_name)
diplay_caption(counter, caption)
{% endif %}

In [None]:
%%appyter code_exec
{% if plot_3D.value == "3D" and (visualization_method.value == "All" or visualization_method.value == "t-SNE") %}
transformed_df = model_t_SNE_feature()

display(ReactScatterBoard(
  is3d=True,
  data=create_df_3D(transformed_df),
  colorKey="Type",
  scale=4
))

counter += 1
caption = "t-SNE 3D scatter plot when the select feature to highlight is {}. Each point presents a column from the input data matrix. Scroll to zoom, drag to move around.".format(meta_class_column_name)
diplay_caption(counter, caption)
{% endif %}

In [None]:
%%appyter code_exec
{% if plot_3D.value == "3D" and (visualization_method.value == "All" or visualization_method.value == "UMAP") %}
transformed_df = model_Umap_feature()

display(ReactScatterBoard(
  is3d=True,
  data=create_df_3D(transformed_df),
  colorKey="Type",
  scale=4
))

counter += 1
caption = "UMAP 3D scatter plot when the select feature to highlight is {}. Each point presents a column from the input data matrix. Scroll to zoom, drag to move around.".format(meta_class_column_name)
diplay_caption(counter, caption)
{% endif %}

## References



    4. Becht E, McInnes L, Healy J, Dutertre CA, Kwok IWH, Ng LG, Ginhoux F, Newell EW. Dimensionality reduction for visualizing single-cell data using UMAP. Nat Biotechnol. 2018 Dec 3.
    
    5. Bokeh Development Team (2018). Bokeh: Python library for interactive visualization URL http://www.bokeh.pydata.org.
    
    1. Clarke DJB, Jeon M, Stein DJ, Moiseyev N, Kropiwnicki E, Dai C, Xie Z, Wojciechowicz ML, Litz S, Hom J, Evangelista JE, Goldman L, Zhang S, Yoon C, Ahamed T, Bhuiyan S, Cheng M, Karam J, Jagodnik KM, Shu I, Lachmann A, Ayling S, Jenkins SL, Ma'ayan A. Appyters: Turning Jupyter Notebooks into data-driven web apps. Patterns (N Y). 2021 Mar 4;2(3):100213.
    
    2. Clark NR, Ma'ayan A. Introduction to statistical methods to analyze large data sets: principal components analysis. Sci Signal. 2011 Sep 6;4(190):tr3.
    
    3. Melit Devassy B, George S, Nussbaum P. Unsupervised Clustering of Hyperspectral Paper Data Using t-SNE. J Imaging. 2020 May 5;6(5):29.
    
    6. MaayanLab. react-scatter-board: Python library for interactive 3D scatter plots. URL https://github.com/MaayanLab/react-scatter-board
