In [None]:
#%%appyter init
from appyter import magic
# Initialise appyter, passing a callable that returns this notebook's globals().
# NOTE(review): presumably this is what enables the %%appyter cell magics used below — confirm against the appyter docs.
magic.init(lambda _=globals: _())

In [None]:
# Basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import multiprocessing as mp
try: from MulticoreTSNE import MulticoreTSNE as TSNE
except ImportError: from sklearn.manifold import TSNE
import seaborn as sns
from IPython.display import HTML, display, Markdown

from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool, NumeralTickFormatter
from bokeh.transform import factor_cmap
from bokeh.models import Select
from bokeh.models.callbacks import CustomJS
from bokeh.layouts import column

import scanpy as sc

from react_scatter_board import ReactScatterBoard
from scipy.stats import zscore
import qnorm
import itertools

output_notebook() # have Bokeh render its plots inline in the notebook output

In [None]:
%%appyter hide_code_exec
{% do SectionField(
    name='Data_Section',
    title='Load Your Data',
    subtitle='Load your metadata and data matrix in comma/tab separated formats. Genes should be in rows and samples shoud be in columns.',
    img='load.png'
    
) %}


{% do SectionField(
    name='Visualization_Section',
    title='Select Visualization Parameters',
    subtitle='',
    img='select.png'
    
) %}

In [None]:
%%appyter code_exec
{% set rnaseq_data_filename = FileField(
    name='rnaseq_data_filename', 
    label='RNA-seq data file (.csv, .tsv, .tpm or .txt)', 
    default='Papillary+adenocarcinoma,+NOS_data.csv',
    examples={'Papillary+adenocarcinoma,+NOS_data.csv': "https://appyters.maayanlab.cloud/storage/dimensionality_reduction_visualization/Papillary+adenocarcinoma,+NOS_data.csv"}, 
    description='Upload RNA-seq data matrix as comma seperated or tab seperated format. The index of the dataset are cells, the columns are samples.', 
    section='Data_Section')

%}

{% set meta_data_filename = FileField(
    name='meta_data_filename', 
    label='Metadata file (.csv, .tsv, .tpm or .txt)', 
    default='Papillary+adenocarcinoma,+NOS_clinical_metadata.csv',
    examples={'Papillary+adenocarcinoma,+NOS_clinical_metadata.csv': "https://appyters.maayanlab.cloud/storage/dimensionality_reduction_visualization/Papillary+adenocarcinoma,+NOS_clinical_metadata.csv"}, 
    description='Upload metadata as two-column comma seperated or tab seperated format. One column contains sample ids and the other column contains sample labels', 
    section='Data_Section')

%}

{% set meta_class_column_name = StringField(
    name='meta_class_column_name', 
    label='Class column name in metadata', 
    default='case_id', 
    description='One of the column names in metadata that aligns with main data', 
    section='Data_Section')
%}


In [None]:
%%appyter code_exec

{% set need_transpose = BoolField(
    name='need_transpose', 
    label='Samples are the columns?',
    default='true',
    description='Select true if samples are columns and rows are genes', 
    section='Visualization_Section') 
%}

{% set need_normalize = BoolField(
    name='need_normalize', 
    label='Normalization?',
    default='true',
    description='Select true if you would like to normalize the data matrix', 
    section='Visualization_Section') 
%}

{% set need_leiden = BoolField(
    name='need_leiden', 
    label='Leiden graph for clustering?',
    default='true',
    description='Leiden graph automatic identifies clusters and color them', 
    section='Visualization_Section') 
%}

{% set plot_3D = ChoiceField(
    name='3D_plot', 
    label='3D plots?', 
    choices = {'2D': '2D', '3D': '3D', 'All':'All'},
    default='All', 
    description='Choose to have 2D, 3D, or all plots', 
    section='Visualization_Section')
%}

{% set visualization_method = ChoiceField(
    name='visualization_method', 
    label='Visualization Methods', 
    choices = {'All': 'All','PCA': 'PCA', 't-SNE': 't-SNE', 'UMAP': 'UMAP'},
    default='All', 
    description='Select a visualization method', 
    section='Visualization_Section')
%}


In [None]:
%%appyter code_exec
# Copy the rendered form-field values into plain Python variables so the
# non-templated cells further down can branch on them at run time.
rnaseq_data_filename = {{rnaseq_data_filename}}
meta_data_filename = {{meta_data_filename}}
meta_class_column_name = {{meta_class_column_name}}
need_leiden = {{need_leiden}}
need_transpose = {{need_transpose}}

# NOTE(review): not referenced anywhere else in the visible notebook — confirm before removing.
clustering_topk = 50

# PCA, t-SNE, and UMAP Appyter

This Appyter [1] loads data in a matrix format, performs dimensionality reduction on the data, and then visualizes the dimensionality-reduced data with static and interactive 2D and 3D PCA [2], t-SNE [3], and UMAP [4] plots.

In [None]:
%%appyter markdown
{% if visualization_method.value == "All" or visualization_method.value == "PCA" %}
## PCA
Principal Component Analysis (PCA) [2] is a statistical technique used to identify global patterns in high-dimensional datasets. It is commonly used to explore the similarity of biological samples in omics datasets. To perform PCA, variable values are transformed into Principal Components (PCs), a set of linearly orthogonal features which represent the most relevant sources of variance in the data. The top PCs are subsequently visualized using a scatter plot for each variable to estimate similarity and differences between the variables in higher dimensions.
{% endif %}

In [None]:
%%appyter markdown
{% if visualization_method.value == "All" or visualization_method.value == "t-SNE" %}
## t-SNE
t-distributed stochastic neighbor embedding (t-SNE) [3] is a statistical method used to identify global patterns in high-dimensional datasets. It is a non-linear dimensionality reduction technique that embeds points from a higher-dimensional space into a lower-dimensional one while trying to preserve the neighborhood of each point. It then provides each data point with a location in a two- or three-dimensional space, visualized using a scatter plot.
{% endif %}

In [None]:
%%appyter markdown
{% if visualization_method.value == "All" or visualization_method.value == "UMAP" %}
## UMAP
Uniform Manifold Approximation and Projection (UMAP) [4] is a statistical method used to identify global patterns in high-dimensional datasets. It creates a graph that accurately reflects the topology of the true high dimensional graph. It then calculates the weight for edges of this graph and builds the low dimensional graph.
{% endif %}

In [None]:
%%appyter markdown
{% if need_leiden.value %}
## Leiden Algorithm
Leiden algorithm [5] identifies well-connected clusters in networks. In this procedure, the appyter takes into account both modalities of the data by integrating connectivity graphs generated from each modality (ref https://scanpy-tutorials.readthedocs.io/en/multiomics/cite-seq/pbmc5k.html#Clustering). It visualizes the samples colored by clusters.
{% endif %}

## Analysis Overview
The input data matrix, after optional normalization, first undergoes dimensionality reduction using PCA [2], t-SNE [3], and UMAP [4] with Python libraries. Columns are then clustered based on their most-associated highly-variable variables and metadata features. Clusters are visualized using the Bokeh package [6] for the 2D plots, and the React-Scatter-Board package [7] developed by the Ma'ayan lab for interactive visualization of the 3D plots. In addition, users may choose to upload data without metadata and use the Leiden algorithm [5] to visualize clusters.

# Read Data

In [None]:
def load_file(filename, **kwargs):
    """Read a delimited text file into a DataFrame sorted by its index.

    The delimiter is picked from the file extension, which is compared
    case-insensitively (so ``DATA.CSV`` is accepted as well as ``data.csv``):
    ``.csv`` is read with pandas' default comma separator; ``.txt``, ``.tsv``
    and ``.tpm`` are read as tab-separated.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    **kwargs
        Forwarded unchanged to ``pandas.read_csv`` (e.g. ``index_col=0``).

    Returns
    -------
    pandas.DataFrame
        The parsed table after ``sort_index()``.

    Raises
    ------
    ValueError
        If the extension is not one of csv, txt, tsv or tpm.
    """
    lowered = filename.lower()
    if lowered.endswith(".csv"):
        temp_df = pd.read_csv(filename, **kwargs)
    elif lowered.endswith((".txt", ".tsv", ".tpm")):
        temp_df = pd.read_csv(filename, sep="\t", **kwargs)
    else:
        # ValueError is a subclass of Exception, so existing `except Exception` callers still work.
        raise ValueError('Error! Incorrect format. Please load file in txt, tsv, tpm, or csv format')

    return temp_df.sort_index()

In [None]:
# An empty filename means nothing was uploaded; stop before trying to read.
if rnaseq_data_filename == '':
    raise Exception('Please upload data matrix.')

# First column of the file becomes the row index.
expr_df = load_file(rnaseq_data_filename, index_col=0)
display(expr_df.head())

# When samples are the columns, flip the matrix so that column names are genes;
# then drop every column that contains at least one missing value.
expr_df = (expr_df.T if need_transpose else expr_df).dropna(axis='columns')

In [None]:
%%appyter code_exec
{% if meta_data_filename.value%}
meta_df = load_file(meta_data_filename)
assert meta_class_column_name in meta_df.columns, f"Error! Column '{meta_class_column_name}' is not in metadata"

# remove duplicates in index column
meta_df = meta_df.drop_duplicates(subset=[meta_class_column_name])

# align metedata and main data
temp = meta_df.merge(expr_df,left_on=meta_class_column_name, right_index=True)

df2 = temp[expr_df.columns].copy()
expr_df = df2.set_index(temp[meta_class_column_name])

df1 = temp[meta_df.columns].copy()
meta_df = df1.set_index(meta_class_column_name)

for i in range(meta_df.shape[0]):
    if expr_df.index[i] != meta_df.index[i]:
        raise Exception('Error! Metadata is not aligned with main data. Please check if you enter the correct metadata column name')
        
# category NAN value as 'missing' label
for feature in meta_df.columns:
    if meta_df[feature].isna().any():
        meta_df[feature].fillna('Missing', inplace =True)
        
display(meta_df.head())
{% endif %}

# Data Processing 

In [None]:
%%appyter markdown
{% if need_normalize.value %}
# Normalize Data
{% endif %}

In [None]:
%%appyter code_exec
{% if need_normalize.value%}
# Normalize Data
def log2_normalize(x, offset=1.):
    """Return the base-2 logarithm of ``x`` after adding ``offset``.

    Parameters
    ----------
    x : scalar, array or DataFrame
        Values to transform; anything ``numpy.log2`` accepts.
    offset : float, default 1.0
        Constant added to ``x`` before taking the logarithm.
    """
    shifted = x + offset
    return np.log2(shifted)

# log2(x + 1) transform followed by quantile normalization along axis 0
df_data_norm = qnorm.quantile_normalize(log2_normalize(expr_df, offset=1), axis=0)

# convert to z-scores along axis 0, keeping the original row/column labels
zscored_values = zscore(df_data_norm, axis=0)
expr_df = pd.DataFrame(
    zscored_values,
    index=df_data_norm.index,
    columns=df_data_norm.columns,
)


# check if data have been normalized correctly:
# average, over columns, of the percentage of missing entries per column
missing_percent = np.mean(expr_df.isnull().mean() * 100)
print(f"Missing value is {missing_percent} %.")
if missing_percent > 10:
    raise Exception("Error in normalization! Too many NA values. Try to unselect normalization field.")
    
{% endif %}

In [None]:
%%appyter code_exec
{% if need_leiden.value %}
# Compute label and pca based on Leiden Algorithm 
leiden_df = sc.AnnData(expr_df,dtype=np.float32)

try:
    sc.pp.pca(leiden_df)
except:
    raise Exception("Error! Incorrect format. Please check if the data have been read/seperated correctly.")
    
sc.pp.neighbors(leiden_df)  
sc.tl.leiden(leiden_df, key_added="leiden")
{% endif %}

In [None]:
# Process metadata: build df_y, the per-sample label table used to colour the plots
df_y = pd.DataFrame()

if meta_data_filename:
    # exclude features with too many categories, only one category, or non string type
    feature_selection = [feature for feature in meta_df
                      if meta_df[feature].dtype == object and 1 < len(meta_df[feature].unique()) < 50]
    df_y = meta_df[feature_selection].copy()
    df_y = df_y.astype(str)

if need_leiden:
    df_y['leiden'] = list(leiden_df.obs['leiden'].values)
else:
    df_y['y'] = expr_df.index


for feature in df_y:
    # assign 'Unclassified' to low frequency categories: keep the 9 most
    # frequent values and collapse everything else into a single label
    if len(df_y[feature].unique()) > 10:
        top9 = df_y[feature].value_counts().iloc[:9].index
        # Single .loc call on the frame itself; the chained form
        # `df_y[feature].loc[mask] = ...` writes to a temporary under pandas
        # Copy-on-Write and would leave df_y unchanged.
        df_y.loc[~df_y[feature].isin(top9), feature] = 'Unclassified'


In [None]:
%%appyter code_exec
# Data transformation for 2D visualization
{% if visualization_method.value == "All" or visualization_method.value == "PCA" %}
# Normalize transformed data to have a better visualization on 3D plot
leiden_df.obsm['X_pca'] = zscore(leiden_df.obsm['X_pca'],axis=0)

pca_data = pd.DataFrame({'x':leiden_df.obsm['X_pca'][:,0],
                       'y':leiden_df.obsm['X_pca'][:,1],
                       'z':leiden_df.obsm['X_pca'][:,2]})
pca_df = df_y.reset_index(drop=True).join(pca_data).set_index(df_y.index)
{% endif %}

{% if visualization_method.value == "All" or visualization_method.value == "t-SNE" %}
n_jobs=mp.cpu_count()
tsne = TSNE(n_jobs = n_jobs, n_components=3)
leiden_df.obsm['X_tsne'] = tsne.fit_transform(leiden_df.obsm['X_pca'])
leiden_df.obsm['X_tsne'] = zscore(leiden_df.obsm['X_tsne'],axis=0)

tsne_data = pd.DataFrame({'x':leiden_df.obsm['X_tsne'][:,0],
                       'y':leiden_df.obsm['X_tsne'][:,1],
                       'z':leiden_df.obsm['X_tsne'][:,2]})
tsne_df = df_y.reset_index(drop=True).join(tsne_data).set_index(df_y.index)
{% endif %}

{% if visualization_method.value == "All" or visualization_method.value == "UMAP" %}
sc.tl.umap(leiden_df, n_components=3)
leiden_df.obsm['X_umap'] = zscore(leiden_df.obsm['X_umap'],axis=0)

umap_data = pd.DataFrame({'x':leiden_df.obsm['X_umap'][:,0],
                       'y':leiden_df.obsm['X_umap'][:,1],
                       'z':leiden_df.obsm['X_umap'][:,2]})
umap_df = df_y.reset_index(drop=True).join(umap_data).set_index(df_y.index)
{% endif %}

In [None]:
%%appyter code_exec
# Data processing for 3D visualization
def create_3D_df(df_2D):
    """Convert each row of ``df_2D`` into a dict keyed by column name, with an
    extra ``'opacity'`` entry fixed at 0.7, and return the dicts as a list."""
    points = []
    for _, row in df_2D.iterrows():
        point = dict(row)
        point['opacity'] = 0.7
        points.append(point)
    return points

{% if (plot_3D.value == "3D" or plot_3D.value == "All") and 
 (visualization_method.value == "All" or visualization_method.value == "PCA") %}
pca_df_3D = create_3D_df(pca_df)
{% endif %}

{% if (plot_3D.value == "3D" or plot_3D.value == "All") and 
 (visualization_method.value == "All" or visualization_method.value == "t-SNE") %}
tsne_df_3D = create_3D_df(tsne_df)
{% endif %}

{% if (plot_3D.value == "3D" or plot_3D.value == "All") and 
 (visualization_method.value == "All" or visualization_method.value == "UMAP") %}
umap_df_3D = create_3D_df(umap_df)
{% endif %}

# Visualize Samples

In [None]:
# Running figure number shared by every caption in the notebook; starts at 1.
counter = itertools.count(1)
def display_caption(caption):
    """Render ``caption`` as an italic Markdown line prefixed with the next figure number."""
    figure_number = next(counter)
    display(Markdown(f"*Figure {figure_number}. {caption}*"))

In [None]:
%%appyter code_exec
{% if plot_3D.value == "2D" or plot_3D.value == "All" %}

def generate_colors(input_df, feature):
    """Return a Bokeh ``factor_cmap`` spec for column ``feature``.

    The factors are the unique values of ``input_df[feature]`` and the palette
    is seaborn's current default colour palette converted to hex strings.
    """
    hex_palette = sns.color_palette().as_hex()
    categories = np.array(input_df[feature].unique())
    return factor_cmap(feature, palette=hex_palette, factors=categories)
    
def interactive_circle_plot(input_df, x_lab, y_lab, feature):
    """Build a 2D Bokeh scatter of the ``'x'`` / ``'y'`` columns of ``input_df``.

    Side effect: a ``'legend'`` column holding a copy of ``input_df[feature]``
    is added to (or overwritten on) ``input_df`` itself before the data source
    is created.

    Points are coloured by ``feature`` and given a legend when the notebook-level
    ``meta_data_filename`` or ``need_leiden`` is truthy; otherwise small
    uncoloured points are drawn.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain columns ``'x'``, ``'y'`` and ``feature``.
    x_lab, y_lab : str
        Axis labels.
    feature : str
        Column used for colour, legend and the "feature" tooltip row.

    Returns
    -------
    tuple
        ``(figure, glyph_renderer, ColumnDataSource)``.
    """
    input_df['legend'] = input_df[feature]
    source = ColumnDataSource(input_df)

    # Brace-quote the column name: Bokeh's bare `@name` form does not work for
    # column names that contain spaces, whereas `@{name}` does.
    TOOLTIPS = [
        ("index", "$index"),
        ("(x,y)", "($x, $y)"),
        ("feature", '@{' + feature + '}'),
    ]

    # Fixed marker size for small data sets; shrink gradually as the number of points grows.
    numberOfPoints = input_df.shape[0]
    point_size = 10 if numberOfPoints < 100 else 25*np.log(numberOfPoints)/np.sqrt(numberOfPoints)

    p = figure(height=400, tooltips=TOOLTIPS,x_axis_label=x_lab, y_axis_label=y_lab,sizing_mode="scale_width")

    if meta_data_filename or need_leiden:
        color = generate_colors(input_df, feature)
        p1 = p.circle('x', 'y', size=point_size, source=source, legend_field='legend',fill_color= color, line_color=color)
        # move the legend outside the plotting area
        p.add_layout(p.legend[0], 'right')
    else:
        p1 = p.circle('x', 'y', size=3, source=source)

    p.output_backend = "svg"
    p.xgrid.visible = False
    p.ygrid.visible = False
    p.xaxis.minor_tick_line_color = None
    p.yaxis.minor_tick_line_color = None
    p.xaxis.axis_label_text_font_size = '12pt'
    p.yaxis.axis_label_text_font_size = '12pt'
    p.xaxis[0].formatter = NumeralTickFormatter(format="0.0")
    p.yaxis[0].formatter = NumeralTickFormatter(format="0.0")

    return p, p1, source

def generate_plot(p, p_circle, source_data, input_df):
    """Wrap figure ``p`` in a layout with a drop-down that switches the colouring feature.

    Parameters
    ----------
    p : bokeh figure
        The figure returned by ``interactive_circle_plot``.
    p_circle : glyph renderer
        The scatter renderer whose fill/line colour is switched.
    source_data : ColumnDataSource
        Data source behind ``p_circle``.
    input_df : pandas.DataFrame
        Label columns plus the coordinate columns ``x``, ``y``, ``z`` and,
        if present, the helper column ``legend``.

    Returns
    -------
    bokeh layout
        A column containing the ``Select`` widget above the figure.
    """
    # Every column except the coordinates and the helper 'legend' column is a
    # selectable feature. Filtering by name (rather than slicing off the last
    # four columns) does not rely on 'legend' having been appended beforehand.
    reserved = ('x', 'y', 'z', 'legend')
    column_options = [col for col in input_df.columns if col not in reserved]

    my_colors = {feature: generate_colors(input_df, feature) for feature in column_options}

    arg = dict(colors=my_colors, plot=p_circle, source = source_data,
           data_category = input_df.reset_index().to_dict(orient='list'), feature = input_df.columns[0])

    select = Select(title="Plot to show:", value=column_options[0], options=column_options)
    # On selection: recolour the glyph, and overwrite both the 'legend' column
    # and the column the tooltip reads with the selected feature's values.
    # source.data is mutated in place, so signal the change explicitly.
    select.js_on_change("value", CustomJS(args=arg, code="""

    plot.glyph.line_color = colors[this.value]
    plot.glyph.fill_color = colors[this.value]
    source.data['legend'] = data_category[this.value]
    source.data[feature] = data_category[this.value]
    source.change.emit()

    """))

    layout = column(select, p)
    layout.sizing_mode = "stretch_both"
    return layout
{% endif %}

In [None]:
%%appyter code_exec
{% if (plot_3D.value == "2D" or plot_3D.value == "All") and 
(visualization_method.value == "All" or visualization_method.value == "PCA") %}
# 2D PCA plot
p, p_circle, source_data = interactive_circle_plot(pca_df, "PC-1", "PC-2", pca_df.columns[0])

if meta_data_filename or need_leiden:
    pca_plot = generate_plot(p, p_circle, source_data, pca_df)
    show(pca_plot)
else:
    show(p)
caption = "PCA 2D scatter plot. Each point presents a column from the input data matrix."
display_caption(caption)

{% endif %}

In [None]:
%%appyter code_exec
{% if (plot_3D.value == "2D" or plot_3D.value == "All") and 
 (visualization_method.value == "All" or visualization_method.value == "t-SNE") %}
# t-SNE scatter plot

p, p_circle, source_data = interactive_circle_plot(tsne_df, "t-SNE-1", "t-SNE-2", tsne_df.columns[0])

if meta_data_filename or need_leiden:
    tsne_plot = generate_plot(p, p_circle, source_data, tsne_df)
    show(tsne_plot)
else:
    show(p)

caption = "t-SNE 2D scatter plot. Each point presents a column from the input data matrix."
display_caption(caption)
{% endif %}

In [None]:
%%appyter code_exec
{% if (plot_3D.value == "2D" or plot_3D.value == "All") and 
 (visualization_method.value == "All" or visualization_method.value == "UMAP") %}
# Umap scatter plot

p, p_circle, source_data = interactive_circle_plot(umap_df, "UMAP-1", "UMAP-2", umap_df.columns[0])
if meta_data_filename or need_leiden:
    umap_plot = generate_plot(p, p_circle, source_data, umap_df)
    show(umap_plot)
else:
    show(p)

caption = "UMAP 2D scatter plot. Each point presents a column from the input data matrix."
display_caption(caption)
{% endif %}

In [None]:
%%appyter code_exec
{% if (plot_3D.value == "3D" or plot_3D.value == "All") and 
 (visualization_method.value == "All" or visualization_method.value == "PCA") %}
# PCA 3D scatter plot

display(ReactScatterBoard(
  is3d=True,
  data=pca_df_3D,
  shapeKey=pca_df.columns[0],
  colorKey=pca_df.columns[0],
  scale=4
))

caption = "PCA 3D scatter plot. Each point presents a column from the input data matrix. Scroll to zoom, drag to move around."

display_caption(caption)
{% endif %}

In [None]:
%%appyter code_exec
{% if (plot_3D.value == "3D" or plot_3D.value == "All") and (visualization_method.value == "All" or visualization_method.value == "t-SNE") %}
# t-SNE 3D scatter plot

display(ReactScatterBoard(
  is3d=True,
  data=tsne_df_3D,
  shapeKey=tsne_df.columns[0],
  colorKey=tsne_df.columns[0],
  scale=4
))

caption = "t-SNE 3D scatter plot. Each point presents a column from the input data matrix. Scroll to zoom, drag to move around."

display_caption(caption)
{% endif %}

In [None]:
%%appyter code_exec
{% if (plot_3D.value == "3D" or plot_3D.value == "All") and (visualization_method.value == "All" or visualization_method.value == "UMAP") %}
# Umap 3D scatter plot

display(ReactScatterBoard(
  is3d=True,
  data=umap_df_3D,
  shapeKey=umap_df.columns[0],
  colorKey=umap_df.columns[0],
  scale=4
))

caption = "UMAP 3D scatter plot. Each point presents a column from the input data matrix. Scroll to zoom, drag to move around."

display_caption(caption)
{% endif %}

# Reference



1. Clarke DJB, Jeon M, Stein DJ, Moiseyev N, Kropiwnicki E, Dai C, Xie Z, Wojciechowicz ML, Litz S, Hom J, Evangelista JE, Goldman L, Zhang S, Yoon C, Ahamed T, Bhuiyan S, Cheng M, Karam J, Jagodnik KM, Shu I, Lachmann A, Ayling S, Jenkins SL, Ma'ayan A. Appyters: Turning Jupyter Notebooks into data-driven web apps. Patterns (N Y). 2021 Mar 4;2(3):100213.
<br>
2. Clark NR, Ma'ayan A. Introduction to statistical methods to analyze large data sets: principal components analysis. Sci Signal. 2011 Sep 6;4(190):tr3.
<br>
3. Maaten, Laurens van der and Geoffrey E. Hinton. “Visualizing Data using t-SNE.” Journal of Machine Learning Research 9 (2008): 2579-2605.
<br>
4. McInnes, L., Healy, J., Saul, N. & Großberger, L. UMAP: uniform manifold approximation and projection. J. Open Source Softw. 3, 861 (2018).
<br>
5. Traag, Vincent A., Ludo Waltman, and Nees Jan van Eck. "From Louvain to Leiden: guaranteeing well-connected communities." Scientific reports 9.1 (2019): 1-12.
<br>
6. Bokeh Development Team (2018). Bokeh: Python library for interactive visualization. URL http://www.bokeh.pydata.org.
<br>
7. MaayanLab. react-scatter-board: Python library for interactive 3 Dimension plot. URL https://github.com/MaayanLab/react-scatter-board