In [None]:
#%%appyter init
from appyter import magic
# Initialise appyter's `magic` module. The lambda's default argument captures the
# builtin `globals`, so calling the lambda returns this notebook's global namespace.
# NOTE(review): presumably appyter uses that namespace to execute the rendered
# template cells below -- confirm against the appyter documentation.
magic.init(lambda _=globals: _())

# Single Cell Enrichment

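This notebook processes single-cell expression data: it normalizes the expression matrix, clusters the cells, computes differential expression for each cluster, and runs enrichment analysis on the resulting gene sets.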

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import plotly.express as px
import scipy.sparse as sp_sparse
import seaborn as sns
from matplotlib_venn import venn2
from geode import chdir
from sklearn.preprocessing import StandardScaler
from IPython.display import display
from umap import UMAP
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from collections import OrderedDict
from sklearn.metrics import silhouette_score
from maayanlab_bioinformatics.api import enrichr_link_from_genes, enrichr_get_top_results
from maayanlab_bioinformatics.normalization import log2_normalize, zscore_normalize, quantile_normalize, filter_by_expr
from maayanlab_bioinformatics.dge import characteristic_direction, up_down_from_characteristic_direction
from maayanlab_bioinformatics.utils import merge, fetch_save_read

## Configure analysis

In [None]:
%%appyter hide_code_exec

{# Form section grouping the three upload fields declared below. #}
{% do SectionField(
    name='INPUT',
    label='Upload your single-cell data',
    description='In various data formats, see file descriptions.',
) %}

{# Expression matrix upload; ships with a downloadable example file as its default. #}
{% set expression = FileField(
    name='expression_matrix',
    label='Expression matrix file',
    description='[REQUIRED] Expression matrix file need to be a csv or txt file with genes by samples matrix. The values in the matrix can be read counts or normalized read counts such as CPM, RPKM, FPKM, TPM and etc',
    default='biojupies_example_matrix.txt',
    examples={'biojupies_example_matrix.txt': 'https://amp.pharm.mssm.edu/biojupies/app/static/data/biojupies_example_matrix.txt'},
    section='INPUT',
) %}

{# Features upload; the empty default means later cells guarded by `features.value` are skipped unless a file is given. #}
{% set features = FileField(
    name='features',
    label='Features file',
    description='[OPTIONAL] Features file need to be a csv or txt file with the transcripts on the rows and attributes on the columns, `symbol` is mandatory if provided and should contain a gene symbol.',
    default='',
    section='INPUT',
) %}

{# Barcodes (per-sample metadata) upload; defaults to the example metadata file. #}
{% set barcodes = FileField(
    name='barcodes',
    label='Barcodes file',
    description='[OPTIONAL] Barcodes file need to be a csv or txt file with the samples on the rows and attributes on the columns.',
    default='biojupies_example_metadata.txt',
    examples={'biojupies_example_metadata.txt': 'https://amp.pharm.mssm.edu/biojupies/app/static/data/biojupies_example_metadata.txt'},
    section='INPUT',
) %}

{# Form section grouping the numeric analysis parameters declared below. #}
{% do SectionField(
    name='CONFIG',
    label='Configuration',
    description='Configure various parameters for the analysis',
) %}

{# Size of the per-cluster gene sets taken from differential expression (100-1000, default 250). #}
{% set top_n_genes = IntField(
    name='top_n_genes',
    label='Number of Genes',
    description='The number of \'top\' genes to use for differential expression',
    default=250,
    min=100,
    max=1000,
    section='CONFIG',
) %}

{# Number of enrichment terms kept per library query (1-100, default 5). #}
{% set top_n_results = IntField(
    name='top_n_results',
    label='Number of Top Enrichment Results',
    description='The number of \'top\' results to keep from enrichment analysis',
    default=5,
    min=1,
    max=100,
    section='CONFIG',
) %}

In [None]:
%%appyter code_exec

# random state for reproducible output
# (passed to PCA, UMAP and KMeans further down)
random_state = 42

# the single cell data
# (value is injected by the template from the `expression_matrix` field)
expression = {{ expression }}

# `features` is only defined when a features file was supplied; every later use
# is wrapped in the same template condition.
{% if features.value %}
features = {{ features }}
{% endif %}

# `barcodes` is only defined when a barcodes file was supplied; every later use
# is wrapped in the same template condition.
{% if barcodes.value %}
barcodes = {{ barcodes }}
{% endif %}

# The number of 'top' genes to use for differential expression
top_n_genes = {{ top_n_genes }}

# The number of 'top' results to keep from enrichment analysis
top_n_results = {{ top_n_results }}

# TODO: add enrichr libraries as categories as fields
# Enrichr libraries to query, grouped by the category label that is attached
# to each result row in the enrichment step.
useful_libs = OrderedDict([
  ('cell_type', ['Human_Gene_Atlas', 'Mouse_Gene_Atlas', 'ARCHS4_Tissues']),
  ('pathways', ['WikiPathways_2019_Mouse', 'WikiPathways_2019_Human']),
  ('transcription', ['ARCHS4_TFs_Coexp', 'ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X']),
])

## Fetch and load data

In [None]:
%%appyter code_exec

def load_dataframe(file):
    ''' Load a delimited table into a DataFrame, downloading it first if needed.

    If `file` does not exist locally it is fetched from the session's public
    URL and saved under the same name, so the read below (and any later call)
    finds it on disk.

    Parameters:
        file: path of the table. The extension selects the parser:
              `.tsv` / `.txt` are read as tab-separated, `.csv` as
              comma-separated (matched case-insensitively).

    Returns:
        pandas.DataFrame whose first file column is the index; index and
        column labels are converted to `str`.

    Raises:
        Exception: when the extension is not one of the supported formats.
    '''
    if not os.path.exists(file):
        import urllib.request
        # Pass the destination explicitly: without it urlretrieve stores the
        # download in a temporary file and `file` would still be missing.
        urllib.request.urlretrieve(f"{{ url_for(_session, filename='', public=True) }}/{file}", file)

    ext = os.path.splitext(file)[1]
    # Compare in lower case so that e.g. `.CSV` or `.TXT` uploads are accepted
    ext_lower = ext.lower()
    if ext_lower in {'.tsv', '.txt'}:
        df = pd.read_csv(file, sep='\t', index_col=0)
    elif ext_lower == '.csv':
        df = pd.read_csv(file, index_col=0)
    else:
        raise Exception('Unrecognized file format', ext)

    # Undo any type coercion on identifiers (numeric-looking ids stay strings)
    df.index = df.index.astype(str)
    df.columns = df.columns.astype(str)

    return df

In [None]:
%%appyter code_exec

df_expression = load_dataframe(expression)
display(df_expression.head())

{% if features.value %}
df_features = load_dataframe(features)
display(df_expression.head())
{% endif %}

{% if barcodes.value %}
df_barcodes = load_dataframe(barcodes)
display(df_barcodes.head())
{% endif %}

In [None]:
%%appyter code_exec
{% if barcodes.value %}
venn2([
    set(df_expression.columns),
    set(df_barcodes.index),
], [
    'expression matrix (column)',
    'barcodes (index)',
])
plt.show()
assert set(df_expression.columns) & set(df_barcodes.index), "There should be overlap or we won't have any barcodes!"
{% endif %}

In [None]:
%%appyter code_exec
{% if features.value %}
venn2([
    set(df_expression.index),
    set(df_features.index),
], [
    'expression matrix (index)',
    'features (index)',
])
plt.show()
assert set(df_expression.index) & set(df_features.index), "There should be overlap or we won't have any features!"
{% endif %}

In [None]:
%%appyter markdown
{% if features.value %}
## Map transcripts to Genes
{% endif %}

In [None]:
%%appyter code_exec
{# TODO: allow organism to be configured #}
{% if features.value %}
# Get NCBI Gene information
ncbi = pd.read_csv('ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz', sep='\t')
# Ensure nulls are treated as such
ncbi = ncbi.applymap(lambda v: float('nan') if type(v) == str and v == '-' else v)
# Break up lists
split_list = lambda v: v.split('|') if type(v) == str else []
ncbi['dbXrefs'] = ncbi['dbXrefs'].apply(split_list)
ncbi['Synonyms'] = ncbi['Synonyms'].apply(split_list)
ncbi['LocusTag'] = ncbi['LocusTag'].apply(split_list)
ncbi['Other_designations'] = ncbi['Other_designations'].apply(split_list)

# Map existing entities to NCBI Genes
ncbi_lookup = {
  sym.upper(): row['Symbol'].upper()
  for _, row in ncbi.iterrows()
  for sym in [row['Symbol']] + row['Synonyms']
}
{% endif %}

In [None]:
%%appyter code_exec
{% if features.value %}
# Select transcripts with highest variance corresponding to genes
df_transcript_genes = merge(
  df_expression.var(axis=1).to_frame('var'),
  df_features[['symbol']].applymap(lambda s: str(ncbi_lookup.get(s.upper())))
).groupby('symbol')['var'].idxmax().reset_index()
df_transcript_genes.index = df_transcript_genes['var']
df_transcript_genes = df_transcript_genes.drop('var', axis=1)
df_transcript_genes
{% endif %}

### Obtain a gene expression matrix

In [None]:
%%appyter code_exec
{% if features.value %}
# Keep only the transcript selected for each gene, then relabel the rows with
# the corresponding gene symbols (row order follows df_transcript_genes).
df_gene_expression = df_expression.loc[df_transcript_genes.index]
df_gene_expression.index = df_transcript_genes['symbol']
{% else %}
# No features file: the expression matrix rows are used as-is.
# Note this is an alias of df_expression, not a copy.
df_gene_expression = df_expression
{% endif %}

## Normalize Gene Expression Matrix

### Review existing library size and distribution

In [None]:
# Per-sample (column) library statistics.
# The two definitions used to be swapped: 'n_reads' held the number of expressed
# genes and 'n_expressed_genes' held the summed expression.
n_reads = df_gene_expression.sum()
df_library_size = pd.DataFrame(
    {
        # total expression summed over all genes of the sample
        'n_reads': n_reads,
        # +1 keeps the logarithm finite for samples without any reads
        'log_n_reads': np.log2(n_reads + 1),
        # number of genes with a strictly positive value in the sample
        'n_expressed_genes': (df_gene_expression > 0).sum(),
    }
).sort_values('n_reads', ascending=False)

display(df_library_size.head())
# Distribution of the first gene across samples, then of the first sample across genes
sns.distplot(df_gene_expression.iloc[0, :]); plt.show()
sns.distplot(df_gene_expression.iloc[:, 0]); plt.show()

### Perform normalization

In [None]:
%%appyter code_exec
{# TODO: make configurable #}

# Normalization pipeline; each step consumes the previous step's output, so the
# order matters.
# NOTE(review): the helpers come from maayanlab_bioinformatics; their exact
# behaviour (thresholds, axis conventions) is not visible here -- confirm there.
df_gene_expression_norm = filter_by_expr(df_gene_expression)
df_gene_expression_norm = log2_normalize(df_gene_expression_norm)
# z-score is applied to the transposed matrix and the result is transposed back
df_gene_expression_norm = zscore_normalize(df_gene_expression_norm.T).T
df_gene_expression_norm = quantile_normalize(df_gene_expression_norm)

display(df_gene_expression_norm.head())

### Review normalized count distributions

In [None]:
%%appyter code_exec
{# TODO: potentially evaluate kurtosis and warn about problems with normalization #}
sns.distplot(df_gene_expression_norm.iloc[0, :]); plt.show()
sns.distplot(df_gene_expression_norm.iloc[:, 0]); plt.show()

## Dimensionality Reduction & Visualization

### PCA

In [None]:
%%appyter code_exec
# PCA is fitted on the transposed matrix, i.e. with samples on the rows
samples_by_genes = df_gene_expression_norm.values.T

gene_expression_norm_pca = PCA(random_state=random_state)
gene_expression_norm_pca.fit(samples_by_genes)

# Label every component with its position and explained variance ratio
pca_component_labels = [
    f'PCA-{c} ({r:.3f})'
    for c, r in enumerate(gene_expression_norm_pca.explained_variance_ratio_)
]

# Project the samples onto the components, keeping the sample identifiers
df_gene_expression_norm_pca = pd.DataFrame(
    gene_expression_norm_pca.transform(samples_by_genes),
    index=df_gene_expression_norm.columns,
    columns=pca_component_labels,
)
display(df_gene_expression_norm_pca.head())

In [None]:
# First two principal components; marker size follows the 'n_reads' column
pca_with_library_size = merge(
  df_gene_expression_norm_pca,
  df_library_size,
)
pca_x_axis = df_gene_expression_norm_pca.columns[0]
pca_y_axis = df_gene_expression_norm_pca.columns[1]

pca_figure = px.scatter(
  pca_with_library_size,
  x=pca_x_axis,
  y=pca_y_axis,
  size='n_reads',
  size_max=8,
  hover_data=[df_gene_expression_norm.columns],
)
display(pca_figure)

### UMAP

In [None]:
%%appyter code_exec
{# TODO: make configurable #}
gene_expression_norm_umap = UMAP(
  random_state=random_state,
  n_components=2,
  n_neighbors=30,
  metric='cosine',
  min_dist=0.3,
)
gene_expression_norm_umap.fit(df_gene_expression_norm_pca.iloc[:, :10].values)

df_gene_expression_norm_umap = pd.DataFrame(
  gene_expression_norm_umap.transform(df_gene_expression_norm_pca.iloc[:, :10].values),
  columns=['UMAP-1', 'UMAP-2'],
  index=df_gene_expression_norm_pca.index,
)

In [None]:
# UMAP embedding; marker size follows the 'n_reads' column
umap_with_library_size = merge(
  df_gene_expression_norm_umap,
  df_library_size,
)
umap_x_axis = df_gene_expression_norm_umap.columns[0]
umap_y_axis = df_gene_expression_norm_umap.columns[1]

umap_figure = px.scatter(
  umap_with_library_size,
  x=umap_x_axis,
  y=umap_y_axis,
  size='n_reads',
  size_max=8,
  hover_data=[df_gene_expression_norm.columns],
)
display(umap_figure)

## Silhouette Cluster Analysis

In [None]:
# Score a KMeans clustering of the UMAP embedding for each candidate cluster count
umap_coordinates = df_gene_expression_norm_umap.values
max_clusters_exclusive = min(df_gene_expression_norm_umap.shape[0] - 1, 25)

score_records = []
for n in range(2, max_clusters_exclusive):
    np.random.seed(0)
    y_pred = KMeans(n_clusters=n, random_state=random_state).fit_predict(umap_coordinates)
    score_records.append({
        'N Clusters': n,
        'Silhouette Score': silhouette_score(umap_coordinates, y_pred, metric='cosine'),
    })

silhouette_scores = pd.DataFrame(score_records)
# After an ascending sort the last row carries the highest silhouette score
best = silhouette_scores.sort_values('Silhouette Score').iloc[-1]
silhouette_scores

In [None]:
# Silhouette score per cluster count, with the selected count highlighted
fig, ax = plt.subplots()
ax.plot(silhouette_scores['N Clusters'], silhouette_scores['Silhouette Score'])
ax.scatter([best['N Clusters']], [best['Silhouette Score']], label='Best')
ax.legend()
ax.set_title('Cluster size selection')
ax.set_ylabel('Silhouette Score')
ax.set_xlabel('Number of Clusters')
plt.show()

In [None]:
# Final clustering with the best-scoring number of clusters
n_best_clusters = int(best['N Clusters'])
km = KMeans(n_clusters=n_best_clusters, random_state=random_state)
cluster_labels = km.fit_predict(df_gene_expression_norm_umap.values)

# Labels are stored as strings, one row per sample of the UMAP frame
df_gene_expression_norm_km = pd.DataFrame(
    {'Cluster': list(map(str, cluster_labels))},
    index=df_gene_expression_norm_umap.index,
)

### PCA with Clusters

In [None]:
# First two principal components, coloured by cluster and sized by 'n_reads'
pca_with_clusters = merge(
  df_gene_expression_norm_pca,
  df_gene_expression_norm_km,
  df_library_size,
)
pca_axes = dict(
  x=df_gene_expression_norm_pca.columns[0],
  y=df_gene_expression_norm_pca.columns[1],
)
px.scatter(
  pca_with_clusters,
  size='n_reads',
  size_max=8,
  color='Cluster',
  hover_data=[df_gene_expression_norm.columns],
  **pca_axes,
)

### UMAP with Clusters

In [None]:
# UMAP embedding, coloured by cluster and sized by 'n_reads'
umap_with_clusters = merge(
  df_gene_expression_norm_umap,
  df_gene_expression_norm_km,
  df_library_size,
)
umap_axes = dict(
  x=df_gene_expression_norm_umap.columns[0],
  y=df_gene_expression_norm_umap.columns[1],
)
px.scatter(
  umap_with_clusters,
  size='n_reads',
  size_max=8,
  color='Cluster',
  hover_data=[df_gene_expression_norm.columns],
  **umap_axes,
)

## Differential Expression

We perform differential expression for each cluster in a one vs rest fashion.

In [None]:
# Perform differential expression for each cluster (one cluster vs. all other samples)
top_genes = {}
all_samples = df_gene_expression_norm.columns
for cluster, samples in df_gene_expression_norm_km.groupby('Cluster'):
  in_cluster = samples.index
  out_of_cluster = all_samples.difference(in_cluster)
  signature = characteristic_direction(
    # first argument: expression outside of this cluster
    df_gene_expression_norm.loc[:, out_of_cluster],
    # second argument: expression in this cluster
    df_gene_expression_norm.loc[:, in_cluster],
  )
  top_genes[cluster] = up_down_from_characteristic_direction(signature, top_n_genes)

display(top_genes)

## Enrichment Analysis

### We submit differentially expressed genes to Enrichr.

In [None]:
# Submit the up and down gene sets of every cluster to Enrichr and keep the links
enrichr_links = {}

for cluster, genes in top_genes.items():
  # a direction without genes keeps None and is skipped when results are fetched
  cluster_links = {'up': None, 'down': None}

  if len(genes.up) > 0:
    cluster_links['up'] = enrichr_link_from_genes(sorted(genes.up), 'cluster %s up' % (cluster))
  else:
    print('cluster %s up: empty' % (cluster))

  if len(genes.down) > 0:
    cluster_links['down'] = enrichr_link_from_genes(sorted(genes.down), 'cluster %s down' % (cluster))
  else:
    print('cluster %s down: empty' % (cluster))

  enrichr_links[cluster] = cluster_links

pd.DataFrame(enrichr_links)

### Grab top results from Enrichr

In [None]:
# Grab top results for each cluster, direction and library
all_results = []
for cluster, links in enrichr_links.items():
  for link_type, link in links.items():
    if link is None:
      # nothing was submitted for this direction
      continue
    for category, libraries in useful_libs.items():
      for library in libraries:
        try:
          results = (
            enrichr_get_top_results(link['userListId'], library)
            .sort_values('pvalue')
            .iloc[:top_n_results]
            # assign() returns a new frame, so the slice above is never written to
            .assign(
              link=link['link'],
              library=library,
              category=category,
              direction=link_type,
              cluster=cluster,
            )
          )
        except Exception as error:
          # Best effort: one failing library must not abort the whole scan, but
          # report why it failed. `Exception` (rather than a bare except) lets
          # KeyboardInterrupt / SystemExit propagate.
          print('{}: {} {} {} cluster {} failed, continuing'.format(link, library, category, link_type, cluster))
          print('  reason: {!r}'.format(error))
          continue
        all_results.append(results)

if all_results:
  df_all_results = pd.concat(all_results)
else:
  # pd.concat raises on an empty list; fall back to an empty table so the
  # export step below still produces a (header-only) file.
  print('no enrichment results could be retrieved')
  df_all_results = pd.DataFrame(columns=['link', 'library', 'category', 'direction', 'cluster'])
df_all_results

## Export results for scEnrichr Dashboard

In [None]:
# Cluster label + PCA coordinates per sample, with the sample id in a 'Barcode' column
g = merge(df_gene_expression_norm_km, df_gene_expression_norm_pca)
g.index = g.index.rename('Barcode')
g.reset_index().to_csv('df_pca.tsv', sep='\t', index=False)

# Cluster label + UMAP coordinates per sample, same layout
g = merge(df_gene_expression_norm_km, df_gene_expression_norm_umap)
g.index = g.index.rename('Barcode')
g.reset_index().to_csv('df_umap.tsv', sep='\t', index=False)

# Enrichment results; the row index is not written
df_all_results.to_csv('df_enrich.tsv', sep='\t', index=False)

In [None]:
%%appyter markdown

The files are now available for download and for display with the scEnrichr Dashboard:

- [df_pca.tsv](./df_pca.tsv)
- [df_umap.tsv](./df_umap.tsv)
- [df_enrich.tsv](./df_enrich.tsv)


**[View Dashboard](../dashboard/{{ _session }})**