In [None]:
# Install the analysis packages used in this notebook into the user site-packages (`--user`).
!pip install --user phate scprep umap-learn tasklogger magic-impute 
# PhenoGraph is installed directly from its GitHub repository.
!pip install --user git+https://github.com/jacoblevine/phenograph.git

We'll install `louvain` by downloading a precompiled version, because building it from source takes a while. Alternatively, you could install it by running

    pip install --user louvain

In [None]:
import scprep

# Precompiled wheels (CPython 3.6, linux x86_64) hosted on Google Drive,
# listed as (Google Drive file id, local filename). Order matches the install
# command in the next cell: python-igraph first, then louvain.
prebuilt_wheels = [
    ("1cpwiEVJhEhXPNvO3K6-jxEg3k3H-d1Sg", "python_igraph-0.7.1.post6-cp36-cp36m-linux_x86_64.whl"),
    ("1IsD8uMy_7g-yydMRy2W6Mc4XLqXaF6ea", "louvain-0.6.1-cp36-cp36m-linux_x86_64.whl"),
]
for drive_id, wheel_name in prebuilt_wheels:
    scprep.io.download.download_google_drive(drive_id, wheel_name)

In [None]:
# Install the two precompiled wheels (python-igraph and louvain) from the current working directory.
!pip install --user python_igraph-0.7.1.post6-cp36-cp36m-linux_x86_64.whl louvain-0.6.1-cp36-cp36m-linux_x86_64.whl

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import sklearn
import sklearn.cluster
import sklearn.manifold

import os
import tasklogger
import phate
import umap

import graphtools as gt
import magic
import phenograph
import louvain

<a id='loading'></a>
# 1. Loading preprocessed data

### Load EB Data (and download if needed)

In [None]:
# Local cache location for the preprocessed EB counts, in the user's home directory.
file_path = os.path.expanduser('~/EBT_counts.pkl.gz')
already_downloaded = os.path.exists(file_path)
if not already_downloaded:
    # Fetch the file from Google Drive once; later runs reuse the local copy.
    scprep.io.download.download_google_drive(
        id='1Xz0ONnRWp2MLC_R6r74MzNwaZ4DkQPcM',
        destination=file_path,
    )
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
data = scprep.utils.SparseDataFrame(
    pd.read_pickle(file_path)
)

In [None]:
# The sample label is the second '_'-separated field of each cell identifier in `data.index`.
sample_labels = [cell_id.split('_')[1] for cell_id in data.index]
metadata = pd.DataFrame(
    sample_labels,
    columns=['sample'],
    index=data.index,
)

# 2. Imputation using MAGIC

As we mentioned, because scRNA-seq suffers from dropout (the undercounting of mRNA molecules in single cells), examining the expression of lowly expressed genes can be challenging. To overcome this, we developed a method for imputation called MAGIC (Markov Affinity Graph Imputation in Cells; [PMID: 29961576](https://www.ncbi.nlm.nih.gov/pubmed/29961576)).

Without going into too much detail about how it works, the basic idea is that although gene expression counts in each cell are unreliable, the gene expression counts matrix contains several degrees of redundancy between genes (i.e. expression of genes is not completely independent) and among cells (i.e. we see very similar cells more than once).

A full description of MAGIC can be found in the paper, and on our website: [Denoising and imputing scRNA-seq data](https://www.krishnaswamylab.org/blog/2018/10/28/denoising-noisy-gene-expression-in-scrna-seq).

#### Running MAGIC

Because MAGIC returns a dense matrix (many zeros in the counts matrix are removed), we only want to keep expression of a few marker genes. The list in the following cell was gathered while analyzing the data, and we'll look at these genes later.



In [None]:
# Marker gene symbols of interest; each is matched as an exact word against
# the gene names in `data`.
marker_gene_symbols = [
    'ARID3A', 'ASCL2',  'CD34', 'CDX2', 'CER1', 'DLX1',
    'DMRT3', 'EN2', 'EOMES', 'FOXA2', 'FOXD3-AS1', 'GATA3-AS1',
    'GATA4', 'GATA5', 'GATA6-AS1', 'GBX2', 'GLI3', 'HOXA2',
    'HOXB1', 'HOXB4', 'HOXD13', 'HOXD9', 'ISL1', 'KLF5',
    'KLF7', 'LEF1', 'LHX2', 'LHX5', 'LMX1A', 'MAP2',
    'MIXL1', 'MYCBP', 'NANOG', 'NES', 'NKX2-1', 'NKX2-5',
    'NKX2-8', 'NPAS1', 'NR2F1-AS1', 'OLIG1', 'OLIG3', 'ONECUT1',
    'ONECUT2', 'OTX2', 'PAX3', 'PAX6', 'PDGFRA', 'PECAM1',
    'POU5F1', 'SATB1', 'SIX2', 'SIX3-AS1', 'SIX6', 'SOX13',
    'SOX10', 'SOX15', 'SOX17', 'SOX9', 'TTLL10', 'TAL1',
    'TBX15', 'TBX18', 'TBX5', 'TNNT2', 'WT1', 'ZBTB16',
    'ZIC2', 'ZIC5', 'ACTB', 'HAND1',
]
marker_genes = scprep.select.get_gene_set(data, exact_word=marker_gene_symbols)
# Display the matched gene names as the cell output.
marker_genes

In [None]:
# Run MAGIC with its default settings, returning imputed values only for the
# selected marker genes.
magic_op = magic.MAGIC()
data_magic = magic_op.fit_transform(data, genes=marker_genes)

In [None]:
# Preview the first five rows of the MAGIC output.
data_magic.head(n=5)

# 3. Visualizing the data

We've already spent time discussing visualizations of this data using tSNE, PHATE, and UMAP. Here, we're going to focus on PHATE, but feel free to try running another algorithm here.

In [None]:
# Embed the cells with PHATE using default settings.
# alternative: umap.UMAP(), sklearn.manifold.TSNE()
phate_op = phate.PHATE()
phate_coords = phate_op.fit_transform(data)
# Wrap the coordinates in a DataFrame that shares the cell index of `data`,
# so the embedding can be selected by cell label later on.
data_phate = pd.DataFrame(phate_coords, index=data.index)

In [None]:
# PHATE embedding coloured by the sample label of each cell.
scprep.plot.scatter2d(
    data_phate,
    c=metadata['sample'],
    figsize=(12,8),
    cmap="Spectral",
    ticks=False,
    label_prefix="PHATE",
)

# 4. Clustering

## 4.1 Introduction to clustering

#### What is clustering?

The goal of clustering is to identify a partition of the data such that all of the observations within each partition (called a cluster) are more similar to each other than they are to those in other clusters. Many clustering algorithms exist, and they each have their own quirks (just like visualization algorithms). `sklearn` has implementations for some of the most popular ones and their [User Guide on Clustering](https://scikit-learn.org/stable/modules/clustering.html) is a good resource to understand general clustering approaches.

In [None]:
# Reduce the expression matrix to 50 principal components; the clustering
# cells below all operate on this representation.
data_pca = scprep.reduce.pca(
    data,
    n_components=50,
    method='dense',
)

In [None]:
# `phenograph.cluster` returns three values; only the first (one cluster label
# per cell) is used in this notebook. The other two are presumably the
# neighbour graph and the modularity score -- TODO confirm against the
# PhenoGraph documentation.
# They are unpacked into named variables rather than `_`, because assigning to
# `_` shadows IPython's "last output" variable for the rest of the session.
phenograph_clusters, phenograph_graph, phenograph_modularity = phenograph.cluster(data_pca)

In [None]:
# KMeans starts from randomly chosen centroids, so without a fixed seed the
# cluster assignments change on every run. Pinning `random_state` makes
# Restart & Run All reproducible; change or remove the seed to explore
# run-to-run variability.
with tasklogger.log_task("KMeans"):
    kmeans_op = sklearn.cluster.KMeans(n_clusters=20, random_state=42)
    kmeans_clusters = kmeans_op.fit_predict(data_pca)

In [None]:
# Build a graphtools graph from the PCA-reduced data. `G.K` is reused below as
# the precomputed affinity matrix for spectral clustering.
G = gt.Graph(data_pca)
# Convert to an igraph object, the input format used by `louvain.find_partition` below.
G_igraph = G.to_igraph()

In [None]:
# Louvain community detection on the igraph representation of the cell graph,
# using the edge attribute "weight" as edge weights.
with tasklogger.log_task("Louvain"):
    partition = louvain.find_partition(
        G_igraph,
        louvain.RBConfigurationVertexPartition,
        weights="weight",
        resolution_parameter=1,
    )
    # One community id per vertex, i.e. per cell.
    louvain_clusters = np.array(partition.membership)

In [None]:
# Spectral clustering on the precomputed graph kernel `G.K`.
# The label-assignment step is randomly initialised, so `random_state` is
# pinned to make the result reproducible across Restart & Run All.
with tasklogger.log_task("Spectral clustering"):
    spec_op = sklearn.cluster.SpectralClustering(n_clusters=20, affinity='precomputed',
                                                 random_state=42)
    spectral_clusters = spec_op.fit_predict(G.K)

## 4.2 Clustering single cell data

#### Clustering methods for single cell RNA-seq

In the previous exercise, we compared three popular algorithms for clustering: KMeans, Spectral Clustering, and Louvain.

#### Reordering clusters by PHATE coordinates

By default, cluster numbers are meaningless. This makes it difficult to compare clusterings, because cluster "0" can refer to entirely different cells between runs of the same algorithm.

To enforce some consistency, we are going to reorder the clusters so that cluster "0" is always on the far right of the PHATE plot, and the highest-numbered cluster is always on the far left.

In [None]:
# Collect the label vectors of every algorithm under a readable name.
clusterings = {
    'Phenograph': phenograph_clusters,
    'Louvain': louvain_clusters,
    'KMeans': kmeans_clusters,
    'Spectral': spectral_clusters,
}

# Relabel each clustering by the negated first PHATE coordinate, so that
# cluster numbers are comparable between algorithms and runs.
negated_phate1 = -data_phate.iloc[:,0]
for alg in clusterings:
    clusterings[alg] = scprep.utils.sort_clusters_by_values(clusterings[alg], negated_phate1)

#### Plotting clusters with PHATE

Above, we colored our scatter plots using expression of a marker gene. However, we can use any information with one value per cell to color the plot. Here, we're using the cluster identity.

In [None]:
# One PHATE panel per clustering algorithm, coloured by cluster label.
fig, axes = plt.subplots(2,2, figsize=(16,16))

for ax, algorithm in zip(axes.flatten(), clusterings):
    labels = clusterings[algorithm]
    # Count the clusters of *this* algorithm. The title previously used the
    # stale loop variable `alg` left over from an earlier cell, so every panel
    # reported the cluster count of the same clustering.
    n_clusters = len(np.unique(labels))
    scprep.plot.scatter2d(data_phate, c=labels, cmap=plt.cm.tab20,
                          title='{} - ({})'.format(algorithm, n_clusters),
                          ticks=False, label_prefix="PHATE", legend=False, discrete=True,
                          ax=ax)

We can also plot each cluster individually to see where they are on the PHATE plot




In [None]:
n_cols = 3
clusters = clusterings['Spectral']
cluster_ids = np.unique(clusters)
# One panel per cluster: derive the number of rows from the number of clusters
# instead of hard-coding it, so no cluster is silently left out when a
# clustering has more groups than the grid has panels.
n_rows = max(1, int(np.ceil(len(cluster_ids) / n_cols)))
palette = plt.cm.tab20.colors

fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols*4,n_rows*4), squeeze=False)
axes = axes.flatten()
for i, ax in enumerate(axes):
    if i >= len(cluster_ids):
        # we have more axes than clusters
        ax.axis('off')
        continue
    curr_cluster = cluster_ids[i]
    # Boolean array ([False, True, ..., False]) indicating if each cell is in
    # the current cluster
    curr_mask = clusters == curr_cluster
    # Cycle through the palette so that more clusters than colours does not
    # raise an IndexError.
    highlight_color = palette[i % len(palette)]
    scprep.plot.scatter2d(data_phate, c=curr_mask,
                          cmap={True: highlight_color, False: 'grey'},
                          ticks=False, label_prefix='PHATE', ax=ax)


fig.tight_layout()

### How quantitatively similar are the clustering algorithms?

We can use a metric, called the adjusted rand score, to quantify how similar any given clustering results are to each other. This is a similarity metric, so two identical clusterings will produce a score of `1.0`. If every point is randomly assigned to different clusters, then the adjusted rand score will be close to `0.0`. You can learn more about this metric in the `sklearn` user guide: https://scikit-learn.org/stable/modules/clustering.html#adjusted-rand-score.

#### Grouping all of the cluster vectors into a single array

Reading the following code, can you guess what shape `all_clusterings` will be at the end of the code block? 

In [None]:
# Stack the label vectors row-wise: one row per algorithm, in the same order
# as `all_algorithms`.
all_algorithms = list(clusterings.keys())
all_clusterings = np.vstack([clusterings[name] for name in all_algorithms])

Here, we're using `pdist` with a custom distance metric. Note, `sklearn.metrics.adjusted_rand_score` is actually a similarity metric, not a distance metric. So this code is not good practice and qualifies as a hack. That being said, it works.

In [None]:
from scipy.spatial.distance import pdist, squareform
# Imported explicitly: `sklearn.metrics` is otherwise only available here as a
# side effect of other sklearn submodules having been imported.
import sklearn.metrics

# `pdist` evaluates the metric on every pair of rows (one row per clustering);
# `squareform` expands the condensed result into a symmetric square matrix.
cluster_similarities = squareform(pdist(all_clusterings, metric=sklearn.metrics.adjusted_rand_score))
# squareform assumes diagonals will be 0, but they're actually 1 because this is a similarity metric
# so we need to add 1's on the diagonal with np.eye(). The identity matrix is sized from the
# result itself rather than hard-coded, so this keeps working if clusterings are added or removed.
cluster_similarities = cluster_similarities + np.eye(len(cluster_similarities))

#### Plot a clustermap of similarities

With your partner, interpret this heatmap. Which cluster results are the most similar or different?

In [None]:
# Hierarchically clustered heatmap of the pairwise adjusted Rand scores,
# with rows and columns labelled by algorithm name.
sns.clustermap(
    cluster_similarities,
    xticklabels=all_algorithms,
    yticklabels=all_algorithms,
)



## Characterizing clusters

Now, we want to characterize gene expression in each cluster to identify the cell type of that cluster. Cell type is easy to grasp intuitively, but hard to formalize. Generally, we want a cell that expresses a set of genes differently than the rest of the cells in the dataset. As part of writing the PHATE paper, we characterized all the cell types in this time course, and made the following fate map.

<img src="https://krishnaswamylab.github.io/img/how_to_single_cell/EB_lineage_map.png" style="height: 40rem;"/>

>Lineage tree of the EB system determined from the PHATE analysis showing embryonic stem cells (ESC), the primitive streak (PS), mesoderm  (ME), endoderm (EN), neuroectoderm (NE), neural crest (NC), neural progenitors (NP), lateral plate ME (LP ME), hemangioblast (H), cardiac (C), epicardial precursors (EP), smooth muscle precursors (SMP), cardiac precursors (CP), and neuronal subtypes (NS).

In groups, you need to examine the expression of these sets of genes and figure out which cell type matches your cluster.

### Selecting a set of clusters

As a table, decide on a clustering result to use for this next exercise. Each pair will then pick a cluster to manually annotate based on known markers.

In [None]:
# ==============
# choose your favorite clustering method
# Exercise: replace `?` with one of the keys of `clusterings`
# ('Phenograph', 'Louvain', 'KMeans' or 'Spectral'). This cell is intentionally
# incomplete and will raise a SyntaxError until you do.
clusters = clusterings[ ? ]
# ==============

### Jitterplot

The following plot is called a [jitterplot](https://scprep.readthedocs.io/en/stable/examples/jitter.html). This plot is similar to a [violinplot](https://seaborn.pydata.org/generated/seaborn.violinplot.html) or a [swarmplot](https://seaborn.pydata.org/generated/seaborn.swarmplot.html), but is computationally cheaper. The goal of this plot is to examine the distribution of values associated with each cell in a set of clusters.

Each point is a cell and the x-axis represents the cluster assignment of that cell. There's a little bit of noise added to the x-position of the cell (that's the jitter). The y-axis represents expression of a given gene.

You can change which gene is plotted by changing the `curr_gene` variable. All of the genes in the above plot will work.

FYI: when a gene is listed as `ZIC2/5`, this actually represents two genes: `ZIC2` and `ZIC5`.

Try running with and without MAGIC and determine which is more helpful for identifying clusters.

#### Raw data

In [None]:
# Jitter plot of raw expression for one gene, grouped and coloured by cluster.
curr_gene = 'POU5F1'
curr_expression = scprep.select.select_cols(data, exact_word=curr_gene)
scprep.plot.jitter(
    clusters,
    curr_expression,
    c=clusters,
    figsize=(12, 5),
    legend_anchor=(1,1),
    title=curr_gene,
)

#### MAGIC

Let's plot the same thing with denoised data from MAGIC.

In [None]:
# Same jitter plot, but using the MAGIC-denoised expression values.
curr_gene = 'POU5F1'
curr_expression = scprep.select.select_cols(data_magic, exact_word=curr_gene)
scprep.plot.jitter(
    clusters,
    curr_expression,
    c=clusters,
    figsize=(12, 5),
    legend_anchor=(1,1),
    title=curr_gene,
)

#### Examining expression on PHATE

The other tool you should use for examining clusters is the PHATE plots we used above. You can change the `genes_for_plotting` list, as long as you keep it restricted to three genes.

Again, change `data` to `data_magic` and determine whether this makes annotating clusters easier.

In [None]:
genes_for_plotting = ['NANOG', 'POU5F1', 'HAND1']
fig, axes = plt.subplots(1,3, figsize=(14,4))

# One PHATE panel per gene, coloured by that gene's expression.
for ax, curr_gene in zip(axes.flatten(), genes_for_plotting):
    expression = scprep.select.select_cols(data, exact_word=curr_gene)

    # Order cells by ascending expression and disable shuffling so the points
    # are plotted in that order (presumably so that high-expressing cells are
    # drawn last and not hidden behind low-expressing ones).
    sort_column = expression.columns[0]
    sort_index = expression.sort_values(by=sort_column).index

    scprep.plot.scatter2d(
        data_phate.loc[sort_index],
        c=expression.loc[sort_index],
        shuffle=False,
        title=curr_gene,
        ticks=None,
        label_prefix='PHATE',
        ax=ax,
    )

fig.tight_layout()

## Discussion

Now, in your groups, you will be assigned to a cell type from the above lineage map. Please do the following tasks and be prepared to share your conclusions with the rest of the class.
1. Identify which cluster corresponds to your cell type of interest. 
2. Create a PHATE plot that highlights your cells of interest in some way (i.e. shows them in another color). Does this plot make sense with the position of your cell type in the developmental lineage? What information can you learn about your cell type from the PHATE plot?
3. Open coding exercise: Create another plot that shows something interesting about your cell type. This plot should answer a biological question, for example:

      a. Is the expression of marker genes heterogeneous within your cluster?
      
      b. Are your cells present at multiple time points?
      
      c. What other genes are uniquely expressed in your cell type?

# Differential expression

In the above exercise, we manually inspected known marker genes to annotate clusters. Here, we're going to simply identify the genes that change the most between two groups. As discussed in lecture, there are several ways to calculate differential expression. We are going to focus on the mean-difference and the rank-sum statistic.

You can easily calculate basic differential expression statistics using `scprep.stats.differential_expression` and `scprep.stats.differential_expression_by_cluster`. Documentation for these methods can be found at: https://scprep.readthedocs.io/en/stable/reference.html#scprep.stats.differential_expression



In [None]:
# Sparse representation of the expression matrix used for the differential expression cells below.
# NOTE(review): `data` is already wrapped in `scprep.utils.SparseDataFrame` when it is loaded,
# so this re-wrap looks redundant -- confirm before removing.
data_sparse = scprep.utils.SparseDataFrame(data)

In [None]:
# Compare the cells of one cluster against all remaining cells.
curr_cluster = 0
in_cluster = clusters == curr_cluster
not_in_cluster = clusters != curr_cluster
with tasklogger.log_task("t-test"):
    ttest_results = scprep.stats.differential_expression(
        data_sparse[in_cluster],
        data_sparse[not_in_cluster],
        measure='ttest',
    )

In [None]:
# Rank genes by the magnitude of their t-statistic (sign ignored) and keep the 50 largest.
ttest_magnitude = ttest_results['ttest'].abs()
top_genes = ttest_magnitude.sort_values(ascending=False).iloc[:50]

#### Print the 20 most differentially expressed genes

In [None]:
# Show the 20 highest-ranked genes.
top_genes.iloc[:20]

### Plotting a histogram of gene expression in your cluster and in all other clusters

Hints:

1. Use `scprep.plot.histogram` (https://scprep.readthedocs.io/en/dev/reference.html#scprep.plot.histogram). 
 * Note, you can pass a list of expression values to get them plotted on the same axis.
 * Try using `log='y'` to get a log-scale y-axis
 * Use `title=` to set the title
 * The blue / grey histogram is the first group in the list, orange is the rest
2. Use the code from the `scprep.stats.differential_expression` to select the rows of a DataFrame in your cluster and not in your cluster
3. The gene names are stored in `top_genes.index`. How would you select the 0th item from this index?

In [None]:
# ======
# Write code to create a histogram here
# Exercise: this cell is intentionally incomplete. Fill in the three
# right-hand sides and complete the `scprep.plot.histogram(` call before
# running it; as written it raises a SyntaxError.
curr_gene = 
expression_in_cluster = 
expression_not_in_cluster = 


scprep.plot.histogram(

# ======

### Creating a heatmap of differentially expressed genes

#### Creating a heatmap using `sns.clustermap`

First, we'll do the simplest thing and create a simple `clustermap`. 

In [None]:
# Un-normalised heatmap of the top genes: genes as rows, cells as columns.
top_gene_expression = data_sparse[top_genes.index]
sns.clustermap(top_gene_expression.T)

#### Why does this look bad?

Notice that the range of expression is very different for each gene. In order to get a pretty plot, we'll need to normalize the data first.

### Z-score standardizing the data

We can easily z-score standardize the data using `sklearn.preprocessing.StandardScaler`. This will mean-center each column of `de_data` and set the standard deviation of each column to 1.

In [None]:
# Imported explicitly: `import sklearn` alone does not guarantee that the
# `preprocessing` submodule is loaded; it previously worked only as a side
# effect of other sklearn imports.
import sklearn.preprocessing

# Expression of the top differentially expressed genes (cells x genes).
de_data = data_sparse[top_genes.index]
# Standardise each gene (column) to zero mean and unit variance.
de_data_zscore = sklearn.preprocessing.StandardScaler().fit_transform(de_data)

#### Generating `col_colors`

We can add colors for each column in the clustermap using a `col_colors` variable that has one RGBA or hex color per column of the data.

In [None]:
# Map each cell's integer cluster label to an RGBA colour from tab20.
# Calling a colormap with integers indexes its colour table directly, and
# tab20 only has `tab20.N` entries, so labels beyond that would all collapse
# onto a single colour. Wrapping with modulo cycles the palette instead;
# labels below `tab20.N` get exactly the same colour as before.
# Assumes `clusters` holds non-negative integer labels -- TODO confirm for custom clusterings.
tab20 = plt.cm.tab20
col_colors = tab20(np.asarray(clusters) % tab20.N)

#### Calculating color limits so the `cmap` is centered at 0

In [None]:
# Largest absolute z-score; used as a symmetric colour limit (-lim, +lim)
# so that the diverging colormap is centred at 0.
abs_zscores = np.abs(de_data_zscore)
lim = np.max(abs_zscores)

#### Generating the pretty clustermap

In [None]:
# Heatmap of z-scored expression: genes as rows (labelled by gene name), cells
# as columns (annotated with their cluster colour), colour scale symmetric about 0.
gene_labels = de_data.columns.values
cg = sns.clustermap(
    de_data_zscore.T,
    col_colors=col_colors,
    cmap='RdBu_r',
    vmin=-lim,
    vmax=lim,
    yticklabels=gene_labels,
)

# Enlarge the figure after creation.
heatmap_figure = cg.ax_heatmap.figure
heatmap_figure.set_size_inches(12,12)

### Discussion

Using the clusters that you annotated in the previous exercise, perform the differential expression analysis for that cluster and generate the above heatmap and histogram. 

1. What do you notice about how uniformly each gene is expressed in each cluster?
2. How well does the clustermap separate your cluster from other clusters based on each individual gene?
3. Try plotting heatmaps for the 500-550th most DE genes or the 7000-7050 most DE genes. Try the 50 least differentially expressed genes. What do you notice about these plots compared to the ones for the most DE genes?



