In [None]:
import os

from aavomics import database
from aavomics import aavomics
import anndata
import numpy
import scipy.stats
from scipy import stats
from statsmodels.stats import proportion
from statsmodels.stats import multitest

import plotly.graph_objects as graph_objects
from plotly import offline as plotly
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn import metrics
from sklearn.cluster import AgglomerativeClustering
import scanpy
# Point scanpy's figure directory at "out"; the dotplot saved at the end of the
# notebook goes through this setting.
scanpy.settings.figdir = "out"

In [None]:
# Key of the cell set to analyze in database.CELL_SETS_DICT.
CELL_SET_NAME = "20190711_TC4"
# Alignment name passed to cell_set.get_anndata_file_path() when loading the data.
ALIGNMENT_NAME = "cellranger_5.0.1_gex_mm10_2020_A"

# Look up the cell set entry for CELL_SET_NAME; raises KeyError if it is not registered.
cell_set = database.CELL_SETS_DICT[CELL_SET_NAME]

In [None]:
# Resolve the .h5ad path for the chosen alignment, then load it.
anndata_file_path = cell_set.get_anndata_file_path(alignment_name=ALIGNMENT_NAME)
adata = anndata.read_h5ad(anndata_file_path)

In [None]:
# Keep only the rows flagged in the "Cell Ranger Called" obs column, as an
# independent copy rather than a view of `adata`.
cell_ranger_called_mask = adata.obs["Cell Ranger Called"]
cell_ranger_filtered = adata[cell_ranger_called_mask].copy()

In [None]:
# Drop genes whose total count across the retained cells is zero.
non_zero_genes = cell_ranger_filtered.X.sum(axis=0) > 0
cell_ranger_filtered = cell_ranger_filtered[:, non_zero_genes].copy()

# Dense cells-by-genes count matrix (assumes X is stored sparse, since
# .todense() is called on it).
transcript_counts = numpy.array(cell_ranger_filtered.X.todense())

# Scale every cell to 5000 total counts, add a pseudocount of 1, and take log10.
counts_per_cell = transcript_counts.sum(axis=1, keepdims=True)
transcript_counts = numpy.log10(transcript_counts / counts_per_cell * 5000 + 1)

In [None]:
print("Dimensionality reduction via PCA")

# Project the log-normalized counts onto the first 50 principal components.
# random_state is pinned because scikit-learn may select a randomized SVD
# solver for an input of this size; without a seed the embedding, and the
# t-SNE layout and clusters derived from it, would differ between runs.
pca = PCA(n_components=50, random_state=0)
transformed_PCA = pca.fit_transform(transcript_counts)

In [None]:
# 2-D t-SNE layout of the PCA embedding, used only for plotting the clusters.
# t-SNE is stochastic, so random_state is pinned to make the layout
# reproducible across Restart-and-Run-All.
transformed_tSNE = TSNE(
    n_components=2,
    perplexity=30,
    n_jobs=16,
    verbose=True,
    random_state=0,
).fit_transform(transformed_PCA)

In [None]:
# Sweep agglomerative clustering over the candidate cluster counts and keep the
# labelling with the highest silhouette score on the PCA embedding.
# NOTE: the range currently contains a single candidate (15 clusters).
cluster_range = range(15, 16)

silhouette_scores = []
highest_silhouette_score = -numpy.inf
highest_clusters = None

for num_clusters in cluster_range:

    print("Testing %i clusters" % num_clusters)

    clusterer = AgglomerativeClustering(n_clusters=num_clusters)
    clusters = clusterer.fit_predict(transformed_PCA)

    silhouette_score = metrics.silhouette_score(transformed_PCA, clusters)
    silhouette_scores.append(silhouette_score)

    # Strict ">" keeps the first candidate on ties, matching numpy.argmax below.
    if silhouette_score > highest_silhouette_score:
        highest_silhouette_score, highest_clusters = silhouette_score, clusters

# Expose the winning cluster count and its labels under the generic names.
num_clusters = cluster_range[numpy.argmax(silhouette_scores)]
clusters = highest_clusters

In [None]:
# Refit at the best-scoring cluster count so that `clusterer` and `clusters`
# both correspond to it, not to whichever candidate the sweep tried last.
best_index = numpy.argmax(silhouette_scores)
num_clusters = cluster_range[best_index]

clusterer = AgglomerativeClustering(n_clusters=num_clusters)
clusters = clusterer.fit_predict(transformed_PCA)

In [None]:
# Plot the t-SNE layout colored by cluster and write it to an HTML file.
# The target lives under "out", which does not exist on a fresh checkout, so
# create it first.
# NOTE(review): assumes plot_clusters does not create parent directories itself
# — confirm; makedirs with exist_ok=True is a no-op if it already exists.
os.makedirs("out", exist_ok=True)
aavomics.plot_clusters(transformed_tSNE, clusters, "out/cell_ranger_called.html")

In [None]:
# Attach each cell's cluster label as the "Clusters" obs column; the dotplot
# at the end of the notebook groups cells by this column.
cell_ranger_filtered.obs["Clusters"] = clusters

In [None]:
# Rescale every cell, in place, to the mean per-cell total count.
transcript_counts_sum = cell_ranger_filtered.X.sum(axis=1).mean()
scanpy.pp.normalize_total(
    cell_ranger_filtered,
    target_sum=transcript_counts_sum,
    inplace=True,
)

In [None]:
# Genes shown as columns of the per-cluster dotplot.
# NOTE(review): presumably one marker per cell type of interest — confirm the mapping.
cell_type_marker_genes = ["Olig2", "Aldh1l1", "Rbfox3", "Cldn5", "Tmem119"]

In [None]:
# Convert the cluster labels to strings before they are used as the dotplot
# grouping key.
# NOTE(review): presumably so scanpy treats them as discrete groups rather than
# a numeric variable — confirm.
cell_ranger_filtered.obs["Clusters"] = cell_ranger_filtered.obs["Clusters"].astype(str)

In [None]:
# Cell type names; not referenced by the dotplot call below.
cell_type_order = ["Astrocytes", "Vascular Cells", "Immune Cells", "Oligodendrocytes", "Neurons"]

# Marker-gene expression per cluster, saved through scanpy's figure directory.
scanpy.pl.dotplot(
    cell_ranger_filtered,
    var_names=cell_type_marker_genes,
    groupby="Clusters",
    gene_symbols="Gene Name",
    log=True,
    dendrogram=False,
    figsize=(5, 5),
    save="cell_ranger.svg",
)