In [None]:
"""
Analysis of Nematostella polyp single cell RNAseq data using ScanPy
Marc Meynadier
"""

In [None]:
# Shell escape: print the system date/time so the run is timestamped in the output
!date

In [None]:
import numpy as np
import anndata
import pandas as pd
import scanpy as sc
import scipy.sparse
import scanpy.external as sce
import os

import warnings
warnings.filterwarnings('ignore')

from sklearn.neighbors import (KNeighborsClassifier,NeighborhoodComponentsAnalysis)
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
from matplotlib import rcParams
#import plotly.tools as tls

import seaborn as sns
sns.set(style="whitegrid")

import collections
import scipy.sparse as sp_sparse
import h5py

In [None]:
# Log the scanpy header (package versions) for reproducibility
sc.logging.print_header()
# Notebook-wide figure defaults: 150 dpi, white figure background
sc.settings.set_figure_params(dpi=150, facecolor='white')

In [None]:
# Loading polyp data
# One 10x matrix directory per tissue sample (all under steger2022/GSE154105)

tentaclePath = "../../../../species/Nematostella/analysis/STARmapping/steger2022/GSE154105/GSM4663943_NvSubPolypTentacle/"
bodywallPath = "../../../../species/Nematostella/analysis/STARmapping/steger2022/GSE154105/GSM4663946_NvSubPolypBodywall/"
mesenteryPath = "../../../../species/Nematostella/analysis/STARmapping/steger2022/GSE154105/GSM4663944_NvSubPolypMesentery/"
pharynxPath = "../../../../species/Nematostella/analysis/STARmapping/steger2022/GSE154105/GSM4663945_NvSubPolypPharynx/"

# Read every count matrix into its own AnnData object
tentacleData = sc.read_10x_mtx(tentaclePath)
bodywallData = sc.read_10x_mtx(bodywallPath)
mesenteryData = sc.read_10x_mtx(mesenteryPath)
pharynxData = sc.read_10x_mtx(pharynxPath)

# De-duplicate gene names in place on each object
tentacleData.var_names_make_unique()
bodywallData.var_names_make_unique()
mesenteryData.var_names_make_unique()
pharynxData.var_names_make_unique()

In [None]:
# Metadata
# Tag every cell with 'type' (always "polyp") and 'sample' (tissue of origin)
for _tissue, _tissueData in (
    ("Tentacle", tentacleData),
    ("Bodywall", bodywallData),
    ("Mesentery", mesenteryData),
    ("Pharynx", pharynxData),
):
    _tissueData.obs['type'] = "polyp"
    _tissueData.obs['sample'] = _tissue

In [None]:
# Merging
# Stack the four tissue datasets into one AnnData; join='outer' keeps the union of genes.
# NOTE(review): AnnData.concatenate is reported as deprecated in favour of anndata.concat
# in recent anndata releases — confirm before upgrading, the two differ in defaults.
adata = tentacleData.concatenate(bodywallData,mesenteryData,pharynxData,join='outer') 
# Number of cells contributed by each tissue sample
print(adata.obs['sample'].value_counts())

In [None]:
# Mitochondrial genes
# The first column of the tab-separated file holds the mitochondrial gene identifiers;
# read_csv is called with its default header handling, so the first line is used as header.
mitoGenes = pd.read_csv('../../../../species/Nematostella/raw/NematostellaMitoGenes.txt',sep='\t')
mitoGenes = list(set(list(mitoGenes.iloc[:,0])))

# Flag mitochondrial genes with one vectorised membership test on the gene index
# (replaces a Python loop over every gene x mito-gene pair; the var index is left untouched).
adata.var['mt'] = adata.var_names.isin(mitoGenes)

print(adata.var)
# Count of flagged vs. unflagged genes
adata.var.mt.value_counts()

In [None]:
# Mitochondrial quality check
# Adds the scanpy QC columns (n_genes_by_counts, total_counts, pct_counts_mt, log1p_* ...) in place
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=True, inplace=True)
# Boolean mask over genes: True for mitochondrial genes
mito_genes = adata.var['mt'].to_numpy()
# Manual cross-check: per-cell fraction of counts in mito genes vs. all genes
adata.obs['percent_mt2'] = adata[:, mito_genes].X.sum(axis=1).A1 / adata.X.sum(axis=1).A1
# Total counts per cell, stored as an observation annotation
adata.obs['n_counts'] = np.sum(adata.X, axis=1).A1
adata

In [None]:
# Per-sample distributions of detected genes and total counts (raw and log1p)
sc.pl.violin(
    adata,
    ['n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts'],
    jitter=0.4,
    groupby='sample',
    rotation=45,
    ncols=2,
)
# Per-sample distributions of the mitochondrial metrics
sc.pl.violin(
    adata,
    ['pct_counts_mt', 'percent_mt2', 'log1p_total_counts_mt'],
    jitter=0.4,
    groupby='sample',
    rotation=45,
    ncols=2,
)

In [None]:
# Plot quality check measures
# One scatter per (x, y) metric pair, points coloured by tissue sample
for _xMetric, _yMetric in (
    ('total_counts', 'pct_counts_mt'),
    ('total_counts', 'n_genes_by_counts'),
    ('log1p_total_counts', 'log1p_n_genes_by_counts'),
):
    sc.pl.scatter(adata, x=_xMetric, y=_yMetric, color="sample")

In [None]:
# Filtering by genes counts and mitochondrial genes counts
# Keep cells with fewer than 9000 detected genes and pct_counts_mt below 1.
# .copy() materialises the subset: plain slicing returns a view of the parent object,
# and the in-place filters run afterwards should operate on a standalone AnnData.
adata = adata[
    (adata.obs.n_genes_by_counts < 9000) & (adata.obs.pct_counts_mt < 1), :
].copy()
adata

In [None]:
# Filtering by number of counts and number of cells
# Drop cells with fewer than 500 total counts
sc.pp.filter_cells(adata, min_counts=500)
# Drop genes detected in fewer than 2 cells
sc.pp.filter_genes(adata, min_cells=2)
# Remaining number of cells and genes
print(adata.n_obs,adata.n_vars)

In [None]:
# Plotting percentage of counts per gene (restricted to the top 50 genes via n_top)
sc.pl.highest_expr_genes(adata, n_top=50)

In [None]:
# Exporting raw adata (QC-filtered object, written before normalisation)
rPath = os.getcwd()
# Output directory prefix (with trailing slash), resolved from the current working directory
outputPath = f"{rPath}/../../../../species/Nematostella/analysis/ScanPy/"
adata.write_h5ad(f"{outputPath}Nematostella_polyp_raw.h5ad")

In [None]:
# Normalise every cell to a total of 10,000 counts, then log1p-transform
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# Flag highly variable genes in adata.var['highly_variable']
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

# Freeze the normalised, log-transformed values of all genes in .raw
adata.raw = adata

# Restrict to the highly variable genes. .copy() turns the slice (a view) into a
# standalone object before it is modified in place by the steps below.
adata = adata[:, adata.var.highly_variable].copy()
# Regress out sequencing depth and mitochondrial percentage, then scale (values clipped at 10)
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')

In [None]:
# Correcting batch effect between tissue samples with Harmony (at most 20 Harmony iterations).
# The neighbours graph computed later reads the result from the 'X_pca_harmony' representation.
sce.pp.harmony_integrate(adata,'sample',max_iter_harmony=20)

In [None]:
# Display the AnnData summary to check which annotations are present at this point
adata

In [None]:
# Plot the amount of variance explained by each PC (first 50 PCs, log scale)
sc.pl.pca_variance_ratio(adata, log=True, n_pcs = 50)

In [None]:
# Computing neighbors graph
# Built on the Harmony-corrected representation ('X_pca_harmony'): 50 neighbours, 30 components
sc.pp.neighbors(adata, n_neighbors=50, n_pcs=30, knn=True, use_rep='X_pca_harmony')

# Clustering the graph using Leiden algorithm
# Cluster labels are stored in adata.obs['leiden'] (read by the plotting cells below)
sc.tl.leiden(adata,resolution=1.2) 

# Computing UMAP
sc.tl.umap(adata)

In [None]:
# UMAP coloured by Leiden cluster and by tissue sample (one panel per row)
sc.pl.umap(adata, color=['leiden','sample'], title="UMAP Nematostella polyp",ncols=1)
# Same embedding with the cluster labels drawn on the data
sc.pl.umap(adata, color=['leiden'], legend_loc= 'on data')
# Depth-related QC metrics on the embedding, raw then log1p-transformed
sc.pl.umap(adata, color=['total_counts', 'n_genes_by_counts'])
sc.pl.umap(adata, color=['log1p_total_counts', 'log1p_n_genes_by_counts'])

In [None]:
# Checking how each sample contributes to the final map => investigate possible batch effect

# Reference plot with all cells
sc.pl.umap(adata, color=['leiden'], frameon=False)
# One figure per tissue sample, restricted to the cells of that sample.
# A boolean mask on obs['sample'] selects the cells directly, without building a list of cell names.
for _tissue in ['Tentacle', 'Bodywall', 'Mesentery', 'Pharynx']:
    sc.pl.umap(
        adata[adata.obs['sample'] == _tissue],
        color=['leiden','sample'],
        title=['Clusters','sample'],
        frameon=False,
        size=10,
        ncols=1,
    )

In [None]:
# Re-exporting data: the processed object, now carrying the PCA, clustering and UMAP results
adata.write_h5ad(outputPath+'Nematostella_polyp.h5ad')

In [None]:
# Ranking and extracting marker genes for all clusters
# Wilcoxon rank-sum test per Leiden cluster, keeping the 1000 top-ranked genes of each
sc.tl.rank_genes_groups(adata,'leiden',n_genes=1000,method='wilcoxon')

# Cluster labels as strings, sorted numerically ('2' before '10')
clusters = np.sort(np.unique(adata.obs['leiden']).astype('int')).astype('str')

# Long-format table: one row per (cluster, ranked gene) with its adjusted p-value
_ranked = adata.uns['rank_genes_groups']
markers = pd.concat(
    [
        pd.DataFrame({
            'clus': str(i),
            'markerGene': list(_ranked['names'][str(i)]),
            'padj': list(_ranked['pvals_adj'][str(i)]),
        })
        for i in clusters
    ],
    ignore_index=True,
)

# Significance flag at the 5% adjusted p-value threshold, computed in one vectorised step
# (a NaN padj compares False and is therefore flagged "F")
markers['significant'] = np.where(markers['padj'] < 0.05, "T", "F")
significant_markers = markers[markers.significant == "T"]
# NOTE(review): the file is tab-separated despite the .csv extension, and is named "adult"
# while the other outputs of this notebook are named "polyp" — kept as is for downstream readers.
significant_markers.to_csv(outputPath+'Nematostella_adult_markers_1000g.csv', sep='\t',index=False)
print(significant_markers)

In [None]:
def cluster_markers(markersDf,clusterNumber):
    """Return the marker genes of one or several clusters.

    Parameters
    ----------
    markersDf : pandas.DataFrame
        Marker table with a "clus" column (cluster label) and a "markerGene" column.
    clusterNumber : list-like, str or int
        Cluster label(s) to select. A single label (e.g. '3' or 3) is accepted as well
        as a list of labels; labels are compared as strings.

    Returns
    -------
    list
        Values of "markerGene" for the selected clusters, in table order
        (empty if no row matches).
    """
    # A bare label is treated as a one-element selection (isin rejects scalars,
    # and a plain string must not be split into its characters)
    if isinstance(clusterNumber, str) or not hasattr(clusterNumber, "__iter__"):
        clusterNumber = [clusterNumber]
    # Compare as strings so that 3 and '3' select the same cluster
    wanted = [str(label) for label in clusterNumber]
    subset = markersDf[markersDf["clus"].astype(str).isin(wanted)]
    return subset["markerGene"].tolist()

def save_markers(markerList,path,clusterName):
    """Write marker genes to a text file, one gene per line.

    The file is ``path + "Nematostella_polyp_" + clusterName + "_markers.txt"``;
    ``path`` is used as a plain string prefix, so it must end with a path separator
    to designate a directory. An existing file is overwritten.

    Parameters
    ----------
    markerList : iterable
        Gene identifiers to write.
    path : str
        Prefix (typically an output directory with trailing separator).
    clusterName : str
        Name inserted in the file name.
    """
    # No explicit close needed: the with-block closes the file on exit
    with open(path+"Nematostella_polyp_"+clusterName+"_markers.txt","w") as f:
        f.writelines(f"{gene}\n" for gene in markerList)

In [None]:
# Significant marker genes of cluster 3 (shown as the cell output)
cluster_markers(significant_markers,['3'])

In [None]:
# Synaptotagmins
# One UMAP panel per gene in the list
sc.pl.umap(
    adata,
    color=[
        'NV2g009911000.1',
        'NV2g024306000.1',
        'NV2g024303000.1',
        'NV2g020231000.1',
        'NV2g011180000.1',
        'NV2g009892000.1',
        'NV2g009676000.1',
        'NV2g025037000.1',
        'NV2g011361000.1',
        'NV2g012780000.1',
        'NV2g010021000.1',
        'NV2g015150000.1',
        'NV2g014334000.1',
        'NV2g015152000.1',
        'NV2g012332000.1',
        'NV2g003480000.1',
        'NV2g023244000.1',
        'NV2g024221000.1',
        'NV2g024302000.1',
        'NV2g023245000.1',
        'NV2g020252000.1',
    ],
    size=20,
)

In [None]:
# Plotting top marker gene for each cluster
# NOTE: this rebinds `markers` (until here the marker DataFrame) to a plain list of gene names.
# Clusters are visited in np.unique order of the label strings.
markers = [
    adata.uns['rank_genes_groups']['names'][str(cluster)][0]
    for cluster in np.unique(adata.obs['leiden'])
]
sc.pl.umap(adata,color=markers,color_map = 'viridis')

In [None]:
# Embryonic ectoderm => TXT51-like 
# UMAP coloured by expression of the marker gene (viridis colour map)
sc.pl.umap(adata,color=['NV2g004249000.1'],size=20,ncols=2,color_map = 'viridis') 

In [None]:
# Embryonic endomesoderm => NvSnailB
sc.pl.umap(adata,color=['NV2g024886000.1'],size=20,ncols=2,color_map = 'viridis') 
# Significant markers of cluster 6, kept in memory only (nothing is written to disk here).
# NOTE(review): cluster 6 is labelled "Ectodermis" in the annotation list further down — confirm.
embryonic_mesoderm_markers = cluster_markers(significant_markers,['6']) 

In [None]:
# Embryonic endomesoderm
# One UMAP panel per candidate marker gene, two panels per row
sc.pl.umap(adata,color=['NV2g008037000.1','NV2g023418000.1','NV2g016313000.1','NV2g014212000.1','NV2g016600000.1','NV2g014143000.1'],size=20,ncols=2,color_map = 'viridis') 

In [None]:
# Endomesoderm
# One UMAP panel per candidate marker gene, two panels per row
sc.pl.umap(adata,color=['NV2g019682000.1','NV2g019627000.1','NV2g003734000.1','NV2g010172000.1'],size=20,ncols=2,color_map = 'viridis') 

In [None]:
# Pharyngeal ectoderm => NvFoxA
sc.pl.umap(adata,color=["NV2g011441000.1","NV2g007354000.1","NV2g022761000.1","NV2g020247000.1","NV2g007661000.1","NV2g019607000.1","NV2g012087000.1","NV2g001654000.1","NV2g025421000.1"],size=20,ncols=2,color_map = 'viridis')
# Significant markers pooled over clusters 1, 2, 3 and 15, kept in memory only
pharyngeal_ectoderm_markers = cluster_markers(significant_markers,['1','2','3','15'])  

In [None]:
# Ectoderm epidermis
# UMAP coloured by expression of the marker gene (viridis colour map)
sc.pl.umap(adata,color=['NV2g013603000.1'],size=20,ncols=2,color_map = 'viridis')

In [None]:
# Gastroderm => FRIS-like4
# UMAP coloured by expression of the marker gene (viridis colour map)
sc.pl.umap(adata,color=['NV2g009345000.1'],size=20,ncols=2,color_map = 'viridis') 

In [None]:
# Retractor muscle => Nve-Tpm2 
# One UMAP panel per marker gene, side by side
sc.pl.umap(adata,color=['NV2g019134000.1','NV2g017681000.1'],size=20,ncols=2,color_map = 'viridis') 

In [None]:
# NPC => NvSoxB.2  
# One UMAP panel per marker gene, two panels per row
sc.pl.umap(adata,color=['NV2g004477000.1','NV2g013863000.1','NV2g017090000.1','NV2g015516000.1'],size=20,ncols=2,color_map = 'viridis') 

In [None]:
# Neuronal => NvAshA
sc.pl.umap(adata,color=['NV2g009665000.1'],size=20,ncols=2,color_map = 'viridis') 
# Significant markers of cluster 5, written to Nematostella_polyp_neuronal_markers.txt under outputPath
neuronal_markers = cluster_markers(significant_markers,['5']) 
save_markers(neuronal_markers,outputPath,"neuronal")

In [None]:
# Secretory => VKT52-like
sc.pl.umap(adata,color=['NV2g008165000.1'],size=20,ncols=2,color_map = 'viridis') 
# Clusters 16, 17 and 20 are the ones labelled "Secretory cells" in this notebook's annotation.
# Cluster '23' does not exist among the 22 clusters (0-21) named in that annotation; selecting it
# yields an empty marker list and an empty output file.
# NOTE(review): confirm the cluster choice against the current clustering.
secretory_markers = cluster_markers(significant_markers,['16','17','20']) 
save_markers(secretory_markers,outputPath,"secretory")

In [None]:
# Mucous gland 
sc.pl.umap(adata,color=['NV2g012902000.1'],size=20,ncols=2,color_map = 'viridis') # C8 (mucin)
# Significant markers of cluster 13, written to Nematostella_polyp_mucous_markers.txt under outputPath
mucous_markers = cluster_markers(significant_markers,['13'])
save_markers(mucous_markers,outputPath,"mucous") 

In [None]:
# Cnidocyte => EVA1C-like3
sc.pl.umap(adata,color=['NV2g005200000.1'],size=20,ncols=2,color_map = 'viridis')
# Significant markers pooled over clusters 7 and 9, written to Nematostella_polyp_cnidocyte_markers.txt
cnidocyte_markers = cluster_markers(significant_markers,['7','9']) 
save_markers(cnidocyte_markers,outputPath,"cnidocyte")

In [None]:
# Mature cnidocyte => FOS-like
sc.pl.umap(adata,color=['NV2g019749000.1'],size=20,ncols=2,color_map = 'viridis') 
# Significant markers of cluster 14, written to Nematostella_polyp_mature_cnidocyte_markers.txt
cnidocyte_mature_markers = cluster_markers(significant_markers,['14']) 
save_markers(cnidocyte_mature_markers,outputPath,"mature_cnidocyte")

In [None]:
# Smooth muscle => Calponin
# UMAP coloured by expression of the marker gene (viridis colour map)
sc.pl.umap(adata,color=['NV2g009314000.1'],size=20,ncols=2,color_map = 'viridis') 

In [None]:
# PAGA graph computed over the Leiden clusters (plotted in a later cell)
sc.tl.paga(adata, groups='leiden')

In [None]:
# Start the annotation column from the Leiden cluster labels; its categories are renamed below
adata.obs['leiden_anno'] = adata.obs['leiden']

In [None]:
# Set the annotation labels to the plain cluster numbers '0'..'21'.
# rename_categories is used because assigning to `.cat.categories` was deprecated in
# pandas 1.5 and removed in pandas 2.0. As before, a label list whose length differs from
# the number of clusters raises an error.
adata.obs['leiden_anno'] = adata.obs['leiden_anno'].cat.rename_categories(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12','13', '14', '15', '16', '17', '18', '19', '20','21'])

In [None]:
# Lower the figure resolution to 80 dpi for the plots that follow
sc.settings.set_figure_params(dpi=80, facecolor='white')

In [None]:
# Draw the PAGA graph with a 0.03 edge threshold; show=False leaves the figure to the notebook
sc.pl.paga(adata, threshold=0.03, show=False,fontsize=9,edge_width_scale=0.5)

In [None]:
# Cluster labels next to the expression of gene NV2g016733000.1, labels drawn on the data
sc.pl.umap(adata, color=['leiden','NV2g016733000.1'], legend_loc= 'on data')

In [None]:
# Significant marker genes of cluster 14 (shown as the cell output)
cluster_markers(significant_markers,['14'])

In [None]:
# Replace the cluster numbers by "<number> - <cell type>" labels, given in cluster order (0..21).
# rename_categories is used because assigning to `.cat.categories` was deprecated in
# pandas 1.5 and removed in pandas 2.0. As before, a label list whose length differs from
# the number of clusters raises an error, which flags annotations that no longer fit the clustering.
adata.obs['leiden_anno'] = adata.obs['leiden_anno'].cat.rename_categories(["0 - Gastrodermis","1 - Ectodermis","2 - Pharyngeal ectodermis","3 - Pharyngeal ectodermis","4 - Epidermis","5 - Neurons","6 - Ectodermis","7 - Cnidocytes","8 - Smooth muscle","9 - Differentiating cnidocytes","10 - NPC","11 - Ectodermis","12 - Retractor muscle","13 - Mucous gland cells","14 - Mature cnidocytes","15 - Pharyngeal ectoderm","16 - Secretory cells","17 - Secretory cells","18 - Gastrodermis","19 - ?","20 - Secretory cells","21 - ? (Neurons-like cells)"])

In [None]:
sc.settings.set_figure_params(dpi=90, facecolor='white')
# One UMAP panel per gene plus the annotated clusters; `save` also writes the figure to a PDF
sc.pl.umap(
    adata,
    color=[
        'NV2g009911000.1',
        'NV2g024306000.1',
        'NV2g024303000.1',
        'NV2g020231000.1',
        'NV2g011180000.1',
        'NV2g009892000.1',
        'NV2g009676000.1',
        'NV2g025037000.1',
        'NV2g011361000.1',
        'NV2g012780000.1',
        'NV2g010021000.1',
        'NV2g015150000.1',
        'NV2g014334000.1',
        'NV2g015152000.1',
        'NV2g012332000.1',
        'NV2g003480000.1',
        'NV2g023244000.1',
        'NV2g024221000.1',
        'NV2g024302000.1',
        'NV2g023245000.1',
        'NV2g020252000.1',
        'leiden_anno',
    ],
    frameon=False,
    save="synaptoNvPolyp.pdf",
)

In [None]:
sc.settings.set_figure_params(dpi=150, facecolor='white')
# Clusters 7, 9 and 14 in purple (#A366E0); the other clusters of 0-21 in light grey (#E2E2E2)
sc.pl.umap(
    adata,
    title='Nematostella polyp\nRBH - one2one - orthopairsBased',
    color=['leiden'],
    legend_loc='on data',
    legend_fontsize=6,
    legend_fontoutline=3,
    palette={
        str(cluster): "#A366E0" if cluster in (7, 9, 14) else "#E2E2E2"
        for cluster in range(22)
    },
    frameon=False,
)

In [None]:
# Clusters 7, 9 and 14 in purple (#A366E0); the other clusters of 0-21 in light grey (#E2E2E2)
sc.pl.umap(
    adata,
    title='Nematostella polyp\n RBH - many2many - orthopairsBased',
    color=['leiden'],
    legend_loc='on data',
    legend_fontsize=6,
    legend_fontoutline=3,
    palette={
        str(cluster): "#A366E0" if cluster in (7, 9, 14) else "#E2E2E2"
        for cluster in range(22)
    },
    frameon=False,
)

In [None]:
# Clusters 7, 9 and 14 in purple (#A366E0); the other clusters of 0-21 in light grey (#E2E2E2)
sc.pl.umap(
    adata,
    title='Nematostella polyp\nOrthoFinder - many2many - orthopairsBased',
    color=['leiden'],
    legend_loc='on data',
    legend_fontsize=6,
    legend_fontoutline=3,
    palette={
        str(cluster): "#A366E0" if cluster in (7, 9, 14) else "#E2E2E2"
        for cluster in range(22)
    },
    frameon=False,
)

In [None]:
# Clusters 7, 9 and 14 in purple (#A366E0); the other clusters of 0-21 in light grey (#E2E2E2)
sc.pl.umap(
    adata,
    title='Nematostella polyp\nOrthoFinder - many2many - genomeBased',
    color=['leiden'],
    legend_loc='on data',
    legend_fontsize=6,
    legend_fontoutline=3,
    palette={
        str(cluster): "#A366E0" if cluster in (7, 9, 14) else "#E2E2E2"
        for cluster in range(22)
    },
    frameon=False,
)

In [None]:
# Cluster 5 in blue (#0080ff); the other clusters of 0-21 in light grey (#E2E2E2)
sc.pl.umap(
    adata,
    title='Nematostella polyp\nRBH - one2one - orthopairsBased',
    color=['leiden'],
    legend_loc='on data',
    legend_fontsize=6,
    legend_fontoutline=3,
    palette={
        str(cluster): "#0080ff" if cluster == 5 else "#E2E2E2"
        for cluster in range(22)
    },
    frameon=False,
)

In [None]:
# Clusters 4, 5, 10, 12 and 14 in blue (#0080ff); the other clusters of 0-21 in light grey (#E2E2E2)
sc.pl.umap(
    adata,
    title='Nematostella polyp\nRBH - many2many - orthopairsBased',
    color=['leiden'],
    legend_loc='on data',
    legend_fontsize=6,
    legend_fontoutline=3,
    palette={
        str(cluster): "#0080ff" if cluster in (4, 5, 10, 12, 14) else "#E2E2E2"
        for cluster in range(22)
    },
    frameon=False,
)

In [None]:
# Clusters 4, 5, 12 and 14 in blue (#0080ff); the other clusters of 0-21 in light grey (#E2E2E2)
sc.pl.umap(
    adata,
    title='Nematostella polyp\nOrthoFinder - many2many - orthopairsBased',
    color=['leiden'],
    legend_loc='on data',
    legend_fontsize=6,
    legend_fontoutline=3,
    palette={
        str(cluster): "#0080ff" if cluster in (4, 5, 12, 14) else "#E2E2E2"
        for cluster in range(22)
    },
    frameon=False,
)

In [None]:
# Clusters 4, 5, 12 and 14 in blue (#0080ff); the other clusters of 0-21 in light grey (#E2E2E2)
sc.pl.umap(
    adata,
    title='Nematostella polyp\nOrthoFinder - many2many - genomeBased',
    color=['leiden'],
    legend_loc='on data',
    legend_fontsize=6,
    legend_fontoutline=3,
    palette={
        str(cluster): "#0080ff" if cluster in (4, 5, 12, 14) else "#E2E2E2"
        for cluster in range(22)
    },
    frameon=False,
)