In [None]:
"""
Analysis of Clytia medusa single cell RNAseq data using ScanPy
Marc Meynadier
"""

In [None]:
!date

In [None]:
import numpy as np
import anndata
import pandas as pd
import scanpy as sc
import scipy.sparse
import scanpy.external as sce
import os

import warnings
warnings.filterwarnings('ignore')

from sklearn.neighbors import (KNeighborsClassifier,NeighborhoodComponentsAnalysis)
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
from matplotlib import rcParams
#import plotly.tools as tls

import seaborn as sns
sns.set(style="whitegrid")

import collections
import scipy.sparse as sp_sparse
import h5py

In [None]:
sc.logging.print_header()
sc.settings.set_figure_params(dpi=150, facecolor='white')

In [None]:
# Loading medusa data

ipmcPath = "../../../../species/Clytia/analysis/STARmapping/chari2021/IPMC/"
emblPath = "../../../../species/Clytia/analysis/STARmapping/chari2021/EMBL/"
fsPath = "../../../../species/Clytia/analysis/STARmapping/chari2021/FS/"


def _read_10x_unique(mtxDir):
    """Read one 10x matrix directory and make its gene names unique."""
    sampleData = sc.read_10x_mtx(mtxDir)
    sampleData.var_names_make_unique()
    return sampleData


ipmcData = _read_10x_unique(ipmcPath)
emblData = _read_10x_unique(emblPath)
fsData = _read_10x_unique(fsPath)

In [None]:
# Metadata: every library is tagged as medusa and labelled with its sample name
for sampleData, sampleName in ((ipmcData, "IPMC"), (emblData, "EMBL"), (fsData, "FS")):
    sampleData.obs['type'] = "medusa"
    sampleData.obs['sample'] = sampleName

In [None]:
# Merging the three samples (outer join keeps genes present in any sample)
adata = ipmcData.concatenate(emblData, fsData, join='outer')
# Number of cells contributed by each sample
sampleCounts = adata.obs['sample'].value_counts()
print(sampleCounts)

In [None]:
# Mitochondrial genes
adata.var['mt'] = adata.var_names.str.startswith('XLOC_MITO')
adata.var.mt.value_counts()

In [None]:
# Mitochondrial quality check
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=True, inplace=True)

# Per-cell count totals: mitochondrial genes only, and all genes
mito_genes = adata.var_names.str.startswith('XLOC_MITO')
mito_counts = adata[:, mito_genes].X.sum(axis=1).A1
all_counts = adata.X.sum(axis=1).A1

# Despite its name, percent_mt2 is a fraction (mito counts / all counts), not a percentage
adata.obs['percent_mt2'] = mito_counts / all_counts
# Total counts per cell, stored as an observation annotation
adata.obs['n_counts'] = all_counts
adata

In [None]:
sc.pl.violin(adata, ['n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts'],
             jitter=0.4, groupby = 'sample', rotation= 45, ncols=2)
sc.pl.violin(adata, ['pct_counts_mt', 'percent_mt2','log1p_total_counts_mt'],
             jitter=0.4, groupby = 'sample', rotation= 45, ncols=2)

In [None]:
# Plot quality check measures, one scatter per (x, y) pair, coloured by sample
qcAxisPairs = [
    ('total_counts', 'pct_counts_mt'),
    ('total_counts', 'n_genes_by_counts'),
    ('log1p_total_counts', 'log1p_n_genes_by_counts'),
]
for xKey, yKey in qcAxisPairs:
    sc.pl.scatter(adata, x=xKey, y=yKey, color="sample")

In [None]:
# Filtering by gene counts and mitochondrial gene counts:
# keep cells with fewer than 9000 detected genes and less than 1% mitochondrial counts
keep_cells = (adata.obs.n_genes_by_counts < 9000) & (adata.obs.pct_counts_mt < 1)
# .copy() materialises the subset; without it adata is only a view of the
# unfiltered object and the in-place filters that follow would act on a view
adata = adata[keep_cells, :].copy()
adata

In [None]:
# Filtering by number of counts and number of cells:
# drop cells with fewer than 500 counts, then genes seen in fewer than 2 cells
sc.pp.filter_cells(adata, min_counts=500)
sc.pp.filter_genes(adata, min_cells=2)
# Remaining (cells, genes)
print(*adata.shape)

In [None]:
# Plotting percentage of counts per gene 
sc.pl.highest_expr_genes(adata, n_top=50)

In [None]:
# Exporting the filtered, not yet normalised adata
rPath = os.getcwd()
# outputPath is reused by the later export and marker-saving cells
outputPath = f"{rPath}/../../../../species/Clytia/analysis/ScanPy/"
rawOutputFile = f"{outputPath}Clytia_medusa_raw.h5ad"
adata.write_h5ad(rawOutputFile)

In [None]:
# Normalise each cell to 10,000 total counts, log-transform, and flag highly variable genes
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

# Keep the log-normalised matrix of all genes reachable through adata.raw
adata.raw = adata

# Restrict to highly variable genes. .copy() turns the subset into a real AnnData:
# regress_out / scale / pca modify the object, and modifying a view forces an
# implicit copy (with a warning that the global warnings filter hides).
adata = adata[:, adata.var.highly_variable].copy()
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')

In [None]:
# Correcting batch effect
sce.pp.harmony_integrate(adata,'sample',max_iter_harmony=20)

In [None]:
adata

In [None]:
#plot the amount of variance explained by each PC
sc.pl.pca_variance_ratio(adata, log=True, n_pcs = 50)

In [None]:
# Computing neighbors graph (50 neighbours, 40 components) on the 'X_pca_harmony'
# representation — presumably the embedding written by the Harmony step; confirm
sc.pp.neighbors(adata, n_neighbors=50, n_pcs=40, knn=True, use_rep='X_pca_harmony')

# Clustering the graph using Leiden algorithm.
# NOTE(review): later cells hard-code cluster IDs and a fixed list of annotation
# labels, so they depend on this clustering yielding the same clusters.
sc.tl.leiden(adata,resolution=1) 

# Computing UMAP
sc.tl.umap(adata)

In [None]:
sc.pl.umap(adata, color=['leiden','sample'], title="UMAP medusa",ncols=1)
sc.pl.umap(adata, color=['leiden'], legend_loc= 'on data')
sc.pl.umap(adata, color=['total_counts', 'n_genes_by_counts'])
sc.pl.umap(adata, color=['log1p_total_counts', 'log1p_n_genes_by_counts'])

In [None]:
# Checking how each sample contributes to the final map => investigate possible batch effect

# All cells first, then one pair of panels per sample
sc.pl.umap(adata, color=['leiden'], frameon=False)
for sampleName in ('IPMC', 'FS', 'EMBL'):
    sampleCells = adata.obs.loc[adata.obs['sample'].isin([sampleName])].index.tolist()
    sc.pl.umap(
        adata[sampleCells, ],
        color=['leiden', 'sample'],
        title=['Clusters', 'sample'],
        frameon=False,
        size=10,
        ncols=1,
    )

In [None]:
# Re-exporting data
adata.write_h5ad(outputPath+'Clytia_medusa.h5ad')

In [None]:
# Ranking and extracting marker genes for all clusters
sc.tl.rank_genes_groups(adata,'leiden',n_genes=1000,method='wilcoxon')

# Cluster labels are numeric strings: sort them numerically, then back to strings
clusters = np.sort(np.unique(adata.obs['leiden']).astype('int')).astype('str')

# Flatten the per-cluster ranking into one long table: cluster, gene, adjusted p-value
rankedGenes = adata.uns['rank_genes_groups']
clus = []
markerGene = []
padj = []
for i in clusters:
    genes = rankedGenes['names'][str(i)]
    clus.extend([i] * len(genes))
    markerGene.extend(genes)
    padj.extend(rankedGenes['pvals_adj'][str(i)])
markers = pd.DataFrame({'clus': clus, 'markerGene': markerGene, 'padj': padj})

# Vectorised significance flag ("T"/"F") at adjusted p < 0.05; NaN compares False -> "F"
markers['significant'] = np.where(markers['padj'] < 0.05, "T", "F")

significant_markers = markers[markers.significant == "T"]
significant_markers.to_csv(outputPath+'Clytia_medusa_markers_1000g.csv',sep='\t',index=False)
print(significant_markers)

In [None]:
def cluster_markers(markersDf,clusterNumber):
    """Return the marker genes belonging to one or several clusters.

    Parameters
    ----------
    markersDf : pandas.DataFrame
        Marker table with at least the columns "clus" and "markerGene".
    clusterNumber : list-like or scalar
        Cluster label(s) to select, compared against the "clus" column.
        A single label (e.g. '0') is accepted as well as a list of labels.

    Returns
    -------
    list
        Values of "markerGene" for the matching rows, in table order.
    """
    # Series.isin rejects scalars (including bare strings), so wrap them
    if not pd.api.types.is_list_like(clusterNumber):
        clusterNumber = [clusterNumber]
    subset = markersDf[markersDf["clus"].isin(clusterNumber)]
    return subset["markerGene"].tolist()

def save_markers(markerList,path,clusterName):
    """Write marker gene names to a text file, one per line.

    The file is created (or overwritten) at
    path + "Clytia_medusa_" + clusterName + "_markers.txt"; `path` is
    concatenated as-is, so it must already end with a path separator.

    Parameters
    ----------
    markerList : iterable of str
        Gene names to write.
    path : str
        Output directory prefix.
    clusterName : str
        Label inserted into the file name.
    """
    # The with-block closes the file; no explicit close is needed
    with open(path+"Clytia_medusa_"+clusterName+"_markers.txt","w") as f:
        f.writelines(gene+"\n" for gene in markerList)

In [None]:
cluster_markers(significant_markers,['0'])

In [None]:
#Test
sc.pl.umap(adata,color=['XLOC_001915','XLOC_004970','XLOC_005639','XLOC_014218','XLOC_034515','XLOC_034627','XLOC_045283'])

In [None]:
# Synaptotagmin
sc.pl.umap(adata,color=['XLOC_038850','XLOC_006609','XLOC_038825','XLOC_041567','XLOC_007395','XLOC_040916','XLOC_029295','XLOC_029731','XLOC_043438','XLOC_011327','XLOC_002797','XLOC_011867','XLOC_002849','XLOC_002851','XLOC_000865','XLOC_003662','XLOC_019417'])

In [None]:
# Plotting top marker gene for each cluster
# (first-ranked gene name per Leiden cluster, clusters in np.unique order)
rankedNames = adata.uns['rank_genes_groups']['names']
markers = [rankedNames[str(cluster)][0] for cluster in np.unique(adata.obs['leiden'])]
sc.pl.umap(adata, color=markers, color_map='viridis')

In [None]:
# Cnidocytes: plot known marker genes on the UMAP, then collect the significant
# markers of the listed clusters and save them to a text file in outputPath.
sc.pl.umap(adata,color=['XLOC_017841','XLOC_017845','XLOC_015554','XLOC_044122','XLOC_039385','XLOC_039341','XLOC_004102'],title=['XLOC_017841 znf845-a','XLOC_017845 znf845-b','XLOC_015554 mos3','XLOC_044122 mcol3/4','XLOC_039385 m14-peptidase','XLOC_039341 sans/USH-1G','XLOC_004102 nematocilin'],size=20,ncols=2,color_map = 'viridis')
# NOTE(review): these cluster IDs are hard-coded and differ from the clusters
# labelled as cnidocytes in the annotation list further down (3, 4, 9, 13, 16, 27)
# — confirm they refer to the current Leiden clustering.
cnidocyte_markers = cluster_markers(significant_markers,['4','14','28','11','7','16','25']) 
save_markers(cnidocyte_markers,outputPath,"cnidocyte")

In [None]:
# Neurons: plot known marker genes on the UMAP, then collect the significant
# markers of the listed clusters and save them to a text file in outputPath.
sc.pl.umap(adata,color=['XLOC_001566','XLOC_029731','XLOC_004785','XLOC_030971','XLOC_030920','XLOC_018937','XLOC_045293'],title=['XLOC_001566 calmodulin','XLOC_029731 synaptotagmin5','XLOC_004785 ELAV1','XLOC_030971 ELAV2','XLOC_030920 hlh6','XLOC_018937 neurogenin','XLOC_045293 sox10'],size=40,color_map = 'viridis')
# NOTE(review): cluster IDs are hard-coded; the annotation list further down names
# only clusters 0-31, so '33' would match nothing and is silently ignored by isin
# — confirm these IDs against the current Leiden clustering.
neuronal_markers = cluster_markers(significant_markers,['33','31','30','17','13','26']) 
save_markers(neuronal_markers,outputPath,"neuronal")

In [None]:
# stem/germ
# Gene IDs and panel titles are kept side by side so a title cannot drift from
# the gene actually plotted. The first title previously read 'GFP2a XLOC_004102'
# while the plotted gene is XLOC_004150 (XLOC_004102 is labelled nematocilin in
# the cnidocyte plots); it now names the plotted gene.
stemGermPanels = [
    ('XLOC_004150', 'GFP2a XLOC_004150'),
    ('XLOC_002094', 'Clytin2 XLOC_002094'),
    ('XLOC_044232', 'Oocyte Protein XLOC_044232'),
    ('XLOC_043332', 'Boule1a XLOC_043332'),
    ('XLOC_006164', 'FMNeductase XLOC_006164'),
    ('XLOC_007915', 'Piwi XLOC_007915'),
    ('XLOC_039192', 'XLOC_039192 Nanos2'),
    ('XLOC_033976', 'XLOC_033976 PL10'),
    ('XLOC_033801', 'XLOC_033801 Vasa'),
]
sc.pl.umap(
    adata,
    color=[gene for gene, _ in stemGermPanels],
    title=[panelTitle for _, panelTitle in stemGermPanels],
    size=20,
    color_map='viridis',
)

In [None]:
#Gastroderm markers
sc.pl.umap(adata, color=['XLOC_005609','XLOC_008632','XLOC_006965','XLOC_008858','XLOC_038345','XLOC_010708','XLOC_033751','XLOC_007437','XLOC_029934','XLOC_030379','XLOC_032555'], title=['XLOC_005609 VCBSprot','XLOC_008632 FibCdom-2','XLOC_006965 CathepsinL','XLOC_008858 DDAH','XLOC_038345 CheGast','XLOC_010708 P-lipase','XLOC_033751 Fibulin','XLOC_007437 BP10-like','XLOC_029934 FibColl-A','XLOC_030379 FibColl-B','XLOC_032555 NucHyd'], size=20, color_map='viridis')

In [None]:
#bioluminescent cells
sc.pl.umap(adata, color=['XLOC_004165'],title=['XLOC_004165 GFP2'],size=20, color_map='viridis')

In [None]:
#Gland cells
sc.pl.umap(adata,color=['XLOC_002105','XLOC_006072','XLOC_021506','XLOC_001911','XLOC_034427','XLOC_002272','XLOC_011101'],title=['XLOC_002105 Chitinase','XLOC_006072 FibCdom-1','XLOC_021506 C-lectin','XLOC_001911 Tryp-like','XLOC_034427 ShKT-TrypB','XLOC_002272 ShKT-TrypA','XLOC_011101 C3-Lipase'], size=20, color_map='viridis')

In [None]:
# epidermal/muscle from medusa
sc.pl.umap(adata,color=['XLOC_000520','XLOC_042542','XLOC_044475','XLOC_038183','XLOC_029205','XLOC_007598','XLOC_035357','XLOC_031686'],title=['XLOC_000520 TPM-A','XLOC_042542 TPM-B','XLOC_044475 Peroxidase','XLOC_038183 ST MyHCa','XLOC_029205 ST MyHCb','XLOC_007598 GFP3','XLOC_035357 GFP4','XLOC_031686 wnt2'],size=40,color_map='viridis', ncols=2)

In [None]:
# Mixed markers: one representative gene from several cell-type marker sets.
# NOTE(review): the last panel is titled 'XLOC_004102 GFP2a', but the cnidocyte
# plots label XLOC_004102 as nematocilin and the bioluminescent-cell plot labels
# XLOC_004165 as GFP2 — confirm which gene/label was intended here.
sc.pl.umap(adata,color=['XLOC_006965','XLOC_036006','XLOC_007915','XLOC_033801','XLOC_007598','XLOC_001566','XLOC_029731','XLOC_004102'],title=['XLOC_006965 cathepsinL','XLOC_036006 nanos1','XLOC_007915 piwi','XLOC_033801 vasa','XLOC_007598 GFP3','XLOC_001566 calmodulin','XLOC_029731 synaptotagmin5','XLOC_004102 GFP2a'],size=40,color_map = 'viridis')

In [None]:
# stem cells vasa+
sc.pl.umap(adata, color=['XLOC_033801','XLOC_000173','XLOC_007932'], title=('XLOC_033801 vasa','XLOC_000173','XLOC_007932'), size=40, color_map='viridis')

In [None]:
# ectoderm v3
# NOTE(review): XLOC_000520 appears twice in the colour list (titled
# 'tropomyosin 1' and 'TPM-A'), so the same gene is drawn in two panels —
# confirm whether one of the two entries was meant to be a different gene.
sc.pl.umap(adata, color=['XLOC_000520','XLOC_032848','XLOC_000520','XLOC_011667','XLOC_038183','XLOC_035583','XLOC_005114'], title=['XLOC_000520 tropomyosin 1','XLOC_032848','XLOC_000520 TPM-A','XLOC_011667 myosin lc kinase','XLOC_038183 ST MyHCa','XLOC_035583  GFP1','XLOC_005114'], size=40, color_map='viridis')

In [None]:
# gastroderm v3
sc.pl.umap(adata,color=['XLOC_006965','XLOC_008858','XLOC_038345','XLOC_037915'],title=['XLOC_006965 CathepsinL','XLOC_008858 DDAH','XLOC_038345 cheGAst','XLOC_037915'], size=40, color_map='viridis')

In [None]:
# neuropeptides
sc.pl.umap(adata,color=['XLOC_017096','XLOC_003691','XLOC_019434','XLOC_012334','XLOC_040580','XLOC_041442','XLOC_041402','XLOC_017097','XLOC_000626','XLOC_004021','XLOC_003339','XLOC_030120','XLOC_040209','XLOC_038155','XLOC_010892','XLOC_035224','XLOC_021799','XLOC_008730','XLOC_014624'],title=['XLOC_017096 pp2','XLOC_003691 pp3','XLOC_019434 pp5','XLOC_012334 pp6','XLOC_040580 pp7b','XLOC_041442 pp9a','XLOC_041402 pp9b','XLOC_017097 pp11','XLOC_000626 pp13','XLOC_004021 pp14','XLOC_003339 pp15','XLOC_030120 pp17','XLOC_040209 pp19','XLOC_038155 pp20','XLOC_010892 pp21','XLOC_035224 pp24','XLOC_021799 pp25','XLOC_008730 pp26','XLOC_014624 pp27'],size=40,color_map = 'viridis')

In [None]:
sc.tl.paga(adata, groups='leiden')

In [None]:
adata.obs['leiden_anno'] = adata.obs['leiden']

In [None]:
# Rename the cluster categories positionally (category k -> k-th label below).
# rename_categories replaces direct assignment to .cat.categories, which recent
# pandas versions no longer allow; it raises ValueError if the number of labels
# does not match the number of clusters.
adata.obs['leiden_anno'] = adata.obs['leiden_anno'].cat.rename_categories(["0 - Medium oocytes","1 - Exumbrella epidermis","2 - I-cells","3 - Mature cnidocytes","4 - Early cnidocytes","5 - Exumbrella epidermis B","6 - Gastrodermis","7 - Striated muscle","8 - Mixed profile","9 - Cnidocytes","10 - Gastrodermis","11 - Tentacle bulb epidermis","12 - Gastrodermis","13 - Terminal differentiating cnidocytes","14 - NPC","15 - Gonad/Manubrium epidermis","16 - Differentiating cnidocytes","17 - Neurons (pp11/pp17)","18 - Gastrodermis","19 - Gastrodermis","20 - Radial smooth muscle","21 - Tentacle bulb gastrodermis","22 - Digestive gland cells","23 - Neurons (pp14/pp25)","24 - Endodermal plate","25 - Digestive gland cells","26 - Neurons","27 - Differentiating cnidocyte","28 - Digestive gland cells","29 - Neurons (pp9)","30 - Small oocytes","31 - Digestive gland cells"])

In [None]:
sc.pl.paga(adata, threshold=0.03, show=False,fontsize=9)

In [None]:
sc.pl.umap(adata, color=['leiden','XLOC_045293'], legend_loc= 'on data')

In [None]:
# epidermis
sc.pl.umap(adata, color=['XLOC_000520','XLOC_007598','XLOC_035357','XLOC_042542','XLOC_029205','XLOC_038183','XLOC_031686'])

In [None]:
# gastroderm
sc.pl.umap(adata, color=['XLOC_006965','XLOC_038345','XLOC_010708','XLOC_032555','XLOC_008632','XLOC_005609','XLOC_008858','XLOC_033751','XLOC_007437','XLOC_029934','XLOC_030379'])

In [None]:
# Bioluminescent  
sc.pl.umap(adata, color=['XLOC_004165','XLOC_002094'])

In [None]:
# Oocytes
sc.pl.umap(adata, color=['XLOC_004150','XLOC_044232','XLOC_006164'])

In [None]:
# i-cells
sc.pl.umap(adata, color=['XLOC_007915','XLOC_036006','XLOC_033801','XLOC_017841'])

In [None]:
# Cnidocytes
sc.pl.umap(adata, color=['XLOC_015554','XLOC_044122','XLOC_039385','XLOC_011100','XLOC_039341','XLOC_004102'])

In [None]:
# Neural cells
sc.pl.umap(adata, color=['XLOC_030971','XLOC_008730','XLOC_014624','XLOC_038155','XLOC_019434','XLOC_021799','XLOC_017097','XLOC_042761','XLOC_030120','XLOC_040584','XLOC_041442','XLOC_004021'])

In [None]:
# Digestive gland cells
sc.pl.umap(adata, color=['XLOC_002105','XLOC_034427','XLOC_002272','XLOC_001911','XLOC_011101','XLOC_021506','XLOC_006072'])

In [None]:
sc.pl.umap(adata, color=['XLOC_038850','XLOC_006609','XLOC_038825','XLOC_041567','XLOC_007395','XLOC_040916','XLOC_029295','XLOC_029731','XLOC_043438','XLOC_011327','XLOC_002797','XLOC_011867','XLOC_002849','XLOC_002851','XLOC_000865','XLOC_003662','XLOC_019417','leiden_anno'],frameon=False,save="synaptoChMedusa.pdf")

In [None]:
cluster_markers(significant_markers,['3'])

In [None]:
# Final annotation labels, applied positionally (category k -> k-th label below).
# rename_categories replaces direct assignment to .cat.categories, which recent
# pandas versions no longer allow; it raises ValueError if the number of labels
# does not match the number of clusters.
adata.obs['leiden_anno'] = adata.obs['leiden_anno'].cat.rename_categories(["0 - Medium oocytes","1 - Exumbrella epidermis A","2 - I-cells","3 - Mature cnidocytes","4 - Early cnidocytes","5 - Exumbrella epidermis B","6 - Gastrodermis","7 - Striated muscle","8 - Mixed profile","9 - Cnidocytes","10 - Gastrodermis","11 - Tentacle bulb epidermis","12 - Gastrodermis","13 - Terminal differentiating cnidocytes","14 - NPC","15 - Gonad/Manubrium epidermis","16 - Differentiating cnidocytes","17 - Neurons (pp11/pp17)","18 - Gastrodermis","19 - Gastrodermis","20 - Radial smooth muscle","21 - Tentacle bulb gastrodermis","22 - Digestive gland cells","23 - Neurons (pp14/pp25)","24 - Endodermal plate","25 - Digestive gland cells","26 - Neurons","27 - Differentiating cnidocyte","28 - Digestive gland cells","29 - Neurons (pp9)","30 - Small oocytes","31 - Digestive gland cells"])

In [None]:
sc.pl.umap(adata, color=['leiden_anno'],title='Clytia medusa annotated clusters',legend_fontsize=7,ncols=1,groups=["3 - Mature cnidocytes","4 - Early cnidocytes","9 - Cnidocytes","13 - Terminal differentiating cnidocytes","16 - Differentiating cnidocytes","27 - Differentiating cnidocyte"],frameon=True,save='Clytia_medusa_cnidocytes_UMAP.pdf')

In [None]:
adata

In [None]:
# Exporting for SCVI

adata.write_h5ad(outputPath+'Clytia_medusa_annotated.h5ad')

In [None]:
sc.pl.umap(adata, color=['XLOC_015554','XLOC_044122','XLOC_039385','XLOC_039341','XLOC_004102','XLOC_039010','XLOC_001015','XLOC_034759'],title=['Mos3','Minicollagen','Peptidase_M14','Sans/USH-IG','Nematocilin','Gal_lectin','TMEM214','EF-hand'])