In [None]:
"""
Analysis of Hydractinia polyp single-cell RNA-seq data using Scanpy
Marc Meynadier
"""

In [None]:
# Run the shell `date` command so the notebook output records when it was executed.
!date

In [None]:
import numpy as np
import anndata
import pandas as pd
import scanpy as sc
import scipy.sparse
import scanpy.external as sce
import os

import warnings
warnings.filterwarnings('ignore')

from sklearn.neighbors import (KNeighborsClassifier,NeighborhoodComponentsAnalysis)
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
from matplotlib import rcParams
#import plotly.tools as tls

import seaborn as sns
sns.set(style="whitegrid")

import collections
import scipy.sparse as sp_sparse
import h5py

In [None]:
# Print scanpy's header (package versions) so the run environment is recorded in the output.
sc.logging.print_header()
# Default figure settings for the plots below: 150 dpi on a white background.
sc.settings.set_figure_params(dpi=150, facecolor='white')

In [None]:
# Loading polyp data
# One 10x-format matrix directory per sample (PBS and seawater).
# NOTE(review): these are absolute, machine-specific paths.
pbsPath = "/Users/mmeynadier/Documents/PhD/species/Hydractinia/analysis/STARmapping/schnitzler_2023/SRR18058177"
seawaterPath = "/Users/mmeynadier/Documents/PhD/species/Hydractinia/analysis/STARmapping/schnitzler_2023/SRR18058178"

# Read each sample, then de-duplicate its gene names in place.
pbs = sc.read_10x_mtx(pbsPath)
pbs.var_names_make_unique()

seawater = sc.read_10x_mtx(seawaterPath)
seawater.var_names_make_unique()

In [None]:
# Metadata
# Tag every cell with its tissue type and the sample it came from,
# so the two samples remain distinguishable after merging.
for sample_adata, sample_label in ((pbs, "pbs"), (seawater, "seawater")):
    sample_adata.obs['type'] = "polyp"
    sample_adata.obs['sample'] = sample_label

In [None]:
# Merging
# Outer join: keep the union of the genes found in either sample.
# NOTE(review): AnnData.concatenate is deprecated in recent anndata releases in favour of
# anndata.concat — confirm behaviour (obs-name suffixes, 'batch' column) before migrating.
adata = pbs.concatenate(seawater,join='outer') 
# Sanity check: number of cells contributed by each sample.
print(adata.obs['sample'].value_counts())

In [None]:

# Mitochondrial genes
# The first column of the tab-separated file holds the mitochondrial gene IDs.
# NOTE(review): read_csv treats the first line as a header by default — confirm the
# file really has a header row, otherwise the first gene ID is silently dropped.
mitoGenes = pd.read_csv('../../../../species/Hydractinia/raw/HydractiniaMitoGenes.txt',sep='\t')
mitoGenes = list(set(mitoGenes.iloc[:,0]))

# Flag mitochondrial genes with a single vectorised membership test on the gene names.
# This replaces a nested Python loop over every (gene, mito gene) pair and no longer
# depends on the var index being unnamed (the reset_index()/'index' column round trip).
adata.var['mt'] = adata.var_names.isin(mitoGenes)

print(adata.var)
adata.var.mt.value_counts()

In [None]:
# Mitochondrial quality check
# Adds the standard per-cell / per-gene QC columns, including the 'mt' based ones.
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=True, inplace=True)

# Boolean mask over genes: True where the gene is flagged as mitochondrial.
mito_genes = adata.var['mt'].to_numpy()

# Manual cross-check of the mitochondrial share: counts in mito genes divided by all
# counts, per cell. This is a fraction, not a percentage like 'pct_counts_mt'.
mito_counts_per_cell = np.sum(adata[:, mito_genes].X, axis=1).A1
all_counts_per_cell = np.sum(adata.X, axis=1).A1
adata.obs['percent_mt2'] = mito_counts_per_cell / all_counts_per_cell

# Total counts per cell, stored as an observation annotation.
adata.obs['n_counts'] = adata.X.sum(axis=1).A1
adata

In [None]:
# Violin plots of the QC metrics, split by sample.
# Both figures share the same layout options.
violin_options = dict(jitter=0.4, groupby='sample', rotation=45, ncols=2)

# Gene and count depth per cell (raw and log1p).
depth_metrics = ['n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts']
sc.pl.violin(adata, depth_metrics, **violin_options)

# Mitochondrial content per cell.
mito_metrics = ['pct_counts_mt', 'percent_mt2', 'log1p_total_counts_mt']
sc.pl.violin(adata, mito_metrics, **violin_options)

In [None]:
# Plot quality check measures
# Each (x, y) pair is drawn as one scatter plot, coloured by sample.
qc_axis_pairs = [
    ('total_counts', 'pct_counts_mt'),
    ('total_counts', 'n_genes_by_counts'),
    ('log1p_total_counts', 'log1p_n_genes_by_counts'),
]
for x_metric, y_metric in qc_axis_pairs:
    sc.pl.scatter(adata, x=x_metric, y=y_metric, color="sample")

In [None]:
# Filtering by genes counts and mitochondrial genes counts
MAX_GENES_BY_COUNTS = 9000  # keep cells with fewer detected genes than this
MAX_PCT_COUNTS_MT = 1       # keep cells with a lower 'pct_counts_mt' than this

# Apply both thresholds as one boolean mask and materialise the result with .copy().
# The in-place filters that follow then work on a real AnnData object instead of a
# view of a view (which scanpy has to copy implicitly).
keep_cells = (
    (adata.obs['n_genes_by_counts'] < MAX_GENES_BY_COUNTS)
    & (adata.obs['pct_counts_mt'] < MAX_PCT_COUNTS_MT)
).to_numpy()
adata = adata[keep_cells, :].copy()
adata

In [None]:
# Filtering by number of counts and number of cells
# Drop cells with fewer than 500 total counts.
sc.pp.filter_cells(adata, min_counts=500)
# Drop genes detected in fewer than 2 cells.
sc.pp.filter_genes(adata, min_cells=2)
# Remaining number of cells and genes after filtering.
print(adata.n_obs,adata.n_vars)

In [None]:
# Plotting percentage of counts per gene
# Shows the 50 genes that take up the largest share of counts.
sc.pl.highest_expr_genes(adata, n_top=50)

In [None]:
# Exporting raw adata
# "Raw" here means QC-filtered counts, written before any normalisation is applied.
# The output directory is resolved relative to the current working directory;
# `outputPath` is reused by later export cells.
rPath = os.getcwd()
outputPath = f"{rPath}/../../../../species/Hydractinia/analysis/Scanpy/"
raw_h5ad_file = f"{outputPath}Hydractinia_polyp_raw.h5ad"
adata.write_h5ad(raw_h5ad_file)

In [None]:
# Normalise every cell to 10,000 total counts, log1p-transform, then flag highly variable genes.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

# Keep the log-normalised expression of all genes in .raw before subsetting.
adata.raw = adata

# Restrict to the highly variable genes. .copy() turns the view into a real object,
# so the in-place steps below (regress_out, scale, pca) do not rely on an implicit copy.
adata = adata[:, adata.var.highly_variable].copy()
# Regress out sequencing depth and mitochondrial content, then scale (values clipped at 10).
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')

In [None]:
# Correcting batch effect
# Harmony integration across the 'sample' groups (at most 20 Harmony iterations).
# The neighbours step later reads the corrected embedding from 'X_pca_harmony'.
sce.pp.harmony_integrate(adata,'sample',max_iter_harmony=20)

In [None]:
# Display the AnnData summary to inspect what the previous steps added.
adata

In [None]:
# Plot the amount of variance explained by each PC (first 50 PCs, log scale).
# NOTE(review): presumably used to choose n_pcs for the neighbours graph below — confirm.
sc.pl.pca_variance_ratio(adata, log=True, n_pcs = 50)

In [None]:
# Computing neighbors graph
# Uses the first 16 components of the Harmony-corrected embedding and 50 neighbours per cell.
sc.pp.neighbors(adata, n_neighbors=50, n_pcs=16, knn=True, use_rep='X_pca_harmony')

# Clustering the graph using Leiden algorithm
# Cluster labels are stored in adata.obs['leiden'] and used by the cells below.
sc.tl.leiden(adata,resolution=1.2) 

# Computing UMAP
sc.tl.umap(adata)

In [None]:
# UMAP overview: clusters and sample of origin, stacked in one column.
sc.pl.umap(adata, color=['leiden', 'sample'], title="UMAP Hydractinia polyp", ncols=1)
# Same embedding with the cluster labels drawn on the data.
sc.pl.umap(adata, color=['leiden'], legend_loc='on data')
# Depth metrics on the embedding, raw and then log1p-transformed.
depth_panels = (
    ['total_counts', 'n_genes_by_counts'],
    ['log1p_total_counts', 'log1p_n_genes_by_counts'],
)
for depth_keys in depth_panels:
    sc.pl.umap(adata, color=depth_keys)

In [None]:
# Checking how each sample contributes to the final map => investigate possible batch effect

# Reference plot with all cells.
sc.pl.umap(adata, color=['leiden'], frameon=False)

# Then the same embedding restricted to the cells of one sample at a time.
for sample_name in ('pbs', 'seawater'):
    sample_cells = adata.obs.index[adata.obs['sample'].isin([sample_name])].tolist()
    sc.pl.umap(adata[sample_cells, ], color=['leiden', 'sample'], title=['Clusters', 'sample'],
               frameon=False, size=10, ncols=1)

In [None]:
# Re-exporting data
# Saves the processed object (clusters, UMAP) next to the raw export;
# `outputPath` is defined in the raw-export cell.
adata.write_h5ad(outputPath+'Hydractinia_polyp.h5ad')

In [None]:
# Ranking and extracting marker genes for all clusters
# Wilcoxon rank-sum test per Leiden cluster, keeping the top 1000 genes of each.
sc.tl.rank_genes_groups(adata,'leiden',n_genes=1000,method='wilcoxon')

# Leiden labels are numeric strings: sort them numerically (0, 1, 2, ..., 10) rather
# than lexicographically (0, 1, 10, ...), then convert back to strings.
clusters = np.unique(adata.obs['leiden'])
clusters = clusters.astype('int')
clusters = np.sort(clusters)
clusters = clusters.astype('str')

# Collect one (cluster, gene, adjusted p-value) row per ranked gene.
ranked = adata.uns['rank_genes_groups']
clus = []
markerGene = []
padj = []
for i in clusters:
    genes = ranked['names'][str(i)]
    clus += [str(i)] * len(genes)
    markerGene += list(genes)
    padj += list(ranked['pvals_adj'][str(i)])

# Build the table in one go instead of adding columns to an empty DataFrame.
markers = pd.DataFrame({'clus': clus, 'markerGene': markerGene, 'padj': padj})

# Vectorised significance flag ("T" when padj < 0.05, otherwise "F"). Replaces a row-wise
# loop whose `list += "T"` only worked because the string is a single character.
markers['significant'] = np.where(markers['padj'] < 0.05, "T", "F")
significant_markers = markers[markers['significant'] == "T"]

# NOTE: the file is tab-separated despite its .csv extension.
significant_markers.to_csv(outputPath+'Hydractinia_polyp_markers_1000g.csv', sep='\t',index=False)
print(significant_markers)

In [None]:
def cluster_markers(markersDf,clusterNumber):
    """Return the marker genes listed for one or more clusters.

    Parameters
    ----------
    markersDf : pandas.DataFrame
        Marker table with a "clus" column (cluster label) and a "markerGene" column.
    clusterNumber : list-like or scalar
        Cluster label(s) to select, compared against the "clus" column as-is.
        A single label (e.g. ``'3'``) is accepted as well as a list (e.g. ``['3']``).

    Returns
    -------
    list
        Values of "markerGene" for the matching rows, in table order.
    """
    # Series.isin only accepts list-likes (a bare string raises TypeError),
    # so wrap a single label into a one-element list.
    if not pd.api.types.is_list_like(clusterNumber):
        clusterNumber = [clusterNumber]
    subset = markersDf[markersDf["clus"].isin(clusterNumber)]
    return subset["markerGene"].tolist()

def save_markers(markerList,path,clusterName):
    """Write marker genes to a text file, one gene per line.

    The file is created (or overwritten) at
    ``path + "Hydractinia_polyp_" + clusterName + "_markers.txt"``.

    Parameters
    ----------
    markerList : iterable of str
        Gene identifiers to write.
    path : str
        Output location. It is concatenated directly with the file name, so it
        must already end with a path separator.
    clusterName : str
        Label inserted into the file name.
    """
    # The context manager closes the file; no explicit close is needed.
    with open(path+"Hydractinia_polyp_"+clusterName+"_markers.txt","w") as f:
        for i in markerList:
            f.write(i+"\n")

In [None]:
# List the significant marker genes of cluster 3.
cluster_markers(significant_markers,['3'])

In [None]:
# Plotting top marker gene for each cluster
# Use a dedicated name so the `markers` DataFrame built earlier is not overwritten by a list.
# Iterating the categorical's categories keeps the panels in cluster order (0, 1, 2, ..., 10)
# instead of the lexicographic order np.unique gives for string labels (0, 1, 10, ...).
ranked_names = adata.uns['rank_genes_groups']['names']
top_markers = [ranked_names[str(cluster)][0] for cluster in adata.obs['leiden'].cat.categories]
sc.pl.umap(adata,color=top_markers,color_map = 'viridis')

In [None]:
# Test
# Quick look at two individual genes on the UMAP.
test_genes = ['HyS0012.308g', 'HyS0074.36g']
sc.pl.umap(adata, color=test_genes)

In [None]:
# Cnidocytes
# Candidate cnidocyte marker genes, one UMAP panel per gene.
cnidocyte_genes = [
    'HyS0011.105g',
    'HyS0008.263g',
    'HyS0017.228g',
    'HyS0017.242g',
    'HyS0146.2g',
    'HyS0030.203g',
]
sc.pl.umap(adata, color=cnidocyte_genes)

In [None]:
# Neural cells
# Candidate neural marker genes, one UMAP panel per gene.
# 'HyS0085.53g' was listed twice, which drew the same panel twice; it is kept once.
neural_genes = [
    'HyS0085.53g',
    'HyS0058.42g',
    'HyS0051.70g',
    'HyS0003.498g',
    'HyS0013.338g',
    'HyS0028.162g',
    'HyS0003.220g',
    'HyS0052.141g',
    'HyS0003.401g',
    'HyS0078.25g',
    'HyS0001.204g',
]
sc.pl.umap(adata, color=neural_genes)

In [None]:
# Gland cells
# Candidate gland-cell marker genes, one UMAP panel per gene.
# 'HyS0047.131g' was listed twice, which drew the same panel twice; it is kept once.
# NOTE(review): this is the same gene set as the "Digestive gland cells" cell — confirm
# whether a different list was intended here.
gland_genes = [
    'HyS0041.99g',
    'HyS0013.147g',
    'HyS0032.90g',
    'HyS0047.131g',
    'HyS0033.92g',
    'HyS0031.145g',
]
sc.pl.umap(adata, color=gland_genes)

In [None]:
# Digestive gland cells
# Candidate digestive gland-cell marker genes, one UMAP panel per gene.
digestive_gland_genes = [
    'HyS0041.99g',
    'HyS0033.92g',
    'HyS0047.131g',
    'HyS0031.145g',
    'HyS0032.90g',
    'HyS0013.147g',
]
sc.pl.umap(adata, color=digestive_gland_genes)

In [None]:
# i-cells
# Candidate i-cell marker genes, one UMAP panel per gene.
i_cell_genes = [
    'HyS0050.7g',
    'HyS0036.26g',
    'HyS0036.89g',
    'HyS0038.46g',
]
sc.pl.umap(adata, color=i_cell_genes)

In [None]:
# epidermis
# Candidate epidermis marker genes, one UMAP panel per gene.
epidermis_genes = [
    'HyS0010.17g',
    'HyS0017.173g',
    'HyS0085.48g',
    'HyS0057.99g',
    'HyS0006.325g',
    'HyS0056.79g',
]
sc.pl.umap(adata, color=epidermis_genes)

In [None]:
# gastroderm
# Candidate gastroderm marker genes, one UMAP panel per gene.
gastroderm_genes = [
    'HyS0054.100g',
    'HyS0027.35g',
    'HyS0011.145g',
    'HyS0001.548g',
    'HyS0062.5g',
    'HyS0036.115g',
    'HyS0003.314g',
    'HyS0010.162g',
    'HyS0012.275g',
    'HyS0087.37g',
    'HyS0004.64g',
]
sc.pl.umap(adata, color=gastroderm_genes)

In [None]:
# Compute the PAGA graph using the Leiden clusters as groups.
sc.tl.paga(adata, groups='leiden')

In [None]:
# Start the annotation column from the Leiden cluster labels; it is relabelled below.
adata.obs['leiden_anno'] = adata.obs['leiden']

In [None]:
# Labels for the annotated clusters: plain cluster numbers for now (one entry per category).
# Assigning to `.cat.categories` is no longer supported by pandas >= 2.0, so rename through
# rename_categories and store the result back. The list length must equal the number of
# existing categories (26 here), otherwise pandas raises a ValueError.
adata.obs['leiden_anno'] = adata.obs['leiden_anno'].cat.rename_categories(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12','13', '14', '15', '16', '17', '18', '19', '20','21','22','23','24','25'])

In [None]:
# Lower the figure resolution to 80 dpi for the plots that follow.
sc.settings.set_figure_params(dpi=80, facecolor='white')

In [None]:
# Draw the PAGA graph (threshold=0.03, thinner edges, small font).
# show=False is passed, so scanpy does not call plt.show() itself.
sc.pl.paga(adata, threshold=0.03, show=False,fontsize=9,edge_width_scale=0.5)

In [None]:
# List the significant marker genes of cluster 14.
cluster_markers(significant_markers,['14'])

In [None]:
#adata.obs['leiden_anno'].cat.categories = ["0 - Gastrodermis","1 - Ectodermis","2 - Pharyngeal ectodermis","3 - Pharyngeal ectodermis","4 - Epidermis","5 - Neurons","6 - Ectodermis","7 - Cnidocytes","8 - Smooth muscle","9 - Differentiating cnidocytes","10 - NPC","11 - Ectodermis","12 - Retractor muscle","13 - Mucous gland cells","14 - Mature cnidocytes","15 - Pharyngeal ectoderm","16 - Secretory cells","17 - Secretory cells","18 - Gastrodermis","19 - ?","20 - Secretory cells","21 - ? (Neurons-like cells)"]

In [None]:
# Switch to 90 dpi, then plot the UMAP coloured by the annotated cluster labels.
sc.settings.set_figure_params(dpi=90, facecolor='white')
sc.pl.umap(adata, color=['leiden_anno'],title='Hydractinia polyp annotated clusters',ncols=1,frameon=False)

In [None]:
sc.settings.set_figure_params(dpi=150, facecolor='white')
# Clusters drawn in purple; every other cluster (0-25) is drawn in light grey.
# The palette is built from this set rather than written out as a 26-entry literal.
highlighted_clusters = {6, 8, 10, 12, 13, 17, 21, 23, 25}
highlight_palette = {
    str(cluster): "#A366E0" if cluster in highlighted_clusters else "#E2E2E2"
    for cluster in range(26)
}
# Title typo fixed ("polypp" -> "polyp").
sc.pl.umap(adata, title='Hydractinia polyp\nRBH - one2one - orthopairsBased', color=['leiden_anno'],
           legend_loc='on data', legend_fontsize=6, legend_fontoutline=3,
           palette=highlight_palette, frameon=False)

In [None]:
sc.settings.set_figure_params(dpi=150, facecolor='white')
# Clusters drawn in purple; every other cluster (0-25) is drawn in light grey.
highlighted_clusters = {6, 8, 10, 12, 17, 21, 23, 25}
highlight_palette = {
    str(cluster): "#A366E0" if cluster in highlighted_clusters else "#E2E2E2"
    for cluster in range(26)
}
sc.pl.umap(adata, title='Hydractinia polyp\nRBH - many2many - orthopairsBased', color=['leiden_anno'],
           legend_loc='on data', legend_fontsize=6, legend_fontoutline=3,
           palette=highlight_palette, frameon=False)

In [None]:
# Clusters drawn in purple; every other cluster (0-25) is drawn in light grey.
highlighted_clusters = {6, 8, 9, 10, 12, 17, 21, 23, 25}
highlight_palette = {
    str(cluster): "#A366E0" if cluster in highlighted_clusters else "#E2E2E2"
    for cluster in range(26)
}
sc.pl.umap(adata, title='Hydractinia polyp\nOrthoFinder - many2many - orthopairsBased', color=['leiden_anno'],
           legend_loc='on data', legend_fontsize=6, legend_fontoutline=3,
           palette=highlight_palette, frameon=False)

In [None]:
# Clusters drawn in purple; every other cluster (0-25) is drawn in light grey.
highlighted_clusters = {6, 8, 9, 10, 12, 17, 21, 23, 25}
highlight_palette = {
    str(cluster): "#A366E0" if cluster in highlighted_clusters else "#E2E2E2"
    for cluster in range(26)
}
sc.pl.umap(adata, title='Hydractinia polyp\nOrthoFinder - many2many - genomeBased', color=['leiden_anno'],
           legend_loc='on data', legend_fontsize=6, legend_fontoutline=3,
           palette=highlight_palette, frameon=False)

In [None]:
# Clusters 14 and 22 are drawn in blue; every other cluster (0-25) is drawn in light grey.
highlighted_clusters = {14, 22}
highlight_palette = {
    str(cluster): "#0080ff" if cluster in highlighted_clusters else "#E2E2E2"
    for cluster in range(26)
}
sc.pl.umap(adata, title='Hydractinia polyp\nRBH - one2one - orthopairsBased', color=['leiden_anno'],
           legend_loc='on data', legend_fontsize=6, legend_fontoutline=3,
           palette=highlight_palette, frameon=False)

In [None]:
# Clusters 14 and 22 are drawn in blue; every other cluster (0-25) is drawn in light grey.
highlighted_clusters = {14, 22}
highlight_palette = {
    str(cluster): "#0080ff" if cluster in highlighted_clusters else "#E2E2E2"
    for cluster in range(26)
}
sc.pl.umap(adata, title='Hydractinia polyp\nRBH - many2many - orthopairsBased', color=['leiden_anno'],
           legend_loc='on data', legend_fontsize=6, legend_fontoutline=3,
           palette=highlight_palette, frameon=False)

In [None]:
# Clusters 14 and 22 are drawn in blue; every other cluster (0-25) is drawn in light grey.
highlighted_clusters = {14, 22}
highlight_palette = {
    str(cluster): "#0080ff" if cluster in highlighted_clusters else "#E2E2E2"
    for cluster in range(26)
}
sc.pl.umap(adata, title='Hydractinia polyp\nOrthoFinder - many2many - orthopairsBased', color=['leiden_anno'],
           legend_loc='on data', legend_fontsize=6, legend_fontoutline=3,
           palette=highlight_palette, frameon=False)

In [None]:
# Clusters 14 and 22 are drawn in blue; every other cluster (0-25) is drawn in light grey.
highlighted_clusters = {14, 22}
highlight_palette = {
    str(cluster): "#0080ff" if cluster in highlighted_clusters else "#E2E2E2"
    for cluster in range(26)
}
sc.pl.umap(adata, title='Hydractinia polyp\nOrthoFinder - many2many - genomeBased', color=['leiden_anno'],
           legend_loc='on data', legend_fontsize=6, legend_fontoutline=3,
           palette=highlight_palette, frameon=False)

In [None]:
# Test
# Three genes plotted with human-readable panel titles (gene name followed by its ID).
# Title typo fixed: 'Hys0042.80' -> 'HyS0042.80', matching the capitalisation of the gene ID.
named_genes = ['HyS0008.263g', 'HyS0030.203g', 'HyS0042.80g']
panel_titles = ['Ncol1 (HyS0008.263)', 'Nematocilin A (HyS0030.203)', 'ARSTNd2-like (HyS0042.80)']
sc.pl.umap(adata, color=named_genes, title=panel_titles)