In [1]:
# import packages
import os
import six
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import colors as clrs
import matplotlib.transforms as transforms
import numpy as np
import scanpy as sc
import glob
import anndata as an
import seaborn as sns
from tqdm.notebook import tqdm
from random import sample
from anndata import AnnData
from dask.base import get_name_from_key
from IPython.display import IFrame

In [2]:
# remove warnings
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [3]:
# Print the versions of the packages in use for this session.
import session_info
session_info.show()

# Variables, Info, and Data Paths

In [4]:
# Rotation, in degrees, applied to each region so that all of them end up
# oriented the same way (used later in the notebook).
transform_degrees = {'1_0': 180, '2_0': 180}

# Each region is one individual tissue section; the ids are the keys above.
regions_list = list(transform_degrees)

# Marker genes per cell type.
neuron_gene_list = 'Nkx2-2,Foxn4,Lhx2,Lhx4,Neurog3,Hmx3,Bhlhe23,Lhx1,Barhl2,Otp,Pax8,Pax2,Bhlhe22,Slc10a4,Slc18a3,Evx1,Vsx2,Sox14,Vsx1,Onecut2,Pou2f2,Tubb3,Elavl3'.split(',')
mesoderm_gene_list = 'Foxc1,Foxc2,Meox1,Meox2,Myog'.split(',')
neural_crest_gene_list = 'Dlx2,Sox10'.split(',')
neural_progenitor_gene_list = 'Wnt1,Olig3,Pax6,Dbx2,Sp8,Foxa2,Shh,Sox2,Olig2,Pax3'.split(',')
blood_gene_list = 'Klf1,Hemgn,Sox17,Fermt3'.split(',')
DRG_gene_list = 'Tlx2'.split(',')
NC_progenitor_gene_list = 'Mafb'.split(',')

# Cell-type label -> marker gene list (insertion order is kept).
markers = dict([
    ('Neuron', neuron_gene_list),
    ('Neural progenitor', neural_progenitor_gene_list),
    ('Mesoderm', mesoderm_gene_list),
    ('DRG', DRG_gene_list),
    ('Blood', blood_gene_list),
    ('Neural crest', neural_crest_gene_list),
    ('Pre-EMT NC progenitor', NC_progenitor_gene_list),
])

# All markers used for ScTyping, flattened into a single list.
full_marker_list = [
    *neuron_gene_list,
    *mesoderm_gene_list,
    *neural_crest_gene_list,
    *neural_progenitor_gene_list,
    *blood_gene_list,
    *DRG_gene_list,
    *NC_progenitor_gene_list,
]

# The 36 genes worked with in the DNM cohort.
MM_DNM_gene_list = [
    'Add2', 'Atg10', 'Bicra', 'Celsr1', 'Chordc1', 'Cnpy1',
    'Ehd4', 'Evl', 'Fam83g', 'Gon4l', 'Med13l', 'Mink1',
    'Mta3', 'Mul1', 'Opalin', 'Osbpl5', 'Plg', 'Pop1',
    'Ppp1r14c', 'Ppp5c', 'Pttg1ip', 'Rc3h2', 'Rnd2', 'Scaper',
    'Sec31a', 'Serpinb12', 'Slain2', 'Smurf2', 'Spen', 'Spx',
    'Stab1', 'Tcf12', 'Unc80', 'Whamm', 'Xirp2', 'Zswim6',
]

def positive(value):
    """Return ``value`` unless it is below zero, in which case return 0.

    Used to floor gene expression values at zero.
    """
    return 0 if 0 > value else value


In [5]:
# Load the clustered analysis from its .h5ad file.
adata = sc.read_h5ad('DNM_MERFISH_9_5_M085_filtered_normalized_UMAP.h5ad')

# Clustering and ScTyping

In [7]:
# Per-cell cluster assignments found with the Leiden algorithm, indexed by obs_names.
M085_clusters = pd.read_csv('./M085_clusters.tsv', sep='\t', index_col='obs_names')

# ScType results to merge in (the file name suggests the top two candidate
# cell types per cluster -- TODO confirm).
sctype_df = pd.read_csv('./sctype_ct_pred_top2perCluster.txt', sep='\t', index_col='Unnamed: 0')

# Attach the number of cells in each cluster to its ScType rows.
sctype_df['num_cells'] = sctype_df['cluster'].map(M085_clusters['clusters'].value_counts())

# A call is high confidence only when its score exceeds one tenth of the
# number of cells in the cluster.
sctype_df['high_conf_score'] = sctype_df['scores'] > sctype_df['num_cells'] / 10.0

# Relabel every low-confidence call as 'Indeterminate'.
# Use one .loc assignment so the write lands on sctype_df itself: chained
# indexing (df['type'][mask] = ...) may write to a temporary copy, and does
# nothing at all under pandas Copy-on-Write.
sctype_df.loc[~sctype_df['high_conf_score'], 'type'] = 'Indeterminate'

# Keep the highest-scoring call for each cluster and index it by cluster id.
sctype_top = (
    sctype_df
    .sort_values('scores', ascending=False)
    .drop_duplicates(subset=['cluster'])
)
sctype_top.index = sctype_top['cluster']

# Map the predicted type and its score onto every cell through its cluster.
M085_clusters['sctype_pred'] = M085_clusters['clusters'].map(sctype_top['type'])
M085_clusters['sctype_pred_score'] = M085_clusters['clusters'].map(sctype_top['scores'])

# One row per cluster (cluster id, predicted type, score), sorted and indexed
# by cluster id.
M085_clusters_map = (
    M085_clusters[['clusters', 'sctype_pred', 'sctype_pred_score']]
    .drop_duplicates(subset=['clusters'])
    .sort_values('clusters')
)
M085_clusters_map.index = M085_clusters_map['clusters']


M085_clusters

Unnamed: 0_level_0,clusters,cell_type,sctype_pred,sctype_pred_score
obs_names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2517282000001100005,9,unknown,Mesoderm,603.973678
2517282000003100049,13,unknown,Neuron,1594.759459
2517282000006100007,13,unknown,Neuron,1594.759459
2517282000006100013,13,unknown,Neuron,1594.759459
2517282000006100014,13,unknown,Neuron,1594.759459
...,...,...,...,...
2517281600044200083,5,unknown,Pre-EMT-NCP,594.796953
2517281600044200087,11,unknown,Neural Crest,3171.900903
2517281600044200088,11,unknown,Neural Crest,3171.900903
2517281600044200093,9,unknown,Mesoderm,603.973678


In [8]:
# Genes used for cell type identification: one (gene, celltype) pair per row.
marker_df = pd.read_csv('./MarkerGeneAssignment_final.txt',sep='\t',names=['gene','celltype'],index_col='gene')
# Make sure gene names are strings. Index.astype returns a new index, so the
# result has to be assigned back for the cast to take effect.
marker_df.index = marker_df.index.astype(str)
marker_df = marker_df.reset_index()
# Keep only the genes that are part of the ScType marker list.
marker_df = marker_df[marker_df['gene'].isin(full_marker_list)]
# One row per gene; a gene listed under several cell types gets them joined with ', '.
marker_df = marker_df.groupby('gene').agg(lambda x: ', '.join(x.astype(str)))
# Pivot to a wide layout: transpose so every gene becomes a column, then
# promote the first row (the gene names) to the column header and drop it.
marker_df = marker_df.reset_index().T.reset_index()
marker_df = marker_df.set_axis(marker_df.iloc[0], axis=1).drop(0).reset_index(drop=True)
marker_df

Unnamed: 0,gene,Barhl2,Bhlhe22,Bhlhe23,Dbx2,Dlx2,Elavl3,Evx1,Fermt3,Foxa2,...,Sox10,Sox14,Sox17,Sox2,Sp8,Tlx2,Tubb3,Vsx1,Vsx2,Wnt1
0,celltype,Neuron,Neuron,Neuron,Neural Progenitor,Neural crest,Neuron,Neuron,Blood,Neural Progenitor,...,Neural crest,Neuron,Blood,Neural Progenitor,Neural Progenitor,DRG,Neuron,Neuron,Neuron,Neural Progenitor


In [9]:
# Fraction of cells assigned to each predicted cell type.
M085_clusters['sctype_pred'].value_counts().pipe(lambda counts: counts / counts.sum())

Mesoderm               0.297902
Indeterminate          0.162052
Neural Progenitor      0.150987
Neural Crest           0.130437
Neuron                 0.126701
Pre-EMT-NCP            0.106294
Dorsal Root Ganglia    0.013844
Blood                  0.011784
Name: sctype_pred, dtype: float64

# Marker Gene Analysis and Dataframes

In [11]:
# Expression values of the ScType marker genes for every cell in adata.
marker_cells = adata.to_df().filter(full_marker_list, axis=1)

# Every column is a gene at this point (there is no leading label column yet),
# so cast the whole frame to float rather than columns[1:].
marker_cells = marker_cells.astype(float)

# Replace negative expression values with 0 (vectorised; NaN is left untouched).
marker_cells = marker_cells.clip(lower=0)

# Integer cell ids -- presumably to match the index of M085_clusters for the
# index-on-index join below.
marker_cells.index = marker_cells.index.astype(int)
marker_cells = pd.merge(M085_clusters, marker_cells, left_index=True, right_index=True)
marker_cells = marker_cells.drop(['clusters','sctype_pred_score', 'cell_type'], axis = 1)

# Put sctype_pred in the first column and replace the cell-id index with a default one.
marker_cells = marker_cells.set_index('sctype_pred').reset_index()
convert = marker_cells.columns[1:]
marker_cells[convert] = marker_cells[convert].astype(float)

# Mean expression of each gene within each predicted cell type.
marker_clusters = marker_cells.groupby('sctype_pred').mean()

# Express each gene's per-type mean as a percentage of that gene's total over
# all cell types, so every gene column sums to 100.
percent_avg_exp_marker = (marker_clusters.div(marker_clusters.sum(axis=0), axis=1) * 100)
percent_avg_exp_marker = percent_avg_exp_marker.reset_index()

# use tsv file to replicate figures in manuscript
#percent_avg_exp_marker.to_csv('Marker_percent_avg.tsv',sep='\t',index=False)

percent_avg_exp_marker

Unnamed: 0,sctype_pred,Nkx2-2,Foxn4,Lhx2,Lhx4,Neurog3,Hmx3,Bhlhe23,Lhx1,Barhl2,...,Shh,Sox2,Olig2,Pax3,Klf1,Hemgn,Sox17,Fermt3,Tlx2,Mafb
0,Blood,0.0,0.0,10.885597,2.261855,0.0,0.0,2.594176,2.867448,0.0,...,9.209074,11.590397,3.204265,7.57563,51.362502,94.534657,93.311633,72.524376,1.665212,10.947412
1,Dorsal Root Ganglia,12.726132,2.705643,0.0,0.0,0.0,0.0,37.325521,1.990842,4.735771,...,9.330475,9.668504,2.024627,17.682714,0.0,1.278377,0.0,1.781102,94.597272,10.210911
2,Indeterminate,2.239637,4.729642,5.017991,1.011512,2.059675,6.311297,2.360763,2.829732,5.721443,...,1.414218,2.324387,1.146599,2.356249,16.10635,0.580499,1.532132,3.156762,0.254796,6.267895
3,Mesoderm,2.941147,8.56492,7.306863,2.75885,1.202349,6.924049,2.314458,7.905317,12.02676,...,1.263631,1.67473,0.793068,8.2767,12.396575,0.49209,1.51026,3.408601,0.597985,9.435307
4,Neural Crest,3.977893,4.064234,2.145681,1.348913,1.117142,14.283875,3.248023,1.997857,4.308035,...,2.063211,3.886921,1.028115,7.207758,3.420199,0.880255,0.458364,3.495992,1.910432,10.087687
5,Neural Progenitor,10.822585,3.281912,6.905774,16.250712,4.085666,4.323703,3.606121,6.670651,4.070562,...,70.466038,46.785617,64.015943,39.601764,5.830471,0.741898,1.619133,5.117182,0.350074,5.330049
6,Neuron,64.345013,69.790875,65.246245,75.14789,90.110325,64.270424,44.797197,73.343107,58.274233,...,4.090178,20.714829,25.716753,9.952337,4.638869,0.68038,0.813072,4.153689,0.389363,9.95745
7,Pre-EMT-NCP,2.947593,6.862775,2.491849,1.220268,1.424844,3.886652,3.75374,2.395046,10.863195,...,2.163175,3.354615,2.070629,7.346847,6.245034,0.811845,0.755406,6.362297,0.234866,37.763289


In [12]:
# Final typing table: the per-cell-type expression percentages with the marker
# gene -> cell type assignment appended as one extra row at the bottom.
# NOTE(review): marker_df labels its row in a column named 'gene' while the
# percentages use 'sctype_pred', so the result gets a separate, mostly empty
# 'gene' column -- confirm this layout is intended.
percent_markers_typing = pd.concat((percent_avg_exp_marker, marker_df), axis=0, ignore_index=True)
percent_markers_typing

Unnamed: 0,sctype_pred,Nkx2-2,Foxn4,Lhx2,Lhx4,Neurog3,Hmx3,Bhlhe23,Lhx1,Barhl2,...,Sox2,Olig2,Pax3,Klf1,Hemgn,Sox17,Fermt3,Tlx2,Mafb,gene
0,Blood,0.0,0.0,10.885597,2.261855,0.0,0.0,2.594176,2.867448,0.0,...,11.590397,3.204265,7.57563,51.362502,94.534657,93.311633,72.524376,1.665212,10.947412,
1,Dorsal Root Ganglia,12.726132,2.705643,0.0,0.0,0.0,0.0,37.325521,1.990842,4.735771,...,9.668504,2.024627,17.682714,0.0,1.278377,0.0,1.781102,94.597272,10.210911,
2,Indeterminate,2.239637,4.729642,5.017991,1.011512,2.059675,6.311297,2.360763,2.829732,5.721443,...,2.324387,1.146599,2.356249,16.10635,0.580499,1.532132,3.156762,0.254796,6.267895,
3,Mesoderm,2.941147,8.56492,7.306863,2.75885,1.202349,6.924049,2.314458,7.905317,12.02676,...,1.67473,0.793068,8.2767,12.396575,0.49209,1.51026,3.408601,0.597985,9.435307,
4,Neural Crest,3.977893,4.064234,2.145681,1.348913,1.117142,14.283875,3.248023,1.997857,4.308035,...,3.886921,1.028115,7.207758,3.420199,0.880255,0.458364,3.495992,1.910432,10.087687,
5,Neural Progenitor,10.822585,3.281912,6.905774,16.250712,4.085666,4.323703,3.606121,6.670651,4.070562,...,46.785617,64.015943,39.601764,5.830471,0.741898,1.619133,5.117182,0.350074,5.330049,
6,Neuron,64.345013,69.790875,65.246245,75.14789,90.110325,64.270424,44.797197,73.343107,58.274233,...,20.714829,25.716753,9.952337,4.638869,0.68038,0.813072,4.153689,0.389363,9.95745,
7,Pre-EMT-NCP,2.947593,6.862775,2.491849,1.220268,1.424844,3.886652,3.75374,2.395046,10.863195,...,3.354615,2.070629,7.346847,6.245034,0.811845,0.755406,6.362297,0.234866,37.763289,
8,,Neuron,Neuron,Neuron,Neuron,Neuron,Neuron,Neuron,Neuron,Neuron,...,Neural Progenitor,Neural Progenitor,Neural Progenitor,Blood,Blood,Blood,Blood,DRG,Pre-EMT-NCP,celltype


# DNM candidate analysis Figures and Dataframes

In [14]:
# Expression values of the DNM candidate genes for every cell in adata.
DNM_cells = adata.to_df().filter(MM_DNM_gene_list, axis=1)

# Every column is a gene at this point (there is no leading label column yet),
# so cast the whole frame to float rather than columns[1:].
DNM_cells = DNM_cells.astype(float)

# Replace negative expression values with 0 (vectorised; NaN is left untouched).
DNM_cells = DNM_cells.clip(lower=0)

# Integer cell ids -- presumably to match the index of M085_clusters for the
# index-on-index join below.
DNM_cells.index = DNM_cells.index.astype(int)
DNM_cells = pd.merge(M085_clusters, DNM_cells, left_index=True, right_index=True)
DNM_cells = DNM_cells.drop(['clusters','sctype_pred_score', 'cell_type'], axis = 1)

# Put sctype_pred in the first column and replace the cell-id index with a default one.
DNM_cells = DNM_cells.set_index('sctype_pred').reset_index()
convert = DNM_cells.columns[1:]
DNM_cells[convert] = DNM_cells[convert].astype(float)

# Mean expression of each gene within each predicted cell type.
DNM_clusters = DNM_cells.groupby('sctype_pred').mean()

# Express each gene's per-type mean as a percentage of that gene's total over
# all cell types, so every gene column sums to 100.
percent_avg_exp_DNM = (DNM_clusters.div(DNM_clusters.sum(axis=0), axis=1) * 100)
percent_avg_exp_DNM = percent_avg_exp_DNM.reset_index()

# use tsv file to replicate figures in manuscript
#percent_avg_exp_DNM.to_csv('DNM_percent_avg.tsv',sep='\t',index=False)

percent_avg_exp_DNM

Unnamed: 0,sctype_pred,Add2,Atg10,Bicra,Celsr1,Chordc1,Cnpy1,Ehd4,Evl,Fam83g,...,Slain2,Smurf2,Spen,Spx,Stab1,Tcf12,Unc80,Whamm,Xirp2,Zswim6
0,Blood,11.387692,0.0,13.086818,5.271362,88.582466,73.245196,7.814293,15.03995,28.088471,...,10.038439,44.588485,12.985371,0.0,4.474607,15.67421,53.691811,14.713905,0.0,11.747457
1,Dorsal Root Ganglia,5.172702,34.099959,11.438745,8.052933,1.344478,2.277262,15.550389,9.383136,8.553006,...,17.942969,9.062001,10.89894,0.0,0.0,10.419953,3.004207,0.0,25.743942,9.997055
2,Indeterminate,20.096838,40.78226,12.18982,11.03877,2.139661,3.843777,24.989074,13.493009,11.169273,...,14.295973,7.835353,12.62061,7.604668,1.718526,14.082721,17.83458,16.676685,3.917756,9.410517
3,Mesoderm,8.649346,4.491062,12.148296,4.762152,1.12091,4.141637,11.291991,12.351029,6.837435,...,11.542914,9.659454,10.495262,77.630116,3.45592,12.867381,5.29897,19.66043,3.312264,10.537258
4,Neural Crest,6.274297,6.617462,13.910968,7.849703,1.393423,3.382724,7.720799,13.060399,7.731158,...,12.318675,6.556884,12.091453,0.0,1.56726,14.324858,4.264707,9.24251,5.04251,11.624459
5,Neural Progenitor,9.593981,5.3991,12.459564,39.702425,1.448977,3.740895,6.196899,11.351815,19.331678,...,10.22208,8.129836,13.490693,3.144044,3.356603,10.246732,3.396781,13.926677,3.607713,14.233762
6,Neuron,26.179142,5.843154,10.769677,16.389102,2.40149,5.279254,5.336463,12.745891,8.86627,...,7.949937,6.034103,13.292016,2.699849,1.225044,11.918481,6.073036,11.756927,49.591015,19.525924
7,Pre-EMT-NCP,12.646002,2.767002,13.996112,6.933554,1.568595,4.089255,21.100091,12.574772,9.422709,...,15.689012,8.133885,14.125655,8.921323,84.202039,10.465663,6.435908,14.022866,8.7848,12.923567
