In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scvi
import sys
import os

In [2]:
## Input locations. The defaults are the original cluster paths; set the
## environment variables below to run the notebook elsewhere without editing it.
##   dd  : raw-data root, one sub-directory per sample (read by the loading cell).
##   dd2 : directory holding the per-sample `sample_info_<sample>.csv` tables.
dd = os.environ.get(
    'TROPHO_RAW_DATA_DIR',
    '/home/ql312/rds/rds-turco-lab4-oIhPyuXA8U8/raw_data/trophoblast_organoid_from_Sanger',
)
dd2 = os.environ.get(
    'TROPHO_SAMPLE_INFO_DIR',
    '/home/ql312/rds/hpc-work/tropho_organoid_EVT',
)

## Sample identifiers: 6044STDY8640561..6 followed by Pla_Camb10123928..35.
sp1 = [f'6044STDY864056{i}' for i in range(1, 7)]
sp2 = [f'Pla_Camb101239{i}' for i in range(28, 36)]
sps = sp1 + sp2

Cells were excluded from the downstream analysis if they 1) had > 20% of counts from mitochondrial genes; 2) were unassigned to a donor or assigned to multiple donors by Souporcell; or 3) fell within the doublet-enriched clusters.

In [3]:
## Load every sample, keep only the cells that pass QC, and merge all samples
## into a single AnnData (`adata`).
adatas = []
for ff in sps:
    adata = sc.read_10x_mtx(f'{dd}/{ff}/cellranger_res/{ff}/outs/filtered_feature_bc_matrix/')
    info = pd.read_csv(f'{dd2}/sample_info_{ff}.csv', index_col=0)

    ## The QC mask is applied to the matrix by position, so the metadata table
    ## must have exactly one row per barcode.
    ## NOTE(review): this also assumes the rows are in the same order as the
    ## barcodes in the matrix -- confirm against how sample_info_*.csv is written.
    if len(info) != adata.n_obs:
        raise ValueError(
            f'sample_info_{ff}.csv has {len(info)} rows but the count matrix '
            f'has {adata.n_obs} cells'
        )

    ## Keep cells with < 20% mitochondrial counts, a donor assignment other
    ## than 'unassigned', and no doublet flag.
    ind = (
        (info['pct_counts_mt'] < 20)
        & (info['ID'] != 'unassigned')
        & (~info['doublet_local_pred'])
    ).to_numpy()

    ## Copy the subset so that `.obs` is assigned on a real AnnData rather than
    ## on a view of the unfiltered object.
    adata = adata[ind, :].copy()
    info = info[ind]
    adata.obs = info
    adatas.append(adata)

## Merge all samples; `batch_categories` labels each cell with its sample ID.
adata = adatas[0].concatenate(adatas[1:], batch_categories=sps)

Filter out genes detected in fewer than 20 cells and select the highly variable genes.

In [None]:
## Drop genes that are detected in fewer than 20 cells.
sc.pp.filter_genes(adata, min_cells=20)

## Snapshot the matrix before normalisation: as the 'counts' layer (used by the
## HVG selection below) and as `.raw` (restored later via `adata.raw.to_adata()`).
adata.layers['counts'] = adata.X.copy()
adata.raw = adata

## Library-size normalise to 1e4 counts per cell, then log1p-transform `.X`.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

## Select highly variable genes from the 'counts' layer, batch-aware on 'ID',
## and subset the object to them.
## NOTE(review): `hvg` is not assigned in any visible cell -- confirm where it
## is defined (e.g. an injected parameter) or add it to the configuration cell.
sc.pp.highly_variable_genes(
    adata,
    flavor='seurat_v3',
    layer='counts',
    n_top_genes=hvg,
    batch_key='ID',
    subset=True,
)

Run scVI based on all cells.

In [None]:
## Register the 'counts' layer and the 'ID' column (used as batch) with scVI,
## then fit the model.
## NOTE(review): `lv` (latent dimensionality) is not assigned in any visible
## cell -- confirm where it is defined.
## NOTE(review): no random seed is set before training, so the embedding and
## the Leiden clusters derived from it may differ between runs.
scvi.data.setup_anndata(adata, batch_key="ID", layer="counts")
vae = scvi.model.SCVI(adata, n_latent=lv)
vae.train()

## Store the latent representation and build the neighbour graph, Leiden
## clustering and UMAP on top of it.
adata.obsm["X_scVI"] = vae.get_latent_representation()
sc.pp.neighbors(adata, use_rep="X_scVI")
sc.tl.leiden(adata)
sc.tl.umap(adata, min_dist=0.3)

Plot trophoblast and proliferating marker genes.

In [None]:
## One UMAP panel per marker gene, coloured by the values in `.X`
## (use_raw=False) rather than `.raw`.
marker_genes = ['DUSP9', 'CYP19A1', 'HLA-G', 'MKI67']
fig, axs = plt.subplots(nrows=1, ncols=len(marker_genes), figsize=(20, 5))
for ax, gene in zip(axs, marker_genes):
    sc.pl.umap(adata, color=gene, use_raw=False, ax=ax, show=False)
plt.show()

Plot the number of genes detected in each cell. For each trophoblast cell type, there were clusters of cells that were isolated from the rest of their cell type in the UMAP and had a much lower number of detected genes.

In [None]:
## Colour the UMAP by the per-cell QC column 'n_genes_by_counts' to expose
## clusters with few detected genes.
qc_column = 'n_genes_by_counts'
sc.pl.umap(adata, color=qc_column)

Annotate each Leiden cluster based on established trophoblast marker genes.

In [None]:
## Cell-type label for each Leiden cluster. The mapping is positional: the
## i-th name replaces the i-th category of `adata.obs['leiden']`.
## NOTE(review): the labels are tied to one specific clustering result; if the
## scVI/Leiden run above changes, this list must be re-derived.
new_nms = [
    'EVT_late_1',
    'EVT_early_1',
    'EVT_proliferating',
    'EVT_early_lowUMI_1',
    'EVT_late_2',
    'SCT',
    'TOM_VCT',
    'EVT_early_3',
    'EVT_intermediate_2',
    'TOM_VCT_proliferating',
    'EVT_intermediate_1',
    'SCT_lowUMI',
    'EVT_intermediate_lowUMI',
    'VCT',
    'EVT_early_lowUMI_2',
    'EVT_late_lowUMI',
    'VCT_lowUMI',
    'EVT_early_2',
    'EVT_late_3',
    'TOM_SCT',
]

## Duplicate the cluster labels into 'celltype' and rename its categories.
adata.obs['celltype'] = adata.obs['leiden'].copy()
adata.rename_categories('celltype', new_nms)

## Side-by-side UMAPs: raw Leiden clusters (left) and annotated types (right).
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
for ax, pp in zip(axs, ('leiden', 'celltype')):
    sc.pl.umap(
        adata,
        color=pp,
        ax=ax,
        show=False,
        legend_loc='on data',
        legend_fontsize=6,
    )

Unsupervised hierarchical clustering for all cell types based on highly variable genes.

In [None]:
## Ward-linkage hierarchical clustering of the 'celltype' groups, computed on
## `.X` over all genes currently in the object, followed by the dendrogram plot.
sc.tl.dendrogram(
    adata,
    groupby='celltype',
    var_names=adata.var_names,
    use_rep='X',
    linkage_method='ward',
)
sc.pl.dendrogram(adata, groupby='celltype')

Plot the cells from each donor and each time point. The fourth donor failed to differentiate into EVT.

In [None]:
## 2 x 6 grid of UMAPs: the top row highlights one donor per panel, the bottom
## row one time point per panel (the highlighted group is drawn in red).
fig, axs = plt.subplots(nrows=2, ncols=6, figsize=(30, 10))
donor_axes, timepoint_axes = axs[0], axs[1]

## donors
donors = [f'ID{j}' for j in range(1, 7)]
for ax, indiv in zip(donor_axes, donors):
    sc.pl.umap(
        adata,
        color='ID',
        groups=[indiv],
        palette=['red'],
        title=indiv,
        legend_loc='none',
        frameon=False,
        show=False,
        ax=ax,
    )

## time points
timepoints = ['-48H', '0H', '3H', '24H', '48H', '96H']
for ax, ti in zip(timepoint_axes, timepoints):
    sc.pl.umap(
        adata,
        color='tp',
        groups=[ti],
        palette=['red'],
        title=ti,
        legend_loc='none',
        frameon=False,
        show=False,
        ax=ax,
    )

The clusters with a low number of detected genes, together with the cells from the fourth donor, which failed to differentiate into EVT, were excluded from the following analysis. The data integration was then repeated using scVI.

In [None]:
## Rebuild the full-gene object from `.raw`, which the first preprocessing cell
## filled with the un-normalised matrix.
restored = adata.raw.to_adata()

## This cell later overwrites `.raw` with log-normalised values, so running it
## a second time would silently use non-count data as the 'counts' layer for
## seurat_v3 HVG selection and scVI. Fail loudly instead of producing a wrong
## model: raw counts must be integer-valued.
probe = restored.X[:500]
probe = probe.toarray() if hasattr(probe, 'toarray') else np.asarray(probe)
if not np.array_equal(probe, np.rint(probe)):
    raise RuntimeError(
        '`adata.raw` no longer holds raw counts (this cell was probably run '
        'already); re-run the notebook from the data-loading cell.'
    )
adata = restored

##filtering cells with low UMI and from donor 4
ind1 = adata.obs['celltype'].str.contains('lowUMI').values
ind2 = (adata.obs['ID'] != 'ID4')
ind = ((~ind1) & ind2)
adata = adata[ind, :].copy()

## Same preprocessing as for the full data set; here `.raw` is set AFTER
## normalisation, so it holds log-normalised values for all remaining genes.
## NOTE(review): `hvg` and `lv` are not assigned in any visible cell -- confirm
## where they are defined.
sc.pp.filter_genes(adata, min_cells=20)
adata.layers['counts'] = adata.X.copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata.raw = adata
sc.pp.highly_variable_genes(adata, layer='counts', n_top_genes=hvg, flavor='seurat_v3', batch_key='ID', subset=True)

##run scVI
scvi.data.setup_anndata(adata, layer="counts", batch_key="ID")
vae = scvi.model.SCVI(adata, n_latent=lv)
vae.train()
adata.obsm["X_scVI"] = vae.get_latent_representation()
sc.pp.neighbors(adata, use_rep="X_scVI")
sc.tl.leiden(adata)
sc.tl.umap(adata, min_dist=0.3)

## visualize the high quality cells
sc.pl.umap(adata, color='celltype', legend_loc='on data', legend_fontsize=6, frameon=False)

Plot the known marker genes across the cell types.

In [None]:
## Marker gene panels, grouped by the lineage named in each variable
## (vct / sct / evt).
vct_mk = [
    'PAGE4', 'PEG10', 'PARP1', 'DUSP9', 'CYSTM1',
    'ALDH7A1', 'BTG3', 'PEG3', 'PNP', 'MPC2',
    'MEST', 'ISYNA1', 'MLLT1', 'PMAIP1', 'JUN',
]
sct_mk = [
    'CYP19A1', 'LCMT1-AS2', 'GDF15', 'ERVFRD-1',
    'HSPB8', 'PRKCZ', 'SLC22A11', 'INSL4',
]
evt_mk = [
    'HLA-G', 'FSTL3', 'DIO2', 'PLAC8', 'TNNI2', 'PTPRF', 'EFNA1',
    'FN1', 'C12orf75', 'FLNB', 'COL4A1', 'ITGA5', 'FXYD5', 'NOTUM',
    'C15orf48', 'MMP2', 'HPGD', 'TIMP1', 'MCAM', 'AIF1L', 'PAPPA2',
    'AOC1', 'RAC1', 'CD81', 'PRG2', 'PAPPA',
]
ggs = [*vct_mk, *sct_mk, *evt_mk]

## Row order for the dot plot: VCT-like groups, then SCT, then EVT from
## proliferating through early and intermediate to late.
reorder_nms = [
    'TOM_VCT_proliferating', 'TOM_VCT', 'VCT',
    'TOM_SCT', 'SCT',
    'EVT_proliferating', 'EVT_early_1', 'EVT_early_2', 'EVT_early_3',
    'EVT_intermediate_1', 'EVT_intermediate_2',
    'EVT_late_1', 'EVT_late_2', 'EVT_late_3',
]

## Dot plot of all markers per cell type; expression is scaled per gene
## (standard_scale='var').
sc.pl.dotplot(
    adata,
    ggs,
    groupby='celltype',
    categories_order=reorder_nms,
    standard_scale='var',
    cmap='GnBu',
)