# Dowsett dataset prediction with complete tree

### Load packages/libraries

In [1]:
import os
# Move the working directory one level up; presumably so the relative
# data/model paths used below resolve from the project root (confirm).
# NOTE: the path is relative to the current working directory, so re-running
# this cell without restarting the kernel moves up one more level each time.
os.chdir('../')
import warnings
# Silence every FutureWarning and UserWarning for the rest of the session.
# NOTE(review): the UserWarning filter is broad and will also hide warnings
# raised by the analysis libraries themselves - confirm this is intended.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [2]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import gdown
import copy as cp
import seaborn as sns
import pandas as pd
import anndata as ad
import pickle
from scipy.sparse import hstack, csc_matrix
import scanpy as sc
import torch

In [3]:
import scarches as sca
import scHPL
from scarches.dataset.trvae.data_handling import remove_sparsity

Global seed set to 0


### Set other parameters

In [4]:
# Configure the scanpy/matplotlib figure defaults in ONE call.
# `set_figure_params` re-applies its default value for every argument that is
# not passed, so separate calls for dpi/frameon and for figsize override each
# other (the last call would reset dpi and frameon back to their defaults).
sc.set_figure_params(dpi=1000, frameon=False, figsize=(7, 7))

# Compact tensor printing: 3 decimals, no scientific notation, 7 edge items.
torch.set_printoptions(precision=3, sci_mode=False, edgeitems=7)

import matplotlib
# Font type 42 embeds TrueType fonts in PDF output (instead of Type 3), which
# keeps text editable in vector-graphics editors.
matplotlib.rcParams['pdf.fonttype'] = 42

### Import necessary data/objects

In [5]:
# Full reference AnnData (Hes and Ludwig). Its `var_names` are used below to
# select the genes kept in the Dowsett query dataset.
adata_full = sc.read('surgery_model_ludwig_2500/hesludwig_adata_processed_2500.h5ad')
# Show the object summary as the cell output.
adata_full

AnnData object with n_obs × n_vars = 171868 × 2500
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'ratio_nCount_nFeat', 'doubletfinder_class', 'seq_sample', 'treatment', 'single_r_celltypes', 'chen_celltypes', 'tasic_celltypes', 'romanov_celltypes', 'camp1_celltypes_full', 'identity_layer1', 'identity_layer2', 'identity_layer3', 'study', 'cellident_study', '_scvi_batch', '_scvi_labels', 'Unnamed: 0', 'percent.mito', 'percent.ribo', 'cell.type', 'neuronal.subtype', 'cell_barcode', 'ref_query1'
    var: 'mvp.mean-0', 'mvp.dispersion-0', 'mvp.dispersion.scaled-0', 'mvp.variable-0', 'highly_variable-0', 'means-0', 'dispersions-0', 'dispersions_norm-0', 'highly_variable_nbatches-0', 'highly_variable_intersection-0'
    layers: 'counts'

In [6]:
# Read the pickled tree that is later passed as `tree=` to
# scHPL.predict.predict_labels.
# The context manager guarantees the file handle is closed even if unpickling
# raises.
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# come from a trusted source.
with open('full_model_2500/tree_full_HesLudwig_2500.pickle', 'rb') as file_to_read:
    tree_rq = pickle.load(file_to_read)

#### Dowsett et al. dataset

In [7]:
# Dowsett et al. query dataset with the missing genes added.
# NOTE(review): presumably genes required by the reference but absent from
# this dataset were added in an upstream step - confirm how they were filled.
adata_dowsett = sc.read('DVC_datasets_2500/dowsett_data_2500.h5ad')
# Show the object summary as the cell output.
adata_dowsett

AnnData object with n_obs × n_vars = 15931 × 55406
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percmt', 'ratio_nCount_nFeat', 'RNA_snn_res.0.2', 'RNA_snn_res.0.3', 'RNA_snn_res.0.5', 'RNA_snn_res.0.7', 'RNA_snn_res.1', 'seurat_clusters', 'treatment', 'pANN_0.25_0.005_256', 'doubletfinder_class', 'pANN_0.25_0.005_273', 'seq_sample', 'cell_barcode'
    var: 'mvp.mean', 'mvp.dispersion', 'mvp.dispersion.scaled', 'mvp.variable'
    layers: 'counts', 'data'

### Prepare data

Subset the Dowsett dataset to the 2500 genes of the reference object (`adata_full.var_names`), i.e. the original 2500 most variable genes in the Hes dataset

In [8]:
# Keep only the genes present in the reference object, in the reference's
# gene order. Indexing an AnnData returns a view; `.copy()` materializes it so
# that the in-place steps that follow (normalization, adding obs columns)
# operate on a real object instead of relying on anndata's implicit
# view-to-copy conversion, whose warning is silenced by the global
# UserWarning filter.
adata_dowsett = adata_dowsett[:, adata_full.var_names].copy()
# Show the object summary as the cell output.
adata_dowsett

View of AnnData object with n_obs × n_vars = 15931 × 2500
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percmt', 'ratio_nCount_nFeat', 'RNA_snn_res.0.2', 'RNA_snn_res.0.3', 'RNA_snn_res.0.5', 'RNA_snn_res.0.7', 'RNA_snn_res.1', 'seurat_clusters', 'treatment', 'pANN_0.25_0.005_256', 'doubletfinder_class', 'pANN_0.25_0.005_273', 'seq_sample', 'cell_barcode'
    var: 'mvp.mean', 'mvp.dispersion', 'mvp.dispersion.scaled', 'mvp.variable'
    layers: 'counts', 'data'

In [9]:
# Normalize the Dowsett dataset in place: per-cell total-count scaling of
# `.X` using scanpy's default target (no `target_sum` is passed).
# NOTE(review): scVI models are usually fit on raw counts - confirm whether
# the reference model reads the 'counts' layer or `.X`, and that normalizing
# `.X` here matches how the reference data was prepared.
sc.pp.normalize_total(adata_dowsett)

Build latent representation for new query (the Dowsett data)

In [10]:
# Directory of the previously saved model that the query model starts from.
full_path = 'full_model_2500/'

# Build a query model for the Dowsett data on top of the saved model, with
# the `freeze_dropout` option enabled.
model = sca.models.SCVI.load_query_data(adata_dowsett, full_path, freeze_dropout=True)

[34mINFO    [0m File full_model_2500/model.pt already downloaded                                                          


In [11]:
# Train the query model on the Dowsett data for at most 50 epochs.
model.train(max_epochs=50)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Epoch 50/50: 100%|███████████| 50/50 [01:45<00:00,  2.11s/it, loss=710, v_num=1]


In [12]:
# Wrap the model's latent representation of the query cells in a new AnnData
# object; the latent matrix becomes `query_latent.X`.
query_latent = sc.AnnData(model.get_latent_representation())

In [13]:
# Copy the sample annotations from the query data onto the latent object.
# The values are passed as plain lists, so they are assigned by position
# rather than aligned on the obs index (query_latent was built from a bare
# matrix and does not carry adata_dowsett's cell names).
for obs_key in ('orig.ident', 'seq_sample'):
    query_latent.obs[obs_key] = adata_dowsett.obs[obs_key].tolist()

In [14]:
#Save this model
model.save('surgery_model_dowsett_2500/', overwrite=True)
query_latent.write('surgery_model_dowsett_2500/dowsett_latent_2500.h5ad')

### Prediction

In [15]:
#Using the full reference tree (Hes and Ludwig)
dowsett_pred = scHPL.predict.predict_labels(query_latent.X, tree = tree_rq, threshold = 0.5)

In [16]:
#Save the resulting object
file_to_store = open('prediction_dowsett_2500/dowsett_pred_2500.pickle', 'wb')
pickle.dump(dowsett_pred, file_to_store)
file_to_store.close()

Transfer labels

In [17]:
# Element 0 of the prediction output: the array of predicted labels, one
# entry per query cell (it is assigned to the obs tables in the next cell).
dowsett_pred[0]

array(['Sall3_Nox4_Hes', 'myelinating_OL_Hes',
       'myelinating_intermediate_OL_Hes', 'mixed_neurons3_Hes',
       'myelinating_intermediate_OL_Hes', 'Oligodendrocytes_ludwig',
       'myelinating_intermediate_OL_Hes', ..., 'mixed_neurons3_Hes',
       'mixed_neurons3_Hes', 'A_M2_Hes',
       'myelinating_intermediate_OL_Hes', 'paradoxic_Hes',
       'myelinating_intermediate_OL_Hes', 'premyelinating_OL_Hes'],
      dtype='<U31')

In [18]:
# Store the predicted labels as the 'identity_layer3' column on both the
# latent object and the query expression object.
predicted_labels = dowsett_pred[0]
for labelled in (query_latent, adata_dowsett):
    labelled.obs['identity_layer3'] = predicted_labels

In [19]:
# Inspect the per-cell metadata, which now includes the transferred
# 'identity_layer3' labels.
adata_dowsett.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percmt,ratio_nCount_nFeat,RNA_snn_res.0.2,RNA_snn_res.0.3,RNA_snn_res.0.5,RNA_snn_res.0.7,RNA_snn_res.1,seurat_clusters,treatment,pANN_0.25_0.005_256,doubletfinder_class,pANN_0.25_0.005_273,seq_sample,cell_barcode,_scvi_batch,_scvi_labels,identity_layer3
nts_adlib1_AAACCCACAGATCCTA-1,dowsett_adlib1,6962.0,3088,0.014364,2.254534,9,9,8,7,8,8,adlib,0.037037,Singlet,,nts_adlib1,AAACCCACAGATCCTA-1,7,0,Sall3_Nox4_Hes
nts_adlib1_AAACCCAGTCTTGTCC-1,dowsett_adlib1,2365.0,1364,0.000000,1.733871,2,2,2,13,13,13,adlib,0.000000,Singlet,,nts_adlib1,AAACCCAGTCTTGTCC-1,7,0,myelinating_OL_Hes
nts_adlib1_AAACCCAGTGATTCTG-1,dowsett_adlib1,2435.0,1241,0.000000,1.962127,1,0,0,1,0,0,adlib,0.000000,Singlet,,nts_adlib1,AAACCCAGTGATTCTG-1,7,0,myelinating_intermediate_OL_Hes
nts_adlib1_AAACGCTCACATTCGA-1,dowsett_adlib1,19008.0,5598,0.021044,3.395498,0,1,1,0,2,2,adlib,0.148148,Singlet,,nts_adlib1,AAACGCTCACATTCGA-1,7,0,mixed_neurons3_Hes
nts_adlib1_AAACGCTCAGACGGAT-1,dowsett_adlib1,2654.0,1319,0.000000,2.012130,1,0,0,1,0,0,adlib,0.000000,Singlet,,nts_adlib1,AAACGCTCAGACGGAT-1,7,0,myelinating_intermediate_OL_Hes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nts_fast2_TTTGTTGAGCGCGTTC-1,dowsett_fasted2,4252.0,2238,0.023518,1.899911,0,1,1,0,1,1,fast,,Singlet,0.000000,nts_fast2,TTTGTTGAGCGCGTTC-1,10,0,A_M2_Hes
nts_fast2_TTTGTTGAGGCCCACT-1,dowsett_fasted2,3787.0,1538,0.000000,2.462289,1,0,0,1,0,0,fast,,Singlet,0.000000,nts_fast2,TTTGTTGAGGCCCACT-1,10,0,myelinating_intermediate_OL_Hes
nts_fast2_TTTGTTGCACCGGAAA-1,dowsett_fasted2,11627.0,4109,0.000000,2.829642,5,6,10,19,22,22,fast,,Singlet,0.125000,nts_fast2,TTTGTTGCACCGGAAA-1,10,0,paradoxic_Hes
nts_fast2_TTTGTTGGTAGCGATG-1,dowsett_fasted2,1746.0,1031,0.057274,1.693501,1,0,0,1,3,3,fast,,Singlet,0.000000,nts_fast2,TTTGTTGGTAGCGATG-1,10,0,myelinating_intermediate_OL_Hes


In [20]:
#Save new objects
query_latent.write('prediction_dowsett_2500/dowsett_latent_labeled_2500.h5ad')
adata_dowsett.write('prediction_dowsett_2500/dowsett_data_new_labeled_2500.h5ad')