In [1]:
import os
import pickle
import tempfile
from glob import glob

import anndata as ad
import matplotlib.pyplot as plt
import ray
import scanpy as sc
import scanpy.external as sce
import scvi
import torch
from ray import tune
from scvi import autotune
# Session-wide scvi-tools / PyTorch configuration.
# The data-loader worker setting is `dl_num_workers` (plural). The previous
# `dl_num_worker` spelling only created an unused attribute on the settings
# object, so the requested worker count was never applied.
scvi.settings.dl_num_workers = 25
scvi.settings.dl_persistent_workers = True

# 'high' permits reduced-precision (e.g. TF32) kernels for float32 matmuls.
torch.set_float32_matmul_precision('high')
# Fix the scvi-tools seed for reproducibility (prints "Seed set to 42").
scvi.settings.seed = 42
# Every relative path used later in the notebook is resolved against /data.
os.chdir("/data")


Seed set to 42


In [2]:
# Load every per-sample QC-filtered AnnData file and tag it with its origin.
# glob() returns matches in arbitrary filesystem order; sorting keeps the
# sample order (and hence the cell order after concatenation) stable across
# runs and machines.
paths = sorted(glob("cellranger/*/qc_filtered.h5ad"))
if not paths:
    # Fail early with a clear message instead of an obscure error at concat time.
    raise FileNotFoundError(
        "no files matched cellranger/*/qc_filtered.h5ad under " + os.getcwd()
    )

adata_list = []  # one AnnData per sample
names = []  # sample ids, parallel to adata_list
for path in paths:
    # The sample id is the directory between "cellranger/" and the file name.
    sample = path.split("/")[1]
    sample_adata = sc.read_h5ad(path)
    sample_adata.obs["sample_id"] = sample
    # The condition is the part of the sample id before the first "-".
    sample_adata.obs["condition"] = sample.split("-")[0]
    # Drop genes detected in fewer than 3 cells of this sample (in place).
    sc.pp.filter_genes(sample_adata, min_cells=3)
    adata_list.append(sample_adata)
    names.append(sample)

In [3]:
# Show how many CUDA devices PyTorch can see from this kernel.
torch.cuda.device_count()

4

In [4]:
# Stack all samples into a single AnnData object. With `keys` and
# `index_unique="-"`, each obs name becomes "<original>-<sample id>".
adata = ad.concat(
    adata_list,
    keys=names,
    index_unique="-",
)

In [5]:
# Drop the list of per-sample objects now that they are concatenated into `adata`.
del adata_list

In [6]:
# Keep an independent copy of X in the "counts" layer; later steps read this layer.
adata.layers["counts"]=adata.X.copy()

In [7]:
# Gene selection with scvi-tools' Poisson-based method on the "counts" layer,
# using `sample_id` as the batch key. `subset=True` restricts `adata` to the
# 8000 selected genes in place.
scvi.data.poisson_gene_selection(
    adata,
    layer="counts",
    n_top_genes=8000,
    batch_key="sample_id",
    subset=True,
)

Trainer will use only 1 of 4 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=4)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.


Sampling from binomial...:   0%|          | 0/10000 [00:00<?, ?it/s]

Sampling from binomial...:   0%|          | 0/10000 [00:00<?, ?it/s]

Sampling from binomial...:   0%|          | 0/10000 [00:00<?, ?it/s]

Sampling from binomial...:   0%|          | 0/10000 [00:00<?, ?it/s]

Sampling from binomial...:   0%|          | 0/10000 [00:00<?, ?it/s]

Sampling from binomial...:   0%|          | 0/10000 [00:00<?, ?it/s]

Sampling from binomial...:   0%|          | 0/10000 [00:00<?, ?it/s]

Sampling from binomial...:   0%|          | 0/10000 [00:00<?, ?it/s]

In [8]:
# Convert the counts layer to a dense array. `.toarray()` already allocates a
# fresh dense array, so the former trailing `.copy()` only duplicated the
# full cells x genes matrix in memory for no benefit.
adata.layers["counts"] = adata.layers["counts"].toarray()

In [9]:
# Register the AnnData with scVI: counts are read from the "counts" layer and
# `sample_id` is used as the batch key.
model_cls = scvi.model.SCVI
model_cls.setup_anndata(adata, batch_key="sample_id", layer="counts")

# Hyperparameter search space for autotune. `tune.choice` draws from the listed
# values and `tune.loguniform` draws on a log scale between the two bounds;
# `max_epochs` is a fixed value shared by every trial.
search_space = {
    "model_params": {
        "n_hidden": tune.choice([64, 128, 256]),
        "n_layers": tune.choice([1, 2, 3, 4]),
        "n_latent": tune.choice([10, 20, 30, 40, 50]),
        "gene_likelihood": tune.choice(["nb", "zinb"]),
    },
    "train_params": {
        "max_epochs": 100,
        "plan_kwargs": {
            "lr": tune.loguniform(1e-4, 1e-2),
        },
    },
}

In [10]:
# Display the AnnData summary (dimensions, obs/var columns, layers).
adata

AnnData object with n_obs × n_vars = 137651 × 8000
    obs: 'background_fraction', 'cell_probability', 'cell_size', 'droplet_efficiency', 'n_raw', 'n_cellbender', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'vaeda_scores', 'vaeda_calls', 'outlier', 'mt_outlier', 'sample_id', 'condition', '_scvi_batch', '_scvi_labels'
    var: 'highly_variable', 'observed_fraction_zeros', 'expected_fraction_zeros', 'prob_zero_enriched_nbatches', 'prob_zero_enrichment', 'prob_zero_enrichment_rank'
    uns: 'hvg', '_scvi_uuid', '_scvi_manager_uuid'
    obsm: 'cellbender_embedding', 'vaeda_embedding'
    layers: 'cellbender', 'raw', 'counts'

In [11]:
# Start a local Ray instance for the hyperparameter search.
# NOTE(review): dashboard_host="0.0.0.0" binds the dashboard to all network
# interfaces; confirm this machine is not reachable from untrusted networks.
ray.init(
    include_dashboard=True,
    dashboard_host="0.0.0.0",
    dashboard_port=8889,
    log_to_driver=False,  # keep worker logs out of the notebook output
    runtime_env={"env_vars": {"TORCH_ALLOW_TF32_CUBLAS_OVERRIDE": "1"}},
    _temp_dir="/opt/dlami/nvme/ray",
)

  self.pid = _posixsubprocess.fork_exec(
  self.pid = _posixsubprocess.fork_exec(
2024-10-25 22:24:25,539	INFO worker.py:1715 -- Started a local Ray instance. View the dashboard at 12.0.0.75:8889 


0,1
Python version:,3.10.13
Ray version:,2.9.3
Dashboard:,http://12.0.0.75:8889


In [None]:
# Run the hyperparameter search: 110 sampled configurations, selecting for the
# lowest validation loss. `resources` requests 16 CPUs and 2 GPUs per trial.
# NOTE(review): confirm each scVI trial actually uses both GPUs; if it trains
# on a single device, requesting 1 GPU per trial would allow more concurrency.
results = autotune.run_autotune(
    model_cls,
    data=adata,
    metrics="validation_loss",
    mode="min",
    search_space=search_space,
    num_samples=110,
    resources={"cpu": 16, "gpu": 2},
    logging_dir="/opt/dlami/nvme",
)

0,1
Current time:,2024-10-26 01:19:07
Running for:,02:54:37.23
Memory:,79.9/181.8 GiB

Trial name,status,loc,model_params/gene_li kelihood,model_params/n_hidde n,model_params/n_laten t,model_params/n_layer s,train_params/max_epo chs,train_params/plan_kw args/lr,iter,total time (s),validation_loss
_trainable_3f0f2181,RUNNING,12.0.0.75:18480,zinb,256,50,2,100,0.00273047,3.0,39.2613,5751.61
_trainable_a47aace4,RUNNING,12.0.0.75:18561,zinb,256,50,2,100,0.0041641,3.0,119.013,5757.92
_trainable_4be13cfe,PENDING,,zinb,256,50,2,100,0.00321819,,,
_trainable_0a098449,TERMINATED,12.0.0.75:18480,nb,256,40,2,100,0.000642181,1.0,14.1751,6133.52
_trainable_0a5ab68c,TERMINATED,12.0.0.75:18480,zinb,256,50,2,100,0.000817868,1.0,14.7179,6071.12
_trainable_0b05f8ab,TERMINATED,12.0.0.75:18480,zinb,128,20,1,100,0.000624557,1.0,93.0779,6169.95
_trainable_0b369d4a,TERMINATED,12.0.0.75:18561,zinb,64,10,4,100,0.00119812,1.0,15.5016,6215.79
_trainable_0cf59b85,TERMINATED,12.0.0.75:18561,zinb,256,20,1,100,0.00869011,32.0,337.477,5679.8
_trainable_0dbb23cb,TERMINATED,12.0.0.75:18480,zinb,256,50,2,100,0.000224682,1.0,15.0754,6300.78
_trainable_0ebfcc90,TERMINATED,12.0.0.75:18561,zinb,64,10,2,100,0.000101686,1.0,13.9812,6859.92


In [None]:
# Collect the per-trial results of the search into a DataFrame.
df=results.result_grid.get_dataframe()

In [None]:
# Write the trial table to CSV (path is relative to the working directory).
df.to_csv("nb/scvi/autotune_results.csv")

In [None]:
# Stop the local Ray instance started for the search.
ray.shutdown()

In [15]:
# Show the trial with the lowest validation loss.
results.result_grid.get_best_result(metric="validation_loss", mode="min")

    

Result(
  metrics={'validation_loss': 5635.21240234375},
  path='/opt/dlami/nvme/scvi_b6a36029-d112-43ab-92af-b4b1ca5f2056/_trainable_58deecf1_27_gene_likelihood=zinb,n_hidden=256,n_latent=50,n_layers=2,max_epochs=100,lr=0.0024_2024-10-26_00-03-45',
  filesystem='local',
  checkpoint=None
)

In [None]:
# Persist the autotune experiment object for later inspection.
# Create the target directory first so a multi-hour search is not lost to a
# FileNotFoundError at save time.
# Only unpickle this file from a trusted location: loading a pickle can
# execute arbitrary code.
autotune_pickle = 'h5ad/data/autotune.pckl'
os.makedirs(os.path.dirname(autotune_pickle), exist_ok=True)
with open(autotune_pickle, 'wb') as f:
    pickle.dump(results, f)
