In [1]:
import sys
sys.path = ["../../.."] + sys.path # 切换到项目目录下

import scanpy as sc
import scvelo as scv
import velovgi

from ray import tune, air
from ray.air import session

Global seed set to 0
  new_rank_zero_deprecation(
  return new_rank_zero_deprecation(*args, **kwargs)


## 1. 读取原始数据

In [2]:
adata = scv.read("../../../erythroid_lineage/data/erythroid_lineage.h5ad")

cluster_key = "celltype"
batch_key="stage"

adata

AnnData object with n_obs × n_vars = 500 × 53801
    obs: 'sample', 'stage', 'sequencing.batch', 'theiler', 'celltype'
    var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand', 'MURK_gene', 'Δm', 'scaled Δm'
    uns: 'celltype_colors'
    obsm: 'X_pca', 'X_umap'
    layers: 'spliced', 'unspliced'

In [3]:
batch_pair_list = [
    ["E7.0", "E7.25"],
    ["E7.25", "E7.5"],
    ["E7.5", "E7.75"],
    ["E7.75", "E8.0"],
    ["E8.0", "E8.25"],
    ["E8.25", "E8.5"],
]

In [4]:
cluster_edges = [
    ("Blood progenitors 1", "Blood progenitors 2"), 
    ("Blood progenitors 2", "Erythroid1"), 
    ("Erythroid1", "Erythroid2"), 
    ("Erythroid2", "Erythroid3")
    ] # 已知的细胞类型间的分化信息

## 2. 在预处理部分调整构建邻居数量的两个参数

1. 目标函数

In [5]:
from pytorch_lightning import loggers
from torch_geometric import seed_everything

def train_velovgi(config):

    # 邻居数量的两个参数
    n_bnn_neighbors = config["n_bnn_neighbors"]
    n_knn_neighbors = config["n_knn_neighbors"]
    
    knn_mask, bnn_mask, subsample_adata = velovgi.pp.preprocess(adata, n_bnn_neighbors, n_knn_neighbors, sample_mode="random", batch_key=batch_key, batch_pair_list=batch_pair_list)

    name = ""
    for k,v in config.items():
        name += "%s_%s,"%(k, v)
    name = name[:-1]

    seed_everything(0)
    # 模型训练
    logger = loggers.TensorBoardLogger(save_dir="./log", name=name)
    velovgi.tl.VELOVGI.setup_anndata(adata=subsample_adata, spliced_layer="Ms", unspliced_layer="Mu")
    velovgi_model = velovgi.tl.VELOVGI(subsample_adata)
    velovgi_model.train(logger=logger, max_epochs=50)

    # 模型恢复
    velovgi.tl.add_velovi_outputs_to_adata(subsample_adata, velovgi_model) # 模型输出
    velovgi.pp.moment_recover(adata, subsample_adata) # 恢复

    # 速率计算
    scv.tl.velocity_graph(adata)
    scv.pl.velocity_embedding(adata, color=cluster_key)
    scv.pl.velocity_embedding_stream(adata, color=cluster_key, save="%s.png"%name)

    # 计算指标评价
    adata_velo = velovgi.tl.pre_metric(adata)
    exp_metrics = velovgi.tl.summary_metric(adata_velo, cluster_edges, cluster_key)[-1] # 计算指标汇总后的结果

    session.report({"CBDir": exp_metrics["CBDir"], "ICVCoh": exp_metrics["ICVCoh"]})


2. 搜索空间，网格搜索

In [6]:
search_space = {
    "n_bnn_neighbors": tune.grid_search([15, 45]),
    "n_knn_neighbors": tune.grid_search([15),
}

3. 执行调参

In [7]:
from ray.tune.schedulers import ASHAScheduler

name = "test_experiment"

tuner = tune.Tuner(
    train_velovgi,
    tune_config=tune.TuneConfig(
        metric="CBDir",
        mode="max",
        scheduler=ASHAScheduler()
    ),
    run_config=air.RunConfig(
        local_dir="./results", # Trail内部具体输出结果在这里保存
        name=name # 开启调参的Tensorboard日志
    ),
    param_space=search_space,
)

results = tuner.fit()

2023-06-03 23:40:10,927	INFO worker.py:1625 -- Started a local Ray instance.
2023-06-03 23:40:12,744	INFO tune.py:218 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `Tuner(...)`.
2023-06-03 23:40:13,086	INFO tensorboardx.py:172 -- pip install "ray[tune]" to see TensorBoard files.


0,1
Current time:,2023-06-03 23:42:34
Running for:,00:02:21.56
Memory:,11.5/12.4 GiB

Trial name,# failures,error file
train_velovgi_ee0b1_00003,1,"/mnt/h/F_bak/Python进阶/scRNA/Other/velovgi_workstation/notebook/local_pc/dev_notebook/23_05_30模型调参/erythroid_lineage调参/results/test_experiment/train_velovgi_ee0b1_00003_3_n_bnn_neighbors=45,n_knn_neighbors=45_2023-06-03_23-40-38/error.txt"

Trial name,status,loc,n_bnn_neighbors,n_knn_neighbors
train_velovgi_ee0b1_00000,RUNNING,172.24.199.116:25135,15,15
train_velovgi_ee0b1_00001,RUNNING,172.24.199.116:25199,45,15
train_velovgi_ee0b1_00002,RUNNING,172.24.199.116:25257,15,45
train_velovgi_ee0b1_00003,ERROR,172.24.199.116:25350,45,45


[2m[36m(pid=25135)[0m Global seed set to 0
[2m[36m(pid=25135)[0m   new_rank_zero_deprecation(
[2m[36m(pid=25135)[0m   return new_rank_zero_deprecation(*args, **kwargs)


[2m[36m(train_velovgi pid=25135)[0m Filtered out 51490 genes that are detected 20 counts (shared).
[2m[36m(train_velovgi pid=25135)[0m Normalized count data: X, spliced, unspliced.
[2m[36m(train_velovgi pid=25135)[0m Extracted 2000 highly variable genes.
[2m[36m(train_velovgi pid=25135)[0m Logarithmized X.
[2m[36m(train_velovgi pid=25135)[0m calculating knn and bnn mask...
[2m[36m(train_velovgi pid=25135)[0m pair_list : [['E7.0', 'E7.25'], ['E7.25', 'E7.5'], ['E7.5', 'E7.75'], ['E7.75', 'E8.0'], ['E8.0', 'E8.25'], ['E8.25', 'E8.5']]


[2m[36m(pid=25199)[0m Global seed set to 0
[2m[36m(pid=25199)[0m   new_rank_zero_deprecation(
[2m[36m(pid=25199)[0m   return new_rank_zero_deprecation(*args, **kwargs)


[2m[36m(train_velovgi pid=25199)[0m Normalized count data: X, spliced, unspliced.
[2m[36m(train_velovgi pid=25199)[0m Normalized count data: X, spliced, unspliced.
[2m[36m(train_velovgi pid=25199)[0m Extracted 2000 highly variable genes.
[2m[36m(train_velovgi pid=25199)[0m Logarithmized X.
[2m[36m(train_velovgi pid=25199)[0m calculating knn and bnn mask...
[2m[36m(train_velovgi pid=25199)[0m pair_list : [['E7.0', 'E7.25'], ['E7.25', 'E7.5'], ['E7.5', 'E7.75'], ['E7.75', 'E8.0'], ['E8.0', 'E8.25'], ['E8.25', 'E8.5']]


[2m[36m(train_velovgi pid=25135)[0m OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


[2m[36m(train_velovgi pid=25135)[0m smoothing...
[2m[36m(train_velovgi pid=25135)[0m or is corrupted (e.g. due to subsetting). Consider recomputing with `pp.neighbors`.
[2m[36m(train_velovgi pid=25135)[0m computing moments based on connectivities
[2m[36m(train_velovgi pid=25135)[0m     finished (0:00:00) --> added 
[2m[36m(train_velovgi pid=25135)[0m     'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
[2m[36m(train_velovgi pid=25135)[0m 初始训练，初始化runner参数
[2m[36m(train_velovgi pid=25135)[0m choosing neighbor minibatch
Epoch 1/50:   0%|          | 0/50 [00:00<?, ?it/s]


[2m[36m(train_velovgi pid=25135)[0m GPU available: False, used: False
[2m[36m(train_velovgi pid=25135)[0m TPU available: False, using: 0 TPU cores
[2m[36m(train_velovgi pid=25135)[0m IPU available: False, using: 0 IPUs
[2m[36m(train_velovgi pid=25135)[0m HPU available: False, using: 0 HPUs
[2m[36m(train_velovgi pid=25135)[0m Missing logger folder: ./log/n_bnn_neighbors_15,n_knn_neighbors_15


Epoch 1/50:   2%|▏         | 1/50 [00:01<01:12,  1.48s/it, loss=1.96e+06, v_num=0]
Epoch 2/50:   2%|▏         | 1/50 [00:01<01:12,  1.48s/it, loss=1.96e+06, v_num=0]
Epoch 2/50:   4%|▍         | 2/50 [00:02<01:08,  1.43s/it, loss=1.96e+06, v_num=0]
Epoch 3/50:   4%|▍         | 2/50 [00:02<01:08,  1.43s/it, loss=1.96e+06, v_num=0]


[2m[36m(pid=25257)[0m Global seed set to 0


Epoch 4/50:   6%|▌         | 3/50 [00:04<01:07,  1.43s/it, loss=1.95e+06, v_num=0]
Epoch 4/50:   8%|▊         | 4/50 [00:05<01:00,  1.33s/it, loss=1.94e+06, v_num=0]
Epoch 5/50:   8%|▊         | 4/50 [00:05<01:00,  1.33s/it, loss=1.94e+06, v_num=0]


[2m[36m(pid=25257)[0m   new_rank_zero_deprecation(
[2m[36m(pid=25257)[0m   return new_rank_zero_deprecation(*args, **kwargs)


Epoch 5/50:  10%|█         | 5/50 [00:06<00:58,  1.29s/it, loss=1.94e+06, v_num=0]
Epoch 6/50:  10%|█         | 5/50 [00:06<00:58,  1.29s/it, loss=1.94e+06, v_num=0]
Epoch 6/50:  12%|█▏        | 6/50 [00:08<01:02,  1.41s/it, loss=1.93e+06, v_num=0]
[2m[36m(train_velovgi pid=25257)[0m Filtered out 51490 genes that are detected 20 counts (shared).
[2m[36m(train_velovgi pid=25257)[0m Normalized count data: X, spliced, unspliced.
Epoch 7/50:  12%|█▏        | 6/50 [00:08<01:02,  1.41s/it, loss=1.93e+06, v_num=0]
[2m[36m(train_velovgi pid=25257)[0m Extracted 2000 highly variable genes.
[2m[36m(train_velovgi pid=25257)[0m Logarithmized X.
[2m[36m(train_velovgi pid=25257)[0m calculating knn and bnn mask...
[2m[36m(train_velovgi pid=25257)[0m pair_list : [['E7.0', 'E7.25'], ['E7.25', 'E7.5'], ['E7.5', 'E7.75'], ['E7.75', 'E8.0'], ['E8.0', 'E8.25'], ['E8.25', 'E8.5']]


[2m[36m(train_velovgi pid=25199)[0m OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


Epoch 8/50:  14%|█▍        | 7/50 [00:09<00:59,  1.39s/it, loss=1.93e+06, v_num=0]
Epoch 9/50:  16%|█▌        | 8/50 [00:11<00:59,  1.42s/it, loss=1.92e+06, v_num=0]
[2m[36m(train_velovgi pid=25199)[0m smoothing...
[2m[36m(train_velovgi pid=25199)[0m or is corrupted (e.g. due to subsetting). Consider recomputing with `pp.neighbors`.
[2m[36m(train_velovgi pid=25199)[0m computing moments based on connectivities
[2m[36m(train_velovgi pid=25199)[0m     finished (0:00:00) --> added 
[2m[36m(train_velovgi pid=25199)[0m     'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
[2m[36m(train_velovgi pid=25199)[0m 初始训练，初始化runner参数
[2m[36m(train_velovgi pid=25199)[0m choosing neighbor minibatch
Epoch 1/50:   0%|          | 0/50 [00:00<?, ?it/s]


[2m[36m(train_velovgi pid=25199)[0m GPU available: False, used: False
[2m[36m(train_velovgi pid=25199)[0m TPU available: False, using: 0 TPU cores
[2m[36m(train_velovgi pid=25199)[0m IPU available: False, using: 0 IPUs
[2m[36m(train_velovgi pid=25199)[0m HPU available: False, using: 0 HPUs
[2m[36m(train_velovgi pid=25199)[0m Missing logger folder: ./log/n_bnn_neighbors_45,n_knn_neighbors_15


Epoch 9/50:  18%|█▊        | 9/50 [00:12<00:59,  1.45s/it, loss=1.91e+06, v_num=0]
Epoch 10/50:  18%|█▊        | 9/50 [00:12<00:59,  1.45s/it, loss=1.91e+06, v_num=0]
Epoch 10/50:  20%|██        | 10/50 [00:14<01:03,  1.59s/it, loss=1.91e+06, v_num=0]
Epoch 11/50:  20%|██        | 10/50 [00:15<01:03,  1.59s/it, loss=1.91e+06, v_num=0]
Epoch 3/50:   4%|▍         | 2/50 [00:04<01:34,  1.97s/it, loss=1.94e+06, v_num=0]


[2m[36m(pid=25350)[0m Global seed set to 0


Epoch 12/50:  22%|██▏       | 11/50 [00:16<01:10,  1.81s/it, loss=1.9e+06, v_num=0][32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m


[2m[36m(pid=25350)[0m   new_rank_zero_deprecation(
[2m[36m(pid=25350)[0m   return new_rank_zero_deprecation(*args, **kwargs)


Epoch 4/50:   8%|▊         | 4/50 [00:08<01:34,  2.05s/it, loss=1.93e+06, v_num=0][32m [repeated 5x across cluster][0m
[2m[36m(train_velovgi pid=25350)[0m Filtered out 51490 genes that are detected 20 counts (shared).
[2m[36m(train_velovgi pid=25350)[0m Normalized count data: X, spliced, unspliced.
[2m[36m(train_velovgi pid=25350)[0m Extracted 2000 highly variable genes.
[2m[36m(train_velovgi pid=25350)[0m Logarithmized X.
[2m[36m(train_velovgi pid=25350)[0m calculating knn and bnn mask...
[2m[36m(train_velovgi pid=25350)[0m pair_list : [['E7.0', 'E7.25'], ['E7.25', 'E7.5'], ['E7.5', 'E7.75'], ['E7.75', 'E8.0'], ['E8.0', 'E8.25'], ['E8.25', 'E8.5']]


[2m[36m(train_velovgi pid=25257)[0m OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


Epoch 6/50:  10%|█         | 5/50 [00:10<01:29,  1.99s/it, loss=1.93e+06, v_num=0][32m [repeated 4x across cluster][0m
[2m[36m(train_velovgi pid=25257)[0m smoothing...
[2m[36m(train_velovgi pid=25257)[0m or is corrupted (e.g. due to subsetting). Consider recomputing with `pp.neighbors`.
[2m[36m(train_velovgi pid=25257)[0m computing moments based on connectivities
Epoch 14/50:  28%|██▊       | 14/50 [00:23<01:16,  2.14s/it, loss=1.88e+06, v_num=0][32m [repeated 2x across cluster][0m
[2m[36m(train_velovgi pid=25257)[0m     finished (0:00:00) --> added 
[2m[36m(train_velovgi pid=25257)[0m     'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
[2m[36m(train_velovgi pid=25257)[0m 初始训练，初始化runner参数
[2m[36m(train_velovgi pid=25257)[0m choosing neighbor minibatch
Epoch 16/50:  30%|███       | 15/50 [00:26<01:16,  2.19s/it, loss=1.88e+06, v_num=0][32m [repeated 2x across cluster][0m


Epoch 8/50:  14%|█▍        | 7/50 [00:14<01:28,  2.06s/it, loss=1.91e+06, v_num=0][32m [repeated 2x across cluster][0m
Epoch 16/50:  32%|███▏      | 16/50 [00:28<01:20,  2.38s/it, loss=1.87e+06, v_num=0][32m [repeated 2x across cluster][0m
Epoch 2/50:   2%|▏         | 1/50 [00:04<03:26,  4.22s/it, loss=1.89e+06, v_num=0][32m [repeated 2x across cluster][0m


2023-06-03 23:41:03,368	ERROR trial_runner.py:1450 -- Trial train_velovgi_ee0b1_00003: Error happened when processing _ExecutorEventType.TRAINING_RESULT.
ray.tune.error._TuneNoNextExecutorEventError: Traceback (most recent call last):
  File "/usr/local/conda/envs/velovi-env/lib/python3.8/site-packages/ray/tune/execution/ray_trial_executor.py", line 1231, in get_next_executor_event
    future_result = ray.get(ready_future)
  File "/usr/local/conda/envs/velovi-env/lib/python3.8/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/conda/envs/velovi-env/lib/python3.8/site-packages/ray/_private/worker.py", line 2523, in get
    raise value
ray.exceptions.OutOfMemoryError: Task was killed due to the node running low on memory.
Memory on the node (IP: 172.24.199.116, ID: 2dfdba42a28fe9d17bbfb37d7aba897717420052661643c79e1b9214) where the task (actor ID: 9d1379104de1f40573f83dcc01000000, name=ImplicitFunc.__init__, pid=25350

Trial name,date,hostname,node_ip,pid,timestamp,trial_id
train_velovgi_ee0b1_00003,2023-06-03_23-40-52,DESKTOP-9GVJMSD,172.24.199.116,25350,1685806852,ee0b1_00003


2023-06-03 23:41:03,489	ERROR ray_trial_executor.py:883 -- An exception occurred when trying to stop the Ray actor:Traceback (most recent call last):
  File "/usr/local/conda/envs/velovi-env/lib/python3.8/site-packages/ray/tune/execution/ray_trial_executor.py", line 874, in _resolve_stop_event
    ray.get(future, timeout=timeout)
  File "/usr/local/conda/envs/velovi-env/lib/python3.8/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/conda/envs/velovi-env/lib/python3.8/site-packages/ray/_private/worker.py", line 2523, in get
    raise value
ray.exceptions.OutOfMemoryError: Task was killed due to the node running low on memory.
Memory on the node (IP: 172.24.199.116, ID: 2dfdba42a28fe9d17bbfb37d7aba897717420052661643c79e1b9214) where the task (actor ID: 9d1379104de1f40573f83dcc01000000, name=ImplicitFunc.__init__, pid=25350, memory used=0.51GB) was running was 11.79GB / 12.39GB (0.951635), which exceeds the memory us

Epoch 18/50:  34%|███▍      | 17/50 [00:31<01:21,  2.48s/it, loss=1.87e+06, v_num=0][32m [repeated 2x across cluster][0m
Epoch 11/50:  22%|██▏       | 11/50 [00:25<01:44,  2.67s/it, loss=1.89e+06, v_num=0][32m [repeated 3x across cluster][0m
Epoch 3/50:   4%|▍         | 2/50 [00:08<03:11,  4.00s/it, loss=1.88e+06, v_num=0][32m [repeated 3x across cluster][0m
Epoch 4/50:   6%|▌         | 3/50 [00:11<02:54,  3.71s/it, loss=1.87e+06, v_num=0][32m [repeated 3x across cluster][0m


[2m[33m(raylet)[0m [2023-06-03 23:41:10,847 E 24460 24460] (raylet) node_manager.cc:3071: 5 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: 2dfdba42a28fe9d17bbfb37d7aba897717420052661643c79e1b9214, IP: 172.24.199.116) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 172.24.199.116`
[2m[33m(raylet)[0m 
[2m[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.


Epoch 13/50:  26%|██▌       | 13/50 [00:31<01:40,  2.71s/it, loss=1.88e+06, v_num=0][32m [repeated 3x across cluster][0m
Epoch 5/50:   8%|▊         | 4/50 [00:13<02:26,  3.19s/it, loss=1.87e+06, v_num=0][32m [repeated 3x across cluster][0m
Epoch 22/50:  42%|████▏     | 21/50 [00:44<01:22,  2.85s/it, loss=1.84e+06, v_num=0][32m [repeated 4x across cluster][0m
Epoch 15/50:  30%|███       | 15/50 [00:36<01:32,  2.64s/it, loss=1.87e+06, v_num=0][32m [repeated 6x across cluster][0m
Epoch 8/50:  14%|█▍        | 7/50 [00:21<02:00,  2.79s/it, loss=1.85e+06, v_num=0]
Epoch 17/50:  32%|███▏      | 16/50 [00:38<01:26,  2.54s/it, loss=1.86e+06, v_num=0][32m [repeated 5x across cluster][0m
Epoch 9/50:  18%|█▊        | 9/50 [00:26<01:40,  2.45s/it, loss=1.84e+06, v_num=0][32m [repeated 4x across cluster][0m
Epoch 26/50:  50%|█████     | 25/50 [00:55<01:11,  2.87s/it, loss=1.79e+06, v_num=0][32m [repeated 3x across cluster][0m
Epoch 11/50:  20%|██        | 10/50 [00:29<01:41,  2.55s/it,

[2m[36m(train_velovgi pid=25135)[0m `Trainer.fit` stopped: `max_epochs=50` reached.


4. 查看训练日志与最优结果: tensorboard --logdir

In [None]:
results.get_best_result().config



RuntimeError: No best trial found for the given metric: CBDir. This means that no trial has reported this metric, or all values reported for this metric are NaN. To not ignore NaN values, you can set the `filter_nan_and_inf` arg to False.