# 2.model_structure

In [1]:
import sys
sys.path = ["../../.."] + sys.path # 切换到项目目录下

import scanpy as sc
import scvelo as scv
import velovgi

from ray import tune, air
from ray.air import session

Global seed set to 0
  new_rank_zero_deprecation(
  return new_rank_zero_deprecation(*args, **kwargs)


1. 目标函数

In [2]:
from pytorch_lightning import loggers
from torch_geometric import seed_everything

# TODO: 跳整多种参数，简化调参Trail的名字
def train_velovgi(config):
    # 提取参数
    # 随机数种子，确保结果的可复现性
    random_seed = config.get("random_seed", 0)
    # 预处理的参数
    n_bnn_neighbors = config.get("n_bnn_neighbors", 15)
    n_knn_neighbors = config.get("n_knn_neighbors", 15)
    is_ot = config.get("is_ot", True)
    # 模型结构参数
    n_hidden = config.get("n_hidden", 256)
    n_latent = config.get("n_latent", 10)
    n_layers = config.get("n_layers", 1)
    # 训练参数
    num_neighbors = [config.get("num_neighbors", 8)]*n_layers
    max_epochs = config.get("max_epochs", 10) # TODO:这里是最关键的一个参数，小epochs测试之后再提交到服务器上用大epoch
    batch_size = config.get("batch_size", 64)
    max_kl_weight = config.get("max_kl_weight", 0.8)

    name = ""
    for k,v in config.items():
        name += "%s_%s,"%(k, v)
    name = name[:-1]

    # seed_everything(random_seed)
    # # TODO:数据读入，对于不同的数据集这里需要替换
    # adata_filename = "/mnt/h/F_bak/Python进阶/scRNA/Other/velovgi_workstation/notebook/local_pc/erythroid_lineage/data/erythroid_lineage.h5ad" # 数据路径使用绝对路径
    # adata = scv.read(adata_filename)
    # batch_key = "stage" # 批次key
    cluster_key = "celltype" # 细胞类型key
    cluster_edges = [
    ("Blood progenitors 1", "Blood progenitors 2"), 
    ("Blood progenitors 2", "Erythroid1"), 
    ("Erythroid1", "Erythroid2"), 
    ("Erythroid2", "Erythroid3")
    ] # 指定对应数据集已知的细胞类型间的分化信息

    # # TODO:预处理，这里batch_pair_list以后可能需要手动指定
    # batch_list = list(adata.obs[batch_key].cat.categories)
    # batch_pair_list = list(zip(batch_list[:-1], batch_list[1:]))
    # subsample_adata = velovgi.pp.preprocess(adata,
    #                                         n_bnn_neighbors=n_bnn_neighbors,
    #                                         n_knn_neighbors=n_knn_neighbors,
    #                                         batch_key=batch_key,
    #                                         batch_pair_list=batch_pair_list,
    #                                         is_ot=is_ot)
    # TODO: 如果需要调整预处理之后的参数，就不需要重复做预处理了，读取预处理之后的结果即可
    adata = velovgi.tl.read_adata("/mnt/h/F_bak/Python进阶/scRNA/Other/velovgi_workstation/notebook/local_pc/erythroid_lineage/data/adata")
    subsample_adata = scv.read("/mnt/h/F_bak/Python进阶/scRNA/Other/velovgi_workstation/notebook/local_pc/erythroid_lineage/data/subsample_adata.h5ad") # 使用这个AnnData做训练
    seed_everything(random_seed)

    # 模型训练
    logger = loggers.TensorBoardLogger(save_dir="./log", name=name)
    velovgi.tl.VELOVGI.setup_anndata(adata=subsample_adata, spliced_layer="Ms", unspliced_layer="Mu")
    velovgi_model = velovgi.tl.VELOVGI(subsample_adata,
                                       n_hidden=n_hidden,
                                       n_latent=n_latent,
                                       n_layers=n_layers)
    velovgi_model.train(num_neighbors=num_neighbors,
                        max_epochs=max_epochs,
                        batch_size=batch_size,
                        plan_kwargs={"max_kl_weight": max_kl_weight},
                        logger=logger)

    # 模型恢复
    velovgi.tl.add_velovi_outputs_to_adata(subsample_adata, velovgi_model) # 模型输出
    velovgi.pp.moment_recover(adata, subsample_adata) # 恢复

    # 速率计算
    scv.tl.velocity_graph(adata)
    scv.pl.velocity_embedding(adata, color=cluster_key, title=name, save="arrow.png")
    scv.pl.velocity_embedding_stream(adata, color=cluster_key, title=name, legend_loc="right", save="stream.png")

    # 伪时间计算
    scv.tl.velocity_pseudotime(adata)
    scv.pl.velocity_embedding_stream(adata, color="velocity_pseudotime", title=name, colorbar=False, save="pseudotime.png")


    # 保存结果
    subsample_adata.write("subsample_adata.h5ad")
    velovgi.tl.write_adata(adata, "adata")
    velovgi_model.save("model")

    # 计算指标评价
    adata_velo = velovgi.tl.pre_metric(adata)
    exp_metrics = velovgi.tl.summary_metric(adata_velo, cluster_edges, cluster_key)[-1] # 计算指标汇总后的结果

    session.report({"CBDir": exp_metrics["CBDir"], "ICVCoh": exp_metrics["ICVCoh"]})


2. 搜索空间，这里可以添加键值，实现更多层面的网格调参

In [3]:
search_space = {
    "max_epochs" : tune.grid_search([40, 50]),
}

3. 执行调参，等待传入实验名称和搜索空间

In [4]:
from ray.tune.schedulers import ASHAScheduler

name = "max_epochs_tune" # TODO:指定此次调参的名字，这里是预处理过程的调参

tuner = tune.Tuner(
    train_velovgi,
    tune_config=tune.TuneConfig(
        metric="CBDir",
        mode="max",
        scheduler=ASHAScheduler()
    ),
    run_config=air.RunConfig(
        local_dir="./results", # Trail内部具体输出结果在这里保存
        name=name # 开启调参的Tensorboard日志
    ),
    param_space=search_space,
)

results = tuner.fit()

2023-06-10 15:24:25,495	INFO worker.py:1625 -- Started a local Ray instance.
2023-06-10 15:24:27,213	INFO tune.py:218 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `Tuner(...)`.
2023-06-10 15:24:27,228	INFO tensorboardx.py:172 -- pip install "ray[tune]" to see TensorBoard files.


0,1
Current time:,2023-06-10 15:26:20
Running for:,00:01:53.67
Memory:,11.4/12.4 GiB

Trial name,# failures,error file
train_velovgi_d505b_00001,1,/mnt/h/F_bak/Python进阶/scRNA/Other/velovgi_workstation/notebook/local_pc/erythroid_lineage/tune/results/max_epochs_tune/train_velovgi_d505b_00001_1_max_epochs=50_2023-06-10_15-24-32/error.txt

Trial name,status,loc,max_epochs,iter,total time (s),CBDir,ICVCoh
train_velovgi_d505b_00000,TERMINATED,172.29.205.215:8846,40,1.0,108.199,0.836977,0.967631
train_velovgi_d505b_00001,ERROR,172.29.205.215:8910,50,,,,


[2m[36m(pid=8846)[0m Global seed set to 0
[2m[36m(pid=8846)[0m   new_rank_zero_deprecation(
[2m[36m(pid=8846)[0m   return new_rank_zero_deprecation(*args, **kwargs)


[2m[36m(train_velovgi pid=8846)[0m load /mnt/h/F_bak/Python进阶/scRNA/Other/velovgi_workstation/notebook/local_pc/erythroid_lineage/data/adata/adata.h5ad
[2m[36m(train_velovgi pid=8846)[0m load /mnt/h/F_bak/Python进阶/scRNA/Other/velovgi_workstation/notebook/local_pc/erythroid_lineage/data/adata/sample_recover.pkl
[2m[36m(train_velovgi pid=8846)[0m 初始训练，初始化runner参数
[2m[36m(train_velovgi pid=8846)[0m choosing neighbor minibatch
Epoch 1/40:   0%|          | 0/40 [00:00<?, ?it/s]


[2m[36m(train_velovgi pid=8846)[0m GPU available: False, used: False
[2m[36m(train_velovgi pid=8846)[0m TPU available: False, using: 0 TPU cores
[2m[36m(train_velovgi pid=8846)[0m IPU available: False, using: 0 IPUs
[2m[36m(train_velovgi pid=8846)[0m HPU available: False, using: 0 HPUs
[2m[36m(train_velovgi pid=8846)[0m Missing logger folder: ./log/max_epochs_40


Epoch 2/40:   2%|▎         | 1/40 [00:01<01:07,  1.74s/it, loss=1.91e+06, v_num=0]


[2m[36m(pid=8910)[0m Global seed set to 0
[2m[36m(pid=8910)[0m   new_rank_zero_deprecation(
[2m[36m(pid=8910)[0m   return new_rank_zero_deprecation(*args, **kwargs)


Epoch 3/40:   5%|▌         | 2/40 [00:03<01:02,  1.63s/it, loss=1.87e+06, v_num=0]
[2m[36m(train_velovgi pid=8910)[0m load /mnt/h/F_bak/Python进阶/scRNA/Other/velovgi_workstation/notebook/local_pc/erythroid_lineage/data/adata/sample_recover.pkl
[2m[36m(train_velovgi pid=8910)[0m load /mnt/h/F_bak/Python进阶/scRNA/Other/velovgi_workstation/notebook/local_pc/erythroid_lineage/data/adata/sample_recover.pkl
Epoch 3/40:   8%|▊         | 3/40 [00:04<01:00,  1.62s/it, loss=1.85e+06, v_num=0]
Epoch 4/40:   8%|▊         | 3/40 [00:04<01:00,  1.62s/it, loss=1.85e+06, v_num=0]
Epoch 4/40:   8%|▊         | 3/40 [00:04<01:00,  1.62s/it, loss=1.85e+06, v_num=0]
Epoch 4/40:   8%|▊         | 3/40 [00:04<01:00,  1.62s/it, loss=1.85e+06, v_num=0]
Epoch 4/40:  10%|█         | 4/40 [00:06<00:58,  1.62s/it, loss=1.83e+06, v_num=0]
Epoch 4/40:  10%|█         | 4/40 [00:06<00:58,  1.62s/it, loss=1.83e+06, v_num=0]
Epoch 5/40:  10%|█         | 4/40 [00:06<00:58,  1.62s/it, loss=1.83e+06, v_num=0]
Epoch 5/40

[2m[36m(train_velovgi pid=8846)[0m `Trainer.fit` stopped: `max_epochs=40` reached.
[2m[36m(train_velovgi pid=8910)[0m `Trainer.fit` stopped: `max_epochs=40` reached.
[2m[36m(train_velovgi pid=8910)[0m `Trainer.fit` stopped: `max_epochs=40` reached.
[2m[36m(train_velovgi pid=8910)[0m `Trainer.fit` stopped: `max_epochs=40` reached.
[2m[36m(train_velovgi pid=8910)[0m `Trainer.fit` stopped: `max_epochs=40` reached.
[2m[36m(train_velovgi pid=8910)[0m `Trainer.fit` stopped: `max_epochs=40` reached.
2023-06-10 15:25:49,895	ERROR trial_runner.py:1450 -- Trial train_velovgi_d505b_00001: Error happened when processing _ExecutorEventType.TRAINING_RESULT.
ray.tune.error._TuneNoNextExecutorEventError: Traceback (most recent call last):
  File "/usr/local/conda/envs/velovi-env/lib/python3.8/site-packages/ray/tune/execution/ray_trial_executor.py", line 1231, in get_next_executor_event
    future_result = ray.get(ready_future)
  File "/usr/local/conda/envs/velovi-env/lib/python3.8/si

Trial name,CBDir,ICVCoh,date,done,experiment_tag,hostname,iterations_since_restore,node_ip,pid,time_since_restore,time_this_iter_s,time_total_s,timestamp,training_iteration,trial_id
train_velovgi_d505b_00000,0.8369767347720017,0.9676312586003908,2023-06-10_15-26-20,True,0_max_epochs=40,DESKTOP-9GVJMSD,1.0,172.29.205.215,8846,108.1987648010254,108.1987648010254,108.1987648010254,1686381980,1.0,d505b_00000
train_velovgi_d505b_00001,,,2023-06-10_15-24-38,,,DESKTOP-9GVJMSD,,172.29.205.215,8910,,,,1686381878,,d505b_00001


2023-06-10 15:25:49,962	ERROR ray_trial_executor.py:883 -- An exception occurred when trying to stop the Ray actor:Traceback (most recent call last):
  File "/usr/local/conda/envs/velovi-env/lib/python3.8/site-packages/ray/tune/execution/ray_trial_executor.py", line 874, in _resolve_stop_event
    ray.get(future, timeout=timeout)
  File "/usr/local/conda/envs/velovi-env/lib/python3.8/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/conda/envs/velovi-env/lib/python3.8/site-packages/ray/_private/worker.py", line 2523, in get
    raise value
ray.exceptions.OutOfMemoryError: Task was killed due to the node running low on memory.
Memory on the node (IP: 172.29.205.215, ID: 2a97e891dee2655b045830313aa387420a682a8acb44b58ba0d90210) where the task (actor ID: 24834c27f254c870697a7a3401000000, name=ImplicitFunc.__init__, pid=8910, memory used=0.66GB) was running was 11.79GB / 12.39GB (0.951148), which exceeds the memory usa

[2m[36m(train_velovgi pid=8846)[0m computing velocity graph (using 1/12 cores)
Epoch 38/50:  74%|███████▍  | 37/50 [01:08<00:24,  1.86s/it, loss=1.04e+06, v_num=0][32m [repeated 2x across cluster][0m
[2m[36m(train_velovgi pid=8846)[0m   0%|          | 0/500 [00:00<?, ?cells/s]
[2m[36m(train_velovgi pid=8846)[0m     finished (0:00:01) --> added 
[2m[36m(train_velovgi pid=8846)[0m     'velocity_graph', sparse matrix with cosine correlations (adata.uns)
[2m[36m(train_velovgi pid=8846)[0m computing velocity embedding
[2m[36m(train_velovgi pid=8846)[0m     finished (0:00:00) --> added
[2m[36m(train_velovgi pid=8846)[0m     'velocity_umap', embedded velocity vectors (adata.obsm)
[2m[36m(train_velovgi pid=8846)[0m saving figure to file ./figures/scvelo_arrow.png
[2m[36m(train_velovgi pid=8846)[0m Figure(640x480)
[2m[36m(train_velovgi pid=8846)[0m saving figure to file ./figures/scvelo_stream.png
[2m[36m(train_velovgi pid=8846)[0m Figure(640x480)
[2m[36m(tr

2023-06-10 15:26:20,917	ERROR tune.py:941 -- Trials did not complete: [train_velovgi_d505b_00001]
2023-06-10 15:26:20,917	INFO tune.py:945 -- Total run time: 113.70 seconds (113.65 seconds for the tuning loop).


[2m[33m(raylet)[0m [2023-06-10 15:26:25,461 E 8187 8187] (raylet) node_manager.cc:3071: 3 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: 2a97e891dee2655b045830313aa387420a682a8acb44b58ba0d90210, IP: 172.29.205.215) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 172.29.205.215`
[2m[33m(raylet)[0m 
[2m[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.
