In [1]:
import os
from pathlib import Path
import hydra
from nuplan.planning.script.run_training import main as main_train
from omegaconf import DictConfig
import tempfile


def visualize(sim_dict: dict) -> str:
    """Compose a training configuration from ``sim_dict``, run it and export visualization info.

    The function (re)initializes Hydra, composes the config named by
    ``CONFIG_NAME`` with overrides built from ``sim_dict``, hands it to
    ``main_train`` and, when an engine is returned, calls
    ``engine.save_visualize_info`` on it.

    Args:
        sim_dict: Experiment settings. Required keys: ``CONFIG_PATH``,
            ``CONFIG_NAME``, ``SAVE_DIR``, ``EXPERIMENT``, ``JOB_NAME``,
            ``TRAINING_MODEL``, ``PY_FUNC``, ``SCENARIO_BUILDER``,
            ``SCENARIO_SELECTION``, ``MAX_EPOCHS``, ``BATCH_SIZE``.
            Optional key ``VIS_DIR`` overrides the directory passed to
            ``engine.save_visualize_info``; it defaults to
            ``'/data1/nuplan/jiale/model_vis'``.

    Returns:
        The log directory ``SAVE_DIR/EXPERIMENT/JOB_NAME`` as a string (the
        same value that is printed with the ``__LOG__`` prefix).

    Raises:
        KeyError: If a required key is missing from ``sim_dict``.
    """
    # Location of path with all simulation configs
    CONFIG_PATH = sim_dict['CONFIG_PATH']
    CONFIG_NAME = sim_dict['CONFIG_NAME']

    # Root directory under which experiment output and the feature cache are stored
    SAVE_DIR = sim_dict['SAVE_DIR']
    # Name of the experiment
    EXPERIMENT = sim_dict['EXPERIMENT']
    JOB_NAME = sim_dict['JOB_NAME']
    TRAINING_MODEL = sim_dict['TRAINING_MODEL']

    # Training params
    PY_FUNC = sim_dict['PY_FUNC']
    SCENARIO_BUILDER = sim_dict['SCENARIO_BUILDER']
    SCENARIO_SELECTION = sim_dict['SCENARIO_SELECTION']
    MAX_EPOCHS = sim_dict['MAX_EPOCHS']
    BATCH_SIZE = sim_dict['BATCH_SIZE']

    # Destination for the visualization dump; the default keeps the previous hard-coded path.
    VIS_DIR = sim_dict.get('VIS_DIR', '/data1/nuplan/jiale/model_vis')

    LOG_DIR = str(Path(SAVE_DIR) / EXPERIMENT / JOB_NAME)
    print('__LOG__' + LOG_DIR)

    # Initialize configuration management system
    hydra.core.global_hydra.GlobalHydra.instance().clear()  # reinitialize hydra if already initialized
    hydra.initialize(config_path=CONFIG_PATH)

    # Compose the configuration
    cfg = hydra.compose(config_name=CONFIG_NAME, overrides=[
        f'group={SAVE_DIR}',
        f'cache.cache_path={SAVE_DIR}/cache',
        f'experiment_name={EXPERIMENT}',
        f'job_name={JOB_NAME}',
        f'py_func={PY_FUNC}',  # ['train','test','cache']
        f'+training={TRAINING_MODEL}',  # training config group to add, e.g. 'training_vector_model'
        f'scenario_builder={SCENARIO_BUILDER}',  # ['nuplan','nuplan_challenge','nuplan_mini']
        f'scenario_filter.limit_total_scenarios={SCENARIO_SELECTION}',  # cap on the number of scenarios used
        # ddp is not allowed in interactive environment, using ddp_spawn instead - this can bottleneck
        # the data pipeline, it is recommended to run training outside the notebook
        'lightning.trainer.params.accelerator=ddp_spawn',
        f'lightning.trainer.params.max_epochs={MAX_EPOCHS}',
        f'data_loader.params.batch_size={BATCH_SIZE}',
        'data_loader.params.num_workers=8',
    ])

    # Run the requested entry point (train / test / cache)
    engine = main_train(cfg)
    if engine is None:
        # NOTE(review): upstream nuPlan returns None from run_training.main for
        # py_func='cache'; confirm this fork behaves the same. Without this guard
        # the call below would raise AttributeError after the whole job has run.
        print('No engine returned; skipping visualization export.')
    else:
        engine.save_visualize_info(VIS_DIR)
    print("done. ")

    # The signature promises a str; previously the function implicitly returned None.
    return LOG_DIR
    
    
if __name__ == '__main__':
    # Settings that are identical for every experiment preset in this cell.
    shared_settings = {
        # Location of path with all simulation configs
        'CONFIG_PATH': '../nuplan/planning/script/config/training',
        'CONFIG_NAME': 'default_training',
        'PY_FUNC': 'train',  # ['train','test','cache']
        # Root directory for experiment output
        'SAVE_DIR': '/data1/nuplan/jiale/exp',
    }

    # Active preset: vector model on the mini split with a tiny budget
    # (20 scenarios, 1 epoch, batch size 1).
    vector_model_quick = {
        **shared_settings,
        'EXPERIMENT': 'vector_experiment',
        'JOB_NAME': 'vector_model',
        'TRAINING_MODEL': 'training_vector_model',
        'SCENARIO_BUILDER': 'nuplan_mini',  # ['nuplan','nuplan_challenge','nuplan_mini']
        'SCENARIO_SELECTION': 20,
        'MAX_EPOCHS': 1,
        'BATCH_SIZE': 1,
    }

    # Disabled presets. To run one, uncomment it and add it to ``train_dicts``.
    #
    # Raster Model
    # raster_model = {
    #     **shared_settings,
    #     'EXPERIMENT': 'raster_experiment',
    #     'JOB_NAME': 'raster_model',
    #     'TRAINING_MODEL': 'training_raster_model',
    #     'SCENARIO_BUILDER': 'nuplan_mini',
    #     'SCENARIO_SELECTION': 500,
    #     'MAX_EPOCHS': 10,
    #     'BATCH_SIZE': 8,
    # }
    #
    # Simple Vector Model
    # simple_vector_model = {
    #     **shared_settings,
    #     'EXPERIMENT': 'simple_vector_experiment',
    #     'JOB_NAME': 'simple_vector_model',
    #     'TRAINING_MODEL': 'training_simple_vector_model',
    #     'SCENARIO_BUILDER': 'nuplan_mini',
    #     'SCENARIO_SELECTION': 500,
    #     'MAX_EPOCHS': 10,
    #     'BATCH_SIZE': 8,
    # }
    #
    # Vector Model on the full nuplan split
    # vector_model_full = {
    #     **shared_settings,
    #     'EXPERIMENT': 'vector_experiment',
    #     'JOB_NAME': 'vector_model',
    #     'TRAINING_MODEL': 'training_vector_model',
    #     'SCENARIO_BUILDER': 'nuplan',
    #     'SCENARIO_SELECTION': 200000,  # paper: 0.2M
    #     'MAX_EPOCHS': 32,
    #     'BATCH_SIZE': 8,  # paper:128
    # }

    train_dicts = [vector_model_quick]

    # Run every selected preset in order.
    for train_dict in train_dicts:
        visualize(train_dict)

__LOG__/data1/nuplan/jiale/exp/vector_experiment/vector_model


Global seed set to 0


2023-03-12 20:05:43,838 INFO {/home/jiale/Documents/master/nuplan-devkit/nuplan/planning/script/builders/folder_builder.py:17}  Building experiment folders...
2023-03-12 20:05:43,838 INFO {/home/jiale/Documents/master/nuplan-devkit/nuplan/planning/script/builders/folder_builder.py:19}  Experimental folder: /data1/nuplan/jiale/exp/vector_experiment/vector_model/2023.03.12.20.05.43
2023-03-12 20:05:43,839 INFO {/home/jiale/Documents/master/nuplan-devkit/nuplan/planning/script/builders/worker_pool_builder.py:19}  Building WorkerPool...
2023-03-12 20:05:43,840 INFO {/home/jiale/Documents/master/nuplan-devkit/nuplan/planning/utils/multithreading/worker_ray.py:75}  Starting ray local!


2023-03-12 20:05:45,827	INFO worker.py:1553 -- Started a local Ray instance.


2023-03-12 20:05:48,000 INFO {/home/jiale/Documents/master/nuplan-devkit/nuplan/planning/utils/multithreading/worker_pool.py:101}  Worker: RayDistributed
2023-03-12 20:05:48,000 INFO {/home/jiale/Documents/master/nuplan-devkit/nuplan/planning/utils/multithreading/worker_pool.py:102}  Number of nodes: 1
Number of CPUs per node: 36
Number of GPUs per node: 3
Number of threads across all nodes: 36
2023-03-12 20:05:48,001 INFO {/home/jiale/Documents/master/nuplan-devkit/nuplan/planning/script/builders/worker_pool_builder.py:27}  Building WorkerPool...DONE!
2023-03-12 20:05:48,001 INFO {/home/jiale/Documents/master/nuplan-devkit/nuplan/planning/training/experiments/training.py:70}  Building training engine...
2023-03-12 20:05:48,002 INFO {/home/jiale/Documents/master/nuplan-devkit/nuplan/planning/script/builders/model_builder.py:18}  Building TorchModuleWrapper...
2023-03-12 20:05:48,220 INFO {/home/jiale/Documents/master/nuplan-devkit/nuplan/planning/script/builders/model_builder.py:21}  B

Ray objects: 100%|██████████| 36/36 [00:05<00:00,  7.07it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Using native 16bit precision.


2023-03-12 20:05:54,687 INFO {/home/jiale/Documents/master/nuplan-devkit/nuplan/planning/script/builders/scenario_builder.py:170}  Extracted 20 scenarios for training
2023-03-12 20:05:54,689 INFO {/home/jiale/Documents/master/nuplan-devkit/nuplan/planning/training/experiments/training.py:81}  Updating configs based on ddp_spawn strategy is currently not supported. Optimizer and LR Scheduler configs will not be updated.
2023-03-12 20:05:54,696 INFO {/home/jiale/Documents/master/nuplan-devkit/nuplan/planning/script/builders/training_callback_builder.py:19}  Building callbacks...
2023-03-12 20:05:54,716 INFO {/home/jiale/Documents/master/nuplan-devkit/nuplan/planning/script/builders/training_callback_builder.py:37}  Building callbacks...DONE!
2023-03-12 20:05:54,721 INFO {/home/jiale/Documents/master/nuplan-devkit/nuplan/planning/script/run_training.py:64}  Starting training...


  warn(
  warn(
Traceback (most recent call last):
  File "/home/jiale/anaconda3/envs/nuplan/lib/python3.9/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/home/jiale/anaconda3/envs/nuplan/lib/python3.9/site-packages/traitlets/config/application.py", line 1042, in launch_instance
    app.initialize(argv)
  File "/home/jiale/anaconda3/envs/nuplan/lib/python3.9/site-packages/traitlets/config/application.py", line 113, in inner
    return method(app, *args, **kwargs)
  File "/home/jiale/anaconda3/envs/nuplan/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 678, in initialize
    self.init_sockets()
  File "/home/jiale/anaconda3/envs/nuplan/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 317, in init_sockets
    self.shell_port = self._bind_socket(self.shell_socket, self.shell_port)
  File "/home/jiale/anaconda3/envs/nuplan/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 252, in _bind_socket
    return self._try_

2023-03-12 20:36:03,328 INFO {/home/jiale/anaconda3/envs/nuplan/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:194}  Added key: store_based_barrier_key:1 to store for rank: 0
2023-03-12 20:36:13,502 INFO {/home/jiale/anaconda3/envs/nuplan/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:212}  Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=3, worker_count=1, timeout=0:30:00)
2023-03-12 20:36:23,506 INFO {/home/jiale/anaconda3/envs/nuplan/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:212}  Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=3, worker_count=1, timeout=0:30:00)
2023-03-12 20:36:33,510 INFO {/home/jiale/anaconda3/envs/nuplan/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:212}  Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier

RuntimeError: Timed out initializing process group in store based barrier on rank: 0, for key: store_based_barrier_key:1 (world_size=3, worker_count=1, timeout=0:30:00)