diff --git a/MANIFEST.in b/MANIFEST.in
index 60f0b8b737888..513268b82816e 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -13,11 +13,16 @@ include LICENSE
 exclude *.sh
 exclude *.toml
 exclude *.svg
-recursive-include examples *.py
 recursive-include pytorch_lightning *.py
+# include examples
+recursive-include examples *.py
+recursive-include examples *.md
+recursive-include examples *.sh
+
 # exclude tests from package
 recursive-exclude tests *
+recursive-exclude site *
 exclude tests
 
 # Exclude the documentation files
diff --git a/examples/new_project_templates/lightning_module_template.py b/examples/new_project_templates/lightning_module_template.py
index 57f8c2cccde96..6cdd7677ad20f 100644
--- a/examples/new_project_templates/lightning_module_template.py
+++ b/examples/new_project_templates/lightning_module_template.py
@@ -240,15 +240,15 @@ def add_model_specific_args(parent_parser, root_dir): # pragma: no cover
         parser.add_argument('--out_features', default=10, type=int)
 
         # use 500 for CPU, 50000 for GPU to see speed difference
         parser.add_argument('--hidden_dim', default=50000, type=int)
-        parser.opt_list('--drop_prob', default=0.2, options=[0.2, 0.5], type=float, tunable=False)
+        parser.opt_list('--drop_prob', default=0.2, options=[0.2, 0.5], type=float, tunable=True)
+        parser.opt_list('--learning_rate', default=0.001 * 8, type=float,
+                        options=[0.0001, 0.0005, 0.001],
+                        tunable=True)
 
         # data
         parser.add_argument('--data_root', default=os.path.join(root_dir, 'mnist'), type=str)
 
         # training params (opt)
-        parser.opt_list('--learning_rate', default=0.001 * 8, type=float,
-                        options=[0.0001, 0.0005, 0.001, 0.005],
-                        tunable=False)
         parser.opt_list('--optimizer_name', default='adam', type=str,
                         options=['adam'], tunable=False)
diff --git a/examples/new_project_templates/multi_node_examples/README.md b/examples/new_project_templates/multi_node_examples/README.md
new file mode 100644
index 0000000000000..1150d47e4a8dc
--- /dev/null
+++ b/examples/new_project_templates/multi_node_examples/README.md
@@ -0,0 +1,37 @@
+# Multi-node examples
+Use these templates for multi-node training.
+
+## Simplest example
+1. Modify `minimal_multi_node_demo.py` to use your own CoolModel.
+2. Update and submit [this bash script]():
+```bash
+sbatch minimal_multi_node_demo_script.sh
+```
+
+## Grid search on a cluster
+
+#### Option 1: Run on cluster using your own SLURM script
+The trainer and model will work on a cluster if you configure your SLURM script correctly.
+
+1. Update [this demo SLURM script]().
+2. Submit the script:
+```bash
+$ sbatch demo_script.sh
+```
+
+Most people have some way of generating their SLURM scripts automatically.
+To run a grid search this way, you would need to generate one script for every combination of
+hyperparameters you want to search over.
+
+#### Option 2: Use test-tube to generate the SLURM scripts
+With test-tube we can generate a SLURM script for each hyperparameter combination automatically.
+
+To run this demo:
+```bash
+source activate YourCondaEnv
+
+python multi_node_cluster_auto_slurm.py --email your@email.com --gpu_partition your_partition --conda_env YourCondaEnv
+```
+
+This submits 6 jobs, one per combination of the tunable options in the template (2 drop_prob values x 3 learning_rate values).
+Each job runs on 2 nodes with 8 GPUs per node.
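Step 1 of the simplest example, and `minimal_multi_node_demo.py` added below, assume a `CoolModel` LightningModule taken from the project's main README; the demo only references it (`CoolModel()  # noqa: F821`). For convenience, a minimal sketch of such a module follows. The hook and decorator names (`tng_dataloader`, `validation_end`, `@ptl.data_loader`) are assumptions based on the template API of this era; double-check them against the installed version.

```python
import os

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import MNIST

import pytorch_lightning as ptl


class CoolModel(ptl.LightningModule):
    """Bare-minimum module: a single linear layer classifying MNIST digits."""

    def __init__(self):
        super(CoolModel, self).__init__()
        self.l1 = torch.nn.Linear(28 * 28, 10)

    def forward(self, x):
        # flatten the image and project to 10 class logits
        return torch.relu(self.l1(x.view(x.size(0), -1)))

    def training_step(self, batch, batch_nb):
        x, y = batch
        return {'loss': F.cross_entropy(self.forward(x), y)}

    def validation_step(self, batch, batch_nb):
        x, y = batch
        return {'val_loss': F.cross_entropy(self.forward(x), y)}

    def validation_end(self, outputs):
        # average the per-batch validation losses
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        return {'avg_val_loss': avg_loss}

    def configure_optimizers(self):
        return [torch.optim.Adam(self.parameters(), lr=0.02)]

    @ptl.data_loader
    def tng_dataloader(self):
        return DataLoader(MNIST(os.getcwd(), train=True, download=True,
                                transform=transforms.ToTensor()), batch_size=32)

    @ptl.data_loader
    def val_dataloader(self):
        return DataLoader(MNIST(os.getcwd(), train=False, download=True,
                                transform=transforms.ToTensor()), batch_size=32)

    @ptl.data_loader
    def test_dataloader(self):
        return DataLoader(MNIST(os.getcwd(), train=False, download=True,
                                transform=transforms.ToTensor()), batch_size=32)
```

Defining this class in (or importing it into) `minimal_multi_node_demo.py` is all that step 1 requires; everything multi-node specific stays in the Trainer arguments and the SLURM script.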
diff --git a/examples/new_project_templates/multi_node_examples/__init__.py b/examples/new_project_templates/multi_node_examples/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/examples/new_project_templates/multi_node_examples/demo_script.sh b/examples/new_project_templates/multi_node_examples/demo_script.sh
new file mode 100644
index 0000000000000..8188f9aced61f
--- /dev/null
+++ b/examples/new_project_templates/multi_node_examples/demo_script.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+#
+# Auto-generated by test-tube (https://github.com/williamFalcon/test-tube)
+#################
+
+# set a job name
+#SBATCH --job-name=lightning_test
+#################
+
+# a file for job output, you can check job progress
+#SBATCH --output=/slurm_output_%j.out
+#################
+
+# a file for errors
+#SBATCH --error=/slurm_output_%j.err
+#################
+
+# time needed for job
+#SBATCH --time=01:00:00
+#################
+
+# gpus per node
+#SBATCH --gres=gpu:8
+#################
+
+# cpus per task
+#SBATCH --cpus-per-task=10
+#################
+
+# number of requested nodes
+#SBATCH --nodes=2
+#################
+
+# memory per node (0 means all)
+#SBATCH --mem=0
+#################
+
+# slurm will send a signal this far out before it kills the job
+#SBATCH --signal=USR1@300
+#################
+
+# comment
+#SBATCH --comment=lightning_demo
+#################
+
+# 1 task per gpu
+#SBATCH --ntasks-per-node=8
+#################
+
+source activate YourEnv
+
+# debugging flags (optional)
+export NCCL_DEBUG=INFO
+export PYTHONFAULTHANDLER=1
+
+# random port between 12k and 32k
+export MASTER_PORT=$((12000 + RANDOM % 20000))
+
+srun python multi_node_own_slurm_script.py
\ No newline at end of file
diff --git a/examples/new_project_templates/multi_node_examples/minimal_multi_node_demo.py b/examples/new_project_templates/multi_node_examples/minimal_multi_node_demo.py
new file mode 100644
index 0000000000000..cdfaa62927dc0
--- /dev/null
+++ b/examples/new_project_templates/multi_node_examples/minimal_multi_node_demo.py
@@ -0,0 +1,24 @@
+from pytorch_lightning import Trainer
+from test_tube import Experiment
+import os
+
+
+def main():
+    # use the CoolModel from the main README.md
+    model = CoolModel()  # noqa: F821
+    exp = Experiment(save_dir=os.getcwd())
+
+    # train on 4 nodes with 4 GPUs per node (16 GPUs total)
+    trainer = Trainer(
+        experiment=exp,
+        distributed_backend='ddp',
+        max_nb_epochs=10,
+        gpus=4,
+        nb_gpu_nodes=4
+    )
+
+    trainer.fit(model)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/new_project_templates/multi_node_examples/minimal_multi_node_demo_script.sh b/examples/new_project_templates/multi_node_examples/minimal_multi_node_demo_script.sh
new file mode 100755
index 0000000000000..ffd1b532138dc
--- /dev/null
+++ b/examples/new_project_templates/multi_node_examples/minimal_multi_node_demo_script.sh
@@ -0,0 +1,14 @@
+#!/bin/bash -l
+
+# SLURM SUBMIT SCRIPT
+#SBATCH --nodes=4
+#SBATCH --gres=gpu:4
+#SBATCH --ntasks-per-node=4
+#SBATCH --mem=0
+#SBATCH --time=0-02:00:00
+
+# activate conda env
+conda activate my_env
+
+# run script from above
+python minimal_multi_node_demo.py
\ No newline at end of file
diff --git a/examples/new_project_templates/multi_node_cluster_template.py b/examples/new_project_templates/multi_node_examples/multi_node_cluster_auto_slurm.py
similarity index 88%
rename from examples/new_project_templates/multi_node_cluster_template.py
rename to examples/new_project_templates/multi_node_examples/multi_node_cluster_auto_slurm.py
index f47d84f1c7bd6..5ba6723ca9bfb 100644
--- a/examples/new_project_templates/multi_node_cluster_template.py
+++ b/examples/new_project_templates/multi_node_examples/multi_node_cluster_auto_slurm.py
@@ -75,12 +75,12 @@ def main(hparams, cluster):
     # ------------------------
     # 4 INIT TRAINER
     # ------------------------
+    gpus = list(range(0, hparams.per_experiment_nb_gpus))
     trainer = Trainer(
         experiment=exp,
-        cluster=cluster,
         checkpoint_callback=checkpoint,
         early_stop_callback=early_stop,
-        gpus=hparams.gpus,
+        gpus=gpus,
         nb_gpu_nodes=hyperparams.nb_gpu_nodes
     )
 
@@ -99,7 +99,7 @@ def optimize_on_cluster(hyperparams):
     )
 
     # email for cluster coms
-    cluster.notify_job_status(email='add_email_here', on_done=True, on_fail=True)
+    cluster.notify_job_status(email=hyperparams.email, on_done=True, on_fail=True)
 
     # configure cluster
     cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
@@ -109,7 +109,7 @@ def optimize_on_cluster(hyperparams):
     cluster.memory_mb_per_node = 0
 
     # any modules for code to run in env
-    cluster.add_command('source activate lightning')
+    cluster.add_command(f'source activate {hyperparams.conda_env}')
 
     # run only on 32GB voltas
     cluster.add_slurm_cmd(cmd='constraint', value='volta32gb',
@@ -121,7 +121,7 @@ def optimize_on_cluster(hyperparams):
 
     # creates and submits jobs to slurm
     cluster.optimize_parallel_cluster_gpu(
         main,
-        nb_trials=hyperparams.nb_hopt_trials,
+        nb_trials=hyperparams.num_hyperparam_trials,
         job_name=hyperparams.experiment_name
     )
@@ -139,15 +139,10 @@ def optimize_on_cluster(hyperparams):
     parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)
 
     # cluster args not defined inside the model
-    parent_parser.add_argument('--gpu_partition', type=str, help='consult your cluster manual')
-    # TODO: make 1 param
     parent_parser.add_argument('--per_experiment_nb_gpus', type=int,
-                               default=2, help='how many gpus to use in a node')
-    parent_parser.add_argument('--gpus', type=str, default='-1',
-                               help='how many gpus to use in the node')
-
-    parent_parser.add_argument('--nb_gpu_nodes', type=int, default=1,
+                               default=8, help='how many gpus to use in a node')
+    parent_parser.add_argument('--nb_gpu_nodes', type=int, default=2,
                                help='how many nodes to use in a cluster')
     parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir,
                                help='where to save logs')
@@ -157,9 +152,15 @@ def optimize_on_cluster(hyperparams):
                                help='where to save model')
    parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a',
                               help='test tube exp name')
-    parent_parser.add_argument('--nb_hopt_trials', type=int, default=1,
+    parent_parser.add_argument('--num_hyperparam_trials', type=int, default=6,
                                help='how many grid search trials to run')
+    parent_parser.add_argument('--email', type=str, default='add@email.com',
+                               help='email for jobs')
+    parent_parser.add_argument('--conda_env', type=str, default='base',
+                               help='conda environment to activate')
+    parent_parser.add_argument('--gpu_partition', type=str, help='consult your cluster manual')
+
     # allow model to overwrite or extend args
     parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
     hyperparams = parser.parse_args()
diff --git a/examples/new_project_templates/multi_node_examples/multi_node_own_slurm_script.py b/examples/new_project_templates/multi_node_examples/multi_node_own_slurm_script.py
new file mode 100644
index 0000000000000..7d4e286ab77c7
--- /dev/null
+++ b/examples/new_project_templates/multi_node_examples/multi_node_own_slurm_script.py
@@ -0,0 +1,70 @@
+"""
+Multi-node example (GPU)
+"""
+import os
+import numpy as np
+import torch
+
+from test_tube import HyperOptArgumentParser, Experiment
+from pytorch_lightning import Trainer
+from examples.new_project_templates.lightning_module_template import LightningTemplateModel
+
+SEED = 2334
+torch.manual_seed(SEED)
+np.random.seed(SEED)
+
+
+def main(hparams):
+    """
+    Main training routine specific for this project
+    :param hparams:
+    :return:
+    """
+    # ------------------------
+    # 1 INIT LIGHTNING MODEL
+    # ------------------------
+    model = LightningTemplateModel(hparams)
+
+    # ------------------------
+    # 2 INIT TEST TUBE EXP
+    # ------------------------
+    # init experiment
+    exp = Experiment(
+        name='test_exp',
+        save_dir=hparams.log_dir,
+        autosave=False,
+        description='test demo'
+    )
+
+    # ------------------------
+    # 3 INIT TRAINER
+    # ------------------------
+    trainer = Trainer(
+        experiment=exp,
+        gpus=[0, 1, 2, 3, 4, 5, 6, 7],
+        nb_gpu_nodes=2
+    )
+
+    # ------------------------
+    # 4 START TRAINING
+    # ------------------------
+    trainer.fit(model)
+
+
+if __name__ == '__main__':
+    # use current dir for logging
+    root_dir = os.path.dirname(os.path.realpath(__file__))
+    log_dir = os.path.join(root_dir, 'pt_lightning_demo_logs')
+
+    parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)
+    parent_parser.add_argument('--log_dir', type=str, default=log_dir,
+                               help='where to save logs')
+
+    # allow model to overwrite or extend args
+    parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
+    hyperparams = parser.parse_args()
+
+    # ---------------------
+    # RUN TRAINING
+    # ---------------------
+    main(hyperparams)
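A closing note on keeping the script and its SLURM submission consistent: `multi_node_own_slurm_script.py` hard-codes `gpus=[0, 1, 2, 3, 4, 5, 6, 7]` and `nb_gpu_nodes=2`, which must agree with `--gres=gpu:8`, `--ntasks-per-node=8` and `--nodes=2` in `demo_script.sh`. A small sketch of deriving those values from the job environment instead, assuming the standard `SLURM_NNODES` variable is set and that every GPU visible to the task should be used:

```python
import os

import torch
from test_tube import Experiment
from pytorch_lightning import Trainer

# test-tube experiment, as in the script above
exp = Experiment(name='test_exp', save_dir=os.getcwd())

# number of nodes in this allocation (exported by SLURM as SLURM_NNODES)
nb_nodes = int(os.environ.get('SLURM_NNODES', 1))

# one entry per GPU visible to this task
gpus = list(range(torch.cuda.device_count()))

trainer = Trainer(
    experiment=exp,
    gpus=gpus,
    nb_gpu_nodes=nb_nodes
)
```

With that, changing the node or GPU count only requires editing the #SBATCH lines, and the Trainer arguments follow automatically.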