Demos #215 (Merged)

7 changes: 6 additions & 1 deletion MANIFEST.in
@@ -13,11 +13,16 @@ include LICENSE
 exclude *.sh
 exclude *.toml
 exclude *.svg
-recursive-include examples *.py
 recursive-include pytorch_lightning *.py
+
+# include examples
+recursive-include examples *.py
+recursive-include examples *.md
+recursive-include examples *.sh
+
 # exclude tests from package
 recursive-exclude tests *
 recursive-exclude site *
 exclude tests

 # Exclude the documentation files
8 changes: 4 additions & 4 deletions examples/new_project_templates/lightning_module_template.py
@@ -240,15 +240,15 @@ def add_model_specific_args(parent_parser, root_dir):  # pragma: no cover
         parser.add_argument('--out_features', default=10, type=int)
         # use 500 for CPU, 50000 for GPU to see speed difference
         parser.add_argument('--hidden_dim', default=50000, type=int)
-        parser.opt_list('--drop_prob', default=0.2, options=[0.2, 0.5], type=float, tunable=False)
+        parser.opt_list('--drop_prob', default=0.2, options=[0.2, 0.5], type=float, tunable=True)
+        parser.opt_list('--learning_rate', default=0.001 * 8, type=float,
+                        options=[0.0001, 0.0005, 0.001],
+                        tunable=True)

         # data
         parser.add_argument('--data_root', default=os.path.join(root_dir, 'mnist'), type=str)

         # training params (opt)
-        parser.opt_list('--learning_rate', default=0.001 * 8, type=float,
-                        options=[0.0001, 0.0005, 0.001, 0.005],
-                        tunable=False)
         parser.opt_list('--optimizer_name', default='adam', type=str,
                         options=['adam'], tunable=False)

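The change above makes `--drop_prob` tunable and adds a tunable `--learning_rate`, so a grid search sees 2 x 3 = 6 combinations. For context, a minimal sketch of how test-tube expands tunable `opt_list` flags into trials (assuming test-tube's `trials()` helper; not part of this diff):

```python
from test_tube import HyperOptArgumentParser

parser = HyperOptArgumentParser(strategy='grid_search')
parser.opt_list('--drop_prob', default=0.2, options=[0.2, 0.5],
                type=float, tunable=True)
parser.opt_list('--learning_rate', default=0.001, type=float,
                options=[0.0001, 0.0005, 0.001], tunable=True)
hparams = parser.parse_args()

# grid_search walks the cross product: 2 drop_probs x 3 learning rates
for trial in hparams.trials(6):
    print(trial.drop_prob, trial.learning_rate)
```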
37 changes: 37 additions & 0 deletions examples/new_project_templates/multi_node_examples/README.md
@@ -0,0 +1,37 @@
# Multi-node examples
Use these templates for multi-node training.

## Simplest example
1. Modify the demo script (`minimal_multi_node_demo.py`) to use your own `CoolModel`.
2. Update and submit [this bash script]():
```bash
sbatch minimal_multi_node_demo_script.sh
```
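For reference, a minimal sketch of what a `CoolModel` might look like, based on the LightningModule API of this release (`tng_dataloader` and `@pl.data_loader` are assumptions from that era's API); the main README holds the canonical version:

```python
import os

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import MNIST

import pytorch_lightning as pl


class CoolModel(pl.LightningModule):
    def __init__(self):
        super(CoolModel, self).__init__()
        self.l1 = torch.nn.Linear(28 * 28, 10)

    def forward(self, x):
        # flatten the image and apply a single linear layer
        return torch.relu(self.l1(x.view(x.size(0), -1)))

    def training_step(self, batch, batch_nb):
        x, y = batch
        return {'loss': F.cross_entropy(self.forward(x), y)}

    def configure_optimizers(self):
        return [torch.optim.Adam(self.parameters(), lr=0.02)]

    @pl.data_loader
    def tng_dataloader(self):
        # old-API name for the training dataloader
        return DataLoader(MNIST(os.getcwd(), train=True, download=True,
                                transform=transforms.ToTensor()), batch_size=32)
```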

## Grid search on a cluster

#### Option 1: Run on cluster using your own SLURM script
The trainer and model will work on a cluster if you configure your SLURM script correctly.

1. Update [this demo SLURM script]().
2. Submit the script:
```bash
$ sbatch demo_script.sh
```

Most people have some way of generating their SLURM scripts automatically. To run a grid search that way, you'd need to generate one script per combination of hyperparameters you want to search over, as sketched below.
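A hedged sketch of such a generator (the script name, flags, and job-file names here are invented for the example): it writes one sbatch file per grid point and submits each.

```python
import itertools
import os

# hypothetical search space; swap in your own flags
grid = {
    'learning_rate': [0.0001, 0.0005, 0.001],
    'drop_prob': [0.2, 0.5],
}

TEMPLATE = """#!/bin/bash
#SBATCH --nodes=2
#SBATCH --gres=gpu:8
#SBATCH --ntasks-per-node=8
#SBATCH --time=01:00:00
srun python your_script.py --learning_rate {learning_rate} --drop_prob {drop_prob}
"""

for i, values in enumerate(itertools.product(*grid.values())):
    hparams = dict(zip(grid.keys(), values))
    path = 'job_{}.sh'.format(i)
    with open(path, 'w') as f:
        f.write(TEMPLATE.format(**hparams))
    os.system('sbatch ' + path)  # submit this combination
```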

#### Option 2: Use test-tube for SLURM script
With test-tube we can automatically generate SLURM scripts for different hyperparameter combinations.

To run this demo:
```bash
source activate YourCondaEnv

python multi_node_cluster_auto_slurm.py --email your@email.com --gpu_partition your_partition --conda_env YourCondaEnv
```

That will submit 6 jobs, one per hyperparameter combination. Each job runs on 2 nodes, with 8 GPUs per node (16 GPUs per job).
Empty file.
@@ -0,0 +1,59 @@
#!/bin/bash
#
# Auto-generated by test-tube (https://github.com/williamFalcon/test-tube)
#################

# set a job name
#SBATCH --job-name=lightning_test
#################

# a file for job output, you can check job progress
#SBATCH --output=/slurm_output_%j.out
#################

# a file for errors
#SBATCH --error=/slurm_output_%j.err
#################

# time needed for job
#SBATCH --time=01:00:00
#################

# gpus per node
#SBATCH --gres=gpu:8
#################

# cpus per job
#SBATCH --cpus-per-task=10
#################

# number of requested nodes
#SBATCH --nodes=2
#################

# memory per node (0 means all)
#SBATCH --mem=0
#################

# slurm will send a signal this far out before it kills the job
#SBATCH --signal=USR1@300
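# (a Python process can catch SIGUSR1 to checkpoint and requeue itself)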
#################

# comment
#SBATCH --comment=lightning_demo
#################

# 1 task per gpu
#SBATCH --ntasks-per-node=8
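# (should match --gres=gpu:8 above: one ddp process per gpu)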
#################

source activate YourEnv

# debugging flags (optional)
export NCCL_DEBUG=INFO
export PYTHONFAULTHANDLER=1

# random port between 12k and 20k
export MASTER_PORT=$((12000 + RANDOM % 8000))
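# (sbatch runs this file once, on the first node; srun propagates the
# environment to every task, so all processes agree on MASTER_PORT)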

srun python multi_node_own_slurm_script.py
minimal_multi_node_demo.py
@@ -0,0 +1,24 @@
import os

from pytorch_lightning import Trainer
from test_tube import Experiment


def main():
    # use the cool model from the main README.md
    model = CoolModel()  # noqa: F821
    exp = Experiment(save_dir=os.getcwd())

    # train with 4 GPUs per node, across 4 nodes (16 GPUs total)
    trainer = Trainer(
        experiment=exp,
        distributed_backend='ddp',
        max_nb_epochs=10,
        gpus=4,
        nb_gpu_nodes=4
    )

    trainer.fit(model)


if __name__ == '__main__':
    main()
minimal_multi_node_demo_script.sh
@@ -0,0 +1,14 @@
#!/bin/bash -l

# SLURM SUBMIT SCRIPT
#SBATCH --nodes=4
#SBATCH --gres=gpu:4
#SBATCH --ntasks-per-node=4
#SBATCH --mem=0
#SBATCH --time=0-02:00:00

# activate conda env
conda activate my_env

# run script from above
python minimal_multi_node_demo.py
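To try it: `sbatch` submits the job; `squeue` only lists jobs already in the queue.

```bash
sbatch minimal_multi_node_demo_script.sh   # submit
squeue -u $USER                            # monitor its state in the queue
```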
multi_node_cluster_auto_slurm.py
@@ -75,12 +75,12 @@ def main(hparams, cluster):
     # ------------------------
     # 4 INIT TRAINER
     # ------------------------
+    gpus = list(range(0, hparams.per_experiment_nb_gpus))
     trainer = Trainer(
         experiment=exp,
         cluster=cluster,
         checkpoint_callback=checkpoint,
         early_stop_callback=early_stop,
-        gpus=hparams.gpus,
+        gpus=gpus,
         nb_gpu_nodes=hyperparams.nb_gpu_nodes
     )

@@ -99,7 +99,7 @@ def optimize_on_cluster(hyperparams):
     )

     # email for cluster coms
-    cluster.notify_job_status(email='add_email_here', on_done=True, on_fail=True)
+    cluster.notify_job_status(email=hyperparams.email, on_done=True, on_fail=True)

     # configure cluster
     cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
@@ -109,7 +109,7 @@ def optimize_on_cluster(hyperparams):
     cluster.memory_mb_per_node = 0

     # any modules for code to run in env
-    cluster.add_command('source activate lightning')
+    cluster.add_command(f'source activate {hyperparams.conda_env}')

     # run only on 32GB voltas
     cluster.add_slurm_cmd(cmd='constraint', value='volta32gb',
@@ -121,7 +121,7 @@ def optimize_on_cluster(hyperparams):
     # creates and submits jobs to slurm
     cluster.optimize_parallel_cluster_gpu(
         main,
-        nb_trials=hyperparams.nb_hopt_trials,
+        nb_trials=hyperparams.num_hyperparam_trials,
         job_name=hyperparams.experiment_name
     )

@@ -139,15 +139,10 @@ def optimize_on_cluster(hyperparams):
     parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)

     # cluster args not defined inside the model
-    parent_parser.add_argument('--gpu_partition', type=str, help='consult your cluster manual')
-
-    # TODO: make 1 param
     parent_parser.add_argument('--per_experiment_nb_gpus', type=int,
-                               default=2, help='how many gpus to use in a node')
-    parent_parser.add_argument('--gpus', type=str, default='-1',
-                               help='how many gpus to use in the node')
-
-    parent_parser.add_argument('--nb_gpu_nodes', type=int, default=1,
+                               default=8, help='how many gpus to use in a node')
+    parent_parser.add_argument('--nb_gpu_nodes', type=int, default=2,
                                help='how many nodes to use in a cluster')
     parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir,
                                help='where to save logs')
Expand All @@ -157,9 +152,15 @@ def optimize_on_cluster(hyperparams):
                                help='where to save model')
     parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a',
                                help='test tube exp name')
-    parent_parser.add_argument('--nb_hopt_trials', type=int, default=1,
+    parent_parser.add_argument('--num_hyperparam_trials', type=int, default=6,
                                help='how many grid search trials to run')
+
+    parent_parser.add_argument('--email', type=str, default='add@email.com',
+                               help='email for jobs')
+    parent_parser.add_argument('--conda_env', type=str, default='base',
+                               help='conda environment to activate for jobs')
+    parent_parser.add_argument('--gpu_partition', type=str, help='consult your cluster manual')

     # allow model to overwrite or extend args
     parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
     hyperparams = parser.parse_args()
multi_node_own_slurm_script.py
@@ -0,0 +1,70 @@
"""
Multi-node example (GPU)
"""
import os
import numpy as np
import torch

from test_tube import HyperOptArgumentParser, Experiment
from pytorch_lightning import Trainer
from examples.new_project_templates.lightning_module_template import LightningTemplateModel

SEED = 2334
torch.manual_seed(SEED)
np.random.seed(SEED)


def main(hparams):
    """
    Main training routine specific for this project
    :param hparams:
    :return:
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    model = LightningTemplateModel(hparams)

    # ------------------------
    # 2 INIT TEST TUBE EXP
    # ------------------------
    # init experiment
    exp = Experiment(
        name='test_exp',
        save_dir=hparams.log_dir,
        autosave=False,
        description='test demo'
    )

    # ------------------------
    # 3 INIT TRAINER
    # ------------------------
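    # 8 gpus per node x 2 nodes = 16 ddp processes, one per gpu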
    trainer = Trainer(
        experiment=exp,
        gpus=[0, 1, 2, 3, 4, 5, 6, 7],
        nb_gpu_nodes=2
    )

    # ------------------------
    # 4 START TRAINING
    # ------------------------
    trainer.fit(model)


if __name__ == '__main__':
    # use current dir for logging
    root_dir = os.path.dirname(os.path.realpath(__file__))
    log_dir = os.path.join(root_dir, 'pt_lightning_demo_logs')

    parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)
    parent_parser.add_argument('--log_dir', type=str, default=log_dir,
                               help='where to save logs')

    # allow model to overwrite or extend args
    parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
    hyperparams = parser.parse_args()

    # ---------------------
    # RUN TRAINING
    # ---------------------
    main(hyperparams)