From 65013ae2d59d57e459815fb51d22d061467224d9 Mon Sep 17 00:00:00 2001
From: Harsh Sharma
Date: Tue, 21 Jan 2020 11:42:58 -0700
Subject: [PATCH 1/5] implement forward and update args (#709)

Fixes the following issues as discussed in issue #709:
1) Implement a forward method that wraps the underlying model.
2) Set a default value for the seed; a default of "None" breaks TensorBoard.
3) Update the outdated hparams.data to the new hparams.data_path.
4) Update 'use-16bit' to 'use_16bit' to maintain consistency (see the note
   following this patch).
---
 .../full_examples/imagenet/imagenet_example.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/pl_examples/full_examples/imagenet/imagenet_example.py b/pl_examples/full_examples/imagenet/imagenet_example.py
index c2175da807e75..8458b6e39585d 100644
--- a/pl_examples/full_examples/imagenet/imagenet_example.py
+++ b/pl_examples/full_examples/imagenet/imagenet_example.py
@@ -34,9 +34,12 @@ def __init__(self, hparams):
         self.hparams = hparams
         self.model = models.__dict__[self.hparams.arch](pretrained=self.hparams.pretrained)
 
+    def forward(self, x):
+        return self.model(x)
+
     def training_step(self, batch, batch_idx):
         images, target = batch
-        output = self.model(images)
+        output = self.forward(images)
         loss_val = F.cross_entropy(output, target)
         acc1, acc5 = self.__accuracy(output, target, topk=(1, 5))
@@ -132,7 +135,7 @@ def train_dataloader(self):
                 std=[0.229, 0.224, 0.225],
             )
 
-        train_dir = os.path.join(self.hparams.data, 'train')
+        train_dir = os.path.join(self.hparams.data_path, 'train')
         train_dataset = datasets.ImageFolder(
             train_dir,
             transforms.Compose([
@@ -162,7 +165,7 @@ def val_dataloader(self):
             mean=[0.485, 0.456, 0.406],
             std=[0.229, 0.224, 0.225],
         )
-        val_dir = os.path.join(self.hparams.data, 'val')
+        val_dir = os.path.join(self.hparams.data_path, 'val')
         val_loader = torch.utils.data.DataLoader(
             datasets.ImageFolder(val_dir, transforms.Compose([
                 transforms.Resize(256),
@@ -185,7 +188,7 @@ def add_model_specific_args(parent_parser):  # pragma: no cover
                                  ' (default: resnet18)')
         parser.add_argument('--epochs', default=90, type=int, metavar='N',
                             help='number of total epochs to run')
-        parser.add_argument('--seed', type=int, default=None,
+        parser.add_argument('--seed', type=int, default=42,
                             help='seed for initializing training. ')
         parser.add_argument('-b', '--batch-size', default=256, type=int,
                             metavar='N',
@@ -214,7 +217,7 @@ def get_args():
                                help='how many gpus')
     parent_parser.add_argument('--distributed-backend', type=str, default='dp', choices=('dp', 'ddp', 'ddp2'),
                                help='supports three options dp, ddp, ddp2')
-    parent_parser.add_argument('--use-16bit', dest='use-16bit', action='store_true',
+    parent_parser.add_argument('--use-16bit', dest='use_16bit', action='store_true',
                                help='if true uses 16 bit precision')
     parent_parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                                help='evaluate model on validation set')
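A note on fix 4: argparse stores each option's value under its `dest` name, and
a dash in `dest` leaves the parsed value unreachable by ordinary attribute
access. A minimal, self-contained sketch of the pitfall (standard library only,
not part of the patch)::

    from argparse import ArgumentParser

    parser = ArgumentParser()
    # With dest='use-16bit', the value could only be read back via
    # getattr(args, 'use-16bit'), since `args.use-16bit` is not valid
    # Python syntax; dest='use_16bit' makes `args.use_16bit` work.
    parser.add_argument('--use-16bit', dest='use_16bit', action='store_true')

    args = parser.parse_args(['--use-16bit'])
    assert args.use_16bit is True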
From dfb6d3626eaa9ffaee7fe09c37659b9f21dfa6a0 Mon Sep 17 00:00:00 2001
From: Nic Eggert
Date: Tue, 21 Jan 2020 13:26:43 -0600
Subject: [PATCH 2/5] Fix failing GPU tests (#722)

* Fix distributed_backend=None test

  We now throw a warning instead of an exception. Update the test to
  reflect this (see the sketch after this patch).

* Fix test_tube logger close when debug=True
---
 pytorch_lightning/logging/test_tube.py | 5 +++--
 tests/test_gpu_models.py               | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/logging/test_tube.py b/pytorch_lightning/logging/test_tube.py
index ccb50ef3ec700..c1ebc8cad8ff4 100644
--- a/pytorch_lightning/logging/test_tube.py
+++ b/pytorch_lightning/logging/test_tube.py
@@ -135,8 +135,9 @@ def finalize(self, status):
     def close(self):
         # TODO: HACK figure out where this is being set to true
         self.experiment.debug = self.debug
-        exp = self.experiment
-        exp.close()
+        if not self.debug:
+            exp = self.experiment
+            exp.close()
 
     @property
     def rank(self):
diff --git a/tests/test_gpu_models.py b/tests/test_gpu_models.py
index 4608966103c16..ec7f36e838472 100644
--- a/tests/test_gpu_models.py
+++ b/tests/test_gpu_models.py
@@ -183,7 +183,7 @@ def test_multi_gpu_none_backend(tmpdir):
         gpus='-1'
     )
 
-    with pytest.raises(MisconfigurationException):
+    with pytest.warns(UserWarning):
        tutils.run_model_test(trainer_options, model)
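Background on the test change: `pytest.warns(UserWarning)` passes only when the
block emits a matching warning, just as `pytest.raises` requires an exception.
A minimal sketch of the idiom, where `configure_backend` is a hypothetical
stand-in for the trainer's fallback logic, not the real Lightning code::

    import warnings

    import pytest

    def configure_backend(gpus, distributed_backend):
        # Hypothetical stand-in: warn and fall back to 'dp' when GPUs
        # are requested without an explicit distributed backend.
        if gpus and distributed_backend is None:
            warnings.warn('No distributed backend given; falling back to dp.',
                          UserWarning)
            return 'dp'
        return distributed_backend

    def test_none_backend_warns():
        # Fails if no UserWarning is raised inside the block.
        with pytest.warns(UserWarning):
            assert configure_backend('-1', None) == 'dp'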
From f8d9f8f77364856c65da6d0743d3a7f98a788704 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 21 Jan 2020 15:18:32 -0500
Subject: [PATCH 3/5] Clean docs (#725)

* updated gitignore
* updated links in ninja file
* updated docs
* finished callbacks
* fixed left menu
* added callbacks to menu
* added direct links to docs
* finished rebase
* making private members
* working on trainer docs
* set auto dp if no backend
* fixed lightning import
* cleared spaces
* finished lightning module
* added callbacks
* added loggers
* flake 8
* fix docs path
* Update theme_variables.jinja
---
 .gitignore                                  |  3 +++
 docs/source/callbacks.rst                   | 14 ++++++++++++++
 docs/source/common-cases.rst                | 21 +++++++++++++++++++++
 docs/source/index.rst                       |  4 +---
 docs/source/lightning-module.rst            | 10 ++++++++++
 docs/source/logging.rst                     | 12 ++++++++++++
 docs/source/modules.rst                     |  7 +++++++
 docs/source/trainer.rst                     | 21 +++++++++++++++++++++
 docs/source/tutorials.rst                   | 20 ++++++++++++++++++++
 pytorch_lightning/callbacks/pt_callbacks.py | 10 +++++-----
 pytorch_lightning/core/lightning.py         |  9 +++------
 pytorch_lightning/logging/__init__.py       |  1 -
 pytorch_lightning/trainer/__init__.py       |  1 -
 pytorch_lightning/trainer/trainer.py        |  7 +++----
 pytorch_lightning/trainer/training_io.py    |  1 -
 15 files changed, 120 insertions(+), 21 deletions(-)
 create mode 100644 docs/source/callbacks.rst
 create mode 100644 docs/source/common-cases.rst
 create mode 100644 docs/source/lightning-module.rst
 create mode 100644 docs/source/logging.rst
 create mode 100644 docs/source/modules.rst
 create mode 100644 docs/source/trainer.rst
 create mode 100644 docs/source/tutorials.rst

diff --git a/.gitignore b/.gitignore
index d10d5aba3c082..41641211aff94 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@ app/models/
 pip-wheel-metadata/
 lightning_logs/
+
 # Test-tube
 test_tube_logs/
 test_tube_data/
@@ -14,6 +15,7 @@ test_tube_exp/
 # Documentations
 docs/source/pl_examples*.rst
 docs/source/pytorch_lightning*.rst
+tests/tests/
 /docs/source/*.md
 
 # Byte-compiled / optimized / DLL files
@@ -22,6 +24,7 @@ __pycache__/
 *$py.class
 timit_data/
+
 # C extensions
 *.so
diff --git a/docs/source/callbacks.rst b/docs/source/callbacks.rst
new file mode 100644
index 0000000000000..ae8dd25f1f93b
--- /dev/null
+++ b/docs/source/callbacks.rst
@@ -0,0 +1,14 @@
+.. role:: hidden
+    :class: hidden-section
+
+Callbacks
+===========
+.. 
automodule:: pytorch_lightning.callbacks + :exclude-members: + _del_model, + _save_model, + on_epoch_end, + on_train_end, + on_epoch_begin, + check_monitor_top_k, + on_train_begin, \ No newline at end of file diff --git a/docs/source/common-cases.rst b/docs/source/common-cases.rst new file mode 100644 index 0000000000000..7b96a93d84660 --- /dev/null +++ b/docs/source/common-cases.rst @@ -0,0 +1,21 @@ +Multi-gpu (same node) training +============================== + +Multi-node training +==================== + +16-bit precision +================= + +gradient clipping +================= + +modifying training via hooks +============================= + + + +.. toctree:: + :maxdepth: 3 + + pl_examples \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index 234331f1e248e..755bbfe5f2025 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -44,22 +44,20 @@ PyTorch-Lightning Documentation common-cases - .. toctree:: :maxdepth: 1 :name: community :caption: Community + CODE_OF_CONDUCT.md CONTRIBUTING.md BECOMING_A_CORE_CONTRIBUTOR.md governance.md - Indices and tables ------------------ * :ref:`genindex` * :ref:`modindex` * :ref:`search` - diff --git a/docs/source/lightning-module.rst b/docs/source/lightning-module.rst new file mode 100644 index 0000000000000..93c81537f65ca --- /dev/null +++ b/docs/source/lightning-module.rst @@ -0,0 +1,10 @@ +.. role:: hidden + :class: hidden-section + +LightningModule +=========== +.. automodule:: pytorch_lightning.core + :exclude-members: + _abc_impl, + summarize, + diff --git a/docs/source/logging.rst b/docs/source/logging.rst new file mode 100644 index 0000000000000..24f49f0ab15c0 --- /dev/null +++ b/docs/source/logging.rst @@ -0,0 +1,12 @@ +.. role:: hidden + :class: hidden-section + +Logging +=========== +.. automodule:: pytorch_lightning.logging + :exclude-members: + _abc_impl, + _save_model, + on_epoch_end, + on_train_end, + on_epoch_begin, diff --git a/docs/source/modules.rst b/docs/source/modules.rst new file mode 100644 index 0000000000000..e4c5121858c28 --- /dev/null +++ b/docs/source/modules.rst @@ -0,0 +1,7 @@ +pl_examples +=========== + +.. toctree:: + :maxdepth: 4 + + pl_examples diff --git a/docs/source/trainer.rst b/docs/source/trainer.rst new file mode 100644 index 0000000000000..db2657dc2c429 --- /dev/null +++ b/docs/source/trainer.rst @@ -0,0 +1,21 @@ +.. role:: hidden + :class: hidden-section + +Trainer +=========== +.. 
automodule:: pytorch_lightning.trainer + :members: fit, test + :exclude-members: + run_pretrain_routine, + _abc_impl, + _Trainer__set_root_gpu, + _Trainer__init_optimizers, + _Trainer__parse_gpu_ids, + _Trainer__configure_schedulers, + data_parallel, + num_gpus, + slurm_job_id, + tng_tqdm_dic, + training_tqdm_dict, + init_optimizers, + configure_schedulers diff --git a/docs/source/tutorials.rst b/docs/source/tutorials.rst new file mode 100644 index 0000000000000..7b66d141ff2f9 --- /dev/null +++ b/docs/source/tutorials.rst @@ -0,0 +1,20 @@ +Refactoring PyTorch into Lightning +================================== +`Tutorial `_ + +Start a research project +========================= +`Research seed `_ + +Basic Lightning use +==================== +`Tutorial `_ + +9 key Lightning tricks +======================== +`Tutorial <9 key speed features in Pytorch-Lightning>`_ + +Multi-node training on SLURM +============================= +`Tutorial `_ + diff --git a/pytorch_lightning/callbacks/pt_callbacks.py b/pytorch_lightning/callbacks/pt_callbacks.py index 42a8336fa7c9f..4c7d877a85bd6 100644 --- a/pytorch_lightning/callbacks/pt_callbacks.py +++ b/pytorch_lightning/callbacks/pt_callbacks.py @@ -1,15 +1,13 @@ """ Callbacks -========= - +==================================== Callbacks supported by Lightning """ -import logging import os import shutil +import logging import warnings - import numpy as np from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel @@ -165,7 +163,9 @@ def on_train_end(self, logs=None): class ModelCheckpoint(Callback): - r"""Save the model after every epoch. + r""" + + Save the model after every epoch. Args: filepath (str): path to save the model file. diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 2e8b634d8ce14..23757110c4aa8 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -5,15 +5,16 @@ from abc import ABC, abstractmethod from argparse import Namespace + import pandas as pd import torch import torch.distributed as dist - +# from pytorch_lightning.core.decorators import data_loader from pytorch_lightning.core.grads import GradInformation from pytorch_lightning.core.hooks import ModelHooks -from pytorch_lightning.core.memory import ModelSummary from pytorch_lightning.core.saving import ModelIO +from pytorch_lightning.core.memory import ModelSummary from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel @@ -679,7 +680,6 @@ def configure_apex(self, amp, model, optimizers, amp_level): @abstractmethod def configure_optimizers(self): r""" - This is where you choose what optimizers and learning-rate schedulers to use in your optimization. Normally you'd need one. But in the case of GANs or something more esoteric you might have multiple. @@ -1015,7 +1015,6 @@ def load_from_metrics(cls, weights_path, tags_csv, map_location=None): map_location (dict): A dictionary mapping saved weight GPU devices to new GPU devices (example: {'cuda:1':'cuda:0'}) - Return: LightningModule with loaded weights @@ -1136,7 +1135,6 @@ def summarize(self, mode): def freeze(self): r""" - Freeze all params for inference Example @@ -1168,7 +1166,6 @@ def unfreeze(self): def on_load_checkpoint(self, checkpoint): r""" - Called by lightning to restore your model. If you saved something with **on_save_checkpoint** this is your chance to restore this. 
diff --git a/pytorch_lightning/logging/__init__.py b/pytorch_lightning/logging/__init__.py
index 5fbb93cddc14d..2697e4e652aa8 100644
--- a/pytorch_lightning/logging/__init__.py
+++ b/pytorch_lightning/logging/__init__.py
@@ -71,7 +71,6 @@ def any_lightning_module_function_or_hook(...):
 Supported Loggers
 -----------------
 """
-
 from os import environ
 
 from .base import LightningLoggerBase, rank_zero_only
diff --git a/pytorch_lightning/trainer/__init__.py b/pytorch_lightning/trainer/__init__.py
index 98c2b99b56357..c18f22b821a3f 100644
--- a/pytorch_lightning/trainer/__init__.py
+++ b/pytorch_lightning/trainer/__init__.py
@@ -26,5 +26,4 @@
 """
 
 from .trainer import Trainer
-
 __all__ = ['Trainer']
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index cc9f9394f3961..820b2d8384858 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -1,9 +1,8 @@
-
-
-import logging
 import os
 import sys
 import warnings
+import logging
+
 import torch
 import torch.distributed as dist
@@ -20,6 +19,7 @@
     parse_gpu_ids,
     determine_root_gpu_device
 )
+
 from pytorch_lightning.trainer.evaluation_loop import TrainerEvaluationLoopMixin
 from pytorch_lightning.trainer.logging import TrainerLoggingMixin
 from pytorch_lightning.trainer.model_hooks import TrainerModelHooksMixin
@@ -850,7 +850,6 @@ def test(self, model=None):
         # run test from a loaded model
         model = LightningModule.load_from_checkpoint('path/to/checkpoint.ckpt')
         trainer = Trainer()
-
         trainer.test(model)
     """
     self.testing = True
diff --git a/pytorch_lightning/trainer/training_io.py b/pytorch_lightning/trainer/training_io.py
index 6ea819ba1691c..3c489132c739c 100644
--- a/pytorch_lightning/trainer/training_io.py
+++ b/pytorch_lightning/trainer/training_io.py
@@ -98,7 +98,6 @@
 from subprocess import call
 from argparse import Namespace
 
-import pandas as pd
 import torch
 import torch.distributed as dist
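One detail worth a gloss: trainer.rst above excludes members such as
`_Trainer__set_root_gpu`. Those are Python's name-mangled forms of
double-underscore methods, which is why they surface in the generated docs at
all. A minimal sketch of the mangling rule (the `Trainer` stub is illustrative,
not Lightning's actual class)::

    class Trainer:
        def __set_root_gpu(self, gpus):
            # Two leading underscores trigger name mangling at class scope.
            return gpus

    # The method is stored under its mangled, class-qualified name.
    assert hasattr(Trainer, '_Trainer__set_root_gpu')
    assert not hasattr(Trainer, '__set_root_gpu')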
From 3195d83d9c41ca401886b8d0ebc38438580171d7 Mon Sep 17 00:00:00 2001
From: Harsh Sharma
Date: Tue, 21 Jan 2020 14:10:21 -0700
Subject: [PATCH 5/5] use self.forward for val step (#709)

---
 pl_examples/full_examples/imagenet/imagenet_example.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pl_examples/full_examples/imagenet/imagenet_example.py b/pl_examples/full_examples/imagenet/imagenet_example.py
index 8458b6e39585d..ce2fbf6a12dd2 100644
--- a/pl_examples/full_examples/imagenet/imagenet_example.py
+++ b/pl_examples/full_examples/imagenet/imagenet_example.py
@@ -62,7 +62,7 @@ def training_step(self, batch, batch_idx):
 
     def validation_step(self, batch, batch_idx):
         images, target = batch
-        output = self.model(images)
+        output = self.forward(images)
         loss_val = F.cross_entropy(output, target)
         acc1, acc5 = self.__accuracy(output, target, topk=(1, 5))
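With patches 1 and 5 applied, both training_step and validation_step route
batches through self.forward and then score the logits with
`self.__accuracy(output, target, topk=(1, 5))`. That helper is not shown in
these hunks; the following is a minimal sketch of a conventional top-k
accuracy computation, an assumption about its behavior rather than the example
file's actual code::

    import torch

    def accuracy(output, target, topk=(1,)):
        """Precision@k for each requested k over one batch of logits."""
        maxk = max(topk)
        batch_size = target.size(0)

        # Indices of the maxk largest logits per sample -> (maxk, batch_size).
        _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # Count hits among the top k predictions, as a percentage.
            correct_k = correct[:k].reshape(-1).float().sum(0)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res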