diff --git a/docs/source/dataloaders.rst b/docs/source/dataloaders.rst
new file mode 100644
index 0000000000..efe932027b
--- /dev/null
+++ b/docs/source/dataloaders.rst
@@ -0,0 +1,21 @@
+AsynchronousLoader
+------------------
+This dataloader behaves identically to the standard PyTorch dataloader, but transfers data to the
+GPU asynchronously, overlapping the copy with training. You can also use it to wrap an existing dataloader.
+
+Example::
+
+    dataloader = AsynchronousLoader(DataLoader(ds, batch_size=16), device=device)
+    for b in dataloader:
+        ...
+
+.. autoclass:: pl_bolts.datamodules.async_dataloader.AsynchronousLoader
+   :noindex:
+
+------------------
+
+DummyDataset
+------------
+
+.. autoclass:: pl_bolts.datamodules.dummy_dataset.DummyDataset
+   :noindex:
diff --git a/docs/source/datamodules.rst b/docs/source/datamodules.rst
index 6158a7754c..6468326e5c 100644
--- a/docs/source/datamodules.rst
+++ b/docs/source/datamodules.rst
@@ -1,175 +1,36 @@
 .. role:: hidden
     :class: hidden-section
 
-Lightning DataModule
-====================
-Datasets in PyTorch, Lightning and general Deep learning research have 4 main parts:
+DataModules
+-----------
+DataModules (introduced in PyTorch Lightning 0.9.0) decouple the data from a model. A DataModule
+is simply a collection of a training dataloader, val dataloader and test dataloader. In addition,
+it takes care of:
 
-    1. A train split + dataloader
-    2. A val split + dataloader
-    3. A test split + dataloader
-    4. A step to download, split, etc...
+- Downloading/preparing data
+- Train/val/test splits
+- Transforms
 
-Step 4, also needs special care to make sure that it's only done on 1 GPU in a multi-GPU set-up.
-In addition, there are other challenges such as models that are built using information from the dataset
-such as needing to know image dimensions or number of classes.
-
-A datamodule simplifies all of these parts and has been integrated directly into Lightning in version 0.9.0.
-You can view the documentation for the datamodule in the `Pytorch Lightning docs here. <https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html>`_
-
-.. code-block:: python
-
-    class LitModel(pl.LightningModule):
-
-        def __init__(self, datamodule):
-            c, w, h = datamodule.size()
-            self.l1 = nn.Linear(128, datamodule.num_classes)
-            self.datamodule = datamodule
-
-        def prepare_data(self):
-            self.datamodule.prepare_data()
-
-        def train_dataloader(self)
-            return self.datamodule.train_dataloader()
-
-        def val_dataloader(self)
-            return self.datamodule.val_dataloader()
-
-        def test_dataloader(self)
-            return self.datamodule.test_dataloader()
-
-DataModules can also be used with plain PyTorch
-
-.. code-block:: python
-
-    from pl_bolts.datamodules import MNISTDataModule, CIFAR10DataModule
-
-    datamodule = CIFAR10DataModule(PATH)
-    train_loader = datamodule.train_dataloader()
-    val_loader = datamodule.train_dataloader()
-    test_loader = datamodule.train_dataloader()
-
-An advantage is that you can parametrize the data of your LightningModule
-
-.. code-block:: python
-
-    model = LitModel(datamodule = CIFAR10DataModule(PATH))
-    model = LitModel(datamodule = ImagenetDataModule(PATH))
-
-Or even bridge between SKLearn or numpy datasets
-
-.. code-block:: python
-
-    from sklearn.datasets import load_boston
-    from pl_bolts.datamodules import SklearnDataModule
-
-    X, y = load_boston(return_X_y=True)
-    datamodule = SklearnDataModule(X, y)
-
-    model = LitModel(datamodule)
-
-
-DataModule Advantages
----------------------
-Datamodules have two advantages:
-
-    1. You can guarantee that the exact same train, val and test splits can be used across models.
-    2. You can parameterize your model to be dataset agnostic.
+Then you can use it like this:
 
 Example::
 
-    from pl_bolts.datamodules import STL10DataModule, CIFAR10DataModule
+    dm = MNISTDataModule('path/to/data')
+    model = LitModel()
 
-    # use the same dataset on different models (with exactly the same splits)
-    stl10_model = LitModel(STL10DataModule(PATH))
-    stl10_model = CoolModel(STL10DataModule(PATH))
+    trainer = Trainer()
+    trainer.fit(model, dm)
 
-    # or make your model dataset agnostic
-    cifar10_model = LitModel(CIFAR10DataModule(PATH))
-
-Build a DataModule
-------------------
-Use this to build your own consistent train, validation, test splits.
+Or use it manually with plain PyTorch:
 
 Example::
 
-    from pytorch_lightning import LightningDataModule
-
-    class MyDataModule(LightningDataModule):
-
-        def __init__(self,...):
-
-        def prepare_data(self):
-            # download and do something to your data
-
-        def train_dataloader(self):
-            return DataLoader(...)
-
-        def val_dataloader(self):
-            return DataLoader(...)
-
-        def test_dataloader(self):
-            return DataLoader(...)
-
-Then use this in any model you want.
-
-Example::
-
-    class LitModel(pl.LightningModule):
-
-        def __init__(self, data_module=MyDataModule(PATH)):
-            super().__init()
-            self.dm = data_module
-
-        def prepare_data(self):
-            self.dm.prepare_data()
-
-        def train_dataloader(self):
-            return self.dm.train_dataloader()
-
-        def val_dataloader(self):
-            return self.dm.val_dataloader()
-
-        def test_dataloader(self):
-            return self.dm.test_dataloader()
-
-Asynchronous Loading
---------------------
-DataModules also includes an extra asynchronous dataloader for accelerating single GPU training.
-
-This dataloader behaves identically to the standard pytorch dataloader, but will transfer
-data asynchronously to the GPU with training. You can also use it to wrap an existing dataloader.
-
-Example::
-
-    from pl_bolts.datamodules.cifar10_dataset import CIFAR10
-    ds = CIFAR10(tmpdir)
-    device = torch.device('cuda', 0)
-
-    dataloader = AsynchronousLoader(ds, device=device)
-
-    for b in dataloader:
+    dm = MNISTDataModule('path/to/data')
+    for batch in dm.train_dataloader():
         ...
-
-or::
-
-    dataloader = AsynchronousLoader(DataLoader(ds, batch_size=16), device=device)
-
-    for b in dataloader:
+    for batch in dm.val_dataloader():
+        ...
+    for batch in dm.test_dataloader():
         ...
 
--------------
-
-DummyDataset
-------------
-
-.. autoclass:: pl_bolts.datamodules.dummy_dataset.DummyDataset
-   :noindex:
-
--------------
-
-AsynchronousLoader
-------------------
-
-.. autoclass:: pl_bolts.datamodules.async_dataloader.AsynchronousLoader
-   :noindex:
+Please visit the `PyTorch Lightning documentation <https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html>`_ for more details on DataModules.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 3b8179df14..7edb8f0ead 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -33,6 +33,13 @@ PyTorch-Lightning-Bolts documentation
    sklearn_datamodule
    vision_datamodules
 
+.. toctree::
+   :maxdepth: 2
+   :name: dataloaders
+   :caption: DataLoaders
+
+   dataloaders
+
 .. toctree::
    :maxdepth: 2
    :name: losses
diff --git a/docs/source/self_supervised_callbacks.rst b/docs/source/self_supervised_callbacks.rst
index 8f9083f4f1..2739a5aa09 100644
--- a/docs/source/self_supervised_callbacks.rst
+++ b/docs/source/self_supervised_callbacks.rst
@@ -9,7 +9,7 @@ Useful callbacks for self-supervised learning models
 
 BYOLMAWeightUpdate
 ------------------
-The exponential moving average weight-update rule from Bring Your Own Latent Space (BYOL).
+The exponential moving average weight-update rule from Bootstrap Your Own Latent (BYOL).
 
 .. autoclass:: pl_bolts.callbacks.self_supervised.BYOLMAWeightUpdate
    :noindex:
diff --git a/docs/source/vision_datamodules.rst b/docs/source/vision_datamodules.rst
index e44b1b1271..3ad554d347 100644
--- a/docs/source/vision_datamodules.rst
+++ b/docs/source/vision_datamodules.rst
@@ -9,16 +9,10 @@ Supervised learning
 These are standard vision datasets with the train, test, val splits pre-generated in DataLoaders with
 the standard transforms (and Normalization) values
 
-MNIST
-^^^^^
+Cityscapes
+^^^^^^^^^^
 
-.. autoclass:: pl_bolts.datamodules.mnist_datamodule.MNISTDataModule
-    :noindex:
-
-FashionMNIST
-^^^^^^^^^^^^
-
-.. autoclass:: pl_bolts.datamodules.fashion_mnist_datamodule.FashionMNISTDataModule
+.. autoclass:: pl_bolts.datamodules.cityscapes_datamodule.CityscapesDataModule
     :noindex:
 
 CIFAR-10
@@ -27,6 +21,12 @@ CIFAR-10
 .. autoclass:: pl_bolts.datamodules.cifar10_datamodule.CIFAR10DataModule
     :noindex:
 
+FashionMNIST
+^^^^^^^^^^^^
+
+.. autoclass:: pl_bolts.datamodules.fashion_mnist_datamodule.FashionMNISTDataModule
+    :noindex:
+
 
 Imagenet
 ^^^^^^^^
@@ -34,6 +34,12 @@ Imagenet
 .. autoclass:: pl_bolts.datamodules.imagenet_datamodule.ImagenetDataModule
     :noindex:
 
+MNIST
+^^^^^
+
+.. autoclass:: pl_bolts.datamodules.mnist_datamodule.MNISTDataModule
+    :noindex:
+
 ------------
 
 Semi-supervised learning
diff --git a/pl_bolts/datamodules/cifar10_datamodule.py b/pl_bolts/datamodules/cifar10_datamodule.py
index 1f2db4399b..f24ea1f9e3 100644
--- a/pl_bolts/datamodules/cifar10_datamodule.py
+++ b/pl_bolts/datamodules/cifar10_datamodule.py
@@ -26,6 +26,15 @@ def __init__(
             **kwargs,
     ):
         """
+        .. figure:: https://3qeqpr26caki16dnhd19sv6by6v-wpengine.netdna-ssl.com/wp-content/uploads/2019/01/
+            Plot-of-a-Subset-of-Images-from-the-CIFAR-10-Dataset.png
+            :width: 400
+            :alt: CIFAR-10
+
+        Specs:
+            - 10 classes (airplane, automobile, bird, cat, deer, dog, frog, horse, ship, truck)
+            - Each image is (3 x 32 x 32)
+
         Standard CIFAR10, train, val, test splits and transforms
 
         Transforms::
@@ -43,7 +52,9 @@ def __init__(
             from pl_bolts.datamodules import CIFAR10DataModule
 
             dm = CIFAR10DataModule(PATH)
-            model = LitModel(datamodule=dm)
+            model = LitModel()
+
+            Trainer().fit(model, dm)
 
         Or you can set your own transforms
 
diff --git a/pl_bolts/datamodules/cityscapes_datamodule.py b/pl_bolts/datamodules/cityscapes_datamodule.py
new file mode 100644
index 0000000000..731db055d1
--- /dev/null
+++ b/pl_bolts/datamodules/cityscapes_datamodule.py
@@ -0,0 +1,162 @@
+from pytorch_lightning import LightningDataModule
+from torch.utils.data import DataLoader, random_split
+from torchvision import transforms as transform_lib
+from torchvision.datasets import Cityscapes
+import torch
+
+
+class CityscapesDataModule(LightningDataModule):
+
+    name = 'Cityscapes'
+    extra_args = {}
+
+    def __init__(
+            self,
+            data_dir,
+            val_split=5000,
+            num_workers=16,
+            batch_size=32,
+            seed=42,
+            *args,
+            **kwargs,
+    ):
+        """
+        .. figure:: https://www.cityscapes-dataset.com/wordpress/wp-content/uploads/2015/07/muenster00-1024x510.png
+            :width: 400
+            :alt: Cityscape
+
+        Standard Cityscapes, train, val, test splits and transforms
+
+        Specs:
+            - 30 classes (road, person, sidewalk, etc...)
+            - (image, target) - image dims: (3 x 1024 x 2048), target: annotation map with the same height/width
+
+        Note:
+            torchvision cannot download Cityscapes, so extract the official archives into ``data_dir`` beforehand.
+            ``transform`` only touches the image; supply a ``target_transform`` via ``extra_args`` when the
+            targets have to be collated into tensor batches.
+
+        Transforms::
+
+            transforms = transform_lib.Compose([
+                transform_lib.ToTensor(),
+                transform_lib.Normalize(
+                    mean=[0.28689554, 0.32513303, 0.28389177],
+                    std=[0.18696375, 0.19017339, 0.18720214]
+                )
+            ])
+
+        Example::
+
+            from pl_bolts.datamodules import CityscapesDataModule
+
+            dm = CityscapesDataModule(PATH)
+            model = LitModel()
+
+            Trainer().fit(model, dm)
+
+        Or you can set your own transforms
+
+        Example::
+
+            dm.train_transforms = ...
+            dm.test_transforms = ...
+            dm.val_transforms  = ...
+
+        Args:
+            data_dir: where to load the data from
+            val_split: how many of the training images to hold out for validation (at most the train split size)
+            num_workers: how many workers to use for loading data
+            batch_size: number of examples per training/eval step
+            seed: seed for the random train/val partition
+        """
+        super().__init__(*args, **kwargs)
+        self.dims = (3, 1024, 2048)  # Cityscapes frames are 2048 x 1024 RGB images
+        self.DATASET = Cityscapes
+        self.data_dir = data_dir
+        self.val_split = val_split
+        self.num_workers = num_workers
+        self.batch_size = batch_size
+        self.seed = seed
+
+    @property
+    def num_classes(self):
+        """
+        Return:
+            30
+        """
+        return 30
+
+    def prepare_data(self):
+        """
+        Checks that the Cityscapes files are present in data_dir
+
+        Nothing is downloaded (Cityscapes needs a manual, registered download); building both splits
+        makes torchvision fail early when the files are missing.
+        """
+        self.DATASET(self.data_dir, split='train', transform=transform_lib.ToTensor(), **self.extra_args)
+        self.DATASET(self.data_dir, split='test', transform=transform_lib.ToTensor(), **self.extra_args)
+
+    def train_dataloader(self):
+        """
+        Cityscapes train set with removed subset to use for validation
+        """
+        transforms = self.default_transforms() if self.train_transforms is None else self.train_transforms
+        dataset_train, _ = self._split_train_val(transforms)
+        return self._build_loader(dataset_train, shuffle=True)
+
+    def val_dataloader(self):
+        """
+        Cityscapes val set uses a subset of the training set for validation
+        """
+        transforms = self.default_transforms() if self.val_transforms is None else self.val_transforms
+        _, dataset_val = self._split_train_val(transforms)
+        return self._build_loader(dataset_val, shuffle=False)
+
+    def test_dataloader(self):
+        """
+        Cityscapes test set uses the test split
+        """
+        transforms = self.default_transforms() if self.test_transforms is None else self.test_transforms
+        dataset = self.DATASET(self.data_dir, split='test', transform=transforms, **self.extra_args)
+        return self._build_loader(dataset, shuffle=False)
+
+    def default_transforms(self):
+        """Default image pipeline: tensor conversion followed by per-channel normalization."""
+        cityscapes_transforms = transform_lib.Compose([
+            transform_lib.ToTensor(),
+            transform_lib.Normalize(
+                mean=[0.28689554, 0.32513303, 0.28389177],
+                std=[0.18696375, 0.19017339, 0.18720214]
+            )
+        ])
+        return cityscapes_transforms
+
+    def _split_train_val(self, transforms):
+        """
+        Seeded random (train, val) partition of the train split. Both loaders reseed with the same
+        value, so they see the same partition and their subsets stay disjoint.
+        """
+        dataset = self.DATASET(self.data_dir, split='train', transform=transforms, **self.extra_args)
+        train_length = len(dataset)
+        # NOTE(review): the default val_split=5000 exceeds the 2975 fine-annotated train images - confirm it
+        if not 0 <= self.val_split <= train_length:
+            raise ValueError(
+                f'val_split={self.val_split} must be between 0 and the train split size ({train_length})'
+            )
+        return random_split(
+            dataset,
+            [train_length - self.val_split, self.val_split],
+            generator=torch.Generator().manual_seed(self.seed)
+        )
+
+    def _build_loader(self, dataset, shuffle):
+        """Wraps dataset in a DataLoader with the module's batch size and worker count."""
+        return DataLoader(
+            dataset,
+            batch_size=self.batch_size,
+            shuffle=shuffle,
+            num_workers=self.num_workers,
+            drop_last=True,
+            pin_memory=True
+        )
diff --git a/pl_bolts/datamodules/fashion_mnist_datamodule.py b/pl_bolts/datamodules/fashion_mnist_datamodule.py
index 2e97bde6b9..3e81ebdbf4 100644
--- a/pl_bolts/datamodules/fashion_mnist_datamodule.py
+++ b/pl_bolts/datamodules/fashion_mnist_datamodule.py
@@ -19,6 +19,15 @@ def __init__(
             **kwargs,
     ):
         """
+        .. figure:: https://3qeqpr26caki16dnhd19sv6by6v-wpengine.netdna-ssl.com/
+            wp-content/uploads/2019/02/Plot-of-a-Subset-of-Images-from-the-Fashion-MNIST-Dataset.png
+            :width: 400
+            :alt: Fashion MNIST
+
+        Specs:
+            - 10 classes (1 per type)
+            - Each image is (1 x 28 x 28)
+
         Standard FashionMNIST, train, val, test splits and transforms
 
         Transforms::
@@ -32,7 +41,9 @@ def __init__(
             from pl_bolts.datamodules import FashionMNISTDataModule
 
             dm = FashionMNISTDataModule('.')
-            model = LitModel(datamodule=dm)
+            model = LitModel()
+
+            Trainer().fit(model, dm)
 
         Args:
             data_dir: where to save/load the data
diff --git a/pl_bolts/datamodules/imagenet_datamodule.py b/pl_bolts/datamodules/imagenet_datamodule.py
index 248a54f39f..6daca3d6a8 100644
--- a/pl_bolts/datamodules/imagenet_datamodule.py
+++ b/pl_bolts/datamodules/imagenet_datamodule.py
@@ -24,6 +24,15 @@ def __init__(
             **kwargs,
     ):
         """
+        .. figure:: https://3qeqpr26caki16dnhd19sv6by6v-wpengine.netdna-ssl.com/wp-content/uploads/2017/08/
+            Sample-of-Images-from-the-ImageNet-Dataset-used-in-the-ILSVRC-Challenge.png
+            :width: 400
+            :alt: Imagenet
+
+        Specs:
+            - 1000 classes
+            - Each image is (3 x varies x varies) (here we default to 3 x 224 x 224)
+
         Imagenet train, val and test dataloaders.
 
         The train set is the imagenet train.
@@ -37,7 +46,10 @@ def __init__(
 
             from pl_bolts.datamodules import ImagenetDataModule
 
-            datamodule = ImagenetDataModule(IMAGENET_PATH)
+            dm = ImagenetDataModule(IMAGENET_PATH)
+            model = LitModel()
+
+            Trainer().fit(model, dm)
 
         Args:
 
diff --git a/pl_bolts/datamodules/mnist_datamodule.py b/pl_bolts/datamodules/mnist_datamodule.py
index 7b62c622ca..2adc03fc11 100644
--- a/pl_bolts/datamodules/mnist_datamodule.py
+++ b/pl_bolts/datamodules/mnist_datamodule.py
@@ -20,6 +20,14 @@ def __init__(
             **kwargs,
     ):
         """
+        .. figure:: https://miro.medium.com/max/744/1*AO2rIhzRYzFVQlFLx9DM9A.png
+            :width: 400
+            :alt: MNIST
+
+        Specs:
+            - 10 classes (1 per digit)
+            - Each image is (1 x 28 x 28)
+
         Standard MNIST, train, val, test splits and transforms
 
         Transforms::
@@ -33,7 +41,9 @@ def __init__(
             from pl_bolts.datamodules import MNISTDataModule
 
             dm = MNISTDataModule('.')
-            model = LitModel(datamodule=dm)
+            model = LitModel()
+
+            Trainer().fit(model, dm)
 
         Args:
             data_dir: where to save/load the data
diff --git a/pl_bolts/datamodules/stl10_datamodule.py b/pl_bolts/datamodules/stl10_datamodule.py
index 17b3dc4626..4e1961d627 100644
--- a/pl_bolts/datamodules/stl10_datamodule.py
+++ b/pl_bolts/datamodules/stl10_datamodule.py
@@ -24,6 +24,14 @@ def __init__(
             **kwargs,
     ):
         """
+        .. figure:: https://samyzaf.com/ML/cifar10/cifar1.jpg
+            :width: 400
+            :alt: STL-10
+
+        Specs:
+            - 10 classes (1 per type)
+            - Each image is (3 x 96 x 96)
+
         Standard STL-10, train, val, test splits and transforms.
         STL-10 has support for doing validation splits on the labeled or unlabeled splits
 
@@ -42,7 +50,9 @@ def __init__(
             from pl_bolts.datamodules import STL10DataModule
 
             dm = STL10DataModule(PATH)
-            model = LitModel(datamodule=dm)
+            model = LitModel()
+
+            Trainer().fit(model, dm)
 
         Args:
             data_dir: where to save/load the data
diff --git a/pl_bolts/models/self_supervised/byol/byol_module.py b/pl_bolts/models/self_supervised/byol/byol_module.py
index 45055f2a04..a918780e61 100644
--- a/pl_bolts/models/self_supervised/byol/byol_module.py
+++ b/pl_bolts/models/self_supervised/byol/byol_module.py
@@ -27,7 +27,7 @@ def __init__(self,
                  loss_temperature: float = 0.5,
                  **kwargs):
         """
-        PyTorch Lightning implementation of `Bring Your Own Latent Space (BYOL)
+        PyTorch Lightning implementation of `Bootstrap Your Own Latent (BYOL)
         <https://arxiv.org/pdf/2006.07733.pdf.>`_
 
         Paper authors: Jean-Bastien Grill ,Florian Strub, Florent Altché, Corentin Tallec, Pierre H. Richemond, \
diff --git a/setup.cfg b/setup.cfg
index 62dd227648..644db29746 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -27,6 +27,7 @@ omit =
     pl_bolts/datamodules/concat_dataset.py
     pl_bolts/datamodules/cifar10_datamodule.py
     pl_bolts/datamodules/fashion_mnist_datamodule.py
+    pl_bolts/datamodules/*datamodule.py
     pl_bolts/datamodules/_datamodule.py
 
 [flake8]