diff --git a/.actions/helpers.py b/.actions/helpers.py
index e2abb58c1..2f53728e4 100644
--- a/.actions/helpers.py
+++ b/.actions/helpers.py
@@ -148,6 +148,7 @@ def _meta_file(folder: str) -> str:
     @staticmethod
     def augment_script(fpath: str):
         """Add template header and footer to the python base script.
+
         Args:
             fpath: path to python script
         """
@@ -313,6 +314,7 @@ def parse_requirements(dir_path: str):
     @staticmethod
     def copy_notebooks(path_root: str, path_docs_ipynb: str = "docs/source/notebooks"):
         """Copy all notebooks from a folder to doc folder.
+
         Args:
             path_root: source path to the project root in this tutorials
             path_docs_ipynb: destination path to the notebooks location
@@ -362,7 +364,7 @@ def update_env_details(dir_path: str):
         req = [r.strip() for r in req]

         def _parse(pkg: str, keys: str = " <=>") -> str:
-            """Parsing just the package name"""
+            """Parsing just the package name."""
             if any(c in pkg for c in keys):
                 ix = min(pkg.index(c) for c in keys if c in pkg)
                 pkg = pkg[:ix]
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 638e36141..04bbd219f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -29,6 +29,12 @@ repos:
       args: [--py36-plus]
       name: Upgrade code

+  - repo: https://github.com/myint/docformatter
+    rev: v1.4
+    hooks:
+      - id: docformatter
+        args: [--in-place, --wrap-summaries=115, --wrap-descriptions=120]
+
   - repo: https://github.com/PyCQA/isort
     rev: 5.9.2
     hooks:
diff --git a/course_UvA-DL/autoregressive-image-modeling/Autoregressive_Image_Modeling.py b/course_UvA-DL/autoregressive-image-modeling/Autoregressive_Image_Modeling.py
index 70087490d..3d970563d 100644
--- a/course_UvA-DL/autoregressive-image-modeling/Autoregressive_Image_Modeling.py
+++ b/course_UvA-DL/autoregressive-image-modeling/Autoregressive_Image_Modeling.py
@@ -184,14 +184,14 @@ def show_imgs(imgs):
 class MaskedConvolution(nn.Module):

     def __init__(self, c_in, c_out, mask, **kwargs):
-        """
-        Implements a convolution with mask applied on its weights.
-        Inputs:
-            c_in - Number of input channels
-            c_out - Number of output channels
-            mask - Tensor of shape [kernel_size_H, kernel_size_W] with 0s where
+        """Implements a convolution with a mask applied on its weights.
+
+        Args:
+            c_in: Number of input channels
+            c_out: Number of output channels
+            mask: Tensor of shape [kernel_size_H, kernel_size_W] with 0s where
                 the convolution should be masked, and 1s otherwise.
-            kwargs - Additional arguments for the convolution
+            kwargs: Additional arguments for the convolution
         """
         super().__init__()
         # For simplicity: calculate padding automatically
@@ -290,12 +290,12 @@ def __init__(self, c_in, c_out, kernel_size=3, mask_center=False, **kwargs):


 def show_center_recep_field(img, out):
-    """
-    Calculates the gradients of the input with respect to the output center pixel,
-    and visualizes the overall receptive field.
-    Inputs:
-        img - Input image for which we want to calculate the receptive field on.
-        out - Output features/loss which is used for backpropagation, and should be
+    """Calculates the gradients of the input with respect to the output center pixel, and visualizes the overall
+    receptive field.
+
+    Args:
+        img: Input image on which we want to calculate the receptive field.
+        out: Output features/loss which is used for backpropagation, and should be
            the output of the network/computation graph.
""" # Determine gradients @@ -476,9 +476,7 @@ def show_center_recep_field(img, out): class GatedMaskedConv(nn.Module): def __init__(self, c_in, **kwargs): - """ - Gated Convolution block implemented the computation graph shown above. - """ + """Gated Convolution block implemented the computation graph shown above.""" super().__init__() self.conv_vert = VerticalStackConvolution(c_in, c_out=2 * c_in, **kwargs) self.conv_horiz = HorizontalStackConvolution(c_in, c_out=2 * c_in, **kwargs) @@ -558,10 +556,10 @@ def __init__(self, c_in, c_hidden): self.example_input_array = train_set[0][0][None] def forward(self, x): - """ - Forward image through model and return logits for each pixel. - Inputs: - x - Image tensor with integer values between 0 and 255. + """Forward image through model and return logits for each pixel. + + Args: + x: Image tensor with integer values between 0 and 255. """ # Scale input from 0 to 255 back to -1 to 1 x = (x.float() / 255.0) * 2 - 1 @@ -589,11 +587,11 @@ def calc_likelihood(self, x): @torch.no_grad() def sample(self, img_shape, img=None): - """ - Sampling function for the autoregressive model. - Inputs: - img_shape - Shape of the image to generate (B,C,H,W) - img (optional) - If given, this tensor will be used as + """Sampling function for the autoregressive model. + + Args: + img_shape: Shape of the image to generate (B,C,H,W) + img (optional): If given, this tensor will be used as a starting image. The pixels to fill should be -1 in the input tensor. """ diff --git a/course_UvA-DL/deep-autoencoders/Deep_Autoencoders.py b/course_UvA-DL/deep-autoencoders/Deep_Autoencoders.py index 43d60a024..0cbaf4c53 100644 --- a/course_UvA-DL/deep-autoencoders/Deep_Autoencoders.py +++ b/course_UvA-DL/deep-autoencoders/Deep_Autoencoders.py @@ -133,11 +133,11 @@ def __init__( self, num_input_channels: int, base_channel_size: int, latent_dim: int, act_fn: object = nn.GELU ): """ - Inputs: - - num_input_channels : Number of input channels of the image. For CIFAR, this parameter is 3 - - base_channel_size : Number of channels we use in the first convolutional layers. Deeper layers might use a duplicate of it. - - latent_dim : Dimensionality of latent representation z - - act_fn : Activation function used throughout the encoder network + Args: + num_input_channels : Number of input channels of the image. For CIFAR, this parameter is 3 + base_channel_size : Number of channels we use in the first convolutional layers. Deeper layers might use a duplicate of it. + latent_dim : Dimensionality of latent representation z + act_fn : Activation function used throughout the encoder network """ super().__init__() c_hid = base_channel_size @@ -195,11 +195,11 @@ def __init__( self, num_input_channels: int, base_channel_size: int, latent_dim: int, act_fn: object = nn.GELU ): """ - Inputs: - - num_input_channels : Number of channels of the image to reconstruct. For CIFAR, this parameter is 3 - - base_channel_size : Number of channels we use in the last convolutional layers. Early layers might use a duplicate of it. - - latent_dim : Dimensionality of latent representation z - - act_fn : Activation function used throughout the decoder network + Args: + num_input_channels : Number of channels of the image to reconstruct. For CIFAR, this parameter is 3 + base_channel_size : Number of channels we use in the last convolutional layers. Early layers might use a duplicate of it. 
+           latent_dim : Dimensionality of latent representation z
+           act_fn : Activation function used throughout the decoder network
         """
         super().__init__()
         c_hid = base_channel_size
@@ -263,17 +263,13 @@ def __init__(
         self.example_input_array = torch.zeros(2, num_input_channels, width, height)

     def forward(self, x):
-        """
-        The forward function takes in an image and returns the reconstructed image
-        """
+        """The forward function takes in an image and returns the reconstructed image."""
         z = self.encoder(x)
         x_hat = self.decoder(z)
         return x_hat

     def _get_reconstruction_loss(self, batch):
-        """
-        Given a batch of images, this function returns the reconstruction loss (MSE in our case)
-        """
+        """Given a batch of images, this function returns the reconstruction loss (MSE in our case)."""
         x, _ = batch  # We do not need the labels
         x_hat = self.forward(x)
         loss = F.mse_loss(x, x_hat, reduction="none")
diff --git a/course_UvA-DL/deep-energy-based-generative-models/Deep_Energy_Models.py b/course_UvA-DL/deep-energy-based-generative-models/Deep_Energy_Models.py
index fedeff1cb..ecb5d5491 100644
--- a/course_UvA-DL/deep-energy-based-generative-models/Deep_Energy_Models.py
+++ b/course_UvA-DL/deep-energy-based-generative-models/Deep_Energy_Models.py
@@ -333,11 +333,11 @@ class Sampler:

     def __init__(self, model, img_shape, sample_size, max_len=8192):
         """
-        Inputs:
-            model - Neural network to use for modeling E_theta
-            img_shape - Shape of the images to model
-            sample_size - Batch size of the samples
-            max_len - Maximum number of data points to keep in the buffer
+        Args:
+            model: Neural network to use for modeling E_theta
+            img_shape: Shape of the images to model
+            sample_size: Batch size of the samples
+            max_len: Maximum number of data points to keep in the buffer
         """
         super().__init__()
         self.model = model
@@ -347,11 +347,11 @@ def __init__(self, model, img_shape, sample_size, max_len=8192):
         self.examples = [(torch.rand((1, ) + img_shape) * 2 - 1) for _ in range(self.sample_size)]

     def sample_new_exmps(self, steps=60, step_size=10):
-        """
-        Function for getting a new batch of "fake" images.
-        Inputs:
-            steps - Number of iterations in the MCMC algorithm
-            step_size - Learning rate nu in the algorithm above
+        """Function for getting a new batch of "fake" images.
+
+        Args:
+            steps: Number of iterations in the MCMC algorithm
+            step_size: Learning rate nu in the algorithm above
         """
         # Choose 95% of the batch from the buffer, 5% generate from scratch
         n_new = np.random.binomial(self.sample_size, 0.05)
@@ -369,14 +369,14 @@ def sample_new_exmps(self, steps=60, step_size=10):
     @staticmethod
     def generate_samples(model, inp_imgs, steps=60, step_size=10, return_img_per_step=False):
-        """
-        Function for sampling images for a given model.
-        Inputs:
-            model - Neural network to use for modeling E_theta
-            inp_imgs - Images to start from for sampling. If you want to generate new images, enter noise between -1 and 1.
-            steps - Number of iterations in the MCMC algorithm.
-            step_size - Learning rate nu in the algorithm above
-            return_img_per_step - If True, we return the sample at every iteration of the MCMC
+        """Function for sampling images for a given model.
+
+        Args:
+            model: Neural network to use for modeling E_theta
+            inp_imgs: Images to start from for sampling. If you want to generate new images, enter noise between -1 and 1.
+            steps: Number of iterations in the MCMC algorithm.
+            step_size: Learning rate nu in the algorithm above
+            return_img_per_step: If True, we return the sample at every iteration of the MCMC
         """
         # Before MCMC: set model parameters to "required_grad=False"
         # because we are only interested in the gradients of the input.
diff --git a/course_UvA-DL/graph-neural-networks/GNN_overview.py b/course_UvA-DL/graph-neural-networks/GNN_overview.py
index 460d434a5..5dd76395e 100644
--- a/course_UvA-DL/graph-neural-networks/GNN_overview.py
+++ b/course_UvA-DL/graph-neural-networks/GNN_overview.py
@@ -172,9 +172,9 @@ def __init__(self, c_in, c_out):

     def forward(self, node_feats, adj_matrix):
         """
-        Inputs:
-            node_feats - Tensor with node features of shape [batch_size, num_nodes, c_in]
-            adj_matrix - Batch of adjacency matrices of the graph. If there is an edge from i to j,
+        Args:
+            node_feats: Tensor with node features of shape [batch_size, num_nodes, c_in]
+            adj_matrix: Batch of adjacency matrices of the graph. If there is an edge from i to j,
                 adj_matrix[b,i,j]=1 else 0. Supports directed edges by non-symmetric matrices.
                 Assumes to already have added the identity connections.
                 Shape: [batch_size, num_nodes, num_nodes]
@@ -302,13 +302,13 @@ class GATLayer(nn.Module):

     def __init__(self, c_in, c_out, num_heads=1, concat_heads=True, alpha=0.2):
         """
-        Inputs:
-            c_in - Dimensionality of input features
-            c_out - Dimensionality of output features
-            num_heads - Number of heads, i.e. attention mechanisms to apply in parallel. The
+        Args:
+            c_in: Dimensionality of input features
+            c_out: Dimensionality of output features
+            num_heads: Number of heads, i.e. attention mechanisms to apply in parallel. The
                 output features are equally split up over the heads if concat_heads=True.
-            concat_heads - If True, the output of the different heads is concatenated instead of averaged.
-            alpha - Negative slope of the LeakyReLU activation.
+            concat_heads: If True, the output of the different heads is concatenated instead of averaged.
+            alpha: Negative slope of the LeakyReLU activation.
         """
         super().__init__()
         self.num_heads = num_heads
@@ -328,10 +328,10 @@ def __init__(self, c_in, c_out, num_heads=1, concat_heads=True, alpha=0.2):

     def forward(self, node_feats, adj_matrix, print_attn_probs=False):
         """
-        Inputs:
-            node_feats - Input features of the node. Shape: [batch_size, c_in]
-            adj_matrix - Adjacency matrix including self-connections. Shape: [batch_size, num_nodes, num_nodes]
-            print_attn_probs - If True, the attention weights are printed during the forward pass
+        Args:
+            node_feats: Input features of the node. Shape: [batch_size, c_in]
+            adj_matrix: Adjacency matrix including self-connections. Shape: [batch_size, num_nodes, num_nodes]
+            print_attn_probs: If True, the attention weights are printed during the forward pass
                 (for debugging purposes)
         """
         batch_size, num_nodes = node_feats.size(0), node_feats.size(1)
@@ -507,14 +507,14 @@ def __init__(
         **kwargs,
     ):
         """
-        Inputs:
-            c_in - Dimension of input features
-            c_hidden - Dimension of hidden features
-            c_out - Dimension of the output features. Usually number of classes in classification
-            num_layers - Number of "hidden" graph layers
-            layer_name - String of the graph layer to use
-            dp_rate - Dropout rate to apply throughout the network
-            kwargs - Additional arguments for the graph layer (e.g. number of heads for GAT)
+        Args:
+            c_in: Dimension of input features
+            c_hidden: Dimension of hidden features
+            c_out: Dimension of the output features. Usually number of classes in classification
+            num_layers: Number of "hidden" graph layers
+            layer_name: String of the graph layer to use
+            dp_rate: Dropout rate to apply throughout the network
+            kwargs: Additional arguments for the graph layer (e.g. number of heads for GAT)
         """
         super().__init__()
         gnn_layer = gnn_layer_by_name[layer_name]
@@ -533,9 +533,9 @@ def __init__(

     def forward(self, x, edge_index):
         """
-        Inputs:
-            x - Input features per node
-            edge_index - List of vertex index pairs representing the edges in the graph (PyTorch geometric notation)
+        Args:
+            x: Input features per node
+            edge_index: List of vertex index pairs representing the edges in the graph (PyTorch geometric notation)
         """
         for layer in self.layers:
             # For graph layers, we need to add the "edge_index" tensor as additional input
@@ -560,12 +560,12 @@ class MLPModel(nn.Module):

     def __init__(self, c_in, c_hidden, c_out, num_layers=2, dp_rate=0.1):
         """
-        Inputs:
-            c_in - Dimension of input features
-            c_hidden - Dimension of hidden features
-            c_out - Dimension of the output features. Usually number of classes in classification
-            num_layers - Number of hidden layers
-            dp_rate - Dropout rate to apply throughout the network
+        Args:
+            c_in: Dimension of input features
+            c_hidden: Dimension of hidden features
+            c_out: Dimension of the output features. Usually number of classes in classification
+            num_layers: Number of hidden layers
+            dp_rate: Dropout rate to apply throughout the network
         """
         super().__init__()
         layers = []
@@ -578,8 +578,8 @@ def __init__(self, c_in, c_hidden, c_out, num_layers=2, dp_rate=0.1):

     def forward(self, x, *args, **kwargs):
         """
-        Inputs:
-            x - Input features per node
+        Args:
+            x: Input features per node
         """
         return self.layers(x)
@@ -858,12 +858,12 @@ class GraphGNNModel(nn.Module):

     def __init__(self, c_in, c_hidden, c_out, dp_rate_linear=0.5, **kwargs):
         """
-        Inputs:
-            c_in - Dimension of input features
-            c_hidden - Dimension of hidden features
-            c_out - Dimension of output features (usually number of classes)
-            dp_rate_linear - Dropout rate before the linear layer (usually much higher than inside the GNN)
-            kwargs - Additional arguments for the GNNModel object
+        Args:
+            c_in: Dimension of input features
+            c_hidden: Dimension of hidden features
+            c_out: Dimension of output features (usually number of classes)
+            dp_rate_linear: Dropout rate before the linear layer (usually much higher than inside the GNN)
+            kwargs: Additional arguments for the GNNModel object
         """
         super().__init__()
         self.GNN = GNNModel(
@@ -876,10 +876,10 @@ def __init__(self, c_in, c_hidden, c_out, dp_rate_linear=0.5, **kwargs):

     def forward(self, x, edge_index, batch_idx):
         """
-        Inputs:
-            x - Input features per node
-            edge_index - List of vertex index pairs representing the edges in the graph (PyTorch geometric notation)
-            batch_idx - Index of batch element for each node
+        Args:
+            x: Input features per node
+            edge_index: List of vertex index pairs representing the edges in the graph (PyTorch geometric notation)
+            batch_idx: Index of batch element for each node
         """
         x = self.GNN(x, edge_index)
         x = geom_nn.global_mean_pool(x, batch_idx)  # Average pooling
diff --git a/course_UvA-DL/normalizing-flows/NF_image_modeling.py b/course_UvA-DL/normalizing-flows/NF_image_modeling.py
index 188881966..2ef9eb727 100644
--- a/course_UvA-DL/normalizing-flows/NF_image_modeling.py
+++ b/course_UvA-DL/normalizing-flows/NF_image_modeling.py
@@ -258,9 +258,9 @@ class ImageFlow(pl.LightningModule):

     def __init__(self, flows, import_samples=8):
         """
-        Inputs:
-            flows - A list of flows (each a nn.Module) that should be applied on the images.
-            import_samples - Number of importance samples to use during testing (see explanation below). Can be changed at any time
+        Args:
+            flows: A list of flows (each a nn.Module) that should be applied on the images.
+            import_samples: Number of importance samples to use during testing (see explanation below). Can be changed at any time
         """
         super().__init__()
         self.flows = nn.ModuleList(flows)
@@ -282,10 +282,10 @@ def encode(self, imgs):
         return z, ldj

     def _get_likelihood(self, imgs, return_ll=False):
-        """
-        Given a batch of images, return the likelihood of those.
-        If return_ll is True, this function returns the log likelihood of the input.
-        Otherwise, the ouptut metric is bits per dimension (scaled negative log likelihood)
+        """Given a batch of images, return the likelihood of those.
+
+        If return_ll is True, this function returns the log likelihood of the input. Otherwise, the output metric is
+        bits per dimension (scaled negative log likelihood)
         """
         z, ldj = self.encode(imgs)
         log_pz = self.prior.log_prob(z).sum(dim=[1, 2, 3])
@@ -297,9 +297,7 @@ def _get_likelihood(self, imgs, return_ll=False):
     @torch.no_grad()
     def sample(self, img_shape, z_init=None):
-        """
-        Sample a batch of images from the flow.
-        """
+        """Sample a batch of images from the flow."""
         # Sample latent representation from prior
         if z_init is None:
             z = self.prior.sample(sample_shape=img_shape).to(device)
@@ -401,10 +399,10 @@ class Dequantization(nn.Module):

     def __init__(self, alpha=1e-5, quants=256):
         """
-        Inputs:
-            alpha - small constant that is used to scale the original input.
+        Args:
+            alpha: small constant that is used to scale the original input.
                 Prevents dealing with values very close to 0 and 1 when inverting the sigmoid
-            quants - Number of possible discrete values (usually 256 for 8-bit image)
+            quants: Number of possible discrete values (usually 256 for 8-bit image)
         """
         super().__init__()
         self.alpha = alpha
@@ -484,9 +482,7 @@ def dequant(self, z, ldj):


 def visualize_dequantization(quants, prior=None):
-    """
-    Function for visualizing the dequantization values of discrete values in continuous space
-    """
+    """Function for visualizing the dequantization values of discrete values in continuous space."""
     # Prior over discrete values. If not given, a uniform is assumed
     if prior is None:
         prior = np.ones(quants, dtype=np.float32) / quants
@@ -592,9 +588,9 @@ class VariationalDequantization(Dequantization):

     def __init__(self, var_flows, alpha=1e-5):
         """
-        Inputs:
-            var_flows - A list of flow transformations to use for modeling q(u|x)
-            alpha - Small constant, see Dequantization for details
+        Args:
+            var_flows: A list of flow transformations to use for modeling q(u|x)
+            alpha: Small constant, see Dequantization for details
         """
         super().__init__(alpha=alpha)
         self.flows = nn.ModuleList(var_flows)
@@ -658,14 +654,14 @@ def dequant(self, z, ldj):
 class CouplingLayer(nn.Module):

     def __init__(self, network, mask, c_in):
-        """
-        Coupling layer inside a normalizing flow.
-        Inputs:
-            network - A PyTorch nn.Module constituting the deep neural network for mu and sigma.
+        """Coupling layer inside a normalizing flow.
+
+        Args:
+            network: A PyTorch nn.Module constituting the deep neural network for mu and sigma.
                 Output shape should be twice the channel size as the input.
-            mask - Binary mask (0 or 1) where 0 denotes that the element should be transformed,
+            mask: Binary mask (0 or 1) where 0 denotes that the element should be transformed,
                 while 1 means the latent will be used as input to the NN.
-            c_in - Number of input channels
+            c_in: Number of input channels
         """
         super().__init__()
         self.network = network
@@ -676,12 +672,12 @@ def __init__(self, network, mask, c_in):

     def forward(self, z, ldj, reverse=False, orig_img=None):
         """
-        Inputs:
-            z - Latent input to the flow
-            ldj - The current ldj of the previous flows.
+        Args:
+            z: Latent input to the flow
+            ldj: The current ldj of the previous flows.
                 The ldj of this layer will be added to this tensor.
-            reverse - If True, we apply the inverse of the layer.
-            orig_img (optional) - Only needed in VarDeq. Allows external
+            reverse: If True, we apply the inverse of the layer.
+            orig_img (optional): Only needed in VarDeq. Allows external
                 input to condition the flow on (e.g. original image)
         """
         # Apply network to masked input
@@ -794,8 +790,8 @@ def create_channel_mask(c_in, invert=False):

 # %%
 class ConcatELU(nn.Module):
-    """
-    Activation function that applies ELU in both direction (inverted and plain).
+    """Activation function that applies ELU in both directions (inverted and plain).
+
     Allows non-linearity while providing strong gradients for any input (important for final convolution)
     """

@@ -806,10 +802,10 @@ def forward(self, x):

 class LayerNormChannels(nn.Module):

     def __init__(self, c_in):
-        """
-        This module applies layer norm across channels in an image. Has been shown to work well with ResNet connections.
-        Inputs:
-            c_in - Number of channels of the input
+        """This module applies layer norm across channels in an image.
+
+        Has been shown to work well with ResNet connections.
+        Args:
+            c_in: Number of channels of the input
         """
         super().__init__()
         self.layer_norm = nn.LayerNorm(c_in)
@@ -826,9 +823,9 @@ class GatedConv(nn.Module):

     def __init__(self, c_in, c_hidden):
         """
         This module applies a two-layer convolutional ResNet block with input gate
-        Inputs:
-            c_in - Number of channels of the input
-            c_hidden - Number of hidden dimensions we want to model (usually similar to c_in)
+        Args:
+            c_in: Number of channels of the input
+            c_hidden: Number of hidden dimensions we want to model (usually similar to c_in)
         """
         super().__init__()
         self.net = nn.Sequential(
@@ -845,13 +842,13 @@ def forward(self, x):

 class GatedConvNet(nn.Module):

     def __init__(self, c_in, c_hidden=32, c_out=-1, num_layers=3):
-        """
-        Module that summarizes the previous blocks to a full convolutional neural network.
-        Inputs:
-            c_in - Number of input channels
-            c_hidden - Number of hidden dimensions to use within the network
-            c_out - Number of output channels. If -1, 2 times the input channels are used (affine coupling)
-            num_layers - Number of gated ResNet blocks to apply
+        """Module that summarizes the previous blocks to a full convolutional neural network.
+
+        Args:
+            c_in: Number of input channels
+            c_hidden: Number of hidden dimensions to use within the network
+            c_out: Number of output channels. If -1, 2 times the input channels are used (affine coupling)
+            num_layers: Number of gated ResNet blocks to apply
         """
         super().__init__()
         c_out = c_out if c_out > 0 else 2 * c_in
@@ -1257,10 +1254,10 @@ def print_num_params(model):
 @torch.no_grad()
 def interpolate(model, img1, img2, num_steps=8):
     """
-    Inputs:
-        model - object of ImageFlow class that represents the (trained) flow model
-        img1, img2 - Image tensors of shape [1, 28, 28]. Images between which should be interpolated.
-        num_steps - Number of interpolation steps. 8 interpolation steps mean 6 intermediate pictures besides img1 and img2
+    Args:
+        model: object of ImageFlow class that represents the (trained) flow model
+        img1, img2: Image tensors of shape [1, 28, 28]. Images between which should be interpolated.
+        num_steps: Number of interpolation steps. 8 interpolation steps mean 6 intermediate pictures besides img1 and img2
     """
     imgs = torch.stack([img1, img2], dim=0).to(model.device)
     z, _ = model.encode(imgs)
@@ -1331,9 +1328,9 @@ def interpolate(model, img1, img2, num_steps=8):

 # %%
 def visualize_dequant_distribution(model: ImageFlow, imgs: torch.Tensor, title: str = None):
     """
-    Inputs:
-        model - The flow of which we want to visualize the dequantization distribution
-        imgs - Example training images of which we want to visualize the dequantization distribution
+    Args:
+        model: The flow of which we want to visualize the dequantization distribution
+        imgs: Example training images of which we want to visualize the dequantization distribution
     """
     imgs = imgs.to(device)
     ldj = torch.zeros(imgs.shape[0], dtype=torch.float32).to(device)
diff --git a/course_UvA-DL/transformers-and-MH-attention/Transformers_MHAttention.py b/course_UvA-DL/transformers-and-MH-attention/Transformers_MHAttention.py
index a04e1474e..85215af77 100644
--- a/course_UvA-DL/transformers-and-MH-attention/Transformers_MHAttention.py
+++ b/course_UvA-DL/transformers-and-MH-attention/Transformers_MHAttention.py
@@ -459,11 +459,11 @@ class EncoderBlock(nn.Module):

     def __init__(self, input_dim, num_heads, dim_feedforward, dropout=0.0):
         """
-        Inputs:
-            input_dim - Dimensionality of the input
-            num_heads - Number of heads to use in the attention block
-            dim_feedforward - Dimensionality of the hidden layer in the MLP
-            dropout - Dropout probability to use in the dropout layers
+        Args:
+            input_dim: Dimensionality of the input
+            num_heads: Number of heads to use in the attention block
+            dim_feedforward: Dimensionality of the hidden layer in the MLP
+            dropout: Dropout probability to use in the dropout layers
         """
         super().__init__()
@@ -568,9 +568,9 @@ class PositionalEncoding(nn.Module):

     def __init__(self, d_model, max_len=5000):
         """
-        Inputs
-            d_model - Hidden dimensionality of the input.
-            max_len - Maximum length of a sequence to expect.
+        Args:
+            d_model: Hidden dimensionality of the input.
+            max_len: Maximum length of a sequence to expect.
         """
         super().__init__()
@@ -759,17 +759,17 @@ def __init__(
         input_dropout=0.0
     ):
         """
-        Inputs:
-            input_dim - Hidden dimensionality of the input
-            model_dim - Hidden dimensionality to use inside the Transformer
-            num_classes - Number of classes to predict per sequence element
-            num_heads - Number of heads to use in the Multi-Head Attention blocks
-            num_layers - Number of encoder blocks to use.
-            lr - Learning rate in the optimizer
-            warmup - Number of warmup steps. Usually between 50 and 500
-            max_iters - Number of maximum iterations the model is trained for. This is needed for the CosineWarmup scheduler
-            dropout - Dropout to apply inside the model
-            input_dropout - Dropout to apply on the input features
+        Args:
+            input_dim: Hidden dimensionality of the input
+            model_dim: Hidden dimensionality to use inside the Transformer
+            num_classes: Number of classes to predict per sequence element
+            num_heads: Number of heads to use in the Multi-Head Attention blocks
+            num_layers: Number of encoder blocks to use.
+            lr: Learning rate in the optimizer
+            warmup: Number of warmup steps. Usually between 50 and 500
+            max_iters: Number of maximum iterations the model is trained for. This is needed for the CosineWarmup scheduler
+            dropout: Dropout to apply inside the model
+            input_dropout: Dropout to apply on the input features
         """
         super().__init__()
         self.save_hyperparameters()
@@ -799,10 +799,10 @@ def _create_model(self):

     def forward(self, x, mask=None, add_positional_encoding=True):
         """
-        Inputs:
-            x - Input features of shape [Batch, SeqLen, input_dim]
-            mask - Mask to apply on the attention outputs (optional)
-            add_positional_encoding - If True, we add the positional encoding to the input.
+        Args:
+            x: Input features of shape [Batch, SeqLen, input_dim]
+            mask: Mask to apply on the attention outputs (optional)
+            add_positional_encoding: If True, we add the positional encoding to the input.
                 Might not be desired for some tasks.
         """
         x = self.input_net(x)
@@ -814,8 +814,8 @@ def forward(self, x, mask=None, add_positional_encoding=True):
     @torch.no_grad()
     def get_attention_maps(self, x, mask=None, add_positional_encoding=True):
-        """
-        Function for extracting the attention matrices of the whole Transformer for a single batch.
+        """Function for extracting the attention matrices of the whole Transformer for a single batch.
+
         Input arguments same as the forward pass.
         """
         x = self.input_net(x)
@@ -1280,11 +1280,11 @@ class SetAnomalyDataset(data.Dataset):

     def __init__(self, img_feats, labels, set_size=10, train=True):
         """
-        Inputs:
-            img_feats - Tensor of shape [num_imgs, img_dim]. Represents the high-level features.
-            labels - Tensor of shape [num_imgs], containing the class labels for the images
-            set_size - Number of elements in a set. N-1 are sampled from one class, and one from another one.
-            train - If True, a new set will be sampled every time __getitem__ is called.
+        Args:
+            img_feats: Tensor of shape [num_imgs, img_dim]. Represents the high-level features.
+            labels: Tensor of shape [num_imgs], containing the class labels for the images
+            set_size: Number of elements in a set. N-1 are sampled from one class, and one from another one.
+            train: If True, a new set will be sampled every time __getitem__ is called.
         """
         super().__init__()
         self.img_feats = img_feats
@@ -1309,8 +1309,8 @@ def _create_test_sets(self):
         return test_sets

     def sample_img_set(self, anomaly_label):
-        """
-        Samples a new set of images, given the label of the anomaly.
+        """Samples a new set of images, given the label of the anomaly.
+
         The sampled images come from a different class than anomaly_label
         """
         # Sample class from 0,...,num_classes-1 while skipping anomaly_label as class
diff --git a/lightning_examples/reinforce-learning-DQN/dqn.py b/lightning_examples/reinforce-learning-DQN/dqn.py
index 26176d00d..112916da3 100644
--- a/lightning_examples/reinforce-learning-DQN/dqn.py
+++ b/lightning_examples/reinforce-learning-DQN/dqn.py
@@ -7,6 +7,7 @@
 import numpy as np
 import torch
 from pytorch_lightning import LightningModule, Trainer
+from pytorch_lightning.utilities import DistributedType
 from torch import nn, Tensor
 from torch.optim import Adam, Optimizer
 from torch.utils.data import DataLoader
@@ -18,9 +19,7 @@

 # %%
 class DQN(nn.Module):
-    """
-    Simple MLP network
-    """
+    """Simple MLP network."""

     def __init__(self, obs_size: int, n_actions: int, hidden_size: int = 128):
         """
@@ -54,8 +53,7 @@ def forward(self, x):

 # %%
 class ReplayBuffer:
-    """
-    Replay Buffer for storing past experiences allowing the agent to learn from them
+    """Replay Buffer for storing past experiences allowing the agent to learn from them.

     Args:
         capacity: size of the buffer
@@ -68,8 +66,7 @@ def __len__(self) -> None:
         return len(self.buffer)

     def append(self, experience: Experience) -> None:
-        """
-        Add experience to the buffer
+        """Add experience to the buffer.

         Args:
             experience: tuple (state, action, reward, done, new_state)
@@ -91,9 +88,7 @@ def sample(self, batch_size: int) -> Tuple:

 # %%
 class RLDataset(IterableDataset):
-    """
-    Iterable Dataset containing the ExperienceBuffer
-    which will be updated with new experiences during training
+    """Iterable Dataset containing the ExperienceBuffer which will be updated with new experiences during training.

     Args:
         buffer: replay buffer
@@ -116,9 +111,7 @@ def __iter__(self) -> Tuple:

 # %%
 class Agent:
-    """
-    Base Agent class handeling the interaction with the environment
-    """
+    """Base Agent class handling the interaction with the environment."""

     def __init__(self, env: gym.Env, replay_buffer: ReplayBuffer) -> None:
         """
@@ -132,12 +125,11 @@ def __init__(self, env: gym.Env, replay_buffer: ReplayBuffer) -> None:
         self.state = self.env.reset()

     def reset(self) -> None:
-        """ Resents the environment and updates the state"""
+        """Resets the environment and updates the state."""
         self.state = self.env.reset()

     def get_action(self, net: nn.Module, epsilon: float, device: str) -> int:
-        """Using the given network, decide what action to carry out
-        using an epsilon-greedy policy
+        """Using the given network, decide what action to carry out using an epsilon-greedy policy.

         Args:
             net: DQN network
@@ -168,7 +160,7 @@ def play_step(
         epsilon: float = 0.0,
         device: str = 'cpu',
     ) -> Tuple[float, bool]:
-        """Carries out a single interaction step between the agent and the environment
+        """Carries out a single interaction step between the agent and the environment.

         Args:
             net: DQN network
@@ -200,7 +192,7 @@ def play_step(

 # %%
 class DQNLightning(LightningModule):
-    """ Basic DQN Model """
+    """Basic DQN Model."""

     def __init__(
         self,
@@ -249,9 +241,9 @@ def __init__(
         self.populate(self.hparams.warm_start_steps)

     def populate(self, steps: int = 1000) -> None:
-        """
-        Carries out several random steps through the environment to initially fill
-        up the replay buffer with experiences
+        """Carries out several random steps through the environment to initially fill up the replay buffer with
+        experiences.
         Args:
             steps: number of random steps to populate the buffer with
         """
         for i in range(steps):
             self.agent.play_step(self.net, epsilon=1.0)

     def forward(self, x: Tensor) -> Tensor:
-        """
-        Passes in a state x through the network and gets the q_values of each action as an output
+        """Passes in a state x through the network and gets the q_values of each action as an output.

         Args:
             x: environment state
@@ -273,8 +263,7 @@ def dqn_mse_loss(self, batch: Tuple[Tensor, Tensor]) -> Tensor:
-        """
-        Calculates the mse loss using a mini batch from the replay buffer
+        """Calculates the mse loss using a mini batch from the replay buffer.

         Args:
             batch: current mini batch of replay data
@@ -296,9 +285,8 @@ def dqn_mse_loss(self, batch: Tuple[Tensor, Tensor]) -> Tensor:
         return nn.MSELoss()(state_action_values, expected_state_action_values)

     def training_step(self, batch: Tuple[Tensor, Tensor], nb_batch) -> OrderedDict:
-        """
-        Carries out a single step through the environment to update the replay buffer.
-        Then calculates loss based on the minibatch recieved
+        """Carries out a single step through the environment to update the replay buffer. Then calculates loss
+        based on the minibatch received.

         Args:
             batch: current mini batch of replay data
@@ -320,7 +308,7 @@ def training_step(self, batch: Tuple[Tensor, Tensor], nb_batch) -> OrderedDict:
         # calculates training loss
         loss = self.dqn_mse_loss(batch)

-        if self.trainer.use_dp or self.trainer.use_ddp2:
+        if self.trainer._distrib_type in {DistributedType.DP, DistributedType.DDP2}:
             loss = loss.unsqueeze(0)

         if done:
@@ -344,12 +332,12 @@ def training_step(self, batch: Tuple[Tensor, Tensor], nb_batch) -> OrderedDict:
         return OrderedDict({'loss': loss, 'log': log, 'progress_bar': status})

     def configure_optimizers(self) -> List[Optimizer]:
-        """ Initialize Adam optimizer"""
+        """Initialize Adam optimizer."""
         optimizer = Adam(self.net.parameters(), lr=self.hparams.lr)
         return [optimizer]

     def __dataloader(self) -> DataLoader:
-        """Initialize the Replay Buffer dataset used for retrieving experiences"""
+        """Initialize the Replay Buffer dataset used for retrieving experiences."""
         dataset = RLDataset(self.buffer, self.hparams.episode_length)
         dataloader = DataLoader(
             dataset=dataset,
@@ -358,11 +346,11 @@ def __dataloader(self) -> DataLoader:
         return dataloader

     def train_dataloader(self) -> DataLoader:
-        """Get train loader"""
+        """Get train loader."""
         return self.__dataloader()

     def get_device(self, batch) -> str:
-        """Retrieve device currently being used by minibatch"""
+        """Retrieve device currently being used by minibatch."""
         return batch[0].device.index if self.on_gpu else 'cpu'
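
Reviewer note: the bulk of this patch converts free-form "Inputs:" docstring blocks to the Google-style "Args:" layout, which the new docformatter hook (args: --in-place, --wrap-summaries=115, --wrap-descriptions=120) can then keep wrapped automatically. A minimal sketch of the target shape, using a hypothetical scale_pixels helper that is not part of the patch:

import torch


def scale_pixels(x: torch.Tensor, low: float = -1.0, high: float = 1.0) -> torch.Tensor:
    """Scale an image tensor with integer values between 0 and 255 to the range [low, high].

    Args:
        x: Image tensor with integer values between 0 and 255
        low: Lower bound of the target range
        high: Upper bound of the target range
    """
    # Map 0..255 to 0..1, then stretch and shift into [low, high].
    return (x.float() / 255.0) * (high - low) + low

A one-line summary on the opening quotes, a blank line, then one "name: description" entry per argument, matching what the hunks above produce.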
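Reviewer note: the only behavioral change in the patch is the guard in training_step of dqn.py, where the removed Trainer.use_dp/use_ddp2 flags are replaced by a check against the DistributedType enum imported at the top of the file. A minimal sketch of that check factored into a helper, assuming a PyTorch Lightning 1.x Trainer that exposes the protected _distrib_type attribute; needs_batch_dim is a hypothetical name, not part of the patch:

from pytorch_lightning import Trainer
from pytorch_lightning.utilities import DistributedType


def needs_batch_dim(trainer: Trainer) -> bool:
    """Return True when a scalar loss must gain a leading dimension for DP/DDP2 reduction."""
    # getattr guards against Trainer versions that do not expose `_distrib_type`.
    return getattr(trainer, "_distrib_type", None) in {DistributedType.DP, DistributedType.DDP2}

Keeping the protected-attribute access in one place would make the next Trainer API rename a one-line fix.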