In [7]:
import torch
import torch.nn as nn
import torchvision
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms

import pytorch_lightning as pl

import os
import random

In [2]:
# Reproducibility: every RNG used by the notebook is seeded from the single
# `seed` value below, so changing it in one place changes all of them.
seed = 42
random.seed(seed)        # Python's built-in RNG (previously seeded with 0, ignoring `seed`)
torch.manual_seed(seed)  # PyTorch RNG
# Ask cuDNN for deterministic kernels and disable its autotuner, which can
# otherwise select different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
# Hyperparameters
# -- model shape --
input_size = 28 * 28   # one flattened 28x28 image
hidden_size = 500      # width of the single hidden layer
num_classes = 10       # number of output logits
# -- optimisation --
num_epochs = 10
batch_size = 100
learning_rate = 1e-3

In [4]:
class LitNeuralNetwork(pl.LightningModule):
    """Fully connected classifier (Linear -> ReLU -> Linear) as a LightningModule.

    The module owns its datasets and builds its own dataloaders, so
    ``trainer.fit(model)`` can be called without passing loaders explicitly.

    Args:
        input_size: Number of features per sample after flattening.
        hidden_size: Width of the hidden layer.
        output_size: Number of output logits (classes).
        datasets: Mapping with the keys ``"train"``, ``"val"`` and ``"test"``,
            each holding the dataset for that split.
        learning_rate: Optional Adam learning rate. When ``None`` (default) the
            notebook-level ``learning_rate`` variable is read at the time the
            optimizer is configured, which is the original behavior.
        batch_size: Optional batch size for all dataloaders. When ``None``
            (default) the notebook-level ``batch_size`` variable is read at the
            time a dataloader is built, which is the original behavior.
    """

    def __init__(self, input_size, hidden_size, output_size, datasets,
                 learning_rate=None, batch_size=None):
        super().__init__()

        self.train_dataset = datasets["train"]
        self.val_dataset = datasets["val"]
        self.test_dataset = datasets["test"]

        self.input_size = input_size
        self.learning_rate = learning_rate
        self.batch_size = batch_size

        # Layers are created in the same order as before so parameter
        # initialisation consumes the RNG identically.
        self.l1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.l2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the raw logits for a batch of already-flattened inputs."""
        x = self.l1(x)
        x = self.relu(x)
        x = self.l2(x)
        return x

    def _shared_step(self, batch, stage):
        """Compute and log the cross-entropy loss for one batch.

        Args:
            batch: ``(images, labels)`` pair as yielded by the dataloader.
            stage: Prefix for the logged metric (``"train"``, ``"val"`` or
                ``"test"``); the metric is logged as ``f"{stage}_loss"``.

        Returns:
            The loss tensor for the batch.
        """
        images, labels = batch
        # Flatten each sample to the width the first layer was built with,
        # rather than a hard-coded 28*28, so the model works for any input_size.
        images = images.reshape(-1, self.input_size)

        logits = self(images)
        loss = F.cross_entropy(logits, labels)

        # on_epoch=True additionally logs the value aggregated over the epoch.
        self.log(f"{stage}_loss", loss, on_epoch=True, prog_bar=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Run one training batch; called by the Trainer for every batch."""
        # The dict form is kept for compatibility with the previous output.
        # NOTE(review): Lightning documents that returning the bare loss
        # tensor is also accepted.
        return {"loss": self._shared_step(batch, "train")}

    def validation_step(self, batch, batch_idx):
        """Run one validation batch; only used when a val dataloader exists."""
        return {"loss": self._shared_step(batch, "val")}

    def test_step(self, batch, batch_idx):
        """Run one test batch so ``trainer.test(model)`` works with ``test_dataloader``."""
        return {"loss": self._shared_step(batch, "test")}

    def configure_optimizers(self):
        """Return an Adam optimizer over all model parameters."""
        # Inside this method the bare name `learning_rate` is the notebook-level
        # variable; it is only consulted when no override was given.
        lr = self.learning_rate if self.learning_rate is not None else learning_rate
        return optim.Adam(self.parameters(), lr=lr)

    def _make_loader(self, dataset, shuffle):
        """Build a DataLoader for ``dataset`` with the configured batch size."""
        # Inside this method the bare name `batch_size` is the notebook-level
        # variable; it is only consulted when no override was given.
        size = self.batch_size if self.batch_size is not None else batch_size
        return torch.utils.data.DataLoader(dataset=dataset,
                                           batch_size=size,
                                           shuffle=shuffle)

    def train_dataloader(self):
        """Return the training dataloader (shuffled)."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Return the validation dataloader (not shuffled)."""
        return self._make_loader(self.val_dataset, shuffle=False)

    def test_dataloader(self):
        """Return the test dataloader (not shuffled)."""
        return self._make_loader(self.test_dataset, shuffle=False)

In [5]:
# Convert images to tensors, then normalise the single channel with mean 0.5 / std 0.5.
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

# Official MNIST training split; downloaded into ./data if not already present.
full_train_dataset = torchvision.datasets.MNIST(root="./data", train=True,
                                                transform=transform,
                                                download=True)

# Hold out 20% of the training split for validation. A dedicated generator
# seeded from `seed` makes the split identical on every execution of this
# cell, instead of depending on how much of the global RNG stream has already
# been consumed.
split_generator = torch.Generator().manual_seed(seed)
train_dataset, val_dataset = torch.utils.data.random_split(
    full_train_dataset, [0.8, 0.2], generator=split_generator
)

# Official MNIST test split (files are expected to exist after the download above).
test_dataset = torchvision.datasets.MNIST(root="./data", train=False,
                                            transform=transform)

# Bundle the three splits under the keys LitNeuralNetwork reads.
datasets = {"train": train_dataset, "val": val_dataset, "test": test_dataset}

In [6]:
# Tip: construct the Trainer with fast_dev_run=True for a quick smoke test of the model
trainer = pl.Trainer(max_epochs=num_epochs)
model = LitNeuralNetwork(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=num_classes,
    datasets=datasets,
)
# The model supplies its own train/val dataloaders, so none are passed here.
trainer.fit(model)

Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name | Type   | Params
--------------------------------
0 | l1   | Linear | 392 K 
1 | relu | ReLU   | 0     
2 | l2   | Linear | 5.0 K 
------------------

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

c:\Users\localuserBW\anaconda3\envs\BA_Benedikt_Wille\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


                                                                           

c:\Users\localuserBW\anaconda3\envs\BA_Benedikt_Wille\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Epoch 9: 100%|██████████| 480/480 [00:05<00:00, 95.71it/s, v_num=0, train_loss_step=0.0524, val_loss=0.0941, train_loss_epoch=0.042]   

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 480/480 [00:05<00:00, 95.44it/s, v_num=0, train_loss_step=0.0524, val_loss=0.0941, train_loss_epoch=0.042]
