In [1]:
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader

In [2]:
# Legacy package alternative, kept for reference only. Use either this block
# or the `lightning` imports in this cell, never both: leaving part of it
# active makes the cell fail when only `lightning` is installed.
# # pip install pytorch-lightning
# import pytorch_lightning as pl
# from pytorch_lightning.loggers import CSVLogger, TensorBoardLogger
# from pytorch_lightning.callbacks import (
#     Callback, EarlyStopping, ModelCheckpoint, LearningRateMonitor,
#     DeviceStatsMonitor, GradientAccumulationScheduler
# )

# pip install lightning
import lightning.pytorch as pl
from lightning.pytorch.loggers import CSVLogger, TensorBoardLogger
from lightning.pytorch.callbacks import (
    Callback, EarlyStopping, ModelCheckpoint, LearningRateMonitor,
    DeviceStatsMonitor, GradientAccumulationScheduler
)

In [3]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# 1. Prepare data

In [4]:
# Fetch the California housing dataset through scikit-learn.
california_housing = fetch_california_housing()
# NOTE(review): echoing the whole object prints every field (data, target,
# feature_names, DESCR, ...); consider showing only the parts that matter,
# e.g. `california_housing.feature_names`.
california_housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [5]:
# Feature matrix: one row per sample, one column per feature.
X = california_housing.data
X

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [6]:
# (n_samples, n_features)
X.shape

(20640, 8)

In [7]:
# Reshape the 1-D target vector into a single column so that it lines up with
# the one-unit output layer of the regression model defined later.
y = california_housing.target.reshape(-1, 1)
y

array([[4.526],
       [3.585],
       [3.521],
       ...,
       [0.923],
       [0.847],
       [0.894]])

In [8]:
# (n_samples, 1) after the reshape
y.shape

(20640, 1)

In [9]:
# Hold out 20% of the samples; the fixed random_state keeps the split repeatable.
# NOTE(review): the features are used unscaled from here on, and the held-out
# split is later served as validation data -- consider standardizing the
# features and carving out a separate validation split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Sanity-check the shapes of the four resulting arrays.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16512, 8), (4128, 8), (16512, 1), (4128, 1))

In [10]:
class CaliforniaHousingDataset(Dataset):
    """In-memory dataset yielding ``(features, target)`` float32 tensor pairs.

    Args:
        features: Array-like of per-sample feature rows, indexed along its
            first dimension.
        targets: Array-like of per-sample targets with the same first
            dimension as ``features``.

    Raises:
        ValueError: If ``features`` and ``targets`` differ in length.
    """

    def __init__(self, features, targets):
        # torch.tensor copies its input and casts it to float32.
        self.features = torch.tensor(features, dtype=torch.float32)
        self.targets = torch.tensor(targets, dtype=torch.float32)

        # Fail fast: a mismatch would otherwise only surface as an IndexError
        # (or silently unused rows) somewhere in the middle of training.
        n_features, n_targets = len(self.features), len(self.targets)
        if n_features != n_targets:
            raise ValueError(
                f"features and targets must have the same length, "
                f"got {n_features} and {n_targets}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        return self.features[idx], self.targets[idx]

In [11]:
# Wrap the training split as float32 tensors.
train_dataset = CaliforniaHousingDataset(X_train, y_train)

In [12]:
# Wrap the held-out split as float32 tensors.
test_dataset = CaliforniaHousingDataset(X_test, y_test)

In [13]:
class CaliforniaHousingDataModule(pl.LightningDataModule):
    """Data module serving pre-built train and held-out datasets.

    Args:
        train_dataset: Dataset iterated (shuffled) during training.
        test_dataset: Dataset served by ``val_dataloader``.
        batch_size: Number of samples per batch for both loaders.
        num_workers: Number of ``DataLoader`` worker processes for both
            loaders. The default ``0`` loads data in the main process, which
            is what the loaders did before this parameter existed.

    NOTE(review): the held-out dataset doubles as the validation set, so any
    metric used for early stopping or model selection is not an unbiased
    test score.
    """

    def __init__(self, train_dataset, test_dataset, batch_size=64, num_workers=0):
        super().__init__()
        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.batch_size = batch_size
        self.num_workers = num_workers

    def _make_loader(self, dataset, shuffle):
        """Build a DataLoader with the settings shared by all loaders."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers
        )

    def train_dataloader(self):
        """Return the shuffled loader over the training dataset."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Return the unshuffled loader over the held-out dataset."""
        return self._make_loader(self.test_dataset, shuffle=False)

In [14]:
# Bundle both datasets; the batch size stays at the data module's default.
data_module = CaliforniaHousingDataModule(train_dataset, test_dataset)

# 2. Prepare model

In [15]:
class SimpleNN(pl.LightningModule):
    """One-hidden-layer MLP regressor trained with mean squared error.

    Args:
        in_features: Number of input features per sample.
        hidden_dim: Width of the hidden layer.
        lr: Learning rate handed to the Adam optimizer.

    The defaults reproduce the original fixed architecture (8 -> 64 -> 1,
    lr=0.001), and the submodule names are unchanged, so existing checkpoints
    keep the same state-dict keys.
    """

    def __init__(self, in_features=8, hidden_dim=64, lr=0.001):
        super().__init__()
        self.lr = lr
        self.fc1 = nn.Linear(in_features, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Map a batch of feature rows to one predicted value per row."""
        return self.fc2(self.relu(self.fc1(x)))

    def _compute_loss(self, batch):
        """Return the mean squared error of the predictions for one batch."""
        x, y = batch
        # Functional form of nn.MSELoss() with its default mean reduction;
        # avoids building a new loss module on every step.
        return nn.functional.mse_loss(self(x), y)

    def training_step(self, batch, batch_idx):
        """Compute, log and return the training loss for one batch."""
        loss = self._compute_loss(batch)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute, log and return the validation loss for one batch."""
        loss = self._compute_loss(batch)
        self.log("val_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters."""
        return Adam(self.parameters(), lr=self.lr)

In [16]:
# Seed the Python, NumPy and PyTorch RNGs before any weights are created so
# that parameter initialisation and batch shuffling start from a fixed state
# on every Restart & Run All.
pl.seed_everything(42)
model = SimpleNN()
model

SimpleNN(
  (fc1): Linear(in_features=8, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
)

# 3. Train model

In [17]:
class PrintCallback(Callback):
    """Callback that reports the start and the end of training on stdout."""

    def on_train_start(self, trainer, pl_module):
        """Print a banner, the module itself and the trainer's batch counts."""
        report_lines = (
            ("Training is started!",),
            ("Model summary...",),
            (pl_module,),
            ("Number of training batches...", trainer.num_training_batches),
            ("Number of validation batches...", trainer.num_val_batches),
        )
        # Each tuple becomes one printed line, fields separated by a space.
        for fields in report_lines:
            print(*fields)

    def on_train_end(self, trainer, pl_module):
        """Print a closing message once training has finished."""
        print("Training is done.")

In [18]:
# Run name shared by the loggers configured for the trainer.
experiment_name = 'TestPyTorchLightning'

In [19]:
trainer = pl.Trainer(
    max_epochs=100,
    precision=32,
    callbacks=[
        PrintCallback(),
        # Stop once the validation loss stops improving.
        EarlyStopping('val_loss'),
        # Monitor the same metric as EarlyStopping so that the checkpoint kept
        # on disk is the best-scoring epoch rather than simply the last one
        # trained before early stopping kicked in.
        ModelCheckpoint(dirpath='ckpts', monitor='val_loss', mode='min'),
        LearningRateMonitor(logging_interval='epoch'),
        DeviceStatsMonitor(cpu_stats=True),
        # Keys are zero-indexed epochs: from the 4th epoch onward,
        # accumulate gradients over 2 batches per optimizer step.
        GradientAccumulationScheduler(scheduling={3: 2})
    ],
    # Write metrics to both a CSV file and TensorBoard event files.
    logger=[
        CSVLogger(save_dir='csv_logs', name=experiment_name),
        TensorBoardLogger(save_dir='tb_logs', name=experiment_name)
    ],
    log_every_n_steps=1,
    check_val_every_n_epoch=1
)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [20]:
# trainer = pl.Trainer(
#     accelerator="gpu",
#     devices=[1, 2],
#     strategy="ddp", # distributed data parallel
#     num_nodes=4
# )
# # https://lightning.ai/docs/pytorch/stable/accelerators/mps_basic.html

In [21]:
# Run the training loop; the data module supplies the train and validation loaders.
trainer.fit(model, datamodule=data_module)

/Users/minhhuunguyen/REPOSITORY/minhhuunguyen.github.io/posts/ai-lectures/venv/torch_20_venv/.venv/lib/python3.8/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:652: Checkpoint directory /Users/minhhuunguyen/REPOSITORY/minhhuunguyen.github.io/posts/ai-lectures/2_data_analysis_with_python/notebook/11-pytorch-lightning/ckpts exists and is not empty.

  | Name | Type   | Params | Mode 
----------------------------------------
0 | fc1  | Linear | 576    | train
1 | relu | ReLU   | 0      | train
2 | fc2  | Linear | 65     | train
----------------------------------------
641       Trainable params
0         Non-trainable params
641       Total params
0.003     Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

/Users/minhhuunguyen/REPOSITORY/minhhuunguyen.github.io/posts/ai-lectures/venv/torch_20_venv/.venv/lib/python3.8/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Training is started!
Model summary...
SimpleNN(
  (fc1): Linear(in_features=8, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
)
Number of training batches... 258
Number of validation batches... [65]


/Users/minhhuunguyen/REPOSITORY/minhhuunguyen.github.io/posts/ai-lectures/venv/torch_20_venv/.venv/lib/python3.8/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Training: |                                                                                                   …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Validation: |                                                                                                 …

Validation: |                                                                                                 …

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Validation: |                                                                                                 …

Validation: |                                                                                                 …

Training is done.


In [22]:
# Load (or reload) the TensorBoard notebook extension, then open it on the
# directory the TensorBoardLogger was configured to write to for this experiment.
%reload_ext tensorboard
%tensorboard --logdir=tb_logs/TestPyTorchLightning