In [8]:
import torch
import torchvision
import os
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms

In [9]:
#torch function
transform=transforms.ToTensor()
mnist_train=torchvision.datasets.MNIST(root='/Users/isabelleliu/Desktop/code practice',train=True, download=False, transform=transform)
mnist_test=torchvision.datasets.MNIST(root='/Users/isabelleliu/Desktop/code practice',train=False, download=False, transform=transform)

#split train into train and validation
train_set, val_set=random_split(mnist_train,[len(mnist_train)-10000,10000])
batch_size=64

#create dataloader use default function
train_loader=DataLoader(train_set,batch_size,shuffle=True)
val_loader=DataLoader(val_set,batch_size,shuffle=True)
test_loader=DataLoader(mnist_test,batch_size,shuffle=True)

In [31]:
#define whole process
class Trainer:
    def __init__(self, model_paths, model, optimizer, scheduler):
        #device = torch.device('cuda:0') if use_gpu else torch.device('cpu')
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model_path=model_paths #model storage path
        self.model=model.to(self.device) #define model
        self.optimizer=optimizer #define optimizer
        self.scheduler=scheduler

    def save(self):
        torch.save(self.model.state_dict(), self.model_path)


    def train_step(self, data):
        images,labels=data
        images, labels = images.to(self.device), labels.to(self.device)
        self.optimizer.zero_grad()
        #forward
        predicts=self.model(images)
        loss=F.cross_entropy(predicts,labels)
        avg_loss=torch.mean(loss)
        avg_loss.backward()
        self.optimizer.step()
        self.scheduler.step()
        

        return avg_loss.item()
    
    def train_epoch(self,datasets,epoch):
        self.model.train()
        for batch, data in enumerate(datasets):
            loss=self.train_step(data)

            if batch%500==0:
                print('epoch: {}, batch: {}, loss is: {}'.format(epoch,batch,loss))


    def train(self, train_datasets,start_epoch, end_epoch,save_path):
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        
        for epoch in range(start_epoch,end_epoch):
            self.train_epoch(train_datasets,epoch)

            torch.save(self.optimizer.state_dict(),'./{}/mnist_epoch{}'.format(save_path,epoch)+'.ptopt')
            torch.save(self.model.state_dict(), './{}/mnist_epoch{}'.format(save_path,epoch)+'.pth')

In [32]:
class MNISTOPTM(nn.Module):
    def __init__(self):
        super(MNISTOPTM,self).__init__()

        self.conv1=nn.Conv2d(in_channels=1, out_channels=20,kernel_size=5,stride=1,padding=2)
        self.max_pool1=nn.MaxPool2d(kernel_size=2,stride=2)
        self.conv2=nn.Conv2d(in_channels=20,out_channels=20,kernel_size=5,stride=1,padding=2)
        self.max_pool2=nn.MaxPool2d(kernel_size=2,stride=2)
        self.fc=nn.Linear(in_features=980,out_features=10)

    def forward(self,inputs,labels=None):
        x=self.conv1(inputs)
        x=F.relu(x)
        x=self.max_pool1(x)
        x=self.conv2(x)
        x=F.relu(x)
        x=self.max_pool2(x)
        x=torch.flatten(x, start_dim=1)
        x=self.fc(x)
        return x

In [33]:
torch.manual_seed(1024)
epochs=3
model_path='mnistoptim.pth'

model=MNISTOPTM()

total_steps=(int(50000//batch_size)+1)*epochs

optimizer=torch.optim.Adam(model.parameters(),lr=0.01)
#Polynomial Decay
lr_schedule=torch.optim.lr_scheduler.LambdaLR(optimizer,lambda step:0.01*(1-step/total_steps)**0.001)

trainer=Trainer(model_paths=model_path, model=model,optimizer=optimizer,scheduler=lr_schedule)

trainer.train(train_datasets=train_loader, start_epoch=0, end_epoch=epochs,save_path='checkpoint')

epoch: 0, batch: 0, loss is: 2.302391767501831
epoch: 0, batch: 500, loss is: 0.33742180466651917
epoch: 1, batch: 0, loss is: 0.3419605493545532
epoch: 1, batch: 500, loss is: 0.28651097416877747
epoch: 2, batch: 0, loss is: 0.2055339813232422
epoch: 2, batch: 500, loss is: 0.12934784591197968


### Polynomial Decay
`PolynomialDecay` Polynomial decay is a method used to adjust the learning rate during the training process. It gradually reduces the learning rate based on a polynomial function. A lower learning rate helps the model converge to a better solution and reduces oscillation around the minimum. 
It 

It is not a built-in scheduler in PyTorch, but you can create a similar learning rate scheduler using the `torch.optim.lr_scheduler.LambdaLR` class.

Here's an example of how to create a polynomial decay learning rate scheduler in PyTorch using `LambdaLR`:

```python
import torch.optim as optim

def polynomial_decay_lr_scheduler(optimizer, initial_lr, decay_steps, end_lr, power=1.0):
    def lr_lambda(step):
        return ((initial_lr - end_lr) * (1 - step / decay_steps) ** power) + end_lr

    return optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

# Example usage
model = ...  # Your model definition
initial_lr = 0.01
decay_steps = 1000
end_lr = 0.001
power = 1.0

optimizer = optim.Adam(model.parameters(), lr=initial_lr)
scheduler = polynomial_decay_lr_scheduler(optimizer, initial_lr, decay_steps, end_lr, power)
```

In the example above, we define a function `polynomial_decay_lr_scheduler` that takes the optimizer, initial learning rate, decay steps, end learning rate, and the power of the polynomial decay. It then returns a `LambdaLR` scheduler with a custom lambda function that implements the desired polynomial decay.

After defining the scheduler, you can use it during training by calling `scheduler.step()` after each optimizer step:

```python
for epoch in range(num_epochs):
    for batch in data_loader:
        # Your training loop
        optimizer.step()
        scheduler.step()
```


This will adjust the learning rate of the optimizer according to the polynomial decay schedule.

### Common Learning Rate Scheduling Method
Several common learning rate scheduling methods, often referred to as decay methods, help adjust the learning rate during the training process. Here are some of the most commonly used learning rate scheduling techniques:

1. Step Decay: The learning rate is reduced by a constant factor after a fixed number of epochs. This method is simple to implement and allows for a manual reduction of the learning rate at predetermined intervals.
```python
from torch.optim import lr_scheduler

scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
```

2. Exponential Decay: The learning rate decreases exponentially over time. It can be expressed as `lr = initial_lr * exp(-decay_rate * current_step)`, where `initial_lr` is the initial learning rate, `decay_rate` is a constant, and `current_step` is the current training step.
```python
from torch.optim import lr_scheduler

scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=0.99)
```

3. Polynomial Decay: The learning rate decreases according to a polynomial function. It can be expressed as `lr = initial_lr * (1 - current_step / decay_steps) ** power`, where `initial_lr` is the initial learning rate, `decay_steps` is the number of steps until the learning rate reaches `end_lr`, `power` is the exponent of the polynomial, and `current_step` is the current training step.
```python
from torch.optim import lr_scheduler

decay_steps = 1000
end_lr = 0.001
power = 1.0

scheduler = lr_scheduler.LambdaLR(optimizer, lambda step: (1 - step / decay_steps) ** power)
```

4. Cosine Annealing: The learning rate decreases following a cosine curve. It is inspired by simulated annealing and can help the model escape local minima. This method typically involves warm restarts, which reset the learning rate to its initial value after each annealing cycle.
```python
from torch.optim import lr_scheduler

scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2, eta_min=0.001)
```


5. Cyclical Learning Rates: This method involves cycling the learning rate between a minimum and maximum value during training. It can help the model escape saddle points and find better minima. The learning rate often follows a triangular or sinusoidal pattern.
```python
from torch.optim import lr_scheduler

scheduler = lr_scheduler.CyclicLR(optimizer, base_lr=0.001, max_lr=0.01, step_size_up=2000, mode='triangular')
```


6. One-cycle Policy: This is a variation of cyclical learning rates in which the learning rate starts at a lower value, increases to a maximum value, and then decreases again during training. This method is designed to combine the benefits of both high and low learning rates.
```python
from torch.optim import lr_scheduler

scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=0.01, total_steps=5000, epochs=50, steps_per_epoch=None, pct_start=0.3, anneal_strategy='cos', cycle_momentum=True, base_momentum=0.85, max_momentum=0.95, div_factor=25.0, final_div_factor=10000.0, last_epoch=-1)
```


7. ReduceLROnPlateau: The learning rate is reduced when a performance metric (e.g., validation loss) stops improving. This method monitors the performance metric and reduces the learning rate by a specified factor if there's no improvement for a certain number of epochs.
```python
from torch.optim import lr_scheduler

scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, threshold=0.0001, min_lr=0.00001)
```
To use the schedulers in your training loop, call the `scheduler.step()` method after each optimizer step for most schedulers:

```python
for epoch in range(num_epochs):
    for batch in data_loader:
        # Your training loop
        optimizer.step()
        scheduler.step()
```

For the `ReduceLROnPlateau` scheduler, you need to provide a metric (e.g., validation loss) when calling the `scheduler.step()` method:

```python
for epoch in range(num_epochs):
    for batch in data_loader:
        # Your training loop
    val_loss = compute_validation_loss()
    scheduler.step(val_loss)
```


These learning rate scheduling techniques are used in different situations, depending on the specific problem and model architecture. It is often necessary to experiment with various decay methods and their hyperparameters to find the best approach for a given problem.

In [36]:
#continue training
torch.manual_seed(1024)
model_path='mnistoptim.pth'

model=MNISTOPTM()

total_steps=(int(50000//batch_size)+1)*epochs

optimizer=torch.optim.Adam(model.parameters(),lr=0.01)
#Polynomial Decay
lr_schedule=torch.optim.lr_scheduler.LambdaLR(optimizer,lambda step:0.01*(1-step/total_steps)**0.001)

params_dict=torch.load('checkpoint/mnist_epoch0.pth')
opt_dict=torch.load('checkpoint/mnist_epoch0.ptopt')

#load the param
model.load_state_dict(params_dict)
optimizer.load_state_dict(opt_dict)

trainer=Trainer(model_paths=model_path, model=model,optimizer=optimizer,scheduler=lr_schedule)

trainer.train(train_datasets=train_loader, start_epoch=1, end_epoch=epochs,save_path='checkpoint_con')

epoch: 1, batch: 0, loss is: 0.22667242586612701
epoch: 1, batch: 500, loss is: 0.1558067500591278
epoch: 2, batch: 0, loss is: 0.20005211234092712
epoch: 2, batch: 500, loss is: 0.22454290091991425


### optimizer state dict (Adam, Adagrad)
Saving the optimizer's state along with the model is not always necessary but can be beneficial in certain situations. Here are some reasons why you might want to save the optimizer's state:

1. Resuming training: If you plan to resume training from a checkpoint, having the optimizer state saved will allow you to continue the training process with the same optimizer settings and internal state (e.g., momentum, learning rate schedule) as before. This can lead to smoother convergence and better results when compared to starting with a new optimizer or a reset optimizer state.

2. Warm-starting: If you want to use the pre-trained model as a starting point for training on a related task, having the optimizer state can help speed up the initial convergence. The optimizer state might contain useful information about the gradients and weight updates that can help in adapting to the new task.

3. Reproducibility: Saving the optimizer state can help with reproducibility, as it allows others to continue training the model from the same checkpoint, using the same optimizer settings and internal state.

4. Finetuning: In some cases, you may want to finetune the model after the main training process is complete. In this situation, having the optimizer's state saved can be beneficial to ensure a smooth finetuning process.

To save the optimizer state along with the model in PyTorch, you can create a dictionary containing both the model state and optimizer state and save it using `torch.save()`:

```python
checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}
torch.save(checkpoint, 'model_checkpoint.pth')
```

To load the saved states back into the model and optimizer, use the following code:

```python
checkpoint = torch.load('model_checkpoint.pth')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
```

Remember to call the appropriate methods to set the model in training or evaluation mode (`model.train()` or `model.eval()`) after loading the states.