In [1]:
!pip install wandb torchinfo

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[33mDEPRECATION: devscripts 2.22.1ubuntu1 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of devscripts or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
import os
from pathlib import Path
import torch
import wandb
from torch import nn, optim
from torch.optim import lr_scheduler

In [3]:
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mtmdrjs0040[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
from torch.utils.data import DataLoader, random_split
from torchvision import datasets
from torchvision.transforms import transforms
from torchinfo import summary
from utils import get_num_cpu_cores, is_linux, is_windows

In [5]:
from torchvision.datasets import FashionMNIST
def get_fashion_mnist_data():
    #데이터 다운
    #만약 root 경로에 데이터가 없다면 다운로드 
    #train=True/False는 데이터의 용도 구분 따로 validation이 없기 때문에 validation data는 train_data에서 알아서 분할해야 할듯
    f_mnist_train = FashionMNIST(root='./data', train=True, transform=transforms.ToTensor(), download=True)
    f_mnist_train, f_mnist_validation = random_split(f_mnist_train, [55_000, 5_000])

    
    print("Num Train Samples: ", len(f_mnist_train))
    print("Num Validation Samples: ", len(f_mnist_validation))
    print("Sample Shape: ", f_mnist_train[0][0].shape)  # torch.Size([1, 28, 28])
    
    num_data_loading_workers = get_num_cpu_cores() if is_linux() or is_windows() else 0
    print("Number of Data Loading Workers:", num_data_loading_workers)

    train_data_loader = DataLoader(
        dataset=f_mnist_train, batch_size=wandb.config.batch_size, shuffle=True,
        pin_memory=True, num_workers=num_data_loading_workers
    )

    validation_data_loader = DataLoader(
        dataset=f_mnist_validation, batch_size=wandb.config.batch_size,
        pin_memory=True, num_workers=num_data_loading_workers
    )

    f_mnist_transforms = nn.Sequential(
        transforms.ConvertImageDtype(torch.float),
        transforms.Normalize(mean=0.2860495448112488, std=0.32041478157043457), 
                                #mean값과 std값을 이용해 데이터의 평균이 0 표준편차가 1이 되도록 한다.
                                #gray image이므로 모든 픽셀의 값을 평균내고 표준편차값을 구한다.
                                #train 데이터의 값만으로 정한다.(검증과 테스트는 성능 확인용이므로 통계에 포함하면 일반화 성능 왜곡)
    )

    return train_data_loader, validation_data_loader, f_mnist_transforms

In [6]:
#정규화를 위한 mean과 std 구하기

f_mnist_train = FashionMNIST(root='./data', train=True, transform=transforms.ToTensor(), download=True)
f_mnist_train, f_mnist_validation = random_split(f_mnist_train, [55_000, 5_000])

mean = 0
std = 0
total_images = len(f_mnist_train)

for img,_ in f_mnist_train: #이미지 별로 평균과 표준편차를 구해서 합산
    img = img.view(1, -1)
    mean += img.mean()
    std += img.std()

#합산된 평균과 표준편차를 평균내어 구하기
mean /= total_images
std /= total_images

print(f"Mean: {mean}, Std: {std}")

Mean: 0.2861834168434143, Std: 0.3204808533191681


In [8]:
def get_cnn_model(dropout=0.5):
    class MyModel(nn.Module):
        def __init__(self, in_channels, n_output, dropout):
            super().__init__()

            self.model = nn.Sequential(
                # B x 1 x 28 x 28 --> B x 6 x (28 - 5 + 1) x (28 - 5 + 1) = B x 6 x 24 x 24
                nn.Conv2d(in_channels=in_channels, out_channels=6, kernel_size=(5, 5), stride=(1, 1)),
                # B x 6 x 24 x 24 --> B x 6 x 12 x 12
                nn.MaxPool2d(kernel_size=2, stride=2),
                nn.BatchNorm2d(
                    num_features = 6, eps= 1e-05, momentum=0.1
                ),
                nn.ReLU(),
                # B x 6 x 12 x 12 --> B x 16 x (12 - 5 + 1) x (12 - 5 + 1) = B x 16 x 8 x 8
                nn.Conv2d(in_channels=6, out_channels=16, kernel_size=(5, 5), stride=(1, 1)),
                # B x 16 x 8 x 8 --> B x 16 x 4 x 4
                nn.MaxPool2d(kernel_size=2, stride=2),
                nn.BatchNorm2d(
                    num_features = 16, eps= 1e-05, momentum=0.1
                ),
                nn.ReLU(),
                # B x 16 x 4 x 4 --> B x 256
                nn.Flatten(),
                nn.Dropout(p=dropout),
                nn.Linear(256, 128),
                nn.BatchNorm1d(
                    num_features = 128, eps=1e-05, momentum=0.1
                ),
                nn.ReLU(),
                nn.Dropout(p=dropout),
                nn.Linear(128, n_output),
            )

        def forward(self, x):
            x = self.model(x)
            return x

    # 1 * 28 * 28
    my_model = MyModel(in_channels=1, n_output=10, dropout = dropout)

    return my_model

In [31]:
def get_vgg_model(dropout=0.5):
    def vgg_block(num_conv_layer, out_channels):
        layers = []
        for _ in range(num_conv_layer):
            layers.append(nn.LazyConv2d(
                out_channels=out_channels, kernel_size=3, padding=1
                )
            )
            layers.append(nn.BatchNorm2d(num_features=out_channels))
            layers.append(nn.ReLU())
        layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
        block = nn.Sequential(*layers)
        return block
    
    class VGG(nn.Module):
        def __init__(self, block_info, n_output=10):
            super().__init__()
            conv_blocks=[]
            for(num_conv_layers, out_channels) in block_info:
                conv_blocks.append(vgg_block(num_conv_layers, out_channels))
            
            self.model = nn.Sequential(
                *conv_blocks,
                nn.Flatten(), 
                nn.LazyLinear(out_features = 256),
                nn.ReLU(),
                nn.Dropout(p=dropout),
                nn.LazyLinear(out_features = 128),
                nn.ReLU(),
                nn.LazyLinear(out_features = 64),
                nn.ReLU(),
                nn.Dropout(p=dropout),
                nn.LazyLinear(n_output)
            )
            
        def forward(self, x):
            x = self.model(x)
            return x
    
    my_model = VGG(
        block_info = (
            (3, 128), (3, 256)
        ),
        n_output = 10
    )
    return my_model
        

In [32]:
class EarlyStopping:
    def __init__(
            self, patience = 10, delta =0.0001, project_name = None, checkpoint_file_path = None, run_time_str = None
    ):
        self.patience = patience
        self.counter = 0
        self.delta = delta
        self.val_loss_min = None
        self.file_path = os.path.join(
            checkpoint_file_path, f"{project_name}_checkpoint_{run_time_str}.pt"
        )
        self.latest_file_path = os.path.join(
            checkpoint_file_path, f"{project_name}_checkpoint_{run_time_str}_latest.pt"# 여러 테스트를 확인할때 똑같이 latest로만 하니까 테스트 별로 구분이 안되서 변경
        )
        
    def check_and_save(self, new_validation_loss, model): #val_loss를 확인하고, 작아졌는지 확인하고 count 여부를 정하며, 만약 counter가 patience보다 커지면 이를 알린다.
        early_stop = False
        message = None
        
        if self.val_loss_min is None:
            self.val_loss_min = new_validation_loss
            message =f"Early stopping is stated!"
        elif new_validation_loss < self.val_loss_min - self.delta:
            message =f'V_loss decreased ({self.val_loss_min:6.3f} --> {new_validation_loss:6.3f}). Saving model..'
            self.save_checkpoint(new_validation_loss, model)
            self.val_loss_min = new_validation_loss
            self.counter = 0
        else:
            self.counter += 1
            message = f'Early stopping counter: {self.counter} out of {self.patience}'
            if self.counter >= self.patience:
                early_stop = True
                message += "*** TRAIN EARLY STOPPED! ***"
            
        return message, early_stop, self.counter
    
    def save_checkpoint(self, val_loss, model): #모델을 저장하는 함수
        torch.save(model.state_dict(), self.file_path)
        torch.save(model.state_dict(), self.latest_file_path)
        self.val_loss_min = val_loss

In [33]:
from datetime import datetime
from utils import strfdelta


class ClassificationTrainer:
  def __init__(
    self, project_name, model, optimizer, train_data_loader, validation_data_loader, transforms,
    run_time_str, wandb, device, checkpoint_file_path
  ):
    self.project_name = project_name
    self.model = model
    self.optimizer = optimizer
    self.train_data_loader = train_data_loader
    self.validation_data_loader = validation_data_loader
    self.transforms = transforms
    self.run_time_str = run_time_str
    self.wandb = wandb
    self.device = device
    self.checkpoint_file_path = checkpoint_file_path
    
    self.exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.7)
    
    # Use a built-in loss function
    self.loss_fn = nn.CrossEntropyLoss()

  def do_train(self):
    self.model.train()  # Will be explained at 'Diverse Techniques' section

    loss_train = 0.0
    num_corrects_train = 0
    num_trained_samples = 0
    num_trains = 0

    for train_batch in self.train_data_loader:
      input_train, target_train = train_batch
      input_train = input_train.to(device=self.device)
      target_train = target_train.to(device=self.device)

      if self.transforms:
        input_train = self.transforms(input_train)

      output_train = self.model(input_train)
      loss = self.loss_fn(output_train, target_train)
      loss_train += loss.item()

      predicted_train = torch.argmax(output_train, dim=-1)

      # >>> predicted_train: tensor([5, 8, 9, 0, 9, 8, 9, 8, ..., 0, 1, 3, 7, 1, 4, 3])
      # >>> target_train:    tensor([5, 8, 9, 2, 9, 8, 7, 8, ..., 4, 1, 9, 6, 1, 4, 3])
      num_corrects_train += torch.sum(torch.eq(predicted_train, target_train)).item()

      num_trained_samples += len(input_train)
      num_trains += 1

      self.optimizer.zero_grad()
      loss.backward()
      self.optimizer.step()

    train_loss = loss_train / num_trains
    train_accuracy = 100.0 * num_corrects_train / num_trained_samples

    return train_loss, train_accuracy

  def do_validation(self):
    self.model.eval()   # Explained at 'Diverse Techniques' section

    loss_validation = 0.0
    num_corrects_validation = 0
    num_validated_samples = 0
    num_validations = 0

    with torch.no_grad():
      for validation_batch in self.validation_data_loader:
        input_validation, target_validation = validation_batch
        input_validation = input_validation.to(device=self.device)
        target_validation = target_validation.to(device=self.device)

        if self.transforms:
          input_validation = self.transforms(input_validation)

        output_validation = self.model(input_validation)
        loss_validation += self.loss_fn(output_validation, target_validation).item()

        predicted_validation = torch.argmax(output_validation, dim=1)
        num_corrects_validation += torch.sum(torch.eq(predicted_validation, target_validation)).item()

        num_validated_samples += len(input_validation)
        num_validations += 1

    validation_loss = loss_validation / num_validations
    validation_accuracy = 100.0 * num_corrects_validation / num_validated_samples

    return validation_loss, validation_accuracy

  def train_loop(self):
    early_stopping = EarlyStopping(
      patience=self.wandb.config.early_stop_patience,
      delta=self.wandb.config.early_stop_delta,
      project_name=self.project_name,
      checkpoint_file_path=self.checkpoint_file_path,
      run_time_str=self.run_time_str
    )
    n_epochs = self.wandb.config.epochs
    training_start_time = datetime.now()

    for epoch in range(1, n_epochs + 1):
      train_loss, train_accuracy = self.do_train()   
    
      if epoch == 1 or epoch % self.wandb.config.validation_intervals == 0:
        validation_loss, validation_accuracy = self.do_validation()

        elapsed_time = datetime.now() - training_start_time
        epoch_per_second = 0 if elapsed_time.seconds == 0 else epoch / elapsed_time.seconds

        message, early_stop, _ = early_stopping.check_and_save(validation_loss, self.model)
        self.exp_lr_scheduler.step()
        
        print(
          f"[Epoch {epoch:>3}] "
          f"T_loss: {train_loss:7.5f}, "
          f"T_accuracy: {train_accuracy:6.4f} | "
          f"V_loss: {validation_loss:7.5f}, "
          f"V_accuracy: {validation_accuracy:6.4f} | "
          f"{message} | "
          f"T_time: {strfdelta(elapsed_time, '%H:%M:%S')}, "
          f"T_speed: {epoch_per_second:4.3f}"
        )

        self.wandb.log({
          "Epoch": epoch,
          "Training loss": train_loss,
          "Training accuracy (%)": train_accuracy,
          "Validation loss": validation_loss,
          "Validation accuracy (%)": validation_accuracy,
          "Training speed (epochs/sec.)": epoch_per_second,
        })

        if early_stop:
          break

    elapsed_time = datetime.now() - training_start_time
    print(f"Final training time: {strfdelta(elapsed_time, '%H:%M:%S')}")


In [34]:
def main(epochs = 1000, batch_size = 2048, validation_intervals = 10, learning_rate = 1e-3, early_stop_patience = 10, early_stop_delta = 0.0001, weight_decay = 0.001, dropout = 0.5):
    #checkpoint의 경로 설정 + 디렉토리 생성
    CHECKPOINT_FILE_PATH = os.path.join(os.getcwd(), "checkpoints")
    if not os.path.isdir(CHECKPOINT_FILE_PATH):
        os.makedirs(CHECKPOINT_FILE_PATH)
        
    run_time_str = datetime.now().astimezone().strftime('%Y-%m-%d_%H-%M-%S')
    config = {
    'epochs': epochs,
    'batch_size': batch_size,
    'validation_intervals': validation_intervals,
    'learning_rate': learning_rate,
    'early_stop_patience': early_stop_patience,
    'early_stop_delta': early_stop_delta,
    'weight_decay': weight_decay,
    'dropout': dropout
  }
    
    project_name = "FashionMNIST"
    wandb.init(
        mode="online",
        project=project_name,
        notes = "Fashion MNIST",
        tags = ["cnn", "FashionMNIST"],
        name = run_time_str,
        config = config
    )
    
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"Training on device {device}.")
  
    train_data_loader, validation_data_loader, f_mnist_transforms = get_fashion_mnist_data()
    #model = get_cnn_model(dropout)
    model = get_vgg_model()
    model.to(device)
    
    print(summary(model=model, input_size=(1, 1, 28, 28)))
    
    optimizer = optim.Adam(model.parameters(), lr=wandb.config.learning_rate, betas=(0.9, 0.999), weight_decay=wandb.config.weight_decay)
    
    classification_trainer = ClassificationTrainer(
        project_name, model, optimizer, train_data_loader, validation_data_loader, f_mnist_transforms, run_time_str, wandb, device, CHECKPOINT_FILE_PATH
    )
    classification_trainer.train_loop()
    
    wandb.finish() 

In [37]:
if __name__ == "__main__":
    
    epochs = 1000
    batch_size = 4096
    validation_intervals = 1
    learning_rate = 1e-3
    early_stop_patience = 15
    early_stop_delta = 0.0001
    weight_decay = 0.001
    dropout = 0.5
    main(epochs, batch_size, validation_intervals, learning_rate, early_stop_patience, early_stop_delta, weight_decay, dropout)   

Training on device cuda:0.
Num Train Samples:  55000
Num Validation Samples:  5000
Sample Shape:  torch.Size([1, 28, 28])
Number of Data Loading Workers: 2
Layer (type:depth-idx)                   Output Shape              Param #
VGG                                      [1, 10]                   --
├─Sequential: 1-1                        [1, 10]                   --
│    └─Sequential: 2-1                   [1, 128, 14, 14]          --
│    │    └─Conv2d: 3-1                  [1, 128, 28, 28]          1,280
│    │    └─BatchNorm2d: 3-2             [1, 128, 28, 28]          256
│    │    └─ReLU: 3-3                    [1, 128, 28, 28]          --
│    │    └─Conv2d: 3-4                  [1, 128, 28, 28]          147,584
│    │    └─BatchNorm2d: 3-5             [1, 128, 28, 28]          256
│    │    └─ReLU: 3-6                    [1, 128, 28, 28]          --
│    │    └─Conv2d: 3-7                  [1, 128, 28, 28]          147,584
│    │    └─BatchNorm2d: 3-8             [1, 128, 28, 

[Epoch  30] T_loss: 0.09792, T_accuracy: 97.1927 | V_loss: 0.20912, V_accuracy: 93.9800 | Early stopping counter: 4 out of 15 | T_time: 00:05:44, T_speed: 0.087
[Epoch  31] T_loss: 0.09512, T_accuracy: 97.3400 | V_loss: 0.20508, V_accuracy: 94.2600 | Early stopping counter: 5 out of 15 | T_time: 00:05:54, T_speed: 0.088
[Epoch  32] T_loss: 0.09247, T_accuracy: 97.3818 | V_loss: 0.20972, V_accuracy: 93.8400 | Early stopping counter: 6 out of 15 | T_time: 00:06:05, T_speed: 0.088
[Epoch  33] T_loss: 0.08774, T_accuracy: 97.5145 | V_loss: 0.20642, V_accuracy: 94.0000 | Early stopping counter: 7 out of 15 | T_time: 00:06:16, T_speed: 0.088
[Epoch  34] T_loss: 0.08597, T_accuracy: 97.5327 | V_loss: 0.20592, V_accuracy: 94.1600 | Early stopping counter: 8 out of 15 | T_time: 00:06:27, T_speed: 0.088
[Epoch  35] T_loss: 0.08400, T_accuracy: 97.6564 | V_loss: 0.20528, V_accuracy: 94.2000 | Early stopping counter: 9 out of 15 | T_time: 00:06:38, T_speed: 0.088
[Epoch  36] T_loss: 0.08207, T_acc

0,1
Epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇██
Training accuracy (%),▁▅▆▆▇▇▇▇▇▇▇▇▇▇▇█████████████████████████
Training loss,█▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Training speed (epochs/sec.),▃▃▃▂▂▁▁▂▁▁▁▂▂▃▃▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇█████████
Validation accuracy (%),▁▅▆▇▇▇▇▇▇███████████████████████████████
Validation loss,█▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Epoch,41.0
Training accuracy (%),97.86545
Training loss,0.07679
Training speed (epochs/sec.),0.08817
Validation accuracy (%),94.22
Validation loss,0.21


# 정리
* 22와 23번 시도에서 원하는 성능(validation >=94, test>=93)을 얻음
    - 22: validation_accuracy: 94.0000, test_accuracy: 93.220%
    - 23: validation_accuracy: 94.2600, test_accuracy: 93.660%
* 여러번의 시도를 하나의 main cell에서 시행해서 23의 훈련 과정의 출력이 나와았지만 22의 훈련 과정은 출력이 없음 -> 밑의 cell에 wandb에 기록된 결과를 가져옴
* test과정은 개인 컴퓨터에서 진행
    - 이유: Backend.AI에서 test 진행할 경우 세션이 종료
    - 훈련과정만 이 노트북에 있고, 이후의 과정은 다른 노트북(homework3_test.ipynb) 에서 진행
    
-------------------------------------------------------------------------- 
# 하이퍼파라미터
## 22
    epochs = 1000
    batch_size = 4096
    validation_intervals = 5
    learning_rate = 1e-3
    early_stop_patience = 5
    early_stop_delta = 0.0001
    weight_decay = 0.001
    dropout = 0.5

## 23
    epochs = 1000
    batch_size = 4096
    validation_intervals = 1
    learning_rate = 1e-3
    early_stop_patience = 15
    early_stop_delta = 0.0001
    weight_decay = 0.001
    dropout = 0.5

## 22번 훈련 과정(wandb 기록)
     1 Training on device cuda:0.
     2 Num Train Samples:  55000
     3 Num Validation Samples:  5000
     4 Sample Shape:  torch.Size([1, 28, 28])
     5 Number of Data Loading Workers: 2
     6 ==========================================================================================
     7 Layer (type:depth-idx)                   Output Shape              Param #
     8 ==========================================================================================
     9 VGG                                      [1, 10]                   --
    10 ├─Sequential: 1-1                        [1, 10]                   --
    11 │    └─Sequential: 2-1                   [1, 128, 14, 14]          --
    12 │    │    └─Conv2d: 3-1                  [1, 128, 28, 28]          1,280
    13 │    │    └─BatchNorm2d: 3-2             [1, 128, 28, 28]          256
    14 │    │    └─ReLU: 3-3                    [1, 128, 28, 28]          --
    15 │    │    └─Conv2d: 3-4                  [1, 128, 28, 28]          147,584
    16 │    │    └─BatchNorm2d: 3-5             [1, 128, 28, 28]          256
    17 │    │    └─ReLU: 3-6                    [1, 128, 28, 28]          --
    18 │    │    └─Conv2d: 3-7                  [1, 128, 28, 28]          147,584
    19 │    │    └─BatchNorm2d: 3-8             [1, 128, 28, 28]          256
    20 │    │    └─ReLU: 3-9                    [1, 128, 28, 28]          --
    21 │    │    └─MaxPool2d: 3-10              [1, 128, 14, 14]          --
    22 │    └─Sequential: 2-2                   [1, 256, 7, 7]            --
    23 │    │    └─Conv2d: 3-11                 [1, 256, 14, 14]          295,168
    24 │    │    └─BatchNorm2d: 3-12            [1, 256, 14, 14]          512
    25 │    │    └─ReLU: 3-13                   [1, 256, 14, 14]          --
    26 │    │    └─Conv2d: 3-14                 [1, 256, 14, 14]          590,080
    27 │    │    └─BatchNorm2d: 3-15            [1, 256, 14, 14]          512
    28 │    │    └─ReLU: 3-16                   [1, 256, 14, 14]          --
    29 │    │    └─Conv2d: 3-17                 [1, 256, 14, 14]          590,080
    30 │    │    └─BatchNorm2d: 3-18            [1, 256, 14, 14]          512
    31 │    │    └─ReLU: 3-19                   [1, 256, 14, 14]          --
    32 │    │    └─MaxPool2d: 3-20              [1, 256, 7, 7]            --
    33 │    └─Flatten: 2-3                      [1, 12544]                --
    34 │    └─Linear: 2-4                       [1, 256]                  3,211,520
    35 │    └─ReLU: 2-5                         [1, 256]                  --
    36 │    └─Dropout: 2-6                      [1, 256]                  --
    37 │    └─Linear: 2-7                       [1, 128]                  32,896
    38 │    └─ReLU: 2-8                         [1, 128]                  --
    39 │    └─Linear: 2-9                       [1, 64]                   8,256
    40 │    └─ReLU: 2-10                        [1, 64]                   --
    41 │    └─Dropout: 2-11                     [1, 64]                   --
    42 │    └─Linear: 2-12                      [1, 10]                   650
    43 ==========================================================================================
    44 Total params: 5,027,402
    45 Trainable params: 5,027,402
    46 Non-trainable params: 0
    47 Total mult-adds (M): 524.84
    48 ==========================================================================================
    49 Input size (MB): 0.00
    50 Forward/backward pass size (MB): 7.23
    51 Params size (MB): 20.11
    52 Estimated Total Size (MB): 27.34
    53 ==========================================================================================
    54 [Epoch   1] T_loss: 1.68430, T_accuracy: 36.5564 | V_loss: 0.94141, V_accuracy: 69.7800 | Early stopping is stated! | T_time: 00:00:15, T_speed: 0.067
    55 [Epoch   5] T_loss: 0.42778, T_accuracy: 85.6600 | V_loss: 0.33866, V_accuracy: 89.5600 | V_loss decreased ( 0.941 -->  0.339). Saving model.. | T_time: 00:01:14, T_speed: 0.068

## 과정 기록
----------------------------------------------------------------------
# 1
### 10_a,b 코드 거의 복붙
#### optimizer = optim.SDG
### 하이퍼파라미터
   * epochs = 1000
   * batch_size = 2048
   * validation_intervals = 10
   * learning_rate = 1e-3
   * early_stop_patience = 10
   * early_stop_delta = 0.0001
  
### 결과    
   * T_loss: 0.44936, T_accuracy: 83.7200 | V_loss: 0.46497, V_accuracy: 83.1400  
### test
   * Num Test Samples:  10000
   * Sample Shape:  torch.Size([1, 28, 28])
   * MODEL FILE: /home/work/DL/3/checkpoints/FashionMNIST_checkpoint_latest.pt
   * TEST RESULTS: 82.980%


# 2 
### 1에서 optimizer를 Adam으로 교체
### 하이퍼파라미터
   * epochs = 1000
   * batch_size = 2048
   * validation_intervals = 10
   * learning_rate = 1e-3
   * early_stop_patience = 10
   * early_stop_delta = 0.0001

### 결과    
   * T_loss: 0.12889, T_accuracy: 95.3745 | V_loss: 0.32309, V_accuracy: 90.0400 | Early stopping counter: 10 out of 10*** TRAIN EARLY STOPPED! *** | T_time: 00:09:07, T_speed: 0.384
### test
   * Num Test Samples:  10000
   * Sample Shape:  torch.Size([1, 28, 28])
   * MODEL FILE: /home/work/DL/3/checkpoints/FashionMNIST_checkpoint_latest.pt
   * TEST RESULTS: 89.700%


# 3
### 2에서 weight_decay 추가
### 하이퍼파라미터
   * epochs = 1000
   * batch_size = 2048
   * validation_intervals = 10
   * learning_rate = 1e-3
   * early_stop_patience = 10
   * early_stop_delta = 0.0001
   * weight_decay = 0.001
   
### 결과
   * T_loss: 0.14863, T_accuracy: 94.7127 | V_loss: 0.28628, V_accuracy: 89.7200 | Early stopping counter: 10 out of 10*** TRAIN EARLY STOPPED! *** | T_time: 00:07:52, T_speed: 0.424
### test
   * Num Test Samples:  10000
   * Sample Shape:  torch.Size([1, 28, 28])
   * MODEL FILE: /home/work/DL/3/checkpoints/FashionMNIST_checkpoint_latest.pt
   * TEST RESULTS: 89.590%

# 4
### 2에서 dropout 추가
### 하이퍼파라미터
   * epochs = 1000
   * batch_size = 2048
   * validation_intervals = 10
   * learning_rate = 1e-3
   * early_stop_patience = 10
   * early_stop_delta = 0.0001
   * dropout(p = 0.5)

### 결과
   * T_loss: 0.29177, T_accuracy: 89.2764 | V_loss: 0.25406, V_accuracy: 90.9400 | Early stopping counter: 10 out of 10*** TRAIN EARLY STOPPED! *** | T_time: 00:29:59, T_speed: 0.434
  
### test
   * Num Test Samples:  10000
   * Sample Shape:  torch.Size([1, 28, 28])
   * MODEL FILE: /home/work/DL/3/checkpoints/FashionMNIST_checkpoint_2024-11-20_12-44-29_latest.pt
   * TEST RESULTS: 90.520%
  
----------------------------------------------------------
### 결과쪽에서 stop된 곳을 기준으로 하는 것이 아니라 Early stoppint counter가 올라가기 직전의 결과를 가져와야 맞는 것임을 생각 못하고 마지막 출력을 가져옴
--------------------------------------------------------------

# 5
### 2에서 decay와 dropout 추가
### 하이퍼파라미터
   * epochs = 1000
   * batch_size = 2048
   * validation_intervals = 10
   * learning_rate = 1e-3
   * early_stop_patience = 10
   * early_stop_delta = 0.0001
   * weight_decay = 0.001
   * dropout(p = 0.5)

### 결과
   * T_loss: 0.33893, T_accuracy: 87.8073 | V_loss: 0.26139, V_accuracy: 90.7000 | V_loss decreased ( 0.265 -->  0.261). Saving model.. | T_time: 00:18:25, T_speed: 0.434
  
### test
   * Num Test Samples:  10000
   * Sample Shape:  torch.Size([1, 28, 28])
   * MODEL FILE: /home/work/DL/3/checkpoints/FashionMNIST_checkpoint_2024-11-20_13-32-16_latest.pt
   * TEST RESULTS: 89.440%
  
# 6
### 5에서 Batch Normalization 추가
### 하이퍼파라미터 동일
   * T_loss: 0.33092, T_accuracy: 88.2436 | V_loss: 0.24330, V_accuracy: 90.6000 | V_loss decreased ( 0.244 -->  0.243). Saving model.. | T_time: 00:15:23, T_speed: 0.433

### test  
   * Num Test Samples:  10000
   * Sample Shape:  torch.Size([1, 28, 28])
   * MODEL FILE: /home/work/DL/3/checkpoints/FashionMNIST_checkpoint_2024-11-20_14-15-44_latest.pt
   * TEST RESULTS: 89.670%
 
# 7
### 6에서 하이퍼파라미터 조절해보기
## 1
### 하이퍼파라미터
   * epochs = 1000
   * batch_size = 2048
   * validation_intervals = 10
   * learning_rate = 1e-3
   * early_stop_patience = 30
   * early_stop_delta = 0.01
   * weight_decay = 0.001
   * dropout = 0.3

### 결과
   * T_loss: 0.25547, T_accuracy: 90.8582 | V_loss: 0.24847, V_accuracy: 90.9600 | V_loss decreased ( 0.260 -->  0.248). Saving model.. | T_time: 00:12:45, T_speed: 0.431
 
### test
   * Num Test Samples:  10000
   * Sample Shape:  torch.Size([1, 28, 28])
   * MODEL FILE: /home/work/DL/3/checkpoints/FashionMNIST_checkpoint_2024-11-20_16-24-01_latest.pt
   * TEST RESULTS: 90.090%
# 8
### vgg이용
   * block_info = ((2, 64), (2, 128)), n_output = 10
### 하이퍼파라미터
### 결과
   * T_loss: 0.24728, T_accuracy: 91.6636 | V_loss: 0.21479, V_accuracy: 92.8000 | V_loss decreased ( 0.258 -->  0.215). Saving model.. | T_time: 00:01:01, T_speed: 0.328
### test
   * Num Test Samples:  10000
   * Sample Shape:  torch.Size([1, 28, 28])
   * MODEL FILE: /home/work/DL/3/checkpoints/FashionMNIST_checkpoint_2024-11-21_07-33-02_latest.pt
   * /usr/local/lib/python3.10/dist-packages/torch/nn/modules/lazy.py:181: UserWarning: Lazy modules are a new feature under heavy development so changes to the API or functionality can happen at any moment.
   * warnings.warn('Lazy modules are a new feature under heavy development '
   
# 9
### 8에서 learning rate decay(lr_scheduler.StepLR)
### 하이퍼파라미터
   * epochs = 1000
   * batch_size = 2048
   * validation_intervals = 10
   * learning_rate = 1e-3
   * early_stop_patience = 30
   * early_stop_delta = 0.0001
   * weight_decay = 0.001
   * dropout = 0.5
### 결과
   * T_loss: 0.26904, T_accuracy: 91.0345 | V_loss: 0.25182, V_accuracy: 91.3400 | V_loss decreased ( 0.252 -->  0.252). Saving model.. | T_time: 00:02:29, T_speed: 0.336
### 테스트
   * The kernel appears to have died. It will restart automatically.

# 10
### 9에서 LazyConv에서 일반 Conv로 변경해서 실행
   * The kernel appears to have died. It will restart automatically.
   * vgg자체가 문제?
   
# 11
### 10에서 Learning rate decay 제거

   * T_loss: 0.20541, T_accuracy: 93.0745 | V_loss: 0.21959, V_accuracy: 92.7800 | V_loss decreased ( 0.235 -->  0.220). Saving model.. | T_time: 00:01:30, T_speed: 0.333
   * 똑같이 test가 안나옴
   * learning rate decay 문제도 아니고, LazyConv도 아니고... 
   * 뤼튼에 'The kernel appears to have died. It will restart automatically.' 검색 결과 메모리의 문제일 수있다 해서 colab에서 진행해봄
# 12
### colab으로 이동해서 실행
   * T_loss: 0.16558, T_accuracy: 94.3345 | V_loss: 0.21178, V_accuracy: 93.2400 | V_loss decreased ( 0.213 -->  0.212). Saving model.. | T_time: 00:04:56, T_speed: 0.135
   * TEST RESULTS: 92.680%
   * colab에서는 무리 없이 되는 것으로 보아 메모리 문제일 확률이 높아 보임
# 13
### 12에서 learning rate decay추가 단 eraly stop count가 있을 경우에만 count해서 
   * T_loss: 0.13224, T_accuracy: 95.3345 | V_loss: 0.20729, V_accuracy: 93.5800 | V_loss decreased ( 0.210 -->  0.207). Saving model.. | T_time: 00:07:29, T_speed: 0.134
   * TEST RESULTS: 92.770% 
   * 성능은 좋아졌지만 원하는 성능에는 부족
   * learning rate decay를 validation을 확인하는 위치를 기준으로 step하기, 하이퍼파라미터만 변경해보기
   * 여태 확인한 것으로 early_stop_patience이 커 봤자 train_loss만 더 작아지는 overfitting이 확인되고, validation_loss는 작아지지 않음 -> 처음 있던 10까지만 확인 해도 좋을 것 으로 보인다.
# 14
### learning rate decay 위치 do_validation 다음으로 변경
   * T_loss: 0.17335, T_accuracy: 94.0309 | V_loss: 0.20633, V_accuracy: 93.1600 | V_loss decreased ( 0.221 -->  0.206). Saving model.. | T_time: 00:04:59, T_speed: 0.134
   * TEST RESULTS: 92.640%
   * 마찬가지로 early_stop_patience을 그대로 둔 상태로 상황을 보니 overfitting 상황발생-> 시간만 잡아먹는다. -> 다음부터는 줄여서 시행
   * 1000이전에 early_stop이 발생하므로 epochs을 높일 필요는 없어 보임
     
# 15
### 하이퍼파라미터 조절
#### dropout =0.5 -> 0.3 (이전까지는 그냥 모델에 0.5로 박은상태로 진행해서 wandb에 잘못 적힘)
#### weight_decay = 0.001 -> 0.01
#### early_stop_patience = 30 -> 5
   * T_loss: 0.28774, T_accuracy: 90.7218 | V_loss: 0.26197, V_accuracy: 90.3800 | V_loss decreased ( 0.262 -->  0.262). Saving model.. | T_time: 00:53:35, T_speed: 0.134
   * TEST RESULTS: 90.060%

   * 학습 효과가 이전보다 좋지 못함 하지만, 한번 early stop count가 한 이후 거의 끝나는 이전과 달리 count했다가 validation이 줄어들고가 많이 반복됨 -> dropout을 다시 0.5로 올려서 dropout과 weight_decay중 어느 것이 영향을 주었는지 확인
   
# 16
### dropout 0.5로 상승
### colab 사용시간으로 다시 학교 backAI로 돌아옴
   * T_loss: 0.27838, T_accuracy: 90.9309 | V_loss: 0.25139, V_accuracy: 90.9600 | V_loss decreased ( 0.252 -->  0.251). Saving model.. | T_time: 00:22:53, T_speed: 0.335
   * TEST RESULTS: 90.270%
   * 별 차이 없음 -> weight_decay가 높아져서 성능이 낮아졌다. 하지만, Train과 validation의 차이는 줄어듬 -> 일반화 성능이 올랐다라고 봐도 되나?

# 17
### 모델 변경 -> vgg블록 추가해서
   *  T_loss: 2.30260, T_accuracy: 10.0055 | V_loss: 2.30268, V_accuracy: 9.9400 | V_loss decreased ( 2.304 -->  2.303). Saving model.. | T_time: 00:00:32, T_speed: 0.312
   * 시작부터 2점대의 Loss로 시작 -> 모델 구조는 기존것이 더 좋아 보임
   
# 18
### 모델에 vgg에 batch norm추가
   * T_loss: 0.12093, T_accuracy: 96.8564 | V_loss: 0.20334, V_accuracy: 93.1200 | V_loss decreased ( 0.208 -->  0.203). Saving model.. | T_time: 00:07:04, T_speed: 0.165
   * TEST RESULTS: 92.300%

# 19
### 모델 구조 변경. flatten 이후를 변경
   * T_loss: 0.16988, T_accuracy: 95.0764 | V_loss: 0.21688, V_accuracy: 92.5600 | V_loss decreased ( 0.252 -->  0.217). Saving model.. | T_time: 00:05:50, T_speed: 0.143
   
# 20
### batch 를 4096으로 늘림, weight_decay를 0.001로 돌림
   * T_loss: 0.17377, T_accuracy: 94.3182 | V_loss: 0.20258, V_accuracy: 92.8400 | V_loss decreased ( 0.241 -->  0.203). Saving model.. | T_time: 00:03:00, T_speed: 0.111
   * TEST RESULTS: 92.620%
   
# 21
### VGG BLOCK의 CONV개수를 3개로
   * T_loss: 0.16957, T_accuracy: 94.4545 | V_loss: 0.20401, V_accuracy: 92.2400 | V_loss decreased ( 0.220 -->  0.204). Saving model.. | T_time: 00:04:39, T_speed: 0.072
   * early stop이 count된 다음에 intervals 10이 의미가 있는가?
   * intervals를 낮춰보자

# 22
### intervals를 5로 낮춤
   * T_loss: 0.12988, T_accuracy: 95.9182 | V_loss: 0.18195, V_accuracy: 94.0000 | V_loss decreased ( 0.200 -->  0.182). Saving model.. | T_time: 00:06:09, T_speed: 0.068
   * TEST RESULTS: 93.220%
   
# 23
### intervals를 1로 낮추는 대신에 early stop을 15로 늘림
   * T_loss: 0.11290, T_accuracy: 96.5709 | V_loss: 0.19737, V_accuracy: 94.2600 | V_loss decreased ( 0.200 -->  0.197). Saving model.. | T_time: 00:05:00, T_speed: 0.087
   * TEST RESULTS: 93.660%