# PREP

In [1]:
import torch
from torchvision import transforms as T
from lightly.transforms import SimCLRTransform, DINOTransform, MAETransform, MoCoV2Transform, utils
from datasets import create_dataset
from models import MAEModel
import pytorch_lightning as pl
import os
import copy
import gc
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

from torchvision import transforms as T
from lightly.transforms import SimCLRTransform, DINOTransform, MAETransform, MoCoV2Transform, utils
from datasets import create_dataset
from models import MAEModel
import pytorch_lightning as pl
import os
import copy
import gc
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader


In [3]:
from pytorch_lightning.callbacks import Callback
import os

class SaveAtEpochsCallback(Callback):
    """Lightning callback that writes a checkpoint after selected epochs.

    Args:
        save_epochs: Iterable of 1-based epoch numbers after which a
            checkpoint is written.
        dirpath: Directory that receives the checkpoint files; it is created
            when the callback is constructed if it does not exist yet.
    """

    def __init__(self, save_epochs, dirpath="checkpoints"):
        super().__init__()
        self.dirpath = dirpath
        # Stored as a set: duplicates collapse and the per-epoch lookup is cheap.
        self.save_epochs = set(save_epochs)
        os.makedirs(dirpath, exist_ok=True)

    def on_train_epoch_end(self, trainer, pl_module):
        """Save ``model_epoch_<n>.ckpt`` if the epoch that just ended is selected."""
        # trainer.current_epoch is 0-based, while save_epochs is 1-based.
        current_epoch = trainer.current_epoch + 1
        if current_epoch not in self.save_epochs:
            return

        path = os.path.join(self.dirpath, f"model_epoch_{current_epoch}.ckpt")
        trainer.save_checkpoint(path)
        print(f"Zapisano model po epoce {current_epoch}: {path}")


In [4]:
SEED = 42


def seed_everything(seed: int = 42):
    """Seed the random number generators and switch cuDNN to deterministic mode.

    Args:
        seed: Value used for every generator touched below.
    """
    # `pytorch_lightning` seeds `random`, NumPy and torch in this single call,
    # so those generators are not seeded one by one here.
    pl.seed_everything(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

    # CUDA generators are seeded explicitly as well, in case the Lightning call
    # above does not already cover them.
    for seed_cuda in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_cuda(seed)

    # Deterministic kernels, no auto-tuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


seed_everything(SEED)

Seed set to 42


In [5]:
from lightly.transforms import MAETransform

# ImageNet mean/std, shared by every pipeline in this cell.
imagenet_stats = {key: utils.IMAGENET_NORMALIZE[key] for key in ("mean", "std")}

# MAE transform from lightly, normalised with the ImageNet statistics.
mae_transform = MAETransform(input_size=224, normalize=dict(imagenet_stats))

# For supervised training.
scratch_transform = T.v2.Compose(
    [
        T.RandomResizedCrop((224, 224)),
        T.RandomHorizontalFlip(),
        T.v2.ToImage(),
        T.v2.ToDtype(torch.float32, scale=True),
        T.Normalize(**imagenet_stats),
    ]
)

# For test / evaluation.
test_transform = T.v2.Compose(
    [
        T.Resize((224, 224)),
        T.v2.ToImage(),
        T.v2.ToDtype(torch.float32, scale=True),
        T.Normalize(**imagenet_stats),
    ]
)



In [6]:
# Directory handed to create_dataset as `path_to_data`.
from_path = "./data"
# Proportion passed as `SSL_proportion`; with 0.9 the run below reports
# 45000 SSL samples and 5000 classification samples out of 50000.
SSL_proportion = 0.9

# create_dataset returns four datasets: the full train set, the SSL split,
# the classification split and the test set.
(
    train_full_dataset_MAE,
    train_ssl_dataset_MAE,
    train_dataset_MAE,
    test_dataset_MAE,
) = create_dataset(
    set_name='CIFAR100',
    SSL_proportion=SSL_proportion,
    train_transform=scratch_transform,
    train_full_transform=mae_transform.transform,
    test_transform=test_transform,
    path_to_data=from_path,
    # Reuse the notebook-wide SEED so the split follows the global seed
    # instead of a second hard-coded 42.
    seed=SEED,
    download=True,
)


100%|██████████| 169M/169M [02:00<00:00, 1.40MB/s] 


Length of entire train dataset:  50000
Length of SSL train dataset:  45000
Length of classification train dataset:  5000
Length of test dataset:  10000


In [7]:
# Training configuration shared by the cells below.
BATCH_SIZE = 128
NUM_EPOCHS = 20
LEARNING_RATE = 0.001
NUM_WORKERS = 4
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# Garbage collection matters here: without it the GPU runs out of VRAM.
# `device` is only ever "cuda" or "cpu", so the check has to compare against
# "cuda" (the previous "gpu" comparison never matched and the cleanup never ran).
if device == "cuda":
    # Collect Python garbage first so tensors that are no longer referenced
    # are released before the CUDA cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()

Using device: cuda


In [8]:
# Settings common to both loaders.
loader_options = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    persistent_workers=True,
)

# Shuffled loader over the SSL split, used for training.
train_loader = DataLoader(train_ssl_dataset_MAE, shuffle=True, **loader_options)
# Unshuffled loader over the test dataset, used as the validation loader.
val_loader = DataLoader(test_dataset_MAE, shuffle=False, **loader_options)

In [9]:
# Model 1: `pretrained_resnet18` backbone, learning rate from LEARNING_RATE
# (0.001 as set in the configuration cell).
# NOTE(review): max_epochs=50 here while the trainers are configured for
# 20 epochs - confirm the mismatch is intended.
model1 = MAEModel(
    backbone_type="pretrained_resnet18",
    input_dim=3 * 224 * 224,
    mask_ratio=0.75,
    lr=LEARNING_RATE,
    weight_decay=1e-4,
    max_epochs=50,
)




In [10]:

# Model 2: `random_resnet18` backbone, learning rate from LEARNING_RATE
# (0.001 as set in the configuration cell).
# NOTE(review): max_epochs=50 here while the trainers are configured for
# 20 epochs - confirm the mismatch is intended.
model2 = MAEModel(
    backbone_type="random_resnet18",
    input_dim=3 * 224 * 224,
    mask_ratio=0.75,
    lr=LEARNING_RATE,
    weight_decay=1e-4,
    max_epochs=50,
)




In [11]:
# Learning rate used by the model3/model4 cells that follow.
# NOTE: this rebinds the constant from the configuration cell (0.001), so
# re-running the model1/model2 cells after this one builds them with 0.01 too.
LEARNING_RATE = 0.01

In [12]:

# Model 3: `pretrained_resnet18` backbone, learning rate from LEARNING_RATE
# (rebound to 0.01 in the preceding cell).
# NOTE(review): max_epochs=50 here while the trainers are configured for
# 20 epochs - confirm the mismatch is intended.
model3 = MAEModel(
    backbone_type="pretrained_resnet18",
    input_dim=3 * 224 * 224,
    mask_ratio=0.75,
    lr=LEARNING_RATE,
    weight_decay=1e-4,
    max_epochs=50,
)


In [13]:
# Model 4: `random_resnet18` backbone, learning rate from LEARNING_RATE
# (rebound to 0.01 two cells earlier).
# NOTE(review): max_epochs=50 here while the trainers are configured for
# 20 epochs - confirm the mismatch is intended.
model4 = MAEModel(
    backbone_type="random_resnet18",
    input_dim=3 * 224 * 224,
    mask_ratio=0.75,
    lr=LEARNING_RATE,
    weight_decay=1e-4,
    max_epochs=50,
)

In [14]:
# Checkpoints after epochs 10, 15 and 20 for the run that the training cell
# pairs with model1.
checkpoint_callback_1 = SaveAtEpochsCallback(
    save_epochs=[10, 15, 20],
    dirpath="checkpoints/lr_0.001_mae_cifar100",
)

trainer1 = pl.Trainer(
    # Use the NUM_EPOCHS constant from the configuration cell (20) instead of
    # repeating the literal in every trainer cell.
    max_epochs=NUM_EPOCHS,
    callbacks=[checkpoint_callback_1],
    accelerator='auto',
    devices=1,
    log_every_n_steps=10,
)


Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
d:\aaaaaaaaaaaaaaaaaaaaaaaaaaaaa\Warsztaty_Badawcze\.conda\lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [15]:

# Checkpoints after epochs 10, 15 and 20 for the run that the training cell
# pairs with model2.
checkpoint_callback_2 = SaveAtEpochsCallback(
    save_epochs=[10, 15, 20],
    dirpath="checkpoints/lr_0.001_mae_cifar100_random",
)

trainer2 = pl.Trainer(
    # Use the NUM_EPOCHS constant from the configuration cell (20) instead of
    # repeating the literal in every trainer cell.
    max_epochs=NUM_EPOCHS,
    callbacks=[checkpoint_callback_2],
    accelerator='auto',
    devices=1,
    log_every_n_steps=10,
)

Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [16]:

# Checkpoints after epochs 10, 15 and 20 for the run that the training cell
# pairs with model3.
checkpoint_callback_3 = SaveAtEpochsCallback(
    save_epochs=[10, 15, 20],
    dirpath="checkpoints/lr_0.01_mae_cifar100",
)

trainer3 = pl.Trainer(
    # Use the NUM_EPOCHS constant from the configuration cell (20) instead of
    # repeating the literal in every trainer cell.
    max_epochs=NUM_EPOCHS,
    callbacks=[checkpoint_callback_3],
    accelerator='auto',
    devices=1,
    log_every_n_steps=10,
)

Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [17]:

# Checkpoints after epochs 10, 15 and 20 for the run that the training cell
# pairs with model4.
checkpoint_callback_4 = SaveAtEpochsCallback(
    save_epochs=[10, 15, 20],
    dirpath="checkpoints/lr_0.01_mae_cifar100_random",
)

trainer4 = pl.Trainer(
    # Use the NUM_EPOCHS constant from the configuration cell (20) instead of
    # repeating the literal in every trainer cell.
    max_epochs=NUM_EPOCHS,
    callbacks=[checkpoint_callback_4],
    accelerator='auto',
    devices=1,
    log_every_n_steps=10,
)

Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


# TRAIN

In [18]:
# Fit the two runs of this cell back to back on the shared loaders:
# model1 with trainer1 first, then model4 with trainer4.
for run_trainer, run_model in ((trainer1, model1), (trainer4, model4)):
    run_trainer.fit(run_model, train_dataloaders=train_loader, val_dataloaders=val_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | encoder | MAEEncoder | 11.2 M | train
1 | decoder | MAEDecoder | 787 K  | train
-----------------------------------------------
12.0 M    Trainable params
0         Non-trainable params
12.0 M    Total params
47.858    Total estimated model params size (MB)
75        Modules in train mode
0         Modules in eval mode


Epoch 9: 100%|██████████| 352/352 [18:52<00:00,  0.31it/s, v_num=4, train_loss=5.16e-6, val_loss=0.00529, val_ari=0.000352, val_nmi=0.00949, train_ari=4.18e-6, train_nmi=0.00115]Zapisano model po epoce 10: checkpoints/lr_0.001_mae_cifar100\model_epoch_10.ckpt
Epoch 14: 100%|██████████| 352/352 [19:21<00:00,  0.30it/s, v_num=4, train_loss=9.49e-7, val_loss=0.00169, val_ari=0.000952, val_nmi=0.0159, train_ari=-6.8e-6, train_nmi=0.00112] Zapisano model po epoce 15: checkpoints/lr_0.001_mae_cifar100\model_epoch_15.ckpt
Epoch 19: 100%|██████████| 352/352 [19:17<00:00,  0.30it/s, v_num=4, train_loss=2.52e-6, val_loss=9.66e-5, val_ari=0.000415, val_nmi=0.00945, train_ari=-1.44e-6, train_nmi=0.00122]Zapisano model po epoce 20: checkpoints/lr_0.001_mae_cifar100\model_epoch_20.ckpt
Epoch 19: 100%|██████████| 352/352 [19:18<00:00,  0.30it/s, v_num=4, train_loss=2.52e-6, val_loss=9.66e-5, val_ari=0.000415, val_nmi=0.00945, train_ari=-1.44e-6, train_nmi=0.00122]

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 352/352 [19:27<00:00,  0.30it/s, v_num=4, train_loss=2.52e-6, val_loss=9.66e-5, val_ari=0.000415, val_nmi=0.00945, train_ari=-1.44e-6, train_nmi=0.00122]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | encoder | MAEEncoder | 11.2 M | train
1 | decoder | MAEDecoder | 787 K  | train
-----------------------------------------------
12.0 M    Trainable params
0         Non-trainable params
12.0 M    Total params
47.858    Total estimated model params size (MB)
75        Modules in train mode
0         Modules in eval mode


Epoch 9: 100%|██████████| 352/352 [27:51<00:00,  0.21it/s, v_num=5, train_loss=6.44e-6, val_loss=6.5e-6, val_ari=0.000168, val_nmi=0.0213, train_ari=0.000136, train_nmi=0.00396] Zapisano model po epoce 10: checkpoints/lr_0.01_mae_cifar100_random\model_epoch_10.ckpt
Epoch 14: 100%|██████████| 352/352 [27:41<00:00,  0.21it/s, v_num=5, train_loss=4.62e-8, val_loss=3.18e-7, val_ari=5.74e-5, val_nmi=0.0131, train_ari=5.38e-5, train_nmi=0.00485] Zapisano model po epoce 15: checkpoints/lr_0.01_mae_cifar100_random\model_epoch_15.ckpt
Epoch 19: 100%|██████████| 352/352 [28:06<00:00,  0.21it/s, v_num=5, train_loss=1.76e-8, val_loss=5.38e-8, val_ari=3.2e-5, val_nmi=0.0105, train_ari=6.62e-6, train_nmi=0.002]    Zapisano model po epoce 20: checkpoints/lr_0.01_mae_cifar100_random\model_epoch_20.ckpt
Epoch 19: 100%|██████████| 352/352 [28:07<00:00,  0.21it/s, v_num=5, train_loss=1.76e-8, val_loss=5.38e-8, val_ari=3.2e-5, val_nmi=0.0105, train_ari=6.62e-6, train_nmi=0.002]

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 352/352 [28:14<00:00,  0.21it/s, v_num=5, train_loss=1.76e-8, val_loss=5.38e-8, val_ari=3.2e-5, val_nmi=0.0105, train_ari=6.62e-6, train_nmi=0.002]


In [None]:

# Fit the remaining two runs back to back on the shared loaders:
# model3 with trainer3 first, then model2 with trainer2.
for run_trainer, run_model in ((trainer3, model3), (trainer2, model2)):
    run_trainer.fit(run_model, train_dataloaders=train_loader, val_dataloaders=val_loader)
