## In this notebook an ensembled model is created and trained
Two ensembling options are available:
- Combine two RIS1 models, where the second model receives an image from a preceding time step.
- Combine RIS1 and RIS2 models.

In [1]:
import os
from pathlib import Path
import matplotlib.pyplot as plt
from torch import cuda
import confinement_mode_classifier as cmc
from datetime import datetime
import time 
import torchvision
import torch
from torch.optim import lr_scheduler
from tqdm.notebook import tqdm
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter

import re

# Directory the kernel was started in; data and run folders are resolved against it.
path = Path.cwd()
# Use the first CUDA GPU when one is available, otherwise run on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Seed set to 42


Device: cuda:0


In [9]:

data_dir_path = f'{path}/data/LH_alpha'
# os.listdir returns entries in arbitrary order; sort so every run sees the same sequence.
file_names = sorted(os.listdir(data_dir_path))

batch_size = 16

# Choose which shots will be used in the notebook.
# Entries without a 'shot_<digits>' part (hidden files, checkpoints folders, ...) are skipped
# instead of crashing on `None.group(1)`.
shot_pattern = re.compile(r'shot_(\d+)')
shot_matches = (shot_pattern.search(file_name) for file_name in file_names)
shot_numbers = [match.group(1) for match in shot_matches if match is not None]

# Removed shots have different dimensions -> useless for training
removed_shots = ['19915', '19925', '13182', '20009', '20112',
                 '20143', '20145', '20146', '20147', '16989',
                 '16987', '20144', '18263', '18267', '18266',
                 '18279', '20098', '18260', '18200', '18268', '18261']
shot_numbers = [valid_shot for valid_shot in shot_numbers if valid_shot not in removed_shots]
shots_for_testing = ['18130', '16773', '16534', '19094', '18133']
shots_for_validation = ['16769', '19379', '18057', '18132']


shot_df, test_df, val_df, train_df = cmc.load_and_split_dataframes(path, shot_numbers, shots_for_testing, shots_for_validation, use_ELMS=False)

# Get dataloaders. second_img_opt='RIS1' indicates that two RIS1 models will be ensembled.
# NOTE(review): shuffle=False is used for the training loader as well - presumably
# balance_data=True makes cmc.get_dloader install its own sampler; confirm there.
dloader_kwargs = dict(path=path, batch_size=batch_size,
                      shuffle=False, balance_data=True, second_img_opt='RIS1')

test_dataloader = cmc.get_dloader(test_df, **dloader_kwargs)

val_dataloader = cmc.get_dloader(val_df, **dloader_kwargs)

train_dataloader = cmc.get_dloader(train_df, **dloader_kwargs)

# Only the train/val loaders take part in training; the test loader is kept aside.
dataloaders = {'train':train_dataloader, 'val':val_dataloader}
dataset_sizes = {x: len(dataloaders[x].dataset) for x in ['train', 'val']}



In [10]:
import copy

# Start from torchvision's ResNet-18 with ImageNet weights and swap in a 3-way head.
pretrained_model = torchvision.models.resnet18(weights='IMAGENET1K_V1', )
# Parameters of newly constructed modules have requires_grad=True by default
num_ftrs = pretrained_model.fc.in_features
pretrained_model.fc = nn.Linear(num_ftrs, 3) #3 classes: L-mode, H-mode, ELM
pretrained_model = pretrained_model.to(device)

#Load pretrained RIS2 model (only needed for the RIS1 + RIS2 ensembling option)
#ris2_model = copy.deepcopy(pretrained_model)
#ris2_model.load_state_dict(torch.load(f'{path}/runs/17-01-2024, 09-32-27 RIS2 no elms_all_layers/model_fully_trained.pt'))

#Load pretrained model. RIS1 in this case.
# map_location keeps the load working when the checkpoint was saved on a GPU
# but `device` resolved to the CPU on this machine.
ris1_checkpoint = torch.load(f'{path}/runs/18-01-24, 17-50-47 RIS1 no elms_all_layers/model.pt',
                             map_location=device)
pretrained_model.load_state_dict(ris1_checkpoint)

# NOTE(review): the same module instance is passed for both branches - confirm that
# cmc.TwoImagesModel copies it if the two branches are meant to have independent weights.
untrained_ensembled_model = cmc.TwoImagesModel(modelA=pretrained_model, modelB=pretrained_model, hidden_units=30).to(device)

### Freeze all the weights except the classifier's weights and the final `fc` layers of both ResNets

In [11]:

# Names of all parameters of the pretrained backbone.
resnet_params = [param_name for param_name, _ in pretrained_model.named_parameters()]

# A parameter stays trainable when its name contains one of these substrings;
# everything else in the ensembled model is frozen.
trainable_markers = ('classifier', 'fc', 'last_fully_connected')

for name, param in untrained_ensembled_model.named_parameters():
    param.requires_grad = any(marker in name for marker in trainable_markers)

# Sanity check: list every parameter that is still trainable
for name, param in untrained_ensembled_model.named_parameters():
    if not param.requires_grad:
        continue
    print(f"{name}: requires_grad = {param.requires_grad}")

modelA.fc.weight: requires_grad = True
modelA.fc.bias: requires_grad = True
modelB.fc.weight: requires_grad = True
modelB.fc.bias: requires_grad = True
classifier.0.weight: requires_grad = True
classifier.0.bias: requires_grad = True
classifier.2.weight: requires_grad = True
classifier.2.bias: requires_grad = True


In [12]:
# Run label: current local time followed by a free-text comment typed by the user.
run_started = datetime.now().strftime("%d-%m-%y, %H-%M-%S ")
timestamp = run_started + input('add comment: ')
# TensorBoard logs for the classifier-only training stage go to this run folder.
writer = SummaryWriter(f'runs/{timestamp}_classifier_training')

### Train the classifier

In [8]:
# Stage 1: train only the layers left trainable by the freezing cell.
criterion = nn.CrossEntropyLoss()

# Hand Adam only the parameters that are still trainable; frozen ones never receive
# gradients, so passing them to the optimizer serves no purpose.
trainable_params = [p for p in untrained_ensembled_model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=0.01)

# Decay LR by a factor of 0.1 every 2 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)

num_epochs = 6
model_path = Path(f'{path}/runs/{timestamp}_classifier_training/model.pt')
# Make sure the run folder exists before the checkpoint and the final weights are written.
model_path.parent.mkdir(parents=True, exist_ok=True)

ensembled_model = cmc.train_model(untrained_ensembled_model, criterion, optimizer, exp_lr_scheduler, 
                       dataloaders, writer, dataset_sizes, num_epochs=num_epochs, 
                       chkpt_path = model_path.with_name(f'{model_path.stem}_chkpt{model_path.suffix}'))

torch.save(ensembled_model.state_dict(), model_path)

Epoch 1/6
----------




  0%|          | 0/3804 [00:00<?, ?it/s]

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/compass/Shared/Users/bogdanov/.venv/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "/compass/Shared/Users/bogdanov/.venv/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
    return self.collate_fn(data)
  File "/compass/Shared/Users/bogdanov/.venv/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 265, in default_collate
    return collate(batch, collate_fn_map=default_collate_fn_map)
  File "/compass/Shared/Users/bogdanov/.venv/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 127, in collate
    return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
  File "/compass/Shared/Users/bogdanov/.venv/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 127, in <dictcomp>
    return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
  File "/compass/Shared/Users/bogdanov/.venv/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 119, in collate
    return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
  File "/compass/Shared/Users/bogdanov/.venv/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 161, in collate_tensor_fn
    out = elem.new(storage).resize_(len(batch), *list(elem.size()))
RuntimeError: Trying to resize storage that is not resizable


## Train all the weights

In [8]:
# Clear the CUDA cache and reset the peak-memory statistics before the full fine-tuning run
if cuda.is_available():
    # A single call releases the unused cached memory
    cuda.empty_cache()

    # Reset the peak statistics on every visible GPU.
    # (reset_max_memory_allocated is deprecated in favour of reset_peak_memory_stats.)
    for device_index in range(cuda.device_count()):
        cuda.reset_peak_memory_stats(device_index)


criterion = nn.CrossEntropyLoss()

num_epochs = 6


# Rebuild the ensemble and initialise it from the checkpoint of a classifier-only training run.
# map_location keeps the load working when `device` differs from the one the file was saved on.
classifier_checkpoint_path = f'{path}/runs/18-01-2024, 20-17-11 RIS1xRIS1 no elms_classifier_training/model.pt'
# NOTE(review): the same module instance is passed for both branches - confirm that
# cmc.TwoImagesModel copies it if the two branches are meant to have independent weights.
untrained_ensembled_model = cmc.TwoImagesModel(modelA=pretrained_model, modelB=pretrained_model, hidden_units=30).to(device)
untrained_ensembled_model.load_state_dict(torch.load(classifier_checkpoint_path, map_location=device))

writer = SummaryWriter(f'runs/{timestamp}_all_layers')

# Unfreeze every parameter: this stage trains all layers
for name, param in untrained_ensembled_model.named_parameters():
    param.requires_grad = True

optimizer = torch.optim.Adam(untrained_ensembled_model.parameters(), lr=0.001)
# Decay LR by a factor of 0.1 every 2 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)
model_path = Path(f'{path}/runs/{timestamp}_all_layers/model.pt')
# Make sure the run folder exists before the checkpoint and the final weights are written.
model_path.parent.mkdir(parents=True, exist_ok=True)

ensembled_model = cmc.train_model(untrained_ensembled_model, criterion, optimizer, exp_lr_scheduler, 
                                  dataloaders, writer, dataset_sizes, num_epochs=num_epochs,
                                  chkpt_path=model_path.with_name(f'{model_path.stem}_chkpt{model_path.suffix}'))


torch.save(ensembled_model.state_dict(), model_path)



Epoch 1/6
----------


  0%|          | 0/960 [00:00<?, ?it/s]

train Loss: 0.1129 Acc: 0.9773


  0%|          | 0/515 [00:00<?, ?it/s]

val Loss: 0.3720 Acc: 0.8913
Epoch 2/6
----------


  0%|          | 0/960 [00:00<?, ?it/s]

train Loss: 0.0470 Acc: 0.9870


  0%|          | 0/515 [00:00<?, ?it/s]

val Loss: 0.8909 Acc: 0.7913
Epoch 3/6
----------


  0%|          | 0/960 [00:00<?, ?it/s]

train Loss: 0.0215 Acc: 0.9932


  0%|          | 0/515 [00:00<?, ?it/s]

val Loss: 0.3452 Acc: 0.8666
Epoch 4/6
----------


  0%|          | 0/960 [00:00<?, ?it/s]

train Loss: 0.0267 Acc: 0.9924


  0%|          | 0/515 [00:00<?, ?it/s]

val Loss: 0.4024 Acc: 0.8079
Epoch 5/6
----------


  0%|          | 0/960 [00:00<?, ?it/s]

train Loss: 0.0305 Acc: 0.9917


  0%|          | 0/515 [00:00<?, ?it/s]

val Loss: 0.2661 Acc: 0.9095
Epoch 6/6
----------


  0%|          | 0/960 [00:00<?, ?it/s]

train Loss: 0.0262 Acc: 0.9914


  0%|          | 0/515 [00:00<?, ?it/s]

val Loss: 0.3210 Acc: 0.8900
Training complete in 93m 34s
Best val Acc: 0.909478
