# Setup

In [None]:
# Mount Google Drive at /gdrive; every project path used in this notebook
# lives under that mount point. Relies on the google.colab package.
from google.colab import drive
drive.mount('/gdrive')

In [None]:
import os

# Root folder of the project on the mounted Drive; the dataset and checkpoint
# paths defined further down are all built from it.
project_folder = '/gdrive/MyDrive/ProjectCIRI'

# Source tree of the local `ciri_utils` package (installed by the pip cell).
my_module_path = os.path.join(os.path.join(project_folder, 'code'), 'ciri_utils')

In [None]:
!pip install $my_module_path

Processing /gdrive/MyDrive/ProjectCIRI/code/ciri_utils
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch>=2.2.2 (from ciri-utils==0.1)
  Downloading torch-2.2.2-cp310-cp310-manylinux1_x86_64.whl (755.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.5/755.5 MB[0m [31m797.1 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision>=0.17.2 (from ciri-utils==0.1)
  Downloading torchvision-0.17.2-cp310-cp310-manylinux1_x86_64.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ray[tune] (from ciri-utils==0.1)
  Downloading ray-2.10.0-cp310-cp310-manylinux2014_x86_64.whl (65.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.1/65.1 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2.2.2->ciri-utils==0.1)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_6

In [None]:
import ray

from ray import tune
from ciri_utils.engine_v2 import CIRI_trainer

In [None]:
# Paths for the dataset (folder names as they exist under the project folder)
root_base = os.path.join(project_folder, 'Incidents-subset')
root_augmented = os.path.join(project_folder, 'augmented_images')

# Both folders are handed to the trainer through its `data_folders` argument.
data_folders = [root_base, root_augmented]

# Folder where the hyperparameter-tuning results are persisted.
# It is created up front, exactly as the CV and transfer-learning checkpoint
# folders are later in this notebook, so persisting results never depends on
# the folder already existing on Drive.
persistence_path = os.path.join(project_folder, 'checkpoints', 'HPT_resnet_50')
os.makedirs(persistence_path, exist_ok=True)

# Preparation

In [None]:
# Model name passed to CIRI_trainer and reused to build run and file names.
selected_model = "resnet50"

In [None]:
# Trainer used for hyperparameter tuning.
# sample_indices=0.2 corresponds to the 20% dataset sample mentioned in the
# tuning section; data_prop=0.8 is presumably the training proportion --
# TODO confirm against ciri_utils.engine_v2.
ciri_trainer = CIRI_trainer(
    model=selected_model,
    data_folders=data_folders,
    data_prop=0.8,
    sample_indices=0.2,
)

# Hyperparameter tuning

Perform hyperparameter tuning with nested cross-validation on a 20% sample of the dataset.

In [None]:
# Search space for the tuning run: discrete choices for the number of epochs
# and the batch size, log-uniform sampling for the learning rate.
search_space = dict(
    epochs=tune.choice([5, 10, 20]),
    batch_size=tune.choice([32, 64]),
    lr=tune.loguniform(1e-4, 1e-2),
)

In [None]:
# Nested cross-validation with hyperparameter tuning (5 outer / 3 inner folds).
# The run name is derived from `selected_model` instead of being hard-coded:
# the "Examine results" cells rebuild the result file names from
# `selected_model`, so both must stay in sync when the model is changed.
# num_samples is presumably the number of sampled configurations per search --
# TODO confirm against ciri_utils.
hp_nest_cv = ciri_trainer.cross_validate(
    run_name=f"{selected_model}_hpt",
    config=search_space,
    outer_cv_k=5,
    inner_cv_k=3,
    tune_hyperparams=True,
    num_samples=5,
    results_persist_dir=persistence_path
)

Output hidden; open in https://colab.research.google.com to view.

## Examine results

In [None]:
import json
import re
import pandas as pd

from pprint import pprint

In [None]:
# Fold counts of the nested cross-validation run. Keep them equal to the
# outer_cv_k / inner_cv_k values passed to cross_validate above.
N_OUTER_FOLDS = 5
N_INNER_FOLDS = 3
SUMMARY_COLUMNS = ["loss", "accuracy", "trial_id", "time_total_s"]

# Collect, for every (outer, inner) fold pair, the trial(s) with the highest
# accuracy recorded in that fold's experiment summary.
fold_winners = []
for outer_fold in range(N_OUTER_FOLDS):
  for inner_fold in range(N_INNER_FOLDS):
    summary_name = f"hpt_{selected_model}_hpt_outer_{outer_fold}_inner_{inner_fold}_exp_summary.csv"
    summary_df = pd.read_csv(os.path.join(persistence_path, summary_name), usecols=SUMMARY_COLUMNS)
    is_best = summary_df['accuracy'] == summary_df['accuracy'].max()
    # .assign builds a new frame, so the fold labels are never written onto a
    # filtered slice (which is what triggers SettingWithCopyWarning).
    fold_winners.append(
        summary_df.loc[is_best].assign(outer_fold=outer_fold, inner_fold=inner_fold)
    )

# Rank the per-fold winners: highest accuracy first, ties broken by lower loss
# and then by shorter total training time.
best_experiments = pd.concat(fold_winners).sort_values(
    by=['accuracy', 'loss', 'time_total_s'],
    ascending=[False, True, True],
)
best_experiments

Unnamed: 0,loss,accuracy,trial_id,time_total_s,outer_fold,inner_fold
0,2.839263,0.391576,6df97_00000,1531.414467,1,0
3,2.25157,0.379095,5d361_00003,1407.880165,2,1
0,2.042714,0.357812,04ddc_00000,709.957051,0,2
0,3.019495,0.35,e8be0_00000,1515.005401,3,2
2,1.960826,0.329173,ccff1_00002,346.073121,0,0
0,2.905614,0.326562,a9418_00000,1386.689075,0,1
3,2.117947,0.319813,340bf_00003,293.687831,2,0
0,2.061533,0.267188,fc155_00000,1462.282066,1,2
0,2.319158,0.265625,51d1e_00000,360.89756,1,1
0,2.124572,0.25429,50b8f_00000,1580.006551,3,1


In [None]:
#best_experiments.to_csv(os.path.join(persistence_path, f"HP_RANKING.{selected_model}_overview.csv"), index=False)

In [None]:
# The first row of the ranking is the best trial overall.
best_row = best_experiments.iloc[0]

# Cast the fold indices explicitly. A row extracted with .iloc is a Series with
# a single dtype; the indices only stay integers while a non-numeric column
# (trial_id) forces object dtype. Without the cast, a purely numeric frame
# would render them as "1.0" and point at a non-existent directory.
best_outer_fold = int(best_row['outer_fold'])
best_inner_fold = int(best_row['inner_fold'])

best_params_path = os.path.join(
    persistence_path,
    f"{selected_model}_hpt_outer_{best_outer_fold}_inner_{best_inner_fold}",
    "params.json",
)
with open(best_params_path, 'r') as params_file:
  trial_params = json.load(params_file)

# Keep only the entries of the trial configuration that were actually tuned,
# i.e. the keys of the search space.
best_params = {
    name: value
    for name, value in trial_params['train_loop_config'].items()
    if name in search_space
}

pprint(best_params)


{'batch_size': 32, 'epochs': 20, 'lr': 0.0006769998458972315}


# Cross-validation on entire dataset

In [None]:
# Fresh trainer for the full-dataset cross-validation: same model and data
# folders as before, but without the `sample_indices` argument used for tuning.
ciri_trainer = CIRI_trainer(
    model=selected_model,
    data_folders=data_folders,
    data_prop=0.8,
)

In [None]:
# Checkpoint folder for the full-dataset cross-validation results.
# Note that `persistence_path` is rebound here and no longer points at the
# hyperparameter-tuning results.
persistence_path = os.path.join(
    project_folder, 'checkpoints', f'CV_{selected_model}'
)
os.makedirs(persistence_path, exist_ok=True)

In [None]:
# Plain 5-fold cross-validation (no inner folds, no tuning) using the best
# hyperparameters found above, with extra metrics requested from the trainer.
# The run name is derived from `selected_model`, like the transfer-learning
# run name below, so it follows the model when `selected_model` is changed.
cv_whole = ciri_trainer.cross_validate(
    run_name=f"{selected_model}_cv",
    config={
        **best_params,
        "additional_metrics": ['precision', 'recall', 'f1', 'confusion_matrix']
    },
    outer_cv_k=5,
    inner_cv_k=0,
    tune_hyperparams=False,
    results_persist_dir=persistence_path
)

Output hidden; open in https://colab.research.google.com to view.

# Transfer learning

In [None]:
# Fresh trainer for the transfer-learning run (same arguments as the
# full-dataset cross-validation trainer).
ciri_trainer = CIRI_trainer(
    model=selected_model,
    data_folders=data_folders,
    data_prop=0.8,
)

In [None]:
# If necessary (e.g. when running this section on its own), redefine the
# project folder here first:
# project_folder = ...

# Checkpoint folder for the transfer-learning run.
persistence_path = os.path.join(
    project_folder, 'checkpoints', f'TransferLearning_{selected_model}'
)
os.makedirs(persistence_path, exist_ok=True)

In [None]:
# Transfer-learning run. The hyperparameters are written out literally rather
# than taken from `best_params`: batch size and learning rate match the tuned
# values printed above, while the run uses 5 epochs.
# 'weights' and 'unfreeze' are interpreted by the trainer -- presumably
# pretrained ImageNet weights with only `layer4` left trainable; TODO confirm
# against ciri_utils.
tl_results = ciri_trainer.train(
    run_name=f'{selected_model}_transfer_learning_uftest',
    config={
        'batch_size': 32,
        'epochs': 5,
        'lr': 0.0006769998458972315,
        'additional_metrics': ['precision', 'recall', 'f1', 'confusion_matrix'],
        'weights': 'IMAGENET1K_V2',
        'unfreeze': ['layer4'],
    },
    persist_dir=persistence_path,
)

  self.pid = _posixsubprocess.fork_exec(
2024-04-17 14:39:09,738	INFO worker.py:1752 -- Started a local Ray instance.
2024-04-17 14:39:11,813	INFO tune.py:263 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `<FrameworkTrainer>(...)`.
2024-04-17 14:39:11,825	INFO tune.py:622 -- [output] This will use the new output engine with verbosity 1. To disable the new output and use the legacy output engine, set the environment variable RAY_AIR_NEW_OUTPUT=0. For more information, please see https://github.com/ray-project/ray/issues/36949



View detailed results here: /root/ray_results/resnet50_transfer_learning_uftest
To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2024-04-17_14-39-05_869552_215/artifacts/2024-04-17_14-39-11/resnet50_transfer_learning_uftest/driver_artifacts`


[36m(TrainTrainable pid=1649)[0m 2024-04-17 14:39:35.588215: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(TrainTrainable pid=1649)[0m 2024-04-17 14:39:35.588678: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(TrainTrainable pid=1649)[0m 2024-04-17 14:39:35.590545: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered



Training started with configuration:
+--------------------------------------------------------------+
| Training config                                              |
+--------------------------------------------------------------+
| train_loop_config/additional_metrics    ...onfusion_matrix'] |
| train_loop_config/batch_size                              32 |
| train_loop_config/data_folders          ...ugmented_images'] |
| train_loop_config/data_prop                              0.8 |
| train_loop_config/epochs                                   5 |
| train_loop_config/lr                   0.0006769998458972315 |
| train_loop_config/model                             resnet50 |
| train_loop_config/sample_indices                             |
| train_loop_config/unfreeze                        ['layer4'] |
| train_loop_config/weights                      IMAGENET1K_V2 |
+--------------------------------------------------------------+


[36m(TorchTrainer pid=1649)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=1649)[0m - (ip=172.28.0.12, pid=1802) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=1802)[0m Setting up process group for: env:// [rank=0, world_size=1]
[36m(RayTrainWorker pid=1802)[0m 2024-04-17 14:39:44.232690: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(RayTrainWorker pid=1802)[0m 2024-04-17 14:39:44.232769: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(RayTrainWorker pid=1802)[0m 2024-04-17 14:39:44.234160: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been re


Training finished iteration 1 at 2024-04-17 16:27:06. Total running time: 1hr 47min 53s
+---------------------------------------------+
| Training result                             |
+---------------------------------------------+
| checkpoint_dir_name       checkpoint_000000 |
| time_this_iter_s                 6447.79693 |
| time_total_s                     6447.79693 |
| training_iteration                        1 |
| accuracy                            0.85458 |
| confusion_matrix       ..., 1, 2, 9, 9, 0]) |
| f1                                  0.85312 |
| loss                                0.48321 |
| precision                           0.85886 |
| recall                              0.85458 |
| summary/epoch/0                         1.0 |
| summary/train_acc/0      0.7905208333333333 |
| summary/train_loss/0     0.6672585769448169 |
| summary/val_acc/0        0.8545833333333334 |
| summary/val_loss/0       0.4832071150211911 |
+---------------------------------------------+

Epoch (training) 2:   0%|          | 1/301 [00:03<17:34,  3.52s/it]
Epoch (training) 2:   1%|          | 2/301 [00:05<13:32,  2.72s/it]
Epoch (training) 2:   1%|          | 3/301 [00:06<09:58,  2.01s/it]
Epoch (training) 2:   1%|▏         | 4/301 [00:08<09:18,  1.88s/it]
Epoch (training) 2:   2%|▏         | 5/301 [00:09<08:18,  1.68s/it]
Epoch (training) 2:   2%|▏         | 6/301 [00:10<07:05,  1.44s/it]
Epoch (training) 2:   2%|▏         | 7/301 [00:12<07:23,  1.51s/it]
Epoch (training) 2:   3%|▎         | 8/301 [00:14<08:25,  1.73s/it]
Epoch (training) 2:   3%|▎         | 9/301 [00:16<07:48,  1.60s/it]
Epoch (training) 2:   3%|▎         | 10/301 [00:17<06:56,  1.43s/it]
Epoch (training) 2:   4%|▎         | 11/301 [00:17<06:04,  1.26s/it]
Epoch (training) 2:   4%|▍         | 12/301 [00:20<07:58,  1.66s/it]
Epoch (training) 2:   4%|▍         | 13/301 [00:22<07:45,  1.62s/it]
Epoch (training) 2:   5%|▍         | 14/301 [00:22<06:30,  1.36s/it]
Epoch (training) 2:   5%|▍         | 15/301


Training finished iteration 2 at 2024-04-17 16:36:20. Total running time: 1hr 57min 7s
+---------------------------------------------+
| Training result                             |
+---------------------------------------------+
| checkpoint_dir_name       checkpoint_000001 |
| time_this_iter_s                  553.63217 |
| time_total_s                      7001.4291 |
| training_iteration                        2 |
| accuracy                             0.8975 |
| confusion_matrix       ..., 1, 2, 9, 9, 7]) |
| f1                                  0.89754 |
| loss                                0.34293 |
| precision                           0.90037 |
| recall                               0.8975 |
| summary/epoch/0                         1.0 |
| summary/epoch/1                         2.0 |
| summary/train_acc/0      0.7905208333333333 |
| summary/train_acc/1      0.9183333333333333 |
| summary/train_loss/0     0.6672585769448169 |
| summary/train_loss/1    0.25891431158662237 |


Epoch (training) 3:   0%|          | 1/301 [00:04<22:42,  4.54s/it]
Epoch (training) 3:   1%|          | 2/301 [00:06<13:44,  2.76s/it]
Epoch (training) 3:   1%|          | 3/301 [00:07<10:01,  2.02s/it]
Epoch (training) 3:   1%|▏         | 4/301 [00:08<08:12,  1.66s/it]
Epoch (training) 3:   2%|▏         | 5/301 [00:09<07:01,  1.42s/it]
Epoch (training) 3:   2%|▏         | 6/301 [00:10<06:22,  1.30s/it]
Epoch (training) 3:   2%|▏         | 7/301 [00:12<07:51,  1.60s/it]
Epoch (training) 3:   3%|▎         | 8/301 [00:13<07:29,  1.53s/it]
Epoch (training) 3:   3%|▎         | 9/301 [00:15<07:45,  1.59s/it]
Epoch (training) 3:   3%|▎         | 10/301 [00:17<07:32,  1.55s/it]
Epoch (training) 3:   4%|▎         | 11/301 [00:18<07:05,  1.47s/it]
Epoch (training) 3:   4%|▍         | 12/301 [00:19<06:21,  1.32s/it]
Epoch (training) 3:   4%|▍         | 13/301 [00:20<05:52,  1.22s/it]
Epoch (training) 3:   5%|▍         | 14/301 [00:21<05:34,  1.17s/it]
Epoch (training) 3:   5%|▍         | 15/301


Training finished iteration 3 at 2024-04-17 16:45:30. Total running time: 2hr 6min 17s
+---------------------------------------------+
| Training result                             |
+---------------------------------------------+
| checkpoint_dir_name       checkpoint_000002 |
| time_this_iter_s                  549.81427 |
| time_total_s                     7551.24338 |
| training_iteration                        3 |
| accuracy                            0.87833 |
| confusion_matrix       ..., 1, 2, 9, 9, 7]) |
| f1                                  0.87997 |
| loss                                 0.4179 |
| precision                           0.88834 |
| recall                              0.87833 |
| summary/epoch/0                         1.0 |
| summary/epoch/1                         2.0 |
| summary/epoch/2                         3.0 |
| summary/train_acc/0      0.7905208333333333 |
| summary/train_acc/1      0.9183333333333333 |
| summary/train_acc/2      0.9573958333333333 |


Epoch (training) 4:   0%|          | 1/301 [00:03<19:02,  3.81s/it]
Epoch (training) 4:   1%|          | 2/301 [00:05<11:45,  2.36s/it]
Epoch (training) 4:   1%|          | 3/301 [00:06<08:54,  1.79s/it]
Epoch (training) 4:   1%|▏         | 4/301 [00:07<07:53,  1.59s/it]
Epoch (training) 4:   2%|▏         | 5/301 [00:10<09:36,  1.95s/it]
Epoch (training) 4:   2%|▏         | 6/301 [00:11<08:11,  1.67s/it]
Epoch (training) 4:   2%|▏         | 7/301 [00:12<07:34,  1.55s/it]
Epoch (training) 4:   3%|▎         | 8/301 [00:13<06:33,  1.34s/it]
Epoch (training) 4:   3%|▎         | 9/301 [00:14<06:20,  1.30s/it]
Epoch (training) 4:   3%|▎         | 10/301 [00:16<06:56,  1.43s/it]
Epoch (training) 4:   4%|▎         | 11/301 [00:18<07:35,  1.57s/it]
Epoch (training) 4:   4%|▍         | 12/301 [00:20<08:05,  1.68s/it]
Epoch (training) 4:   4%|▍         | 13/301 [00:21<08:00,  1.67s/it]
Epoch (training) 4:   5%|▍         | 14/301 [00:23<07:48,  1.63s/it]
Epoch (training) 4:   5%|▍         | 15/301