# CNN model hyper experiments

## CIFAR10 problem setting

Let's consider the CIFAR-10 dataset available in TensorFlow: https://www.cs.toronto.edu/~kriz/cifar.html

The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, with 6000 images per class. There are 50000 training images and 10000 test images.

We take the basic example CNN from the TensorFlow tutorials as a reference and extend it with dropout layers and with pixel dropout for data augmentation.

We will implement a custom version of the following Keras model, enhanced with dropout: https://www.tensorflow.org/tutorials/images/cnn

     model = models.Sequential()
     model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)))
     model.add(layers.MaxPooling2D((2, 2)))
     model.add(layers.Dropout(0.05))
     model.add(layers.Conv2D(64, (3, 3), activation='relu'))
     model.add(layers.MaxPooling2D((2, 2)))
     model.add(layers.Dropout(0.05))
     model.add(layers.Conv2D(64, (3, 3), activation='relu'))
     model.add(layers.Flatten())
     model.add(layers.Dropout(0.05))
     model.add(layers.Dense(64, activation='relu'))
     model.add(layers.Dense(10))
     return model

Hyperparameters:
    * Network architecture (CNN)
        * Dropout prob 1
        * Dropout prob 2
        * Dropout prob 3
    * Data augmentation
        * Prob to modify batch sample
        * Pixel dropout prob

## Dataset inspection

In [None]:
import matplotlib.pyplot as plt
from tensorflow.keras import datasets, layers, models

In [None]:
# Load the CIFAR-10 train/test split through the Keras datasets helper.
cifar_train, cifar_test = datasets.cifar10.load_data()
train_images, train_labels = cifar_train
test_images, test_labels = cifar_test
# Last expression of the cell: display the shapes of both image arrays.
(train_images.shape, test_images.shape)

In [None]:
import numpy as np

In [None]:
# Index arrays of the training samples whose label equals 1.
(train_labels == 1).nonzero()

In [None]:
# A fixed seed keeps the train/validation split reproducible.
rng = np.random.default_rng(42)

# Draw 1000 distinct sample indices for each of the labels 0..9, in label order.
per_class_picks = []
for class_id in range(10):
    class_indices = np.nonzero(train_labels == class_id)[0]
    per_class_picks.append(rng.choice(class_indices, 1000, replace=False))
validation_samples = np.concatenate(per_class_picks, axis=0)

# Everything that was not picked for validation stays in the training set.
training_samples = np.setdiff1d(np.arange(50000), validation_samples, assume_unique=True)

# Shuffle both index arrays in place (validation first, then training).
rng.shuffle(validation_samples)
rng.shuffle(training_samples)

In [None]:
# Number of labels left in the training subset after the validation split.
np.size(train_labels[training_samples])

In [None]:
class_names = [
    'airplane', 'automobile', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
]

# Show the first 25 training images in a 5x5 grid, captioned with their class name.
plt.figure(figsize=(10, 10))
for i in range(25):
    image = train_images[i]
    # Each label is itself an array, so take its first element to get the class index.
    label_index = train_labels[i][0]
    plt.subplot(5, 5, i + 1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(image, cmap=plt.cm.binary)
    plt.xlabel(class_names[label_index])
plt.show()

## Experiment cache utils

Considering that experiments can take more than 15 minutes to run, here is a small decorator function that stores the experiment outputs when the run does not fail. I rely on the hash of a frozen dataclass Config to derive the cache key for the results.

In [None]:
import os
import hashlib
import pickle

def cache_results(exp_func, exp_dir="cnn_experiments"):
    """Wrap ``exp_func`` so that its result is pickled to disk and reused.

    Results are stored in ``_cache/<exp_dir>/<key>/results.pkl`` relative to the
    current working directory. ``<key>`` is the MD5 hex digest of the string
    form of ``config.__hash__()``, where ``config`` is the first positional
    argument of the wrapped call.

    Args:
        exp_func: Callable that runs the experiment; invoked as
            ``exp_func(*args)`` on a cache miss.
        exp_dir: Sub-directory of ``_cache`` under which results are grouped.

    Returns:
        A wrapper taking the same positional arguments as ``exp_func``. It
        returns the unpickled cached result when one exists, otherwise it runs
        ``exp_func``, stores the result and returns it.

    Raises:
        IndexError: If the wrapper is called without positional arguments.
    """
    import functools

    @functools.wraps(exp_func)
    def cached(*args):
        config = args[0]
        # NOTE(review): the key is only stable across interpreter sessions if
        # hash(config) is; str hashes are salted per process unless
        # PYTHONHASHSEED is fixed -- confirm against the config's field types.
        cache_dir = hashlib.md5(str.encode(str(config.__hash__()))).hexdigest()
        results_dir = f'_cache/{exp_dir}/{cache_dir}'
        exp_results_file = f'{results_dir}/results.pkl'
        if os.path.exists(exp_results_file):
            print("results loaded from cache for: ", config)
            # NOTE: pickle can execute arbitrary code on load; only use cache
            # files produced by this function.
            with open(exp_results_file, 'rb') as f:
                return pickle.load(f)
        results = exp_func(*args)
        # Create the directory only after the run succeeded, so a failed
        # experiment leaves no empty cache entry behind.
        os.makedirs(results_dir, exist_ok=True)
        # Write to a temporary file and rename it into place: an interrupted
        # dump can then never leave a truncated results.pkl that would break
        # every later cache hit.
        tmp_results_file = f'{exp_results_file}.tmp'
        try:
            with open(tmp_results_file, 'wb') as f:
                pickle.dump(results, f)
            os.replace(tmp_results_file, exp_results_file)
        finally:
            # Only present here if the dump or the rename failed.
            if os.path.exists(tmp_results_file):
                os.remove(tmp_results_file)
        return results
    return cached

## CNN hyper training example

In [None]:
import matplotlib.pyplot as plt
from self_tuning_nets.visualization import function_animation, trajectories_plot, \
    trajectories_legend, trajectories_dist_from_target, trajectories_general_plot
from self_tuning_nets.hyper.experiments.cnn_models import \
    ExperimentConfig, run_deterministic_cpu_hyper_cnn_experiment
from itertools import product
import numpy as np
from self_tuning_nets.visualization import function_animation, trajectories_plot, \
    trajectories_legend, trajectories_dist_from_target, trajectories_general_plot
from dataclasses import replace

In [None]:
# Run (or load from the on-disk cache) one experiment with the default configuration.
experiment_config = ExperimentConfig()
cached_experiment = cache_results(run_deterministic_cpu_hyper_cnn_experiment)
(wlosses, hlosses, param_trajectories,
 scale_trajectories, accuracy) = cached_experiment(experiment_config)

In [None]:
# Accuracy curve over epochs, followed by the best value reached.
plt.plot(accuracy)
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
best_accuracy = max(accuracy)
print("Accuracy: ", best_accuracy)

In [None]:
# Plot both loss curves on the same axes, one legend entry each.
for loss_series, loss_label in ((wlosses, "w_loss"), (hlosses, "h_loss")):
    plt.plot(loss_series, label=loss_label)
plt.legend()
plt.xlabel("Epoch")
plt.ylabel("Test loss")

In [None]:
# One shade of red per hyperparameter, evenly spaced over [0.3, 1.0] of the colormap.
param_names = param_trajectories.keys()
reds = plt.get_cmap('Reds')
shade_positions = np.linspace(0.3, 1.0, num=len(param_names))
lines_palette = [reds(position) for position in shade_positions]

# Stand-alone legend mapping each hyperparameter name to its colour.
trajectories_legend(param_names, lines_palette)
plt.gcf().set_size_inches(1, 1)
plt.show()

In [None]:
# Plot the probability trajectories, then the scale trajectories; both are looked
# up by the keys of param_trajectories so they follow the palette order.
for trajectory_source, axis_label in ((param_trajectories, "probabilities"),
                                      (scale_trajectories, "scaling")):
    trajectories = [trajectory_source[name] for name in param_trajectories.keys()]
    trajectories_general_plot(trajectories, lines_palette, ylabel=axis_label)
    plt.show()

## CNN Fixed hyperparameters example

In [None]:
# Same experiment, with hyperparameter training switched off in the configuration.
experiment_config = ExperimentConfig(WITH_HYPER_TRAINING=False)
cached_experiment = cache_results(run_deterministic_cpu_hyper_cnn_experiment)
(wlosses, hlosses, param_trajectories,
 scale_trajectories, accuracy) = cached_experiment(experiment_config)

In [None]:
# Accuracy curve over epochs, followed by the best value reached.
plt.plot(accuracy)
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
best_accuracy = max(accuracy)
print("Accuracy: ", best_accuracy)

In [None]:
# Only the weight loss is plotted for this run.
loss_series, loss_label = wlosses, "w_loss"
plt.plot(loss_series, label=loss_label)
# Sanity check: this run is expected to produce no hyper losses.
assert not hlosses
plt.legend()
plt.xlabel("Epoch")
plt.ylabel("Test loss")

In [None]:
# One shade of red per hyperparameter, evenly spaced over [0.3, 1.0] of the colormap.
param_names = param_trajectories.keys()
reds = plt.get_cmap('Reds')
shade_positions = np.linspace(0.3, 1.0, num=len(param_names))
lines_palette = [reds(position) for position in shade_positions]

# Stand-alone legend mapping each hyperparameter name to its colour.
trajectories_legend(param_names, lines_palette)
plt.gcf().set_size_inches(1, 1)
plt.show()

In [None]:
# Plot the probability trajectories, then the scale trajectories; both are looked
# up by the keys of param_trajectories so they follow the palette order.
for trajectory_source, axis_label in ((param_trajectories, "probabilities"),
                                      (scale_trajectories, "scaling")):
    trajectories = [trajectory_source[name] for name in param_trajectories.keys()]
    trajectories_general_plot(trajectories, lines_palette, ylabel=axis_label)
    plt.show()

## Effect of random initialization

In [None]:
experiment_config = ExperimentConfig()
sample_seeds = range(40, 48)
# sample_seeds = range(40, 42)

# One 20-epoch run per framework seed; every other setting keeps its default value.
cached_experiment = cache_results(run_deterministic_cpu_hyper_cnn_experiment)
results = []
for sample_seed in sample_seeds:
    seeded_config = replace(experiment_config,
                            FRAMEWORK_SEED=sample_seed,
                            MAX_EPOCHS=20)
    results.append(cached_experiment(seeded_config))

In [None]:
# Transpose the list of per-run result tuples into one tuple per result kind.
(wlosses_n, hlosses_n, param_trajectories_n,
 scale_trajectories_n, accuracy_n) = zip(*results)

In [None]:
# Best accuracy reached by each run.
best_acc = [max(acc) for acc in accuracy_n]
acc_low, acc_high = min(best_acc), max(best_acc)
acc_span = acc_high - acc_low
# Map the best accuracies linearly onto [0.3, 1.0] of the 'Reds' colormap.
# When every run reaches the same best accuracy (or there is a single run) the
# span is zero; place all runs at the top of the range instead of dividing by zero.
rescaled_acc = [
    (0.3 + ((1 - 0.3) / acc_span) * (acc - acc_low)) if acc_span > 0 else 1.0
    for acc in best_acc
]
lines_palette = [plt.get_cmap('Reds')(acc) for acc in rescaled_acc]

In [None]:
# Order the runs by ascending best accuracy (third element of each triple).
sorted_info = sorted(
    zip(lines_palette, sample_seeds, best_acc),
    key=lambda entry: entry[2],
)

# Build the legend labels and their colours in that same order.
sorted_settings = []
sorted_palette = []
for palette, param, acc in sorted_info:
    sorted_settings.append(f"Acc: {acc:.{3}f} -> Param: {param}")
    sorted_palette.append(palette)

print("Accuracy -> init_seed")
trajectories_legend(sorted_settings, sorted_palette)
plt.gcf().set_size_inches(1.0, 1.0)
plt.show()

In [None]:
from matplotlib import cm

# Stand-alone horizontal colorbar spanning the observed best-accuracy range.
acc_low = np.min(best_acc)
acc_high = np.max(best_acc)
mappable = cm.ScalarMappable(cmap=plt.get_cmap('Reds'))
mappable.set_clim(vmin=acc_low, vmax=acc_high)
plt.colorbar(mappable, ax=plt.gca(), orientation='horizontal')
# Hide the current axes so that only the colorbar remains visible.
plt.gca().set_visible(False)
plt.show()

In [None]:
# One accuracy curve per run, coloured by the palette entry of that run.
accuracy_axis_label = "accuracy"
trajectories_general_plot(accuracy_n, lines_palette, ylabel=accuracy_axis_label)
plt.show()

In [None]:
# One figure per hyperparameter, each overlaying that hyperparameter's
# trajectory from every run. The keys are taken from the first run.
param_keys = param_trajectories_n[0].keys()
for pk in param_keys:
    trajectories = []
    for run_trajectories in param_trajectories_n:
        trajectories.append(run_trajectories[pk])
    trajectories_general_plot(trajectories, lines_palette, ylabel="probabilities", title=pk)
    plt.show()

## Demonstration of converging hyperparameter trajectories

In [None]:
experiment_config = ExperimentConfig()
sample_seeds = [40, 42, 44]
# sample_seeds = [44]
# Each tuple holds the initial values passed to
# (INIT_DROPOUT, INIT_PIXEL_DROPOUT, INIT_AUGMENT_PROB), in that order.
init_hyper = [(0.95, 0.95, 0.05), (0.75, 0.75, 0.2), (0.5, 0.5, 0.5), (0.3, 0.3, 0.8),
              (0.8145, 0.9818, 0.5502), (0.5466, 0.2618, 0.8580)]

# Every seed is combined with every initialisation tuple.
exp_settings = list(product(sample_seeds, init_hyper))
results = [
    cache_results(run_deterministic_cpu_hyper_cnn_experiment)(
        replace(experiment_config,
                FRAMEWORK_SEED=sample_seed,
                INIT_DROPOUT=init_drop,
                # Use the tuple's second element here; passing init_drop again
                # would ignore the pixel value whenever the two differ.
                INIT_PIXEL_DROPOUT=init_pixel,
                INIT_AUGMENT_PROB=init_aug,
                MAX_EPOCHS=40
               ))
    for sample_seed, (init_drop, init_pixel, init_aug) in exp_settings
]

In [None]:
# Transpose the list of per-run result tuples into one tuple per result kind.
(wlosses_n, hlosses_n, param_trajectories_n,
 scale_trajectories_n, accuracy_n) = zip(*results)

In [None]:
# Best accuracy reached by each run.
best_acc = [max(acc) for acc in accuracy_n]
acc_low, acc_high = min(best_acc), max(best_acc)
acc_span = acc_high - acc_low
# Map the best accuracies linearly onto [0.3, 1.0] of the 'Reds' colormap.
# When every run reaches the same best accuracy (or there is a single run) the
# span is zero; place all runs at the top of the range instead of dividing by zero.
rescaled_acc = [
    (0.3 + ((1 - 0.3) / acc_span) * (acc - acc_low)) if acc_span > 0 else 1.0
    for acc in best_acc
]
lines_palette = [plt.get_cmap('Reds')(acc) for acc in rescaled_acc]

In [None]:
# Order the runs by ascending best accuracy (third element of each triple).
sorted_info = sorted(zip(lines_palette, exp_settings, best_acc), key=lambda x: x[2])
# Legend labels and colours, in that same order.
sorted_settings = [f"Acc: {acc:.{3}f} -> Param: {param}" for _, param, acc in sorted_info]
sorted_palette = [palette for palette, _, _ in sorted_info]
print("Accuracy -> (init_seed, (layer_keepin_prob (1 - dropout), pixel_keepin_prob (1 - dropout), augment_image_prob))")
trajectories_legend(sorted_settings, sorted_palette)
plt.gcf().set_size_inches(1.0, 1.0)
# Show the legend figure explicitly, as the seed-sweep legend cell does, so the
# figure is finished here and does not depend on the backend's implicit display.
plt.show()

In [None]:
from matplotlib import cm

# Stand-alone horizontal colorbar spanning the observed best-accuracy range.
acc_low = np.min(best_acc)
acc_high = np.max(best_acc)
mappable = cm.ScalarMappable(cmap=plt.get_cmap('Reds'))
mappable.set_clim(vmin=acc_low, vmax=acc_high)
plt.colorbar(mappable, ax=plt.gca(), orientation='horizontal')
# Hide the current axes so that only the colorbar remains visible.
plt.gca().set_visible(False)
plt.show()

In [None]:
# One accuracy curve per run, coloured by the palette entry of that run.
accuracy_axis_label = "accuracy"
trajectories_general_plot(accuracy_n, lines_palette, ylabel=accuracy_axis_label)
plt.show()

In [None]:
import numpy as np

# Last recorded value of every hyperparameter for the run with the highest best accuracy.
param_keys = param_trajectories_n[0].keys()
print(f"Final hyperparameter values for network with best accuracy {np.max(best_acc):.{3}f}")
best_run_trajectories = param_trajectories_n[np.argmax(best_acc)]
for pk in param_keys:
    print(pk, ": ", best_run_trajectories[pk][-1])

In [None]:
import numpy as np

# Last recorded value of every hyperparameter for the run with the lowest best accuracy.
param_keys = param_trajectories_n[0].keys()
print(f"Final hyperparameter values for network with worst accuracy {np.min(best_acc):.{3}f}")
worst_run_trajectories = param_trajectories_n[np.argmin(best_acc)]
for pk in param_keys:
    print(pk, ": ", worst_run_trajectories[pk][-1])

In [None]:
# One figure per hyperparameter, each overlaying that hyperparameter's
# trajectory from every run. The keys are taken from the first run.
param_keys = param_trajectories_n[0].keys()
for pk in param_keys:
    trajectories = []
    for run_trajectories in param_trajectories_n:
        trajectories.append(run_trajectories[pk])
    trajectories_general_plot(trajectories, lines_palette, ylabel="Bernoulli probability", title=pk)
    plt.show()

In [None]:
# One figure per scale entry, each overlaying that entry's trajectory from
# every run. The keys are taken from the first run's scale trajectories.
param_keys = scale_trajectories_n[0].keys()
for pk in param_keys:
    trajectories = []
    for run_scales in scale_trajectories_n:
        trajectories.append(run_scales[pk])
    trajectories_general_plot(trajectories, lines_palette, ylabel="scales", title=pk)
    plt.show()