# Machine Learning Final Assignment

The code used for the DREML algorithm and several of the utility functions is based on the following GitHub repository: https://github.com/leftthomas/DREML

The article: Deep Randomized Ensembles for Metric
Learning - Hong Xuan, Richard Souvenir,  Robert Pless

## Imports and Mounting

In [None]:
import os
import torch
from scipy.io import loadmat
import torch.nn as nn
from torchvision.models.resnet import resnet18
import copy
import random
from time import time

import sklearn.base
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.data.dataloader import DataLoader
import numpy as np
!pip3 install skorch
import skorch

from sklearn.preprocessing import label_binarize
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from sklearn import ensemble
from imblearn.metrics import specificity_score

import random

import torch
from PIL import Image
from torch.utils.data import Dataset
from torchvision.datasets import CIFAR100, EMNIST
from torchvision import transforms
from IPython.display import clear_output

from tqdm.notebook import tqdm
import math

mount

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
RUN_NAME ='Final Run'
WORK_DIR = f'/content/drive/MyDrive/ML_FA/{RUN_NAME}'
DATASET_DIR = f'/content/drive/MyDrive/ML_FA'
MODELS_DIR = f'{WORK_DIR}/model'

## Pre-processing

### Data Utils

Optional cell used to pre-process the car/cub datasets into .pth files.

In [None]:
def read_txt(path):
    """Parse a two-column, whitespace-separated text file into a dict.

    Every non-blank line must hold exactly two whitespace-separated tokens:
    the first becomes the key and the second the value (both stay strings).
    When a key appears more than once, the last occurrence wins.

    Args:
        path: path of the UTF-8 encoded text file to read.

    Returns:
        dict mapping the first token of each line to the second token.

    Raises:
        ValueError: if a non-blank line does not contain exactly two tokens.
    """
    data = {}
    # The context manager closes the handle even when a malformed line raises.
    with open(path, 'r', encoding='utf-8') as handle:
        for line in handle:
            tokens = line.split()
            if not tokens:
                # A blank (e.g. trailing) line carries no record; skip it.
                continue
            key, value = tokens
            data[key] = value
    return data


def process_car_data(data_path):
    """Split the car annotations into train/test dicts and save them.

    Reads `cars_annos.mat` under `data_path`, groups the image paths by class
    label (kept as a string) and writes `{'train': ..., 'test': ...}` with
    torch.save to `<data_path>/<data_dicts>`, where `data_dicts` is a
    module-level file name. Labels below 99 go to the train split, all
    other labels to the test split.
    """
    splits = {'train': {}, 'test': {}}
    records = loadmat(f'{data_path}/cars_annos.mat')['annotations'][0]
    for record in records:
        img_name = str(record[0][0])
        img_label = str(record[-2][0][0])
        split = 'train' if int(img_label) < 99 else 'test'
        # setdefault creates the per-class list the first time a label is seen
        splits[split].setdefault(img_label, []).append(f'{data_path}/{img_name}')
    torch.save({'train': splits['train'], 'test': splits['test']},
               f'{data_path}/{data_dicts}')


def process_cub_data(data_path):
    """Split the CUB image list into train/test dicts and save them.

    Reads `images.txt` (image id -> relative image path) and
    `image_class_labels.txt` (image id -> class label) under `data_path`,
    groups the image paths by class label and writes
    `{'train': ..., 'test': ...}` with torch.save to
    `<data_path>/<data_dicts>`, where `data_dicts` is a module-level file
    name. Labels below 101 go to the train split, the rest to the test split.
    """
    id_to_name = read_txt(f'{data_path}/images.txt')
    id_to_label = read_txt(f'{data_path}/image_class_labels.txt')
    train_images, test_images = {}, {}
    for img_id, img_name in id_to_name.items():
        label = id_to_label[img_id]
        target = train_images if int(label) < 101 else test_images
        # setdefault creates the per-class list the first time a label is seen
        target.setdefault(label, []).append(f'{data_path}/images/{img_name}')
    torch.save({'train': train_images, 'test': test_images},
               f'{data_path}/{data_dicts}')


# if __name__ == '__main__':
#     data_dicts = 'data_dicts.pth'
#     process_car_data(f'{DATASET_DIR}/car')
#     process_cub_data(f'{DATASET_DIR}/cub')

### Utils
Some useful functions mainly for pre-processing.

In [None]:

def get_transform(data_name, data_type):
    """Build the torchvision pre-processing pipeline for one dataset.

    `data_name` selects the per-channel mean/std from the module-level
    `rgb_mean` / `rgb_std` tables. For `data_type == 'train'` the pipeline
    adds random augmentation (random crop and horizontal flip); any other
    value yields the deterministic resize + center-crop pipeline.
    """
    normalize = transforms.Normalize(rgb_mean[data_name], rgb_std[data_name])
    if data_type == 'train':
        # resize slightly larger than the crop so the random crop can move around
        steps = [
            transforms.Resize(int(256 * 1.1)),
            transforms.RandomCrop(256),
            transforms.RandomHorizontalFlip(),
        ]
    else:
        steps = [
            transforms.Resize(256),
            transforms.CenterCrop(256),
        ]
    steps += [transforms.ToTensor(), normalize]
    return transforms.Compose(steps)


# random assign meta class for all classes
def create_id(meta_class_size, num_class):
    """Randomly assign each of `num_class` classes to a meta class.

    Returns a list of length `num_class` whose entries are meta-class ids in
    `range(meta_class_size)`. The ids are drawn from repeated shuffled copies
    of `range(meta_class_size)`, so the meta classes end up with nearly equal
    numbers of classes.
    """
    rounds, leftover = divmod(num_class, meta_class_size)
    if leftover:
        rounds += 1  # one extra (partially used) round to cover the remainder

    assignment = []
    for _ in range(rounds):
        block = list(range(meta_class_size))
        random.shuffle(block)
        assignment.extend(block)

    assignment = assignment[:num_class]
    random.shuffle(assignment)
    return assignment


def load_data(meta_id, idx_to_class, data_dict):
    """Regroup the images of `data_dict` by meta class.

    `meta_id[i]` is the meta class of the class `idx_to_class[i]`. Each class
    contributes at most 300 images (a random sample when it has more), which
    keeps large classes from dominating a meta class. Returns a dict mapping
    every meta-class id in `range(max(meta_id) + 1)` to its image list.
    """
    max_size = 300
    meta_data_dict = {meta: [] for meta in range(max(meta_id) + 1)}
    for idx, cls in idx_to_class.items():
        meta = meta_id[idx]
        images = data_dict[cls]
        if len(images) > max_size:
            images = random.sample(images, max_size)
        meta_data_dict[meta].extend(images)
    return meta_data_dict

def get_data_dict(X, y):
    """
      Group the samples of X by their label in y.

      Returns a dict whose keys are the labels (in order of first appearance)
      and whose values are the lists of entries of X carrying that label, in
      their original order.
    """
    grouped = {}
    for position in range(len(X)):
        grouped.setdefault(y[position], []).append(X[position])
    return grouped


def get_data_set(dir_path):
  """
    Build a data_dict from a directory that holds one sub-directory per class:

    root/
      class_1/  img_1.png ... img_n.png
      class_2/  img_1.png ... img_n.png
      ...
      class_k/  img_1.png ... img_n.png

    Returns a dict mapping every sub-directory name (the class label) to the
    list of paths of the entries found inside it.
  """
  data_set = {}
  for label in tqdm(os.listdir(dir_path)):
    class_dir = os.path.join(dir_path, label)
    data_set[label] = [os.path.join(dir_path, label, image)
                       for image in os.listdir(class_dir)]
  return data_set

def get_hyper_params(params, space):
  """Format the searched hyper-parameters as a "key: value, ..." string.

  Args:
      params: mapping that holds a value for every key of `space`.
      space: iterable of hyper-parameter names (e.g. a search-space dict);
          only these keys are reported, in iteration order.

  Returns:
      A string such as "lr: 0.01, batch_size: 32" (empty for an empty space).

  Raises:
      KeyError: if a key of `space` is missing from `params`.
  """
  # The previous version misplaced a parenthesis, calling map() with a single
  # argument, and therefore always raised TypeError.
  return ", ".join(f'{key}: {params[key]}' for key in space)


class ImageReader(Dataset):
    """
      Dataset over a data_dict (label -> list of image paths) for use with a
      DataLoader. Images are opened lazily on access, and the labels are
      replaced by consecutive integers following the sorted order of the
      original labels.
    """
    def __init__(self, data_dict, transform):
        ordered_classes = sorted(data_dict)
        # integer index -> original label, to translate predictions back
        self.reverse_class_to_idx = dict(enumerate(ordered_classes))
        self.images, self.labels = [], []
        for class_idx, class_name in enumerate(ordered_classes):
            for img in data_dict[class_name]:
                self.images.append(img)
                self.labels.append(class_idx)
        self.transform = transform

    def __getitem__(self, index):
        # load the image from disk as RGB and apply the transform
        img = Image.open(self.images[index]).convert('RGB')
        return self.transform(img), self.labels[index]

    def __len__(self):
        return len(self.images)

    def get_label_dictionary(self):
      """Return the mapping from integer index back to the original label."""
      return self.reverse_class_to_idx


class LabelConsistentImageReader(Dataset):
    """
      Dataset over a data_dict (label -> list of image paths) for use with a
      DataLoader. Images are opened lazily on access, and every sample keeps
      its original label unchanged. Samples are ordered by sorted label.
    """
    def __init__(self, data_dict, transform):
        samples = [(img, label)
                   for label in sorted(data_dict)
                   for img in data_dict[label]]
        self.images = [img for img, _ in samples]
        self.labels = [label for _, label in samples]
        self.transform = transform

    def __getitem__(self, index):
        # load the image from disk as RGB and apply the transform
        img = Image.open(self.images[index]).convert('RGB')
        return self.transform(img), self.labels[index]

    def __len__(self):
        return len(self.images)


### Normalizing
Optional cell for finding the parameters (mean, std) for data normalizing:

In [None]:
# Per-channel mean for each dataset, indexed by dataset name.
# get_transform() passes these to transforms.Normalize.
rgb_mean = {'car': [0.4853, 0.4965, 0.4295],
            'cub': [0.4707, 0.4601, 0.4549],
            'flowers': [0.4326, 0.3730, 0.2803],
            'caltech': [0.55, 0.5299, 0.5009]}

# Per-channel standard deviation for each dataset, indexed by dataset name.
rgb_std = {'car': [0.2237, 0.2193, 0.2568],
           'cub': [0.2767, 0.2760, 0.2850],
           'flowers': [0.2972, 0.2442, 0.2651],
           'caltech': [0.3185, 0.3152, 0.2383]}

# Un-normalized pipeline: resize to a fixed 256x256 and convert to a tensor.
flower_t = transforms.Compose([transforms.Resize((256, 256)), transforms.ToTensor()])

# Datasets without normalization, keyed by dataset name. Left empty unless
# one of the lines below is un-commented.
UNNORMALIZED_DATA = dict()
# NOTE(review): the 'car' line passes the class `transforms.ToTensor` rather than an instance — confirm before re-enabling.
# UNNORMALIZED_DATA['car'] = ImageReader(torch.load(f'{DATASET_DIR}/car/data_dicts.pth'), transform=transforms.ToTensor)
# UNNORMALIZED_DATA['flowers'] = ImageReader(get_data_set(f'{DATASET_DIR}/flowers'), transform=flower_t)


def calculate_mean_and_std(data_set):
  """Compute the per-channel mean and standard deviation of an image data set.

  The data set must yield `(image, label)` pairs whose images batch into a
  tensor with the channel on dimension 1 (the reduction runs over dimensions
  0, 2 and 3). The statistics are accumulated as pixel-weighted sums, so a
  smaller final batch is weighted by its true size instead of counting as
  much as a full batch.

  Args:
      data_set: a torch Dataset of (image tensor, label) pairs.

  Returns:
      (mean, std): two 1-D tensors with one entry per channel, in the dtype
      of the input images.

  Raises:
      ValueError: if the data set yields no images.
  """
  image_loader = DataLoader(data_set,
                            batch_size  = 64,
                            shuffle     = False,
                            num_workers = 1,
                            pin_memory  = True)

  ####### ACCUMULATE SUMS

  channel_sum, channel_sum_sq, pixel_count = 0, 0, 0
  out_dtype = None
  for inputs, _ in tqdm(image_loader):
    if out_dtype is None:
      out_dtype = inputs.dtype
    # accumulate in float64: the running sums grow with the data set size
    values = inputs.double()
    channel_sum = channel_sum + values.sum(dim=[0, 2, 3])
    channel_sum_sq = channel_sum_sq + (values ** 2).sum(dim=[0, 2, 3])
    # number of values that contributed to each channel in this batch
    pixel_count += inputs.size(0) * inputs.size(2) * inputs.size(3)

  if pixel_count == 0:
    raise ValueError("cannot compute mean/std of an empty data set")

  ####### FINAL CALCULATIONS

  total_mean = channel_sum / pixel_count
  # clamp: rounding can push a near-zero variance slightly below zero
  total_var = (channel_sum_sq / pixel_count - total_mean ** 2).clamp(min=0)
  total_std = total_var.sqrt()

  return total_mean.to(out_dtype), total_std.to(out_dtype)

## Models

### Base Model Class
The base learning model used in the article - ResNet18

In [None]:
class Model(nn.Module):
    """ResNet18 backbone with its final layer replaced by a fresh classifier."""

    def __init__(self, num_class):
        super().__init__()

        # backbone: every child of the pretrained ResNet18 except its 'fc' head
        backbone = resnet18(pretrained=True)
        kept = [module for name, module in backbone.named_children() if name != 'fc']
        self.features = nn.Sequential(*kept)

        # classifier: maps the 512 backbone features to num_class outputs
        self.fc = nn.Linear(512, num_class)

    def forward(self, x):
        feature_map = self.features(x)
        # flatten everything but the batch dimension before the linear layer
        flat = feature_map.view(feature_map.size(0), -1)
        return self.fc(flat)


### DREML
**Stage 1** - A DREML implementation based on the sklearn model interface

Note: meta_class_size_rate indirectly refers to D, because D is dependent on the number of classes within the training data.

#### Description:
The algorithm we have chosen (DREML - Deep Randomized Ensembles for Metric Learning) is an ensemble-based deep learning model for the creation of image-embedding functions in a randomized manner.

The main concept that the algorithm is based on, is grouping the class labels of a given dataset into l partitions of d sets each (called meta-classes), where every such set in a given partition contains several different labels chosen randomly. These l partitions constitute an ensemble of l different deep models, where for every partition the image is mapped to the meta-class corresponding to its label. That is essentially what the child model is trained on, which is the mapping between the image and the new meta-class labels that constitute an embedding.

At the end of the training process, we have l different embedding functions in the form of the trained child models. The ensemble embedding concatenates all of its children's embeddings into a single d * l dimensional embedding vector. This vector can be used via metric learning to identify groups of images, for classification, regression, or the subject of the article: image retrieval.

#### Advantages:
* The randomized aspect of the ensemble allows for diverse partitions of the classes into meta-classes. This can uncover particularly good meta-class partitions that will significantly boost its metrics, and given this random aspect, repeating the training process of the algorithm might improve results from previous iterations.
* Using meta-classes, which in some way represent the relations between random classes of the original data-set, allows for a better understanding of the data-set and might allow us to better identify new, unseen classes that were not included in the training data. For example, if we had trained the algorithm to identify trucks and cars, and a meta-class had included these two labels, then the algorithm might be better at detecting pick-up trucks even if they were not previously included in the data set (since pick-up trucks can be considered an amalgamation of the two).
* This type of ensemble considers the information from all of its child models and does not disregard the resulting embedding of any child, which improves the performance of the model.

#### Disadvantages:
* The most glaring issue of the algorithm is in its runtime. Since we are essentially training a whole ensemble of deep learning models within the image-vision field, sometimes up to tens of such models, this comes at a sizable cost of training the model. This forces the end user to adopt more high-computation methods when training the model.
* As previously mentioned, the ensemble uses all of its children's embeddings to embed a single image without glossing over any one model. This, in turn, ups the time used to embed the image, which affects the inference time of the model.

In [None]:
class DREML(sklearn.base.BaseEstimator):
    """
      The DREML algorithm behind the sklearn estimator interface, with an
      added SVM classification stage.

      An ensemble of `ensemble_size` child networks is trained, each on its
      own random grouping of the classes into meta classes. The normalised
      outputs of all children are concatenated into one embedding per image,
      and an SVM (tuned with a randomized search over `param_grid`) is fitted
      on those embeddings.

      Parameters:
        name: run name, used in progress bars and in the checkpoint path.
        ensemble_size: number of child models.
        meta_class_size_rate: fraction of the number of classes used as the
          number of meta classes (D) of every child; must lie in [0, 1].
        verbose: forwarded to the SVM.
    """
    def __init__(self, name, ensemble_size=48, meta_class_size_rate=0.25, verbose=False):
        self.name = name
        if not (0 <= meta_class_size_rate <= 1):
            raise AttributeError("the meta_class_size_rate either exceeds 1 or is less than 0")
        self.ensemble_size = ensemble_size
        self.meta_class_size_rate = meta_class_size_rate
        self.meta_class_size = 1
        self.models = None
        self.verbose = verbose
        self.svm_model = None
        # best SVM hyper-parameters; filled in by fit()
        self.C = None
        self.kernel = None
        self.param_grid = {'C': [0.1, 1, 10, 100],
                           'kernel': ['rbf', 'sigmoid']}

    def initialize_models(self, data_dict):
        """Create the child models, each with its own random meta-class partition."""
        return [{"model": Model(self.meta_class_size).to(DEVICE),
                 # set the meta class partition for the current child model
                 "meta_id": create_id(self.meta_class_size, len(data_dict))}
                for _ in range(self.ensemble_size)]

    def fit(self, X, y, initialize_func=None):
        """Train every child model, then fit the SVM on the ensemble embeddings.

        Args:
            X: sequence of image paths.
            y: sequence of labels, aligned with X.
            initialize_func: optional callable `data_dict -> list of
                {"model", "meta_id"} dicts` that replaces initialize_models.

        Returns:
            self.
        """
        # Imported explicitly: `import sklearn.base` alone does not guarantee
        # that the sklearn.svm sub-module is loaded.
        from sklearn.svm import SVC

        # set the initialize_func to the original value
        if initialize_func is None:
            initialize_func = self.initialize_models
        data_dict = get_data_dict(X, y)
        self.num_classes = len(data_dict)
        # get D as stated in the article from the rate. Clamped to
        # [1, num_classes]: a size of 0 (tiny rate) would make create_id
        # divide by zero.
        self.meta_class_size = min(
            self.num_classes,
            max(1, math.ceil(self.meta_class_size_rate * self.num_classes)))
        self.models = initialize_func(data_dict)

        # identical for every child, so computed once outside the loop
        idx_to_class = dict(enumerate(sorted(data_dict)))
        checkpoint_dir = f"{MODELS_DIR}/{self.name}/epochs/"

        for i in tqdm(range(1, self.ensemble_size + 1), postfix={'Ensemble': ''}):
            child = self.models[i - 1]
            # load the data_dict using the meta_class partition for the current child model
            meta_data_dict = load_data(child["meta_id"], idx_to_class, data_dict)

            # train the child model using the meta_class partition:
            self.optimizer = Adam(child["model"].parameters(), lr=1e-4)
            lr_scheduler = MultiStepLR(self.optimizer, milestones=[int(NUM_EPOCHS * 0.5), int(NUM_EPOCHS * 0.7)], gamma=0.1)
            self.criterion = CrossEntropyLoss()

            best_acc, best_model = 0, None
            self.curr_train_acc, self.curr_train_loss, self.best_train_acc = 0.0, 0.0, 0.0
            for _ in tqdm(range(1, NUM_EPOCHS + 1), postfix={'Epoch': ''}, leave=False):
                train_loss, train_acc = self.train(child["model"], meta_data_dict)
                self.curr_train_acc = train_acc
                self.curr_train_loss = train_loss
                # deep copy the model if its performance on the current child
                # meta_class partition is better. The first epoch is always
                # kept so that best_model is never left as None.
                if best_model is None or train_acc > best_acc:
                    best_acc = train_acc
                    self.best_train_acc = best_acc
                    best_model = copy.deepcopy(child["model"])
                    os.makedirs(checkpoint_dir, exist_ok=True)
                    torch.save(child["model"].state_dict(), '{}/{}/epochs/{}_model_{:03}.pth'.format(MODELS_DIR, self.name, DATA_NAME, i))
                lr_scheduler.step()
            if best_model is not None:  # stays None only when NUM_EPOCHS < 1
                child["model"] = best_model

        # predict with the child models and get the concatenated coordinates as features
        features, labels = self.fit_predict_DREML(data_dict)

        # training the svm model based on the returned features
        order = list(range(len(features)))
        random.shuffle(order)
        embedded_X = [features[j] for j in order]
        embedded_y = [labels[j] for j in order]
        # C and kernel are not set here: every search candidate supplies both
        model = SVC(verbose=self.verbose, probability=True)
        cv = KFold(n_splits=3, shuffle=True, random_state=1)  # 3 splits
        search = RandomizedSearchCV(model, self.param_grid, scoring='accuracy', n_jobs=1, cv=cv, refit=True)  # define search
        results = search.fit(embedded_X, embedded_y)
        best = results.best_estimator_
        params = best.get_params()
        self.svm_model = best
        self.classes_ = best.classes_
        self.C = params['C']
        self.kernel = params['kernel']
        return self

    def predict(self, X):
        """Predict the labels of the images whose paths are given in X."""
        features = self.predict_DREML(X)  # get the concatenated coordinates
        return self.svm_model.predict(features)  # feed the coordinates to the SVM model

    def predict_proba(self, X):
        """Predict per-class probabilities (columns follow `classes_`)."""
        features = self.predict_DREML(X)  # get the concatenated coordinates
        return self.svm_model.predict_proba(features)  # feed the coordinates to the SVM model

    def train(self, net, data_dict):
        """Run a single training epoch of `net` over `data_dict`.

        Uses `self.optimizer` and `self.criterion`, which fit() sets up.

        Returns:
            (loss, accuracy), both averaged over the samples of the epoch.
        """
        net.train()  # set the mode to training
        data_set = ImageReader(data_dict, get_transform(DATA_NAME, 'train'))
        data_loader = DataLoader(data_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

        l_data, t_data, n_data = 0.0, 0, 0
        progress_bar = tqdm(data_loader, leave=False,
                            postfix={'curr model': self.name,
                                     'curr_acc': self.curr_train_acc,
                                     'best_acc': self.best_train_acc,
                                     'loss': self.curr_train_loss})
        for inputs, labels in progress_bar:
            # pass the batch
            self.optimizer.zero_grad()
            out = net(inputs.to(DEVICE))
            loss = self.criterion(out, labels.to(DEVICE))
            loss.backward()  # calc gradients
            self.optimizer.step()  # update weights
            # calc values for loss and accuracy
            _, pred = torch.max(out, 1)
            batch_len = len(labels)
            # the criterion returns the batch mean, so weight it by the batch
            # size before dividing by the total number of samples below
            l_data += loss.item() * batch_len
            t_data += torch.sum(pred.cpu() == labels).item()
            n_data += batch_len

        return l_data / n_data, t_data / n_data  # loss, accuracy

    def _embed(self, data_set, collect_labels=False, show_batches=False):
        """Embed every image of `data_set` with all child models.

        Returns `(features, labels)`: `features` is a float64 array with one
        row per image holding the concatenated, normalised outputs of all
        children. `labels` is an object array of the data set labels in the
        same order, or None when `collect_labels` is False.
        """
        per_model_features = []
        label_chunks = []
        for i in tqdm(range(len(self.models)), leave=True):
            net = self.models[i]["model"]
            net.eval()
            # shuffle=False keeps the row order identical across children
            data_loader = DataLoader(data_set, BATCH_SIZE, shuffle=False, num_workers=2)
            batches = tqdm(data_loader, leave=False) if show_batches else data_loader

            chunks = []
            with torch.no_grad():
                for inputs, labels in batches:
                    # predict the batch
                    out = F.normalize(net(inputs.to(DEVICE)))
                    chunks.append(out.cpu().numpy())
                    # every child sees the same images in the same order, so
                    # the labels are gathered during the first pass only
                    if collect_labels and i == 0:
                        label_chunks.append(np.asarray(labels))
            per_model_features.append(np.concatenate(chunks, axis=0).astype(np.float64))

        # concatenate all coordinates in all models of an image;
        # every row represents the coordinates of a single image
        features = np.concatenate(per_model_features, axis=1)
        if not collect_labels:
            return features, None
        all_labels = np.concatenate([np.empty(shape=(0), dtype=object)] + label_chunks, axis=0)
        return features, all_labels

    def fit_predict_DREML(self, data_dict):
        """
        return two arrays with the same length (amount of pictures), where for a
        given i the respective entry in the first array will contain the
        concatenated coordinates of an image and the entry in the second array
        will contain the image's label.
        """
        data_set = LabelConsistentImageReader(data_dict, get_transform(DATA_NAME, 'test'))
        return self._embed(data_set, collect_labels=True, show_batches=True)

    def predict_DREML(self, X):
        """
        return an array where every row will represent the concatenated coordinates of a single image, to be used as features for the SVM model
        """
        # a single dummy label keeps the images in the order of X
        data_dict = get_data_dict(X, [0]*len(X))
        data_set = ImageReader(data_dict, get_transform(DATA_NAME, 'test'))
        features, _ = self._embed(data_set)
        return features

    def get_labels(self, data_dict):
        """Return the integer labels that ImageReader assigns to `data_dict`."""
        data_set = ImageReader(data_dict, get_transform(DATA_NAME, 'test'))
        return data_set.labels

    def get_images(self, data_dict):
        """Return the image paths of `data_dict` in ImageReader order."""
        data_set = ImageReader(data_dict, get_transform(DATA_NAME, 'test'))
        return data_set.images


### Improved DREML
**Stage 2** - In order to improve DREML, we suggest using a different D for each child model.

Note: This class inherits from the normal DREML since it contains similar code apart from the initialization of the meta_class partitions.

#### Description




In [None]:
# DREML with changing D of children

class Improved_DREML(DREML):
    """
      DREML variant in which every child model uses a different number of
      meta classes (D): child k uses the rate `initial_rate + k * rate_step`.
      Training and prediction are otherwise inherited unchanged from DREML.

      Parameters:
        name: run name, forwarded to DREML.
        initial_rate: meta-class rate of the first child; must be >= 0.
        rate_step: increase of the rate from one child to the next; must be > 0.
        ensemble_size: number of child models.
        verbose: forwarded to DREML.
    """
    def __init__(self, name, initial_rate=0.05, rate_step=0.01, ensemble_size=24, verbose=False):
        super().__init__(name, ensemble_size, initial_rate, verbose)
        self.initial_rate = initial_rate
        self.rate_step = rate_step

        # the rate of the last child is the largest one and must not exceed 1
        if self.initial_rate + (self.ensemble_size - 1) * self.rate_step > 1 or \
                self.initial_rate < 0 or self.rate_step <= 0:
            raise AttributeError("The maximum meta_class_size_rate given to the children exceeds 1, or the values of the initial_rate and the rate_step are illegal")

    def varied_initialize_models(self, data_dict):
        """Create the child models, child k having its own meta-class count."""
        def meta_class_size_f(k):
            size = math.ceil((self.initial_rate + k * self.rate_step) * self.num_classes)
            # Clamped to [1, num_classes]: initial_rate == 0 passes validation
            # but gives size 0, which would make create_id divide by zero.
            return min(self.num_classes, max(1, size))

        models = []
        for i in range(self.ensemble_size):
            size = meta_class_size_f(i)
            models.append({"model": Model(size).to(DEVICE),
                           "meta_id": create_id(size, len(data_dict))})
        return models

    def fit(self, X, y, initialize_func=None):
        """Fit like DREML.fit, defaulting to the varied-D initialisation.

        A non-None `initialize_func` is honoured; otherwise
        varied_initialize_models is used. Returns whatever DREML.fit returns.
        """
        if initialize_func is None:
            initialize_func = self.varied_initialize_models
        return super().fit(X, y, initialize_func)

### Baseline
**Stage 3** - The ResNet model, wrapped with skorch so that it can be fed into sklearn's RandomizedSearchCV. The ImageReader is used as a pre-processing step to convert the X values from image paths into the actual tensors representing those images.

In [None]:
class CustomResNet(sklearn.base.BaseEstimator):
  """
    Baseline: the ResNet18-based Model trained through skorch, behind the
    sklearn estimator interface. X holds image paths, which are turned into
    tensors by ImageReader.

    Parameters:
      num_class: number of output classes of the network.
      lr: learning rate.
      optimizer: torch optimizer class.
      batch_size: training batch size.
  """
  def __init__(self, num_class, lr=0.01, optimizer=torch.optim.SGD, batch_size=32):
    self.num_class = num_class
    self.optimizer = optimizer
    self.batch_size = batch_size
    self.lr = lr
    # The network is built in fit(), not here: a hyper-parameter search
    # applies its candidates with set_params() after construction, so a net
    # built in __init__ would always train with the constructor values.
    self.skorch_model = None

  def _build_net(self):
    """Create a fresh skorch net from the current hyper-parameter attributes."""
    model = Model(self.num_class).to(DEVICE)
    return skorch.NeuralNet(model,
                            max_epochs=NUM_EPOCHS,
                            optimizer=self.optimizer,
                            batch_size=self.batch_size,
                            lr=self.lr,
                            # Shuffle training data on each epoch
                            iterator_train__shuffle=True,
                            device=DEVICE,
                            criterion=torch.nn.CrossEntropyLoss,
                            verbose=10)

  def fit(self, X, y):
    """Train a fresh network on the images X (paths) with labels y; returns self."""
    data_dict = get_data_dict(X, y)
    data_set = ImageReader(data_dict, get_transform(DATA_NAME, 'train'))
    # integer index used by the network -> original label
    self.class_dictionary = data_set.get_label_dictionary()
    self.skorch_model = self._build_net()
    self.skorch_model.fit(data_set)
    print(f'class_dictionary: {self.class_dictionary}\nclass_dictionary size: {len(self.class_dictionary)}')
    self.classes_ = [self.class_dictionary[i] for i in range(len(np.unique(y)))]
    self.params = {'lr': self.lr,
                   'optimizer': self.optimizer,
                   'batch_size': self.batch_size}
    return self

  def predict(self, X):
    """Return the predicted original labels for the images X, as a list."""
    y_preds_probabilities = self.predict_proba(X)  # using predict proba to find the values
    y_preds_indices = np.argmax(y_preds_probabilities, axis=1)
    return [self.class_dictionary[i] for i in y_preds_indices]

  def predict_proba(self, X):
    """Return one row of class scores per image, each row scaled to sum to 1."""
    if self.skorch_model is None:
      raise RuntimeError("CustomResNet must be fitted before predicting")
    # a single dummy label keeps the images in the order of X
    data_dict = get_data_dict(X, [0]*len(X))
    data_set = ImageReader(data_dict, get_transform(DATA_NAME, 'test'))
    val = self.skorch_model.predict_proba(data_set)
    # NOTE(review): dividing by the row sum is only a valid normalisation when
    # skorch already returns non-negative scores (e.g. softmax outputs) —
    # confirm for the installed skorch version.
    return val / np.sum(val, axis=1, keepdims=True)


def get_ResNet_model(num_class):
  """Factory for the baseline: a CustomResNet with default hyper-parameters."""
  baseline = CustomResNet(num_class)
  return baseline

## Main

The data partitions that were used to generate the results (each element is a class label):

In [None]:
# Recorded class partitions, keyed by dataset name. Each dataset holds five
# partitions, and every partition is a list of class labels (strings).
partitions = {'car': [
                      ['67', '45', '24', '41', '82', '47', '94', '97', '15', '4', '64', '19', '43', '46', '28', '11', '74', '14', '86', '71', '52', '60', '77', '35'],
                      ['68', '18', '63', '42', '72', '52', '82', '64', '91', '20', '77', '41', '10', '55', '8', '70', '45', '31', '36', '76', '84', '44', '4', '62'],
                      ['42', '96', '65', '6', '19', '60', '47', '97', '85', '76', '26', '51', '40', '52', '88', '91', '3', '77', '25', '78', '34', '87', '31', '32'],
                      ['27', '72', '14', '92', '79', '55', '15', '10', '83', '84', '71', '16', '41', '48', '73', '8', '96', '52', '68', '30', '26', '54', '22', '31'],
                      ['4', '26', '93', '87', '44', '17', '56', '28', '71', '95', '77', '88', '85', '11', '98', '29', '8', '37', '16', '53', '60', '27', '84', '22']
                     ],
              'cub': [
                      ['11', '24', '65', '56', '57', '22', '48', '58', '72', '10', '43', '97', '90', '49', '75', '68', '55', '16', '37', '54', '40', '39', '98', '34'],
                      ['97', '3', '43', '94', '82', '27', '93', '26', '7', '28', '4', '96', '63', '2', '51', '89', '52', '37', '62', '47', '11', '68', '5', '85'],
                      ['52', '70', '21', '95', '48', '58', '57', '65', '69', '11', '1', '98', '73', '66', '13', '93', '80', '30', '14', '77', '97', '64', '24', '22'],
                      ['68', '41', '99', '94', '38', '76', '2', '80', '93', '69', '79', '24', '14', '58', '100', '31', '19', '18', '92', '12', '70', '36', '20', '71'],
                      ['58', '25', '32', '84', '72', '89', '87', '55', '95', '4', '100', '28', '51', '1', '2', '14', '67', '36', '43', '73', '88', '44', '92', '9']
                     ],
              'flowers': [
                          ['80', '44', '83', '2', '71', '56', '46', '15', '11', '72', '20', '70', '75', '63', '14', '90', '51', '87', '77', '55', '17', '5', '37', '66', '30', '36', '49', '52', '85'],
                          ['92', '52', '14', '48', '18', '56', '61', '12', '63', '20', '86', '88', '60', '91', '87', '42', '41', '23', '36', '77', '30', '22', '93', '32', '37', '80', '44', '83', '5'],
                          ['87', '37', '92', '75', '65', '14', '20', '63', '47', '60', '15', '12', '73', '81', '68', '71', '38', '44', '64', '93', '84', '42', '74', '49', '91', '18', '46', '59', '19'],
                          ['76', '66', '53', '38', '74', '18', '12', '75', '19', '60', '37', '87', '4', '47', '54', '59', '77', '70', '51', '69', '29', '91', '93', '48', '11', '85', '90', '78', '32'],
                          ['74', '14', '20', '42', '8', '52', '92', '38', '43', '5', '19', '22', '28', '91', '73', '41', '64', '4', '18', '89', '93', '56', '47', '44', '46', '84', '59', '23', '51']
                         ],
              'caltech': [
                          ['068.fern', '172.revolver-101', '193.soccer-ball', '191.sneaker', '189.snail', '123.ketch-101', '082.galaxy', '149.necktie', '165.pram', '212.teapot', '089.goose', '091.grand-piano-101', '037.chess-board', '032.cartman', '097.harmonica', '218.tennis-racket', '169.radio-telescope', '115.ice-cream-cone', '081.frying-pan', '111.house-fly', '058.doorknob', '155.paperclip', '018.bowling-pin'],
                          ['138.mattress', '123.ketch-101', '021.breadmaker', '180.screwdriver', '154.palm-tree', '113.hummingbird', '141.microscope', '091.grand-piano-101', '167.pyramid', '107.hot-air-balloon', '076.football-helmet', '175.roulette-wheel', '178.school-bus', '111.house-fly', '189.snail', '077.french-horn', '093.grasshopper', '165.pram', '135.mailbox', '025.cactus', '046.computer-monitor', '204.sunflower-101', '068.fern'],
                          ['101.head-phones', '110.hourglass', '207.swan', '132.light-house', '002.american-flag', '173.rifle', '167.pyramid', '200.stained-glass', '196.spaghetti', '177.saturn', '104.homer-simpson', '040.cockroach', '140.menorah-101', '041.coffee-mug', '138.mattress', '004.baseball-bat', '199.spoon', '082.galaxy', '175.roulette-wheel', '037.chess-board', '100.hawksbill-101', '081.frying-pan', '018.bowling-pin'],
                          ['069.fighter-jet', '200.stained-glass', '192.snowmobile', '216.tennis-ball', '037.chess-board', '224.touring-bike', '182.self-propelled-lawn-mower', '102.helicopter-101', '122.kayak', '132.light-house', '170.rainbow', '142.microwave', '085.goat', '095.hamburger', '065.elk', '163.playing-card', '035.cereal-box', '160.pez-dispenser', '083.gas-pump', '077.french-horn', '012.binoculars', '038.chimp', '169.radio-telescope'],
                          ['033.cd', '006.basketball-hoop', '027.calculator', '078.fried-egg', '189.snail', '029.cannon', '210.syringe', '059.drinking-straw', '034.centipede', '168.raccoon', '154.palm-tree', '145.motorbikes-101', '019.boxing-glove', '209.sword', '091.grand-piano-101', '030.canoe', '184.sheet-music', '102.helicopter-101', '040.cockroach', '049.cormorant', '211.tambourine', '127.laptop-101', '119.jesus-christ']
                         ]
              }

In [None]:
# Run-wide settings: the dataset names, the batch size used by the DREML
# data loaders, and the number of training epochs per model.
DATA_NAMES, BATCH_SIZE, NUM_EPOCHS = ['car', 'flowers', 'cub', 'caltech'], 32, 12
# Train on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Normalization Results for each dataset:
# per-channel mean, indexed by dataset name (read by get_transform)
rgb_mean = {'car': [0.4853, 0.4965, 0.4295],
            'cub': [0.4707, 0.4601, 0.4549],
            'flowers': [0.4326, 0.3730, 0.2803],
            'caltech': [0.55, 0.5299, 0.5009]}

# per-channel standard deviation, indexed by dataset name (read by get_transform)
rgb_std = {'car': [0.2237, 0.2193, 0.2568],
           'cub': [0.2767, 0.2760, 0.2850],
           'flowers': [0.2972, 0.2442, 0.2651],
           'caltech': [0.3185, 0.3152, 0.2383]}



def data_to_Xy_and_classes(data):
  """Generate a random, sub-sampled partition of a dataset as flat arrays.

  A random subset of classes is drawn with ``generate_classes_list``; for
  every drawn class only the leading ``inclass_use_rate`` fraction of its
  samples is kept.

  Args:
    data: dict with at least the keys ``'data'`` (mapping of class label to
      a sliceable sequence of samples) and ``'inclass_use_rate'`` (fraction
      of each class's samples to keep). ``generate_classes_list`` reads
      further keys from the same dict.

  Returns:
    A tuple ``(X, y, classes_list)``: ``X`` is a numpy array of the kept
    samples, ``y`` a numpy array with the label of each sample in ``X``,
    and ``classes_list`` the list of class labels that were drawn.
  """
  classes_list = generate_classes_list(data)
  # Set lookup keeps the per-label membership test constant-time.
  selected_classes = set(classes_list)
  X = []
  y = []
  for label in tqdm(data['data'], 'Pre-processing: Randomly partitioning the dataset...'):
    if label not in selected_classes:
      continue
    # for every class included in the partition, take the first values based on the inclass_use_rate
    samples = data['data'][label]
    length = int(len(samples) * data['inclass_use_rate'])
    kept = samples[: length]
    X.extend(kept)
    # One label per kept sample; len(kept) is used because the slice may be
    # shorter than `length`.
    y.extend([label] * len(kept))
  return np.asarray(X), np.asarray(y), classes_list

def generate_classes_list(data):
  """Draw a random subset of class labels and write it to a description file.

  The number of classes drawn is ``ceil(num_classes * class_use_rate)``.
  The drawn list is written (as its ``str`` representation) to
  ``{MODELS_DIR}/results/{DATA_NAME}/{partition}_description.txt``.

  NOTE: besides ``data`` this function reads the module-level names
  ``MODELS_DIR``, ``DATA_NAME`` and ``partition``; the latter two must be
  bound before the call.

  Args:
    data: dict with the keys ``'data'`` (mapping whose keys are the class
      labels), ``'num_classes'`` and ``'class_use_rate'``.

  Returns:
    The list of randomly selected class labels.
  """
  num_of_trunc_classes = math.ceil(data['num_classes'] * data['class_use_rate'])
  # Shuffle a copy of the labels and keep the leading slice; shuffle-then-slice
  # is kept as-is so seeded runs draw the same partition.
  shuffled_labels = list(data['data'].keys())
  random.shuffle(shuffled_labels)
  lis = shuffled_labels[: num_of_trunc_classes]
  results_dir = f"{MODELS_DIR}/results/{DATA_NAME}"
  os.makedirs(results_dir, exist_ok=True)
  # Context manager guarantees the handle is closed even if the write fails.
  with open(f'{results_dir}/{partition}_description.txt', 'w') as f:
    f.write(str(lis))
  return lis

# Registry of the models under comparison. Each entry holds:
#   model_factory - callable invoked as factory(name=..., verbose=...) to build
#                   a fresh estimator,
#   space         - hyper-parameter candidates handed to RandomizedSearchCV.
models = dict(
    DREML=dict(
        model_factory=DREML,
        space=dict(
            ensemble_size=[8, 12, 24],
            meta_class_size_rate=[0.1, 0.2, 0.3],
        ),
    ),
    resnet=dict(
        # `name` and `verbose` are accepted only so this factory can be called
        # like the others; `num_class` is a module-level name looked up when
        # the lambda is called, not when it is defined.
        model_factory=lambda name, verbose: get_ResNet_model(num_class),
        space=dict(
            lr=[0.1, 0.01, 0.005, 0.001, 0.0005],
            optimizer=[torch.optim.SGD, torch.optim.Adam, torch.optim.RMSprop],
            batch_size=[16, 32, 64, 128],
        ),
    ),
    improved_DREML=dict(
        model_factory=Improved_DREML,
        space=dict(
            ensemble_size=[8, 12, 24],
            initial_rate=[0.02],
            rate_step=[0.005, 0.01, 0.02],
        ),
    ),
)

# Per-dataset configuration. Each entry holds:
#   data             - mapping of class label to that class's samples, loaded
#                      here at definition time,
#   class_use_rate   - fraction of `num_classes` drawn for each partition,
#   num_classes      - class count the fraction above is applied to,
#   inclass_use_rate - fraction of each drawn class's samples that is kept.
DATA = dict(
    caltech=dict(
        data=get_data_set(f'{DATASET_DIR}/caltech/256_ObjectCategories'),
        class_use_rate=0.5,
        num_classes=225,
        inclass_use_rate=0.25,
    ),
    car=dict(
        data=torch.load(f'{DATASET_DIR}/car/data_dicts.pth')['train'],
        class_use_rate=0.12,
        num_classes=196,
        inclass_use_rate=1 / 3,
    ),
    cub=dict(
        data=torch.load(f'{DATASET_DIR}/cub/data_dicts.pth')['train'],
        class_use_rate=0.12,
        num_classes=200,
        inclass_use_rate=0.25,
    ),
    flowers=dict(
        data=get_data_set(f'{DATASET_DIR}/flowers'),
        class_use_rate=0.4,
        num_classes=71,
        inclass_use_rate=0.25,
    ),
)
# Experiment driver. For every dataset, 5 random class partitions are drawn;
# on each partition every registered model is evaluated with nested
# cross-validation: 10 stratified outer folds, and inside each outer fold a
# RandomizedSearchCV with a 2-fold inner split picks the hyper-parameters.
#
# DATA_NAME and partition are deliberately module-level loop variables:
# generate_classes_list reads them when writing its description file.
s = ''  # cumulative text report over every fold evaluated so far
for DATA_NAME in DATA_NAMES:
  for partition in range(5):

    data = DATA[DATA_NAME]  # data contains a dictionary where the data itself, class_use_rate, num_classes and inclass_use_rate can be obtained
    X, y, classes_list = data_to_Xy_and_classes(data)  # generate a random, sampling partition of the current dataset
    classes_list.sort()
    num_class = len(classes_list)  # looked up by the 'resnet' factory lambda when it is called

    for model_name in models:
      print(f'train {model_name} on {DATA_NAME}_{partition}')
      model_type = models[model_name]
      cv_outer = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

      # `i` is the 1-based index of the outer fold.
      for i, (train_ix, test_ix) in enumerate(cv_outer.split(X, y), start=1):
        X_train, X_test = X[train_ix], X[test_ix]
        y_train, y_test = y[train_ix], y[test_ix]
        cv_inner = KFold(n_splits=2, shuffle=True, random_state=1)
        # `model` is only the template given to the search; the fitted
        # estimator to evaluate is `best_model` below.
        model = model_type['model_factory'](name=f'{DATA_NAME}_{partition}', verbose=True)
        space = model_type['space']
        search = RandomizedSearchCV(model, space, scoring='accuracy', n_jobs=1, cv=cv_inner, refit=True, verbose=10)

        train_time = time()
        result = search.fit(X_train, y_train)  # cross validate (find hyper params) and fit
        train_time = time() - train_time

        best_model = result.best_estimator_

        print(f'predict of {model_name} on {DATA_NAME}_{partition}')
        test_time = time()
        yhat = best_model.predict(X_test)
        # Inference time in milliseconds per test sample.
        test_time = (time() - test_time) * 1000 / len(y_test)

        print(f'predict_proba of {model_name} on {DATA_NAME}_{partition}')
        y_proba = best_model.predict_proba(X_test)

        # calculate results:

        # Reported under the "Accuracy" label, but computed as balanced accuracy.
        acc = metrics.balanced_accuracy_score(y_test, yhat)
        print(f'acc: {acc}')

        tpr = metrics.recall_score(y_test, yhat, average='weighted')
        fpr = 1 - specificity_score(y_test, yhat, average='weighted')

        precision = metrics.precision_score(y_test, yhat, average='weighted')

        auc = metrics.roc_auc_score(y_test, y_proba, average='weighted', multi_class='ovr', labels=list(np.unique(y_test)))
        # `classes` is passed by keyword: it is a keyword-only parameter of
        # label_binarize in current scikit-learn releases.
        # NOTE(review): the score is computed from the hard predictions
        # (yhat), not from y_proba - confirm this is the intended AUPRC.
        auprc = metrics.average_precision_score(
            label_binarize(y_test, classes=classes_list),
            label_binarize(yhat, classes=classes_list),
            average="weighted")
        expanded_spaces = ['C', 'kernel', *space]

        s += (f'Model Name: {model_name}\n'
              f'Data Name: {DATA_NAME}-{partition}\n'
              f'Cross Validation: {i}\n'
              f'Hyper-Parameters: {get_hyper_params(best_model.params, expanded_spaces)}\n'
              f'Accuracy: {acc}\n'
              f'TPR: {tpr}\n'
              f'FPR: {fpr}\n'
              f'Precision: {precision}\n'
              f'AUC: {auc}\n'
              f'Area under the Precision-Recall Curve: {auprc}\n'
              f'Training Time: {train_time}\n'
              f'Inference Time: {test_time}\n\n')
        os.makedirs(f"{MODELS_DIR}/results", exist_ok=True)
        # save the results; each per-fold file receives the cumulative report
        # so far, not only this fold's section.
        with open(f'{MODELS_DIR}/results/{model_name}_{DATA_NAME}-{partition}_{i}.txt', 'w') as f:
          f.write(s)

        clear_output()  # clean the screen
        print(s)
 