The goal of this benchmark is to offer a comparison between different pre-trained image-recognition networks. Here the recognition is performed on the [Imagenet](http://image-net.org/) dataset, which allows us to work on natural images for 1000 labels. Some image transformations such as grayscale conversion or downsampling are also implemented, in order to assess their influence on the accuracy of the networks.

In this notebook I used the [Pytorch](https://pytorch.org/) library to run the networks and the [pandas](https://pandas.pydata.org/docs/getting_started/index.html) library to collect and display the results. This notebook was written during a Master 1 internship by [Jean-Nicolas Jérémie](https://github.com/JNJER) under the supervision of [Laurent PERRINET](https://laurentperrinet.github.io/), researcher at the Neurosciences Institute of Timone (INT).

<!-- TEASER_END -->


# Initialization of the benchmark

## Importing libraries and defining the dataset:

In [None]:
%matplotlib inline
%mkdir -p DCNN_benchmark

In [None]:
%%writefile DCNN_benchmark/init.py
# import libs
import os
import time 
from time import strftime,gmtime
import json
import time 
import os
import numpy as np
import imageio
from numpy import random
from torchvision.datasets import ImageFolder
import matplotlib.pyplot as plt
import urllib.request
# to store results
import pandas as pd

# figure's variables
fig_width = 20
# squared golden ratio, used as the width/height ratio of the figures
phi = ((np.sqrt(5) + 1) / 2) ** 2
colors = ['b', 'r', 'k', 'g']

# host & date's variables
HOST = os.uname()[1]  # node name of the machine, used to tag the result files
#datetag = strftime("%Y-%m-%d", gmtime())
datetag = '2020-08-27'

# dataset configuration

image_size = 256  # default image resolution
image_sizes = 2**np.arange(6, 10)  # resolutions explored

N_images_per_class = 100
#i_labels = random.randint(1000, size=(N_labels)) # Random choice
i_labels = [409, 530, 892, 487, 920, 704, 879, 963, 646, 620]  # Pre-selected classes
N_labels = len(i_labels)

root = 'data'
folder = 'imagenet_classes_100'
path = os.path.join(root, folder)  # data path

# Each line is split on ', ' and its second field is kept as the label,
# lower-cased and with underscores turned into spaces.
with open('ImageNet-datasets-downloader/imagenet_classes.txt') as f:
    labels = [line.strip().split(', ')[1].lower().replace('_', ' ') for line in f]

class_loader = 'ImageNet-datasets-downloader/imagenet_class_info.json'
with open(class_loader, 'r') as fp:  # get all the classes on the data_downloader
    name = json.load(fp)

# a reverse look-up-table giving the index of a given label (within the whole set of imagenet labels)
reverse_labels = {label: i_label for i_label, label in enumerate(labels)}
# a reverse look-up-table giving the index of a given i_label (within the sub-set of classes)
reverse_i_labels = {label: i_label for i_label, label in enumerate(i_labels)}

print('-'*24)
# choosing the selected classes for recognition: collect, for every selected
# label, the keys of the downloader's class table carrying the same class name
selected_ids = []
for i_label in i_labels:
    print('label', i_label, '=', labels[i_label])
    selected_ids.extend(key for key in name if name[key]['class_name'] == labels[i_label])
# space-separated list of IDs (each one followed by a space, as before)
id_dl = ''.join(key + ' ' for key in selected_ids)
print('label IDs = ', id_dl)

In [None]:
%run DCNN_benchmark/init.py

In [None]:
# Every host except 'fortytwo' runs the scripts inside the notebook.
do_local = HOST != 'fortytwo'
if not do_local:
    # Command used to launch the scripts outside of the notebook.
    python_exec = "KMP_DUPLICATE_LIB_OK=TRUE python3"

## Download of example images from ImageNet :

We use an [ImageNet dataloader](https://github.com/laurentperrinet/ImageNet-datasets-downloader) to populate a dataset based on the pre-selected or random classes listed in the `DCNN_benchmark/init.py` file.

In [None]:
# Path of the script that the next cell writes and that is executed afterwards.
scriptname = 'DCNN_benchmark/dataset.py'

In [None]:
%%writefile {scriptname}

from DCNN_benchmark.init import *

import shutil

# Make sure the dataset folder exists *before* listing it: listing a missing
# folder raises, which would make the creation branch unreachable.
if os.path.isdir(path):
    print("The folder " , folder, " already exists, it includes: ", os.listdir(path))
else:
    print(f"No existing path match for this folder, creating a folder at {path}")
    os.makedirs(path)

list_dir = os.listdir(path)
print(list_dir)

if len(list_dir) < N_labels:  # if there aren't enough labels download some more
    print('This folder do not have anough classes, downloading some more')  # using the downloader
    cmd = f"python3 ImageNet-datasets-downloader/downloader.py -data_root {root} -data_folder {folder} -images_per_class {N_images_per_class} -use_class_list True  -class_list {id_dl} -multiprocessing_workers 0"
    print('Command to run : ', cmd)
    os.system(cmd)  # running it
    list_dir = os.listdir(path)

elif len(list_dir) == N_labels:
    print(f'The folder already contains : {len(list_dir)} classes')

else:  # if there are too many entries delete the excess ones only
    print('The folder have to many classes, deleting some')
    # Names of the selected classes; the experiments look the folder names up
    # in `reverse_labels`, so folders are expected to be named after `labels`.
    selected = {labels[i_label] for i_label in i_labels}
    # Stable sort putting the entries that are NOT selected classes first, so
    # that they are the ones removed; only the surplus is deleted.
    surplus = sorted(list_dir, key=lambda entry: entry in selected)[:len(list_dir) - N_labels]
    for entry in surplus:
        target = os.path.join(path, entry)
        if os.path.isdir(target):
            shutil.rmtree(target)  # a class folder together with its images
        else:
            os.remove(target)  # a stray file
    list_dir = os.listdir(path)
    print("Now the folder " , folder, f" contains :", list_dir)


In [None]:
%mkdir -p {path}

In [None]:
# Execute the script inside the notebook when do_local is set,
# otherwise launch it in a separate python3 process.
if do_local:
    %run {scriptname}
else:
    !python3 {scriptname}

## Pre-trained network's import

Here we worked on four different pre-trained networks: `Alexnet`, `Mobilenet`, `Resnet101` (loaded below as the `resnext101_32x8d` variant) and `VGG16`:

In [None]:
# Path of the script that the next cell writes and that is executed afterwards.
scriptname = 'DCNN_benchmark/models.py'

In [None]:
%%writefile {scriptname}

from DCNN_benchmark.init import *

import torch
import torchvision
import torchvision.transforms as transforms

# Default preprocessing pipeline applied to every input image:
# resize, central crop 20 pixels smaller than `image_size`, conversion to a
# tensor, then per-channel normalization with the given mean and std.
transform = transforms.Compose([
    transforms.Resize(int(image_size)),
    transforms.CenterCrop(int(image_size-20)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# dataset read from the folder `path`, one sub-folder per class
image_dataset = ImageFolder(path, transform=transform)

# pre-trained networks, indexed by a short name
models = {
    'alex': torchvision.models.alexnet(pretrained=True),
    'vgg': torchvision.models.vgg16(pretrained=True),
    'mob': torchvision.models.mobilenet_v2(pretrained=True),
    'res': torchvision.models.resnext101_32x8d(pretrained=True),
}

# Use the first CUDA device when one is available, the CPU otherwise,
# and move every network onto it.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
for name in models:
    models[name].to(device)

In [None]:
%run {scriptname}

# Experiment 1: Image processing and recognition for different labels :

To recover the classification confidence of the models for the classes on which they have been trained, i.e. the 1000 classes of the ImageNet library, the softmax mathematical function is added at the output of the networks. The softmax function takes a vector (here a tensor) of K real values (here K=1000 trained classes) and returns for each of these values a normalized probability between 0 and 1, such that they sum to 1. Thus all the classes are represented in the final vector, and a low probability would then be evidence of absence. Nevertheless, as the recognition is carried out on so-called "natural" images, the irrelevant classes could add noise to the recognition of the classes of interest. To reduce this effect, we applied a slight modification to the output softmax function: it does not return a vector of K = 1000 values but of K = N_labels values. Thus the probabilities obtained correspond to a classification confidence discriminating only between the classes of interest.

We extract different factors, such as the accuracy and the processing time, for different datasets at different resolutions into a pandas object.

In [None]:
# Path of the script that the next cell writes and that is executed afterwards.
scriptname = 'experiment_basic.py'

In [None]:
%%writefile {scriptname}

#import model's script and set the output file
from DCNN_benchmark.models import *
import os

filename = f'results/{datetag}_results_1_{HOST}.json'

try:
    # Re-use the results of a previous run when they can be loaded.
    df = pd.read_json(filename)
except (FileNotFoundError, ValueError):
    # No usable results file (depending on the pandas version a missing file
    # surfaces as FileNotFoundError or ValueError): run the benchmark.
    # The output folder is created up front so that the final save cannot
    # fail on a missing directory once all the images have been processed.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    columns = ['model', 'perf', 'fps', 'time', 'label', 'i_label', 'i_image', 'filename', 'device']

    for model in models.values():
        model.eval()  # evaluation mode, set once rather than once per image

    rows = []  # one dict per (image, model) trial
    with torch.no_grad():  # inference only: no autograd bookkeeping
        for i_image, (data, label) in enumerate(image_dataset):
            # index, within the whole set of labels, of the class of this image
            i_label_top = reverse_labels[image_dataset.classes[label]]
            for name, model in models.items():
                tic = time.time()
                out = model(data.unsqueeze(0).to(device)).squeeze(0)
                # softmax restricted to the selected classes, in percent
                percentage = torch.nn.functional.softmax(out[i_labels], dim=0) * 100
                if device.type == 'cuda':
                    # CUDA kernels run asynchronously: wait for them so that
                    # the clock measures the actual computation
                    torch.cuda.synchronize()
                dt = time.time() - tic
                # confidence given to the true class of the image
                perf_ = percentage[reverse_i_labels[i_label_top]].item()
                rows.append({'model':name, 'perf':perf_, 'time':dt, 'fps': 1/dt,
                             'label':labels[i_label_top], 'i_label':i_label_top,
                             'i_image':i_image, 'filename':image_dataset.imgs[i_image][0], 'device':str(device)})
                print(f'The {name} model get {labels[i_label_top]} at {perf_:.2f} % confidence in {dt:.3f} seconds')
    df = pd.DataFrame(rows, columns=columns)
    df.to_json(filename)

In [None]:
# Execute the script inside the notebook when do_local is set,
# otherwise launch it with the external interpreter command `python_exec`.
if do_local:
    %run {scriptname}
else:
    !{python_exec} {scriptname}

## Image recognition on different labels: display

Here we collect our results, we can already display all the data in a table 

In [None]:
# Load the results of experiment 1 saved for this host and display them as a table.
filename = f'results/{datetag}_results_1_{HOST}.json'
df = pd.read_json(filename)
df

This graph shows the frequency of the classification performance for our four models. 

In [None]:
# One translucent histogram of the classification performance per model,
# all drawn on the same axes.
fig, axs = plt.subplots(figsize=(30, fig_width/phi))
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
axs.set_ylabel('Frequency', size=18)
axs.set_xlabel('Classification performance (%)', size=18)
bin_edges = np.linspace(0, 100, 20)
for color, name in zip(colors, models):
    perf_of_model = df.loc[df['model'] == name, 'perf']
    perf_of_model.plot.hist(bins=bin_edges, lw=0, alpha=0.8, label=name, ax=axs)
axs.legend(loc=0, fontsize=20)
axs.set_title('Process on : ' + str(df['device'][0]), size=20)

Here we display the 64 worst classification performances, all models combined:

In [None]:
# Grid of the images that obtained the lowest classification performance.
N_image_i = 8
N_image_j = 8
fig, axs = plt.subplots(N_image_i, N_image_j, figsize=(15, 15))
# argsort yields *positions* (ascending performance): rows are therefore
# fetched with the positional indexer .iloc rather than the label-based .loc.
worst_positions = df["perf"].to_numpy().argsort()[:(N_image_i*N_image_j)]
for i, position in enumerate(worst_positions):
    trial = df.iloc[position]
    ax = axs[i%N_image_i][i//N_image_i]  # the grid is filled column by column
    ax.imshow(imageio.imread(image_dataset.imgs[trial['i_image']][0]))
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel(trial['label'] + ' | ' + trial['model'], color='g')
    perf_ = trial['perf']
    ax.set_ylabel(f'{perf_:2.1f}', color='g')

A display of the different computation times of each model on the same dataset for a single resolution:

In [None]:
# Computation time of every trial, one colored marker series per model.
fig, axs = plt.subplots(figsize=(30, fig_width/phi))
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)

axs.set_ylabel('Computation time  (s)', size=18)
axs.set_xlabel('Trial', size=18)
axs.set_ylim(0, 1)
for color, name in zip(colors, models):
    time_of_model = df.loc[df['model'] == name, 'time']
    time_of_model.plot(label=name, color=color, marker='s', lw=0, ax=axs)
axs.legend(loc=0, fontsize=20)
axs.set_title('Process on : ' + str(df['device'][0]), size=20)

To make it even clearer, we extracted the median of each quantity for each model:

Accuracy's median

In [None]:
# Median classification performance of each model over all its trials.
for name in models:
    selected = df.loc[df['model'] == name, "perf"]
    med_perf = np.median(selected)
    print(f'For the {name} model, the median clasification performance =  {med_perf:.1f} %')

Computation time 's median

In [None]:
# Median computation time of each model over all its trials.
for name in models:
    selected = df.loc[df['model'] == name, "time"]
    med_perf = np.median(selected)
    print(f'For the {name} model, the median computation time  =  {med_perf:.3f} s')

Frame per second's median

In [None]:
# Median number of frames per second of each model over all its trials.
for name in models:
    selected = df.loc[df['model'] == name, "fps"]
    med_perf = np.median(selected)
    print(f'For the {name} model, the median fps  =  {med_perf:.3f} Hz')

The model which seems to present the best accuracy is the `Resnet_101` network; the cost of this high accuracy is the computation time the network needs to process an image, so the Resnet_101 also presents the highest computation time. Note that the Mobilenet network shows a good accuracy (>80%) while keeping a reasonable computation time, which comes with a high frame-per-second rate that allows real-time recognition.

# Experiment 2: Image processing and recognition for different resolutions :

In [None]:
# Path of the script that the next cell writes and that is executed afterwards.
scriptname = 'experiment_downsample.py'

In [None]:
%%writefile {scriptname}

#import model's script and set the output file
from DCNN_benchmark.models import *
import os

filename = f'results/{datetag}_results_2_{HOST}.json'

# Output's set up
try:
    # Re-use the results of a previous run when they can be loaded.
    df_downsample = pd.read_json(filename)
except (FileNotFoundError, ValueError):
    # No usable results file (depending on the pandas version a missing file
    # surfaces as FileNotFoundError or ValueError): run the benchmark.
    # The output folder is created up front so that saving cannot fail on a
    # missing directory after a long run.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    columns = ['model', 'perf', 'fps', 'time', 'label', 'i_label', 'i_image', 'image_size', 'filename', 'device']
    df_downsample = pd.DataFrame([], columns=columns)

    for model in models.values():
        model.eval()  # evaluation mode, set once rather than once per image

    rows = []  # one dict per (resolution, image, model) trial
    with torch.no_grad():  # inference only: no autograd bookkeeping
        # A dedicated loop variable avoids overwriting the default `image_size`.
        for resolution in image_sizes:
            resolution = int(resolution)
            # preprocessing at the explored resolution
            transform = transforms.Compose([
                transforms.Resize(resolution),      # Resize the image according to the resolution.
                transforms.CenterCrop(resolution),  # Crop resolution x resolution pixels around the center.
                transforms.ToTensor(),              # Convert the image to PyTorch Tensor data type.
                transforms.Normalize(               # Normalize each channel with the given mean and std.
                    mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225]),
            ])
            image_dataset_downsample = ImageFolder(path, transform=transform)  # Get the downsampled dataset
            print(f'Résolution de {resolution}')
            for i_image, (data, label) in enumerate(image_dataset_downsample):
                # index, within the whole set of labels, of the class of this image
                i_label_top = reverse_labels[image_dataset_downsample.classes[label]]
                for name, model in models.items():
                    tic = time.time()
                    out = model(data.unsqueeze(0).to(device)).squeeze(0)
                    # softmax restricted to the selected classes, in percent
                    percentage = torch.nn.functional.softmax(out[i_labels], dim=0) * 100
                    if device.type == 'cuda':
                        # CUDA kernels run asynchronously: wait for them so
                        # that the clock measures the actual computation
                        torch.cuda.synchronize()
                    dt = time.time() - tic
                    # confidence given to the true class of the image
                    perf_ = percentage[reverse_i_labels[i_label_top]].item()
                    rows.append({'model':name, 'perf':perf_, 'time':dt, 'fps': 1/dt,
                                 'label':labels[i_label_top], 'i_label':i_label_top,
                                 'i_image':i_image, 'filename':image_dataset_downsample.imgs[i_image][0],
                                 'image_size': resolution, 'device':str(device)})
                    print(f'The {name} model get {labels[i_label_top]} at {perf_:.2f} % confidence in {dt:.3f} seconds')
            # save after every resolution so that partial results are kept
            df_downsample = pd.DataFrame(rows, columns=columns)
            df_downsample.to_json(filename)

In [None]:
# Execute the script inside the notebook when do_local is set,
# otherwise launch it with the external interpreter command `python_exec`.
if do_local:
    %run {scriptname}
else:
    !{python_exec} {scriptname}

## Image recognition on different resolutions: display

Here, again, we collect our results, and display all the data in a table 

In [None]:
# Load the results of experiment 2 saved for this host and display them as a table.
filename = f'results/{datetag}_results_2_{HOST}.json'
df_downsample = pd.read_json(filename)
df_downsample

A display of the accuracy of each model on the same dataset for different resolutions:

Here accuracies are displayed as a violin plot to allow a better representation of the models.

In [None]:
import seaborn as sns

# Distribution of the classification performance for each image size, one
# violin per model. The plot is drawn once: it already covers every model
# through `hue`, so repeating it per model only stacked identical artists.
fig, axs = plt.subplots(figsize=(30, fig_width/phi))
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
sns.violinplot(x="image_size", y="perf", data=df_downsample, inner="quartile", hue='model', cut=0, ax=axs)
axs.set_title('Process on : ' + str(df_downsample['device'][0]), size=20)
axs.set_ylabel('Classification performance (%)', size=18)
axs.set_xlabel('Image size', size=18)

The 64 worst classification performances, all models combined:

In [None]:
# Grid of the images that obtained the lowest classification performance,
# all models and image sizes combined.
N_image_i = 8
N_image_j = 8
fig, axs = plt.subplots(N_image_i, N_image_j, figsize=(15, 15))
# argsort yields *positions* (ascending performance): rows are therefore
# fetched with the positional indexer .iloc rather than the label-based .loc.
worst_positions = df_downsample["perf"].to_numpy().argsort()[:(N_image_i*N_image_j)]
for i, position in enumerate(worst_positions):
    trial = df_downsample.iloc[position]
    ax = axs[i%N_image_i][i//N_image_i]  # the grid is filled column by column
    ax.imshow(imageio.imread(image_dataset.imgs[trial['i_image']][0]))
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel(trial['label'] + ' | ' + trial['model'] + ' | ' + str(trial['image_size']), color='g')
    perf_ = trial['perf']
    ax.set_ylabel(f'{perf_:2.1f}', color='g')

A display of the different computation times of each model on the same dataset for different resolutions:

In [None]:
import seaborn as sns

# Distribution of the computation time for each image size, one violin per
# model, on a logarithmic time axis. The plot is drawn once: it already
# covers every model through `hue`.
fig, axs = plt.subplots(figsize=(30, fig_width/phi))
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
sns.violinplot(x="image_size", y="time", data=df_downsample, inner="quartile", hue='model', ax=axs)
axs.set_title('Process on : ' + str(df_downsample['device'][0]), size=20)
axs.set_ylabel('Computation time  (s)', size=18)
axs.set_xlabel('Image size', size=18)  # the x axis holds the image sizes, not trials
axs.set_yscale('log')

Again, we extracted the median of each quantity for each model:

In [None]:
# Median classification performance of each model, all image sizes combined.
for name in models:
    selected = df_downsample.loc[df_downsample['model'] == name, "perf"]
    med_perf = np.median(selected)
    print(f'For the {name} model, the median clasification performance =  {med_perf:.3f} %')

The classification performance does not depend on the host (a priori :-) ) but the timing does:

In [None]:
# Median computation time of each model, all image sizes combined.
for name in models:
    selected = df_downsample.loc[df_downsample['model'] == name, "time"]
    med_perf = np.median(selected)
    print(f'For the {name} model, the median  computation time =  {med_perf:.2f} s')

In [None]:
# Median number of frames per second of each model, all image sizes combined.
for name in models:
    selected = df_downsample.loc[df_downsample['model'] == name, "fps"]
    med_perf = np.median(selected)
    print(f'For the {name} model, the median fps  =  {med_perf:.3f} Hz')

The performance seems to be reduced when the resolution is too low or too high, as the regular size of an input image is around 256 x 256 pixels. Also, the computation time seems proportional to the resolution: a higher resolution needs a longer delay to compute the image on a CPU.

# Experiment 3: Image processing and recognition on Grayscale :

In [None]:
# Path of the script that the next cell writes and that is executed afterwards.
scriptname = 'experiment_grayscale.py'

In [None]:
%%writefile {scriptname}

#import model's script and set the output file
from DCNN_benchmark.models import *
import os

filename = f'results/{datetag}_results_3_{HOST}.json'

# Output's set up
try:
    # Re-use the results of a previous run when they can be loaded.
    df_gray = pd.read_json(filename)
except (FileNotFoundError, ValueError):
    # No usable results file (depending on the pandas version a missing file
    # surfaces as FileNotFoundError or ValueError): run the benchmark.
    # The output folder is created up front so that the final save cannot
    # fail on a missing directory once all the images have been processed.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    columns = ['model', 'perf', 'fps', 'time', 'label', 'i_label', 'i_image', 'filename', 'device']

    # image preprocessing: same pipeline as the default one, preceded by a
    # conversion to grayscale
    transform = transforms.Compose([
        transforms.Grayscale(3),                    # convert the image to grayscale, kept on 3 channels
        transforms.Resize(int(image_size)),         # Resize the image.
        transforms.CenterCrop(int(image_size-20)),  # Crop the image, 20 pixels smaller than image_size.
        transforms.ToTensor(),                      # Convert the image to PyTorch Tensor data type.
        transforms.Normalize(                       # Normalize each channel with the given mean and std.
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]),
    ])
    image_dataset_grayscale = ImageFolder(path, transform=transform)  # Get the grayscale dataset

    for model in models.values():
        model.eval()  # evaluation mode, set once rather than once per image

    rows = []  # one dict per (image, model) trial
    with torch.no_grad():  # inference only: no autograd bookkeeping
        for i_image, (data, label) in enumerate(image_dataset_grayscale):
            # index, within the whole set of labels, of the class of this image
            i_label_top = reverse_labels[image_dataset_grayscale.classes[label]]
            for name, model in models.items():
                tic = time.time()
                out = model(data.unsqueeze(0).to(device)).squeeze(0)
                # softmax restricted to the selected classes, in percent
                percentage = torch.nn.functional.softmax(out[i_labels], dim=0) * 100
                if device.type == 'cuda':
                    # CUDA kernels run asynchronously: wait for them so that
                    # the clock measures the actual computation
                    torch.cuda.synchronize()
                dt = time.time() - tic
                # confidence given to the true class of the image
                perf_ = percentage[reverse_i_labels[i_label_top]].item()
                rows.append({'model':name, 'perf':perf_, 'time':dt, 'fps': 1/dt,
                             'label':labels[i_label_top], 'i_label':i_label_top,
                             'i_image':i_image, 'filename':image_dataset_grayscale.imgs[i_image][0],
                             'device':str(device)})
                print(f'The {name} model get {labels[i_label_top]} at {perf_:.2f} % confidence in {dt:.3f} seconds')
    df_gray = pd.DataFrame(rows, columns=columns)
    df_gray.to_json(filename)

In [None]:
# Execute the script inside the notebook when do_local is set,
# otherwise launch it with the external interpreter command `python_exec`.
if do_local:
    %run {scriptname}
else:
    !{python_exec} {scriptname}

## Image recognition on different labels with grayscale: display

Collecting all the results, displaying all the data in a table 

In [None]:
# Load the results of experiment 3 saved for this host and display them as a table.
filename = f'results/{datetag}_results_3_{HOST}.json'
df_gray = pd.read_json(filename)
df_gray

A display of the accuracy of each model on the same dataset for a single resolution:

In [None]:
# One translucent histogram of the classification performance per model
# (grayscale inputs), all drawn on the same axes.
fig, axs = plt.subplots(figsize=(30, fig_width/phi))
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
axs.set_ylabel('Frequency', size=18)
axs.set_xlabel('Classification performance (%)', size=18)
bin_edges = np.linspace(0, 100, 20)
for color, name in zip(colors, models):
    perf_of_model = df_gray.loc[df_gray['model'] == name, 'perf']
    perf_of_model.plot.hist(bins=bin_edges, lw=0, alpha=0.8, label=name, ax=axs)
axs.legend(loc=0, fontsize=20)
axs.set_title('Process on : ' + str(df_gray['device'][0]), size=20)

The 36 worst classification performances, all models combined:

In [None]:
# Grid of the images that obtained the lowest classification performance
# on grayscale inputs, all models combined.
N_image_i = 6
N_image_j = 6
fig, axs = plt.subplots(N_image_i, N_image_j, figsize=(15, 15))
# argsort yields *positions* (ascending performance): rows are therefore
# fetched with the positional indexer .iloc rather than the label-based .loc.
worst_positions = df_gray["perf"].to_numpy().argsort()[:(N_image_i*N_image_j)]
for i, position in enumerate(worst_positions):
    trial = df_gray.iloc[position]
    ax = axs[i%N_image_i][i//N_image_i]  # the grid is filled column by column
    ax.imshow(imageio.imread(image_dataset.imgs[trial['i_image']][0]))
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel(trial['label'] + ' | ' + trial['model'], color='g')
    perf_ = trial['perf']
    ax.set_ylabel(f'{perf_:2.1f}', color='g')

A display of the different computation times of each model on the same dataset for a single resolution:

In [None]:
# Computation time of every trial on grayscale inputs, one colored marker
# series per model.
fig, axs = plt.subplots(figsize=(30, fig_width/phi))
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)

axs.set_ylabel('Computation time  (s)', size=18)
axs.set_xlabel('Trial', size=18)
for color, name in zip(colors, models):
    time_of_model = df_gray.loc[df_gray['model'] == name, 'time']
    time_of_model.plot(label=name, color=color, marker='s', lw=0, ax=axs)
axs.legend(loc=0, fontsize=20)
axs.set_title('Process on : ' + str(df_gray['device'][0]), size=20)

We extracted the median of each quantity for each model:

In [None]:
# Median classification performance of each model on grayscale inputs.
for name in models:
    selected = df_gray.loc[df_gray['model'] == name, "perf"]
    med_perf = np.median(selected)
    print(f'For the {name} model, the median clasification performance =  {med_perf:.1f} %')

In [None]:
# Median computation time of each model on grayscale inputs.
for name in models:
    selected = df_gray.loc[df_gray['model'] == name, "time"]
    med_perf = np.median(selected)
    print(f'For the {name} model, the median computation time  =  {med_perf:.3f} s')

In [None]:
# Median number of frames per second of each model on grayscale inputs.
for name in models:
    selected = df_gray.loc[df_gray['model'] == name, "fps"]
    med_perf = np.median(selected)
    print(f'For the {name} model, the median fps  =  {med_perf:.3f} Hz')

The grayscale transformation on the input seems to degrade the recognition for all the models as they perform on the same dataset.

A display of the accuracy of each model on the same dataset, comparing grayscale and color inputs:

Here accuracies are displayed as a violin plot to allow a better representation of the models.

In [None]:
import seaborn as sns

# Overlay, for each model, the distribution of the classification performance
# obtained on grayscale inputs (df_gray, drawn in gray) and on color inputs
# (df, drawn in red) on the same axes.
fig, axs = plt.subplots(figsize=(30, fig_width/phi))
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
for color, df_, label in zip(['gray', 'red'], [df_gray, df], ['black', 'color']):
    axs = sns.violinplot(x="model", y="perf", data=df_, inner="quartile", cut=0, color=color, alpha=.5)
    # NOTE(review): the title is set again at every iteration, so only the
    # text of the last one (mode 'color') remains although both are drawn.
    axs.set_title('Process on : ' + str(df_['device'][0]) + ' mode :' + label, size=20)
    axs.set_ylabel('Classification performance (%)', size=18)
    axs.set_xlabel('Model', size=18)

# Final synthesis

We have run the benchmark on various platforms, with or without GPU. Let's summarize the main message.


In [None]:
# Machines on which the benchmark was run: host name -> hardware description.
HOSTS = {'fortytwo':'iMac pro 36 cores', 
         'ai-int-desktop': 'NVIDIA Jetson TX2', 
        # 'inv-ope-de06': 'Dell station with GTX Tegra',
        }
# A dedicated loop variable keeps the notebook-level HOST (name of the
# current machine, used to build the result file names) untouched.
for host, description in HOSTS.items():
    print('HOST:', host, ', device:', description)

We verify that the classification performance is similar on different machines (as these algorithms are deterministic):

In [None]:
# Display the experiment 1 results recorded on the 'fortytwo' host.
pd.read_json('results/2020-08-27_results_1_fortytwo.json')

In [None]:
# Median classification performance of each model, host by host.
# Dedicated names (host, df_host) are used so that the notebook-level HOST
# and df, which other cells rely on, are not overwritten.
for host, description in HOSTS.items():
    filename = f'results/{datetag}_results_1_{host}.json'
    print(filename)
    df_host = pd.read_json(filename)
    for name in models:
        med_perf = np.median(df_host.loc[df_host['model'] == name, "perf"])
        print(f'On host {description}, for the {name} model, the median clasification performance =  {med_perf:.1f} %' )

But the computation time varies greatly depending on the platform:

In [None]:
# Median computation time of each model, host by host.
# Dedicated names (host, df_host) are used so that the notebook-level HOST
# and df, which other cells rely on, are not overwritten.
for host, description in HOSTS.items():
    filename = f'results/{datetag}_results_1_{host}.json'
    df_host = pd.read_json(filename)
    for name in models:
        med_perf = np.median(df_host.loc[df_host['model'] == name, "time"])
        print(f'On host {description}, for the {name} model, the median computation time  =  {med_perf:.3f} s' )

From experiment 2, we check that the different hosts show the same trend of classification performance across image sizes:

In [None]:
import pandas as pd

# Summary table: one row per host, one column per explored image size,
# holding the median classification performance of the current model.
df_summary = pd.DataFrame([], columns=image_sizes) 

for name in models:
    print(f'For the {name} model' )
    # Dedicated names (host, df_host) keep the notebook-level HOST and df intact.
    for host, description in HOSTS.items():
        filename = f'results/{datetag}_results_2_{host}.json'
        df_host = pd.read_json(filename)
        of_model = df_host[df_host['model'] == name]
        # one median per explored resolution, in the order of the table
        # columns (the resolutions, not the column names of the results)
        df_summary.loc[description] = [
            np.median(of_model.loc[of_model['image_size'] == size, "perf"])
            for size in image_sizes
        ]
    print(df_summary)