## Generating task infos

To plot Figure 3 (the task overview), we first need to transform the task features.

In [7]:
import mml.interactive
from pathlib import Path
mml.interactive.init(Path('~/.config/mml.env').expanduser())
from mml.api import Keyword
from mml_tf.tasks import all_tasks, train_tasks, test_tasks, task_infos
from mml_tf.distances import transformed_task_infos
from mml_tf.paths import FIG_PATH
import pandas as pd

 _____ ______   _____ ______   ___
|\   _ \  _   \|\   _ \  _   \|\  \
\ \  \\\__\ \  \ \  \\\__\ \  \ \  \
 \ \  \\|__| \  \ \  \\|__| \  \ \  \
  \ \  \    \ \  \ \  \    \ \  \ \  \____
   \ \__\    \ \__\ \__\    \ \__\ \_______\
    \|__|     \|__|\|__|     \|__|\|_______|
         ____  _  _    __  _  _  ____  _  _
        (  _ \( \/ )  (  )( \/ )/ ___)( \/ )
         ) _ ( )  /    )( / \/ \\___ \ )  /
        (____/(__/    (__)\_)(_/(____/(__/
Interactive MML API initialized.


First, we generate descriptive statistics for the raw (untransformed) task attributes.

In [8]:
# statistics
def get_percentiles(attr: str) -> str:
    """Summarise one attribute of ``task_infos`` as a single line of statistics.

    Args:
        attr: name of an attribute on ``task_infos``. The attribute must offer
            ``.values()`` (i.e. behave like a mapping); only its values are used.

    Returns:
        A string of the form ``attr='<name>': min=..., q1=..., q2=..., q3=...,
        max=..., mean=..., std=...`` where q1/q2/q3 are the 25%/50%/75% entries
        of ``Series.describe()`` and std comes from ``Series.std()``.

    Raises:
        AssertionError: if ``task_infos`` has no attribute called ``attr``.
    """
    assert hasattr(task_infos, attr)
    per_task = getattr(task_infos, attr)
    vals = pd.Series(list(per_task.values()))
    summary = vals.describe()
    # Pull the entries in the order they are reported.
    low, q1, q2, q3, high, mean = (
        summary.loc[key] for key in ("min", "25%", "50%", "75%", "max", "mean")
    )
    return (
        f'{attr=}: min={low}, q1={q1}, q2={q2}, q3={q3}, '
        f'max={high}, mean={mean}, std={vals.std()}'
    )

In [9]:
# Report the summary statistics of each raw task attribute, one line per attribute.
for attr in ('num_classes', 'num_samples', 'dimensions', 'imbalance_ratios'):
    summary_line = get_percentiles(attr)
    print(summary_line)

attr='num_classes': min=2.0, q1=2.0, q2=2.0, q3=5.0, max=257.0, mean=11.985915492957746, std=36.55856083356302
attr='num_samples': min=170.0, q1=1572.5, q2=5718.0, q3=40673.0, max=122138.0, mean=28163.521126760563, std=36595.27630987332
attr='dimensions': min=4.0, q1=9.0, q2=13.0, q3=14.0, max=25.0, mean=12.04225352112676, std=4.254196649757608
attr='imbalance_ratios': min=1.0, q1=1.3456414066188114, q2=2.878428465530022, q3=14.277188328912466, max=171.33333333333334, mean=14.83879432031359, std=29.773240229472247


These numbers have been reported in the legend and caption of Figure 3.

In [10]:
task_overview_details = []
structs = mml.interactive.get_task_structs(all_tasks)

# Keywords that define the imaging domains; each keyword forms its own
# single-tag group, keyed by the keyword's `.value`.
DOMAIN_KEYWORDS = (
    Keyword.DERMATOSCOPY,
    Keyword.LARYNGOSCOPY,
    Keyword.GASTROSCOPY_COLONOSCOPY,
    Keyword.LAPAROSCOPY,
    Keyword.NATURAL_OBJECTS,
    Keyword.HANDWRITINGS,
    Keyword.CATARACT_SURGERY,
    Keyword.FUNDUS_PHOTOGRAPHY,
    Keyword.MRI_SCAN,
    Keyword.X_RAY,
    Keyword.CT_SCAN,
    Keyword.CLE,
    Keyword.ULTRASOUND,
    Keyword.CAPSULE_ENDOSCOPY,
)
tag_groups = {keyword.value: [keyword] for keyword in DOMAIN_KEYWORDS}

# Convert tag groups to task clusters: a task joins a cluster as soon as one of
# the cluster's tags appears in the task's keywords.
task_clusters = {
    group_name: [
        struct.name
        for struct in structs
        if any(tag in struct.keywords for tag in group_tags)
    ]
    for group_name, group_tags in tag_groups.items()
}

# Clusters are numbered by their alphabetical position.
clusters = sorted(task_clusters)
cluster_index = {cluster_name: ix for ix, cluster_name in enumerate(clusters)}

# Human-readable legend labels for clusters whose raw name needs rewording.
REPLACEMENTS = {
    'cataract_surgery': 'ophthalmic microscopy',
    'ct_scan': 'CT',
    'fundus_photography': 'fundus photography',
    'gastroscopy_colonoscopy': 'gastro & colonoscopy',
    'mri_scan': 'MRI',
    'natural_objects': 'natural images',
    'x_ray': 'X-ray',
}
legend_map = {
    ix: REPLACEMENTS.get(cluster_name, cluster_name)
    for ix, cluster_name in enumerate(clusters)
}

# Task name -> cluster name. If a task appears in several clusters, the cluster
# visited last wins.
task_mapping = {
    name: cluster
    for cluster, members in task_clusters.items()
    for name in members
}
# Task name -> cluster number. A task without any of the keywords above raises KeyError.
# NOTE(review): a commented-out call to plot_2D.create_color_mapping(task_list=structs,
# criteria='domain', task_clusters=None) used to stand here; presumably the code above
# replicates it - confirm.
color_map = {task.name: cluster_index[task_mapping[task.name]] for task in structs}

# Task name -> legend label of its domain.
domains = {t: legend_map[color_map[t]] for t in all_tasks}
# Ordered (old, new) substitutions turning a raw task name into its printable form.
# They are applied one after another, so the order matters: e.g. 'sklin2 skin lesions'
# must be handled before the generic 'skin lesions', and 'emnist' before 'mnist'.
_PRINTABLE_SUBSTITUTIONS = (
    ('_', ' '),
    ('lapgyn4', 'LapGyn4'),
    ('sklin2 skin lesions', 'SKLIN2'),
    ('object classification', ''),
    ('svhn', 'SVHN'),
    ('cholec80', 'Cholec80'),
    ('hyperkvasir', 'HyperKvasir'),
    ('mura xr', 'MURA'),
    ('identify nbi infframes', 'NBI-InfFrames'),
    ('laryngeal tissues', 'Laryngeal cancerous tissue'),
    ('stanford dogs image categorization', 'Stanford dogs'),
    ('digit classification', ''),
    ('emnist', 'EMNIST'),
    ('mnist', 'MNIST'),
    ('caltech', 'Caltech'),
    ('cifar', 'CIFAR'),
    ('skin lesions', ''),
    ('idle action recognition', 'CatRelComp'),
    ('chexpert', 'CheXpert'),
    ('barretts esophagus diagnosis', "AIDA-E Barrett's esophagus"),
    ('brain tumor classification', 'kaggle Brain Tumor dataset'),
    ('nerthus bowel cleansing quality', 'Nerthus'),
    ('mednode melanoma classification', 'MED-NODE'),
    ('crawled covid ct classification', 'COVID-CT-Dataset'),
    ('ph2-melanocytic-lesions-classification', 'PH2'),
    ('covid xray classification', 'kaggle COVID X-Ray dataset'),
    ('isic20 melanoma classification', 'ISIC20'),
    ('deep drid', 'DeepDRiD'),
    ('kvasir capsule', 'Kvasir-Capsule'),
    ('bean plant disease classification', 'ibean'),
    ('aptos19 blindness detection', 'APTOS 2019 Blindness Detection'),
    ('eye condition classification', 'kaggle cataract dataset'),
    ('breast cancer classification v2', 'Dataset of breast ultrasound images'),
    ('shenzen chest xray tuberculosis', 'Shenzhen Hospital CXR Set'),
    ('pneumonia classification', 'Zhang Chest X-Ray Images'),
)

# Ordered (marker, label) pairs: the first marker contained in a task name decides
# the task's group.
# NOTE(review): label casing is inconsistent ('Cholec80' vs 'deepdrid'/'chexpert');
# kept unchanged because the labels are written to the CSV.
_GROUP_MARKERS = (
    ('cholec80', 'Cholec80'),
    ('deep_drid', 'deepdrid'),
    ('chexpert', 'chexpert'),
)


def _to_printable(task_name):
    """Apply every substitution in order; the result is NOT stripped of whitespace."""
    printable = task_name
    for old, new in _PRINTABLE_SUBSTITUTIONS:
        printable = printable.replace(old, new)
    return printable


def _task_group(task_name):
    """Return the label of the first group marker found in ``task_name``, else None."""
    for marker, label in _GROUP_MARKERS:
        if marker in task_name:
            return label
    return None


# One record per task (train tasks first, then test tasks), using the transformed
# task properties.
for t in train_tasks + test_tasks:
    _printable = _to_printable(t)
    group = _task_group(t)
    _info = {
        'name': t,
        'train': t in train_tasks,
        'printable': _printable.strip(),
        'samples': transformed_task_infos.num_samples[t],
        'classes': transformed_task_infos.num_classes[t],
        'dimension': transformed_task_infos.dimensions[t],
        'imbalance': transformed_task_infos.imbalance_ratios[t],
        'domain': domains[t],
        'group': group,
    }
    task_overview_details.append(_info)

# Export the records for plotting.
pd.DataFrame(task_overview_details).to_csv(FIG_PATH / 'advanced_task_infos.csv')

The transformed properties exported above (`advanced_task_infos.csv`) were used to generate the bar charts in Figure 3.

In [15]:
from mml_tf.tasks import paper_id_map
# Maps every raw task name (train tasks first, then test tasks) to its printable name.
_printable_map = {}

# Ordered (old, new) substitutions - the same sequence that produces the 'printable'
# column of advanced_task_infos.csv. They are applied one after another, so the order
# matters: e.g. 'sklin2 skin lesions' must be handled before the generic
# 'skin lesions', and 'emnist' before 'mnist'.
_NAME_RULES = (
    ('_', ' '),
    ('lapgyn4', 'LapGyn4'),
    ('sklin2 skin lesions', 'SKLIN2'),
    ('object classification', ''),
    ('svhn', 'SVHN'),
    ('cholec80', 'Cholec80'),
    ('hyperkvasir', 'HyperKvasir'),
    ('mura xr', 'MURA'),
    ('identify nbi infframes', 'NBI-InfFrames'),
    ('laryngeal tissues', 'Laryngeal cancerous tissue'),
    ('stanford dogs image categorization', 'Stanford dogs'),
    ('digit classification', ''),
    ('emnist', 'EMNIST'),
    ('mnist', 'MNIST'),
    ('caltech', 'Caltech'),
    ('cifar', 'CIFAR'),
    ('skin lesions', ''),
    ('idle action recognition', 'CatRelComp'),
    ('chexpert', 'CheXpert'),
    ('barretts esophagus diagnosis', "AIDA-E Barrett's esophagus"),
    ('brain tumor classification', 'kaggle Brain Tumor dataset'),
    ('nerthus bowel cleansing quality', 'Nerthus'),
    ('mednode melanoma classification', 'MED-NODE'),
    ('crawled covid ct classification', 'COVID-CT-Dataset'),
    ('ph2-melanocytic-lesions-classification', 'PH2'),
    ('covid xray classification', 'kaggle COVID X-Ray dataset'),
    ('isic20 melanoma classification', 'ISIC20'),
    ('deep drid', 'DeepDRiD'),
    ('kvasir capsule', 'Kvasir-Capsule'),
    ('bean plant disease classification', 'ibean'),
    ('aptos19 blindness detection', 'APTOS 2019 Blindness Detection'),
    ('eye condition classification', 'kaggle cataract dataset'),
    ('breast cancer classification v2', 'Dataset of breast ultrasound images'),
    ('shenzen chest xray tuberculosis', 'Shenzhen Hospital CXR Set'),
    ('pneumonia classification', 'Zhang Chest X-Ray Images'),
)

for t in train_tasks + test_tasks:
    _name = t
    for _old, _new in _NAME_RULES:
        _name = _name.replace(_old, _new)
    # Dropping suffixes such as 'object classification' or 'digit classification'
    # leaves a dangling space ('Caltech101 ', 'MNIST '); strip it so the names match
    # the stripped 'printable' values written to advanced_task_infos.csv.
    _printable_map[t] = _name.strip()


{'lapgyn4_anatomical_structures': 'LapGyn4 anatomical structures',
 'lapgyn4_surgical_actions': 'LapGyn4 surgical actions',
 'lapgyn4_instrument_count': 'LapGyn4 instrument count',
 'lapgyn4_anatomical_actions': 'LapGyn4 anatomical actions',
 'sklin2_skin_lesions': 'SKLIN2',
 'identify_nbi_infframes': 'NBI-InfFrames',
 'laryngeal_tissues': 'Laryngeal cancerous tissue',
 'nerthus_bowel_cleansing_quality': 'Nerthus',
 'stanford_dogs_image_categorization': 'Stanford dogs',
 'svhn': 'SVHN',
 'caltech101_object_classification': 'Caltech101 ',
 'caltech256_object_classification': 'Caltech256 ',
 'cifar10_object_classification': 'CIFAR10 ',
 'cifar100_object_classification': 'CIFAR100 ',
 'mnist_digit_classification': 'MNIST ',
 'emnist_digit_classification': 'EMNIST ',
 'hyperkvasir_anatomical-landmarks': 'HyperKvasir anatomical-landmarks',
 'hyperkvasir_pathological-findings': 'HyperKvasir pathological-findings',
 'hyperkvasir_quality-of-mucosal-views': 'HyperKvasir quality-of-mucosal-views

In [19]:
# One table row per task, built from the raw (untransformed) task properties.
# 'shared images' and 'reference' are left empty here.
rows = [
    {
        'name': _printable_map[t],
        'id': paper_id_map[t],
        '# samples': task_infos.num_samples[t],
        '# classes': task_infos.num_classes[t],
        'IR': round(task_infos.imbalance_ratios[t], 2),
        'domain': domains[t],
        'shared images': '',
        'reference': '',
    }
    for t in all_tasks
]
pd.DataFrame(rows)

Unnamed: 0,name,id,# samples,# classes,IR,domain,shared images,reference
0,LapGyn4 anatomical structures,T06,2728,5,8.42,laparoscopy,,
1,LapGyn4 surgical actions,T07,30682,8,10.90,laparoscopy,,
2,LapGyn4 instrument count,T08,21424,4,1.12,laparoscopy,,
3,LapGyn4 anatomical actions,T09,4782,4,2.95,laparoscopy,,
4,SKLIN2,T23,280,8,19.40,dermatoscopy,,
...,...,...,...,...,...,...,...,...
66,MURA forearm,T49,1825,2,1.76,X-ray,,
67,MURA finger,T50,5106,2,1.59,X-ray,,
68,MURA elbow,T51,4931,2,1.46,X-ray,,
69,ibean,T52,1167,3,1.02,natural images,,
