# Test evaluations

In [1]:
from os.path import realpath
from pathlib import Path
import pandas as pd
import itertools
import json
import shutil

def load_json(path):
    with open(path) as f:
        cfg = json.load(f)
    return cfg

RUNS_DIR = Path(realpath('.')) / 'runs'
if not 'domain-adaptation/runs' in str(RUNS_DIR):
    RUNS_DIR = Path(realpath('.')).parent / 'runs'
assert('domain-adaptation/runs' in str(RUNS_DIR))

In [2]:
def load_cls_rep_paths_new(
    method,
    experiment_id,
    prefix=None,
    config_key=None,
    config_value=None,
):
    d = RUNS_DIR / method / experiment_id
    paths = []
    for item in d.glob('*'):
        if (item.is_dir()):
            if config_key:
                if not (config_value == load_json(item / 'config.json')[config_key]):
                    continue
            if prefix:
                if not item.name.startswith(prefix):
                    continue
            paths.append( item / 'report.json' ) 
    return sorted(paths)

def get_score_new(
    report_paths,
    metric:str='precision',
    avg_types=['macro avg', 'weighted avg'],
    map_col_name= lambda n: n
):
    reports = [ pd.read_json(p) for p in report_paths ]
    score = pd.DataFrame(
        [[r[avgt][metric] for avgt in avg_types]
         for r in reports
        ],
        columns = [ map_col_name(avgt) for avgt in avg_types]
    ) 
    return score

def get_score_combinations_new(
    method,
    experiment_id,
    config_key=None,
    config_value=None,
    domains=['A','W','D'], 
    metric:str='precision',
    avg_types=['macro avg', 'weighted avg'],
):
    combos = [c for c in itertools.product(domains, repeat=2) if c[0] != c[1]]
    scores = [
        get_score_new(
            report_paths=load_cls_rep_paths_new(method, experiment_id, '{}{}'.format(c[0],c[1]), config_key, config_value),
            metric=metric,
            avg_types=avg_types,
            map_col_name=lambda n: '{}->{}'.format(c[0],c[1]),
        )
        for c in combos
    ]
    return pd.concat(scores, sort=False)


def load_cls_rep_paths(
    suffix:str, 
    runs_dir:Path=RUNS_DIR, 
    from_date:str='19700101000000', 
    to_date:str='30001010000000'
):
    assert(len(from_date)==14 and len(to_date)==14)
    return sorted([
        item / 'report.json' 
        for item in runs_dir.glob('*') 
        if item.is_dir() 
        and item.name.endswith(suffix)
        and int(item.name.split('_')[0]) >= int(from_date)
        and int(item.name.split('_')[0]) <= int(to_date)
    ])

def get_score(
    suffix:str, 
    runs_dir:Path=RUNS_DIR, 
    metric:str='precision',
    avg_types=['macro avg', 'weighted avg'],
    from_date:str='19700101000000', 
    to_date:str='30001010000000',
    map_col_name= lambda n: n
):
    report_paths = load_cls_rep_paths(suffix, RUNS_DIR, from_date, to_date)
    reports = [ pd.read_json(p) for p in report_paths ]
    score = pd.DataFrame(
        [[r[avgt][metric] for avgt in avg_types]
         for r in reports
        ],
        columns = [ map_col_name(avgt) for avgt in avg_types]
    ) 
    return score


def get_score_combinations(
    suffix:str, 
    domains=['A','W','D'], 
    runs_dir:Path=RUNS_DIR, 
    metric:str='precision',
    avg_types=['macro avg', 'weighted avg'],
    from_date:str='19700101000000', 
    to_date:str='30001010000000',
):
    combos = [c for c in itertools.product(domains, repeat=2) if c[0] != c[1]]
    scores = [
        get_score(
            suffix='{}_{}_{}'.format(c[0],c[1],suffix),
            runs_dir=runs_dir,
            metric=metric,
            avg_types=avg_types,
            from_date=from_date,
            to_date=to_date,
            map_col_name=lambda n: '{}->{}'.format(c[0],c[1]),
        )
        for c in combos
    ]
    return pd.concat(scores, sort=False)
    

## Tune source only
In this experiment, we tune a VGG16-network pretrained on ImageNet with all available source data.
The target data is used for validation (during training) and test.

In [3]:
# without augmentation
# tune_source_no_aug_scores = get_score_combinations(
#     suffix='tune_source', 
#     domains=['A','W','D'], 
#     avg_types=['macro avg'],
#     from_date='20191014123846',
#     to_date='20191014162536'
# )*100
# tune_source_no_aug_scores.describe().T

tune_source_no_aug_scores = get_score_combinations_new(
    method='tune_source',
    experiment_id='tune_source_no_aug',
    avg_types=['macro avg'],
)*100
tune_source_no_aug_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,58.647708,1.675882,56.614427,57.265038,59.272086,59.371915,60.715072
A->D,5.0,66.119209,1.978536,63.9226,64.272422,66.324489,67.951688,68.124848
W->A,5.0,45.655557,1.135846,44.720362,45.216344,45.321017,45.387328,47.632732
W->D,5.0,98.714765,0.487872,98.14102,98.279352,98.85421,99.01609,99.283154
D->A,5.0,45.956431,2.777532,42.368585,44.673151,45.209599,48.745268,48.785551
D->W,5.0,91.561151,1.374618,89.826742,90.537983,91.810545,92.430816,93.199668


In [4]:
# with augmentation
# tune_source_scores = get_score_combinations(
#     suffix='tune_source', 
#     domains=['A','W','D'], 
#     avg_types=['macro avg'],
#     from_date='20191022103424',
#     to_date='20191022142437'
# )*100
# tune_source_scores.describe().T

tune_source_scores = get_score_combinations_new(
    method='tune_source',
    experiment_id='tune_source_with_aug',
    avg_types=['macro avg'],
)*100
tune_source_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,59.522947,1.309548,57.809506,58.72371,59.741903,60.119134,61.220482
A->D,5.0,67.163203,1.622441,64.650273,66.852105,67.490647,67.742923,69.080065
W->A,5.0,45.297358,2.32515,42.970616,43.627089,44.308746,47.700264,47.880078
W->D,5.0,98.657229,0.771405,97.900666,98.197343,98.347653,99.010264,99.830221
D->A,5.0,46.943538,1.683987,44.351185,46.729477,46.780614,48.161656,48.694758
D->W,5.0,94.329697,1.313532,92.649473,93.192383,94.96945,95.372884,95.464293


## Tune source and target

In [5]:
# tune_both_scores = get_score_combinations(
#     suffix='tune_target', 
#     domains=['A','W','D'], 
#     avg_types=['macro avg'],
#     from_date='20191023072318',
#     to_date='20191023123426'
# )*100
# tune_both_scores.describe().T

tune_both_scores = get_score_combinations_new(
    method='tune_target',
    experiment_id='tune_target_with_aug',
    avg_types=['macro avg'],
)*100
tune_both_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,61.447768,1.87253,58.935474,61.053916,61.301769,61.773919,64.173762
A->D,5.0,71.565021,2.735697,67.044792,71.531295,71.779077,73.594656,73.875283
W->A,5.0,52.377981,1.508269,50.675616,51.028661,52.672868,53.25945,54.253307
W->D,5.0,99.169865,0.620451,98.407462,98.785086,99.139785,99.516992,100.0
D->A,5.0,52.608226,2.532902,48.726655,52.347771,52.48521,53.898876,55.58262
D->W,5.0,95.052622,2.355281,90.955851,95.494887,95.627899,96.329072,96.855399


## CCSA

__Run 1__: The were some severe stability issues when introducting the CSA loss (alpha 0.25 as done by original author).
Employed parameters:
- alpha=0.01 
- freeze_base=true

In [6]:
# tune_ccsa_scores = get_score_combinations(
#     suffix='ccsa', 
#     domains=['A','W','D'], 
#     avg_types=['macro avg'],
#     from_date='20191025141713',
#     to_date='20191025170306'
# )*100
# tune_ccsa_scores.describe().T

tune_ccsa_scores = get_score_combinations_new(
    method='ccsa',
    experiment_id='ccsa_without_batchnorn',
    avg_types=['macro avg'],
)*100
tune_ccsa_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,78.798048,2.45935,75.701057,76.806076,79.394747,80.737548,81.350812
A->D,5.0,79.63265,3.983204,75.801623,76.008044,79.133937,82.237858,84.98179
W->A,5.0,61.247778,2.383082,59.214011,59.410071,60.22722,62.646077,64.74151
W->D,5.0,93.912782,2.337585,90.781195,93.178093,93.841792,94.511284,97.251548
D->A,5.0,61.581135,1.017806,59.997092,61.487433,61.752556,61.853747,62.814847
D->W,5.0,93.393685,1.891096,91.173019,91.928614,93.604523,94.374407,95.887861


We observe a large performance increment for the domain adaptations where domains are far apart. However, for similar domains (W,D)

__Run 2__: Using batch-norm greatly increased stability of the method. The choice of alpha is still unclear, though. We'll try out a couple of cofigurations for a single adaptation

In [7]:
ccsa_scores_alpha0 = get_score_combinations(
    suffix='ccsa', 
    domains=['A','D'], 
    avg_types=['macro avg'],
    from_date='20191101122726',
    to_date='20191101123533'
)*100
ccsa_scores_alpha0.describe().T

Unnamed: 0,count,unique,top,freq
A->D,0,0,,
D->A,0,0,,


In [8]:
ccsa_scores_alpha025 = get_score_combinations(
    suffix='ccsa', 
    domains=['A','D'], 
    avg_types=['macro avg'],
    from_date='20191101123927',
    to_date='20191101125222'
)*100
ccsa_scores_alpha025.describe().T

Unnamed: 0,count,unique,top,freq
A->D,0,0,,
D->A,0,0,,


In [9]:
ccsa_scores_alpha05 = get_score_combinations(
    suffix='ccsa', 
    domains=['A','D'], 
    avg_types=['macro avg'],
    from_date='20191101130046',
    to_date='20191101132008'
)*100
ccsa_scores_alpha05.describe().T

Unnamed: 0,count,unique,top,freq
A->D,0,0,,
D->A,0,0,,


__Run 3__ multi-task learning (ccsa code with alpha=0)

In [10]:
# multitask_scores = get_score_combinations(
#     suffix='ccsa', 
#     domains=['A','W','D'], 
#     avg_types=['macro avg'],
#     from_date='20191101173414',
#     to_date='20191101191220'
# )*100
# multitask_scores.describe().T

multitask_scores = get_score_combinations_new(
    method='ccsa',
    experiment_id='ccsa_with_batchnorn_alpha_0',
    avg_types=['macro avg'],
)*100
multitask_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,82.786633,2.176073,79.688982,81.647784,83.031861,84.673918,84.890622
A->D,5.0,80.116118,3.575463,75.444807,78.19922,80.351172,81.64899,84.936403
W->A,5.0,63.620193,1.667376,61.608446,62.583484,63.262434,65.136835,65.509767
W->D,5.0,96.022428,1.805967,93.657375,95.365056,95.873795,96.615347,98.600569
D->A,5.0,63.539013,2.036433,60.171669,63.210951,64.276065,64.684695,65.351686
D->W,5.0,93.962518,0.803252,93.022606,93.299236,94.201545,94.277264,95.011937


__Run 4__ ccsa with alpha = 0.25

In [11]:
# ccsa_scores = get_score_combinations(
#     suffix='ccsa', 
#     domains=['A','W','D'], 
#     avg_types=['macro avg'],
#     from_date='20191101150707',
#     to_date='20191101171905'
# )*100
# ccsa_scores.describe().T

ccsa_scores = get_score_combinations_new(
    method='ccsa',
    experiment_id='ccsa_with_batchnorn_alpha_0.25',
    avg_types=['macro avg'],
)*100
ccsa_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,84.123137,1.925269,81.226364,83.079057,85.104744,85.563238,85.642282
A->D,5.0,82.13117,3.735848,76.218028,80.769827,83.651851,84.741472,85.27467
W->A,5.0,61.107807,2.253307,58.095626,60.495632,60.917527,61.682899,64.347349
W->D,5.0,92.810065,2.506197,88.809418,92.114287,93.452229,94.792916,94.881476
D->A,5.0,62.383859,2.733972,58.223974,61.187359,63.625331,63.681493,65.201136
D->W,5.0,93.71217,1.357313,91.648125,93.074904,94.255764,94.591728,94.990329


In [12]:
# ccsa_uneven_scores = get_score_combinations(
#     suffix='ccsa', 
#     domains=['A','W','D'], 
#     avg_types=['macro avg'],
#     from_date='20191104152050',
#     to_date='20191104185818'
# )*100
# ccsa_uneven_scores.describe().T

ccsa_uneven_scores = get_score_combinations_new(
    method='ccsa',
    experiment_id='ccsa_uneven',
    avg_types=['macro avg'],
)*100
ccsa_uneven_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,84.707488,1.425911,82.84051,83.521377,85.504465,85.662013,86.009077
A->D,5.0,82.596073,2.859945,78.051521,82.07346,82.891426,84.582776,85.381183
W->A,5.0,62.244928,3.06593,58.439242,59.682659,63.575644,63.740426,65.786667
W->D,5.0,93.793427,2.248753,90.650457,92.540323,94.144556,95.328581,96.303216
D->A,5.0,61.599173,3.129505,59.65982,59.679137,60.013532,61.639419,67.003957
D->W,5.0,93.238799,2.460498,90.460994,91.45336,92.842509,95.064086,96.373045


In [13]:
# ccsa_resnet_uneven_scores = get_score_combinations(
#     suffix='ccsa', 
#     domains=['A','W','D'], 
#     avg_types=['macro avg'],
#     from_date='20191106083058',
#     to_date='20191106144631'
# )*100
# ccsa_resnet_uneven_scores.describe().T

ccsa_resnet_uneven_scores = get_score_combinations_new(
    method='ccsa',
    experiment_id='ccsa_resnet_uneven',
    avg_types=['macro avg'],
)*100
ccsa_resnet_uneven_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,88.515942,1.163446,87.404071,87.423183,88.734515,88.814687,90.203254
A->D,6.0,87.694157,2.361554,84.903802,85.647832,87.861207,89.694434,90.323178
W->A,5.0,68.780836,1.997994,66.350606,67.673645,68.219386,70.584884,71.075659
W->D,5.0,94.85447,1.429756,92.442928,94.860859,95.217836,95.606338,96.144388
D->A,6.0,67.163408,3.654978,61.017038,65.736368,67.754554,69.889443,70.789592
D->W,5.0,93.637002,0.672926,93.153521,93.254722,93.310533,93.678979,94.787255


### CCSA from features
Testing if batch size has an impact on CCSA (like seems to have on DAGE)

In [14]:
ccsa_from_feat_batch_size_16_uneven_scores = get_score_combinations_new(
    method='ccsa',
    experiment_id='ccsa_batch_size',
    avg_types=['macro avg'],
    config_key='batch_size',
    config_value=16,
)*100
ccsa_from_feat_batch_size_16_uneven_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,79.972359,2.341966,76.999669,78.589037,79.847692,81.459382,82.966014
A->D,5.0,80.008812,1.290591,78.302883,79.536273,79.657664,80.952506,81.594734
W->A,5.0,61.973213,1.282982,60.719107,61.116084,61.426316,62.830696,63.773863
W->D,5.0,90.422576,3.114701,87.302968,88.274296,89.420359,92.179796,94.935462
D->A,5.0,61.917372,1.333989,60.253373,60.93437,61.996151,63.00295,63.400017
D->W,5.0,90.641608,1.5056,88.369373,90.41682,90.878293,90.983377,92.560178


In [15]:
ccsa_from_feat_batch_size_32_uneven_scores = get_score_combinations_new(
    method='ccsa',
    experiment_id='ccsa_batch_size',
    avg_types=['macro avg'],
    config_key='batch_size',
    config_value=32,
)*100
ccsa_from_feat_batch_size_32_uneven_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,80.010601,1.548712,77.743864,79.151813,80.523946,81.169787,81.463593
A->D,5.0,79.593732,1.410833,78.12149,78.283947,79.541027,80.840603,81.181594
W->A,5.0,62.502327,1.229695,61.349584,61.998959,62.261665,62.308431,64.592995
W->D,5.0,90.449098,2.088465,87.663949,89.727331,90.61006,90.790697,93.453453
D->A,5.0,63.106735,1.034425,61.489518,62.947914,63.249508,63.54563,64.301103
D->W,5.0,90.808125,1.076348,90.004096,90.148502,90.244479,91.053701,92.589846


In [16]:
ccsa_from_feat_batch_size_64_uneven_scores = get_score_combinations_new(
    method='ccsa',
    experiment_id='ccsa_batch_size',
    avg_types=['macro avg'],
    config_key='batch_size',
    config_value=64,
)*100
ccsa_from_feat_batch_size_64_uneven_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,79.708751,2.307348,76.920129,78.636711,79.13654,80.872306,82.978071
A->D,5.0,79.141243,3.449395,74.629593,76.648884,80.174678,81.093436,83.159627
W->A,5.0,61.35665,1.105851,59.478968,61.317171,61.692201,62.124731,62.170181
W->D,5.0,88.773578,2.190124,85.960699,88.369675,88.541375,88.899718,92.096425
D->A,5.0,62.502549,1.694606,60.939045,61.115881,61.921053,63.746415,64.790349
D->W,5.0,90.691934,2.119183,88.559343,89.34639,89.796408,92.091719,93.665811


In [17]:
ccsa_from_feat_batch_size_128_uneven_scores = get_score_combinations_new(
    method='ccsa',
    experiment_id='ccsa_batch_size',
    avg_types=['macro avg'],
    config_key='batch_size',
    config_value=128,
)*100
ccsa_from_feat_batch_size_128_uneven_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,79.57441,1.783976,77.136137,79.175153,79.245729,80.289979,82.02505
A->D,5.0,78.38128,2.577381,75.497962,76.882082,77.990009,79.274382,82.261966
W->A,5.0,61.721014,1.298845,60.309468,60.422891,62.025856,62.716212,63.130644
W->D,5.0,88.606514,1.859057,86.563848,87.970823,88.093132,88.80764,91.597128
D->A,5.0,63.451152,1.402333,61.615792,62.484554,63.634805,64.534683,64.985924
D->W,5.0,88.268766,3.023749,85.685352,86.706178,86.904382,88.742204,93.305714


In [18]:
ccsa_from_feat_batch_size_256_uneven_scores = get_score_combinations_new(
    method='ccsa',
    experiment_id='ccsa_batch_size',
    avg_types=['macro avg'],
    config_key='batch_size',
    config_value=256,
)*100
ccsa_from_feat_batch_size_256_uneven_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,10.0,79.379387,1.985631,76.080885,78.374017,79.812031,80.610989,82.504517
A->D,10.0,78.786036,2.304174,73.797929,77.65616,79.174231,80.662377,81.214352
W->A,10.0,58.468661,3.252602,52.458933,57.19158,59.751938,60.919195,61.658348
W->D,10.0,86.847297,4.263313,82.205995,84.006827,84.832667,90.3569,93.588572
D->A,10.0,60.12177,2.537747,56.148226,57.970192,60.659292,61.763476,63.492761
D->W,10.0,86.121395,3.016405,81.178231,85.229458,86.145157,88.28757,90.781888


In [19]:
ccsa_from_feat_batch_size_4096_uneven_scores = get_score_combinations_new(
    method='ccsa',
    experiment_id='ccsa_batch_size',
    avg_types=['macro avg'],
    config_key='batch_size',
    config_value=4096,
)*100
ccsa_from_feat_batch_size_4096_uneven_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,74.34052,2.770725,70.005945,73.982577,74.43098,75.869098,77.413999
A->D,5.0,70.568961,3.09594,65.539694,70.587786,70.989835,71.782941,73.944551
W->A,5.0,55.953759,2.53325,52.015856,55.242665,56.372153,57.552997,58.585125
W->D,5.0,80.926854,3.118521,77.377828,77.760694,82.335696,83.2506,83.909453
D->A,5.0,55.974445,5.168682,48.077417,53.698516,57.682702,59.670861,60.74273
D->W,5.0,82.364048,3.737953,77.811662,79.445775,82.660491,85.308472,86.593839


## d-SNE

In [20]:
dsne_scores = get_score_combinations(
    suffix='dsne', 
    domains=['A','W','D'], 
    avg_types=['macro avg'],
    from_date='20191104094606',
    to_date='20191104124943'
)*100
dsne_scores.describe().T

dsne_scores = get_score_combinations_new(
    method='dsne',
    experiment_id='dsne_even',
    avg_types=['macro avg'],
)*100
dsne_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,83.59965,2.934409,78.691244,83.156612,84.848799,85.351201,85.950393
A->D,5.0,81.461538,1.801268,79.23711,79.839005,82.196868,82.928646,83.106061
W->A,5.0,62.476751,2.373655,58.803858,62.203546,62.429067,63.768399,65.178888
W->D,5.0,93.674039,2.514686,89.425243,93.501042,94.642239,95.006289,95.795379
D->A,5.0,63.597292,1.222746,61.85374,63.142117,63.479165,64.622647,64.88879
D->W,5.0,93.726503,1.360903,92.120308,92.483518,94.203878,94.564026,95.260785


In [21]:
# dsne_uneven_scores = get_score_combinations(
#     suffix='dsne', 
#     domains=['A','W','D'], 
#     avg_types=['macro avg'],
#     from_date='20191104152038',
#     to_date='20191104182550'
# )*100
# dsne_uneven_scores.describe().T

dsne_uneven_scores = get_score_combinations_new(
    method='dsne',
    experiment_id='dsne_uneven',
    avg_types=['macro avg'],
)*100
dsne_uneven_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,81.331476,2.280087,77.81342,80.707778,81.545574,82.927621,83.662988
A->D,5.0,82.184021,4.724724,75.314405,79.355833,84.275841,85.354313,86.619711
W->A,5.0,63.370318,1.643826,61.438764,62.224592,63.638733,63.83983,65.709672
W->D,5.0,87.876722,4.714901,81.59864,85.661237,88.425745,89.325191,94.372796
D->A,5.0,62.785679,1.008744,61.346362,62.375043,63.042269,63.084983,64.079737
D->W,5.0,91.057538,1.481214,88.950284,90.322571,91.276593,91.977455,92.760784


In [22]:
# dsne_large_uneven_scores = get_score_combinations(
#     suffix='dsne', 
#     domains=['A','W','D'], 
#     avg_types=['macro avg'],
#     from_date='20191105120356',
#     to_date='20191105154214'
# )*100
# dsne_large_uneven_scores.describe().T

dsne_large_uneven_scores = get_score_combinations_new(
    method='dsne',
    experiment_id='dsne_uneven_large',
    avg_types=['macro avg'],
)*100
dsne_large_uneven_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,82.006544,1.746592,80.447386,81.103619,81.287498,82.297248,84.896968
A->D,5.0,80.857453,3.553114,75.810612,80.170605,80.395155,82.351191,85.559702
W->A,5.0,64.476773,1.09209,63.534057,63.730474,64.034078,64.888069,66.197187
W->D,5.0,89.686751,3.566893,85.466601,88.014192,88.122183,93.292066,93.538713
D->A,5.0,63.997957,1.287688,62.607819,63.108509,64.067159,64.248378,65.957922
D->W,5.0,90.099385,2.351134,86.595243,89.745318,89.788602,91.505607,92.862155


In [23]:
# dsne_resnet_uneven_scores = get_score_combinations(
#     suffix='dsne', 
#     domains=['A','W','D'], 
#     avg_types=['macro avg'],
#     from_date='20191107085855',
#     to_date='20191107131237'
# )*100
# dsne_resnet_uneven_scores.describe().T


dsne_resnet_uneven_scores = get_score_combinations_new(
    method='dsne',
    experiment_id='dsne_uneven_resnet',
    avg_types=['macro avg'],
)*100
dsne_resnet_uneven_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,87.113041,2.059469,84.446705,86.025125,86.758302,88.938416,89.396658
A->D,5.0,87.076223,2.464536,83.374793,85.919893,87.734352,89.119094,89.232982
W->A,5.0,68.358893,2.172218,66.098332,67.311365,67.603428,69.00508,71.776258
W->D,5.0,91.403295,1.552391,88.924168,91.046221,91.694993,92.527626,92.823467
D->A,5.0,69.11144,2.098614,65.398859,69.586397,69.975771,70.148826,70.447345
D->W,5.0,90.969955,0.464405,90.392922,90.578467,91.097682,91.332567,91.448135


## Dage

In [24]:
# dage_resnet_uneven_scores = get_score_combinations(
#     suffix='homebrew', 
#     domains=['A','W','D'], 
#     avg_types=['macro avg'],
#     from_date='20191113093959',
#     to_date='20191113143542'
# )*100
# dage_resnet_uneven_scores.describe().T

dage_resnet_uneven_scores = get_score_combinations_new(
    method='dage',
    experiment_id='dage_uneven_resnet',
    avg_types=['macro avg'],
)*100
dage_resnet_uneven_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,89.288269,1.675657,86.808312,88.916546,89.364825,89.938798,91.412862
A->D,5.0,88.8645,2.584009,85.228901,87.131043,89.950014,90.748162,91.264381
W->A,5.0,65.502206,2.647026,62.979116,63.671843,64.131791,68.203819,68.524459
W->D,5.0,90.300239,3.579583,86.463322,87.055836,90.563158,92.549446,94.869432
D->A,5.0,65.268226,1.128416,63.829532,64.651385,65.056321,66.284887,66.519007
D->W,5.0,89.287878,1.89665,86.551691,88.577565,89.257746,90.6058,91.44659


### DAGE on logits
Varying batch_size

In [25]:
dage_logits_bs_16_scores = get_score_combinations_new(
    method='dage_logits',
    experiment_id='dage_batch_size',
    config_key='batch_size',
    config_value=16,
    avg_types=['macro avg'],
)*100
dage_logits_bs_16_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,4.0,79.083908,1.271973,77.54123,78.312879,79.231974,80.003002,80.330453
A->D,5.0,78.920532,2.641869,75.766935,76.451872,79.93522,80.882264,81.566368
W->A,5.0,58.906196,1.02569,57.556578,58.502733,58.910418,59.191292,60.369961
W->D,5.0,84.419557,2.946203,81.163686,82.923013,82.927431,87.262096,87.821561
D->A,5.0,60.146131,2.43571,57.159801,57.890933,61.403548,61.814952,62.461423
D->W,5.0,84.306695,2.23792,80.350447,84.733161,85.307703,85.570343,85.571821


In [26]:
dage_logits_bs_64_scores = get_score_combinations_new(
    method='dage_logits',
    experiment_id='dage_batch_size',
    config_key='batch_size',
    config_value=64,
    avg_types=['macro avg'],
)*100
dage_logits_bs_64_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,80.028115,0.590132,79.128785,79.939559,79.989499,80.38637,80.696359
A->D,5.0,78.932016,2.065651,76.18375,78.19784,78.513206,80.111157,81.654129
W->A,5.0,59.314446,1.868753,56.441307,59.320351,59.325398,59.838513,61.64666
W->D,5.0,81.452787,3.03951,78.348841,79.628901,80.6347,82.47962,86.171874
D->A,5.0,58.672926,0.980122,56.984856,58.668941,59.081773,59.279276,59.349786
D->W,5.0,80.804733,1.842784,79.017118,79.873186,80.463223,80.798907,83.87123


In [27]:
dage_logits_bs_256_scores = get_score_combinations_new(
    method='dage_logits',
    experiment_id='dage_batch_size',
    config_key='batch_size',
    config_value=256,
    avg_types=['macro avg'],
)*100
dage_logits_bs_256_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,78.077035,0.936379,77.276619,77.3176,77.694935,78.688687,79.407335
A->D,5.0,78.377147,2.118085,76.297195,76.590952,78.110061,79.491057,81.396471
W->A,5.0,57.651993,1.960048,54.485954,57.603871,57.714276,58.797821,59.658041
W->D,5.0,79.636448,1.50542,77.538537,78.58454,80.280083,80.853009,80.926072
D->A,5.0,58.045696,1.635476,56.91339,57.114697,57.236735,58.108714,60.854943
D->W,5.0,77.601154,1.620823,75.442019,76.746547,77.719568,78.374074,79.723563


In [28]:
dage_logits_bs_1024_scores = get_score_combinations_new(
    method='dage_logits',
    experiment_id='dage_batch_size',
    config_key='batch_size',
    config_value=1024,
    avg_types=['macro avg'],
)*100
dage_logits_bs_1024_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,76.556109,2.521913,72.080752,77.200189,77.575761,77.916085,78.00776
A->D,5.0,77.319947,2.220683,74.284633,76.087518,77.352838,79.324113,79.550633
W->A,5.0,57.49053,1.59136,55.947844,56.238811,57.079315,58.431549,59.755131
W->D,5.0,78.148223,3.468832,74.20793,74.816799,79.374859,80.240505,82.10102
D->A,5.0,56.912722,2.119283,54.136796,55.505918,57.076737,58.623209,59.220951
D->W,5.0,78.203661,1.282889,76.951547,77.055732,78.214027,78.749538,80.047461


In [29]:
dage_logits_bs_full_scores = get_score_combinations_new(
    method='dage_logits',
    experiment_id='dage_batch_size',
    config_key='batch_size',
    config_value=4096,
    avg_types=['macro avg'],
)*100
dage_logits_bs_full_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,76.015487,2.008171,73.251027,74.611663,76.870338,77.286302,78.058107
A->D,5.0,76.036335,2.05374,72.838716,75.856811,76.143272,76.879023,78.463851
W->A,5.0,58.777794,1.878089,55.750784,58.288953,59.410817,59.944393,60.494025
W->D,5.0,76.83271,1.943947,73.873896,76.535063,77.081852,77.417541,79.255198
D->A,5.0,57.77279,1.744264,55.799524,57.014259,57.458837,58.078332,60.513001
D->W,5.0,76.145234,0.844387,75.609745,75.611926,75.619274,76.337409,77.547813


### DAGE full on embeds
Varying batch_size

In [30]:
dage_embeds_bs_8_scores = get_score_combinations_new(
    method='dage_full',
    experiment_id='dage_embeds_batch_size',
    config_key='batch_size',
    config_value=8,
    avg_types=['macro avg'],
)*100
dage_embeds_bs_8_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,79.994329,1.362808,78.719737,78.746797,79.649224,81.331867,81.524022
A->D,5.0,79.999508,2.138067,77.290782,79.280819,79.907742,80.304644,83.213552
W->A,5.0,60.818767,1.540717,59.238493,59.58724,60.673469,61.559759,63.034873
W->D,5.0,84.134145,5.766925,77.580481,80.457541,83.70427,86.337104,92.591328
D->A,5.0,60.796396,2.133414,58.608103,59.696477,60.008891,61.553345,64.115165
D->W,5.0,85.6564,0.581485,84.980913,85.136223,85.794988,86.030473,86.339403


In [31]:
dage_embeds_bs_16_scores = get_score_combinations_new(
    method='dage_full',
    experiment_id='dage_embeds_batch_size',
    config_key='batch_size',
    config_value=16,
    avg_types=['macro avg'],
)*100
dage_embeds_bs_16_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,80.202092,0.311357,79.905275,80.018471,80.035384,80.405736,80.645592
A->D,5.0,81.548517,2.58906,77.663175,80.149496,82.817686,83.34623,83.766
W->A,5.0,60.788798,1.039866,59.429113,60.112678,60.880445,61.51477,62.006982
W->D,5.0,85.473387,2.188494,83.251723,84.754776,84.76889,85.483686,89.107861
D->A,5.0,59.394119,2.107688,56.432292,58.518405,59.384284,60.674045,61.961569
D->W,5.0,87.244853,1.785414,84.085285,87.707117,87.876475,88.173321,88.382067


In [32]:
dage_embeds_bs_32_scores = get_score_combinations_new(
    method='dage_full',
    experiment_id='dage_embeds_batch_size',
    config_key='batch_size',
    config_value=32,
    avg_types=['macro avg'],
)*100
dage_embeds_bs_32_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,79.371127,1.546785,77.188373,78.49967,79.609332,80.668139,80.890121
A->D,5.0,80.040065,3.422389,76.489657,77.208245,79.42501,82.711934,84.36548
W->A,5.0,58.299218,1.806373,55.495613,57.809094,58.664734,59.248212,60.27844
W->D,5.0,81.736062,2.890562,76.812753,81.850909,82.442604,83.428439,84.145605
D->A,5.0,59.523528,1.289578,58.142662,59.059336,59.302092,59.473827,61.639722
D->W,5.0,84.015783,2.618205,81.398696,83.094715,83.555621,83.615205,88.414676


In [33]:
dage_embeds_bs_64_scores = get_score_combinations_new(
    method='dage_full',
    experiment_id='dage_embeds_batch_size',
    config_key='batch_size',
    config_value=64,
    avg_types=['macro avg'],
)*100
dage_embeds_bs_64_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,77.987673,2.171584,74.280067,78.3238,78.506751,78.836706,79.99104
A->D,5.0,79.801568,2.300227,77.43541,78.61366,79.295146,80.15043,83.513195
W->A,5.0,58.199223,2.497218,54.331033,57.270018,59.163182,59.467387,60.764495
W->D,5.0,80.17652,2.340317,76.924322,79.410279,80.304661,80.864527,83.378809
D->A,5.0,59.119945,1.788596,57.357949,57.625739,58.610512,60.768776,61.23675
D->W,5.0,80.460327,2.734946,75.652358,81.086904,81.48546,81.603335,82.473578


In [34]:
dage_embeds_bs_128_scores = get_score_combinations_new(
    method='dage_full',
    experiment_id='dage_embeds_batch_size',
    config_key='batch_size',
    config_value=128,
    avg_types=['macro avg'],
)*100
dage_embeds_bs_128_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,78.452532,0.658984,77.771016,78.094979,78.135232,78.881536,79.379899
A->D,5.0,79.824112,2.210336,77.147235,78.609714,79.414721,81.112341,82.836548
W->A,5.0,57.8945,1.828793,55.558275,57.039884,57.939295,58.409507,60.52554
W->D,5.0,79.240452,3.369077,74.241175,77.296667,81.053702,81.500167,82.110551
D->A,5.0,57.90735,1.982,55.780913,55.832351,58.501436,59.595872,59.826177
D->W,5.0,78.202525,1.276799,77.294346,77.303749,77.306284,79.089689,80.018555


In [35]:
dage_embeds_bs_full_scores = get_score_combinations_new(
    method='dage_full',
    experiment_id='dage_embeds_batch_size',
    config_key='batch_size',
    config_value=4096,
    avg_types=['macro avg'],
)*100
dage_embeds_bs_full_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,75.839727,1.043335,74.703065,74.960641,75.844727,76.479166,77.211036
A->D,5.0,77.039101,0.685601,76.363638,76.463781,76.94587,77.410218,78.012
W->A,5.0,57.231632,1.142483,56.071605,56.466062,56.800323,58.006487,58.81368
W->D,5.0,77.564211,1.42229,75.406859,77.158046,77.811356,78.208278,79.236514
D->A,5.0,57.407096,1.235219,55.532449,57.030148,57.506901,58.236401,58.72958
D->W,5.0,77.680942,2.344724,74.172929,76.355856,79.002208,79.345524,79.528195


### DAGE full across on embeds
Varying loss alpha

In [49]:
dage_embeds_full_across_025_scores = get_score_combinations_new(
    method='dage_full_across',
    experiment_id='dage_full_across_alpha',
    config_key='loss_alpha',
    config_value=0.25,
    avg_types=['macro avg'],
)*100
dage_embeds_full_across_025_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,78.132738,0.957914,76.601064,78.030759,78.169007,78.795717,79.067146
A->D,5.0,78.815805,2.463657,76.690671,76.790369,78.34773,79.580003,82.67025
W->A,5.0,58.631871,0.776146,57.887623,58.069207,58.301541,59.238696,59.662289
W->D,5.0,78.755262,2.148625,75.363926,78.329984,79.338623,79.591798,81.151977
D->A,5.0,58.501797,1.144103,56.98439,58.135066,58.464294,58.77301,60.152224
D->W,5.0,81.377175,1.974768,78.859831,80.477262,80.81299,83.045625,83.690167


In [50]:
dage_embeds_full_across_05_scores = get_score_combinations_new(
    method='dage_full_across',
    experiment_id='dage_full_across_alpha',
    config_key='loss_alpha',
    config_value=0.5,
    avg_types=['macro avg'],
)*100
dage_embeds_full_across_05_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,79.005367,0.536058,78.525734,78.625045,78.846742,79.174931,79.854382
A->D,5.0,81.615218,2.296978,77.661222,81.860839,82.268975,82.744232,83.540823
W->A,5.0,58.974101,1.480721,56.867737,58.015734,59.649797,59.939936,60.3973
W->D,5.0,84.83158,3.680084,81.591465,81.966712,84.420459,85.437513,90.741749
D->A,5.0,59.218419,2.439969,55.791202,58.991089,59.166292,59.46803,62.675482
D->W,5.0,84.550307,1.720351,82.915333,83.394871,83.613755,86.350512,86.477064


In [51]:
dage_embeds_full_across_075_scores = get_score_combinations_new(
    method='dage_full_across',
    experiment_id='dage_full_across_alpha',
    config_key='loss_alpha',
    config_value=0.75,
    avg_types=['macro avg'],
)*100
dage_embeds_full_across_075_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,80.556851,2.158697,77.866955,79.343734,80.538891,81.457769,83.576905
A->D,5.0,81.191325,1.418538,79.733436,80.108261,80.91918,81.997996,83.197753
W->A,5.0,60.398224,1.915917,58.392399,58.65224,60.30425,61.999161,62.64307
W->D,5.0,86.507247,2.446548,82.947983,85.521752,86.847887,87.787879,89.430732
D->A,5.0,62.321222,1.389487,60.987556,60.995089,62.202245,63.316619,64.104599
D->W,5.0,86.449848,3.512403,80.967249,86.082391,86.294968,88.710572,90.194059


### DAGE pair across on embeds
Varying loss alpha

In [52]:
dage_embeds_pair_across_025_scores = get_score_combinations_new(
    method='dage_pair_across',
    experiment_id='dage_pair_across_alpha',
    config_key='loss_alpha',
    config_value=0.25,
    avg_types=['macro avg'],
)*100
dage_embeds_pair_across_025_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,81.022165,0.952997,79.723185,80.480612,81.179533,81.53901,82.188482
A->D,5.0,81.147679,2.978561,77.68631,79.464225,81.42744,81.487423,85.672998
W->A,5.0,61.307122,2.799733,56.609608,61.184955,62.33414,62.487376,63.919532
W->D,5.0,90.071752,1.659351,87.658533,89.328289,90.620986,90.691236,92.059717
D->A,5.0,61.011129,1.924858,58.69324,59.249128,61.653463,62.624866,62.834948
D->W,5.0,89.26586,2.572536,85.201268,88.6407,89.629717,91.240433,91.61718


In [53]:
dage_embeds_pair_across_05_scores = get_score_combinations_new(
    method='dage_pair_across',
    experiment_id='dage_pair_across_alpha',
    config_key='loss_alpha',
    config_value=0.5,
    avg_types=['macro avg'],
)*100
dage_embeds_pair_across_05_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,80.703122,1.293152,78.952336,79.78925,81.241225,81.42152,82.111279
A->D,5.0,80.964925,1.820209,79.081145,79.679134,80.561091,81.884715,83.61854
W->A,5.0,62.413675,1.70298,59.790147,61.71447,62.916765,63.807891,63.839101
W->D,5.0,90.168267,4.321881,83.208969,89.796508,90.655388,92.47498,94.70549
D->A,5.0,62.530311,3.291501,57.157322,62.409159,62.652462,65.152501,65.280113
D->W,5.0,89.613031,2.299037,86.660665,88.08542,89.637628,91.552261,92.129179


In [54]:
dage_embeds_pair_across_075_scores = get_score_combinations_new(
    method='dage_pair_across',
    experiment_id='dage_pair_across_alpha',
    config_key='loss_alpha',
    config_value=0.75,
    avg_types=['macro avg'],
)*100
dage_embeds_pair_across_075_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,79.763566,0.601079,79.259557,79.352233,79.365214,80.40662,80.434207
A->D,5.0,80.330842,3.000603,76.5103,77.787337,81.899086,82.025442,83.432044
W->A,5.0,63.437435,0.822838,62.06192,63.351282,63.676783,64.001346,64.095845
W->D,5.0,90.870442,0.965928,89.312267,90.786785,90.936787,91.497449,91.818924
D->A,5.0,62.046068,1.898032,59.891496,60.236502,62.833298,62.991372,64.277673
D->W,5.0,91.396558,0.988414,89.978404,91.062461,91.288408,92.180427,92.473088


### DAGE on aux dense layer
Varying embedding size

In [36]:
dage_aux_dense_16_bs_full_scores = get_score_combinations_new(
    method='dage_aux_dense',
    experiment_id='dage_vary_emb_size',
    config_key='aux_dense_size',
    config_value=16,
    avg_types=['macro avg'],
)*100
dage_aux_dense_16_bs_full_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,3.0,75.67096,1.762363,74.378412,74.667215,74.956017,76.317234,77.678451
A->D,2.0,76.873513,0.613393,76.439779,76.656646,76.873513,77.09038,77.307247
W->A,3.0,58.168872,1.57147,56.48762,57.452941,58.418262,59.009498,59.600735
W->D,3.0,77.336463,1.372849,76.441625,76.546149,76.650673,77.783883,78.917092
D->A,3.0,58.300719,1.674665,56.474824,57.568526,58.662227,59.213666,59.765105
D->W,3.0,76.847087,0.288125,76.559002,76.703004,76.847006,76.991129,77.135253


In [37]:
dage_aux_dense_31_bs_full_scores = get_score_combinations_new(
    method='dage_aux_dense',
    experiment_id='dage_vary_emb_size',
    config_key='aux_dense_size',
    config_value=31,
    avg_types=['macro avg'],
)*100
dage_aux_dense_31_bs_full_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,76.648572,1.123459,75.224573,76.08036,76.805344,76.863081,78.269501
A->D,5.0,76.263721,2.00848,73.99799,74.339123,76.915329,77.479687,78.586473
W->A,5.0,57.626929,1.346506,56.29634,56.862759,57.017681,58.319054,59.638812
W->D,5.0,76.878696,2.952629,73.112087,74.957499,77.663968,77.853541,80.806387
D->A,5.0,58.093881,1.741193,56.258467,56.579651,57.881051,59.682147,60.068087
D->W,5.0,77.190155,1.266553,75.055168,77.081361,77.604168,78.080643,78.129434


In [38]:
dage_aux_dense_64_bs_full_scores = get_score_combinations_new(
    method='dage_aux_dense',
    experiment_id='dage_vary_emb_size',
    config_key='aux_dense_size',
    config_value=64,
    avg_types=['macro avg'],
)*100
dage_aux_dense_64_bs_full_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,75.851632,1.220615,74.391887,74.870866,76.215497,76.352268,77.427643
A->D,5.0,76.102984,2.761717,71.75978,76.106419,76.246439,77.013755,79.388528
W->A,5.0,58.074874,1.906263,56.402399,56.590635,57.228511,59.441361,60.711463
W->D,5.0,78.156971,1.779846,76.737548,77.264012,77.31661,78.287108,81.179574
D->A,5.0,57.341154,1.306844,56.041984,56.174579,57.087808,58.596502,58.804898
D->W,5.0,77.703387,0.378159,77.389197,77.432831,77.552896,77.839525,78.302486


In [39]:
# The above experiments used all data every batch. We saw this to be suboptimal in the logits experiments, so here is a sredo with bs=16

In [40]:
dage_aux_dense_16_bs_16_scores = get_score_combinations_new(
    method='dage_aux_dense',
    experiment_id='dage_aux_dense_low_bs',
    config_key='aux_dense_size',
    config_value=16,
    avg_types=['macro avg'],
)*100
dage_aux_dense_16_bs_16_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,78.095393,1.564628,76.040409,77.574056,77.682719,79.015373,80.164409
A->D,5.0,78.983155,1.959221,77.112999,77.514309,79.06601,79.129493,82.092964
W->A,5.0,58.935442,1.330225,56.72927,59.027011,59.28159,59.310317,60.329024
W->D,5.0,78.89336,2.1555,75.905747,78.386469,78.845006,79.415049,81.914528
D->A,5.0,58.703949,1.243395,56.708511,58.252632,59.481615,59.516449,59.560539
D->W,5.0,78.60811,1.114381,76.993733,78.518521,78.536653,78.874543,80.117101


In [41]:
dage_aux_dense_31_bs_16_scores = get_score_combinations_new(
    method='dage_aux_dense',
    experiment_id='dage_aux_dense_low_bs',
    config_key='aux_dense_size',
    config_value=31,
    avg_types=['macro avg'],
)*100
dage_aux_dense_31_bs_16_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,77.986278,1.17924,75.946036,78.189759,78.290866,78.538023,78.966706
A->D,5.0,80.064392,2.253147,78.022577,78.255046,80.019891,80.38836,83.636087
W->A,5.0,58.451498,0.796745,57.272855,58.065425,58.778464,58.817357,59.323391
W->D,5.0,80.165652,4.047821,75.568208,77.2705,79.271136,84.202619,84.515796
D->A,5.0,59.688565,1.34341,58.136381,58.691773,59.697965,60.414609,61.5021
D->W,5.0,79.781735,1.356648,78.260484,78.857827,79.440324,80.862675,81.487365


In [42]:
dage_aux_dense_64_bs_16_scores = get_score_combinations_new(
    method='dage_aux_dense',
    experiment_id='dage_aux_dense_low_bs',
    config_key='aux_dense_size',
    config_value=64,
    avg_types=['macro avg'],
)*100
dage_aux_dense_64_bs_16_scores.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,78.560228,1.451011,76.966034,77.766725,77.969105,79.567699,80.531575
A->D,5.0,79.898318,2.521151,77.70705,77.892294,78.989637,81.281296,83.621313
W->A,5.0,58.018935,1.594799,55.961571,56.621681,59.117745,59.141312,59.252368
W->D,5.0,80.10027,2.190833,77.530068,78.979636,79.223601,82.121687,82.64636
D->A,5.0,59.459487,0.856987,58.351705,59.036513,59.29368,60.127541,60.487996
D->W,5.0,79.673429,1.46786,78.005352,79.174824,79.485187,79.6721,82.029682


## Summary

In [55]:
def present(df, name):
    df = df.describe().T[['mean','std']]
    tot_mean = df.mean()[['mean']]
    
    toStr = lambda s: f'{s:.2f}'
    df = df['mean'].map(toStr) + ' ±' + df['std'].map(toStr)
    df = pd.concat([df,tot_mean[['mean']]])
    return pd.DataFrame(df, columns=[name]).T


def highlight_max(s):
    is_max = s == s.max()
    return ['font-weight: bold' if v else '' for v in is_max]

df_all_from_img_vgg16 = pd.concat([
    present(tune_source_no_aug_scores, 'FT (source only, w/o aug, VGG16)'),
    present(tune_source_scores, 'FT (source only, w. aug, VGG16)'),
    present(tune_both_scores, 'FT (source and target, w. aug, VGG16)'),
    present(multitask_scores, 'Multitask, w. aug, VGG16'),
    present(ccsa_scores, 'CCSA (even, w. aug, VGG16)'),
    present(ccsa_uneven_scores, 'CCSA (uneven, w. aug, VGG16)'),
    present(dsne_scores, 'd-SNE (even, w. aug, VGG16)'),
    present(dsne_uneven_scores, 'd-SNE (uneven, w. aug, VGG16)'),
#     present(dsne_large_uneven_scores, 'd-SNE (large, w. aug, uneven, VGG16)'),
#     present(dage_logits_bs_16_scores, 'DAGE (uneven, w/o. aug, VGG16, bs 16)'),
#     present(dage_logits_bs_full_scores, 'DAGE (uneven, w/o. aug, VGG16, bs full)'),
#     present(ccsa_resnet_uneven_scores, 'CCSA (uneven, ResNet101_v2)'),
#     present(dsne_resnet_uneven_scores, 'd-SNE (uneven, ResNet101_v2)'),
#     present(dage_resnet_uneven_scores, 'DAGE (uneven, ResNet101_v2)'),
])

df_all_from_feat_vgg16 = pd.concat([
    present(ccsa_from_feat_batch_size_16_uneven_scores, 'CCSA (uneven, w/o. aug, VGG16, bs 16)'),
    present(ccsa_from_feat_batch_size_32_uneven_scores, 'CCSA (uneven, w/o. aug, VGG16, bs 32)'),
    present(ccsa_from_feat_batch_size_64_uneven_scores, 'CCSA (uneven, w/o. aug, VGG16, bs 64)'),
    present(ccsa_from_feat_batch_size_128_uneven_scores, 'CCSA (uneven, w/o. aug, VGG16, bs 128)'),
    present(ccsa_from_feat_batch_size_256_uneven_scores, 'CCSA (uneven, w/o. aug, VGG16, bs 256)'),
    present(ccsa_from_feat_batch_size_4096_uneven_scores, 'CCSA (uneven, w/o. aug, VGG16, bs full)'),
    present(dage_logits_bs_16_scores, 'DAGE (uneven, logits, w/o. aug, VGG16, bs 16)'),
    present(dage_logits_bs_64_scores, 'DAGE (uneven, logits, w/o. aug, VGG16, bs 64)'),
    present(dage_logits_bs_256_scores, 'DAGE (uneven, logits, w/o. aug, VGG16, bs 256)'),
    present(dage_logits_bs_1024_scores, 'DAGE (uneven, logits, w/o. aug, VGG16, bs 1024)'),
    present(dage_logits_bs_full_scores, 'DAGE (uneven, logits, w/o. aug, VGG16, bs full)'),
    present(dage_embeds_bs_8_scores, 'DAGE (uneven, embeds, full, w/o. aug, VGG16, bs 8)'),
    present(dage_embeds_bs_16_scores, 'DAGE (uneven, embeds, full, w/o. aug, VGG16, bs 16)'),
    present(dage_embeds_bs_32_scores, 'DAGE (uneven, embeds, full, w/o. aug, VGG16, bs 32)'),
    present(dage_embeds_bs_64_scores, 'DAGE (uneven, embeds, full, w/o. aug, VGG16, bs 64)'),
    present(dage_embeds_bs_128_scores, 'DAGE (uneven, embeds, full, w/o. aug, VGG16, bs 128)'),
    present(dage_embeds_bs_full_scores, 'DAGE (uneven, embeds, full, w/o. aug, VGG16, bs full)'),
    present(dage_embeds_full_across_025_scores, 'DAGE (uneven, embeds, full across, alpha=0.25 w/o. aug, VGG16, bs 16)'),
    present(dage_embeds_full_across_05_scores,  'DAGE (uneven, embeds, full across, alpha=0.50 w/o. aug, VGG16, bs 16)'),
    present(dage_embeds_full_across_075_scores, 'DAGE (uneven, embeds, full across, alpha=0.75 w/o. aug, VGG16, bs 16)'),
    present(dage_embeds_pair_across_025_scores, 'DAGE (uneven, embeds, pair across, alpha=0.25 w/o. aug, VGG16, bs 16)'),
    present(dage_embeds_pair_across_05_scores,  'DAGE (uneven, embeds, pair across, alpha=0.50 w/o. aug, VGG16, bs 16)'),
    present(dage_embeds_pair_across_075_scores, 'DAGE (uneven, embeds, pair across, alpha=0.75 w/o. aug, VGG16, bs 16)'),
    present(dage_aux_dense_16_bs_full_scores, 'DAGE (uneven, aux dense 16, w/o. aug, VGG16, bs full)'),
    present(dage_aux_dense_31_bs_full_scores, 'DAGE (uneven, aux dense 31, w/o. aug, VGG16, bs full)'),
    present(dage_aux_dense_64_bs_full_scores, 'DAGE (uneven, aux dense 64, w/o. aug, VGG16, bs full)'),
    present(dage_aux_dense_16_bs_16_scores, 'DAGE (uneven, aux dense 16, w/o. aug, VGG16, bs 16)'),
    present(dage_aux_dense_31_bs_16_scores, 'DAGE (uneven, aux dense 31, w/o. aug, VGG16, bs 16)'),
    present(dage_aux_dense_64_bs_16_scores, 'DAGE (uneven, aux dense 64, w/o. aug, VGG16, bs 16)'),
])

df_all_from_img_renset101 = pd.concat([
    present(ccsa_resnet_uneven_scores, 'CCSA (uneven, ResNet101_v2)'),
    present(dsne_resnet_uneven_scores, 'd-SNE (uneven, ResNet101_v2)'),
    present(dage_resnet_uneven_scores, 'DAGE (uneven, ResNet101_v2)'),
])

In [44]:
df_all_from_img_vgg16.style.apply(highlight_max)

Unnamed: 0,A->W,A->D,W->A,W->D,D->A,D->W,mean
"FT (source only, w/o aug, VGG16)",58.65 ±1.68,66.12 ±1.98,45.66 ±1.14,98.71 ±0.49,45.96 ±2.78,91.56 ±1.37,67.7758
"FT (source only, w. aug, VGG16)",59.52 ±1.31,67.16 ±1.62,45.30 ±2.33,98.66 ±0.77,46.94 ±1.68,94.33 ±1.31,68.6523
"FT (source and target, w. aug, VGG16)",61.45 ±1.87,71.57 ±2.74,52.38 ±1.51,99.17 ±0.62,52.61 ±2.53,95.05 ±2.36,72.0369
"Multitask, w. aug, VGG16",82.79 ±2.18,80.12 ±3.58,63.62 ±1.67,96.02 ±1.81,63.54 ±2.04,93.96 ±0.80,80.0078
"CCSA (even, w. aug, VGG16)",84.12 ±1.93,82.13 ±3.74,61.11 ±2.25,92.81 ±2.51,62.38 ±2.73,93.71 ±1.36,79.378
"CCSA (uneven, w. aug, VGG16)",84.71 ±1.43,82.60 ±2.86,62.24 ±3.07,93.79 ±2.25,61.60 ±3.13,93.24 ±2.46,79.6966
"d-SNE (even, w. aug, VGG16)",83.60 ±2.93,81.46 ±1.80,62.48 ±2.37,93.67 ±2.51,63.60 ±1.22,93.73 ±1.36,79.756
"d-SNE (uneven, w. aug, VGG16)",81.33 ±2.28,82.18 ±4.72,63.37 ±1.64,87.88 ±4.71,62.79 ±1.01,91.06 ±1.48,78.101


In [56]:
df_all_from_feat_vgg16.style.apply(highlight_max)

Unnamed: 0,A->W,A->D,W->A,W->D,D->A,D->W,mean
"CCSA (uneven, w/o. aug, VGG16, bs 16)",79.97 ±2.34,80.01 ±1.29,61.97 ±1.28,90.42 ±3.11,61.92 ±1.33,90.64 ±1.51,77.4893
"CCSA (uneven, w/o. aug, VGG16, bs 32)",80.01 ±1.55,79.59 ±1.41,62.50 ±1.23,90.45 ±2.09,63.11 ±1.03,90.81 ±1.08,77.7451
"CCSA (uneven, w/o. aug, VGG16, bs 64)",79.71 ±2.31,79.14 ±3.45,61.36 ±1.11,88.77 ±2.19,62.50 ±1.69,90.69 ±2.12,77.0291
"CCSA (uneven, w/o. aug, VGG16, bs 128)",79.57 ±1.78,78.38 ±2.58,61.72 ±1.30,88.61 ±1.86,63.45 ±1.40,88.27 ±3.02,76.6672
"CCSA (uneven, w/o. aug, VGG16, bs 256)",79.38 ±1.99,78.79 ±2.30,58.47 ±3.25,86.85 ±4.26,60.12 ±2.54,86.12 ±3.02,74.9541
"CCSA (uneven, w/o. aug, VGG16, bs full)",74.34 ±2.77,70.57 ±3.10,55.95 ±2.53,80.93 ±3.12,55.97 ±5.17,82.36 ±3.74,70.0214
"DAGE (uneven, logits, w/o. aug, VGG16, bs 16)",79.08 ±1.27,78.92 ±2.64,58.91 ±1.03,84.42 ±2.95,60.15 ±2.44,84.31 ±2.24,74.2972
"DAGE (uneven, logits, w/o. aug, VGG16, bs 64)",80.03 ±0.59,78.93 ±2.07,59.31 ±1.87,81.45 ±3.04,58.67 ±0.98,80.80 ±1.84,73.2008
"DAGE (uneven, logits, w/o. aug, VGG16, bs 256)",78.08 ±0.94,78.38 ±2.12,57.65 ±1.96,79.64 ±1.51,58.05 ±1.64,77.60 ±1.62,71.5649
"DAGE (uneven, logits, w/o. aug, VGG16, bs 1024)",76.56 ±2.52,77.32 ±2.22,57.49 ±1.59,78.15 ±3.47,56.91 ±2.12,78.20 ±1.28,70.7719


In [46]:
df_all_from_img_renset101.style.apply(highlight_max)

Unnamed: 0,A->W,A->D,W->A,W->D,D->A,D->W,mean
"CCSA (uneven, ResNet101_v2)",88.52 ±1.16,87.69 ±2.36,68.78 ±2.00,94.85 ±1.43,67.16 ±3.65,93.64 ±0.67,83.441
"d-SNE (uneven, ResNet101_v2)",87.11 ±2.06,87.08 ±2.46,68.36 ±2.17,91.40 ±1.55,69.11 ±2.10,90.97 ±0.46,82.3388
"DAGE (uneven, ResNet101_v2)",89.29 ±1.68,88.86 ±2.58,65.50 ±2.65,90.30 ±3.58,65.27 ±1.13,89.29 ±1.90,81.4186


# Utils

In [47]:
[ str(p) for p in
    load_cls_rep_paths(
        suffix='tune_target',
        from_date='20191023072318',
        to_date='20191023120138'
    )
]

ValueError: invalid literal for int() with base 10: 'tune'

In [None]:
# restructure former runs
def reorganize(from_date, to_date, suffix, experiment_id='your_id', description='your_description_here'):

    RUNS_DIR_OLD = RUNS_DIR / '..' / 'runs_old'

    for item in RUNS_DIR_OLD.glob('*'):
        parts = item.name.split('_')
        if len(parts)<3:
            continue
        timestamp = parts[0]
        src = parts[1]
        tgt = parts[2]
        method = '_'.join(parts[3:])
        
        if (item.is_dir() 
            and item.name.endswith(suffix)
            and int(item.name.split('_')[0]) >= int(from_date)
            and int(item.name.split('_')[0]) <= int(to_date)):

            seed = load_json(item / 'config.json')['seed']

            new_method_dir = RUNS_DIR / method / experiment_id 
            new_run_dir = new_method_dir / '{}{}_{}_{}'.format(src,tgt,seed,timestamp)
            
            try:
                shutil.copytree(item, new_run_dir)
            except:
                pass
            
            new_run_dir.mkdir(parents=True, exist_ok=True)

            with open(new_method_dir / 'description.txt', 'a') as f:
                f.write(description)


In [None]:
reorganize(
    from_date='20191014123846',
    to_date='20191014162536',
    suffix='tune_source',
    experiment_id='tune_source_no_aug',
    description='In this experiment, we tune a VGG16-network pretrained on ImageNet with all available source data. The target data is used for validation (during training) and test. No augmentation was used'
)

reorganize(
    from_date='20191022103424',
    to_date='20191022142437',
    suffix='tune_source',
    experiment_id='tune_source_with_aug',
    description='In this experiment, we tune a VGG16-network pretrained on ImageNet with all available source data. The target data is used for validation (during training) and test. Augmentation was used'
)

reorganize(
    from_date='20191023072318',
    to_date='20191023123426',
    suffix='tune_target',
    experiment_id='tune_target_with_aug',
    description='Fine tune target from ImageNet model that was already fine-tuned on source. Augmentation was used. No batchnorm was used.'
)

reorganize(
    from_date='20191025141713',
    to_date='20191025170306',
    suffix='ccsa',
    experiment_id='ccsa_without_batchnorn',
    description='CCSA method without batchnorm. In this experiment, we saw stability issues for loss_alpha over 0.01.'
)

reorganize(
    from_date='20191101173414',
    to_date='20191101191220',
    suffix='ccsa',
    experiment_id='ccsa_with_batchnorn_alpha_0',
    description='CCSA method with batchnorm and loss_alpha=0. This amounts to a multi-task learning setup.'
)

reorganize(
    from_date='20191101150707',
    to_date='20191101171905',
    suffix='ccsa',
    experiment_id='ccsa_with_batchnorn_alpha_0.25',
    description='CCSA method with batchnorm and loss_alpha=0.25.'
)

reorganize(
    from_date='20191104152050',
    to_date='20191104185818',
    suffix='ccsa',
    experiment_id='ccsa_uneven',
    description='CCSA method with batchnorm and loss_alpha=0.25. Here, we only weight the target softmax (source softmax not used).'
)

reorganize(
    from_date='20191106083058',
    to_date='20191106144631',
    suffix='ccsa',
    experiment_id='ccsa_resnet_uneven',
    description='CCSA method with batchnorm and loss_alpha=0.25. Here, we only weight the target softmax (source softmax not used). ResNet features where used.'
)

reorganize(
    from_date='20191104094606',
    to_date='20191104124943',
    suffix='dsne',
    experiment_id='dsne_even',
    description='DSNE method using both source and target softmax loss'
)

reorganize(
    from_date='20191104152038',
    to_date='20191104182550',
    suffix='dsne',
    experiment_id='dsne_uneven',
    description='DSNE method using only target softmax loss'
)

reorganize(
    from_date='20191105120356',
    to_date='20191105154214',
    suffix='dsne',
    experiment_id='dsne_uneven_large',
    description='DSNE method using only target softmax loss. Larger dimensionality was used for dense and embedding layers'
)

reorganize(
    from_date='20191107085855',
    to_date='20191107131237',
    suffix='dsne',
    experiment_id='dsne_uneven_resnet',
    description='DSNE method using only target softmax loss. ResNet features where used.'
)

reorganize(
    from_date='20191113093959',
    to_date='20191113143542',
    suffix='homebrew',
    experiment_id='dage_uneven_resnet',
    description='DAGE method using only target softmax loss. ResNet features where used.'
)