# Test evaluations

In [7]:
from os.path import realpath
from pathlib import Path
import pandas as pd
import itertools

# Locate the directory that holds the training runs.
# The kernel's working directory is tried first; if its `runs` folder is not
# inside the project, the parent directory's `runs` folder is used instead.
# The check is done on the POSIX form of the path so that the forward-slash
# marker also matches on platforms whose native separator is a backslash.
RUNS_DIR_MARKER = 'domain-adaptation/runs'

RUNS_DIR = Path(realpath('.')) / 'runs'
if RUNS_DIR_MARKER not in RUNS_DIR.as_posix():
    RUNS_DIR = Path(realpath('.')).parent / 'runs'
assert RUNS_DIR_MARKER in RUNS_DIR.as_posix(), (
    "could not locate '{}' relative to {}".format(RUNS_DIR_MARKER, realpath('.'))
)

In [10]:
def load_cls_rep_paths(
    suffix:str,
    runs_dir:Path=RUNS_DIR,
    from_date:str='19700101000000',
    to_date:str='30001010000000'
):
    """Collect the `report.json` paths of all matching run directories.

    A direct child of `runs_dir` matches when it is a directory, its name ends
    with `suffix`, and the integer value of the part of its name before the
    first underscore lies within [`from_date`, `to_date`] (both inclusive).

    Args:
        suffix: Required ending of the run directory name.
        runs_dir: Directory whose direct children are scanned.
        from_date: Lower bound as a 14-character numeric string
            (presumably YYYYMMDDhhmmss -- TODO confirm).
        to_date: Upper bound, same format as `from_date`.

    Returns:
        Sorted list of `<run directory>/report.json` paths. The existence of
        the report files is not checked here.

    Raises:
        AssertionError: If either bound is not exactly 14 characters long.
        ValueError: If a matching directory name does not start with an
            integer prefix, or a bound is not numeric.
    """
    assert len(from_date) == 14 and len(to_date) == 14

    report_paths = []
    for run_dir in runs_dir.glob('*'):
        if not run_dir.is_dir():
            continue
        if not run_dir.name.endswith(suffix):
            continue
        # The leading name component is the run's timestamp.
        stamp = int(run_dir.name.split('_')[0])
        if int(from_date) <= stamp <= int(to_date):
            report_paths.append(run_dir / 'report.json')

    report_paths.sort()
    return report_paths

def get_score(
    suffix:str,
    runs_dir:Path=RUNS_DIR,
    metric:str='precision',
    avg_types=['macro avg', 'weighted avg'],
    from_date:str='19700101000000',
    to_date:str='30001010000000',
    map_col_name= lambda n: n
):
    """Build a table with one row per run and one column per averaging type.

    Every report found by `load_cls_rep_paths` is read with `pd.read_json` and
    indexed as `report[avg_type][metric]` (presumably a scikit-learn
    classification report saved as JSON -- TODO confirm).

    Args:
        suffix: Required ending of the run directory names.
        runs_dir: Directory containing the run directories.
        metric: Row label to read from each report, e.g. 'precision'.
        avg_types: Column labels to read from each report.
        from_date: Inclusive lower timestamp bound (14-character string).
        to_date: Inclusive upper timestamp bound (14-character string).
        map_col_name: Callable mapping each entry of `avg_types` to the
            column name used in the result.

    Returns:
        A `pd.DataFrame` with one row per report and one column per entry of
        `avg_types`. It has zero rows when no run matches.
    """
    # Forward the caller's `runs_dir`; the global RUNS_DIR used to be passed
    # here, which silently ignored the parameter.
    report_paths = load_cls_rep_paths(suffix, runs_dir, from_date, to_date)
    reports = [pd.read_json(path) for path in report_paths]

    rows = [[report[avgt][metric] for avgt in avg_types] for report in reports]
    column_names = [map_col_name(avgt) for avgt in avg_types]
    return pd.DataFrame(rows, columns=column_names)

def get_score_combinations(
    suffix:str,
    domains=['A','W','D'],
    runs_dir:Path=RUNS_DIR,
    metric:str='precision',
    avg_types=['macro avg', 'weighted avg'],
    from_date:str='19700101000000',
    to_date:str='30001010000000',
):
    """Gather scores for every ordered pair of distinct domains.

    For each pair (source, target) the runs whose directory name ends with
    `'<source>_<target>_<suffix>'` are loaded through `get_score`, and the
    per-pair tables are concatenated.

    Args:
        suffix: Experiment suffix appended after the two domain names.
        domains: Domain identifiers; all ordered pairs of unequal entries
            are evaluated.
        runs_dir: Directory containing the run directories.
        metric: Metric read from each report, e.g. 'precision'.
        avg_types: Averaging types read from each report.
        from_date: Inclusive lower timestamp bound (14-character string).
        to_date: Inclusive upper timestamp bound (14-character string).

    Returns:
        A `pd.DataFrame` whose columns are named `'<source>-><target>'` when a
        single averaging type is requested, and
        `'<source>-><target> <avg type>'` otherwise. Rows belonging to one
        pair hold NaN in the columns of the other pairs.
    """
    # With several averaging types the bare 'S->T' label would be repeated
    # within one frame; duplicate labels are ambiguous and cannot be aligned
    # by pd.concat, so the averaging type is appended in that case.
    single_avg_type = len(avg_types) == 1

    def make_column_namer(source, target):
        pair_label = '{}->{}'.format(source, target)
        if single_avg_type:
            return lambda n: pair_label
        return lambda n: '{} {}'.format(pair_label, n)

    pairs = [
        (source, target)
        for source, target in itertools.product(domains, repeat=2)
        if source != target
    ]
    scores = [
        get_score(
            suffix='{}_{}_{}'.format(source, target, suffix),
            runs_dir=runs_dir,
            metric=metric,
            avg_types=avg_types,
            from_date=from_date,
            to_date=to_date,
            map_col_name=make_column_namer(source, target),
        )
        for source, target in pairs
    ]
    return pd.concat(scores, sort=False)
    

# get_score(suffix='A_D_tune_source', avg_types=['macro avg']).describe()

# reports = [ pd.read_json(p) for p in load_cls_rep_paths(suffix='A_D_tune_source') ]
# reports[0]

## Tune source only
In this experiment, we fine-tune a VGG16 network pretrained on ImageNet using all available source-domain data.
The target-domain data is used for validation (during training) and for testing.

In [12]:
# Without augmentation: summary statistics of the macro-averaged score
# (default metric) for every source->target pair, restricted to the runs
# whose timestamp lies in the given window. One row per pair.
(
    get_score_combinations(
        suffix='tune_source',
        domains=['A','W','D'],
        avg_types=['macro avg'],
        from_date='20191014123846',
        to_date='20191014162536',
    )
    .describe()
    .transpose()
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A->W,5.0,0.586477,0.016759,0.566144,0.57265,0.592721,0.593719,0.607151
A->D,5.0,0.661192,0.019785,0.639226,0.642724,0.663245,0.679517,0.681248
W->A,5.0,0.456556,0.011358,0.447204,0.452163,0.45321,0.453873,0.476327
W->D,5.0,0.987148,0.004879,0.98141,0.982794,0.988542,0.990161,0.992832
D->A,5.0,0.459564,0.027775,0.423686,0.446732,0.452096,0.487453,0.487856
D->W,5.0,0.915612,0.013746,0.898267,0.90538,0.918105,0.924308,0.931997


In [14]:
# With augmentation: summary statistics of the macro-averaged score
# (default metric) for every source->target pair, restricted to the runs
# whose timestamp lies in the given window. One row per pair.
(
    get_score_combinations(
        suffix='tune_source',
        domains=['A','W','D'],
        avg_types=['macro avg'],
        from_date='20191022100000',
        to_date='20191022200000',
    )
    .describe()
    .transpose()
)

Unnamed: 0,count,unique,top,freq
A->W,0,0,,
A->D,0,0,,
W->A,0,0,,
W->D,0,0,,
D->A,0,0,,
D->W,0,0,,


## Tune source and target

In [149]:
# Summary statistics of the macro-averaged score (default metric) for every
# source->target pair of the 'tune_both' runs; no date window is applied.
(
    get_score_combinations(
        suffix='tune_both',
        domains=['A','W','D'],
        avg_types=['macro avg'],
    )
    .describe()
)

Unnamed: 0,A->W macro avg,A->D macro avg,W->A macro avg,W->D macro avg,D->A macro avg,D->W macro avg
count,0.0,0.0,0.0,0.0,0.0,0.0
unique,0.0,0.0,0.0,0.0,0.0,0.0
top,,,,,,
freq,,,,,,
