#### Analyse Experiments

In [1]:
import os
import re
import glob

import pandas as pd

# Cap the number of rows pandas renders per DataFrame; longer frames are shown
# truncated in cell output. This is a global, session-wide display setting.
pd.set_option("display.max_rows", 30)



def parse_output(fpath, splits):
    """Extract logged arguments and per-split accuracies from one log file.

    Two kinds of lines are recognised:

    * ``<key> : <value>`` lines are stored as string-valued arguments, except
      when the key is exactly ``WARNING``.
    * ``[<split>] acc <number>`` fragments (several may share a line) are read
      as accuracies; only splits listed in ``splits`` are tracked.

    Parameters
    ----------
    fpath : str
        Path of the log file to read.
    splits : iterable of str
        Split names to track.

    Returns
    -------
    tuple of (dict, dict)
        ``(args, results)``. ``args`` maps argument names to their raw string
        values. ``results`` has the keys ``"best_acc"`` and ``"last_acc"``,
        each mapping every requested split to a number; a split that never
        appears in the file keeps the initial value ``0``.
    """
    key_value_re = re.compile(r"^(\S+)\s*:\s+(.+)$")
    accuracy_re = re.compile(r"\[(\w+)\]\s+acc\s+([\d.]+)")

    parsed_args = {}
    best_by_split = dict.fromkeys(splits, 0)
    last_by_split = dict.fromkeys(splits, 0)

    with open(fpath) as handle:
        for text in handle:
            kv_match = key_value_re.match(text)
            if kv_match is not None:
                key, value = kv_match.group(1), kv_match.group(2)
                if key != "WARNING":
                    parsed_args[key] = value

            # A single line may report several splits at once.
            for split_name, raw_acc in accuracy_re.findall(text):
                if split_name not in splits:
                    continue
                accuracy = float(raw_acc)
                best_by_split[split_name] = max(best_by_split[split_name], accuracy)
                last_by_split[split_name] = accuracy

    return parsed_args, {"best_acc": best_by_split, "last_acc": last_by_split}



def collect_results(folder, splits, args_to_select, pattern="e_*.out"):
    """Build a DataFrame with one row per experiment log found in ``folder``.

    Every file matching ``pattern`` inside ``folder`` is parsed with
    ``parse_output``. Each row holds the selected arguments (converted to the
    requested type), ``<split>_best_acc`` and ``<split>_last_acc`` for every
    split, and the numeric ``id`` taken from the file name (``e_<id>.out``).

    Parameters
    ----------
    folder : str
        Directory containing the log files.
    splits : iterable of str
        Split names forwarded to ``parse_output``.
    args_to_select : dict
        Maps an argument name to one of ``"str"``, ``"int"`` or ``"float"``,
        the type its logged value is converted to.
    pattern : str, optional
        Glob pattern (relative to ``folder``) selecting the log files.

    Returns
    -------
    pandas.DataFrame
        One row per log file, in sorted path order. Empty if nothing matches.

    Raises
    ------
    ValueError
        If a matched file name does not contain ``e_<digits>.out``, or if a
        logged value cannot be converted to the requested type.
    KeyError
        If ``args_to_select`` names a type other than str/int/float.
    """
    type_map = {
        "str": str,
        "int": int,
        "float": float,
    }

    rows = []

    # glob returns paths in arbitrary order; sort so that row order (and hence
    # the DataFrame index) is reproducible across runs and machines.
    for fpath in sorted(glob.glob(os.path.join(folder, pattern))):
        # Match on the file name only, so a directory component can never be
        # mistaken for the experiment id.
        id_match = re.search(r"e_(\d+)\.out", os.path.basename(fpath))
        if id_match is None:
            raise ValueError(
                f"Cannot extract an experiment id (expected 'e_<number>.out') from {fpath!r}"
            )

        args, results = parse_output(fpath, splits)

        row = dict()
        for arg, arg_type in args_to_select.items():
            raw_value = args.get(arg)
            # An argument absent from the log stays missing (None) instead of
            # crashing int()/float() or turning into the string "None".
            row[arg] = None if raw_value is None else type_map[arg_type](raw_value)

        for split in splits:
            row[f"{split}_best_acc"] = results["best_acc"].get(split, None)
            row[f"{split}_last_acc"] = results["last_acc"].get(split, None)

        row["id"] = int(id_match.group(1))
        rows.append(row)

    return pd.DataFrame(rows)

In [2]:
# Logs in the `baseline` folder; each one reports train / val1 / val2 accuracy.
folder1 = "/home/ym2380/elastic_net/official/logs/exp11/shallow_net/config1024/baseline"
splits1 = ["train", "val1", "val2"]
args_to_select1 = {
    "exp_name": "str",
    "lr": "float",
    "batch_size": "int",
}

df1 = collect_results(
    folder=folder1,
    splits=splits1,
    args_to_select=args_to_select1,
)

In [3]:
# Logs in the `finetune/baseline` folder; each one reports train / val accuracy.
folder2 = "/home/ym2380/elastic_net/official/logs/exp11/shallow_net/config1024/finetune/baseline"
splits2 = ["train", "val"]
args_to_select2 = {
    "filename": "str",
    "lr": "float",
}

df2 = collect_results(
    folder=folder2,
    splits=splits2,
    args_to_select=args_to_select2,
)

In [4]:
# Logs in the `reg` folder; these additionally record `alpha` and `rho`.
folder3 = "/home/ym2380/elastic_net/official/logs/exp11/shallow_net/config1024/reg"
splits3 = ["train", "val1", "val2"]
args_to_select3 = {
    "exp_name": "str",
    "lr": "float",
    "batch_size": "int",
    "alpha": "float",
    "rho": "float",
}

df3 = collect_results(
    folder=folder3,
    splits=splits3,
    args_to_select=args_to_select3,
)

In [5]:
# Logs in the `finetune/reg` folder; each one reports train / val accuracy.
folder4 = "/home/ym2380/elastic_net/official/logs/exp11/shallow_net/config1024/finetune/reg"
splits4 = ["train", "val"]
args_to_select4 = {
    "filename": "str",
    "lr": "float",
}

df4 = collect_results(
    folder=folder4,
    splits=splits4,
    args_to_select=args_to_select4,
)

In [6]:
def filter_df(df, filter_conditions):
    """Return the rows of ``df`` that satisfy every equality condition.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    filter_conditions : dict
        Maps a column name to the value that column must equal (``==``).
        An empty dict keeps every row.

    Returns
    -------
    pandas.DataFrame
        The rows for which all conditions hold, with their original index.
    """
    # Start from an all-True mask and narrow it one column at a time.
    keep = pd.Series([True] * len(df), index=df.index)

    for column in filter_conditions:
        wanted = filter_conditions[column]
        column_matches = df[column] == wanted
        keep = keep & column_matches

    return df[keep]


def filter4max(df, group_col="filename", metric_col="val_last_acc"):
    """Keep, for every group, the row with the highest metric value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the grouping column(s) and ``metric_col``.
    group_col : str or list of str, optional
        Column name(s) to group by. Defaults to ``"filename"``.
    metric_col : str, optional
        Column whose maximum selects the row within each group. Defaults to
        ``"val_last_acc"``.

    Returns
    -------
    pandas.DataFrame
        The selected rows, looked up by index label, so original labels are
        kept. NOTE(review): this assumes the index labels of ``df`` are
        unique; with duplicated labels ``.loc`` would return every row
        sharing the selected label.
    """
    group_keys = [group_col] if isinstance(group_col, str) else list(group_col)

    # idxmax yields, per group, the index label of the row holding the maximum.
    best_row_labels = df.groupby(group_keys)[metric_col].idxmax()

    return df.loc[best_row_labels]

In [7]:
# baseline processing
# Keep only the val2 columns from the first-stage runs; `id` stays for sorting.
temp_df1 = df1.drop(columns=["train_best_acc", "train_last_acc", "val1_best_acc", "val1_last_acc"])
# For each `filename`, keep the finetune run with the highest `val_last_acc`.
temp_df2 = filter4max(df2.drop(columns=["train_best_acc", "train_last_acc", "id", "lr"]))
# Join on experiment name, order by experiment id, then drop the helper columns.
baseline_df = (
    temp_df1.merge(temp_df2, left_on="exp_name", right_on="filename")
    .drop(columns=["filename"])
    .sort_values(by="id", ascending=True)
    .drop(columns=["id"])
)
baseline_df

Unnamed: 0,exp_name,lr,batch_size,val2_best_acc,val2_last_acc,val_best_acc,val_last_acc
1,11031030_0,0.1,25,0.6495,0.649,0.822,0.766
10,11031030_1,0.1,50,0.5525,0.5415,0.849,0.803
8,11031030_2,0.1,100,0.6085,0.6005,0.878,0.878
6,11031030_3,0.01,25,0.598,0.5955,0.987,0.977
15,11031030_4,0.01,50,0.586,0.5845,0.995,0.995
13,11031030_5,0.01,100,0.549,0.5485,0.961,0.906
4,11031030_6,0.001,25,0.628,0.6275,0.896,0.877
2,11031030_7,0.001,50,0.5425,0.5405,0.9,0.88
17,11031030_8,0.001,100,0.58,0.5795,0.913,0.911
9,11031030_9,0.0001,25,0.599,0.599,0.775,0.745


In [8]:
# regularization experiments processing
# Drop the train accuracies; `id` stays for sorting.
temp_df3 = df3.drop(columns=["train_best_acc", "train_last_acc"])
# For each `filename`, keep the finetune run with the highest `val_last_acc`.
temp_df4 = filter4max(df4.drop(columns=["train_best_acc", "train_last_acc", "id", "lr"]))
# Join on experiment name, order by experiment id, then drop the helper columns.
reg_df = (
    temp_df3.merge(temp_df4, left_on="exp_name", right_on="filename")
    .drop(columns=["filename"])
    .sort_values(by="id", ascending=True)
    .drop(columns=["id"])
)
reg_df

Unnamed: 0,exp_name,lr,batch_size,alpha,rho,val1_best_acc,val1_last_acc,val2_best_acc,val2_last_acc,val_best_acc,val_last_acc
57,11031038_0,0.100000,25,1.000000,1.00,0.5010,0.5000,0.5075,0.5000,0.612,0.549
361,11031038_1,0.100000,25,1.000000,0.75,0.5040,0.4985,0.5565,0.4945,0.611,0.561
288,11031038_2,0.100000,25,1.000000,0.50,0.5015,0.5010,0.5035,0.5010,0.566,0.535
223,11031038_3,0.100000,25,1.000000,0.25,0.5035,0.5030,0.5165,0.5155,0.573,0.568
537,11031038_4,0.100000,25,1.000000,0.00,0.5000,0.5000,0.5035,0.5000,0.521,0.508
...,...,...,...,...,...,...,...,...,...,...,...
412,11031113_625,0.000001,100,0.000001,1.00,0.9770,0.9735,0.6000,0.5430,0.963,0.928
187,11031113_626,0.000001,100,0.000001,0.75,0.9815,0.9750,0.6160,0.5640,0.981,0.972
591,11031113_627,0.000001,100,0.000001,0.50,0.9720,0.9685,0.5000,0.4830,0.975,0.957
602,11031113_628,0.000001,100,0.000001,0.25,0.9820,0.9820,0.6020,0.5540,0.988,0.970


In [12]:
# Compare baseline and regularized runs at one fixed (lr, batch_size) setting.
conditions = {
    "lr": 0.001,
    "batch_size": 25,
}

filtered_baseline_df = filter_df(df=baseline_df, filter_conditions=conditions)
filtered_reg_df = filter_df(df=reg_df, filter_conditions=conditions)

In [13]:
# Display the baseline row(s) that match `conditions`.
filtered_baseline_df

Unnamed: 0,exp_name,lr,batch_size,val2_best_acc,val2_last_acc,val_best_acc,val_last_acc
4,11031030_6,0.001,25,0.628,0.6275,0.896,0.877


In [14]:
# Lift the row-display cap (set to 30 at the top of the notebook) so every
# matching regularized run is rendered.
# NOTE(review): pd.set_option is global, so the cap stays lifted for all later
# cells too; consider pd.option_context if that is not intended.
pd.set_option("display.max_rows", None)
filtered_reg_df

Unnamed: 0,exp_name,lr,batch_size,alpha,rho,val1_best_acc,val1_last_acc,val2_best_acc,val2_last_acc,val_best_acc,val_last_acc
178,11031041_210,0.001,25,1.0,1.0,0.502,0.5,0.519,0.5,0.551,0.519
258,11031041_211,0.001,25,1.0,0.75,0.5,0.5,0.5,0.5,0.546,0.504
281,11031041_212,0.001,25,1.0,0.5,0.5025,0.5,0.5105,0.5,0.637,0.578
291,11031041_213,0.001,25,1.0,0.25,0.503,0.5,0.5155,0.5,0.56,0.541
225,11031041_214,0.001,25,1.0,0.0,0.5,0.5,0.5,0.5,0.566,0.566
363,11031041_215,0.001,25,0.1,1.0,0.5,0.5,0.5105,0.5,0.555,0.54
416,11031041_216,0.001,25,0.1,0.75,0.5005,0.5,0.518,0.5005,0.627,0.562
385,11031041_217,0.001,25,0.1,0.5,0.5,0.5,0.502,0.5,0.588,0.563
425,11031041_218,0.001,25,0.1,0.25,0.5015,0.5,0.5285,0.5,0.592,0.544
100,11031041_219,0.001,25,0.1,0.0,0.5,0.5,0.5045,0.5,0.539,0.511
