#### Analyse Experiments

In [46]:
import os
import re
import glob

import pandas as pd



def parse_output(fpath, splits):
    """Parse one experiment log into its arguments and accuracy summary.

    Two kinds of lines are recognised:

    * ``<key> : <value>`` lines are stored as string arguments; a key that
      is exactly ``WARNING`` is ignored.
    * ``[<split>] acc <number>`` fragments (several may share a line) update
      the accuracies of every split listed in ``splits``.

    Args:
        fpath: Path of the log file to read.
        splits: Names of the splits whose accuracies should be tracked.

    Returns:
        A tuple ``(args, results)``. ``args`` maps argument names to their
        raw string values. ``results`` has the keys ``"best_acc"`` and
        ``"last_acc"``, each mapping every requested split to a number; a
        split that never appears in the log keeps the initial value ``0``.
    """
    arg_re = re.compile(r"^(\S+)\s*:\s+(.+)$")
    acc_re = re.compile(r"\[(\w+)\]\s+acc\s+([\d.]+)")

    parsed_args = {}
    best = dict.fromkeys(splits, 0)
    last = dict.fromkeys(splits, 0)

    with open(fpath) as log_file:
        for raw_line in log_file:
            hit = arg_re.match(raw_line)
            if hit is not None and hit.group(1) != "WARNING":
                parsed_args[hit.group(1)] = hit.group(2)

            for split_name, acc_text in acc_re.findall(raw_line):
                if split_name not in splits:
                    continue
                value = float(acc_text)
                # max() keeps the current best on ties, like a strict ">".
                best[split_name] = max(best[split_name], value)
                last[split_name] = value

    return parsed_args, {"best_acc": best, "last_acc": last}



def collect_results(folder, splits, args_to_select, pattern="e_*.out"):
    """Build one DataFrame row per experiment log found in ``folder``.

    Args:
        folder: Directory that contains the log files.
        splits: Split names passed on to ``parse_output``; each one produces
            a ``<split>_best_acc`` and a ``<split>_last_acc`` column.
        args_to_select: Mapping ``argument name -> type name``, where the
            type name is one of ``"str"``, ``"int"`` or ``"float"``. Each
            selected argument becomes a column converted to that type. An
            argument that is absent from a log is stored as ``None``.
        pattern: Glob pattern (relative to ``folder``) selecting the logs.

    Returns:
        A ``pandas.DataFrame`` with the selected arguments, the accuracy
        columns and an integer ``id`` column taken from the ``e_<id>.out``
        file name. Rows follow the sorted order of the matched paths. When
        no file matches, the frame is empty but still has these columns.

    Raises:
        ValueError: If a matched file name does not contain ``e_<digits>.out``.
        KeyError: If ``args_to_select`` names an unknown type.
    """
    type_map = {
        "str": str,
        "int": int,
        "float": float,
    }

    # Declared up front so an empty folder still yields the expected columns.
    # dict.fromkeys() de-duplicates while keeping first-seen order, which is
    # the key order of the row dicts built below.
    columns = list(dict.fromkeys(
        list(args_to_select)
        + [f"{split}_{kind}_acc" for split in splits for kind in ("best", "last")]
        + ["id"]
    ))

    rows = []

    # glob order depends on the file system; sorting makes the row order
    # (and thus any later tie-breaking on it) reproducible.
    for fpath in sorted(glob.glob(os.path.join(folder, pattern))):
        # Check the name before parsing, and only look at the file name so
        # that directory components cannot supply a bogus id.
        id_match = re.search(r"e_(\d+)\.out", os.path.basename(fpath))
        if id_match is None:
            raise ValueError(
                f"Cannot extract an experiment id from {fpath!r}: "
                "expected a file name containing 'e_<digits>.out'"
            )

        args, results = parse_output(fpath, splits)

        row = dict()
        for arg, arg_type in args_to_select.items():
            raw_value = args.get(arg)
            # A missing argument stays missing instead of becoming the string
            # "None" (str) or raising a bare TypeError (int / float).
            row[arg] = None if raw_value is None else type_map[arg_type](raw_value)

        for split in splits:
            row[f"{split}_best_acc"] = results["best_acc"].get(split, None)
            row[f"{split}_last_acc"] = results["last_acc"].get(split, None)

        row["id"] = int(id_match.group(1))
        rows.append(row)

    return pd.DataFrame(rows, columns=columns)

In [54]:
# Logs under bias095/baseline/save: train / val1 / val2 accuracies plus the
# exp_name, lr and batch_size arguments of every run.
folder1 = "/home/ym2380/elastic_net/official/logs/exp09/shallow_net/bias095/baseline/save"
splits1 = ["train", "val1", "val2"]
args_to_select1 = {
    "exp_name": "str",
    "lr": "float",
    "batch_size": "int",
}

df1 = collect_results(folder=folder1, splits=splits1, args_to_select=args_to_select1)

In [71]:
# Logs under finetune/baseline: train / val accuracies plus the filename and
# lr arguments of every run.
folder2 = "/home/ym2380/elastic_net/official/logs/exp09/shallow_net/finetune/baseline"
splits2 = ["train", "val"]
args_to_select2 = {
    "filename": "str",
    "lr": "float",
}

df2 = collect_results(folder=folder2, splits=splits2, args_to_select=args_to_select2)

In [74]:
# Logs under bias095/reg/save: same splits as the baseline logs, with the
# additional alpha and rho arguments.
folder3 = "/home/ym2380/elastic_net/official/logs/exp09/shallow_net/bias095/reg/save"
splits3 = ["train", "val1", "val2"]
args_to_select3 = {
    "exp_name": "str",
    "lr": "float",
    "batch_size": "int",
    "alpha": "float",
    "rho": "float",
}

df3 = collect_results(folder=folder3, splits=splits3, args_to_select=args_to_select3)

In [76]:
# Logs under finetune/reg: train / val accuracies plus the filename and lr
# arguments of every run.
folder4 = "/home/ym2380/elastic_net/official/logs/exp09/shallow_net/finetune/reg"
splits4 = ["train", "val"]
args_to_select4 = {
    "filename": "str",
    "lr": "float",
}

df4 = collect_results(folder=folder4, splits=splits4, args_to_select=args_to_select4)

In [64]:
def filter_df(df, filter_conditions):
    """Return the rows of ``df`` that satisfy every equality condition.

    Args:
        df: DataFrame to filter.
        filter_conditions: Mapping ``column name -> required value``. A row
            is kept only if ``df[column] == value`` holds for every entry.
            An empty mapping keeps all rows.

    Returns:
        A DataFrame holding the matching rows with all original columns.

    Raises:
        KeyError: If a column named in ``filter_conditions`` is not in ``df``.
    """
    # The dtype is spelled out on purpose: a mask built from an empty list
    # has no boolean dtype, so on an empty frame with no conditions
    # ``df[mask]`` would be read as a column selection and drop every column.
    mask = pd.Series(True, index=df.index, dtype=bool)

    for column, value in filter_conditions.items():
        mask = mask & (df[column] == value)

    return df[mask]


def filter4max(df, group_col="filename", value_col="val_last_acc"):
    """Keep, for every group, the row with the largest value.

    Args:
        df: DataFrame to reduce. Its index labels are used to look the
            selected rows up again.
        group_col: Column whose distinct values define the groups.
            Defaults to ``"filename"``.
        value_col: Column that is maximised within each group.
            Defaults to ``"val_last_acc"``.

    Returns:
        A DataFrame with one row per group (all original columns), ordered
        by the sorted group keys as produced by ``groupby``.
    """
    # idxmax yields, per group, the index label of the first maximal row.
    best_labels = df.groupby([group_col])[value_col].idxmax()
    return df.loc[best_labels]

In [81]:
# Baseline processing: join each run in df1 with its best row from df2.
temp_df1 = df1.drop(
    columns=["train_best_acc", "train_last_acc", "val1_best_acc", "val1_last_acc"]
)
# Per filename, keep only the df2 row with the highest val_last_acc.
temp_df2 = filter4max(
    df2.drop(columns=["train_best_acc", "train_last_acc", "id", "lr"])
)
baseline_df = (
    temp_df1
    .merge(temp_df2, left_on="exp_name", right_on="filename")
    .drop(columns=["filename"])
    .sort_values(by="id", ascending=True)
    .drop(columns=["id"])
)
baseline_df

Unnamed: 0,exp_name,lr,batch_size,val2_best_acc,val2_last_acc,val_best_acc,val_last_acc
0,10310830_0,0.1,50,0.6295,0.6295,0.89,0.882
6,10310830_1,0.1,100,0.601,0.5895,0.916,0.873
4,10310830_2,0.01,50,0.6005,0.595,0.934,0.927
3,10310830_3,0.01,100,0.563,0.5595,0.965,0.964
8,10310830_4,0.001,50,0.5975,0.596,0.904,0.888
7,10310830_5,0.001,100,0.5895,0.5895,0.912,0.858
2,10310830_6,0.0001,50,0.5655,0.5645,0.78,0.707
1,10310830_7,0.0001,100,0.541,0.53,0.764,0.754
9,10310830_8,1e-05,50,0.51,0.51,0.679,0.606
5,10310830_9,1e-05,100,0.605,0.526,0.823,0.809


In [82]:
# Regularization experiments processing: join each run in df3 with its best
# row from df4.
temp_df3 = df3.drop(
    columns=["train_best_acc", "train_last_acc", "val1_best_acc", "val1_last_acc"]
)
# Per filename, keep only the df4 row with the highest val_last_acc.
temp_df4 = filter4max(
    df4.drop(columns=["train_best_acc", "train_last_acc", "id", "lr"])
)
reg_df = (
    temp_df3
    .merge(temp_df4, left_on="exp_name", right_on="filename")
    .drop(columns=["filename"])
    .sort_values(by="id", ascending=True)
    .drop(columns=["id"])
)
reg_df

Unnamed: 0,exp_name,lr,batch_size,alpha,rho,val2_best_acc,val2_last_acc,val_best_acc,val_last_acc
19,10310838_0,0.10000,50,0.001000,1.0,0.6280,0.5985,0.633,0.633
113,10310838_1,0.10000,50,0.001000,0.8,0.6550,0.6215,0.648,0.619
88,10310838_2,0.10000,50,0.001000,0.6,0.6420,0.5895,0.706,0.704
67,10310838_3,0.10000,50,0.001000,0.3,0.6395,0.5815,0.713,0.710
170,10310838_4,0.10000,50,0.001000,0.0,0.6520,0.6290,0.669,0.654
...,...,...,...,...,...,...,...,...,...
21,10310838_195,0.00001,100,0.000001,1.0,0.5035,0.5015,0.839,0.783
0,10310838_196,0.00001,100,0.000001,0.8,0.5200,0.5195,0.821,0.782
108,10310838_197,0.00001,100,0.000001,0.6,0.5080,0.5070,0.823,0.778
110,10310838_198,0.00001,100,0.000001,0.3,0.5240,0.5230,0.808,0.709


In [119]:
# Restrict both result tables to a single (lr, batch_size) setting.
conditions = dict(lr=0.001, batch_size=50)

filtered_baseline_df, filtered_reg_df = (
    filter_df(frame, conditions) for frame in (baseline_df, reg_df)
)

In [120]:
# Show the baseline rows that match `conditions`.
filtered_baseline_df

Unnamed: 0,exp_name,lr,batch_size,val2_best_acc,val2_last_acc,val_best_acc,val_last_acc
8,10310830_4,0.001,50,0.5975,0.596,0.904,0.888


In [121]:
# Show the regularization rows that match `conditions`.
filtered_reg_df

Unnamed: 0,exp_name,lr,batch_size,alpha,rho,val2_best_acc,val2_last_acc,val_best_acc,val_last_acc
79,10310838_80,0.001,50,0.001,1.0,0.6025,0.601,0.917,0.876
150,10310838_81,0.001,50,0.001,0.8,0.6075,0.6075,0.964,0.946
30,10310838_82,0.001,50,0.001,0.6,0.6085,0.607,0.95,0.925
92,10310838_83,0.001,50,0.001,0.3,0.587,0.587,0.954,0.95
168,10310838_84,0.001,50,0.001,0.0,0.642,0.638,0.901,0.883
45,10310838_85,0.001,50,0.0001,1.0,0.5465,0.5435,0.92,0.85
109,10310838_86,0.001,50,0.0001,0.8,0.593,0.593,0.983,0.977
185,10310838_87,0.001,50,0.0001,0.6,0.6015,0.6015,0.971,0.965
57,10310838_88,0.001,50,0.0001,0.3,0.6145,0.614,0.947,0.923
126,10310838_89,0.001,50,0.0001,0.0,0.603,0.603,0.972,0.962
