#### Analyse Experiments

In [1]:
import os
import re
import glob

import pandas as pd



def parse_output(fpath, splits):
    """Parse one experiment log file into its arguments and accuracies.

    Two kinds of lines are recognised:

    * ``key : value`` lines are stored as raw strings in the argument dict
      (lines whose key is ``WARNING`` are ignored);
    * every ``[split] acc <number>`` occurrence updates the accuracies of
      ``split`` when ``split`` is listed in ``splits``.

    Parameters
    ----------
    fpath : str
        Path of the log file to read.
    splits : iterable of str
        Names of the splits whose accuracies should be tracked.

    Returns
    -------
    tuple of (dict, dict)
        ``(args_dict, results_dict)``. ``args_dict`` maps argument names to
        their unparsed string values. ``results_dict`` has the two keys
        ``"best_acc"`` and ``"last_acc"``, each mapping a split name to the
        highest / most recent accuracy found. Splits that never appear in the
        file keep the initial value ``0``.
    """
    args_dict = {}
    results_dict = {
        "best_acc": dict.fromkeys(splits, 0),
        "last_acc": dict.fromkeys(splits, 0),
    }

    arg_pattern = re.compile(r"^(\S+)\s*:\s+(.+)$")
    # Capture a well-formed decimal only. The previous ``[\d.]+`` also swallowed
    # trailing punctuation ("0.75.") and made float() raise ValueError.
    result_pattern = re.compile(r"\[(\w+)\]\s+acc\s+(\d+\.?\d*|\.\d+)")

    with open(fpath) as f:
        # Stream the file line by line; there is no need to hold it in memory.
        for line in f:
            arg_match = arg_pattern.match(line)
            if arg_match:
                k, v = arg_match.groups()
                if k != "WARNING":
                    args_dict[k] = v

            # A single line may report several splits, hence findall().
            for split, acc in result_pattern.findall(line):
                if split not in splits:
                    continue
                acc = float(acc)
                if acc > results_dict["best_acc"][split]:
                    results_dict["best_acc"][split] = acc
                results_dict["last_acc"][split] = acc

    return args_dict, results_dict



def collect_results(folder, splits, args_to_select, pattern="e_*.out"):
    """Collect one row per experiment log found in ``folder``.

    Parameters
    ----------
    folder : str
        Directory that contains the log files.
    splits : iterable of str
        Split names forwarded to ``parse_output``; each one produces the
        columns ``<split>_best_acc`` and ``<split>_last_acc``.
    args_to_select : dict
        Maps the name of a logged argument to the name of the type it should
        be converted to: ``"str"``, ``"int"`` or ``"float"``.
    pattern : str, optional
        Glob pattern, relative to ``folder``, selecting the log files.

    Returns
    -------
    pandas.DataFrame
        One row per file with the selected arguments, the per-split
        accuracies and an integer ``id`` column taken from the file name.

    Raises
    ------
    ValueError
        If a matched file is not named like ``e_<digits>.out``, so that no
        experiment id can be extracted.
    KeyError
        If ``args_to_select`` names a type other than the three supported.
    """
    type_map = {
        "str": str,
        "int": int,
        "float": float,
    }

    rows = []

    # glob returns files in an unspecified order; sort for reproducible rows.
    for fpath in sorted(glob.glob(os.path.join(folder, pattern))):
        # Only look at the file name so that a directory component can never
        # be mistaken for the experiment id.
        id_match = re.search(r"e_(\d+)\.out", os.path.basename(fpath))
        if id_match is None:
            raise ValueError(
                f"Cannot extract an experiment id from {fpath!r}: "
                "expected a file name like 'e_<digits>.out'"
            )

        args, results = parse_output(fpath, splits)
        row = {}

        for arg, arg_type in args_to_select.items():
            convert = type_map[arg_type]
            raw_value = args.get(arg)
            # An argument absent from the log becomes a missing value instead
            # of crashing in int()/float() or turning into the string "None".
            row[arg] = None if raw_value is None else convert(raw_value)

        for split in splits:
            row[f"{split}_best_acc"] = results["best_acc"].get(split, None)
            row[f"{split}_last_acc"] = results["last_acc"].get(split, None)

        row["id"] = int(id_match.group(1))
        rows.append(row)

    return pd.DataFrame(rows)

In [2]:
# Logs stored under the `baseline` folder; three splits are tracked.
folder1 = "/home/ym2380/elastic_net/official/logs/exp11/shallow_net/config256/baseline"
splits1 = ["train", "val1", "val2"]
args_to_select1 = dict(exp_name="str", lr="float", batch_size="int")

df1 = collect_results(
    folder=folder1,
    splits=splits1,
    args_to_select=args_to_select1,
)

In [3]:
# Logs stored under the `finetune/baseline` folder; two splits are tracked.
folder2 = "/home/ym2380/elastic_net/official/logs/exp11/shallow_net/config256/finetune/baseline"
splits2 = ["train", "val"]
args_to_select2 = dict(filename="str", lr="float")

df2 = collect_results(
    folder=folder2,
    splits=splits2,
    args_to_select=args_to_select2,
)

In [4]:
# Logs stored under the `reg` folder; `alpha` and `rho` are selected in
# addition to the arguments used for the baseline runs.
folder3 = "/home/ym2380/elastic_net/official/logs/exp11/shallow_net/config256/reg"
splits3 = ["train", "val1", "val2"]
args_to_select3 = dict(
    exp_name="str",
    lr="float",
    batch_size="int",
    alpha="float",
    rho="float",
)

df3 = collect_results(
    folder=folder3,
    splits=splits3,
    args_to_select=args_to_select3,
)

In [5]:
# Logs stored under the `finetune/reg` folder; two splits are tracked.
folder4 = "/home/ym2380/elastic_net/official/logs/exp11/shallow_net/config256/finetune/reg"
splits4 = ["train", "val"]
args_to_select4 = dict(filename="str", lr="float")

df4 = collect_results(
    folder=folder4,
    splits=splits4,
    args_to_select=args_to_select4,
)

In [6]:
def filter_df(df, filter_conditions):
    """Return the rows of ``df`` whose columns equal the requested values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    filter_conditions : dict
        Maps a column name to the value that column must equal. All
        conditions are combined with a logical AND; an empty dict keeps
        every row.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with the columns and index labels of ``df``.

    Raises
    ------
    KeyError
        If a key of ``filter_conditions`` is not a column of ``df``.
    """
    # Broadcasting the scalar guarantees a boolean mask even for an empty
    # frame. ``pd.Series([True] * 0)`` is not boolean, which made ``df[mask]``
    # select columns (and drop all of them) instead of rows.
    condition = pd.Series(True, index=df.index)

    for column, value in filter_conditions.items():
        condition = condition & (df[column] == value)

    return df[condition]


def filter4max(df, group_col="filename", metric_col="val_last_acc"):
    """Keep, for every group, the row with the highest metric value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``group_col`` and ``metric_col``.
    group_col : str, optional
        Column whose distinct values define the groups.
    metric_col : str, optional
        Column that is maximised within each group.

    Returns
    -------
    pandas.DataFrame
        One row per group, looked up in ``df`` by index label, so the
        original index labels are preserved. Because the lookup is by label,
        duplicated index labels in ``df`` would return every row sharing the
        selected label.
    """
    # idxmax yields, per group, the index label of the first maximal row.
    best_row_labels = df.groupby(group_col)[metric_col].idxmax()
    return df.loc[best_row_labels]

In [13]:
# Baseline processing: keep the val2 metrics of `df1` and join them with the
# best row per `filename` of `df2`.
temp_df1 = df1.drop(
    columns=["train_best_acc", "train_last_acc", "val1_best_acc", "val1_last_acc"]
)
temp_df2 = filter4max(
    df2.drop(columns=["train_best_acc", "train_last_acc", "id", "lr"])
)
baseline_df = (
    pd.merge(temp_df1, temp_df2, left_on="exp_name", right_on="filename")
    .drop(columns=["filename"])
    .sort_values(by="id", ascending=True)  # order rows by experiment id
    .drop(columns=["id"])  # the id was only needed for ordering
)
baseline_df

Unnamed: 0,exp_name,lr,batch_size,val2_best_acc,val2_last_acc,val_best_acc,val_last_acc
0,11021233_0,1.0,50,0.5435,0.5395,0.656,0.6
7,11021233_1,1.0,100,0.6085,0.5845,0.658,0.642
5,11021233_2,0.1,50,0.623,0.606,0.723,0.709
4,11021233_3,0.1,100,0.649,0.645,0.821,0.82
11,11021233_4,0.01,50,0.632,0.6295,0.898,0.888
10,11021233_5,0.01,100,0.624,0.6205,0.934,0.932
2,11021233_6,0.001,50,0.5925,0.5925,0.714,0.712
1,11021233_7,0.001,100,0.5955,0.595,0.817,0.817
13,11021233_8,0.0001,50,0.5175,0.5155,0.684,0.682
6,11021233_9,0.0001,100,0.513,0.5125,0.707,0.689


In [8]:
# Regularization experiments processing: keep the val2 metrics of `df3` and
# join them with the best row per `filename` of `df4`.
temp_df3 = df3.drop(
    columns=["train_best_acc", "train_last_acc", "val1_best_acc", "val1_last_acc"]
)
temp_df4 = filter4max(
    df4.drop(columns=["train_best_acc", "train_last_acc", "id", "lr"])
)
reg_df = (
    pd.merge(temp_df3, temp_df4, left_on="exp_name", right_on="filename")
    .drop(columns=["filename"])
    .sort_values(by="id", ascending=True)  # order rows by experiment id
    .drop(columns=["id"])  # the id was only needed for ordering
)
reg_df

Unnamed: 0,exp_name,lr,batch_size,alpha,rho,val2_best_acc,val2_last_acc,val_best_acc,val_last_acc
47,11021239_0,1.000000,50,1.000000e-02,1.0,0.5635,0.5040,0.511,0.511
294,11021239_1,1.000000,50,1.000000e-02,0.8,0.5710,0.5155,0.547,0.546
233,11021239_2,1.000000,50,1.000000e-02,0.6,0.5505,0.4990,0.519,0.518
181,11021239_3,1.000000,50,1.000000e-02,0.4,0.5225,0.5045,0.519,0.519
435,11021239_4,1.000000,50,1.000000e-02,0.2,0.5645,0.4920,0.518,0.512
...,...,...,...,...,...,...,...,...,...
115,11021243_499,0.000001,100,1.000000e-07,0.8,0.5000,0.5000,0.932,0.917
166,11021244_500,0.000001,100,1.000000e-07,0.6,0.5000,0.5000,0.920,0.920
254,11021244_501,0.000001,100,1.000000e-07,0.4,0.5000,0.5000,0.878,0.861
137,11021244_502,0.000001,100,1.000000e-07,0.2,0.5000,0.5000,0.854,0.844


In [41]:
# Column values a row must match to be kept in both summary tables.
conditions = dict(lr=0.0001, batch_size=50)

filtered_baseline_df = filter_df(df=baseline_df, filter_conditions=conditions)
filtered_reg_df = filter_df(df=reg_df, filter_conditions=conditions)

# Disabled: would additionally drop the rows whose alpha is 1e-7 or 0.01.
#filtered_reg_df = filtered_reg_df[~filtered_reg_df['alpha'].isin([1e-7, 0.01])]

In [42]:
# Display the baseline rows that match `conditions`.
filtered_baseline_df

Unnamed: 0,exp_name,lr,batch_size,val2_best_acc,val2_last_acc,val_best_acc,val_last_acc
13,11021233_8,0.0001,50,0.5175,0.5155,0.684,0.682


In [43]:
# Display the regularization rows that match `conditions`.
filtered_reg_df

Unnamed: 0,exp_name,lr,batch_size,alpha,rho,val2_best_acc,val2_last_acc,val_best_acc,val_last_acc
46,11021240_288,0.0001,50,0.01,1.0,0.5305,0.526,0.987,0.925
194,11021239_289,0.0001,50,0.01,0.8,0.5555,0.555,0.901,0.885
310,11021239_290,0.0001,50,0.01,0.6,0.5625,0.5355,0.941,0.856
239,11021239_291,0.0001,50,0.01,0.4,0.5695,0.5625,0.871,0.684
213,11021239_292,0.0001,50,0.01,0.2,0.569,0.569,0.697,0.675
114,11021239_293,0.0001,50,0.01,0.0,0.567,0.563,0.74,0.695
168,11021239_294,0.0001,50,0.001,1.0,0.5845,0.495,0.68,0.673
454,11021239_295,0.0001,50,0.001,0.8,0.5525,0.4935,0.664,0.619
470,11021239_296,0.0001,50,0.001,0.6,0.531,0.5245,0.639,0.623
273,11021240_297,0.0001,50,0.001,0.4,0.534,0.534,0.661,0.66
