In [38]:
import os, sys
from glob import glob
from pathlib import Path
import io

import pandas as pd

In [39]:
BASE_DIR = "test/"
output_fnames = glob(f"./{BASE_DIR}/*.out")
output_fnames

['./test\\tpot-nn_config-tpot-base_mushroom_02_1603851187.out',
 './test\\tpot-nn_config-tpot-nn-deep-nogpu_ionosphere_01_1604067099.out',
 './test\\tpot-nn_config-tpot-nn-shallow_spambase_07_1603851197.out']

In [40]:
def parse_single_job(full_path):
    """Pull informative values from job's filename."""
    path = Path(x)
    parts = path.name.split('.')[0].split('_')
    _, conf_type, dset, rep, time = parts

    return (conf_type, dset, rep, time, path.name)

def get_log_line_value(line):
    """Parse a single line from logfile containing a certain value."""
    val = line.split()[-1]
    try:
        val = float(val)
        return val
    finally:
        return val

def parse_log_file(full_path):
    """Get all informative values out of the log file for a job.
    
    ('Informative values' are found in lines beginning with ">>")."""
    log_values = []
    with io.open(full_path, 'r', encoding="utf-8") as fp:
        #log_contents = [x.strip() for x in fp.readlines()]
        for line in fp:
            if line.startswith(">> "):
                log_values.append(line.strip())
                
    start_time = None
    dataset = None
    config_dict_name = None
    start_train = None
    end_train = None
    train_duration = None
    accuracy = None
    pipeline_file = None
        
    for log_value in log_values:
        if log_value.startswith(">> JOB START TIME:"):
            start_time = get_log_line_value(log_value)
        elif log_value.startswith(">> DATASET:"):
            dataset = get_log_line_value(log_value)
        elif log_value.startswith(">> CONFIG DICT NAME:"):
            config_dict_name = get_log_line_value(log_value)
        elif log_value.startswith(">> BEGIN TRAINING AT:"):
            start_train = get_log_line_value(log_value)
        elif log_value.startswith(">> END TRAINING AT:"):
            end_train = get_log_line_value(log_value)
        elif log_value.startswith(">> TRAINING TIME ELAPSED:"):
            train_duration = get_log_line_value(log_value)
        elif log_value.startswith(">> ACCURACY SCORE:"):
            accuracy = get_log_line_value(log_value)
        elif log_value.startswith(">> PIPELINE SAVED TO:"):
            pipeline_file = get_log_line_value(log_value)
    
    return (start_train, end_train, train_duration, accuracy, pipeline_file)

filename_results = pd.DataFrame(
    [parse_single_job(x) for x in output_fnames],
    columns=['conf_type', 'dset', 'rep', 'time_submit', 'log_file']
)

logfile_results = pd.DataFrame(
    [parse_log_file(y) for y in output_fnames],
    columns=['start_train', 'end_train', 'train_duration', 'accuracy', 'pipeline_file']
)

logfile_results

results = pd.concat([filename_results, logfile_results], axis=1)

reorder = [
    'conf_type',
    'dset',
    'rep',
    'accuracy',
    'time_submit',
    'start_train',
    'end_train',
    'train_duration',
    'pipeline_file',
    'log_file'
]

results[reorder]

Unnamed: 0,conf_type,dset,rep,accuracy,time_submit,start_train,end_train,train_duration,pipeline_file,log_file
0,config-tpot-base,mushroom,2,1.0,1603851187,1.61,9094.91,9093.3,pipelines/tpot-nn_config-tpot-base_mushroom_02...,tpot-nn_config-tpot-base_mushroom_02_160385118...
1,config-tpot-base,mushroom,2,,1603851187,1.89,,,,tpot-nn_config-tpot-base_mushroom_02_160385118...
2,config-tpot-base,mushroom,2,0.955483,1603851187,1.56,312698.89,312697.34,pipelines/tpot-nn_config-tpot-nn-shallow_spamb...,tpot-nn_config-tpot-base_mushroom_02_160385118...


In [41]:
results.to_csv("./results_df.csv", sep=',', header=True, index=False, encoding='utf-8')