In [1]:
# encoding: utf-8

import numpy as np
import sklearn as sk
import sklearn.linear_model as sklm
import sklearn.metrics as skmt
import matplotlib
matplotlib.use('agg') # so that plt works in command line
import matplotlib.pyplot as plt
import scipy.io as sio
import skimage.io
import h5py
import sys
import os
import gc
import os
import psutil
import re

from optparse import OptionParser

sys.path.append('../Metric/')
sys.path.append('../../Visualization/')
sys.path.append('../../Data_Preprocessing/')
from Metric import *
from Visualization import *
from Data_Extractor import *


parser = OptionParser()
parser.add_option("--dir", dest="dir")
(options, args) = parser.parse_args(["--dir", "./Log/sklearn/"])

log_dir = options.dir

  from ._conv import register_converters as _register_converters


In [2]:
record_list = []
cnt = 0
for filename in os.listdir(log_dir):
    cnt += 1
    print(cnt, " : ", filename)

    # open file
    file = open(log_dir+filename)
    log = file.read().split('\n')
    file.close()

    # initialization
    train_record = {'pos_recall':None, 'pos_pre':None, 'pos_F1':None, 'neg_recall':None}
    val_record   = {'pos_recall':None, 'pos_pre':None, 'pos_F1':None, 'neg_recall':None}
    record = {}
    for n in ['name', 'weight', 'pos', 'epoch', 'norm_T', 'reg_param', 'rand', 'avg_pre', 'bal_acc', 'AUC']:
        record[n] = None
    record['train'] = train_record
    record['val'] = val_record

    # name
    record['name'] = filename
    
    # settings
    record['weight'] = filename.find('weight') > 0
    record['pos']    = int(filename.split('p')[-1].split('_')[0])
    record['epoch']  = int(filename.split('_e')[-1].split('_')[0])
    
    rand = filename.split('r')[-1].split('_')[0]
    if rand == 'None': record['rand']   = np.float('NaN')
    else: record['rand'] = int(rand[0])
        
    if filename[3:].find('G') > 0:
        record['norm_T'] = 'std'

        l = filename[3:].split('G')[-1].split('_')
        record['reg_param'] = float(l[0])
        if record['reg_param'] == 0:
            record['reg_param'] = float('.'.join([l[0], l[1]]))
    elif filename[3:].find('m') > 0:
        record['norm_T'] = 'mean'
        l = filename[3:].split('m')[-1].split('_')
        record['reg_param'] = float(l[0])
        if record['reg_param'] == 0:
            record['reg_param'] = float('.'.join([l[0], l[1]]))
    else:
        record['norm_T'] = 'None'
        record['reg_param'] = 'None'
    print(l)
#     if not (record['weight'] and record['pos'] <= 1 and record['norm_T'] == 'std'): continue
            
    # metric on validation set
    idx = log.index('finish')

    line = log[idx-1].split()
    assert (line[0] == 'balanced_acc')
    record['bal_acc'] = float(line[2])
    record['AUC'] = float(line[5])
    record['avg_pre'] = float(line[8])


    train_idx = log.index("On train set")
    val_idx = log.index("On CV set")
    
    # train set
    train_metric = log[train_idx:val_idx]
    
    line = [s for s in train_metric if 'pos_recall' in s][0].split()
    assert line[0] == 'pos_recall'
    record['train']['pos_recall'] = float(line[-1])

    
    line = [s for s in train_metric if 'pos_precision' in s][0].split()
    assert line[0] == 'pos_precision'
    record['train']['pos_pre'] = float(line[-1])
    
    line = [s for s in train_metric if 'pos_F1' in s][0].split()
    assert line[0] == 'pos_F1'
    record['train']['pos_F1'] = float(line[-1])

    line = [s for s in train_metric if 'neg_recall' in s][0].split()
    assert line[0] == 'neg_recall'
    record['train']['neg_recall'] = float(line[-1])

    # validation set
    val_metric = log[val_idx:]
    
    line = [s for s in val_metric if 'pos_recall' in s][0].split()
    assert line[0] == 'pos_recall'
    record['val']['pos_recall'] = float(line[-1])

    
    line = [s for s in val_metric if 'pos_precision' in s][0].split()
    assert line[0] == 'pos_precision'
    record['val']['pos_pre'] = float(line[-1])
    
    line = [s for s in val_metric if 'pos_F1' in s][0].split()
    assert line[0] == 'pos_F1'
    record['val']['pos_F1'] = float(line[-1])

    line = [s for s in val_metric if 'neg_recall' in s][0].split()
    assert line[0] == 'neg_recall'
    record['val']['neg_recall'] = float(line[-1])

    record_list.append(record)
    print(record)
    print()

1  :  SGD_weight_m0.01_p8_e15_r1
['0.01', 'p8', 'e15', 'r1']
{'name': 'SGD_weight_m0.01_p8_e15_r1', 'weight': True, 'pos': 8, 'epoch': 15, 'norm_T': 'mean', 'reg_param': 0.01, 'rand': 1, 'avg_pre': 0.03169907487438541, 'bal_acc': 0.5007392884240832, 'AUC': 0.5459586335138389, 'train': {'pos_recall': 0.9993473698019153, 'pos_pre': 0.014184954780251696, 'pos_F1': 0.02797285672412595, 'neg_recall': 0.0038014258675150093}, 'val': {'pos_recall': 0.9987473002159827, 'pos_pre': 0.028778867864406612, 'pos_F1': 0.05594566343073394, 'neg_recall': 0.00273127663218382}}

2  :  SGD_weight_m0.01_p0_e15_r1
['0.01', 'p0', 'e15', 'r1']
{'name': 'SGD_weight_m0.01_p0_e15_r1', 'weight': True, 'pos': 0, 'epoch': 15, 'norm_T': 'mean', 'reg_param': 0.01, 'rand': 1, 'avg_pre': 0.07611311778373303, 'bal_acc': 0.6448460311882365, 'AUC': 0.7109264263283681, 'train': {'pos_recall': 0.6843598320871048, 'pos_pre': 0.02512573402542735, 'pos_F1': 0.04847186169811211, 'neg_recall': 0.6191303954318349}, 'val': {'pos_re

In [3]:
sorted_record_list = sorted(record_list, key=lambda r: (r['AUC'], r['avg_pre']),  reverse=True)
sorted_record_list

[{'AUC': 0.7567699381515794,
  'avg_pre': 0.093033113500065,
  'bal_acc': 0.6224495399477878,
  'epoch': 15,
  'name': 'SGD_weight_m5_p1_e15_r1',
  'norm_T': 'mean',
  'pos': 1,
  'rand': 1,
  'reg_param': 5.0,
  'train': {'neg_recall': 0.2771797410076058,
   'pos_F1': 0.03595153813523979,
   'pos_pre': 0.018325945111926013,
   'pos_recall': 0.9407352748261839},
  'val': {'neg_recall': 0.2958278055975195,
   'pos_F1': 0.07371854019104881,
   'pos_pre': 0.0383486232126899,
   'pos_recall': 0.9490712742980562},
  'weight': True},
 {'AUC': 0.7546025723093874,
  'avg_pre': 0.0898026579686897,
  'bal_acc': 0.6336635304124253,
  'epoch': 15,
  'name': 'SGD_weight_m10_p1_e15_r1',
  'norm_T': 'mean',
  'pos': 1,
  'rand': 1,
  'reg_param': 10.0,
  'train': {'neg_recall': 0.30743224363296096,
   'pos_F1': 0.0370236770094833,
   'pos_pre': 0.01888799436335627,
   'pos_recall': 0.9295388954479864},
  'val': {'neg_recall': 0.32702468501491533,
   'pos_F1': 0.07618301001284417,
   'pos_pre': 0.0396

In [7]:
print("%-6s   %-9s   %-9s   %-6s   %-7s   %-6s   %-7s   %-3s %s \\\\ \n" % ('norm_T', 'up_sample', 
                                   'train_acc', 'recall',
                                   'val_acc',   'recall',
                                   'avg_pre', 'AUC', 'name'))
for r in sorted_record_list:
    upsample = r['pos']/64.0
    if upsample >= 0.125: continue
    train_bal_acc = (r['train']['pos_recall'] + r['train']['neg_recall'])/2
    val_bal_acc = (r['val']['pos_recall'] + r['train']['neg_recall'])/2
    print("%-6s & %-9.3f & %-9.3f & %-6.3f & %-7.3f & %-6.3f & %-7.3f & %-3.3f \\\\ %% - %s \n" % (r['norm_T'], upsample, 
                                       train_bal_acc, r['train']['pos_recall'],
                                       val_bal_acc,   r['val']['pos_recall'],
                                       r['avg_pre'], r['AUC'], r['name']))

    assert ((r['val']['pos_recall'] + r['val']['neg_recall'])/2 - r['bal_acc']) < 1e-10


norm_T   up_sample   train_acc   recall   val_acc   recall   avg_pre   AUC name \\ 

mean   & 0.016     & 0.609     & 0.941  & 0.613   & 0.949  & 0.093   & 0.757 \\ % - SGD_weight_m5_p1_e15_r1 

mean   & 0.016     & 0.618     & 0.930  & 0.624   & 0.940  & 0.090   & 0.755 \\ % - SGD_weight_m10_p1_e15_r1 

mean   & 0.016     & 0.608     & 0.942  & 0.611   & 0.948  & 0.091   & 0.750 \\ % - SGD_weight_m5_p1_e15_r0 

mean   & 0.000     & 0.664     & 0.817  & 0.648   & 0.785  & 0.091   & 0.750 \\ % - SGD_weight_m1_p0_e15_r1 

mean   & 0.000     & 0.664     & 0.818  & 0.649   & 0.789  & 0.088   & 0.749 \\ % - SGD_weight_m10_p0_e15_r0 

mean   & 0.000     & 0.664     & 0.811  & 0.645   & 0.773  & 0.090   & 0.749 \\ % - SGD_weight_m5_p0_e15_r0 

mean   & 0.000     & 0.665     & 0.793  & 0.646   & 0.754  & 0.091   & 0.748 \\ % - SGD_weight_m0.5_p0_e15_r1 

mean   & 0.016     & 0.616     & 0.929  & 0.619   & 0.935  & 0.089   & 0.748 \\ % - SGD_weight_m10_p1_e15_r0 

mean   & 0.000     & 0.662    

In [None]:
import re
file = open('./norm_T   up_sample   train_acc   recall   val_acc')
table = file.read()
file.close()

In [None]:
re.sub('- .* \\\\', '\\\\', table)