In [1]:
import pandas as pd
import numpy as np
import os
import re
from collections import defaultdict

In [253]:
# Folder holding the log files to analyse.
# (Alternative location used previously: 'prev_logs/'.)
log_folder = 'logs/random_tickers/'


def _matches_threshold(suffix, threshold):
    """Return True if `threshold` occurs in `suffix` as a complete number.

    A plain substring test would let '0.3' match a file tagged '0.35'.
    The look-arounds reject a match that is glued to further digits, while
    `0*` still accepts zero-padded spellings such as '0.30'.
    """
    pattern = r'(?<![\d.])' + re.escape(threshold) + r'0*(?!\d)'
    return re.search(pattern, suffix) is not None


def agg_by_threshold(thresholds, folder=None):
    """Group the log file names of a folder by threshold.

    The threshold is looked up in the last '_'-separated piece of each
    file name.

    Parameters
    ----------
    thresholds : iterable of str
        Threshold values written the way they appear in the file names,
        e.g. ``['0.3', '0.35']``.
    folder : str, optional
        Directory to list. Defaults to the module-level ``log_folder``.

    Returns
    -------
    list of list of str
        One (sorted) list of file names per threshold, in the order of
        `thresholds`. A threshold with no matching file yields ``[]``.
    """
    if folder is None:
        folder = log_folder
    files = sorted(os.listdir(folder))
    return [
        [file for file in files
         if _matches_threshold(file.split('_')[-1], threshold)]
        for threshold in thresholds
    ]

In [254]:
def create_key(original_line):
    """Build a comma-separated 'name=value' key from a line of text.

    For each hyper-parameter name of interest, the first occurrence of
    ``name=<number>`` in `original_line` is extracted; the pieces are
    joined with ',' in a fixed order.

    Parameters
    ----------
    original_line : str
        Text containing ``name=value`` pairs for every name of interest.

    Returns
    -------
    str
        E.g. ``'block_depth=4,const_factor=2,learning_rate=0.007,linear_dim=5,percentile=0.8'``.

    Raises
    ------
    ValueError
        If one of the expected names does not appear in `original_line`.
    """
    of_interest = ['block_depth',
                   'const_factor',
                   'learning_rate',
                   'linear_dim',
                   'percentile',
                  ]
    result = []
    for item in of_interest:
        # Raw string avoids the invalid '\d' escape; the optional exponent
        # keeps values such as 1e-05 from being cut down to '1'.
        match = re.search(re.escape(item) + r'=[\d.]*(?:[eE][-+]?\d+)?',
                          original_line)
        if match is None:
            raise ValueError(
                'parameter {!r} not found in line: {!r}'.format(item, original_line))
        result.append(match.group(0))
    return ','.join(result)

In [255]:
def _map_cells(frame, func):
    """Apply `func` to every cell of `frame` on both old and new pandas."""
    # DataFrame.map superseded the deprecated DataFrame.applymap in pandas 2.1;
    # older releases only provide applymap.
    cell_map = getattr(frame, 'map', None)
    if cell_map is None:
        cell_map = frame.applymap
    return cell_map(func)


def process_single_log_file(file_path):
    """Parse one log file into a DataFrame of metrics.

    The file is read with ``pd.read_csv`` (so its first line is consumed as
    the header). The last row is dropped, every second remaining row
    (positions 1, 3, 5, ...) is kept, and columns 3..10 are selected.
    Each selected cell is expected to be a ``'name: value'`` string.
    NOTE(review): the kept rows are presumably the evaluation/test lines,
    going by the original variable name -- confirm against the log writer.

    Parameters
    ----------
    file_path : str
        Path of the log file.

    Returns
    -------
    pandas.DataFrame or None
        Columns are named after the text before ':' in the first kept row
        (spaces removed). All columns but the last are converted to float;
        the last one keeps the raw text after ':'. The index holds the row
        positions from the file. ``None`` is returned when no rows/columns
        remain after slicing.
    """
    selected = (pd.read_csv(file_path)
                  .iloc[:-1, 2:]
                  .iloc[1::2]
                  .iloc[:, 1:9])

    # Nothing to parse; bail out before touching individual cells.
    if selected.empty:
        return None

    test_data = _map_cells(selected, lambda x: x.split(':'))

    # Column names come from the label part of the first kept row.
    names = [cell[0].replace(' ', '') for cell in test_data.iloc[0]]

    # Numeric metrics: the text after the last ':' parsed as float.
    dataframe = _map_cells(test_data.iloc[:, :-1], lambda x: float(x[-1]))
    # Last column is kept as text (it is not a single number).
    confusion_matrix_ = test_data.iloc[:, -1].map(lambda x: x[-1])

    dataframe = pd.concat((dataframe, confusion_matrix_), axis=1)
    dataframe.columns = names

    return dataframe

In [256]:
def get_key_and_df(file_name):
    """Return the hyper-parameter key and parsed metrics of one log file.

    Parameters
    ----------
    file_name : str
        Name of a file inside ``log_folder``.

    Returns
    -------
    (str, pandas.DataFrame) or (None, None)
        The key built by ``create_key`` from the file's first line and the
        frame produced by ``process_single_log_file``. ``(None, None)`` is
        returned when the file is empty or yields no data rows.
    """
    # os.path.join works whether or not log_folder ends with a separator.
    file_path = os.path.join(log_folder, file_name)

    with open(file_path, 'r') as f:
        # Only the first line is needed; avoid loading the whole file.
        line = f.readline()

    # An empty file has neither a key nor data.
    if not line:
        return None, None

    key = create_key(line)
    df = process_single_log_file(file_path)

    if df is None:
        return None, None
    return key, df

In [265]:
def get_relevant_key_df(file_name, min_f1=0.01, max_index=60, bce_quantile=0.1):
    """Return the key of a log file and the rows that pass the quality filters.

    A row is kept when all of the following hold:

    * ``F1Score > min_f1``
    * its index label is ``< max_index``
    * ``BCE`` is below the `bce_quantile` quantile of BCE computed over
      *all* rows of the file (not only the rows passing the first two
      filters).

    Parameters
    ----------
    file_name : str
        Name of a log file, forwarded to ``get_key_and_df``.
    min_f1 : float, optional
        Exclusive lower bound on F1Score (default 0.01).
    max_index : int, optional
        Exclusive upper bound on the row index (default 60).
    bce_quantile : float, optional
        Quantile of the file's BCE values used as exclusive upper bound
        (default 0.1).

    Returns
    -------
    (str, pandas.DataFrame) or (None, None)
        ``(None, None)`` when the file has no data or no row passes.
    """
    key, df = get_key_and_df(file_name)
    if df is None:
        return None, None

    # The cut-off is deliberately taken from the unfiltered frame.
    bce_cutoff = df.BCE.quantile(bce_quantile)

    keep = (df.F1Score > min_f1) & (df.index < max_index) & (df.BCE < bce_cutoff)
    relevant_df = df[keep].copy()

    if len(relevant_df) == 0:
        return None, None
    return key, relevant_df

In [266]:
def garner_relevant_dictionary(thresholds=None):
    """Collect the filtered result frames of all log files, grouped by key.

    Parameters
    ----------
    thresholds : list of str, optional
        Threshold tags used to select log files via ``agg_by_threshold``.
        Defaults to the eight values used so far in this notebook.

    Returns
    -------
    collections.defaultdict
        Maps each hyper-parameter key to the list of frames returned by
        ``get_relevant_key_df`` for the files sharing that key. Being a
        defaultdict, indexing a missing key creates an empty list; use
        ``in`` / ``.get`` for look-ups that must not do so.
    """
    if thresholds is None:
        thresholds = ['0.1', '0.2', '0.3', '0.35', '0.65', '0.7', '0.8', '0.9']
    files_by_thresholds = agg_by_threshold(thresholds)

    so_far = defaultdict(list)
    # A file may be listed under more than one threshold group (e.g. when
    # group membership is decided by substring, '0.3' also matches a
    # '0.35' file); process each file only once so it is not double counted.
    seen = set()
    for files in files_by_thresholds:
        for file in files:
            if file in seen:
                continue
            seen.add(file)
            key, df = get_relevant_key_df(file)
            if key is not None:
                so_far[key].append(df)

    return so_far

In [267]:
# Map each hyper-parameter key to the list of filtered result frames found
# in the log folder (see garner_relevant_dictionary).
my_dict = garner_relevant_dictionary()
# iter_my_dict = iter(my_dict.items())

In [268]:
# Collect, for one percentile setting, the first frame stored under each key.
training_results = []
pct = 0.8
statement = 'percentile={}'.format(pct)
for key, df in my_dict.items():
    # NB: `df` is the *list* of frames stored under `key`, not a single frame.
    # Only the first one is used below; the message flags keys with more.
    if len(df) > 1:
        print('key {} has len(df) > 1'.format(key))
    # Compare against whole 'name=value' fields: a substring test would let
    # 'percentile=0.3' also select keys with 'percentile=0.35'.
    # (Append e.g. `and 'block_depth=3' in key` to narrow the selection.)
    if statement in key.split(','):
        training_results.append((key, df[0]))

key block_depth=4,const_factor=2,learning_rate=0.007,linear_dim=5,percentile=0.8 has len(df) > 1


In [269]:
# Rank the selected configurations by the average of their `Mean` column.
# An alternative ranking that was tried weights Mean by F1Score:
#     key=lambda x: (x[1].Mean*x[1].F1Score).mean(), reverse=True
def _average_of_mean_column(entry):
    """Sort key: average of the `Mean` column of a (key, frame) pair."""
    frame = entry[1]
    return frame.Mean.mean()


# Descending order when pct is above 0.5, ascending otherwise.
reverse = pct > 0.5

training_results_sorted = sorted(
    training_results,
    key=_average_of_mean_column,
    reverse=reverse,
)

In [270]:
# List the keys in ranked order, one per line.
for entry in training_results_sorted:
    key, result = entry
    print(key)

block_depth=6,const_factor=2,learning_rate=0.007,linear_dim=3,percentile=0.8
block_depth=4,const_factor=2,learning_rate=0.007,linear_dim=5,percentile=0.8
block_depth=6,const_factor=2,learning_rate=0.007,linear_dim=5,percentile=0.8
block_depth=2,const_factor=2,learning_rate=0.007,linear_dim=3,percentile=0.8
block_depth=4,const_factor=3,learning_rate=0.007,linear_dim=3,percentile=0.8
block_depth=4,const_factor=3,learning_rate=0.007,linear_dim=5,percentile=0.8


In [271]:
# One summary line per ranked configuration: column averages plus the
# number of non-null `Mean` entries.
summary_header = 'F1Score, Mean, Precision, Stdev, Count'
summary_row = '{:.5f}, {:.5f}, {:.5f}, {:.5f}, {}'

print(summary_header)
for key, result in training_results_sorted:
    averaged = (result.F1Score, result.Mean, result.Precision, result.Stdev)
    averages = [column.mean() for column in averaged]
    print(summary_row.format(*averages, result.Mean.count()))

F1Score, Mean, Precision, Stdev, Count
0.01250, 0.01349, 0.52817, 0.03937, 1
0.11725, 0.01048, 0.37994, 0.03550, 1
0.12810, 0.00981, 0.41386, 0.04268, 1
0.04828, 0.00857, 0.44283, 0.03954, 1
0.02071, 0.00839, 0.38789, 0.03839, 2
0.08871, 0.00082, 0.33162, 0.04307, 1


In [272]:
# Show the key and the frame at position `ranks` of the ranking
# (0 is the top entry). The frame is the cell's displayed output.
ranks = 0
selected_key, selected_frame = training_results_sorted[ranks]
print(selected_key)
selected_frame

block_depth=6,const_factor=2,learning_rate=0.007,linear_dim=3,percentile=0.8


Unnamed: 0,BCE,F1Score,ROC_AUC,Precision,Recall,Mean,Stdev,ConfusionMatrix
35,0.50618,0.0125,0.56624,0.52817,0.00633,0.01349,0.03937,[228145 335 58905 375]
