In [251]:
import re
import os

from typing import List, Set

import pandas as pd
import numpy as np

import natsort


In [252]:
# Names of every result file in ./results/csv/ whose name ends in '.csv'.
# A plain suffix test is used instead of a regular expression: the previous
# pattern was written as a non-raw string containing a backslash-dot, which
# is an invalid escape sequence that newer Python versions warn about.
# NOTE: os.listdir returns entries in arbitrary order, so the order of
# csv_list (and therefore csv_list[0]) is not guaranteed between runs.
csv_list: List[str] = [
    file_name
    for file_name in os.listdir('./results/csv/')
    if file_name.endswith('.csv')
]
len(csv_list)

720

In [253]:
def strip_common_substring(csv_element: str) -> str:
    """Return everything in ``csv_element`` that follows its first underscore.

    For a name such as ``'2000_forest_num_estimators_100.csv'`` this drops the
    leading ``'2000'`` token and yields ``'forest_num_estimators_100.csv'``.
    A name containing no underscore yields an empty string.
    """
    _head, _sep, remainder = csv_element.partition('_')
    return remainder

def get_train_size_from_csv(csv_element: str) -> int:
    """Parse the integer that precedes the first underscore of ``csv_element``.

    For ``'2000_forest_num_estimators_100.csv'`` the result is ``2000``.

    Raises:
        ValueError: if the text before the first underscore is not an integer.
    """
    size_token, _sep, _rest = csv_element.partition('_')
    return int(size_token)

In [254]:
# Take the first collected file name as a sample for trying out the helpers below.
csv_elem = csv_list[0]

In [255]:
# Sanity check: the sample file name with everything up to its first underscore removed.
test_substring = strip_common_substring(csv_elem)
test_substring

'forest_num_estimators_100.csv'

In [256]:
# Sanity check: the integer parsed from the sample file name's leading token.
test_train_size = get_train_size_from_csv(csv_elem)
test_train_size

2000

In [257]:
# Distinct file-name suffixes (the part after the train-size prefix), in
# first-seen order. Many files share one suffix, so duplicates are dropped to
# make each group get processed exactly once. A list is materialised (rather
# than a one-shot generator) so cells that iterate over it can be re-run; the
# historical "_gen" name is kept because later cells refer to it.
all_common_substrings_gen: List[str] = list(
    dict.fromkeys(strip_common_substring(csv_name) for csv_name in csv_list)
)

In [258]:
def get_all_common_csv(csv_substring: str) -> List[str]:
    """Return the names in ``csv_list`` that belong to the group ``csv_substring``.

    A file belongs to the group when the part of its name after the first
    underscore equals ``csv_substring`` exactly (the same rule
    ``strip_common_substring`` uses to derive a suffix). An unanchored
    ``in`` test would also pick up files from a different group whose suffix
    merely ends with the requested one (e.g. ``'x_y.csv'`` inside
    ``'1000_z_x_y.csv'``).

    Args:
        csv_substring: suffix shared by every file of one group, including
            the ``.csv`` extension.

    Returns:
        Matching file names in natural sort order, so numeric prefixes are
        ordered by value (1000, 2000, ..., 10000) rather than lexically.
    """
    matching_files = [
        csv_name
        for csv_name in csv_list
        if csv_name.partition('_')[2] == csv_substring
    ]
    return natsort.natsorted(matching_files)
    
# Sanity check: display every file name matching the sample suffix.
get_all_common_csv(test_substring)

['1000_forest_num_estimators_100.csv',
 '2000_forest_num_estimators_100.csv',
 '3000_forest_num_estimators_100.csv',
 '4000_forest_num_estimators_100.csv',
 '5000_forest_num_estimators_100.csv',
 '6000_forest_num_estimators_100.csv',
 '7000_forest_num_estimators_100.csv',
 '8000_forest_num_estimators_100.csv',
 '9000_forest_num_estimators_100.csv',
 '10000_forest_num_estimators_100.csv',
 '11000_forest_num_estimators_100.csv',
 '12000_forest_num_estimators_100.csv',
 '13000_forest_num_estimators_100.csv',
 '14000_forest_num_estimators_100.csv',
 '15000_forest_num_estimators_100.csv',
 '16000_forest_num_estimators_100.csv',
 '17000_forest_num_estimators_100.csv',
 '18000_forest_num_estimators_100.csv',
 '19000_forest_num_estimators_100.csv',
 '20000_forest_num_estimators_100.csv']

In [259]:
def store_combined_dataframe(csv_substring: str) -> pd.DataFrame:
    """Combine the per-train-size result files of one group into a single table.

    Every file returned by ``get_all_common_csv(csv_substring)`` is read from
    ``./results/csv/`` and contributes one row, indexed by the train size
    parsed from its file name. The row holds the ``f1-score`` values of all
    but the last two entries of the report, followed by the ``train time``
    and ``predict time`` values taken from the ``accuracy`` entry.
    (The two dropped entries are presumably the macro/weighted averages --
    TODO confirm against the code that writes the result files.)

    The combined table is written once to
    ``./results/analysis/<csv_substring>``; the directory is created if it is
    missing. When no file matches, nothing is written and an empty frame with
    the expected columns is returned.

    Args:
        csv_substring: suffix shared by the files of one group; also used as
            the output file name.

    Returns:
        The combined DataFrame, one row per input file.
    """
    df_columns: List[str] = [str(label) for label in range(10)] + [
        'accuracy', 'train time', 'predict time',
    ]

    # Collect one single-row frame per file and concatenate once at the end;
    # growing a frame with pd.concat inside the loop is quadratic and
    # concatenating onto an empty frame triggers a FutureWarning in pandas 2.x.
    rows: List[pd.DataFrame] = []
    for file in get_all_common_csv(csv_substring):
        train_size = get_train_size_from_csv(file)

        # After transposing, metrics are rows and report entries are columns.
        report = pd.read_csv(f'./results/csv/{file}', index_col=0).transpose()

        time_df = report.loc[['train time', 'predict time'], 'accuracy'].to_frame().transpose()
        time_df.index = [train_size]

        # Positional slice: keep every entry except the trailing two.
        f1_scores_df = report.loc['f1-score'].iloc[:-2].to_frame().transpose()
        f1_scores_df.index = [train_size]

        rows.append(pd.merge(f1_scores_df, time_df, left_index=True, right_index=True))

    if not rows:
        return pd.DataFrame(columns=df_columns)

    combined = pd.concat(rows)
    # Expected columns first, in their fixed order; any unexpected extras are
    # kept after them rather than silently dropped.
    extra_columns = [column for column in combined.columns if column not in df_columns]
    combined = combined.reindex(columns=df_columns + extra_columns)

    # Write the finished table once instead of rewriting it on every iteration.
    os.makedirs('./results/analysis', exist_ok=True)
    combined.to_csv(f'./results/analysis/{csv_substring}')

    return combined


In [260]:
# Build and store a combined analysis CSV for every suffix in all_common_substrings_gen.
for sub in all_common_substrings_gen:
    store_combined_dataframe(sub)