In [9]:
import os
import pandas as pd 
import numpy as np 

In [41]:
def process_table(boot_file: str, mcmc_file: str, save: bool = True) -> pd.DataFrame:
    """Attach the n(z) sample index to the rows of an MCMC chain.

    The two files are joined on their ``-logL`` value (exact float equality),
    so every chain row receives the ``Index`` of the boot row that carries the
    same log-likelihood. Chain rows without a match in the boot file are
    dropped; they do not shift the indices assigned to the other rows.

    Parameters
    ----------
    boot_file : str
        Path, without the ``.txt`` extension, of the two-column text file
        read as (``Index``, ``-logL``).
    mcmc_file : str
        Path, without the ``.txt`` extension, of the chain file. Its columns
        are read as ``N``, ``-logL`` and then ``p1`` ... ``pK`` for however
        many columns remain.
    save : bool, optional
        When True (default) the result is written to
        ``mcmc_file + '_boot.csv'``.

    Returns
    -------
    pd.DataFrame
        Columns ``Index``, ``N``, ``-logL``, ``p1`` ... ``pK``, in chain
        order, with rows containing NaN removed.
    """
    # ndmin=2 keeps a file with a single row two-dimensional, so the
    # DataFrame constructors below still see one row with many columns.
    boot = np.loadtxt(boot_file + '.txt', ndmin=2)
    mcmc = np.loadtxt(mcmc_file + '.txt', ndmin=2)

    boot_df = pd.DataFrame(boot, columns=['Index', '-logL'])

    # Everything after the first two columns is a sampled parameter.
    n_params = mcmc.shape[1] - 2
    param_names = ['p' + str(i + 1) for i in range(n_params)]
    mcmc_df = pd.DataFrame(mcmc, columns=['N', '-logL'] + param_names)
    mcmc_df['N'] = mcmc_df['N'].astype('int')

    # If the boot file lists the same -logL more than once, keep the first
    # occurrence so that a chain row is never duplicated by the join.
    lookup = boot_df.drop_duplicates(subset='-logL', keep='first')

    # Key-based join: pairs rows by log-likelihood instead of by position.
    # An inner merge preserves the order of the left (chain) rows.
    merged = mcmc_df.merge(lookup, on='-logL', how='inner')
    merged['Index'] = merged['Index'].astype('int')

    final_df = merged[['Index'] + list(mcmc_df.columns)].dropna(axis=0)

    if save:
        final_df.to_csv(mcmc_file + '_boot.csv')

    return final_df

In [42]:
# Directory that holds the chain files.
folder = 'KV-450-Bayes-Random-Set-4000-Long-Chain-Index/'
# os.listdir returns names in arbitrary order; sorting makes the selection and
# ordering of the chains reproducible between runs and machines.
files = np.asarray(sorted(os.listdir(folder)))

In [43]:
# Keep only the '.txt' files whose name contains '__' and strip that extension,
# because process_table appends '.txt' itself.  Requiring the extension skips
# other artefacts in the folder, e.g. the '<chain>_boot.csv' files that
# process_table writes next to each chain when save=True.
mcmc_files = [
    str(name)[:-len('.txt')]
    for name in files
    if '__' in name and str(name).endswith('.txt')
]

In [44]:
# Build one table per chain for the first eight chain files, pairing each with
# the shared 'mp_output' boot file; nothing is written to disk here.
all_df = []

for i in range(8):
    chain_path = folder + mcmc_files[i]
    df = process_table('mp_output', chain_path, save=False)
    all_df += [df]

In [48]:
# Stack the per-chain tables row-wise; reset_index() keeps each row's previous
# per-chain position in a new 'index' column.
samples = pd.concat(all_df, axis=0).reset_index()

In [60]:
# Partition the samples into four consecutive blocks of 1000 'Index' values
# (both bounds inclusive).
index_col = samples['Index']
set_2 = samples.loc[(index_col >= 0) & (index_col <= 999)]
set_3 = samples.loc[(index_col >= 1000) & (index_col <= 1999)]
set_4 = samples.loc[(index_col >= 2000) & (index_col <= 2999)]
set_5 = samples.loc[(index_col >= 3000) & (index_col <= 3999)]

In [62]:
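# Optional export (disabled): uncomment to write the combined samples and the
# four index subsets to CSV files.
# samples.to_csv('all_samples.csv')
# set_2.to_csv('set_2.csv')
# set_3.to_csv('set_3.csv')
# set_4.to_csv('set_4.csv')
# set_5.to_csv('set_5.csv')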