# Read and combine all the individual hdf5 data files



In [1]:
import pandas as pd
import numpy as np
import os
import h5py
import matplotlib.pyplot as plt

from io import StringIO


In [2]:
# Root folder holding the .h5 result tables
base_path = "../result_tables"

# Collect the HDF5 tables sitting directly in base_path ...
list_of_tables = [entry for entry in os.listdir(base_path) if entry.endswith(".h5")]

# ... followed by those in the WDMS sub-folder, stored with a 'WDMS/' prefix
# so they can later be joined onto base_path like the top-level ones.
wdms_path = os.path.join(base_path, "WDMS")
list_of_tables.extend(
    'WDMS/' + entry for entry in os.listdir(wdms_path) if entry.endswith(".h5")
)


print("List of tables:", list_of_tables)


List of tables: ['WRs_LMC.h5', 'Be_sdOB_table.h5', 'ns_table.h5', 'WRs_SMC.h5', 'example_obs_df_full.h5', 'algols.h5', 'bh_table.h5', 'stripped_star_table.h5', 'example_obs_df.h5', 'contact1.h5', 'BSS_data.h5', 'WDMS/WD_Binary_Pathways_X.h5', 'WDMS/RebassaMansergas2012.h5', 'WDMS/WD_Binary_Pathways_VI.h5', 'WDMS/Zorotovic2010.h5', 'WDMS/Escorza2019.h5', 'WDMS/Jorissen2019.h5', 'WDMS/Shahaf2024.h5']


In [3]:
# Quantities stored in every HDF5 table as an (N, 3) "triplet" array whose
# columns are (lower error, value, upper error).
triplet_cols = ["RA", "Dec", "Period", "Eccentricity", "M1", "M1_sin3i", "M2", "M2_sin3i", "q", "Mass Function"]

# One DataFrame per successfully loaded table
dfs = []

for n, table_name in enumerate(list_of_tables):
    table_dir = base_path + '/' + table_name
    print(f"Processing table {n+1}/{len(list_of_tables)}: {table_dir}")

    new_df = pd.DataFrame()

    try:
        with h5py.File(table_dir, "r") as f:

            # Per-system metadata is stored as a JSON string of records
            metadata_json = f["metadata_json"][()].decode("utf-8")
            metadata_df = pd.read_json(StringIO(metadata_json), orient="records")

            for col in triplet_cols:
                # Read the dataset once, then split it into its three columns
                triplet = f[col][()]
                new_df[col] = triplet[:, 1]
                # Index 2 is the upper error and index 0 the lower error
                # (the same convention plotly_vars uses for error bars).
                new_df[col + '_uperr'] = triplet[:, 2]
                new_df[col + '_loerr'] = triplet[:, 0]

        new_df = pd.concat([metadata_df, new_df], axis=1)
        # Remember which file every row came from
        new_df['table'] = table_name
        dfs.append(new_df)

    except Exception as e:
        # Best effort: a table that is missing datasets or metadata is
        # reported and skipped so the remaining tables still load.
        print(f"Error processing {table_dir}: {e}")
        continue

# Combine once, after all tables have been read. pd.concat raises on an empty
# list, so fall back to an empty frame when nothing could be loaded.
if dfs:
    df_final = pd.concat(dfs)
else:
    print("No tables could be loaded; df_final is empty.")
    df_final = pd.DataFrame()

Processing table 1/18: ../result_tables/WRs_LMC.h5
Processing table 2/18: ../result_tables/Be_sdOB_table.h5
Processing table 3/18: ../result_tables/ns_table.h5
Processing table 4/18: ../result_tables/WRs_SMC.h5
Processing table 5/18: ../result_tables/example_obs_df_full.h5
Processing table 6/18: ../result_tables/algols.h5
Processing table 7/18: ../result_tables/bh_table.h5
Processing table 8/18: ../result_tables/stripped_star_table.h5
Processing table 9/18: ../result_tables/example_obs_df.h5
Error processing ../result_tables/example_obs_df.h5: "Unable to synchronously open object (object 'metadata_json' doesn't exist)"
Processing table 10/18: ../result_tables/contact1.h5
Processing table 11/18: ../result_tables/BSS_data.h5
Processing table 12/18: ../result_tables/WDMS/WD_Binary_Pathways_X.h5
Processing table 13/18: ../result_tables/WDMS/RebassaMansergas2012.h5
Processing table 14/18: ../result_tables/WDMS/WD_Binary_Pathways_VI.h5
Processing table 15/18: ../result_tables/WDMS/Zorotovic2

In [4]:
# # Debugging 

# # Open all the hdf5 files
# with h5py.File('../result_tables/BSS_data.h5', "r") as f:
#     print(f.keys() )


# with h5py.File( "../result_tables/BSS_data.h5", "r") as f:
#     # Load triplet arrays
#     eccentricity = f["Eccentricity"][:]
#     mass1 = f["M1"][:]

#     # Load and parse metadata
#     metadata_json = f["metadata_json"][()].decode("utf-8")
#     metadata_df = pd.read_json(StringIO(metadata_json), orient="records")

# print(metadata_df.keys() )

# print(metadata_df['System Name'])

# print(eccentricity)

In [5]:
# Derived quantity 1 - e, stored with the same <name>_loerr / <name> / <name>_uperr
# column layout as the other quantities; both error columns are filled with zeros.
ecc = df_final['Eccentricity']
zero_err = np.zeros_like(ecc)

df_final['one_minus_eccentricity_loerr'] = zero_err
df_final['one_minus_eccentricity'] = 1 - ecc
df_final['one_minus_eccentricity_uperr'] = zero_err.copy()

# Now plot the data 

In [6]:
import plotly.express as px

def _read_h5_plot_frame(h5_file, col1, col2):
    """Build the plotting DataFrame for col1 and col2 from a single HDF5 table.

    Each quantity is read as an (N, 3) array whose columns are
    (lower error, value, upper error). The returned frame uses the
    ``<col>``, ``<col>_uperr``, ``<col>_loerr`` column naming that the
    DataFrame branch of ``plotly_vars`` expects, plus the 'Type1', 'Type2'
    and 'System Name' metadata columns.
    """
    with h5py.File(h5_file, "r") as f:
        col1_vals = f[col1][()]     # shape (N, 3)
        col2_vals = f[col2][()]     # shape (N, 3)
        metadata_json = f["metadata_json"][()].decode("utf-8")

    metadata_df = pd.read_json(StringIO(metadata_json), orient="records")

    return pd.DataFrame({
        col1: col1_vals[:, 1],
        col1 + '_uperr': col1_vals[:, 2],
        col1 + '_loerr': col1_vals[:, 0],
        col2: col2_vals[:, 1],
        col2 + '_uperr': col2_vals[:, 2],
        col2 + '_loerr': col2_vals[:, 0],
        'Type1': metadata_df['Type1'].values,
        'Type2': metadata_df['Type2'].values,
        # Needed because 'System Name' is shown in the hover box
        'System Name': metadata_df['System Name'].values,
    })


def plotly_vars(col1, col2, logx=False, logy=False, df=None, h5_file=None, exclude_type2=None):
    """Scatter plot of col2 (y-axis) vs. col1 (x-axis) with asymmetric error bars.

    Either ``df`` or ``h5_file`` must be specified; ``h5_file`` takes
    precedence when both are given.

    Parameters
    ----------
    col1, col2 : str
        Names of the quantities on the x- and y-axis. With ``df`` the frame
        must also hold ``<col>_uperr`` and ``<col>_loerr`` columns; with
        ``h5_file`` the file must hold an (N, 3) dataset per quantity.
    logx, logy : bool
        Use a logarithmic x- / y-axis.
    df : pandas.DataFrame, optional
        Table with 'Type1', 'Type2' and 'System Name' columns. It is not
        modified by this function.
    h5_file : str, optional
        Path of a single HDF5 table to plot directly.
    exclude_type2 : optional
        Rows whose 'Type2' equals this value are left out of the plot.
        ``None`` (the default) keeps every row.

    Returns
    -------
    The plotly figure (already shown), or ``None`` when neither ``df`` nor
    ``h5_file`` was given.
    """
    if h5_file is not None:
        plot_df = _read_h5_plot_frame(h5_file, col1, col2)
    elif df is not None:
        plot_df = df
    else:
        print('You need to specify either df or h5_file!')
        return None

    # exclude part of the data (applies to both input kinds)
    if exclude_type2 is not None:
        plot_df = plot_df[plot_df['Type2'] != exclude_type2]

    # Work on a copy so the helper column never leaks into the caller's frame
    plot_df = plot_df.copy()
    plot_df['marker_size'] = np.where(plot_df['Type2'] == 'WD', 1, 5)

    fig = px.scatter(plot_df, x=col1, y=col2, color='Type2', hover_data=['Type1', 'System Name'],
                     error_x=col1 + '_uperr', error_x_minus=col1 + '_loerr',
                     error_y=col2 + '_uperr', error_y_minus=col2 + '_loerr',
                     size='marker_size', size_max=5)

    fig.update_layout(
        width=900,
        height=600,
        xaxis_title_font=dict(size=18),
        yaxis_title_font=dict(size=18),
        legend=dict(font=dict(size=16))
    )

    fig.update_yaxes(tickfont=dict(size=14))
    fig.update_xaxes(tickfont=dict(size=14))
    if logx:
        fig.update_xaxes(type='log')
    if logy:
        fig.update_yaxes(type='log')
    fig.show()

    return fig

In [7]:
# Show the column labels of the combined table
df_final.columns

Index(['System Name', 'Type1', 'Type2', 'Detection Method', 'Reference',
       'Notes', 'RA', 'RA_uperr', 'RA_loerr', 'Dec', 'Dec_uperr', 'Dec_loerr',
       'Period', 'Period_uperr', 'Period_loerr', 'Eccentricity',
       'Eccentricity_uperr', 'Eccentricity_loerr', 'M1', 'M1_uperr',
       'M1_loerr', 'M1_sin3i', 'M1_sin3i_uperr', 'M1_sin3i_loerr', 'M2',
       'M2_uperr', 'M2_loerr', 'M2_sin3i', 'M2_sin3i_uperr', 'M2_sin3i_loerr',
       'q', 'q_uperr', 'q_loerr', 'Mass Function', 'Mass Function_uperr',
       'Mass Function_loerr', 'table', 'one_minus_eccentricity_loerr',
       'one_minus_eccentricity', 'one_minus_eccentricity_uperr'],
      dtype='object')

In [8]:
# M2 against M1 on linear axes (pass exclude_type2='WD' to drop the Type2 == 'WD' rows)
fig = plotly_vars('M1', 'M2', df=df_final)



In [15]:
# Eccentricity against Period, log-scaled x-axis (pass exclude_type2='WD' to drop the Type2 == 'WD' rows)
fig = plotly_vars('Period', 'Eccentricity', logx=True, df=df_final)


In [10]:
# Eccentricity against M1, log-scaled x-axis, without the Type2 == 'NS' rows
fig = plotly_vars('M1', 'Eccentricity', logx=True, df=df_final, exclude_type2='NS')


In [11]:
# Eccentricity against M2 on linear axes, without the Type2 == 'NS' rows
fig = plotly_vars('M2', 'Eccentricity', df=df_final, exclude_type2='NS')


In [12]:
# Period (log-scaled y-axis) against M2, which the original note calls the donor;
# rows with Type2 == 'NS' are left out
fig = plotly_vars('M2', 'Period', logy=True, df=df_final, exclude_type2='NS')


In [13]:
# Period (log-scaled y-axis) against q, all rows included
fig = plotly_vars('q', 'Period', logy=True, df=df_final)


In [14]:
# Eccentricity against q on linear axes, all rows included
fig = plotly_vars('q', 'Eccentricity', df=df_final)
