In [None]:
from astropy.coordinates import SkyCoord
from astroquery.ned import Ned
import numpy as np
from sklearn.linear_model import HuberRegressor
from scipy.stats import pearsonr, spearmanr
import matplotlib.pyplot as plt
import pandas as pd
import astropy.units as u
from astropy.cosmology import Planck15 as cosmo
from astropy.coordinates import SkyCoord
from astropy.io import fits
from astropy.table import Table
from astropy.table import join
from scipy import stats

In [None]:
def query_cluster_info(object_name):
    """
    Look up a cluster in NED and report its position and best redshift.

    The position is taken from the first row of ``Ned.query_object``. The
    redshift is chosen from the NED "redshifts" table as the entry with the
    smallest absolute published uncertainty, ignoring entries whose
    uncertainty is zero or NaN.

    Returns
    -------
    (ra, dec, best_redshift, best_uncertainty)
        The redshift pair is ``(None, None)`` when no entry has a usable
        uncertainty. All four values are ``None`` when anything in the
        lookup (or in printing the summary) raises.
    """
    try:
        # sky position
        position = Ned.query_object(object_name)
        ra, dec = position['RA'][0], position['DEC'][0]

        # published redshifts and their quoted uncertainties
        z_table = Ned.get_table(object_name, table="redshifts")
        z_err_column = z_table['Published Redshift Uncertainty']
        z_err = np.array(z_err_column)

        # row indices whose uncertainty is neither zero nor NaN
        usable_rows = np.where((z_err != 0) & (~np.isnan(z_err)))[0]

        best_redshift = best_uncertainty = None
        if usable_rows.size > 0:
            # smallest |uncertainty| among the usable rows, mapped back to a table row
            pick = usable_rows[np.argmin(np.abs(z_err[usable_rows]))]
            best_redshift = z_table['Published Redshift'][pick]
            best_uncertainty = z_err_column[pick]

        # summary
        print(f"\nObject: {object_name}")
        print(f"RA: {ra:.5f}, Dec: {dec:.5f}")
        if best_redshift is None:
            print("No valid redshift with uncertainty found.")
        else:
            print(f"Best redshift: {best_redshift:.5f} ± {best_uncertainty:.5f}")

        return ra, dec, best_redshift, best_uncertainty

    except Exception as e:
        print(f"\nObject: {object_name}")
        print(f"Query failed: {e}")
        return None, None, None, None


In [None]:
# Names of the clusters to look up in NED, in query order.
cluster_list = [
    "Abell 1285",
    "Abell 2457",
    "Abell 3822",
    "Abell 1307",
    "Abell 2597",
    "Abell 3827",
    "Abell 1644",
    "Abell 2811",
    "Abell 3911",
    "Abell 1650",
    "Abell 2837",
    "Abell 3921",
    "Abell 1651",
    "Abell 3112",
    "Abell 1750",
    "Abell 3158",
    "Abell 4010",
    "Abell 1837",
    "Abell 3266",
    "Abell 754",
    "Abell 2029",
    "Abell 3558",
    "Abell 780",
    "Abell 2055",
    "Abell 3571",
    "Abell 85",
    "Abell 208",
    "Abell 3667",
    "Abell 2147",
    "Abell 3694",
    "Abell 2420",
    "Abell 3695",
    "Abell 2426",
    "Abell 3814",
    "RXCJ1539.5-8335",
    "RXCJ1558.3-1410",
]


In [None]:
# Query NED once per cluster; each record pairs the cluster name with the
# (ra, dec, redshift, redshift error) tuple returned by query_cluster_info.
_CLUSTER_COLUMNS = ('Cluster Name', 'RA', 'Dec', 'Redshift', 'Redshift Error')

cluster_data = [
    dict(zip(_CLUSTER_COLUMNS, (name, *query_cluster_info(name))))
    for name in cluster_list
]

cluster_df = pd.DataFrame(cluster_data)


In [None]:
# Load the eROSITA morphology catalogue (first FITS extension) into a Table.
file_path = "/users_path/merger_trace/data/eROSITA_cat/erass1_cluster_morphology_v1.0.fits"

with fits.open(file_path) as hdu_list:
    morphology_data = Table(hdu_list[1].data)

# Quick look: first five rows, then the available column names.
for preview in (morphology_data[:5], morphology_data.colnames):
    print(preview)


In [None]:
# Load the eROSITA primary cluster catalogue (first FITS extension) into a Table.
primary_path = "/users_path/merger_trace/data/eROSITA_cat/erass1cl_primary_v3.2.fits"

with fits.open(primary_path) as hdu_list:
    primary_data = Table(hdu_list[1].data)

# Quick look: first five rows, then the available column names.
for preview in (primary_data[:5], primary_data.colnames):
    print(preview)


In [None]:
# Combine the morphology and primary catalogues on their shared DETUID key.
# Columns present in both inputs are disambiguated with a 'morph_' / 'prim_' prefix.
_join_options = dict(
    keys='DETUID',
    table_names=['morph', 'prim'],
    uniq_col_name='{table_name}_{col_name}',
)
merged_data = join(morphology_data, primary_data, **_join_options)


In [None]:
# Cross-match the NED clusters against the eROSITA catalogue: a pair is kept
# when it lies within `max_sep` on the sky and the two redshifts differ by
# less than `max_z_diff`.
max_sep = 20 * u.arcsec
max_z_diff = 0.01

# query_cluster_info() returns None for clusters whose NED lookup failed (and
# for clusters without a usable redshift). Coerce to float so those become NaN,
# and keep only rows with a usable position: None/NaN coordinates can break
# the unit multiplication or the neighbour search below.
cluster_ra = pd.to_numeric(cluster_df['RA'], errors='coerce').to_numpy(dtype=float)
cluster_dec = pd.to_numeric(cluster_df['Dec'], errors='coerce').to_numpy(dtype=float)
cluster_z = pd.to_numeric(cluster_df['Redshift'], errors='coerce').to_numpy(dtype=float)
usable_rows = np.flatnonzero(np.isfinite(cluster_ra) & np.isfinite(cluster_dec))

# create SkyCoord
coord_cluster = SkyCoord(ra=cluster_ra[usable_rows] * u.deg,
                         dec=cluster_dec[usable_rows] * u.deg)
coord_erosita = SkyCoord(ra=merged_data['RA'] * u.deg,
                         dec=merged_data['DEC'] * u.deg)

# match all pairs; search_around_sky returns indices into its argument first
# (eROSITA) and indices into the calling object second (usable clusters)
idx_erosita, idx_usable, sep2d, _ = coord_cluster.search_around_sky(
    coord_erosita, max_sep
)

# translate back to row positions in the full cluster_df so that downstream
# `cluster_df.iloc[i]` lookups keep working
idx_cluster = usable_rows[idx_usable]

# filter matched pairs by redshifts; a NaN redshift compares False and is dropped
z1 = cluster_z[idx_cluster]
z2 = np.asarray(merged_data['BEST_Z'][idx_erosita], dtype=float)
z_diff = np.abs(z1 - z2)

valid = z_diff < max_z_diff

# save all pairs satisfying both conditions as (cluster row, eROSITA row, separation)
matched_pairs = list(zip(idx_cluster[valid], idx_erosita[valid], sep2d[valid]))



In [None]:
# Output column -> source column, for the NED side and the eROSITA side of a match.
_cluster_fields = {
    'Cluster_Name': 'Cluster Name',     # cluster names
    'Cluster_RA': 'RA',                 # cluster RA
    'Cluster_Dec': 'Dec',
    'Cluster_z': 'Redshift',
    'Cluster_z_err': 'Redshift Error',
}
_erosita_fields = {
    'EROSITA_id': 'DETUID',
    'EROSITA_Name': 'morph_NAME',       # erosita names
    'EROSITA_RA': 'RA',                 # erosita RA
    'EROSITA_Dec': 'DEC',               # erosita DEC
    'EROSITA_z': 'BEST_Z',
    'EROSITA_z_err': 'BEST_ZERR',
    'D_COMB': 'D_COMB',
    'D_SHAPE': 'D_SHAPE',
    'R_500': 'R500',
}

# One output row per matched (cluster, eROSITA) pair; the separation is not stored.
rows = []
for cluster_idx, erosita_idx, _sep in matched_pairs:
    cluster_row = cluster_df.iloc[cluster_idx]
    row = {out_name: cluster_row[src] for out_name, src in _cluster_fields.items()}
    row.update(
        {out_name: merged_data[src][erosita_idx] for out_name, src in _erosita_fields.items()}
    )
    rows.append(row)

match_table = pd.DataFrame(rows)
print(match_table)



In [None]:
def theta_from_R500(z_cluster, r_500):
    """
    Angle subtended on the sky by a physical radius at the cluster redshift.

    Uses the small-angle relation theta = r_500 / D_A, with the angular
    diameter distance D_A taken from the module-level `cosmo` object.

    z_cluster: cluster redshift
    r_500: radius, unit kpc
    returns: angle in degrees
    """
    angular_distance = cosmo.angular_diameter_distance(z_cluster)
    theta_rad = r_500 / angular_distance.to_value(u.kpc)  # both lengths in kpc
    theta_deg = np.degrees(theta_rad)
    return theta_deg


def mask_within_theta_spherical(df, ra_c_deg, dec_c_deg, theta_deg, ra_col='ra', dec_col='dec'):
    """
    Flag the rows of `df` that lie within `theta_deg` of a sky position.

    df: DataFrame holding object coordinates in degrees
    ra_c_deg, dec_c_deg: centre position in degrees
    theta_deg: maximum on-sky separation from the centre, in degrees (inclusive)
    ra_col, dec_col: names of the coordinate columns in `df`
                     (defaults match the photo-z catalogue: 'ra' / 'dec')

    returns: boolean numpy array, one entry per row of `df`
    """
    center = SkyCoord(ra_c_deg * u.deg, dec_c_deg * u.deg)
    pts = SkyCoord(df[ra_col].to_numpy() * u.deg, df[dec_col].to_numpy() * u.deg)
    sep_deg = pts.separation(center).deg
    return sep_deg <= theta_deg


def mask_within_theta_spherical_specz(df, ra_c_deg, dec_c_deg, theta_deg):
    """
    Same selection as `mask_within_theta_spherical`, for catalogues whose
    coordinate columns are named 'ra_specz' / 'dec_specz'.
    """
    return mask_within_theta_spherical(
        df, ra_c_deg, dec_c_deg, theta_deg,
        ra_col='ra_specz', dec_col='dec_specz',
    )

In [None]:
def cal_bluefrac(df_photo, z_cluster, r_500, ra_cluster_deg, dec_cluster_deg, z_threshold = 0.05, mag_lim_u=22.0, mag_lim_g = 22.0,
                 min_odds=0.5, max_magerr=1.0, delta_blue=0.20, extendedness_lim=0.5, ug_blue_abs=1.2):
    """
    Blue-galaxy fraction of a cluster from a photo-z catalogue.

    Members are selected by position (within the angle subtended by r_500),
    photometric redshift, star/galaxy separation and photometric quality. A
    red sequence is fitted in (u-g) vs g, and the fraction of members bluer
    than that sequence is reported, together with the fraction bluer than a
    fixed colour. A colour-magnitude diagram is drawn as a side effect.

    Parameters
    ----------
    df_photo : DataFrame with columns 'ra', 'dec', '{u,g,r}_cmodel_mag',
        '{u,g}_cmodel_magerr', 'extendedness', 'z_b' and 'odds'.
    z_cluster : cluster redshift.
    r_500 : cluster radius passed to theta_from_R500 (kpc).
    ra_cluster_deg, dec_cluster_deg : cluster centre in degrees.
    z_threshold : photo-z half width; members satisfy
        |z_b - z_cluster| < z_threshold * (1 + z_cluster).
    mag_lim_u, mag_lim_g : faint magnitude limits in u and g.
    min_odds : minimum value of the 'odds' photo-z quality column.
    max_magerr : maximum allowed u and g magnitude error.
    delta_blue : a member is "blue" when its u-g colour is more than this many
        magnitudes below the fitted red sequence.
    extendedness_lim : objects with 'extendedness' above this are galaxies.
    ug_blue_abs : fixed u-g threshold for the absolute-colour blue fraction.

    Returns
    -------
    (blue_frac, blue_frac_abs, gmag, umag)
        The two blue fractions and numpy arrays with the g and u magnitudes
        of the selected members.

    Raises
    ------
    ValueError
        If fewer than two objects pass the selection, so that no red
        sequence can be fitted.
    """
    df = df_photo

    # ==== Spatial selection ====
    theta_deg = theta_from_R500(z_cluster, r_500)
    location_mask = mask_within_theta_spherical(df, ra_cluster_deg, dec_cluster_deg, theta_deg)

    # ==== Magnitudes ====
    # Named *_mag / *_err so that the module-level `u` (astropy.units) is not shadowed.
    u_mag = df['u_cmodel_mag']
    g_mag = df['g_cmodel_mag']
    r_mag = df['r_cmodel_mag']
    u_err = df['u_cmodel_magerr']
    g_err = df['g_cmodel_magerr']

    # ==== Select galaxies ====
    # Basic star/galaxy separation using "extendedness"
    is_gal = df['extendedness'] > extendedness_lim

    # Photo-z membership selection
    photo_z = df['z_b'].astype(float)
    members = (np.abs(photo_z - z_cluster) < z_threshold * (1 + z_cluster)) & (df['odds'] >= min_odds)

    # Magnitude quality cuts (u and g only; r is only required to be finite)
    snr_ok = (u_err < max_magerr) & (g_err < max_magerr)
    mag_ok = (u_mag < mag_lim_u) & (g_mag < mag_lim_g)

    # Final base mask: galaxies, members, quality cuts, position, and finite mags
    base_mask = is_gal & members & snr_ok & mag_ok & location_mask
    base_mask &= np.isfinite(u_mag) & np.isfinite(g_mag) & np.isfinite(r_mag)
    selected = base_mask.to_numpy()

    N_tot = int(selected.sum())
    if N_tot < 2:
        # The regression and the correlation tests below cannot run on an
        # empty or single-object sample; fail early with a readable message.
        raise ValueError(
            f"cal_bluefrac: only {N_tot} object(s) pass the member selection; "
            "cannot fit a red sequence"
        )

    # ==== Fit red sequence in (u-g) vs g ====
    gmag = g_mag.to_numpy()
    umag = u_mag.to_numpy()
    ug_colour = u_mag - g_mag

    X = gmag[selected].reshape(-1, 1)
    Y = ug_colour.to_numpy()[selected]

    # Robust regression (Huber), refitted twice after clipping points whose
    # residual exceeds 3 sigma of the current fit
    reg = HuberRegressor().fit(X, Y)
    for _ in range(2):
        resid = Y - reg.predict(X)
        keep = np.abs(resid) < 3 * np.std(resid)
        reg = HuberRegressor().fit(X[keep], Y[keep])

    slope = reg.coef_[0]
    intercept = reg.intercept_

    # ==== Define blue galaxies ====
    is_blue = ug_colour < (slope * g_mag + intercept - delta_blue)
    is_blue_abs = ug_colour < ug_blue_abs

    # ==== Compute blue fraction ====
    N_blue = int((base_mask & is_blue).sum())
    N_blue_abs = int((base_mask & is_blue_abs).sum())
    blue_frac = N_blue / N_tot
    blue_frac_abs = N_blue_abs / N_tot

    print(f"N_total={N_tot}, N_blue={N_blue}, blue_fraction={blue_frac:.3f}")
    # Labelled separately so the absolute-colour numbers cannot be mistaken
    # for the red-sequence-relative ones printed above.
    print(f"N_total={N_tot}, N_blue_abs={N_blue_abs}, blue_fraction_abs={blue_frac_abs:.3f}")
    print(f"RS fit (u-g) = {slope:.3f} * g + {intercept:.3f}; blue cut: Δ={delta_blue:.2f} mag")

    # ==== Pearson and Spearman correlation of colour with magnitude ====
    xv = X[:, 0]
    yv = Y
    good = np.isfinite(xv) & np.isfinite(yv)
    rp, pp = pearsonr(xv[good], yv[good])
    rs, ps = spearmanr(xv[good], yv[good])
    print(f"Pearson r={rp:.3f} (p={pp:.2e}); Spearman ρ={rs:.3f} (p={ps:.2e})")

    # ==== Colour-magnitude diagram ====
    fig, ax = plt.subplots(figsize=(8, 6))

    # Scatter plot of all members
    ax.scatter(xv, yv, s=10, alpha=0.5, label='Cluster members')

    # Fitted red sequence line
    g_range = np.linspace(xv.min(), xv.max(), 200)
    ug_rs = slope * g_range + intercept
    ax.plot(g_range, ug_rs, 'r-', lw=2, label='Red sequence fit')

    # Blue cut line
    ax.plot(g_range, ug_rs - delta_blue, 'b--', lw=2, label=f'Blue cut (Δ={delta_blue:.2f} mag)')

    # Highlight blue galaxies
    blue_members = base_mask & is_blue
    ax.scatter(g_mag[blue_members], ug_colour[blue_members],
               s=10, color='blue', alpha=0.7, label='Blue galaxies')

    ax.invert_xaxis()  # Brighter to the left
    ax.set_xlabel('g magnitude')
    ax.set_ylabel('u - g colour')
    ax.legend()
    ax.set_title(f'Blue fraction = {blue_frac:.3f} (N={N_tot}, N_blue={N_blue})')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

    return blue_frac, blue_frac_abs, gmag[selected], umag[selected]

In [None]:
# Clusters analysed below, each paired with the name of its photometry folder.
_cluster_folder_pairs = [
    ('Abell 4010', 'A4010'),
    ('Abell 1651', 'A1651'),
    ('Abell 1644', 'A1644'),
    ('Abell 3558', 'A3558'),
    ('Abell 3921', 'A3921'),
    ('RXCJ1539.5-8335', 'RXCJ1539.5-8335'),
]
cluster_names = [cluster for cluster, _ in _cluster_folder_pairs]
folder_names = [folder for _, folder in _cluster_folder_pairs]

In [None]:
# Redshift of each analysed cluster, taken from its first row in match_table.
z_vec = [
    match_table.loc[match_table['Cluster_Name'] == cluster, 'Cluster_z'].iloc[0]
    for cluster in cluster_names
]

In [None]:
# Photo-z blue fractions: run cal_bluefrac on every cluster, show the u/g
# magnitude distribution of its members, and collect the per-cluster results.
blue_frac_vec, blue_frac_abs_vec = [], []
D_comb_vec, D_shape_vec = [], []
gmag_vec, umag_vec = [], []

for idx, name in enumerate(cluster_names):
    folder_name = folder_names[idx]
    file_path = f"/users_path/merger_trace/data/LoVoCCS/photometric_data/{folder_name}/{folder_name}_00-1111_gal_dered_dezp_bpz_merge.csv"
    df_photo = pd.read_csv(file_path)

    # Cluster properties come from the first match_table row carrying this name.
    this_cluster = match_table['Cluster_Name'] == name
    z_cluster, r_500, ra_cluster, dec_cluster, D_comb, D_shape = [
        match_table[column][this_cluster].iloc[0]
        for column in ('Cluster_z', 'R_500', 'Cluster_RA', 'Cluster_Dec', 'D_COMB', 'D_SHAPE')
    ]

    blue_frac, blue_frac_abs, gmag, umag = cal_bluefrac(
        df_photo, z_cluster, r_500, ra_cluster, dec_cluster,
        z_threshold=0.05, mag_lim_u=23.0, mag_lim_g=25.0,
    )

    # Keep only members with finite magnitudes in both bands for the histogram.
    mask = np.isfinite(gmag) & np.isfinite(umag)
    gmag_plot = np.asarray(gmag)[mask]
    umag_plot = np.asarray(umag)[mask]

    bins = np.arange(14, 26, 0.25)

    # --- plot ---
    fig, ax = plt.subplots(figsize=(5.5, 4))
    for band_values, band_label in ((gmag_plot, 'g mag'), (umag_plot, 'u mag')):
        ax.hist(band_values, bins=bins, histtype='step', label=band_label, density=True)
    ax.set_xlabel('Magnitude')
    ax.set_ylabel('PDF')
    ax.set_title(f'{name}  u/g magnitude hist')
    ax.legend()
    ax.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()
    plt.close()

    # --- collect ---
    for store, value in (
        (blue_frac_vec, blue_frac),
        (blue_frac_abs_vec, blue_frac_abs),
        (D_comb_vec, D_comb),
        (D_shape_vec, D_shape),
        (gmag_vec, np.asarray(gmag)),
        (umag_vec, np.asarray(umag)),
    ):
        store.append(value)

In [None]:
# Correlate the eROSITA morphology parameters with the photo-z blue fractions.
D_comb = np.array(D_comb_vec)
D_shape = np.array(D_shape_vec)
blue_frac_vec = np.array(blue_frac_vec)
blue_frac_abs_vec = np.array(blue_frac_abs_vec)

r1, p1 = pearsonr(D_comb, blue_frac_vec)
r2, p2 = pearsonr(D_shape, blue_frac_abs_vec)

# r1 is computed from D_comb, so it is labelled as such.
print(f"Pearson r (D_comb vs blue_frac) = {r1:.3f}, p = {p1:.3e}")
print(f"Pearson r (D_shape vs blue_frac_abs) = {r2:.3f}, p = {p2:.3e}")

# plot: the x values are D_comb, so the axis label says D_comb
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(D_comb, blue_frac_vec, color='tab:blue', label=f'blue_frac (r={r1:.2f})', alpha=0.7)
ax.set_xlabel('D_comb')
ax.set_ylabel('Blue fraction')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Per-cluster summary: blue fraction, redshift and D_comb, one row per cluster.
_summary_columns = {
    'cluster_name': cluster_names,
    'blue_frac': blue_frac_vec,
    'redshift': z_vec,
    'D_comb': D_comb_vec,
}
df = pd.DataFrame(_summary_columns)

# save to csv for plots
df.to_csv("cluster_blue_frac_vs_z.csv", index=False)


In [None]:
def cal_bluefrac_specz(df_photo, z_cluster, r_500, ra_cluster_deg, dec_cluster_deg, mag_lim_u = 22.0, mag_lim_g = 22.0,
                       z_threshold=0.01, max_magerr=1.0, delta_blue=0.20, extendedness_lim=0.5, ug_blue_abs=1.2):
    """
    Blue-galaxy fraction of a cluster from a spectroscopically matched catalogue.

    Members are selected by position (within the angle subtended by r_500),
    spectroscopic redshift, star/galaxy separation and photometric quality. A
    red sequence is fitted in (u-g) vs g, and the fraction of members bluer
    than that sequence is reported, together with the fraction bluer than a
    fixed colour. A colour-magnitude diagram is drawn as a side effect.

    Parameters
    ----------
    df_photo : DataFrame with columns 'ra_specz', 'dec_specz', 'z_specz',
        '{u,g,r}_cmodel_mag_cat', '{u,g,r}_cmodel_magerr_cat' and
        'extendedness_cat'.
    z_cluster : cluster redshift.
    r_500 : cluster radius passed to theta_from_R500 (kpc).
    ra_cluster_deg, dec_cluster_deg : cluster centre in degrees.
    mag_lim_u, mag_lim_g : faint magnitude limits in u and g.
    z_threshold : spec-z half width; members satisfy
        |z_specz - z_cluster| < z_threshold * (1 + z_cluster).
    max_magerr : maximum allowed u, g and r magnitude error.
    delta_blue : a member is "blue" when its u-g colour is more than this many
        magnitudes below the fitted red sequence.
    extendedness_lim : objects with 'extendedness_cat' above this are galaxies.
    ug_blue_abs : fixed u-g threshold for the absolute-colour blue fraction.

    Returns
    -------
    (blue_frac, blue_frac_abs, gmag, umag)
        The two blue fractions and numpy arrays with the g and u magnitudes
        of the selected members.

    Raises
    ------
    ValueError
        If fewer than two objects pass the selection, so that no red
        sequence can be fitted.
    """
    df = df_photo

    # ==== Spatial selection ====
    theta_deg = theta_from_R500(z_cluster, r_500)
    location_mask = mask_within_theta_spherical_specz(df, ra_cluster_deg, dec_cluster_deg, theta_deg)

    # ==== Magnitudes ====
    # Named *_mag / *_err so that the module-level `u` (astropy.units) is not shadowed.
    u_mag = df['u_cmodel_mag_cat']
    g_mag = df['g_cmodel_mag_cat']
    r_mag = df['r_cmodel_mag_cat']
    u_err = df['u_cmodel_magerr_cat']
    g_err = df['g_cmodel_magerr_cat']
    r_err = df['r_cmodel_magerr_cat']

    # ==== Select galaxies ====
    # Basic star/galaxy separation using "extendedness"
    is_gal = df['extendedness_cat'] > extendedness_lim

    # Spec-z membership selection
    spec_z = df['z_specz'].astype(float)
    members = np.abs(spec_z - z_cluster) < z_threshold * (1 + z_cluster)

    # Magnitude quality cuts
    snr_ok = (u_err < max_magerr) & (g_err < max_magerr) & (r_err < max_magerr)
    mag_ok = (u_mag < mag_lim_u) & (g_mag < mag_lim_g)

    # Final base mask: galaxies, members, quality cuts, position, and finite mags
    base_mask = is_gal & members & snr_ok & mag_ok & location_mask
    base_mask &= np.isfinite(u_mag) & np.isfinite(g_mag) & np.isfinite(r_mag)
    selected = base_mask.to_numpy()

    N_tot = int(selected.sum())
    if N_tot < 2:
        # The regression and the correlation tests below cannot run on an
        # empty or single-object sample; fail early with a readable message.
        raise ValueError(
            f"cal_bluefrac_specz: only {N_tot} object(s) pass the member selection; "
            "cannot fit a red sequence"
        )

    # ==== Fit red sequence in (u-g) vs g ====
    gmag = g_mag.to_numpy()
    umag = u_mag.to_numpy()
    ug_colour = u_mag - g_mag

    X = gmag[selected].reshape(-1, 1)
    Y = ug_colour.to_numpy()[selected]

    # Robust regression (Huber), refitted twice after clipping points whose
    # residual exceeds 3 sigma of the current fit
    reg = HuberRegressor().fit(X, Y)
    for _ in range(2):
        resid = Y - reg.predict(X)
        keep = np.abs(resid) < 3 * np.std(resid)
        reg = HuberRegressor().fit(X[keep], Y[keep])

    slope = reg.coef_[0]
    intercept = reg.intercept_

    # ==== Define blue galaxies ====
    is_blue = ug_colour < (slope * g_mag + intercept - delta_blue)
    is_blue_abs = ug_colour < ug_blue_abs

    # ==== Compute blue fraction ====
    N_blue = int((base_mask & is_blue).sum())
    N_blue_abs = int((base_mask & is_blue_abs).sum())
    blue_frac = N_blue / N_tot
    blue_frac_abs = N_blue_abs / N_tot

    print(f"N_total={N_tot}, N_blue={N_blue}, blue_fraction={blue_frac:.3f}")
    # Labelled separately so the absolute-colour numbers cannot be mistaken
    # for the red-sequence-relative ones printed above.
    print(f"N_total={N_tot}, N_blue_abs={N_blue_abs}, blue_fraction_abs={blue_frac_abs:.3f}")
    print(f"RS fit (u-g) = {slope:.3f} * g + {intercept:.3f}; blue cut: Δ={delta_blue:.2f} mag")

    # ==== Pearson and Spearman correlation of colour with magnitude ====
    xv = X[:, 0]
    yv = Y
    good = np.isfinite(xv) & np.isfinite(yv)
    rp, pp = pearsonr(xv[good], yv[good])
    rs, ps = spearmanr(xv[good], yv[good])
    print(f"Pearson r={rp:.3f} (p={pp:.2e}); Spearman ρ={rs:.3f} (p={ps:.2e})")

    # ==== Colour-magnitude diagram ====
    fig, ax = plt.subplots(figsize=(8, 6))

    # Scatter plot of all members
    ax.scatter(xv, yv, s=10, alpha=0.5, label='Cluster members')

    # Fitted red sequence line
    g_range = np.linspace(xv.min(), xv.max(), 200)
    ug_rs = slope * g_range + intercept
    ax.plot(g_range, ug_rs, 'r-', lw=2, label='Red sequence fit')

    # Blue cut line
    ax.plot(g_range, ug_rs - delta_blue, 'b--', lw=2, label=f'Blue cut (Δ={delta_blue:.2f} mag)')

    # Highlight blue galaxies
    blue_members = base_mask & is_blue
    ax.scatter(g_mag[blue_members], ug_colour[blue_members],
               s=10, color='blue', alpha=0.7, label='Blue galaxies')

    ax.invert_xaxis()  # Brighter to the left
    ax.set_xlabel('g magnitude')
    ax.set_ylabel('u - g colour')
    ax.legend()
    ax.set_title(f'Blue fraction = {blue_frac:.3f} (N={N_tot}, N_blue={N_blue})')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

    return blue_frac, blue_frac_abs, gmag[selected], umag[selected]

In [None]:
# Spec-z blue fractions: run cal_bluefrac_specz on every cluster, show the
# u/g magnitude distribution of its members, and collect the results.
# Clusters for which the calculation fails are reported and skipped, so the
# lists below may be shorter than cluster_names.
blue_frac_vec_specz = []
blue_frac_abs_vec_specz = []
D_comb_vec = []
D_shape_vec = []
umag_vec_specz = []
gmag_vec_specz = []

# Histogram binning, defined in this cell so it does not depend on a variable
# left behind by another cell.
specz_mag_bins = np.arange(14, 26, 0.25)

for i, name in enumerate(cluster_names):
    folder_name = folder_names[i]
    file_path = f"/users_path/merger_trace/data/LoVoCCS/photometric_data/{folder_name}/{folder_name}_00-1111_gal_match_specz_dered_dezp.csv"
    df_photo = pd.read_csv(file_path)

    # Cluster properties come from the first match_table row carrying this name.
    this_cluster = match_table['Cluster_Name'] == name
    z_cluster = match_table['Cluster_z'][this_cluster].iloc[0]
    r_500 = match_table['R_500'][this_cluster].iloc[0]
    ra_cluster = match_table['Cluster_RA'][this_cluster].iloc[0]
    dec_cluster = match_table['Cluster_Dec'][this_cluster].iloc[0]
    D_comb = match_table['D_COMB'][this_cluster].iloc[0]
    D_shape = match_table['D_SHAPE'][this_cluster].iloc[0]

    # Only the calculation itself is guarded; the reason is printed so that a
    # genuine error is not mistaken for a small sample.
    try:
        blue_frac, blue_frac_abs, gmag, umag = cal_bluefrac_specz(
            df_photo, z_cluster, r_500, ra_cluster, dec_cluster,
            mag_lim_u=23.0, mag_lim_g=25.0,
        )
    except Exception as exc:
        print(f'not enough results for {name}: {exc}')
        continue

    # Histogram the magnitudes returned for THIS cluster's spec-z members.
    finite = np.isfinite(gmag) & np.isfinite(umag)
    gmag_specz_plot = np.asarray(gmag)[finite]
    umag_specz_plot = np.asarray(umag)[finite]

    # --- plot ---
    fig, ax = plt.subplots(figsize=(5.5, 4))
    ax.hist(gmag_specz_plot, bins=specz_mag_bins, histtype='step', label='g mag', density=True)
    ax.hist(umag_specz_plot, bins=specz_mag_bins, histtype='step', label='u mag', density=True)
    ax.set_xlabel('Magnitude')
    ax.set_ylabel('PDF')
    ax.set_title(f'{name}  u/g magnitude hist')
    ax.legend()
    ax.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()
    plt.close(fig)

    blue_frac_vec_specz.append(blue_frac)
    blue_frac_abs_vec_specz.append(blue_frac_abs)
    D_comb_vec.append(D_comb)
    D_shape_vec.append(D_shape)
    gmag_vec_specz.append(np.asarray(gmag))
    umag_vec_specz.append(np.asarray(umag))

In [None]:
# Correlate the eROSITA morphology parameters with the spec-z blue fractions.
D_comb = np.array(D_comb_vec)
D_shape = np.array(D_shape_vec)
blue_frac_specz = np.array(blue_frac_vec_specz)
blue_frac_abs_specz = np.array(blue_frac_abs_vec_specz)

r1, p1 = pearsonr(D_comb, blue_frac_specz)
r2, p2 = pearsonr(D_shape, blue_frac_abs_specz)

# r1 is computed from D_comb, so it is labelled as such.
print(f"Pearson r (D_comb vs blue_frac) = {r1:.3f}, p = {p1:.3e}")
print(f"Pearson r (D_shape vs blue_frac_abs) = {r2:.3f}, p = {p2:.3e}")

# plot: the x values are D_comb, so the axis label says D_comb
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(D_comb, blue_frac_specz, color='tab:blue', label=f'blue_frac (r={r1:.2f})', alpha=0.7)
ax.set_xlabel('D_comb')
ax.set_ylabel('Blue fraction')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Pool the per-cluster member magnitudes into one array per band and sample.
g_all, u_all = [np.concatenate(per_cluster) for per_cluster in (gmag_vec, umag_vec)]
g_all_specz, u_all_specz = [
    np.concatenate(per_cluster) for per_cluster in (gmag_vec_specz, umag_vec_specz)
]

In [None]:
def _clean(a):
    """Return `a` as a float array with every non-finite entry (NaN, ±inf) removed."""
    values = np.asarray(a, float)
    is_finite = np.isfinite(values)
    return values[is_finite]

def ecdf(x):
    """
    Empirical cumulative distribution function of a sample.

    returns: (sorted values, cumulative fraction k/N for k = 1..N)
    """
    ordered = np.sort(x)
    n_points = len(ordered)
    cumulative = np.arange(1, n_points + 1) / n_points
    return ordered, cumulative


def color_bins(a, b, step=0.05, qlo=1, qhi=99, pad_frac=0.05):
    """
    Build common histogram bin edges for two samples.

    The edges span the [qlo, qhi] percentile range of the pooled finite
    values, widened on each side by `pad_frac` of that range, in steps of
    `step`.

    a, b: array-likes of values (non-finite entries are ignored)
    step: bin width
    qlo, qhi: lower / upper percentile defining the core range
    pad_frac: fractional padding added to both ends of the core range

    returns: 1-D array of at least two bin edges; [0, 1] when there is no
             finite data at all
    """
    pooled = np.concatenate([np.asarray(a, float), np.asarray(b, float)])
    pooled = pooled[np.isfinite(pooled)]
    if pooled.size == 0:
        return np.linspace(0, 1, 2)

    lo, hi = np.nanpercentile(pooled, [qlo, qhi])
    # Compute the padding once from the unpadded range, so that both ends are
    # widened by the same amount.
    pad = pad_frac * (hi - lo)
    lo -= pad
    hi += pad

    edges = np.arange(lo, hi + step, step)
    if edges.size < 2:
        # Degenerate range (e.g. all values identical): still return one valid bin.
        edges = np.array([lo, lo + step])
    return edges

def compare_dists(a, b, label_a, label_b, bins=np.arange(14, 26, 0.25)):
    """
    Compare two 1-D samples numerically and graphically.

    Prints the size, median and interquartile range of each sample, followed
    by the two-sample Kolmogorov-Smirnov, Cramer-von Mises and (when it can be
    evaluated) Anderson-Darling test results, then shows an overlaid
    normalised histogram and an overlaid ECDF plot.

    a, b: array-likes; non-finite values are removed first
    label_a, label_b: names used in the printout and the legends
    bins: histogram bin edges shared by both samples
    """
    a = _clean(a)
    b = _clean(b)

    for label, sample in ((label_a, a), (label_b, b)):
        q25, q75 = np.nanpercentile(sample, [25, 75])
        print(f'{label}: N={len(sample)}, median={np.nanmedian(sample):.3f}, IQR=({q25:.3f},{q75:.3f})')

    # --- two-sample tests ---
    ks = stats.ks_2samp(a, b, alternative='two-sided', mode='auto')
    cvm = stats.cramervonmises_2samp(a, b)
    try:
        ad = stats.anderson_ksamp([a, b])
        # SciPy reports this approximate p-value as a fraction (capped to the
        # range 0.001-0.25), so it must not be divided by 100. Newer releases
        # name the field `pvalue`; older ones `significance_level`.
        ad_p = getattr(ad, 'pvalue', None)
        if ad_p is None:
            ad_p = ad.significance_level
        ad_str = f'AD stat={ad.statistic:.3f}, p≈{ad_p:.3f}'
    except Exception:
        # Best effort: the AD test is optional in this report.
        ad_str = '(AD not available)'

    print(f'KS     : D={ks.statistic:.3f}, p={ks.pvalue:.3g}')
    print(f'CvM    : T={cvm.statistic:.3f}, p={cvm.pvalue:.3g}')
    print(f'AD     : {ad_str}')
    print('-'*50)

    # --- histogram (PDF) ---
    fig, ax = plt.subplots(figsize=(5.2, 4))
    for sample, label in ((a, label_a), (b, label_b)):
        ax.hist(sample, bins=bins, histtype='step', density=True, label=label)
    ax.set_xlabel('magnitude')
    ax.set_ylabel('PDF')
    ax.legend()
    ax.grid(alpha=0.3)
    plt.show()

    # --- ECDF ---
    fig, ax = plt.subplots(figsize=(5.2, 4))
    for sample, label in ((a, label_a), (b, label_b)):
        xs, ys = ecdf(sample)
        ax.plot(xs, ys, drawstyle='steps-post', label=label)
    ax.set_xlabel('magnitude')
    ax.set_ylabel('ECDF')
    ax.legend()
    ax.grid(alpha=0.3)
    plt.show()

# Compare the photo-z and spec-z member samples band by band (g first, then u).
for _photoz_sample, _specz_sample, _photoz_label, _specz_label in (
    (g_all, g_all_specz, 'g_all', 'g_specz'),
    (u_all, u_all_specz, 'u_all', 'u_specz'),
):
    compare_dists(_photoz_sample, _specz_sample, _photoz_label, _specz_label)


In [None]:
# u-g colour of every pooled member, for the photo-z and the spec-z samples.
ug_all, ug_all_specz = u_all - g_all, u_all_specz - g_all_specz

In [None]:
# Compare the two colour distributions on bin edges derived from the data.
_colour_samples = (ug_all, ug_all_specz)
bins = color_bins(*_colour_samples)
compare_dists(*_colour_samples, 'u-g all', 'u-g specz', bins=bins)

In [None]:
# create dataframe; each array is wrapped in a Series so that samples of
# different lengths can share one table (shorter columns are padded with NaN)
_sample_arrays = {
    "g_all": g_all,
    "g_all_specz": g_all_specz,
    "u_all": u_all,
    "u_all_specz": u_all_specz,
    "ug_all": ug_all,
    "ug_all_specz": ug_all_specz,
}
df = pd.DataFrame({column: pd.Series(values) for column, values in _sample_arrays.items()})

# save to csv for plots
df.to_csv("blue_frac_magnitude_samples.csv", index=False)
