In [1]:
# Library imports (see /Utilities/setup_env.py for full import list and figure setup)

from pathlib import Path
import sys

# Put the repository root on sys.path so the `Figure_analysis` package import below resolves.
# NOTE(review): this depends on the kernel's working directory; per the inline
# comments, cwd is expected to sit two levels below Figure_analysis — confirm.
repo_root = Path.cwd().parents[1]  # .../Figure_analysis
repo_root = repo_root.parent       # repo root
sys.path.insert(0, str(repo_root))

from Figure_analysis.Utilities.setup_env import *

In [2]:
def fetch_tc_fit_params(db_path, selected_construct, selected_site_base, selected_valtype_tc = 'modrate'):
    """
    Fetch fitted time-course parameters for one construct, nucleotide, and valtype.

    Fit runs are selected from `probe_tc_fit_runs` when their (rg_id, nt_id) pair
    belongs to a reaction of the requested construct at the requested nucleotide,
    their `fit_kind` is 'round3_constrained', and their `valtype` matches
    `selected_valtype_tc`. The long-format rows of `probe_tc_fit_params` are then
    pivoted into one wide row per fit run.

    Parameters
    ----------
    db_path : str
        Path to the SQLite database file.
    selected_construct : str
        Construct display name (matched against `meta_constructs.disp_name`).
    selected_site_base : str
        Site and upper-case base joined by an underscore (e.g., '18_A', '25_C').
    selected_valtype_tc : str, default 'modrate'
        Value type used to filter the fit runs.

    Returns
    -------
    pd.DataFrame
        One row per (fit_run_id, rg_id, temperature, replicate), ordered by
        temperature and then replicate, with columns: fit_run_id, rg_id,
        temperature, replicate, log_kappa, log_kdeg, log_fmod0, log_kappa_err,
        log_kdeg_err, log_fmod0_err, r2.
        Note that `log_kappa` / `log_kappa_err` are read from the stored
        'log_kobs' / 'log_kobs_err' parameters (the un-underscored spellings
        such as 'logkobs' are accepted too), and `r2` from 'diag:r2'.
        Parameters missing for a fit run come back as NULL/NaN.

    Raises
    ------
    Exception
        Errors from sqlite3 / pandas (e.g. missing tables) propagate to the
        caller; the connection is closed in every case.
    """
    query = """
        WITH run AS (
            SELECT id AS fit_run_id, rg_id
            FROM probe_tc_fit_runs
            WHERE (rg_id, nt_id) IN (
                SELECT DISTINCT pr.rg_id, mn.id
                FROM probe_reactions pr
                JOIN probe_fmod_values fv ON fv.rxn_id = pr.id
                JOIN meta_nucleotides mn ON mn.id = fv.nt_id
                JOIN meta_constructs mc ON mc.id = pr.construct_id
                WHERE mc.disp_name = :selected_construct
                AND mn.site || '_' || UPPER(mn.base) = :site_base
            )
            AND fit_kind = 'round3_constrained'
            AND valtype = :valtype_mod
        )
        SELECT
            p.fit_run_id,
            r.rg_id,
            pr.temperature,
            pr.replicate,
            MAX(CASE WHEN p.param_name IN ('log_kobs','logkobs')
                    THEN p.param_numeric END) AS log_kappa,
            MAX(CASE WHEN p.param_name IN ('log_kdeg','logkdeg')
                    THEN p.param_numeric END) AS log_kdeg,
            MAX(CASE WHEN p.param_name IN ('log_fmod0','logfmod0')
                    THEN p.param_numeric END) AS log_fmod0,
            MAX(CASE WHEN p.param_name IN ('log_kobs_err','logkobs_err')
                    THEN p.param_numeric END) AS log_kappa_err,
            MAX(CASE WHEN p.param_name IN ('log_kdeg_err','logkdeg_err')
                    THEN p.param_numeric END) AS log_kdeg_err,
            MAX(CASE WHEN p.param_name IN ('log_fmod0_err','logfmod0_err')
                    THEN p.param_numeric END) AS log_fmod0_err,
            MAX(CASE WHEN p.param_name = 'diag:r2'
                    THEN p.param_numeric END) AS r2
        FROM probe_tc_fit_params p
        JOIN run r
            ON r.fit_run_id = p.fit_run_id
        JOIN probe_reactions pr
            ON pr.rg_id = r.rg_id
        GROUP BY
            p.fit_run_id,
            r.rg_id,
            pr.temperature,
            pr.replicate
        ORDER BY
            pr.temperature ASC,
            pr.replicate ASC;
        """
    query_params = {
        "selected_construct": selected_construct,
        "site_base": selected_site_base,
        "valtype_mod": selected_valtype_tc,
    }

    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(query, conn, params=query_params)
    finally:
        # Close even when the query fails, so the database handle never leaks.
        conn.close()

def mean_with_error(df, value_col='value', err_col='err'):
    """
    Average a column of measurements and propagate their uncertainties.

    The mean is the plain (unweighted) arithmetic mean of `value_col`.
    The uncertainty treats the N inputs as independent:
        err_mean = sqrt(err_1^2 + ... + err_N^2) / N
    Rows with a NaN in either column are discarded before anything is computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both `value_col` and `err_col`.
    value_col : str, default 'value'
        Name of the column with the measured values.
    err_col : str, default 'err'
        Name of the column with the 1-sigma uncertainty of each value.

    Returns
    -------
    mean : float
        Arithmetic mean of the retained values (NaN if none remain).
    err_mean : float
        Propagated uncertainty of that mean (NaN if none remain).
    n : int
        Number of rows that survived the NaN filter.
    """
    complete = df.dropna(subset=[value_col, err_col])
    count = len(complete)

    # Nothing left to average: signal it with NaNs and a zero count.
    if not count:
        return np.nan, np.nan, 0

    sum_sq_err = complete[err_col].pow(2).sum()
    return complete[value_col].mean(), np.sqrt(sum_sq_err) / count, count

In [3]:
# --- summarize ln_kobs by temperature (mean + propagated error) ---

def summarize_ln_kobs_by_temp(df, temp_col="temperature"):
    """
    Group a tc-fit dataframe by temperature and compute mean ln_kobs and its propagated error.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `temp_col`, `ln_kobs` and `ln_kobs_err`.
    temp_col : str, default "temperature"
        Column to group by. The output column is always named "temperature".

    Returns
    -------
    pandas.DataFrame
        One row per distinct value of `temp_col`, sorted ascending, with columns
        temperature, ln_kobs_mean, ln_kobs_err, n (as returned by
        `mean_with_error` for that group). If `df` has no groups (e.g. it is
        empty), an empty frame with these same four columns is returned.
    """
    summary_cols = ["temperature", "ln_kobs_mean", "ln_kobs_err", "n"]

    rows = []
    for temp, g in df.groupby(temp_col, sort=True):
        mean, err, n = mean_with_error(g, value_col="ln_kobs", err_col="ln_kobs_err")
        rows.append({
            "temperature": temp,
            "ln_kobs_mean": mean,
            "ln_kobs_err": err,
            "n": n
        })

    # Naming the columns explicitly keeps the schema when `rows` is empty;
    # otherwise the frame would have no "temperature" column and the sort
    # below would raise KeyError.
    summary = pd.DataFrame(rows, columns=summary_cols)
    return summary.sort_values("temperature").reset_index(drop=True)


# Per-replicate A18 tables for both constructs.
# ln_kobs is formed as log_kappa + log_kdeg, with the two fit errors combined in
# quadrature; the listed reaction groups (rg_id) are removed afterwards.
def _a18_ln_kobs(construct_name, excluded_rg_ids):
    """Fetch the 18_A 'modrate' fits of one construct, add ln_kobs columns, drop excluded rg_ids."""
    fits = fetch_tc_fit_params(NERD_SQLITE, construct_name, "18_A", selected_valtype_tc="modrate")
    fits = fits.assign(
        ln_kobs=lambda d: d["log_kappa"] + d["log_kdeg"],
        ln_kobs_err=lambda d: np.sqrt(d["log_kappa_err"]**2 + d["log_kdeg_err"]**2),
    )
    return fits[~fits["rg_id"].isin(excluded_rg_ids)]


A18_wt = _a18_ln_kobs("4U_wt", [10, 20, 32])
A18_a8c = _a18_ln_kobs("4U_a8c", [9])

# Per-temperature summaries, each tagged with its construct
A18_wt_sum = summarize_ln_kobs_by_temp(A18_wt).assign(construct='4U_wt')
A18_a8c_sum = summarize_ln_kobs_by_temp(A18_a8c).assign(construct='4U_a8c')

# Stack both summaries into one long-form table (WT rows first) and export it
A18_sum_long = pd.concat([A18_wt_sum, A18_a8c_sum], ignore_index=True)

os.makedirs('A18_kinetic_params', exist_ok=True)
A18_sum_long.to_csv('A18_kinetic_params/A18_ln_kobs_by_temp.csv', index=False)

In [4]:
# Sites to summarise for each construct (the two lists differ only at position 8: A vs C)
to_fit = {
    '4U_wt': [
        '7_A', '8_A', '9_C', '15_A', '16_A', '18_A',
        '22_A', '25_C', '26_A', '29_A', '34_A', '35_A',
    ],
    '4U_a8c': [
        '7_A', '8_C', '9_C', '15_A', '16_A', '18_A',
        '22_A', '25_C', '26_A', '29_A', '34_A', '35_A',
    ],
}

# Reaction groups (rg_id) left out of the averages, per construct
outlier_rg_ids = {'4U_wt': [10, 20, 32], '4U_a8c': [9]}

site_df_list = []
for construct, sites in to_fit.items():
    for site in sites:
        # Fetch the fits, derive ln_kobs (+ quadrature error), then drop outlier groups.
        site_df = (
            fetch_tc_fit_params(NERD_SQLITE, construct, site, selected_valtype_tc="modrate")
            .assign(
                ln_kobs=lambda d: d["log_kappa"] + d["log_kdeg"],
                ln_kobs_err=lambda d: np.sqrt(d["log_kappa_err"]**2 + d["log_kdeg_err"]**2),
            )
            .loc[lambda d: ~d["rg_id"].isin(outlier_rg_ids[construct])]
        )

        # One summary row per temperature, labelled with where it came from.
        site_sum = summarize_ln_kobs_by_temp(site_df).assign(construct=construct, site=site)
        site_df_list.append(site_sum)

# Combine every site/construct summary, discard rows containing NaN, and export.
os.makedirs('mean_lnkobs_data', exist_ok=True)
all_kinetic_params = pd.concat(site_df_list, ignore_index=True).dropna()
all_kinetic_params.to_csv('mean_lnkobs_data/all_lnkobs_by_temp.csv', index=False)