In [143]:
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Patch
import matplotlib.ticker as ticker
thesis_dir = '/home/azstephe/liverRegression/regression_liver/data/figs/'

colors = {
        'Rat': '#94c4be',      
        'Macaque': '#89a1b6',
        'Cow': '#d1c177',      
        'Pig': '#d59698',
        'Mouse': '#a78dc7',
        'Neg': '#a3a3a3'
}

super_super_lights = {
    'Rat': '#d9ecea',
    'Macaque': '#d5dde7',
    'Cow': '#f2ecd0',
    'Pig': '#f5d6d7',
    'Mouse': '#e2d8f0',
    'Neg': '#e0e0e0'
}

super_lights = {
    'Rat': '#cce4e0',
    'Macaque': '#c6d2de',
    'Cow': '#e8e0bb',
    'Pig': '#ebc5c7',
    'Mouse': '#d7cdea',
    'Neg': '#d1d1d1'

}

darks = {
        'Rat': '#7ea7a2',      
        'Macaque': '#74899b',
        'Cow': '#bfb164',      
        'Pig': '#b67076',
        'Mouse': '#8669a7',
        'Neg': '#8e8e8e'
}

lights = {
        'Rat': '#add0b0',      
        'Macaque': '#9eb9d1',
        'Cow': '#e3d289',      
        'Pig': '#f4b9bb',
        'Mouse': '#c6b3df',
        'Neg': '#b6b6b6'
}

super_darks = {
    'Rat': '#2e4946',       # dark teal
    'Macaque': '#2a3a46',   # dark slate blue
    'Cow': '#4d471f',       # dark olive
    'Pig': '#472d2f',       # dark rose
    'Mouse': '#5f4a77',      # dark gray
    'Neg': '#353535'      # dark gray
}

def histogram(pred_df, name, status):
    species = status.split(' ')[-1]
    plt.hist(pred_df, bins=50, color=colors[species])
    plt.xlim(0,4.1)
    plt.ylim(0, 1)
    plt.xlabel(f'{name}')
    plt.ylabel('Count')
    plt.title(f'Histogram of {status}')
    # plt.savefig(f"{thesis_dir}{direc}{status.replace(' ', '_')}.png", dpi=300, bbox_inches='tight')
    plt.show()

table = pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/figs/tables_mse/135log_model_eval_table_FINAL_mse.tsv', sep='\t')
mouseTable = pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/figs/tables_mse/mouse_135log_model_eval_table_FINAL_mse.tsv', sep='\t')

def locate(tab, species, group, metric, model):
    num_series = tab.loc[(tab['species'] == species) &
                         (tab['Group'] == group) &
                         (tab['Metric'] == metric)][model]
    
    # Use .item() to extract the single scalar value
    return num_series.item()

def correlations(tab, species, group, model):
    pearson_r = locate(tab, species, group, 'Pearson', model),
    pearson_p = locate(tab, species, group, 'Pearson_p', model),
    spearman_rho = locate(tab, species, group, 'Spearman', model),
    spearman_p = locate(tab, species, group, 'Spearman_p', model),
    # x = true_df.squeeze()
    # y = pred_df.squeeze()
    
    # pearson_corr, pearson_p_test = scipy.stats.pearsonr(x, y)
    # # print(f"Pearson correlation coefficient for {group}: {pearson_corr:.4f}, p-test: {pearson_p_test:.4g}")
    
    # spearman_corr, spearman_p_test = scipy.stats.spearmanr(x, y)
    # # print(f"Spearman correlation coefficient for {group}: {spearman_corr:.4f}, p-test: {spearman_p_test:.4g}")
    return float(pearson_r[0]), float(pearson_p[0]), float(spearman_rho[0]), float(spearman_p[0])

In [68]:
# test log scatter
from sklearn.linear_model import LinearRegression
import matplotlib.colors as mcolors
from matplotlib.colors import LinearSegmentedColormap, to_rgb

direc='test_scatter_9_25/'

def format_pval(p):
    return "0.0" if p == 0 else f"{p:.2e}"
    
def mean_squared_error(x, y):
    x = np.asarray(x)
    y = np.asarray(y)
    return np.mean((x - y) ** 2)

def format_value(metric_name, value):
    """Format values depending on whether it's a P-value metric or not."""
    # Check for '_p' which is more general for 'pearson_p', 'spearman_p' etc.
    if "_p" in metric_name:
        return f"{value:.2e}"  # Scientific notation for p-values
    else:
        return f"{value:.3g}"  # General format for other metrics
    
def make_smooth_colormap(name, colors_hex, n_points=256):
    colors_rgb = [to_rgb(c) for c in colors_hex]
    return LinearSegmentedColormap.from_list(name, colors_rgb, N=n_points)
    
def make_triple_colormap(base_hex, name):
    if name == 'Cow_triple':
        return make_smooth_colormap('Cow_contrast', [lights['Cow'], '#e07014', '#db3218', '#db0f6b'])
    elif name == 'Mouse_triple':
        return make_smooth_colormap('Mouse_contrast', [lights['Mouse'], '#4E3987', '#0055C2', '#09C3E8'])
    elif name == 'Pig_triple':
        return make_smooth_colormap('Pig_contrast', [lights['Pig'], '#9E494C', '#A61FA1', '#5613A8'])
    elif name == 'Macaque_triple':
        return make_smooth_colormap('Macaque_contrast', [lights['Macaque'], '#348394', '#1CC7B5', '#08C747'])
    elif name == 'Rat_triple':
        return make_smooth_colormap('Rat_contrast', [lights['Rat'], '#3F7028', '#D6C215', '#FF9E17'])
    else:
        return make_smooth_colormap(name + '_contrast', ['#ffffcc', base_hex, '#3b528b'])

triple_cmaps = {name: make_triple_colormap(hex_color, name + '_triple')
                for name, hex_color in colors.items()}

def scatter(pred_df, true_df, title, xlabel, ylabel, r, rP, rho, rhoP, mse, species):
    x = true_df.squeeze()
    y = pred_df.squeeze()

    gridsize=50
    
    plt.figure(figsize=(6, 6))
    plt.xlim(0, 4)
    plt.ylim(0, 4)
    plt.plot([0, 4], [0, 4], color='black', linestyle='--', linewidth=1, dashes=(10, 10))

    custom_cmap = triple_cmaps[species.capitalize()]
        
    hb = plt.hexbin(x, y, gridsize=gridsize, cmap=custom_cmap, mincnt=1)  # higher gridsize = finer bins
    cb = plt.colorbar(hb)

    # Set colorbar ticks to integers only
    cb.locator = ticker.MaxNLocator(integer=True)
    cb.update_ticks()

    cb.ax.tick_params(labelsize=18)

    plt.xlabel(f'{xlabel}', fontsize=22, labelpad=5)
    plt.ylabel(f'{ylabel}', fontsize=22, labelpad=5)
    
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)

    x = true_df.squeeze().values.reshape(-1, 1)
    y = pred_df.squeeze().values

    # Fit linear regression
    reg = LinearRegression().fit(x, y)
    slope = reg.coef_[0]
    intercept = reg.intercept_
    reg_line = reg.predict(x)
    
    plt.plot(x, reg_line, color='#57320A', linewidth=1.2, label=f'Linear Fit (slope = {slope:.2f})')

    textstr = f'$r$ = {r:.2f}\n$\\rho$ = {rho:.2f}\nmse = {mse:.2f}'
    plt.text(0.03, 0.97, textstr, transform=plt.gca().transAxes,
             fontsize=22, verticalalignment='top')
    textstr = f'm = {slope:.2f}'
    plt.text(0.03, 0.7, textstr, transform=plt.gca().transAxes,
             fontsize=22, verticalalignment='top', color='#57320A')

    ax = plt.gca()
    ax.set_aspect('equal', adjustable='box')
    ax.tick_params(width=0.5, length=4)
    for spine in ax.spines.values():
        spine.set_linewidth(0.5)

    ax.xaxis.set_major_locator(ticker.MultipleLocator(1.0))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1.0))
    ax.xaxis.set_minor_locator(ticker.MultipleLocator(0.5))
    ax.yaxis.set_minor_locator(ticker.MultipleLocator(0.5))
    
    # --- Add these lines to change tick length ---
    # Customize major ticks
    ax.tick_params(which='major', length=10, width=2)
    # Customize minor ticks
    ax.tick_params(which='minor', length=5, width=1)

    plt.tight_layout()

    # plt.savefig(f"{thesis_dir}{direc}{title.replace(' ', '_')}_density_{str(gridsize)}.tif", dpi=300, bbox_inches='tight')
    # plt.show()
    plt.close()
    return slope

results_corr = []
# species_list = ['macaque', 'rat', 'cow', 'pig']
species_list = ['cow']

model_list = ['bdbi7l3n', 'kf8188qf', 'cq45eb2s']
# model_list = ['bdbi7l3n']
for species in species_list:
    print(f'\n{species}')
    for model in model_list:
        print(f'\n{model}')
        mhc = 200
        if model == 'kf8188qf':
            mhc = 100
        
        # load all the DFs
        pred_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model}_FINAL/activations_{species}_TEST.csv', header=None)
        
        test1_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_LiuAll_test1/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test2_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test2/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test3_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test3/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        test1_len = 2*len(test1_df)
        test2_len = 2*len(test2_df)
        test3_len = 2*len(test3_df)
        
        # input sanity check
        if len(pred_df) != test1_len+test2_len+test3_len:
            print("ERROR1: predictions are a different length than testidation sets")
        
        doubled_test1_df = pd.concat([test1_df, test1_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test2_df = pd.concat([test2_df, test2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test3_df = pd.concat([test3_df, test3_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_test1_df = pred_df.head(test1_len)
        pred_test2_df = pred_df.iloc[test1_len:test1_len + test2_len]
        pred_test3_df = pred_df.tail(test3_len)

        #############
        pred_orthologs_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model}_FINAL/activations_{species}_TEST_orthologs.csv', header=None)

        pos_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_pos_LiuAll/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        neg_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/neg/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        neg_len = 2*len(neg_df)
        pos_len = 2*len(pos_df)
        
        # input sanity check
        if len(pred_orthologs_df) != neg_len+pos_len:
            print("ERROR2: predictions are a different length than testidation sets")

        doubled_neg_df = pd.concat([neg_df, neg_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_pos_df = pd.concat([pos_df, pos_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_pos_df = pred_orthologs_df.head(pos_len)
        pred_neg_df = pred_orthologs_df.tail(neg_len)

        
        test2R, test2RP, test2Rho, test2RhoP = correlations(pred_test2_df, doubled_test2_df, 'test2')
        
        test3R, test3RP, test3Rho, test3RhoP = correlations(pred_test3_df, doubled_test3_df, 'test3')
        
        orthoR, orthoRP, orthoRho, orthoRhoP = correlations(pred_pos_df, doubled_pos_df, 'mouse test orthologs')
        test2Mse = mean_squared_error(doubled_test2_df, pred_test2_df)
        test3Mse = mean_squared_error(doubled_test3_df, pred_test3_df)
        orthoMse = mean_squared_error(pred_pos_df, doubled_pos_df)

        slope2 = scatter(pred_test2_df, doubled_test2_df, f'{species.capitalize()} Prediction Accuracy for Test2 ({model})', 'Real', 'Predicted', test2R, test2RP, test2Rho, test2RhoP, test2Mse, species)
        slope3 = scatter(pred_test3_df, doubled_test3_df, f'{species.capitalize()} Prediction Accuracy for Test3 ({model})', 'Real', 'Predicted', test3R, test3RP, test3Rho, test3RhoP, test3Mse, species)
        slopeOrtho = scatter(pred_pos_df, doubled_pos_df, f'{species.capitalize()} Prediction Accuracy for orthologs ({model})', 'Real', 'Predicted', orthoR, orthoRP, orthoRho, orthoRhoP, orthoMse, species)

        
        results_corr.append({
            'species': species,
            'model': model,
            'set': 'test2',
            'pearson_r': test2R,
            'pearson_p': test2RP * mhc,
            'spearman_rho': test2Rho,
            'spearman_p': test2RhoP * mhc,
            'mse': test2Mse,
            'slope': slope2
        })
        results_corr.append({
            'species': species,
            'model': model,
            'set': 'test3',
            'pearson_r': test3R,
            'pearson_p': test3RP * mhc,
            'spearman_rho': test3Rho,
            'spearman_p': test3RhoP * mhc,
            'mse': test3Mse,
            'slope': slope3
        })
        results_corr.append({
            'species': species,
            'model': model,
            'set': 'orthologs',
            'pearson_r': orthoR,
            'pearson_p': orthoRP * mhc,
            'spearman_rho': orthoRho,
            'spearman_p': orthoRhoP * mhc,
            'mse': orthoMse,
            'slope': slopeOrtho
        })


species = 'mouse'
for model in model_list:
    mhc = 200
    if model == 'kf8188qf':
        mhc = 100
    pred_orthologs_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model}_FINAL/activations_{species}_TEST.csv', header=None)

    pos_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_pos/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
    neg_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/neg/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
    neg_len = 2*len(neg_df)
    pos_len = 2*len(pos_df)
        
        # input sanity check
    if len(pred_orthologs_df) != neg_len+pos_len:
        print("ERROR3: precdictions are a different length than testidation sets")

    doubled_neg_df = pd.concat([neg_df, neg_df]).sort_index(kind='mergesort').reset_index(drop=True)
    doubled_pos_df = pd.concat([pos_df, pos_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
    pred_pos_df = pred_orthologs_df.head(pos_len)
    pred_neg_df = pred_orthologs_df.tail(neg_len)

    orthoR, orthoRP, orthoRho, orthoRhoP = correlations(pred_pos_df, doubled_pos_df, 'mouse test')
    orthoMse = mean_squared_error(pred_pos_df, doubled_pos_df)

    slopeOrtho = scatter(pred_pos_df, doubled_pos_df, f'{species.capitalize()} Prediction Accuracy for orthologs ({model})', 'Real', 'Predicted', orthoR, orthoRP, orthoRho,orthoRhoP, orthoMse, species)
    
    results_corr.append({
            'species': species,
            'model': model,
            'set': 'orthologs',
            'pearson_r': orthoR,
            'pearson_p': orthoRP * mhc,
            'spearman_rho': orthoRho,
            'spearman_p': orthoRhoP * mhc,
            'mse': orthoMse,
            'slope': slopeOrtho
        })


results_corr_df = pd.DataFrame(results_corr)

df = results_corr_df
id_vars = ['species', 'model', 'set']
metric_vars = ['pearson_r', 'pearson_p', 'spearman_rho', 'spearman_p', 'mse', 'slope']

df_long = df.melt(
    id_vars=id_vars,
    value_vars=metric_vars,
    var_name='metric',
    value_name='value'
)

# 3. Pivot the long DataFrame to get models as columns
# The index levels will be species, set, and then the metric name
final_df = df_long.pivot_table(
    index=['species', 'set', 'metric'],
    columns='model',
    values='value'
)

# Optional: To ensure the metrics are in the original order,
# you can set the 'metric' column as a categorical type.
final_df = final_df.reindex(metric_vars, level='metric')
final_df_display = final_df.copy()

def format_value(metric_name, value):
    """Format values depending on whether it's a P-value metric or not."""
    # Check for '_p' which is more general for 'pearson_p', 'spearman_p' etc.
    if "_p" in metric_name:
        return f"{value:.2e}"  # Scientific notation for p-values
    else:
        return f"{value:.3g}"  # General format for other metrics

# Loop through each model's column to apply the formatting
for col in final_df_display.columns:
    final_df_display[col] = final_df_display.apply(
        # Access the 'metric' from the index using row.name[2]
        # (assuming it's the 3rd level of your index)
        lambda row: format_value(row.name[2], row[col]),
        axis=1
    )

# Display the formatted table
# final_df_display


cow

bdbi7l3n

kf8188qf

cq45eb2s


In [150]:
# test log scatter
from sklearn.linear_model import LinearRegression
import matplotlib.colors as mcolors
from matplotlib.colors import LinearSegmentedColormap, to_rgb

direc='test_scatter_9_25/'

def format_pval(p):
    return "0.0" if p == 0 else f"{p:.2e}"
    
def mean_squared_error(x, y):
    x = np.asarray(x)
    y = np.asarray(y)
    return np.mean((x - y) ** 2)

def format_value(metric_name, value):
    """Format values depending on whether it's a P-value metric or not."""
    # Check for '_p' which is more general for 'pearson_p', 'spearman_p' etc.
    if "_p" in metric_name:
        return f"{value:.2e}"  # Scientific notation for p-values
    else:
        return f"{value:.3g}"  # General format for other metrics
    
def make_smooth_colormap(name, colors_hex, n_points=256):
    colors_rgb = [to_rgb(c) for c in colors_hex]
    return LinearSegmentedColormap.from_list(name, colors_rgb, N=n_points)
    
def make_triple_colormap(base_hex, name):
    if name == 'Cow_triple':
        return make_smooth_colormap('Cow_contrast', [lights['Cow'], '#e07014', '#db3218', '#db0f6b'])
    elif name == 'Mouse_triple':
        return make_smooth_colormap('Mouse_contrast', [lights['Mouse'], '#4E3987', '#0055C2', '#09C3E8'])
    elif name == 'Pig_triple':
        return make_smooth_colormap('Pig_contrast', [lights['Pig'], '#9E494C', '#A61FA1', '#5613A8'])
    elif name == 'Macaque_triple':
        return make_smooth_colormap('Macaque_contrast', [lights['Macaque'], '#348394', '#1CC7B5', '#08C747'])
    elif name == 'Rat_triple':
        return make_smooth_colormap('Rat_contrast', [lights['Rat'], '#3F7028', '#D6C215', '#FF9E17'])
    else:
        return make_smooth_colormap(name + '_contrast', ['#ffffcc', base_hex, '#3b528b'])

triple_cmaps = {name: make_triple_colormap(hex_color, name + '_triple')
                for name, hex_color in colors.items()}

def scatter(pred_df, true_df, title, xlabel, ylabel, r, rP, rho, rhoP, mse, species):
    x = true_df.squeeze()
    y = pred_df.squeeze()

    gridsize=50
    
    plt.figure(figsize=(6, 6))
    plt.xlim(0, 4)
    plt.ylim(0, 4)
    plt.plot([0, 4], [0, 4], color='black', linestyle='--', linewidth=1, dashes=(10, 10))

    custom_cmap = triple_cmaps[species.capitalize()]
        
    hb = plt.hexbin(x, y, gridsize=gridsize, cmap=custom_cmap, mincnt=1)  # higher gridsize = finer bins
    cb = plt.colorbar(hb)

    # Set colorbar ticks to integers only
    cb.locator = ticker.MaxNLocator(integer=True)
    cb.update_ticks()

    cb.ax.tick_params(labelsize=18)

    plt.xlabel(f'{xlabel}', fontsize=22, labelpad=5)
    plt.ylabel(f'{ylabel}', fontsize=22, labelpad=5)
    
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)

    x = true_df.squeeze().values.reshape(-1, 1)
    y = pred_df.squeeze().values

    # Fit linear regression
    reg = LinearRegression().fit(x, y)
    slope = reg.coef_[0]
    intercept = reg.intercept_
    reg_line = reg.predict(x)
    
    plt.plot(x, reg_line, color='#57320A', linewidth=1.2, label=f'Linear Fit (slope = {slope:.2f})')

    textstr = f'$r$ = {r:.2f}\n$\\rho$ = {rho:.2f}\nmse = {mse:.2f}'
    plt.text(0.03, 0.97, textstr, transform=plt.gca().transAxes,
             fontsize=22, verticalalignment='top')
    textstr = f'm = {slope:.2f}'
    plt.text(0.03, 0.7, textstr, transform=plt.gca().transAxes,
             fontsize=22, verticalalignment='top', color='#57320A')

    ax = plt.gca()
    ax.set_aspect('equal', adjustable='box')
    ax.tick_params(width=0.5, length=4)
    for spine in ax.spines.values():
        spine.set_linewidth(0.5)

    ax.xaxis.set_major_locator(ticker.MultipleLocator(1.0))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1.0))
    ax.xaxis.set_minor_locator(ticker.MultipleLocator(0.5))
    ax.yaxis.set_minor_locator(ticker.MultipleLocator(0.5))
    
    # --- Add these lines to change tick length ---
    # Customize major ticks
    ax.tick_params(which='major', length=10, width=2)
    # Customize minor ticks
    ax.tick_params(which='minor', length=5, width=1)

    plt.tight_layout()

    plt.savefig(f"{thesis_dir}{direc}{title.replace(' ', '_')}_density_{str(gridsize)}.tif", dpi=300, bbox_inches='tight')
    # plt.show()
    plt.close()
    return slope
    
table = pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/figs/tables_mse/135log_model_eval_table_FINAL_mse.tsv', sep='\t')
mouseTable = pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/figs/tables_mse/mouse_135log_model_eval_table_FINAL_mse.tsv', sep='\t')
    
results_corr = []
species_list = ['macaque', 'rat', 'cow', 'pig']
# species_list = ['cow']
model_list = ['bdbi7l3n', 'kf8188qf', 'cq45eb2s']
# model_list = ['bdbi7l3n']
for species in species_list:
    print(f'\n{species}')
    for model in model_list:
        print(f'\n{model}')
        # load all the DFs
        pred_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model}_FINAL/activations_{species}_TEST.csv', header=None)
        
        test1_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_LiuAll_test1/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test2_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test2/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test3_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test3/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        test1_len = 2*len(test1_df)
        test2_len = 2*len(test2_df)
        test3_len = 2*len(test3_df)
        
        # input sanity check
        if len(pred_df) != test1_len+test2_len+test3_len:
            print("ERROR1: predictions are a different length than testidation sets")
        
        doubled_test1_df = pd.concat([test1_df, test1_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test2_df = pd.concat([test2_df, test2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test3_df = pd.concat([test3_df, test3_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_test1_df = pred_df.head(test1_len)
        pred_test2_df = pred_df.iloc[test1_len:test1_len + test2_len]
        pred_test3_df = pred_df.tail(test3_len)

        #############
        pred_orthologs_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model}_FINAL/activations_{species}_TEST_orthologs.csv', header=None)

        pos_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_pos_LiuAll/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        neg_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/neg/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        neg_len = 2*len(neg_df)
        pos_len = 2*len(pos_df)
        
        # input sanity check
        if len(pred_orthologs_df) != neg_len+pos_len:
            print("ERROR2: predictions are a different length than testidation sets")

        doubled_neg_df = pd.concat([neg_df, neg_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_pos_df = pd.concat([pos_df, pos_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_pos_df = pred_orthologs_df.head(pos_len)
        pred_neg_df = pred_orthologs_df.tail(neg_len)

        test2R, test2RP, test2Rho, test2RhoP = correlations(table, species, 'Test2', model)
        
        test3R, test3RP, test3Rho, test3RhoP = correlations(table, species, 'Test3', model)
        
        orthoR, orthoRP, orthoRho, orthoRhoP = correlations(table, species, 'Test', model)

        test2Mse = locate(table, species, 'Test2', 'MSE', model)
        test3Mse = locate(table, species, 'Test3', 'MSE', model)
        orthoMse = locate(table, species, 'Test', 'MSE', model)

        slope2 = scatter(pred_test2_df, doubled_test2_df, f'{species.capitalize()} Prediction Accuracy for Test2 ({model})', 'Real', 'Predicted', test2R, test2RP, test2Rho, test2RhoP, test2Mse, species)
        slope3 = scatter(pred_test3_df, doubled_test3_df, f'{species.capitalize()} Prediction Accuracy for Test3 ({model})', 'Real', 'Predicted', test3R, test3RP, test3Rho, test3RhoP, test3Mse, species)
        slopeOrtho = scatter(pred_pos_df, doubled_pos_df, f'{species.capitalize()} Prediction Accuracy for orthologs ({model})', 'Real', 'Predicted', orthoR, orthoRP, orthoRho, orthoRhoP, orthoMse, species)       
        
        results_corr.append({
            'species': species,
            'model': model,
            'set': 'test2',
            'pearson_r': locate(table, species, 'Test2', 'Pearson', model),
            'pearson_p': locate(table, species, 'Test2', 'Pearson_p', model),
            'spearman_rho': locate(table, species, 'Test2', 'Spearman', model),
            'spearman_p': locate(table, species, 'Test2', 'Spearman_p', model),
            'mse': locate(table, species, 'Test2', 'MSE', model),
            'slope': slope2
        })
        results_corr.append({
            'species': species,
            'model': model,
            'set': 'test3',
            'pearson_r': locate(table, species, 'Test3', 'Pearson', model),
            'pearson_p': locate(table, species, 'Test3', 'Pearson_p', model),
            'spearman_rho': locate(table, species, 'Test3', 'Spearman', model),
            'spearman_p': locate(table, species, 'Test3', 'Spearman_p', model),
            'mse': locate(table, species, 'Test3', 'MSE', model),
            'slope': slope3
        })
        results_corr.append({
            'species': species,
            'model': model,
            'set': 'orthologs',
            'pearson_r': locate(table, species, 'Test', 'Pearson', model),
            'pearson_p': locate(table, species, 'Test', 'Pearson_p', model),
            'spearman_rho': locate(table, species, 'Test', 'Spearman', model),
            'spearman_p': locate(table, species, 'Test', 'Spearman_p', model),
            'mse': locate(table, species, 'Test', 'MSE', model),
            'slope': slopeOrtho
        })


species = 'mouse'
for model in model_list:
    pred_orthologs_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model}_FINAL/activations_{species}_TEST.csv', header=None)

    pos_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_pos/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
    neg_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/neg/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
    neg_len = 2*len(neg_df)
    pos_len = 2*len(pos_df)
        
        # input sanity check
    if len(pred_orthologs_df) != neg_len+pos_len:
        print("ERROR3: precdictions are a different length than testidation sets")

    doubled_neg_df = pd.concat([neg_df, neg_df]).sort_index(kind='mergesort').reset_index(drop=True)
    doubled_pos_df = pd.concat([pos_df, pos_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
    pred_pos_df = pred_orthologs_df.head(pos_len)
    pred_neg_df = pred_orthologs_df.tail(neg_len)

    orthoR, orthoRP, orthoRho, orthoRhoP = correlations(mouseTable, species, 'Test', model)
    orthoMse = locate(mouseTable, species, 'Test', 'MSE', model)

    slopeOrtho = scatter(pred_pos_df, doubled_pos_df, f'{species.capitalize()} Prediction Accuracy for orthologs ({model})', 'Real', 'Predicted', orthoR, orthoRP, orthoRho,orthoRhoP, orthoMse, species)
    
    results_corr.append({
            'species': species,
            'model': model,
            'set': 'orthologs',
            'pearson_r': locate(mouseTable, species, 'Test', 'Pearson', model),
            'pearson_p': locate(mouseTable, species, 'Test', 'Pearson_p', model),
            'spearman_rho': locate(mouseTable, species, 'Test', 'Spearman', model),
            'spearman_p': locate(mouseTable, species, 'Test', 'Spearman_p', model),
            'mse': locate(mouseTable, species, 'Test', 'MSE', model),
            'slope': slopeOrtho
        })


results_corr_df = pd.DataFrame(results_corr)

df = results_corr_df
id_vars = ['species', 'model', 'set']
metric_vars = ['pearson_r', 'pearson_p', 'spearman_rho', 'spearman_p', 'mse', 'slope']

df_long = df.melt(
    id_vars=id_vars,
    value_vars=metric_vars,
    var_name='metric',
    value_name='value'
)

# 3. Pivot the long DataFrame to get models as columns
# The index levels will be species, set, and then the metric name
final_df = df_long.pivot_table(
    index=['species', 'set', 'metric'],
    columns='model',
    values='value'
)

# Optional: To ensure the metrics are in the original order,
# you can set the 'metric' column as a categorical type.
final_df = final_df.reindex(metric_vars, level='metric')
final_df_display = final_df.copy()

def format_value(metric_name, value):
    """Format values depending on whether it's a P-value metric or not."""
    # Check for '_p' which is more general for 'pearson_p', 'spearman_p' etc.
    if "_p" in metric_name:
        return f"{value:.2e}"  # Scientific notation for p-values
    else:
        return f"{value:.3g}"  # General format for other metrics

# Loop through each model's column to apply the formatting
for col in final_df_display.columns:
    final_df_display[col] = final_df_display.apply(
        # Access the 'metric' from the index using row.name[2]
        # (assuming it's the 3rd level of your index)
        lambda row: format_value(row.name[2], row[col]),
        axis=1
    )

# Display the formatted table
# final_df_display



macaque

bdbi7l3n

kf8188qf

cq45eb2s

rat

bdbi7l3n

kf8188qf

cq45eb2s

cow

bdbi7l3n

kf8188qf

cq45eb2s

pig

bdbi7l3n

kf8188qf

cq45eb2s


In [151]:
final_df_display

Unnamed: 0_level_0,Unnamed: 1_level_0,model,bdbi7l3n,cq45eb2s,kf8188qf
species,set,metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cow,orthologs,pearson_r,0.385,0.475,0.42
cow,orthologs,pearson_p,1.34e-89,2.35e-142,1.58e-108
cow,orthologs,spearman_rho,0.391,0.485,0.432
cow,orthologs,spearman_p,8.86e-93,9.78e-150,4.19e-115
cow,orthologs,mse,2.32,1.26,1.14
...,...,...,...,...,...
rat,test3,pearson_p,3.38e-47,4.56e-79,1.03e-74
rat,test3,spearman_rho,0.309,0.398,0.384
rat,test3,spearman_p,1.86e-44,7.61e-77,3.40e-71
rat,test3,mse,1.8,0.898,0.884


In [99]:
results_corr_df

Unnamed: 0,species,model,set,pearson_r,pearson_p,spearman_rho,spearman_p,mse,slope
0,cow,bdbi7l3n,test2,"25 0.364 Name: bdbi7l3n, dtype: float64","26 1.690000e-20 Name: bdbi7l3n, dtype: float64","27 0.364 Name: bdbi7l3n, dtype: float64","28 1.730000e-20 Name: bdbi7l3n, dtype: float64","29 2.28 Name: bdbi7l3n, dtype: float64",0.21382
1,cow,bdbi7l3n,test3,"30 0.319 Name: bdbi7l3n, dtype: float64","31 4.220000e-32 Name: bdbi7l3n, dtype: float64","32 0.321 Name: bdbi7l3n, dtype: float64","33 1.100000e-32 Name: bdbi7l3n, dtype: float64","34 2.22 Name: bdbi7l3n, dtype: float64",0.184987
2,cow,bdbi7l3n,orthologs,"10 0.385 Name: bdbi7l3n, dtype: float64","11 1.340000e-89 Name: bdbi7l3n, dtype: float64","12 0.391 Name: bdbi7l3n, dtype: float64","13 8.860000e-93 Name: bdbi7l3n, dtype: float64","14 2.32 Name: bdbi7l3n, dtype: float64",0.225692
3,mouse,bdbi7l3n,orthologs,"10 0.496 Name: bdbi7l3n, dtype: float64","11 0.0 Name: bdbi7l3n, dtype: float64","12 0.502 Name: bdbi7l3n, dtype: float64","13 0.0 Name: bdbi7l3n, dtype: float64","14 1.44 Name: bdbi7l3n, dtype: float64",0.243126


In [117]:
table.loc[(table['species'] == 'cow') &
               (table['Group'] == 'Train') &
               (table['Metric'] == 'Pearson')]['bdbi7l3n'][0]

0.406

In [118]:
def locate(tab, species, group, metric, model):
    num = tab.loc[(tab['species'] == species) &
                       (tab['Group'] == group) &
                       (tab['Metric'] == metric)][model]
    return num

locate(table, 'cow', 'Test2', 'Pearson', 'bdbi7l3n')

0.406

In [130]:
def locate(tab, species, group, metric, model):
    num_series = tab.loc[(tab['species'] == species) &
                         (tab['Group'] == group) &
                         (tab['Metric'] == metric)][model]
    
    # Use .item() to extract the single scalar value
    return num_series.item()
locate(table, 'cow', 'Test2', 'Pearson', 'bdbi7l3n')

0.364