In [1]:
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
import numpy as np

def pearson_spearman(x, y):
    """Print the Pearson and Spearman correlation of ``x`` against ``y``.

    Each statistic is reported on its own line together with the p-value
    returned by the corresponding ``scipy.stats`` test. Nothing is returned.
    """
    r, r_pvalue = scipy.stats.pearsonr(x, y)
    print(f"Pearson correlation coefficient: {r:.4f}, p-value: {r_pvalue:.4g}")

    rho, rho_pvalue = scipy.stats.spearmanr(x, y)
    print(f"Spearman correlation coefficient: {rho:.4f}, p-value: {rho_pvalue:.4g}")
    
def correlations(pred_df, true_df, group, species_name, model_name):
    """Build a one-row summary of how well predictions track the true values.

    Parameters
    ----------
    pred_df, true_df : pandas objects
        Predicted and true values; each is reduced with ``.squeeze()`` before
        the correlation is computed.
    group, species_name, model_name
        Stored verbatim in the ``group``, ``species`` and ``model`` columns.

    Returns
    -------
    pandas.DataFrame
        A single row with columns ``group``, ``species``, ``model``,
        ``pearson_corr`` and ``spearman_corr`` (p-values are not reported).
    """
    truth = true_df.squeeze()
    predicted = pred_df.squeeze()

    # Only the coefficients are kept; the p-values are deliberately dropped.
    r_value, _r_pvalue = scipy.stats.pearsonr(truth, predicted)
    rho_value, _rho_pvalue = scipy.stats.spearmanr(truth, predicted)

    record = {
        "group": group,
        "species": species_name,
        "model": model_name,
        "pearson_corr": r_value,
        "spearman_corr": rho_value,
    }
    return pd.DataFrame([record])
    
def scatter(pred_df, true_df, title, xlabel, ylabel, lim=(0, 4)):
    """Scatter predicted against true values with a dashed ``y = x`` reference.

    Parameters
    ----------
    pred_df, true_df : pandas objects
        Predicted (y axis) and true (x axis) values; each is reduced with
        ``.squeeze()`` before plotting.
    title, xlabel, ylabel
        Figure title and axis labels.
    lim : tuple of (low, high), optional
        Window applied to both axes and used as the end points of the
        reference line. Defaults to ``(0, 4)``, the previously hard-coded range.
    """
    x = true_df.squeeze()
    y = pred_df.squeeze()
    low, high = lim

    plt.figure(figsize=(8, 6))
    plt.xlim(low, high)
    plt.ylim(low, high)
    plt.plot([low, high], [low, high], color='#015088', linestyle='--', label='y = x')
    plt.scatter(x, y, alpha=0.2)
    plt.xlabel(f'{xlabel}')
    plt.ylabel(f'{ylabel}')
    plt.title(f'{title}')
    # The reference line carries a label; without a legend it was never shown.
    plt.legend()
    plt.grid(True)
    plt.show()

def histogram(pred_df, title, xlabel):
    """Draw a 50-bin histogram of ``pred_df`` on the current axes and show it.

    The x axis is clipped to the 0-4 window and the y axis is labelled 'Count'.
    """
    # Work on the current Axes directly; the pyplot helpers used previously
    # delegate to these same Axes methods.
    axes = plt.gca()
    axes.hist(pred_df, bins=50)
    axes.set_xlim(0, 4)
    axes.set_xlabel(f'{xlabel}')
    axes.set_ylabel('Count')
    axes.set_title(f'{title}')
    plt.show()

def add_stats(df):
    """Append row-wise ``mean``, ``range`` and ``std`` columns to ``df`` in place.

    All three statistics are computed from the same set of value columns,
    i.e. every column except ones already named ``mean``, ``range`` or ``std``.
    Previously each statistic was computed after the preceding one had been
    written into ``df``, so ``std`` was taken over the values plus the freshly
    added ``mean`` and ``range`` columns, and re-running the cell compounded
    the error.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are summarised; it is modified in place.

    Returns
    -------
    None
    """
    stat_names = ['mean', 'range', 'std']
    # Exclude previously added statistic columns so the call is idempotent.
    values = df.loc[:, ~df.columns.isin(stat_names)]

    # Compute everything before assigning anything, so no statistic can leak
    # into the input of another.
    row_mean = values.mean(axis=1)
    row_range = values.max(axis=1) - values.min(axis=1)
    row_std = values.std(axis=1)

    df['mean'] = row_mean
    df['range'] = row_range
    df['std'] = row_std

def do_scatter(x, y, xlabel, ylabel, title, combined):
    """Scatter ``y`` against ``x`` with points coloured by ``combined['std']``.

    Parameters
    ----------
    x, y : array-like
        Point coordinates.
    xlabel, ylabel, title
        Axis labels and figure title.
    combined : pandas.DataFrame
        Must provide a ``std`` column with one value per plotted point; it
        drives the 'coolwarm' colour map (blue to red) and the colour bar.
    """
    # Draw the points once. An extra uncoloured layer used to be plotted
    # underneath, which tinted the colour-mapped markers and doubled the work.
    sc = plt.scatter(x, y, c=combined['std'], cmap='coolwarm', alpha=0.6)
    plt.colorbar(sc, label='Standard Deviation')  # colour-bar legend for std
    plt.xlabel(f'{xlabel}')
    plt.ylabel(f'{ylabel}')
    plt.title(f'{title}')
    plt.grid(True)
    plt.show()
    
def bayesian_scatter(combined, xaxis, yaxis):
    """Plot ``yaxis`` against ``xaxis`` for all rows, then per validation label.

    Four figures are produced through ``do_scatter``: one for the whole
    ``combined`` frame ('predicted all'), followed by one for each of the rows
    whose ``label`` column equals 'val1', 'val2' and 'val3'.
    """
    x_name = f'{xaxis}'
    y_name = f'{yaxis}'

    do_scatter(combined[x_name], combined[y_name], x_name, y_name, 'predicted all', combined)

    for split in ('val1', 'val2', 'val3'):
        subset = combined[combined['label'] == split]
        do_scatter(subset[x_name], subset[y_name], x_name, y_name, f'predicted {split}', subset)

In [15]:
# 5 BEST LOG MODELS
import pandas as pd
import scipy.stats

def correlate():
    """Correlate predictions with true values for every evaluation group.

    Reads the ``pred_*`` / ``doubled_*`` variables from the notebook's global
    scope (they are populated by the main loading loop) and returns a long
    DataFrame with columns ``Group``, ``Metric`` ('Pearson' or 'Spearman') and
    ``Value``: two rows per group, in the order listed below.
    """
    # (group name, predictions, true values)
    datasets = (
        ('Train', pred_trainPos, doubled_trainPos),
        ('Validation', pred_valPos, doubled_valPos),
        ('Test', pred_testPos, doubled_testPos),
        ('Val2', pred_val2_df, doubled_val2_df),
        ('Val3', pred_val3_df, doubled_val3_df),
        ('Test2', pred_test2_df, doubled_test2_df),
        ('Test3', pred_test3_df, doubled_test3_df),
        ('Train Cow+Pig', pred_train4_df, doubled_train4_df),
        ('Val Cow+Pig', pred_val4_df, doubled_val4_df),
        ('Test Cow+Pig', pred_test4_df, doubled_test4_df),
    )

    records = []
    for name, predicted, observed in datasets:
        truth = observed.squeeze()
        estimate = predicted.squeeze()
        r_value, _ = scipy.stats.pearsonr(truth, estimate)
        rho_value, _ = scipy.stats.spearmanr(truth, estimate)
        records.extend([
            {'Group': name, 'Metric': 'Pearson', 'Value': r_value},
            {'Group': name, 'Metric': 'Spearman', 'Value': rho_value},
        ])

    return pd.DataFrame(records)

def negatives():
    """Average prediction for each negative / reference group.

    Reads the ``pred_*`` frames from the notebook's global scope and takes the
    mean of the first column of each. Returns a DataFrame with columns
    ``Group``, ``Metric`` (always 'Avg Neg Prediction') and ``Value``.
    """
    # (group name, frame of predictions)
    named_frames = (
        ('Train neg', pred_trainNeg),
        ('Val neg', pred_valNeg),
        ('Test neg', pred_testNeg),
        ('Val1 avg pred', pred_val1_df),
        ('Test1 avg pred', pred_test1_df),
        ('Train Cow+Pig Pred', pred_train5_df),
        ('Val Cow+Pig Pred', pred_val5_df),
        ('Test Cow+Pig Pred', pred_test5_df),
    )

    return pd.DataFrame([
        {'Group': name, 'Metric': 'Avg Neg Prediction', 'Value': frame.mean().iloc[0]}
        for name, frame in named_frames
    ])

# --- Main Script ---
# Loads, for every (species, model) pair, the model activations and the
# matching label files into notebook-global variables, then calls correlate()
# and negatives(), which read those globals.

# Root of the project data tree; every input path below is built from it.
DATA_ROOT = '/home/azstephe/liverRegression/regression_liver/data'

# File-name stem of the negative set for each species (rat is the fallback).
NEG_STEMS = {
    'rat': 'nonRat_liver_andMacaque_andCow_andPig',
    'macaque': 'nonMacaque_liver_andRat_andCow_andPig',
    'cow': 'nonCow_liver_andMacaque_andRat_andPig',
    'pig': 'nonPig_liver_andMacaque_andRat_andCow',
}


def _read_signal(path):
    """Return column 4 of a header-less, whitespace-delimited file as a Series."""
    # sep=r'\s+' is the documented equivalent of the deprecated
    # delim_whitespace=True.
    return pd.read_csv(path, header=None, sep=r'\s+').iloc[:, 4]


def _read_preds(model_dir, tag):
    """Read the header-less activations CSV ``activations_<tag>.csv`` of a model."""
    return pd.read_csv(f'{DATA_ROOT}/model_outputs/{model_dir}/activations_{tag}.csv', header=None)


def _doubled(values):
    """Repeat every entry twice in place (a, a, b, b, ...) with a fresh index."""
    # The stable sort on the duplicated index interleaves the two copies.
    # NOTE(review): presumably the activations hold two rows per region - confirm.
    return pd.concat([values, values]).sort_index(kind='mergesort').reset_index(drop=True)


def _check_length(label, species, model, preds, expected_len):
    """Print a warning when ``preds`` does not have ``expected_len`` rows."""
    if len(preds) != expected_len:
        print(f"ERROR {label} ({species}, {model}): predictions are a different length "
              f"than the label sets ({len(preds)} != {expected_len})")


all_results = []
neg_results = []
species_list = ['macaque', 'rat', 'cow', 'pig']
model_list = ['bdbi7l3n', '7vsdq5k2', 'wnfdrgcc', '8i7h7nsh', 'ph4wrpxu']

#############################################################################
# cow+pig label files depend on neither species nor model: load them once.
train4Pos_df = _read_signal(f'{DATA_ROOT}/splits/cow_pig/cow_pig_liver_pos_mouse_macaque_rat_closed_TRAIN_chromName_500bp.bed')
train5Neg_df = _read_signal(f'{DATA_ROOT}/splits/cow_pig/cow_pig_liver_neg_mouse_macaque_rat_open_TRAIN_chromName_500bp.bed')
val4Pos_df = _read_signal(f'{DATA_ROOT}/splits/cow_pig/cow_pig_liver_pos_mouse_macaque_rat_closed_VAL_chromName_500bp.bed')
val5Neg_df = _read_signal(f'{DATA_ROOT}/splits/cow_pig/cow_pig_liver_neg_mouse_macaque_rat_open_VAL_chromName_500bp.bed')
test4Pos_df = _read_signal(f'{DATA_ROOT}/test_splits/log_test4/cow_pig_liver_pos_TEST_500bp.bed')
test5Neg_df = _read_signal(f'{DATA_ROOT}/test_splits/log_test5/cow_pig_liver_neg_TEST_500bp.bed')

train4_len = 2 * len(train4Pos_df)
train5_len = 2 * len(train5Neg_df)
val4_len = 2 * len(val4Pos_df)
val5_len = 2 * len(val5Neg_df)
test4_len = 2 * len(test4Pos_df)
test5_len = 2 * len(test5Neg_df)

doubled_train4_df = _doubled(train4Pos_df)
doubled_train5_df = _doubled(train5Neg_df)
doubled_val4_df = _doubled(val4Pos_df)
doubled_val5_df = _doubled(val5Neg_df)
doubled_test4_df = _doubled(test4Pos_df)
doubled_test5_df = _doubled(test5Neg_df)

for species in species_list:
    #########################################################################
    # Label files depend only on the species: load them once per species.
    neg_stem = NEG_STEMS.get(species, NEG_STEMS['rat'])
    negTrainPath = f'{DATA_ROOT}/splits/negatives/{neg_stem}_TRAIN_500bp.bed'
    negValPath = f'{DATA_ROOT}/splits/negatives/{neg_stem}_VAL_500bp.bed'

    # TRAIN labels
    trainPos = _read_signal(f'{DATA_ROOT}/splits/logPos/{species}_liver_TRAINONLY.narrowPeak')
    trainNeg = _read_signal(negTrainPath)
    trainPos_len = 2 * len(trainPos)
    trainNeg_len = 2 * len(trainNeg)
    doubled_trainPos = _doubled(trainPos)
    doubled_trainNeg = _doubled(trainNeg)

    # VAL ORTHO labels
    valPos = _read_signal(f'{DATA_ROOT}/splits/logPos/{species}_liver_VAL.narrowPeak')
    valNeg = _read_signal(negValPath)
    valPos_len = 2 * len(valPos)
    valNeg_len = 2 * len(valNeg)
    doubled_valPos = _doubled(valPos)
    doubled_valNeg = _doubled(valNeg)

    # TEST ORTHO labels
    testPos = _read_signal(f'{DATA_ROOT}/test_splits/log_pos_LiuAll/{species}_liver_TEST_500bp.bed')
    testNeg = _read_signal(f'{DATA_ROOT}/test_splits/neg/{species}_liver_TEST_500bp.bed')
    testPos_len = 2 * len(testPos)
    testNeg_len = 2 * len(testNeg)
    doubled_testPos = _doubled(testPos)
    doubled_testNeg = _doubled(testNeg)

    # VAL labels (val1 / val2 / val3)
    val1_df = _read_signal(f'{DATA_ROOT}/val_splits/val1/{species}_liver_VAL_500bp.bed')
    val2_df = _read_signal(f'{DATA_ROOT}/splits/log_val2/{species}_liver_VAL.narrowPeak')
    val3_df = _read_signal(f'{DATA_ROOT}/splits/log_val3/{species}_liver_VAL.narrowPeak')
    val1_len = 2 * len(val1_df)
    val2_len = 2 * len(val2_df)
    val3_len = 2 * len(val3_df)
    doubled_val1_df = _doubled(val1_df)
    doubled_val2_df = _doubled(val2_df)
    doubled_val3_df = _doubled(val3_df)

    # TEST labels (test1 / test2 / test3)
    test1_df = _read_signal(f'{DATA_ROOT}/test_splits/log_LiuAll_test1/{species}_liver_TEST_500bp.bed')
    test2_df = _read_signal(f'{DATA_ROOT}/test_splits/log_test2/{species}_liver_TEST_500bp.bed')
    test3_df = _read_signal(f'{DATA_ROOT}/test_splits/log_test3/{species}_liver_TEST_500bp.bed')
    test1_len = 2 * len(test1_df)
    test2_len = 2 * len(test2_df)
    test3_len = 2 * len(test3_df)
    doubled_test1_df = _doubled(test1_df)
    doubled_test2_df = _doubled(test2_df)
    doubled_test3_df = _doubled(test3_df)

    for model in model_list:
        model_dir = f'{model}_FINAL'
        # Activations are sliced by position: positives first, negatives last
        # (or val1 / val2 / val3 and test1 / test2 / test3 in that order).

        # TRAIN predictions
        pred_TRAIN = _read_preds(model_dir, f'{species}_TRAIN')
        _check_length('TRAIN', species, model, pred_TRAIN, trainPos_len + trainNeg_len)
        pred_trainPos = pred_TRAIN.head(trainPos_len)
        pred_trainNeg = pred_TRAIN.tail(trainNeg_len)

        # VAL ORTHO predictions
        pred_VAL_ortho = _read_preds(model_dir, f'{species}_VAL_orthologs')
        _check_length('VALORTHO', species, model, pred_VAL_ortho, valPos_len + valNeg_len)
        pred_valPos = pred_VAL_ortho.head(valPos_len)
        pred_valNeg = pred_VAL_ortho.tail(valNeg_len)

        # TEST ORTHO predictions
        pred_TEST_ortho = _read_preds(model_dir, f'{species}_TEST_orthologs')
        _check_length('TEST ORTHO', species, model, pred_TEST_ortho, testPos_len + testNeg_len)
        pred_testPos = pred_TEST_ortho.head(testPos_len)
        pred_testNeg = pred_TEST_ortho.tail(testNeg_len)

        # VAL predictions
        pred_VAL = _read_preds(model_dir, f'{species}_VAL')
        _check_length('VAL', species, model, pred_VAL, val1_len + val2_len + val3_len)
        pred_val1_df = pred_VAL.head(val1_len)
        pred_val2_df = pred_VAL.iloc[val1_len:val1_len + val2_len]
        pred_val3_df = pred_VAL.tail(val3_len)

        # TEST predictions
        pred_TEST = _read_preds(model_dir, f'{species}_TEST')
        _check_length('TEST', species, model, pred_TEST, test1_len + test2_len + test3_len)
        pred_test1_df = pred_TEST.head(test1_len)
        pred_test2_df = pred_TEST.iloc[test1_len:test1_len + test2_len]
        pred_test3_df = pred_TEST.tail(test3_len)

        # cow+pig TRAIN predictions
        pred_cow_pig_TRAIN = _read_preds(model_dir, 'cow_pig_TRAIN')
        _check_length('COW+PIG TRAIN', species, model, pred_cow_pig_TRAIN, train4_len + train5_len)
        pred_train4_df = pred_cow_pig_TRAIN.head(train4_len)
        pred_train5_df = pred_cow_pig_TRAIN.tail(train5_len)

        # cow+pig VAL predictions
        pred_cow_pig_VAL = _read_preds(model_dir, 'cow_pig_VAL')
        _check_length('COW+PIG VAL', species, model, pred_cow_pig_VAL, val4_len + val5_len)
        pred_val4_df = pred_cow_pig_VAL.head(val4_len)
        pred_val5_df = pred_cow_pig_VAL.tail(val5_len)

        # cow+pig TEST predictions
        pred_cow_pig_TEST = _read_preds(model_dir, 'cow_pig_TEST')
        _check_length('COW+PIG TEST', species, model, pred_cow_pig_TEST, test4_len + test5_len)
        pred_test4_df = pred_cow_pig_TEST.head(test4_len)
        pred_test5_df = pred_cow_pig_TEST.tail(test5_len)

        # correlate() and negatives() read the globals assigned above.
        corr_df = correlate()
        corr_df['species'] = species
        corr_df['model'] = model
        all_results.append(corr_df)

        neg_df = negatives()
        neg_df['species'] = species
        neg_df['model'] = model
        neg_results.append(neg_df)
        

# #############################################################################
# FINAL PROCESSING
# #############################################################################

# Stack the per-(species, model) result frames into one long table each.
summary_df = pd.concat(all_results)

summary_neg_df = pd.concat(neg_results)

# Row order for the 'Group' level of each table (otherwise it would sort
# alphabetically).
custom_group_order = [
    'Train', 'Validation', 'Test', 'Val2', 'Val3', 'Test2', 'Test3', 'Train Cow+Pig', 'Val Cow+Pig', 'Test Cow+Pig'
]

custom_group_order_neg = [
    'Train neg', 'Val neg', 'Test neg', 'Val1 avg pred', 'Test1 avg pred', 'Train Cow+Pig Pred', 'Val Cow+Pig Pred', 'Test Cow+Pig Pred'
]

# Convert 'Group' to an ordered categorical so sorting follows the lists above.
summary_df['Group'] = pd.Categorical(summary_df['Group'], categories=custom_group_order, ordered=True)
summary_neg_df['Group'] = pd.Categorical(summary_neg_df['Group'], categories=custom_group_order_neg, ordered=True)


# Pivot so each model is a column. `observed=False` spells out the behaviour
# the call previously got from the (now deprecated) implicit default for
# categorical keys.
pivot_df = summary_df.pivot_table(
    index=['species', 'Group', 'Metric'],
    columns='model',
    values='Value',
    observed=False
)

pivot_neg_df = summary_neg_df.pivot_table(
    index=['species', 'Group', 'Metric'],
    columns='model',
    values='Value',
    observed=False
)

# Sort the index to maintain a logical order (uses the custom group order)
pivot_df = pivot_df.sort_index(level=['species', 'Group', 'Metric'])
pivot_neg_df = pivot_neg_df.sort_index(level=['species', 'Group', 'Metric'])

# Put the model columns in the order given by model_list.
pivot_df_reordered = pivot_df[model_list]
pivot_neg_df_reordered = pivot_neg_df[model_list]

# Display the final pivoted DataFrames
print("--- Final Pivoted Results ---")
display(pivot_df_reordered.style.format("{:.3f}"))
display(pivot_neg_df_reordered.style.format("{:.3f}"))

# Saving is disabled; uncomment the to_csv lines to write the tables.
output_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/log_model_eval_table_FINAL.tsv'
# pivot_df_reordered.to_csv(output_filename, sep='\t', float_format='%.3f')

output_neg_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/log_model_neg_table_FINAL.tsv'
# The negatives table goes to its own file so it cannot overwrite the
# correlation table.
# pivot_neg_df_reordered.to_csv(output_neg_filename, sep='\t', float_format='%.3f')

# print(f'Results successfully saved to: {output_filename} and {output_neg_filename}')

--- Final Pivoted Results ---


Unnamed: 0_level_0,Unnamed: 1_level_0,model,bdbi7l3n,7vsdq5k2,wnfdrgcc,8i7h7nsh,ph4wrpxu
species,Group,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
cow,Train,Pearson,0.405617,0.390403,0.387163,0.384434,0.3881
cow,Train,Spearman,0.401242,0.385022,0.380621,0.374787,0.380675
cow,Validation,Pearson,0.400304,0.407063,0.397384,0.386406,0.392952
cow,Validation,Spearman,0.389891,0.399477,0.389664,0.376793,0.387375
cow,Test,Pearson,0.385182,0.36688,0.362467,0.358449,0.368336
cow,Test,Spearman,0.391406,0.375057,0.368167,0.365179,0.378843
cow,Val2,Pearson,0.546808,0.510822,0.488816,0.478311,0.489886
cow,Val2,Spearman,0.539543,0.504295,0.473447,0.475269,0.48221
cow,Val3,Pearson,0.256095,0.267099,0.279355,0.272216,0.278065
cow,Val3,Spearman,0.263485,0.287832,0.29806,0.283412,0.300816


Unnamed: 0_level_0,Unnamed: 1_level_0,model,bdbi7l3n,7vsdq5k2,wnfdrgcc,8i7h7nsh,ph4wrpxu
species,Group,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
cow,Train neg,Avg Neg Prediction,0.62423,0.625045,0.682094,0.69773,0.698444
cow,Val neg,Avg Neg Prediction,0.634693,0.635603,0.69407,0.712532,0.710847
cow,Test neg,Avg Neg Prediction,0.619087,0.617863,0.672667,0.687981,0.688572
cow,Val1 avg pred,Avg Neg Prediction,0.626253,0.626998,0.680471,0.697044,0.689855
cow,Test1 avg pred,Avg Neg Prediction,0.596296,0.589748,0.640766,0.655376,0.661679
cow,Train Cow+Pig Pred,Avg Neg Prediction,0.658225,0.683852,0.72447,0.735462,0.737362
cow,Val Cow+Pig Pred,Avg Neg Prediction,0.681521,0.73734,0.770893,0.80747,0.766122
cow,Test Cow+Pig Pred,Avg Neg Prediction,0.677357,0.715497,0.744649,0.756681,0.747752
macaque,Train neg,Avg Neg Prediction,0.606251,0.600971,0.660338,0.678177,0.678299
macaque,Val neg,Avg Neg Prediction,0.609978,0.611235,0.666402,0.687101,0.688063


In [18]:
# 1 3 5 LOG MODELS
import pandas as pd
import scipy.stats

def correlate():
    """Pearson and Spearman correlation of predictions vs. truth per group.

    The ``pred_*`` and ``doubled_*`` inputs are taken from the notebook's
    global scope, where the main loading loop assigns them. The result is a
    long DataFrame with columns ``Group``, ``Metric`` and ``Value`` holding a
    'Pearson' row followed by a 'Spearman' row for each group.
    """
    group_names = ['Train', 'Validation', 'Test', 'Val2', 'Val3', 'Test2', 'Test3',
                   'Train Cow+Pig', 'Val Cow+Pig', 'Test Cow+Pig']
    predicted_frames = [pred_trainPos, pred_valPos, pred_testPos, pred_val2_df, pred_val3_df,
                        pred_test2_df, pred_test3_df, pred_train4_df, pred_val4_df, pred_test4_df]
    true_frames = [doubled_trainPos, doubled_valPos, doubled_testPos, doubled_val2_df, doubled_val3_df,
                   doubled_test2_df, doubled_test3_df, doubled_train4_df, doubled_val4_df, doubled_test4_df]

    records = []
    for idx, name in enumerate(group_names):
        truth = true_frames[idx].squeeze()
        estimate = predicted_frames[idx].squeeze()
        r_value, _ = scipy.stats.pearsonr(truth, estimate)
        rho_value, _ = scipy.stats.spearmanr(truth, estimate)
        records.append({'Group': name, 'Metric': 'Pearson', 'Value': r_value})
        records.append({'Group': name, 'Metric': 'Spearman', 'Value': rho_value})

    return pd.DataFrame(records)

def negatives():
    """Mean prediction of each negative / reference group as a long table.

    The ``pred_*`` frames come from the notebook's global scope; for each one
    the mean of its first column is recorded. Returns a DataFrame with columns
    ``Group``, ``Metric`` (always 'Avg Neg Prediction') and ``Value``.
    """
    frames_by_group = {
        'Train neg': pred_trainNeg,
        'Val neg': pred_valNeg,
        'Test neg': pred_testNeg,
        'Val1 avg pred': pred_val1_df,
        'Test1 avg pred': pred_test1_df,
        'Train Cow+Pig Pred': pred_train5_df,
        'Val Cow+Pig Pred': pred_val5_df,
        'Test Cow+Pig Pred': pred_test5_df,
    }
    # Compute every average up front, then assemble the rows.
    averages = {name: frame.mean().iloc[0] for name, frame in frames_by_group.items()}

    records = []
    for name, average in averages.items():
        records.append({'Group': name, 'Metric': 'Avg Neg Prediction', 'Value': average})

    return pd.DataFrame(records)


# --- Main Script ---
all_results = []
neg_results = []
species_list = ['macaque', 'rat', 'cow', 'pig']
model_list = ['bdbi7l3n', 'kf8188qf', 'cq45eb2s']

for species in species_list:
    for model in model_list:
        model_dir = f'{model}_FINAL'
        # This section loads all necessary data into variables in the main scope
        # so the correlate() function can access them.

        # load TRAIN DFs
        negTrainPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonRat_liver_andMacaque_andCow_andPig_TRAIN_500bp.bed'
        if species == 'macaque':
            negTrainPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonMacaque_liver_andRat_andCow_andPig_TRAIN_500bp.bed'
        elif species == 'cow':
            negTrainPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonCow_liver_andMacaque_andRat_andPig_TRAIN_500bp.bed'
        elif species == 'pig':
            negTrainPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonPig_liver_andMacaque_andRat_andCow_TRAIN_500bp.bed'
        
        pred_TRAIN = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TRAIN.csv', header=None)
        
        trainPos = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/logPos/{species}_liver_TRAINONLY.narrowPeak', header=None, delim_whitespace=True).iloc[:,4]
        trainNeg = pd.read_csv(negTrainPath, header=None, delim_whitespace=True).iloc[:,4]

        trainPos_len = 2*len(trainPos)
        trainNeg_len = 2*len(trainNeg)
        
        if len(pred_TRAIN) != trainPos_len+trainNeg_len:
            print(f"ERROR TRAIN ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_trainPos = pd.concat([trainPos, trainPos]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_trainNeg = pd.concat([trainNeg, trainNeg]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_trainPos = pred_TRAIN.head(trainPos_len)
        pred_trainNeg = pred_TRAIN.tail(trainNeg_len)

        #############################################################################

        negValPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonRat_liver_andMacaque_andCow_andPig_VAL_500bp.bed'
        if species == 'macaque':
            negValPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonMacaque_liver_andRat_andCow_andPig_VAL_500bp.bed'
        elif species == 'cow':
            negValPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonCow_liver_andMacaque_andRat_andPig_VAL_500bp.bed'
        elif species == 'pig':
            negValPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonPig_liver_andMacaque_andRat_andCow_VAL_500bp.bed'
        
        # load VAL ORTHO DFs
        pred_VAL_ortho = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_VAL_orthologs.csv', header=None)
        
        valPos = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/logPos/{species}_liver_VAL.narrowPeak', header=None, delim_whitespace=True).iloc[:,4]
        valNeg = pd.read_csv(negValPath, header=None, delim_whitespace=True).iloc[:,4]
        
        valPos_len = 2*len(valPos)
        valNeg_len = 2*len(valNeg)
        
        if len(pred_VAL_ortho) != valPos_len+valNeg_len:
            print(f"ERROR VALORTHO ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_valPos = pd.concat([valPos, valPos]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_valNeg = pd.concat([valNeg, valNeg]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_valPos = pred_VAL_ortho.head(valPos_len)
        pred_valNeg = pred_VAL_ortho.tail(valNeg_len)

        #############################################################################

        # load TEST ORTHO DFs
        pred_TEST_ortho = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TEST_orthologs.csv', header=None)
        
        testPos = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_pos_LiuAll/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        testNeg = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/neg/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        testPos_len = 2*len(testPos)
        testNeg_len = 2*len(testNeg)
        
        if len(pred_TEST_ortho) != testPos_len+testNeg_len:
            print(f"ERROR TEST ORTHO ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_testPos = pd.concat([testPos, testPos]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_testNeg = pd.concat([testNeg, testNeg]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_testPos = pred_TEST_ortho.head(testPos_len)
        pred_testNeg = pred_TEST_ortho.tail(testNeg_len)

        #############################################################################
        # load VAL DFs
        pred_VAL = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_VAL.csv', header=None)
        
        val1_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/val_splits/val1/{species}_liver_VAL_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        val2_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/log_val2/{species}_liver_VAL.narrowPeak', header=None, delim_whitespace=True).iloc[:,4]
        val3_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/log_val3/{species}_liver_VAL.narrowPeak', header=None, delim_whitespace=True).iloc[:,4]
        
        val1_len = 2*len(val1_df)
        val2_len = 2*len(val2_df)
        val3_len = 2*len(val3_df)
        
        if len(pred_VAL) != val1_len+val2_len+val3_len:
            print(f"ERROR VAL ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_val1_df = pd.concat([val1_df, val1_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_val2_df = pd.concat([val2_df, val2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_val3_df = pd.concat([val3_df, val3_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_val1_df = pred_VAL.head(val1_len)
        pred_val2_df = pred_VAL.iloc[val1_len:val1_len + val2_len]
        pred_val3_df = pred_VAL.tail(val3_len)

        #############################################################################
        # load TEST DFs
        pred_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TEST.csv', header=None)
        
        test1_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_LiuAll_test1/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test2_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test2/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test3_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test3/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        test1_len = 2*len(test1_df)
        test2_len = 2*len(test2_df)
        test3_len = 2*len(test3_df)
        
        if len(pred_TEST) != test1_len+test2_len+test3_len:
            print(f"ERROR TEST ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_test1_df = pd.concat([test1_df, test1_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test2_df = pd.concat([test2_df, test2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test3_df = pd.concat([test3_df, test3_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_test1_df = pred_TEST.head(test1_len)
        pred_test2_df = pred_TEST.iloc[test1_len:test1_len + test2_len]
        pred_test3_df = pred_TEST.tail(test3_len)

        #############################################################################
        # load cow+pig TRAIN DFs
        pred_cow_pig_TRAIN = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_cow_pig_TRAIN.csv', header=None)
        
        train4Pos_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/cow_pig/cow_pig_liver_pos_mouse_macaque_rat_closed_TRAIN_chromName_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        train5Neg_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/cow_pig/cow_pig_liver_neg_mouse_macaque_rat_open_TRAIN_chromName_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        train4_len = 2*len(train4Pos_df)
        train5_len = 2*len(train5Neg_df)
        
        if len(pred_cow_pig_TRAIN) != train4_len+train5_len:
            print(f"ERROR TEST ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_train4_df = pd.concat([train4Pos_df, train4Pos_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_train5_df = pd.concat([train5Neg_df, train5Neg_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_train4_df = pred_cow_pig_TRAIN.head(train4_len)
        pred_train5_df = pred_cow_pig_TRAIN.tail(train5_len)

        #############################################################################
        # load cow+pig VAL DFs
        pred_cow_pig_VAL = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_cow_pig_VAL.csv', header=None)
        
        val4Pos_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/cow_pig/cow_pig_liver_pos_mouse_macaque_rat_closed_VAL_chromName_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        val5Neg_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/cow_pig/cow_pig_liver_neg_mouse_macaque_rat_open_VAL_chromName_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        val4_len = 2*len(val4Pos_df)
        val5_len = 2*len(val5Neg_df)
        
        if len(pred_cow_pig_VAL) != val4_len+val5_len:
            print(f"ERROR TEST ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_val4_df = pd.concat([val4Pos_df, val4Pos_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_val5_df = pd.concat([val5Neg_df, val5Neg_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_val4_df = pred_cow_pig_VAL.head(val4_len)
        pred_val5_df = pred_cow_pig_VAL.tail(val5_len)
        
        #############################################################################

        # load TEST DFs
        pred_cow_pig_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_cow_pig_TEST.csv', header=None)
        
        test4Pos_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test4/cow_pig_liver_pos_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test5Neg_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test5/cow_pig_liver_neg_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        test4_len = 2*len(test4Pos_df)
        test5_len = 2*len(test5Neg_df)
        
        if len(pred_cow_pig_TEST) != test4_len+test5_len:
            print(f"ERROR TEST ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_test4_df = pd.concat([test4Pos_df, test4Pos_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test5_df = pd.concat([test5Neg_df, test5Neg_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_test4_df = pred_cow_pig_TEST.head(test4_len)
        pred_test5_df = pred_cow_pig_TEST.tail(test5_len)
        
        # Call the correlate function which now uses the globally available DFs
        corr_df = correlate()
        corr_df['species'] = species
        corr_df['model'] = model
        all_results.append(corr_df)

        neg_df = negatives()
        neg_df['species'] = species
        neg_df['model'] = model
        neg_results.append(neg_df)

# #############################################################################
# FINAL PROCESSING
# #############################################################################

# Stack the per-(species, model) frames collected by the loop above into two
# long tables: correlation metrics and average predictions.
summary_df = pd.concat(all_results)

summary_neg_df = pd.concat(neg_results)

# Row order of the 'Group' level in each table. A 'Group' value that is missing
# from its list becomes NaN when converted to Categorical below.
custom_group_order = [
    'Train', 'Validation', 'Test', 'Val2', 'Val3', 'Test2', 'Test3', 'Train Cow+Pig', 'Val Cow+Pig', 'Test Cow+Pig'
]

custom_group_order_neg = [
    'Train neg', 'Val neg', 'Test neg', 'Val1 avg pred', 'Test1 avg pred', 'Train Cow+Pig Pred', 'Val Cow+Pig Pred', 'Test Cow+Pig Pred'
]

# Convert 'Group' to an ordered categorical so sorting follows the lists above
# instead of alphabetical order.
summary_df['Group'] = pd.Categorical(summary_df['Group'], categories=custom_group_order, ordered=True)
summary_neg_df['Group'] = pd.Categorical(summary_neg_df['Group'], categories=custom_group_order_neg, ordered=True)


# Pivot so each model is a column. observed=False spells out the historical
# default for categorical groupers, which newer pandas versions warn about.
pivot_df = summary_df.pivot_table(
    index=['species', 'Group', 'Metric'],
    columns='model',
    values='Value',
    observed=False
)

pivot_neg_df = summary_neg_df.pivot_table(
    index=['species', 'Group', 'Metric'],
    columns='model',
    values='Value',
    observed=False
)

# Sort the index to maintain a logical order (uses the custom group order)
pivot_df = pivot_df.sort_index(level=['species', 'Group', 'Metric'])
pivot_neg_df = pivot_neg_df.sort_index(level=['species', 'Group', 'Metric'])

# Put the model columns in the same order as model_list.
pivot_df_reordered = pivot_df[model_list]
pivot_neg_df_reordered = pivot_neg_df[model_list]

# Display the final pivoted DataFrames
print("--- Final Pivoted Results ---")
display(pivot_df_reordered.style.format("{:.3f}"))
display(pivot_neg_df_reordered.style.format("{:.3f}"))

output_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/135log_model_eval_table_FINAL.tsv'
pivot_df_reordered.to_csv(output_filename, sep='\t', float_format='%.3f')

# Write the negatives table to its own file; it previously went to
# output_filename and overwrote the correlation table saved just above.
output_neg_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/135log_model_neg_table_FINAL.tsv'
pivot_neg_df_reordered.to_csv(output_neg_filename, sep='\t', float_format='%.3f')

# print(f'Results successfully saved to: {output_filename}')

--- Final Pivoted Results ---


Unnamed: 0_level_0,Unnamed: 1_level_0,model,bdbi7l3n,kf8188qf,cq45eb2s
species,Group,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cow,Train,Pearson,0.406,0.429,0.518
cow,Train,Spearman,0.401,0.429,0.518
cow,Validation,Pearson,0.4,0.439,0.506
cow,Validation,Spearman,0.39,0.443,0.504
cow,Test,Pearson,0.385,0.42,0.475
cow,Test,Spearman,0.391,0.432,0.485
cow,Val2,Pearson,0.547,0.546,0.618
cow,Val2,Spearman,0.54,0.542,0.59
cow,Val3,Pearson,0.256,0.362,0.423
cow,Val3,Spearman,0.263,0.363,0.418


Unnamed: 0_level_0,Unnamed: 1_level_0,model,bdbi7l3n,kf8188qf,cq45eb2s
species,Group,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cow,Train neg,Avg Neg Prediction,0.624,0.498,0.422
cow,Val neg,Avg Neg Prediction,0.635,0.52,0.48
cow,Test neg,Avg Neg Prediction,0.619,0.482,0.46
cow,Val1 avg pred,Avg Neg Prediction,0.626,0.472,0.427
cow,Test1 avg pred,Avg Neg Prediction,0.596,0.416,0.393
cow,Train Cow+Pig Pred,Avg Neg Prediction,0.658,0.596,0.472
cow,Val Cow+Pig Pred,Avg Neg Prediction,0.682,0.685,0.576
cow,Test Cow+Pig Pred,Avg Neg Prediction,0.677,0.626,0.59
macaque,Train neg,Avg Neg Prediction,0.606,0.373,0.379
macaque,Val neg,Avg Neg Prediction,0.61,0.428,0.412


In [7]:
# 2KB MODEL
import pandas as pd
import scipy.stats

def correlate():
    """Correlate true and predicted values for the four test groups.

    Reads the notebook-level globals ``doubled_testPos``/``pred_testPos``,
    ``doubled_test2_df``/``pred_test2_df``, ``doubled_test3_df``/``pred_test3_df``
    and ``doubled_test4_df``/``pred_test4_df``; they must be assigned before
    this is called.

    Returns:
        pd.DataFrame with columns 'Group', 'Metric', 'Value': a 'Pearson' row
        followed by a 'Spearman' row for each group, in the order
        'Test', 'Test2', 'Test3', 'Test Cow+Pig'.
    """
    # (label, true values, predicted values) for every group to score.
    labelled_pairs = (
        ('Test', doubled_testPos, pred_testPos),
        ('Test2', doubled_test2_df, pred_test2_df),
        ('Test3', doubled_test3_df, pred_test3_df),
        ('Test Cow+Pig', doubled_test4_df, pred_test4_df),
    )

    records = []
    for label, truth, predicted in labelled_pairs:
        # squeeze() collapses a single-column frame into a 1-D Series.
        truth_1d = truth.squeeze()
        predicted_1d = predicted.squeeze()

        # Only the coefficients are kept; the p-values are discarded.
        pearson_r, _p = scipy.stats.pearsonr(truth_1d, predicted_1d)
        spearman_rho, _p = scipy.stats.spearmanr(truth_1d, predicted_1d)

        records.extend([
            {'Group': label, 'Metric': 'Pearson', 'Value': pearson_r},
            {'Group': label, 'Metric': 'Spearman', 'Value': spearman_rho},
        ])

    return pd.DataFrame(records)

def negatives():
    """Report the mean of the first column of three prediction frames.

    Reads the notebook-level globals ``pred_testNeg``, ``pred_test1_df`` and
    ``pred_test5_df``; they must be assigned before this is called.

    Returns:
        pd.DataFrame with columns 'Group', 'Metric', 'Value', one row per
        frame. Every row carries the metric name 'Avg Neg Prediction',
        including the 'Test1 avg pred' row.
    """
    labelled_frames = [
        ('Test neg', pred_testNeg),
        ('Test1 avg pred', pred_test1_df),
        ('Test Cow+Pig Pred', pred_test5_df),
    ]

    # DataFrame.mean() gives one mean per column; .iloc[0] keeps the first column's.
    return pd.DataFrame([
        {'Group': label, 'Metric': 'Avg Neg Prediction', 'Value': frame.mean().iloc[0]}
        for label, frame in labelled_frames
    ])

# --- Main Script ---
# Scores every (species, model) pair on the 2 kb test splits, then pivots the
# collected metrics into one table per result type (one column per model).
DATA_ROOT = '/home/azstephe/liverRegression/regression_liver/data'
SPLITS_2KB = f'{DATA_ROOT}/test_splits_2kb'


def _read_bed_signal(path):
    """Return column index 4 of a headerless, whitespace-delimited BED file as a Series."""
    # sep=r'\s+' is the documented replacement for delim_whitespace=True,
    # which pandas deprecated in 2.2.
    return pd.read_csv(path, header=None, sep=r'\s+').iloc[:, 4]


def _repeat_each_twice(values):
    """Return `values` with every entry duplicated in place (a, a, b, b, ...) on a fresh 0..n-1 index."""
    return values.repeat(2).reset_index(drop=True)


all_results = []
neg_results = []
species_list = ['macaque', 'rat', 'cow', 'pig']
model_list = ['im88hepv']

# The cow+pig truth files depend on neither species nor model, so they are read
# once here instead of on every loop iteration.
test4Pos_df = _read_bed_signal(f'{SPLITS_2KB}/log_test4/cow_pig_liver_pos_TEST_500bp.bed')
test5Neg_df = _read_bed_signal(f'{SPLITS_2KB}/log_test5/cow_pig_liver_neg_TEST_500bp.bed')

# Each prediction file is expected to hold two rows per truth row.
# NOTE(review): presumably two predictions per region (e.g. both strands) - confirm.
test4_len = 2*len(test4Pos_df)
test5_len = 2*len(test5Neg_df)

doubled_test4_df = _repeat_each_twice(test4Pos_df)
doubled_test5_df = _repeat_each_twice(test5Neg_df)

for species in species_list:
    for model in model_list:
        model_dir = f'{model}_FINAL'
        outputs_dir = f'{DATA_ROOT}/model_outputs/{model_dir}'

        #############################################################################
        # load TEST ORTHO DFs
        pred_TEST_ortho = pd.read_csv(f'{outputs_dir}/activations_{species}_TEST_orthologs.csv', header=None)

        testPos = _read_bed_signal(f'{SPLITS_2KB}/log_pos_LiuAll/{species}_liver_TEST_500bp.bed')
        testNeg = _read_bed_signal(f'{SPLITS_2KB}/neg/{species}_liver_TEST_500bp.bed')

        testPos_len = 2*len(testPos)
        testNeg_len = 2*len(testNeg)

        # A mismatch is only reported; the slices below are still taken.
        if len(pred_TEST_ortho) != testPos_len+testNeg_len:
            print(f"ERROR TEST ORTHO ({species}, {model}): predictions are a different length than test sets")

        doubled_testPos = _repeat_each_twice(testPos)
        doubled_testNeg = _repeat_each_twice(testNeg)

        # Predictions are split positionally: positives first, negatives last.
        pred_testPos = pred_TEST_ortho.head(testPos_len)
        pred_testNeg = pred_TEST_ortho.tail(testNeg_len)

        #############################################################################
        # load TEST DFs
        pred_TEST = pd.read_csv(f'{outputs_dir}/activations_{species}_TEST.csv', header=None)

        test1_df = _read_bed_signal(f'{SPLITS_2KB}/log_LiuAll_test1/{species}_liver_TEST_500bp.bed')
        test2_df = _read_bed_signal(f'{SPLITS_2KB}/log_test2/{species}_liver_TEST_500bp.bed')
        test3_df = _read_bed_signal(f'{SPLITS_2KB}/log_test3/{species}_liver_TEST_500bp.bed')

        test1_len = 2*len(test1_df)
        test2_len = 2*len(test2_df)
        test3_len = 2*len(test3_df)

        if len(pred_TEST) != test1_len+test2_len+test3_len:
            print(f"ERROR TEST ({species}, {model}): predictions are a different length than test sets")

        doubled_test1_df = _repeat_each_twice(test1_df)
        doubled_test2_df = _repeat_each_twice(test2_df)
        doubled_test3_df = _repeat_each_twice(test3_df)

        # Positional split in file order: test1, then test2, then test3.
        pred_test1_df = pred_TEST.head(test1_len)
        pred_test2_df = pred_TEST.iloc[test1_len:test1_len + test2_len]
        pred_test3_df = pred_TEST.tail(test3_len)

        #############################################################################
        # load cow+pig TEST predictions (truth files were read before the loop)
        pred_cow_pig_TEST = pd.read_csv(f'{outputs_dir}/activations_cow_pig_TEST.csv', header=None)

        if len(pred_cow_pig_TEST) != test4_len+test5_len:
            print(f"ERROR COW+PIG TEST ({species}, {model}): predictions are a different length than test sets")

        pred_test4_df = pred_cow_pig_TEST.head(test4_len)
        pred_test5_df = pred_cow_pig_TEST.tail(test5_len)

        # correlate() and negatives() read the frames assigned above as globals.
        corr_df = correlate()
        corr_df['species'] = species
        corr_df['model'] = model
        all_results.append(corr_df)

        neg_df = negatives()
        neg_df['species'] = species
        neg_df['model'] = model
        neg_results.append(neg_df)

# #############################################################################
# FINAL PROCESSING
# #############################################################################

summary_df = pd.concat(all_results)

summary_neg_df = pd.concat(neg_results)

# Row order of the 'Group' level in each of the two tables.
custom_group_order = [
    'Test', 'Test2', 'Test3','Test Cow+Pig'
]

custom_group_order_neg = [
    'Test neg', 'Test1 avg pred','Test Cow+Pig Pred'
]

# Convert 'Group' to an ordered categorical so sorting follows the lists above.
summary_df['Group'] = pd.Categorical(summary_df['Group'], categories=custom_group_order, ordered=True)
summary_neg_df['Group'] = pd.Categorical(summary_neg_df['Group'], categories=custom_group_order_neg, ordered=True)


# Pivot so each model is a column. observed=False spells out the historical
# default for categorical groupers, which newer pandas versions warn about.
pivot_df = summary_df.pivot_table(
    index=['species', 'Group', 'Metric'],
    columns='model',
    values='Value',
    observed=False
)

pivot_neg_df = summary_neg_df.pivot_table(
    index=['species', 'Group', 'Metric'],
    columns='model',
    values='Value',
    observed=False
)

# Sort the index to maintain a logical order (uses the custom group order)
pivot_df = pivot_df.sort_index(level=['species', 'Group', 'Metric'])
pivot_neg_df = pivot_neg_df.sort_index(level=['species', 'Group', 'Metric'])

# Put the model columns in the same order as model_list.
pivot_df_2kb = pivot_df[model_list]
pivot_neg_df_2kb = pivot_neg_df[model_list]

# Display the final pivoted DataFrames
print("--- Final Pivoted Results ---")
display(pivot_df_2kb.style.format("{:.3f}"))
display(pivot_neg_df_2kb.style.format("{:.3f}"))

--- Final Pivoted Results ---


Unnamed: 0_level_0,Unnamed: 1_level_0,model,im88hepv
species,Group,Metric,Unnamed: 3_level_1
cow,Test,Pearson,0.422
cow,Test,Spearman,0.43
cow,Test2,Pearson,0.37
cow,Test2,Spearman,0.38
cow,Test3,Pearson,0.371
cow,Test3,Spearman,0.37
cow,Test Cow+Pig,Pearson,0.459
cow,Test Cow+Pig,Spearman,0.484
macaque,Test,Pearson,0.348
macaque,Test,Spearman,0.359


Unnamed: 0_level_0,Unnamed: 1_level_0,model,im88hepv
species,Group,Metric,Unnamed: 3_level_1
cow,Test neg,Avg Neg Prediction,0.658
cow,Test1 avg pred,Avg Neg Prediction,0.642
cow,Test Cow+Pig Pred,Avg Neg Prediction,0.708
macaque,Test neg,Avg Neg Prediction,0.648
macaque,Test1 avg pred,Avg Neg Prediction,0.638
macaque,Test Cow+Pig Pred,Avg Neg Prediction,0.708
pig,Test neg,Avg Neg Prediction,0.658
pig,Test1 avg pred,Avg Neg Prediction,0.638
pig,Test Cow+Pig Pred,Avg Neg Prediction,0.708
rat,Test neg,Avg Neg Prediction,0.675


In [8]:
# EQN MODEL
import pandas as pd
import scipy.stats

def correlate():
    """Build a long table of Pearson and Spearman coefficients per test group.

    Uses the notebook-level globals ``doubled_testPos``, ``doubled_test2_df``,
    ``doubled_test3_df``, ``doubled_test4_df`` (true values) and
    ``pred_testPos``, ``pred_test2_df``, ``pred_test3_df``, ``pred_test4_df``
    (predictions), which must be assigned before the call.

    Returns:
        pd.DataFrame with columns 'Group', 'Metric', 'Value'; two rows
        ('Pearson', then 'Spearman') for each of 'Test', 'Test2', 'Test3',
        'Test Cow+Pig', in that order.
    """
    # Group label -> (true values, predicted values); dict order sets row order.
    evaluation_sets = {
        'Test': (doubled_testPos, pred_testPos),
        'Test2': (doubled_test2_df, pred_test2_df),
        'Test3': (doubled_test3_df, pred_test3_df),
        'Test Cow+Pig': (doubled_test4_df, pred_test4_df),
    }

    def _metric_rows(label, observed, predicted):
        # Keep only the coefficient from each (statistic, p-value) result.
        pearson_r, _ = scipy.stats.pearsonr(observed, predicted)
        spearman_rho, _ = scipy.stats.spearmanr(observed, predicted)
        return [
            {'Group': label, 'Metric': 'Pearson', 'Value': pearson_r},
            {'Group': label, 'Metric': 'Spearman', 'Value': spearman_rho},
        ]

    table_rows = []
    for label, (truth, prediction) in evaluation_sets.items():
        # squeeze() turns a single-column frame into a 1-D Series.
        table_rows += _metric_rows(label, truth.squeeze(), prediction.squeeze())

    return pd.DataFrame(table_rows)

def negatives():
    """Summarise three prediction frames by the mean of their first column.

    Uses the notebook-level globals ``pred_testNeg``, ``pred_test1_df`` and
    ``pred_test5_df``, which must be assigned before the call.

    Returns:
        pd.DataFrame with columns 'Group', 'Metric', 'Value' and one row per
        frame, labelled 'Test neg', 'Test1 avg pred' and 'Test Cow+Pig Pred'.
        The metric name is 'Avg Neg Prediction' on every row.
    """
    def _average_row(label, predictions):
        # .mean() is column-wise; .iloc[0] selects the first column's mean.
        return {
            'Group': label,
            'Metric': 'Avg Neg Prediction',
            'Value': predictions.mean().iloc[0],
        }

    summary_rows = [
        _average_row('Test neg', pred_testNeg),
        _average_row('Test1 avg pred', pred_test1_df),
        _average_row('Test Cow+Pig Pred', pred_test5_df),
    ]
    return pd.DataFrame(summary_rows)

# --- Main Script ---
# Scores every (species, model) pair for the EQN models, then pivots the
# collected metrics into one table per result type (one column per model).
DATA_ROOT = '/home/azstephe/liverRegression/regression_liver/data'
SPLITS_EQN = f'{DATA_ROOT}/test_splits_eqn'
# NOTE(review): the neg, test1 and test5 truth files are read from the plain
# `test_splits` tree rather than `test_splits_eqn` - presumably intentional; confirm.
SPLITS_SHARED = f'{DATA_ROOT}/test_splits'


def _read_bed_signal(path):
    """Return column index 4 of a headerless, whitespace-delimited BED file as a Series."""
    # sep=r'\s+' is the documented replacement for delim_whitespace=True,
    # which pandas deprecated in 2.2.
    return pd.read_csv(path, header=None, sep=r'\s+').iloc[:, 4]


def _repeat_each_twice(values):
    """Return `values` with every entry duplicated in place (a, a, b, b, ...) on a fresh 0..n-1 index."""
    return values.repeat(2).reset_index(drop=True)


all_results = []
neg_results = []
species_list = ['macaque', 'rat', 'cow', 'pig']
model_list = ['mcf297qb', '7l12zan1']

# The cow+pig truth files depend on neither species nor model, so they are read
# once here instead of on every loop iteration.
test4Pos_df = _read_bed_signal(f'{SPLITS_EQN}/log_test4/cow_pig_liver_pos_TEST_500bp.bed')
test5Neg_df = _read_bed_signal(f'{SPLITS_SHARED}/log_test5/cow_pig_liver_neg_TEST_500bp.bed')

# Each prediction file is expected to hold two rows per truth row.
# NOTE(review): presumably two predictions per region (e.g. both strands) - confirm.
test4_len = 2*len(test4Pos_df)
test5_len = 2*len(test5Neg_df)

doubled_test4_df = _repeat_each_twice(test4Pos_df)
doubled_test5_df = _repeat_each_twice(test5Neg_df)

for species in species_list:
    for model in model_list:
        model_dir = f'{model}_FINAL'
        outputs_dir = f'{DATA_ROOT}/model_outputs/{model_dir}'

        #############################################################################
        # load TEST ORTHO DFs
        pred_TEST_ortho = pd.read_csv(f'{outputs_dir}/activations_{species}_TEST_orthologs.csv', header=None)

        testPos = _read_bed_signal(f'{SPLITS_EQN}/log_pos_LiuAll/{species}_liver_TEST_500bp.bed')
        testNeg = _read_bed_signal(f'{SPLITS_SHARED}/neg/{species}_liver_TEST_500bp.bed')

        testPos_len = 2*len(testPos)
        testNeg_len = 2*len(testNeg)

        # A mismatch is only reported; the slices below are still taken.
        if len(pred_TEST_ortho) != testPos_len+testNeg_len:
            print(f"ERROR TEST ORTHO ({species}, {model}): predictions are a different length than test sets")

        doubled_testPos = _repeat_each_twice(testPos)
        doubled_testNeg = _repeat_each_twice(testNeg)

        # Predictions are split positionally: positives first, negatives last.
        pred_testPos = pred_TEST_ortho.head(testPos_len)
        pred_testNeg = pred_TEST_ortho.tail(testNeg_len)

        #############################################################################
        # load TEST DFs
        pred_TEST = pd.read_csv(f'{outputs_dir}/activations_{species}_TEST.csv', header=None)

        test1_df = _read_bed_signal(f'{SPLITS_SHARED}/log_LiuAll_test1/{species}_liver_TEST_500bp.bed')
        test2_df = _read_bed_signal(f'{SPLITS_EQN}/log_test2/{species}_liver_TEST_500bp.bed')
        test3_df = _read_bed_signal(f'{SPLITS_EQN}/log_test3/{species}_liver_TEST_500bp.bed')

        test1_len = 2*len(test1_df)
        test2_len = 2*len(test2_df)
        test3_len = 2*len(test3_df)

        if len(pred_TEST) != test1_len+test2_len+test3_len:
            print(f"ERROR TEST ({species}, {model}): predictions are a different length than test sets")

        doubled_test1_df = _repeat_each_twice(test1_df)
        doubled_test2_df = _repeat_each_twice(test2_df)
        doubled_test3_df = _repeat_each_twice(test3_df)

        # Positional split in file order: test1, then test2, then test3.
        pred_test1_df = pred_TEST.head(test1_len)
        pred_test2_df = pred_TEST.iloc[test1_len:test1_len + test2_len]
        pred_test3_df = pred_TEST.tail(test3_len)

        #############################################################################
        # load cow+pig TEST predictions (truth files were read before the loop)
        pred_cow_pig_TEST = pd.read_csv(f'{outputs_dir}/activations_cow_pig_TEST.csv', header=None)

        if len(pred_cow_pig_TEST) != test4_len+test5_len:
            print(f"ERROR COW+PIG TEST ({species}, {model}): predictions are a different length than test sets")

        pred_test4_df = pred_cow_pig_TEST.head(test4_len)
        pred_test5_df = pred_cow_pig_TEST.tail(test5_len)

        # correlate() and negatives() read the frames assigned above as globals.
        corr_df = correlate()
        corr_df['species'] = species
        corr_df['model'] = model
        all_results.append(corr_df)

        neg_df = negatives()
        neg_df['species'] = species
        neg_df['model'] = model
        neg_results.append(neg_df)

# #############################################################################
# FINAL PROCESSING
# #############################################################################

summary_df = pd.concat(all_results)

summary_neg_df = pd.concat(neg_results)

# Row order of the 'Group' level in each of the two tables.
custom_group_order = [
    'Test', 'Test2', 'Test3','Test Cow+Pig'
]

custom_group_order_neg = [
    'Test neg', 'Test1 avg pred','Test Cow+Pig Pred'
]

# Convert 'Group' to an ordered categorical so sorting follows the lists above.
summary_df['Group'] = pd.Categorical(summary_df['Group'], categories=custom_group_order, ordered=True)
summary_neg_df['Group'] = pd.Categorical(summary_neg_df['Group'], categories=custom_group_order_neg, ordered=True)


# Pivot so each model is a column. observed=False spells out the historical
# default for categorical groupers, which newer pandas versions warn about.
pivot_df = summary_df.pivot_table(
    index=['species', 'Group', 'Metric'],
    columns='model',
    values='Value',
    observed=False
)

pivot_neg_df = summary_neg_df.pivot_table(
    index=['species', 'Group', 'Metric'],
    columns='model',
    values='Value',
    observed=False
)

# Sort the index to maintain a logical order (uses the custom group order)
pivot_df = pivot_df.sort_index(level=['species', 'Group', 'Metric'])
pivot_neg_df = pivot_neg_df.sort_index(level=['species', 'Group', 'Metric'])

# Put the model columns in the same order as model_list.
pivot_df_eqn = pivot_df[model_list]
pivot_neg_df_eqn = pivot_neg_df[model_list]

# Display the final pivoted DataFrames
print("--- Final Pivoted Results ---")
display(pivot_df_eqn.style.format("{:.3f}"))
display(pivot_neg_df_eqn.style.format("{:.3f}"))

--- Final Pivoted Results ---


Unnamed: 0_level_0,Unnamed: 1_level_0,model,mcf297qb,7l12zan1
species,Group,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1
cow,Test,Pearson,0.355,0.375
cow,Test,Spearman,0.366,0.383
cow,Test2,Pearson,0.36,0.383
cow,Test2,Spearman,0.369,0.393
cow,Test3,Pearson,0.283,0.298
cow,Test3,Spearman,0.284,0.299
cow,Test Cow+Pig,Pearson,0.422,0.417
cow,Test Cow+Pig,Spearman,0.455,0.449
macaque,Test,Pearson,0.371,0.34
macaque,Test,Spearman,0.372,0.345


Unnamed: 0_level_0,Unnamed: 1_level_0,model,mcf297qb,7l12zan1
species,Group,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1
cow,Test neg,Avg Neg Prediction,0.568,0.476
cow,Test1 avg pred,Avg Neg Prediction,0.52,0.453
cow,Test Cow+Pig Pred,Avg Neg Prediction,0.717,0.625
macaque,Test neg,Avg Neg Prediction,0.542,0.447
macaque,Test1 avg pred,Avg Neg Prediction,0.515,0.433
macaque,Test Cow+Pig Pred,Avg Neg Prediction,0.717,0.625
pig,Test neg,Avg Neg Prediction,0.594,0.477
pig,Test1 avg pred,Avg Neg Prediction,0.538,0.447
pig,Test Cow+Pig Pred,Avg Neg Prediction,0.717,0.625
rat,Test neg,Avg Neg Prediction,0.676,0.525
