In [13]:
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
import numpy as np

def pearson_spearman(x, y):
    pearson_corr, pearson_p_value = scipy.stats.pearsonr(x, y)
    print(f"Pearson correlation coefficient: {pearson_corr:.4f}, p-value: {pearson_p_value:.4g}")

    spearman_corr, spearman_p_value = scipy.stats.spearmanr(x, y)
    print(f"Spearman correlation coefficient: {spearman_corr:.4f}, p-value: {spearman_p_value:.4g}")

species_list = ['macaque', 'rat', 'cow', 'pig']

def mean_squared_error(x, y):
    x = np.asarray(x)
    y = np.asarray(y)
    return np.mean((x - y) ** 2)

def format_value(metric_name, value):
    """Format values depending on whether it's a P-value metric or not."""
    # Check for '_p' which is more general for 'pearson_p', 'spearman_p' etc.
    if "_p" in metric_name:
        return f"{value:.2e}"  # Scientific notation for p-values
    else:
        return f"{value:.3g}"  # General format for other metrics

mhc = 200
        
        pearson, pearson_p = scipy.stats.pearsonr(x, y)
        spearman, spearman_p = scipy.stats.spearmanr(x, y)
        mse = mean_squared_error(x, y)
        pearson_p *= mhc
        spearman_p *= mhc
        rows.append({'Group': group, 'Metric': 'Pearson', 'Value': pearson})
        rows.append({'Group': group, 'Metric': 'Pearson_p', 'Value': pearson_p})
        rows.append({'Group': group, 'Metric': 'Spearman', 'Value': spearman})
        rows.append({'Group': group, 'Metric': 'Spearman_p', 'Value': spearman_p})
        rows.append({'Group': group, 'Metric': 'MSE', 'Value': mse})

metric_vars = ['Pearson', 'Pearson_p', 'Spearman', 'Spearman_p', 'MSE']
pivot_df_reordered = pivot_df_reordered.reindex(metric_vars, level='Metric')

# Loop through each model's column to apply the formatting
for col in pivot_df_reordered.columns:
    pivot_df_reordered[col] = pivot_df_reordered.apply(
        # Access the 'metric' from the index using row.name[2]
        # (assuming it's the 3rd level of your index)
        lambda row: format_value(row.name[2], row[col]),
        axis=1)

display(pivot_df_reordered)

In [19]:
# 5 BEST LOG MODELS
mhc = 200
# FUNCTIONS SPECIFIC TO THIS SET
def correlate():
    rows = []
    # Lists for correlation calculations
    groups = ['Train', 'Validation', 'Test', 'Val2', 'Val3', 'Test2', 'Test3', 'Train Cow+Pig', 'Val Cow+Pig', 'Test Cow+Pig']
    preds = [pred_trainPos, pred_valPos, pred_testPos, pred_val2_df, pred_val3_df, pred_test2_df, pred_test3_df, pred_train4_df, pred_val4_df, pred_test4_df]
    trues = [doubled_trainPos, doubled_valPos, doubled_testPos, doubled_val2_df, doubled_val3_df, doubled_test2_df, doubled_test3_df, doubled_train4_df, doubled_val4_df, doubled_test4_df]

    # Calculate correlations
    for group, pred_df, true_df in zip(groups, preds, trues):
        x = true_df.squeeze()
        y = pred_df.squeeze()
        pearson, pearson_p = scipy.stats.pearsonr(x, y)
        spearman, spearman_p = scipy.stats.spearmanr(x, y)
        mse = mean_squared_error(x, y)
        pearson_p *= mhc
        spearman_p *= mhc
        rows.append({'Group': group, 'Metric': 'Pearson', 'Value': pearson})
        rows.append({'Group': group, 'Metric': 'Pearson_p', 'Value': pearson_p})
        rows.append({'Group': group, 'Metric': 'Spearman', 'Value': spearman})
        rows.append({'Group': group, 'Metric': 'Spearman_p', 'Value': spearman_p})
        rows.append({'Group': group, 'Metric': 'MSE', 'Value': mse})
          
    return pd.DataFrame(rows)

def negatives():
    rows = []
     # Lists for negative average calculations
    negGroup = ['Train neg', 'Val neg', 'Test neg', 'Val1 avg pred', 'Test1 avg pred', 'Train Cow+Pig Pred', 'Val Cow+Pig Pred', 'Test Cow+Pig Pred']
    negValues = [pred_trainNeg.mean().iloc[0], pred_valNeg.mean().iloc[0], pred_testNeg.mean().iloc[0], pred_val1_df.mean().iloc[0], pred_test1_df.mean().iloc[0], pred_train5_df.mean().iloc[0], pred_val5_df.mean().iloc[0], pred_test5_df.mean().iloc[0]]
    
    # Add negative value averages
    for group, negv in zip(negGroup, negValues):
        rows.append({'Group': group, 'Metric': 'Avg Neg Prediction', 'Value': negv})

    return pd.DataFrame(rows)

# MAIN SCRIPT
all_results = []
neg_results = []
model_list = ['bdbi7l3n', '7vsdq5k2', 'wnfdrgcc', '8i7h7nsh', 'ph4wrpxu']

for species in species_list:
    for model in model_list:
        model_dir = f'{model}_FINAL'
        
        # Load and process all dfs so correlate() function can access them
        
        #############################################################################
        # load TRAIN DFs
        negTrainPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonRat_liver_andMacaque_andCow_andPig_TRAIN_500bp.bed'
        if species == 'macaque':
            negTrainPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonMacaque_liver_andRat_andCow_andPig_TRAIN_500bp.bed'
        elif species == 'cow':
            negTrainPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonCow_liver_andMacaque_andRat_andPig_TRAIN_500bp.bed'
        elif species == 'pig':
            negTrainPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonPig_liver_andMacaque_andRat_andCow_TRAIN_500bp.bed'
        
        pred_TRAIN = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TRAIN.csv', header=None)
        
        trainPos = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/logPos/{species}_liver_TRAINONLY.narrowPeak', header=None, delim_whitespace=True).iloc[:,4]
        trainNeg = pd.read_csv(negTrainPath, header=None, delim_whitespace=True).iloc[:,4]

        trainPos_len = 2*len(trainPos)
        trainNeg_len = 2*len(trainNeg)
        
        if len(pred_TRAIN) != trainPos_len+trainNeg_len:
            print(f"ERROR TRAIN ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_trainPos = pd.concat([trainPos, trainPos]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_trainNeg = pd.concat([trainNeg, trainNeg]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_trainPos = pred_TRAIN.head(trainPos_len)
        pred_trainNeg = pred_TRAIN.tail(trainNeg_len)

        #############################################################################
        # load VALIDATION DFs

        negValPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonRat_liver_andMacaque_andCow_andPig_VAL_500bp.bed'
        if species == 'macaque':
            negValPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonMacaque_liver_andRat_andCow_andPig_VAL_500bp.bed'
        elif species == 'cow':
            negValPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonCow_liver_andMacaque_andRat_andPig_VAL_500bp.bed'
        elif species == 'pig':
            negValPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonPig_liver_andMacaque_andRat_andCow_VAL_500bp.bed'
        
        pred_VAL_ortho = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_VAL_orthologs.csv', header=None)
        
        valPos = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/logPos/{species}_liver_VAL.narrowPeak', header=None, delim_whitespace=True).iloc[:,4]
        valNeg = pd.read_csv(negValPath, header=None, delim_whitespace=True).iloc[:,4]
        
        valPos_len = 2*len(valPos)
        valNeg_len = 2*len(valNeg)
        
        if len(pred_VAL_ortho) != valPos_len+valNeg_len:
            print(f"ERROR VALORTHO ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_valPos = pd.concat([valPos, valPos]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_valNeg = pd.concat([valNeg, valNeg]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_valPos = pred_VAL_ortho.head(valPos_len)
        pred_valNeg = pred_VAL_ortho.tail(valNeg_len)

        #############################################################################
        # load TEST DFs
        
        pred_TEST_ortho = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TEST_orthologs.csv', header=None)
        
        testPos = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_pos_LiuAll/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        testNeg = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/neg/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        testPos_len = 2*len(testPos)
        testNeg_len = 2*len(testNeg)
        
        if len(pred_TEST_ortho) != testPos_len+testNeg_len:
            print(f"ERROR TEST ORTHO ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_testPos = pd.concat([testPos, testPos]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_testNeg = pd.concat([testNeg, testNeg]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_testPos = pred_TEST_ortho.head(testPos_len)
        pred_testNeg = pred_TEST_ortho.tail(testNeg_len)

        #############################################################################
        # load VAL 1,2,3 DFs
        pred_VAL = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_VAL.csv', header=None)
        
        val1_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/val_splits/val1/{species}_liver_VAL_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        val2_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/log_val2/{species}_liver_VAL.narrowPeak', header=None, delim_whitespace=True).iloc[:,4]
        val3_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/log_val3/{species}_liver_VAL.narrowPeak', header=None, delim_whitespace=True).iloc[:,4]
        
        val1_len = 2*len(val1_df)
        val2_len = 2*len(val2_df)
        val3_len = 2*len(val3_df)
        
        if len(pred_VAL) != val1_len+val2_len+val3_len:
            print(f"ERROR VAL ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_val1_df = pd.concat([val1_df, val1_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_val2_df = pd.concat([val2_df, val2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_val3_df = pd.concat([val3_df, val3_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_val1_df = pred_VAL.head(val1_len)
        pred_val2_df = pred_VAL.iloc[val1_len:val1_len + val2_len]
        pred_val3_df = pred_VAL.tail(val3_len)

        #############################################################################
        # load TEST 1,2,3 DFs
        pred_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TEST.csv', header=None)
        
        test1_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_LiuAll_test1/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test2_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test2/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test3_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test3/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        test1_len = 2*len(test1_df)
        test2_len = 2*len(test2_df)
        test3_len = 2*len(test3_df)
        
        if len(pred_TEST) != test1_len+test2_len+test3_len:
            print(f"ERROR TEST ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_test1_df = pd.concat([test1_df, test1_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test2_df = pd.concat([test2_df, test2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test3_df = pd.concat([test3_df, test3_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_test1_df = pred_TEST.head(test1_len)
        pred_test2_df = pred_TEST.iloc[test1_len:test1_len + test2_len]
        pred_test3_df = pred_TEST.tail(test3_len)

        #############################################################################
        # load cow + pig TRAIN DFs
        pred_cow_pig_TRAIN = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_cow_pig_TRAIN.csv', header=None)
        
        train4Pos_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/cow_pig/cow_pig_liver_pos_mouse_macaque_rat_closed_TRAIN_chromName_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        train5Neg_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/cow_pig/cow_pig_liver_neg_mouse_macaque_rat_open_TRAIN_chromName_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        train4_len = 2*len(train4Pos_df)
        train5_len = 2*len(train5Neg_df)
        
        if len(pred_cow_pig_TRAIN) != train4_len+train5_len:
            print(f"ERROR TEST ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_train4_df = pd.concat([train4Pos_df, train4Pos_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_train5_df = pd.concat([train5Neg_df, train5Neg_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_train4_df = pred_cow_pig_TRAIN.head(train4_len)
        pred_train5_df = pred_cow_pig_TRAIN.tail(train5_len)

        #############################################################################
        # load cow + pig VAL DFs
        pred_cow_pig_VAL = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_cow_pig_VAL.csv', header=None)
        
        val4Pos_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/cow_pig/cow_pig_liver_pos_mouse_macaque_rat_closed_VAL_chromName_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        val5Neg_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/cow_pig/cow_pig_liver_neg_mouse_macaque_rat_open_VAL_chromName_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        val4_len = 2*len(val4Pos_df)
        val5_len = 2*len(val5Neg_df)
        
        if len(pred_cow_pig_VAL) != val4_len+val5_len:
            print(f"ERROR TEST ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_val4_df = pd.concat([val4Pos_df, val4Pos_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_val5_df = pd.concat([val5Neg_df, val5Neg_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_val4_df = pred_cow_pig_VAL.head(val4_len)
        pred_val5_df = pred_cow_pig_VAL.tail(val5_len)
        
        #############################################################################
        # load cow + pig TEST DFs
        pred_cow_pig_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_cow_pig_TEST.csv', header=None)
        
        test4Pos_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test4/cow_pig_liver_pos_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test5Neg_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test5/cow_pig_liver_neg_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        test4_len = 2*len(test4Pos_df)
        test5_len = 2*len(test5Neg_df)
        
        if len(pred_cow_pig_TEST) != test4_len+test5_len:
            print(f"ERROR TEST ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_test4_df = pd.concat([test4Pos_df, test4Pos_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test5_df = pd.concat([test5Neg_df, test5Neg_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_test4_df = pred_cow_pig_TEST.head(test4_len)
        pred_test5_df = pred_cow_pig_TEST.tail(test5_len)
        
        #############################################################################
        # Call the correlate function which now uses the globally available DFs
        corr_df = correlate()
        corr_df['species'] = species
        corr_df['model'] = model
        all_results.append(corr_df)
        
        neg_df = negatives()
        neg_df['species'] = species
        neg_df['model'] = model
        neg_results.append(neg_df)
        

# #############################################################################
# FINAL PROCESSING
# #############################################################################

summary_df = pd.concat(all_results)
summary_neg_df = pd.concat(neg_results)

# Define the custom order to place negative groups at the bottom.
custom_group_order = [
    'Train', 'Validation', 'Test', 'Val2', 'Val3', 'Test2', 'Test3', 'Train Cow+Pig', 'Val Cow+Pig', 'Test Cow+Pig'
]

custom_group_order_neg = [
    'Train neg', 'Val neg', 'Test neg', 'Val1 avg pred', 'Test1 avg pred', 'Train Cow+Pig Pred', 'Val Cow+Pig Pred', 'Test Cow+Pig Pred'
]

# Convert 'group' to a categorical type with the specified order.
summary_df['Group'] = pd.Categorical(summary_df['Group'], categories=custom_group_order, ordered=True)
summary_neg_df['Group'] = pd.Categorical(summary_neg_df['Group'], categories=custom_group_order_neg, ordered=True)


# Pivot so each model is a column
pivot_df = summary_df.pivot_table(
    index=['species', 'Group', 'Metric'],
    columns='model',
    values='Value'
)

pivot_neg_df = summary_neg_df.pivot_table(
    index=['species', 'Group', 'Metric'],
    columns='model',
    values='Value'
)

# Sort the index to maintain a logical order (will now use the custom group order)
pivot_df = pivot_df.sort_index(level=['species', 'Group', 'Metric'])
pivot_neg_df = pivot_neg_df.sort_index(level=['species', 'Group', 'Metric'])

pivot_df_reordered = pivot_df[model_list]
pivot_neg_df_reordered = pivot_neg_df[model_list]



metric_vars = ['Pearson', 'Pearson_p', 'Spearman', 'Spearman_p', 'MSE']
pivot_df_reordered = pivot_df_reordered.reindex(metric_vars, level='Metric')

# Loop through each model's column to apply the formatting
for col in pivot_df_reordered.columns:
    pivot_df_reordered[col] = pivot_df_reordered.apply(
        # Access the 'metric' from the index using row.name[2]
        # (assuming it's the 3rd level of your index)
        lambda row: format_value(row.name[2], row[col]),
        axis=1)

display(pivot_df_reordered)
# display(pivot_df_reordered.style.format("{:.3f}"))
        
# display(pivot_neg_df_reordered.style.format("{:.3f}"))

output_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/log_model_eval_table_FINAL_mse.tsv'
# pivot_df_reordered.to_csv(output_filename, sep='\t')

output_neg_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/log_model_neg_table_FINAL.tsv'
# pivot_neg_df_reordered.to_csv(output_neg_filename, sep='\t', float_format='%.3f')

# print(f'Results successfully saved to: {output_filename}')

Unnamed: 0_level_0,Unnamed: 1_level_0,model,bdbi7l3n,7vsdq5k2,wnfdrgcc,8i7h7nsh,ph4wrpxu
species,Group,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
cow,Train,Pearson,0.406,0.39,0.387,0.384,0.388
cow,Train,Pearson_p,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00
cow,Train,Spearman,0.401,0.385,0.381,0.375,0.381
cow,Train,Spearman_p,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00
cow,Train,MSE,2.27,1.87,1.81,1.79,1.94
...,...,...,...,...,...,...,...
rat,Test Cow+Pig,Pearson,0.435,0.383,0.439,0.346,0.402
rat,Test Cow+Pig,Pearson_p,3.52e-06,1.89e-04,2.56e-06,2.24e-03,4.63e-05
rat,Test Cow+Pig,Spearman,0.456,0.427,0.481,0.376,0.451
rat,Test Cow+Pig,Spearman_p,5.75e-07,6.79e-06,5.68e-08,3.16e-04,8.75e-07


In [18]:
# 1 3 5 LOG MODELS
import pandas as pd
import scipy.stats

def correlate(mhc):
    rows = []
    # Lists for correlation calculations
    groups = ['Train', 'Validation', 'Test', 'Val2', 'Val3', 'Test2', 'Test3', 'Train Cow+Pig', 'Val Cow+Pig', 'Test Cow+Pig']
    preds = [pred_trainPos, pred_valPos, pred_testPos, pred_val2_df, pred_val3_df, pred_test2_df, pred_test3_df, pred_train4_df, pred_val4_df, pred_test4_df]
    trues = [doubled_trainPos, doubled_valPos, doubled_testPos, doubled_val2_df, doubled_val3_df, doubled_test2_df, doubled_test3_df, doubled_train4_df, doubled_val4_df, doubled_test4_df]

    # Calculate correlations
    for group, pred_df, true_df in zip(groups, preds, trues):
        x = true_df.squeeze()
        y = pred_df.squeeze()
        pearson, pearson_p = scipy.stats.pearsonr(x, y)
        spearman, spearman_p = scipy.stats.spearmanr(x, y)
        mse = mean_squared_error(x, y)
        pearson_p *= mhc
        spearman_p *= mhc
        
        rows.append({'Group': group, 'Metric': 'Pearson', 'Value': pearson})
        rows.append({'Group': group, 'Metric': 'Pearson_p', 'Value': pearson_p})
        rows.append({'Group': group, 'Metric': 'Spearman', 'Value': spearman})
        rows.append({'Group': group, 'Metric': 'Spearman_p', 'Value': spearman_p})
        rows.append({'Group': group, 'Metric': 'MSE', 'Value': mse})
          
    return pd.DataFrame(rows)

def negatives():
    rows = []
     # Lists for negative average calculations
    negGroup = ['Train neg', 'Val neg', 'Test neg', 'Val1 avg pred', 'Test1 avg pred', 'Train Cow+Pig Pred', 'Val Cow+Pig Pred', 'Test Cow+Pig Pred']
    negValues = [pred_trainNeg.mean().iloc[0], pred_valNeg.mean().iloc[0], pred_testNeg.mean().iloc[0], pred_val1_df.mean().iloc[0], pred_test1_df.mean().iloc[0], pred_train5_df.mean().iloc[0], pred_val5_df.mean().iloc[0], pred_test5_df.mean().iloc[0]]
    
    # Add negative value averages
    for group, negv in zip(negGroup, negValues):
        rows.append({'Group': group, 'Metric': 'Avg Neg Prediction', 'Value': negv})

    return pd.DataFrame(rows)

mhc = 200

all_results = []
neg_results = []
species_list = ['macaque', 'rat', 'cow', 'pig']
model_list = ['bdbi7l3n', 'kf8188qf', 'cq45eb2s']

for species in species_list:
    for model in model_list:
        model_dir = f'{model}_FINAL'

        if model == 'kf8188qf':
            mhc = 100
        
        # Load and process all dfs so correlate() function can access them
        #############################################################################

        # load TRAIN DFs
        negTrainPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonRat_liver_andMacaque_andCow_andPig_TRAIN_500bp.bed'
        if species == 'macaque':
            negTrainPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonMacaque_liver_andRat_andCow_andPig_TRAIN_500bp.bed'
        elif species == 'cow':
            negTrainPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonCow_liver_andMacaque_andRat_andPig_TRAIN_500bp.bed'
        elif species == 'pig':
            negTrainPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonPig_liver_andMacaque_andRat_andCow_TRAIN_500bp.bed'
        
        pred_TRAIN = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TRAIN.csv', header=None)
        
        trainPos = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/logPos/{species}_liver_TRAINONLY.narrowPeak', header=None, delim_whitespace=True).iloc[:,4]
        trainNeg = pd.read_csv(negTrainPath, header=None, delim_whitespace=True).iloc[:,4]

        trainPos_len = 2*len(trainPos)
        trainNeg_len = 2*len(trainNeg)
        
        if len(pred_TRAIN) != trainPos_len+trainNeg_len:
            print(f"ERROR TRAIN ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_trainPos = pd.concat([trainPos, trainPos]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_trainNeg = pd.concat([trainNeg, trainNeg]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_trainPos = pred_TRAIN.head(trainPos_len)
        pred_trainNeg = pred_TRAIN.tail(trainNeg_len)

        #############################################################################

        negValPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonRat_liver_andMacaque_andCow_andPig_VAL_500bp.bed'
        if species == 'macaque':
            negValPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonMacaque_liver_andRat_andCow_andPig_VAL_500bp.bed'
        elif species == 'cow':
            negValPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonCow_liver_andMacaque_andRat_andPig_VAL_500bp.bed'
        elif species == 'pig':
            negValPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonPig_liver_andMacaque_andRat_andCow_VAL_500bp.bed'
        
        # load VAL ORTHO DFs
        pred_VAL_ortho = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_VAL_orthologs.csv', header=None)
        
        valPos = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/logPos/{species}_liver_VAL.narrowPeak', header=None, delim_whitespace=True).iloc[:,4]
        valNeg = pd.read_csv(negValPath, header=None, delim_whitespace=True).iloc[:,4]
        
        valPos_len = 2*len(valPos)
        valNeg_len = 2*len(valNeg)
        
        if len(pred_VAL_ortho) != valPos_len+valNeg_len:
            print(f"ERROR VALORTHO ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_valPos = pd.concat([valPos, valPos]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_valNeg = pd.concat([valNeg, valNeg]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_valPos = pred_VAL_ortho.head(valPos_len)
        pred_valNeg = pred_VAL_ortho.tail(valNeg_len)

        #############################################################################

        # load TEST ORTHO DFs
        pred_TEST_ortho = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TEST_orthologs.csv', header=None)
        
        testPos = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_pos_LiuAll/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        testNeg = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/neg/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        testPos_len = 2*len(testPos)
        testNeg_len = 2*len(testNeg)
        
        if len(pred_TEST_ortho) != testPos_len+testNeg_len:
            print(f"ERROR TEST ORTHO ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_testPos = pd.concat([testPos, testPos]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_testNeg = pd.concat([testNeg, testNeg]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_testPos = pred_TEST_ortho.head(testPos_len)
        pred_testNeg = pred_TEST_ortho.tail(testNeg_len)

        #############################################################################
        # load VAL DFs
        pred_VAL = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_VAL.csv', header=None)
        
        val1_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/val_splits/val1/{species}_liver_VAL_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        val2_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/log_val2/{species}_liver_VAL.narrowPeak', header=None, delim_whitespace=True).iloc[:,4]
        val3_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/log_val3/{species}_liver_VAL.narrowPeak', header=None, delim_whitespace=True).iloc[:,4]
        
        val1_len = 2*len(val1_df)
        val2_len = 2*len(val2_df)
        val3_len = 2*len(val3_df)
        
        if len(pred_VAL) != val1_len+val2_len+val3_len:
            print(f"ERROR VAL ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_val1_df = pd.concat([val1_df, val1_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_val2_df = pd.concat([val2_df, val2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_val3_df = pd.concat([val3_df, val3_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_val1_df = pred_VAL.head(val1_len)
        pred_val2_df = pred_VAL.iloc[val1_len:val1_len + val2_len]
        pred_val3_df = pred_VAL.tail(val3_len)

        #############################################################################
        # load TEST DFs
        pred_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TEST.csv', header=None)
        
        test1_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_LiuAll_test1/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test2_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test2/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test3_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test3/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        test1_len = 2*len(test1_df)
        test2_len = 2*len(test2_df)
        test3_len = 2*len(test3_df)
        
        if len(pred_TEST) != test1_len+test2_len+test3_len:
            print(f"ERROR TEST ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_test1_df = pd.concat([test1_df, test1_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test2_df = pd.concat([test2_df, test2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test3_df = pd.concat([test3_df, test3_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_test1_df = pred_TEST.head(test1_len)
        pred_test2_df = pred_TEST.iloc[test1_len:test1_len + test2_len]
        pred_test3_df = pred_TEST.tail(test3_len)

        #############################################################################
        # load cow+pig TRAIN DFs
        pred_cow_pig_TRAIN = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_cow_pig_TRAIN.csv', header=None)
        
        train4Pos_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/cow_pig/cow_pig_liver_pos_mouse_macaque_rat_closed_TRAIN_chromName_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        train5Neg_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/cow_pig/cow_pig_liver_neg_mouse_macaque_rat_open_TRAIN_chromName_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        train4_len = 2*len(train4Pos_df)
        train5_len = 2*len(train5Neg_df)
        
        if len(pred_cow_pig_TRAIN) != train4_len+train5_len:
            print(f"ERROR TEST ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_train4_df = pd.concat([train4Pos_df, train4Pos_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_train5_df = pd.concat([train5Neg_df, train5Neg_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_train4_df = pred_cow_pig_TRAIN.head(train4_len)
        pred_train5_df = pred_cow_pig_TRAIN.tail(train5_len)

        #############################################################################
        # load cow+pig VAL DFs
        pred_cow_pig_VAL = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_cow_pig_VAL.csv', header=None)
        
        val4Pos_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/cow_pig/cow_pig_liver_pos_mouse_macaque_rat_closed_VAL_chromName_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        val5Neg_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/cow_pig/cow_pig_liver_neg_mouse_macaque_rat_open_VAL_chromName_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        val4_len = 2*len(val4Pos_df)
        val5_len = 2*len(val5Neg_df)
        
        if len(pred_cow_pig_VAL) != val4_len+val5_len:
            print(f"ERROR TEST ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_val4_df = pd.concat([val4Pos_df, val4Pos_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_val5_df = pd.concat([val5Neg_df, val5Neg_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_val4_df = pred_cow_pig_VAL.head(val4_len)
        pred_val5_df = pred_cow_pig_VAL.tail(val5_len)
        
        #############################################################################

        # load TEST DFs
        pred_cow_pig_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_cow_pig_TEST.csv', header=None)
        
        test4Pos_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test4/cow_pig_liver_pos_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test5Neg_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test5/cow_pig_liver_neg_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        test4_len = 2*len(test4Pos_df)
        test5_len = 2*len(test5Neg_df)
        
        if len(pred_cow_pig_TEST) != test4_len+test5_len:
            print(f"ERROR TEST ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_test4_df = pd.concat([test4Pos_df, test4Pos_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test5_df = pd.concat([test5Neg_df, test5Neg_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_test4_df = pred_cow_pig_TEST.head(test4_len)
        pred_test5_df = pred_cow_pig_TEST.tail(test5_len)
        
        # Call the correlate function which now uses the globally available DFs
        corr_df = correlate(mhc)
        corr_df['species'] = species
        corr_df['model'] = model
        all_results.append(corr_df)

        neg_df = negatives()
        neg_df['species'] = species
        neg_df['model'] = model
        neg_results.append(neg_df)

# #############################################################################
# FINAL PROCESSING
# #############################################################################

summary_df = pd.concat(all_results)

summary_neg_df = pd.concat(neg_results)

# Define the custom order to place negative groups at the bottom.
custom_group_order = [
    'Train', 'Validation', 'Test', 'Val2', 'Val3', 'Test2', 'Test3', 'Train Cow+Pig', 'Val Cow+Pig', 'Test Cow+Pig'
]

custom_group_order_neg = [
    'Train neg', 'Val neg', 'Test neg', 'Val1 avg pred', 'Test1 avg pred', 'Train Cow+Pig Pred', 'Val Cow+Pig Pred', 'Test Cow+Pig Pred'
]

# Convert 'group' to a categorical type with the specified order.
summary_df['Group'] = pd.Categorical(summary_df['Group'], categories=custom_group_order, ordered=True)
summary_neg_df['Group'] = pd.Categorical(summary_neg_df['Group'], categories=custom_group_order_neg, ordered=True)


# Pivot so each model is a column
pivot_df = summary_df.pivot_table(
    index=['species', 'Group', 'Metric'],
    columns='model',
    values='Value'
)

pivot_neg_df = summary_neg_df.pivot_table(
    index=['species', 'Group', 'Metric'],
    columns='model',
    values='Value'
)

# Sort the index to maintain a logical order (will now use the custom group order)
pivot_df = pivot_df.sort_index(level=['species', 'Group', 'Metric'])
pivot_neg_df = pivot_neg_df.sort_index(level=['species', 'Group', 'Metric'])

pivot_df_reordered = pivot_df[model_list]
pivot_neg_df_reordered = pivot_neg_df[model_list]

metric_vars = ['Pearson', 'Pearson_p', 'Spearman', 'Spearman_p', 'MSE']
pivot_df_reordered = pivot_df_reordered.reindex(metric_vars, level='Metric')

# Loop through each model's column to apply the formatting
for col in pivot_df_reordered.columns:
    pivot_df_reordered[col] = pivot_df_reordered.apply(
        # Access the 'metric' from the index using row.name[2]
        # (assuming it's the 3rd level of your index)
        lambda row: format_value(row.name[2], row[col]),
        axis=1)

display(pivot_df_reordered)

output_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/135log_model_eval_table_FINAL_mse.tsv'
pivot_df_reordered.to_csv(output_filename, sep='\t', float_format='%.3f')

# output_neg_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/135log_model_neg_table_FINAL.tsv'
# pivot_neg_df_reordered.to_csv(output_neg_filename, sep='\t', float_format='%.3f')

# print(f'Results successfully saved to: {output_filename}')

Unnamed: 0_level_0,Unnamed: 1_level_0,model,bdbi7l3n,kf8188qf,cq45eb2s
species,Group,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cow,Train,Pearson,0.406,0.429,0.518
cow,Train,Pearson_p,0.00e+00,0.00e+00,0.00e+00
cow,Train,Spearman,0.401,0.429,0.518
cow,Train,Spearman_p,0.00e+00,0.00e+00,0.00e+00
cow,Train,MSE,2.27,1.11,1.05
...,...,...,...,...,...
rat,Test Cow+Pig,Pearson,0.435,0.396,0.436
rat,Test Cow+Pig,Pearson_p,1.76e-06,3.60e-05,1.57e-06
rat,Test Cow+Pig,Spearman,0.456,0.433,0.475
rat,Test Cow+Pig,Spearman_p,2.88e-07,2.02e-06,4.70e-08


In [7]:
# 2KB MODEL
import pandas as pd
import scipy.stats

def correlate():
    rows = []
    # Lists for correlation calculations
    groups = ['Test','Test2', 'Test3', 'Test Cow+Pig']
    preds = [pred_testPos, pred_test2_df, pred_test3_df, pred_test4_df]
    trues = [doubled_testPos, doubled_test2_df, doubled_test3_df, doubled_test4_df]

    # Calculate correlations
    for group, pred_df, true_df in zip(groups, preds, trues):
        x = true_df.squeeze()
        y = pred_df.squeeze()
        pearson, _ = scipy.stats.pearsonr(x, y)
        spearman, _ = scipy.stats.spearmanr(x, y)
        rows.append({'Group': group, 'Metric': 'Pearson', 'Value': pearson})
        rows.append({'Group': group, 'Metric': 'Spearman', 'Value': spearman})
          
    return pd.DataFrame(rows)

def negatives():
    rows = []
     # Lists for negative average calculations
    negGroup = ['Test neg', 'Test1 avg pred', 'Test Cow+Pig Pred']
    negValues = [pred_testNeg.mean().iloc[0], pred_test1_df.mean().iloc[0], pred_test5_df.mean().iloc[0]]
    
    # Add negative value averages
    for group, negv in zip(negGroup, negValues):
        rows.append({'Group': group, 'Metric': 'Avg Neg Prediction', 'Value': negv})

    return pd.DataFrame(rows)

# --- Main Script ---
all_results = []
neg_results = []
species_list = ['macaque', 'rat', 'cow', 'pig']
model_list = ['im88hepv']

for species in species_list:
    for model in model_list:
        model_dir = f'{model}_FINAL'
        #############################################################################

        # load TEST ORTHO DFs
        pred_TEST_ortho = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TEST_orthologs.csv', header=None)
        
        testPos = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits_2kb/log_pos_LiuAll/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        testNeg = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits_2kb/neg/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        testPos_len = 2*len(testPos)
        testNeg_len = 2*len(testNeg)
        
        if len(pred_TEST_ortho) != testPos_len+testNeg_len:
            print(f"ERROR TEST ORTHO ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_testPos = pd.concat([testPos, testPos]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_testNeg = pd.concat([testNeg, testNeg]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_testPos = pred_TEST_ortho.head(testPos_len)
        pred_testNeg = pred_TEST_ortho.tail(testNeg_len)


        #############################################################################
        # load TEST DFs
        pred_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TEST.csv', header=None)
        
        test1_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits_2kb/log_LiuAll_test1/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test2_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits_2kb/log_test2/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test3_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits_2kb/log_test3/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        test1_len = 2*len(test1_df)
        test2_len = 2*len(test2_df)
        test3_len = 2*len(test3_df)
        
        if len(pred_TEST) != test1_len+test2_len+test3_len:
            print(f"ERROR TEST ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_test1_df = pd.concat([test1_df, test1_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test2_df = pd.concat([test2_df, test2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test3_df = pd.concat([test3_df, test3_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_test1_df = pred_TEST.head(test1_len)
        pred_test2_df = pred_TEST.iloc[test1_len:test1_len + test2_len]
        pred_test3_df = pred_TEST.tail(test3_len)
        
        #############################################################################
        # load TEST DFs
        pred_cow_pig_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_cow_pig_TEST.csv', header=None)
        
        test4Pos_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits_2kb/log_test4/cow_pig_liver_pos_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test5Neg_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits_2kb/log_test5/cow_pig_liver_neg_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        test4_len = 2*len(test4Pos_df)
        test5_len = 2*len(test5Neg_df)
        
        if len(pred_cow_pig_TEST) != test4_len+test5_len:
            print(f"ERROR TEST ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_test4_df = pd.concat([test4Pos_df, test4Pos_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test5_df = pd.concat([test5Neg_df, test5Neg_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_test4_df = pred_cow_pig_TEST.head(test4_len)
        pred_test5_df = pred_cow_pig_TEST.tail(test5_len)
        
        # Call the correlate function which now uses the globally available DFs
        corr_df = correlate()
        corr_df['species'] = species
        corr_df['model'] = model
        all_results.append(corr_df)

        neg_df = negatives()
        neg_df['species'] = species
        neg_df['model'] = model
        neg_results.append(neg_df)

# #############################################################################
# FINAL PROCESSING
# #############################################################################

summary_df = pd.concat(all_results)

summary_neg_df = pd.concat(neg_results)

# Define the custom order to place negative groups at the bottom.
custom_group_order = [
    'Test', 'Test2', 'Test3','Test Cow+Pig'
]

custom_group_order_neg = [
    'Test neg', 'Test1 avg pred','Test Cow+Pig Pred'
]

# Convert 'group' to a categorical type with the specified order.
summary_df['Group'] = pd.Categorical(summary_df['Group'], categories=custom_group_order, ordered=True)
summary_neg_df['Group'] = pd.Categorical(summary_neg_df['Group'], categories=custom_group_order_neg, ordered=True)


# Pivot so each model is a column
pivot_df = summary_df.pivot_table(
    index=['species', 'Group', 'Metric'],
    columns='model',
    values='Value'
)

pivot_neg_df = summary_neg_df.pivot_table(
    index=['species', 'Group', 'Metric'],
    columns='model',
    values='Value'
)

# Sort the index to maintain a logical order (will now use the custom group order)
pivot_df = pivot_df.sort_index(level=['species', 'Group', 'Metric'])
pivot_neg_df = pivot_neg_df.sort_index(level=['species', 'Group', 'Metric'])

pivot_df_2kb = pivot_df[model_list]
pivot_neg_df_2kb = pivot_neg_df[model_list]

# Display the final pivoted DataFrame
print("--- Final Pivoted Results ---")
display(pivot_df_2kb.style.format("{:.3f}"))
display(pivot_neg_df_2kb.style.format("{:.3f}"))

# print(f'Results successfully saved to: {output_filename}')

--- Final Pivoted Results ---


Unnamed: 0_level_0,Unnamed: 1_level_0,model,im88hepv
species,Group,Metric,Unnamed: 3_level_1
cow,Test,Pearson,0.422
cow,Test,Spearman,0.43
cow,Test2,Pearson,0.37
cow,Test2,Spearman,0.38
cow,Test3,Pearson,0.371
cow,Test3,Spearman,0.37
cow,Test Cow+Pig,Pearson,0.459
cow,Test Cow+Pig,Spearman,0.484
macaque,Test,Pearson,0.348
macaque,Test,Spearman,0.359


Unnamed: 0_level_0,Unnamed: 1_level_0,model,im88hepv
species,Group,Metric,Unnamed: 3_level_1
cow,Test neg,Avg Neg Prediction,0.658
cow,Test1 avg pred,Avg Neg Prediction,0.642
cow,Test Cow+Pig Pred,Avg Neg Prediction,0.708
macaque,Test neg,Avg Neg Prediction,0.648
macaque,Test1 avg pred,Avg Neg Prediction,0.638
macaque,Test Cow+Pig Pred,Avg Neg Prediction,0.708
pig,Test neg,Avg Neg Prediction,0.658
pig,Test1 avg pred,Avg Neg Prediction,0.638
pig,Test Cow+Pig Pred,Avg Neg Prediction,0.708
rat,Test neg,Avg Neg Prediction,0.675


In [8]:
# EQN MODEL
import pandas as pd
import scipy.stats

def correlate():
    rows = []
    # Lists for correlation calculations
    groups = ['Test','Test2', 'Test3', 'Test Cow+Pig']
    preds = [pred_testPos, pred_test2_df, pred_test3_df, pred_test4_df]
    trues = [doubled_testPos, doubled_test2_df, doubled_test3_df, doubled_test4_df]

    # Calculate correlations
    for group, pred_df, true_df in zip(groups, preds, trues):
        x = true_df.squeeze()
        y = pred_df.squeeze()
        pearson, _ = scipy.stats.pearsonr(x, y)
        spearman, _ = scipy.stats.spearmanr(x, y)
        rows.append({'Group': group, 'Metric': 'Pearson', 'Value': pearson})
        rows.append({'Group': group, 'Metric': 'Spearman', 'Value': spearman})
          
    return pd.DataFrame(rows)

def negatives():
    rows = []
     # Lists for negative average calculations
    negGroup = ['Test neg', 'Test1 avg pred', 'Test Cow+Pig Pred']
    negValues = [pred_testNeg.mean().iloc[0], pred_test1_df.mean().iloc[0], pred_test5_df.mean().iloc[0]]
    
    # Add negative value averages
    for group, negv in zip(negGroup, negValues):
        rows.append({'Group': group, 'Metric': 'Avg Neg Prediction', 'Value': negv})

    return pd.DataFrame(rows)

# --- Main Script ---
all_results = []
neg_results = []
species_list = ['macaque', 'rat', 'cow', 'pig']
model_list = ['mcf297qb', '7l12zan1']

for species in species_list:
    for model in model_list:
        model_dir = f'{model}_FINAL'
        #############################################################################

        # load TEST ORTHO DFs
        pred_TEST_ortho = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TEST_orthologs.csv', header=None)
        
        testPos = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits_eqn/log_pos_LiuAll/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        testNeg = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/neg/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        testPos_len = 2*len(testPos)
        testNeg_len = 2*len(testNeg)
        
        if len(pred_TEST_ortho) != testPos_len+testNeg_len:
            print(f"ERROR TEST ORTHO ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_testPos = pd.concat([testPos, testPos]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_testNeg = pd.concat([testNeg, testNeg]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_testPos = pred_TEST_ortho.head(testPos_len)
        pred_testNeg = pred_TEST_ortho.tail(testNeg_len)


        #############################################################################
        # load TEST DFs
        pred_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TEST.csv', header=None)
        
        test1_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_LiuAll_test1/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test2_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits_eqn/log_test2/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test3_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits_eqn/log_test3/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        test1_len = 2*len(test1_df)
        test2_len = 2*len(test2_df)
        test3_len = 2*len(test3_df)
        
        if len(pred_TEST) != test1_len+test2_len+test3_len:
            print(f"ERROR TEST ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_test1_df = pd.concat([test1_df, test1_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test2_df = pd.concat([test2_df, test2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test3_df = pd.concat([test3_df, test3_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_test1_df = pred_TEST.head(test1_len)
        pred_test2_df = pred_TEST.iloc[test1_len:test1_len + test2_len]
        pred_test3_df = pred_TEST.tail(test3_len)
        
        #############################################################################
        # load TEST DFs
        pred_cow_pig_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_cow_pig_TEST.csv', header=None)
        
        test4Pos_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits_eqn/log_test4/cow_pig_liver_pos_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test5Neg_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test5/cow_pig_liver_neg_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        test4_len = 2*len(test4Pos_df)
        test5_len = 2*len(test5Neg_df)
        
        if len(pred_cow_pig_TEST) != test4_len+test5_len:
            print(f"ERROR TEST ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_test4_df = pd.concat([test4Pos_df, test4Pos_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test5_df = pd.concat([test5Neg_df, test5Neg_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_test4_df = pred_cow_pig_TEST.head(test4_len)
        pred_test5_df = pred_cow_pig_TEST.tail(test5_len)
        
        # Call the correlate function which now uses the globally available DFs
        corr_df = correlate()
        corr_df['species'] = species
        corr_df['model'] = model
        all_results.append(corr_df)

        neg_df = negatives()
        neg_df['species'] = species
        neg_df['model'] = model
        neg_results.append(neg_df)

# #############################################################################
# FINAL PROCESSING
# #############################################################################

summary_df = pd.concat(all_results)

summary_neg_df = pd.concat(neg_results)

# Define the custom order to place negative groups at the bottom.
custom_group_order = [
    'Test', 'Test2', 'Test3','Test Cow+Pig'
]

custom_group_order_neg = [
    'Test neg', 'Test1 avg pred','Test Cow+Pig Pred'
]

# Convert 'group' to a categorical type with the specified order.
summary_df['Group'] = pd.Categorical(summary_df['Group'], categories=custom_group_order, ordered=True)
summary_neg_df['Group'] = pd.Categorical(summary_neg_df['Group'], categories=custom_group_order_neg, ordered=True)


# Pivot so each model is a column
pivot_df = summary_df.pivot_table(
    index=['species', 'Group', 'Metric'],
    columns='model',
    values='Value'
)

pivot_neg_df = summary_neg_df.pivot_table(
    index=['species', 'Group', 'Metric'],
    columns='model',
    values='Value'
)

# Sort the index to maintain a logical order (will now use the custom group order)
pivot_df = pivot_df.sort_index(level=['species', 'Group', 'Metric'])
pivot_neg_df = pivot_neg_df.sort_index(level=['species', 'Group', 'Metric'])

pivot_df_eqn = pivot_df[model_list]
pivot_neg_df_eqn = pivot_neg_df[model_list]

# Display the final pivoted DataFrame
print("--- Final Pivoted Results ---")
display(pivot_df_eqn.style.format("{:.3f}"))
display(pivot_neg_df_eqn.style.format("{:.3f}"))

# print(f'Results successfully saved to: {output_filename}')

--- Final Pivoted Results ---


Unnamed: 0_level_0,Unnamed: 1_level_0,model,mcf297qb,7l12zan1
species,Group,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1
cow,Test,Pearson,0.355,0.375
cow,Test,Spearman,0.366,0.383
cow,Test2,Pearson,0.36,0.383
cow,Test2,Spearman,0.369,0.393
cow,Test3,Pearson,0.283,0.298
cow,Test3,Spearman,0.284,0.299
cow,Test Cow+Pig,Pearson,0.422,0.417
cow,Test Cow+Pig,Spearman,0.455,0.449
macaque,Test,Pearson,0.371,0.34
macaque,Test,Spearman,0.372,0.345


Unnamed: 0_level_0,Unnamed: 1_level_0,model,mcf297qb,7l12zan1
species,Group,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1
cow,Test neg,Avg Neg Prediction,0.568,0.476
cow,Test1 avg pred,Avg Neg Prediction,0.52,0.453
cow,Test Cow+Pig Pred,Avg Neg Prediction,0.717,0.625
macaque,Test neg,Avg Neg Prediction,0.542,0.447
macaque,Test1 avg pred,Avg Neg Prediction,0.515,0.433
macaque,Test Cow+Pig Pred,Avg Neg Prediction,0.717,0.625
pig,Test neg,Avg Neg Prediction,0.594,0.477
pig,Test1 avg pred,Avg Neg Prediction,0.538,0.447
pig,Test Cow+Pig Pred,Avg Neg Prediction,0.717,0.625
rat,Test neg,Avg Neg Prediction,0.676,0.525


In [14]:
output_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/2kb_log_model_eval_table_FINAL.tsv'
pivot_df_2kb.to_csv(output_filename, sep='\t', float_format='%.3f')

output_neg_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/eqn_log_model_neg_table_FINAL.tsv'
pivot_neg_df_eqn.to_csv(output_neg_filename, sep='\t', float_format='%.3f')

output_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/eqn_log_model_eval_table_FINAL.tsv'
pivot_df_eqn.to_csv(output_filename, sep='\t', float_format='%.3f')

output_neg_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/2kb_log_model_neg_table_FINAL.tsv'
pivot_neg_df_2kb.to_csv(output_neg_filename, sep='\t', float_format='%.3f')

In [16]:
# 5 BEST LOG MODELS MOUSE ONLY
mhc = 200
import pandas as pd
import scipy.stats

def correlate():
    rows = []
    # Lists for correlation calculations
    groups = ['Train', 'Validation', 'Test']
    preds = [pred_trainPos, pred_valPos, pred_testPos ]
    trues = [doubled_trainPos, doubled_valPos, doubled_testPos]

    # Calculate correlations
    for group, pred_df, true_df in zip(groups, preds, trues):
        x = true_df.squeeze()
        y = pred_df.squeeze()
        pearson, pearson_p = scipy.stats.pearsonr(x, y)
        spearman, spearman_p = scipy.stats.spearmanr(x, y)
        mse = mean_squared_error(x, y)
        pearson_p *= mhc
        rows.append({'Group': group, 'Metric': 'Pearson', 'Value': pearson})
        rows.append({'Group': group, 'Metric': 'Pearson_p', 'Value': pearson_p})
        rows.append({'Group': group, 'Metric': 'Spearman', 'Value': spearman})
        rows.append({'Group': group, 'Metric': 'Spearman_p', 'Value': spearman_p})
        rows.append({'Group': group, 'Metric': 'MSE', 'Value': mse})
          
    return pd.DataFrame(rows)

def negatives():
    rows = []
     # Lists for negative average calculations
    negGroup = ['Val neg', 'Test neg']
    negValues = [pred_valNeg.mean().iloc[0], pred_testNeg.mean().iloc[0]]
    
    # Add negative value averages
    for group, negv in zip(negGroup, negValues):
        rows.append({'Group': group, 'Metric': 'Avg Neg Prediction', 'Value': negv})

    return pd.DataFrame(rows)

# --- Main Script ---
all_results = []
neg_results = []
species_list = ['mouse']
model_list = ['bdbi7l3n', '7vsdq5k2', 'wnfdrgcc', '8i7h7nsh', 'ph4wrpxu']

for species in species_list:
    for model in model_list:
        model_dir = f'{model}_FINAL'
        
        # load TRAIN VAL DFs
        pred_TRAIN_VAL = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TRAIN_VAL.csv', header=None)
        
        trainPos = pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/splits/logPos/mouse_liver_TRAINONLY.narrowPeak', header=None, delim_whitespace=True).iloc[:,4]
        valPos = pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/splits/logPos/mouse_liver_VAL.narrowPeak', header=None, delim_whitespace=True).iloc[:,4] 
        valNeg = pd.read_csv('/home/azstephe/regression_liver/data/splits/negatives/nonMouse_liver_andRat_andCow_andPig_andMacaque_VAL_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]

        trainPos_len = 2*len(trainPos)
        valPos_len = 2*len(valPos)
        valNeg_len = 2*len(valNeg)
        
        if len(pred_TRAIN_VAL) != trainPos_len+valPos_len+valNeg_len:
            print(f"ERROR TRAIN ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_trainPos = pd.concat([trainPos, trainPos]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_valPos = pd.concat([valPos, valPos]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_valNeg = pd.concat([valNeg, valNeg]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_trainPos = pred_TRAIN_VAL.head(trainPos_len)
        pred_valPos = pred_TRAIN_VAL.iloc[trainPos_len:trainPos_len + valPos_len]
        pred_valNeg = pred_TRAIN_VAL.tail(valNeg_len)

        #############################################################################

        # load TEST ORTHO DFs
        pred_TEST_ortho = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TEST.csv', header=None)
        
        testPos = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_pos/mouse_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        testNeg = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/neg/mouse_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        testPos_len = 2*len(testPos)
        testNeg_len = 2*len(testNeg)
        
        if len(pred_TEST_ortho) != testPos_len+testNeg_len:
            print(f"ERROR TEST ORTHO ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_testPos = pd.concat([testPos, testPos]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_testNeg = pd.concat([testNeg, testNeg]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_testPos = pred_TEST_ortho.head(testPos_len)
        pred_testNeg = pred_TEST_ortho.tail(testNeg_len)


        # Call the correlate function which now uses the globally available DFs
        corr_df = correlate()
        corr_df['species'] = species
        corr_df['model'] = model
        all_results.append(corr_df)
        
        neg_df = negatives()
        neg_df['species'] = species
        neg_df['model'] = model
        neg_results.append(neg_df)
        

# #############################################################################
# FINAL PROCESSING
# #############################################################################

summary_df = pd.concat(all_results)

summary_neg_df = pd.concat(neg_results)

# Define the custom order to place negative groups at the bottom.
custom_group_order = [
    'Train', 'Validation', 'Test'
]

custom_group_order_neg = [
    'Val neg', 'Test neg'
]

# Convert 'group' to a categorical type with the specified order.
summary_df['Group'] = pd.Categorical(summary_df['Group'], categories=custom_group_order, ordered=True)
summary_neg_df['Group'] = pd.Categorical(summary_neg_df['Group'], categories=custom_group_order_neg, ordered=True)


# Pivot so each model is a column
pivot_df = summary_df.pivot_table(
    index=['species', 'Group', 'Metric'],
    columns='model',
    values='Value'
)

pivot_neg_df = summary_neg_df.pivot_table(
    index=['species', 'Group', 'Metric'],
    columns='model',
    values='Value'
)

# Sort the index to maintain a logical order (will now use the custom group order)
pivot_df = pivot_df.sort_index(level=['species', 'Group', 'Metric'])
pivot_neg_df = pivot_neg_df.sort_index(level=['species', 'Group', 'Metric'])

pivot_df_reordered = pivot_df[model_list]
pivot_neg_df_reordered = pivot_neg_df[model_list]

metric_vars = ['Pearson', 'Pearson_p', 'Spearman', 'Spearman_p', 'MSE']
pivot_df_reordered = pivot_df_reordered.reindex(metric_vars, level='Metric')

# Loop through each model's column to apply the formatting
for col in pivot_df_reordered.columns:
    pivot_df_reordered[col] = pivot_df_reordered.apply(
        # Access the 'metric' from the index using row.name[2]
        # (assuming it's the 3rd level of your index)
        lambda row: format_value(row.name[2], row[col]),
        axis=1)

display(pivot_df_reordered)

output_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/mouse_log_model_eval_table_FINAL_mse.tsv'
pivot_df_reordered.to_csv(output_filename, sep='\t', float_format='%.3f')

# output_neg_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/mouse_log_model_neg_table_FINAL.tsv'
# pivot_neg_df_reordered.to_csv(output_neg_filename, sep='\t', float_format='%.3f')

# print(f'Results successfully saved to: {output_filename}')

Unnamed: 0_level_0,Unnamed: 1_level_0,model,bdbi7l3n,7vsdq5k2,wnfdrgcc,8i7h7nsh,ph4wrpxu
species,Group,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
mouse,Train,Pearson,0.494,0.497,0.507,0.485,0.478
mouse,Train,Pearson_p,0.0,0.0,0.0,0.0,0.0
mouse,Train,Spearman,0.498,0.509,0.521,0.502,0.501
mouse,Train,Spearman_p,0.0,0.0,0.0,0.0,0.0
mouse,Train,MSE,1.39,1.07,1.02,1.07,1.18
mouse,Validation,Pearson,0.483,0.457,0.465,0.458,0.451
mouse,Validation,Pearson_p,3.49e-232,2.26e-205,3.68e-213,4.93e-206,1.25e-198
mouse,Validation,Spearman,0.491,0.477,0.488,0.48,0.484
mouse,Validation,Spearman_p,4.29e-244,5.5e-228,2.2899999999999998e-240,1.1300000000000001e-231,1.2e-235
mouse,Validation,MSE,1.42,1.13,1.07,1.11,1.22


In [20]:
# 2KB + EQN MODEL MOUSE 
import pandas as pd
import scipy.stats

def correlate():
    rows = []
    # Lists for correlation calculations
    groups = ['Test']
    preds = [pred_testPos ]
    trues = [doubled_testPos]

    # Calculate correlations
    for group, pred_df, true_df in zip(groups, preds, trues):
        x = true_df.squeeze()
        y = pred_df.squeeze()
        pearson, _ = scipy.stats.pearsonr(x, y)
        spearman, _ = scipy.stats.spearmanr(x, y)
        rows.append({'Group': group, 'Metric': 'Pearson', 'Value': pearson})
        rows.append({'Group': group, 'Metric': 'Spearman', 'Value': spearman})
          
    return pd.DataFrame(rows)

def negatives():
    rows = []
     # Lists for negative average calculations
    negGroup = ['Test neg']
    negValues = [pred_testNeg.mean().iloc[0]]
    
    # Add negative value averages
    for group, negv in zip(negGroup, negValues):
        rows.append({'Group': group, 'Metric': 'Avg Neg Prediction', 'Value': negv})

    return pd.DataFrame(rows)

all_results = []
neg_results = []
species_list = ['mouse']
model_list = ['mcf297qb', 'im88hepv']

for species in species_list:
    for model in model_list:
        model_dir = f'{model}_FINAL'
        
        if model=='mcf297qb':
            # load TEST ORTHO DFs
            pred_TEST_ortho = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TEST.csv', header=None)
            
            testPos = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits_eqn/log_pos/mouse_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
            testNeg = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/neg/mouse_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
            
            testPos_len = 2*len(testPos)
            testNeg_len = 2*len(testNeg)
            
            if len(pred_TEST_ortho) != testPos_len+testNeg_len:
                print(f"ERROR TEST ORTHO ({species}, {model}): predictions are a different length than validation sets")
            
            doubled_testPos = pd.concat([testPos, testPos]).sort_index(kind='mergesort').reset_index(drop=True)
            doubled_testNeg = pd.concat([testNeg, testNeg]).sort_index(kind='mergesort').reset_index(drop=True)
            
            pred_testPos = pred_TEST_ortho.head(testPos_len)
            pred_testNeg = pred_TEST_ortho.tail(testNeg_len)  

        else:
            # load TEST ORTHO DFs
            pred_TEST_ortho = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TEST.csv', header=None)
            
            testPos = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits_2kb/log_pos/mouse_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
            testNeg = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits_2kb/neg/mouse_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
            
            testPos_len = 2*len(testPos)
            testNeg_len = 2*len(testNeg)
            
            if len(pred_TEST_ortho) != testPos_len+testNeg_len:
                print(f"ERROR TEST ORTHO ({species}, {model}): predictions are a different length than validation sets")
            
            doubled_testPos = pd.concat([testPos, testPos]).sort_index(kind='mergesort').reset_index(drop=True)
            doubled_testNeg = pd.concat([testNeg, testNeg]).sort_index(kind='mergesort').reset_index(drop=True)
            
            pred_testPos = pred_TEST_ortho.head(testPos_len)
            pred_testNeg = pred_TEST_ortho.tail(testNeg_len)  
            

        # Call the correlate function which now uses the globally available DFs
        corr_df = correlate()
        corr_df['species'] = species
        corr_df['model'] = model
        all_results.append(corr_df)
        
        neg_df = negatives()
        neg_df['species'] = species
        neg_df['model'] = model
        neg_results.append(neg_df)
        

# #############################################################################
# FINAL PROCESSING
# #############################################################################

summary_df = pd.concat(all_results)

summary_neg_df = pd.concat(neg_results)

# Define the custom order to place negative groups at the bottom.
custom_group_order = [
    'Train', 'Validation', 'Test'
]

custom_group_order_neg = [
    'Val neg', 'Test neg'
]

# Convert 'group' to a categorical type with the specified order.
summary_df['Group'] = pd.Categorical(summary_df['Group'], categories=custom_group_order, ordered=True)
summary_neg_df['Group'] = pd.Categorical(summary_neg_df['Group'], categories=custom_group_order_neg, ordered=True)


# Pivot so each model is a column
pivot_df = summary_df.pivot_table(
    index=['species', 'Group', 'Metric'],
    columns='model',
    values='Value'      
)

pivot_neg_df = summary_neg_df.pivot_table(
    index=['species', 'Group', 'Metric'],
    columns='model',
    values='Value'
)

# Sort the index to maintain a logical order (will now use the custom group order)
pivot_df = pivot_df.sort_index(level=['species', 'Group', 'Metric'])
pivot_neg_df = pivot_neg_df.sort_index(level=['species', 'Group', 'Metric'])

pivot_df_reordered = pivot_df[model_list]
pivot_neg_df_reordered = pivot_neg_df[model_list]

# Display the final pivoted DataFrame
print("--- Final Pivoted Results ---")
display(pivot_df_reordered.style.format("{:.3f}"))
display(pivot_neg_df_reordered.style.format("{:.3f}"))

output_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/mouse_eqn_2kb_log_model_eval_table_FINAL.tsv'
# pivot_df_reordered.to_csv(output_filename, sep='\t', float_format='%.3f')

output_neg_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/mouse_eqn_2kb_log_model_neg_table_FINAL.tsv'
# pivot_neg_df_reordered.to_csv(output_neg_filename, sep='\t', float_format='%.3f')

# print(f'Results successfully saved to: {output_filename}')

--- Final Pivoted Results ---


Unnamed: 0_level_0,Unnamed: 1_level_0,model,mcf297qb,im88hepv
species,Group,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1
mouse,Test,Pearson,0.479,0.464
mouse,Test,Spearman,0.477,0.464


Unnamed: 0_level_0,Unnamed: 1_level_0,model,mcf297qb,im88hepv
species,Group,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1
mouse,Test neg,Avg Neg Prediction,0.541,0.645


In [21]:
# 1 3 5 LOG MODELS MOUSE
import pandas as pd
import scipy.stats

def correlate():
    rows = []
    # Lists for correlation calculations
    groups = ['Train', 'Validation', 'Test']
    preds = [pred_trainPos, pred_valPos, pred_testPos ]
    trues = [doubled_trainPos, doubled_valPos, doubled_testPos]

    # Calculate correlations
    for group, pred_df, true_df in zip(groups, preds, trues):
        x = true_df.squeeze()
        y = pred_df.squeeze()
        pearson, _ = scipy.stats.pearsonr(x, y)
        spearman, _ = scipy.stats.spearmanr(x, y)
        rows.append({'Group': group, 'Metric': 'Pearson', 'Value': pearson})
        rows.append({'Group': group, 'Metric': 'Spearman', 'Value': spearman})
          
    return pd.DataFrame(rows)

def negatives():
    rows = []
     # Lists for negative average calculations
    negGroup = ['Val neg', 'Test neg']
    negValues = [pred_valNeg.mean().iloc[0], pred_testNeg.mean().iloc[0]]
    
    # Add negative value averages
    for group, negv in zip(negGroup, negValues):
        rows.append({'Group': group, 'Metric': 'Avg Neg Prediction', 'Value': negv})

    return pd.DataFrame(rows)

# --- Main Script ---
all_results = []
neg_results = []
species_list = ['mouse']
model_list = ['bdbi7l3n', 'kf8188qf', 'cq45eb2s']

for species in species_list:
    for model in model_list:
        model_dir = f'{model}_FINAL'
        
        # load TRAIN VAL DFs
        pred_TRAIN_VAL = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TRAIN_VAL.csv', header=None)
        
        trainPos = pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/splits/logPos/mouse_liver_TRAINONLY.narrowPeak', header=None, delim_whitespace=True).iloc[:,4]
        valPos = pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/splits/logPos/mouse_liver_VAL.narrowPeak', header=None, delim_whitespace=True).iloc[:,4] 
        valNeg = pd.read_csv('/home/azstephe/regression_liver/data/splits/negatives/nonMouse_liver_andRat_andCow_andPig_andMacaque_VAL_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]

        trainPos_len = 2*len(trainPos)
        valPos_len = 2*len(valPos)
        valNeg_len = 2*len(valNeg)
        
        if len(pred_TRAIN_VAL) != trainPos_len+valPos_len+valNeg_len:
            print(f"ERROR TRAIN ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_trainPos = pd.concat([trainPos, trainPos]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_valPos = pd.concat([valPos, valPos]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_valNeg = pd.concat([valNeg, valNeg]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_trainPos = pred_TRAIN_VAL.head(trainPos_len)
        pred_valPos = pred_TRAIN_VAL.iloc[trainPos_len:trainPos_len + valPos_len]
        pred_valNeg = pred_TRAIN_VAL.tail(valNeg_len)

        #############################################################################

        # load TEST ORTHO DFs
        pred_TEST_ortho = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TEST.csv', header=None)
        
        testPos = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_pos/mouse_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        testNeg = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/neg/mouse_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        testPos_len = 2*len(testPos)
        testNeg_len = 2*len(testNeg)
        
        if len(pred_TEST_ortho) != testPos_len+testNeg_len:
            print(f"ERROR TEST ORTHO ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_testPos = pd.concat([testPos, testPos]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_testNeg = pd.concat([testNeg, testNeg]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_testPos = pred_TEST_ortho.head(testPos_len)
        pred_testNeg = pred_TEST_ortho.tail(testNeg_len)


        # Call the correlate function which now uses the globally available DFs
        corr_df = correlate()
        corr_df['species'] = species
        corr_df['model'] = model
        all_results.append(corr_df)
        
        neg_df = negatives()
        neg_df['species'] = species
        neg_df['model'] = model
        neg_results.append(neg_df)
        

# #############################################################################
# FINAL PROCESSING
# #############################################################################

summary_df = pd.concat(all_results)

summary_neg_df = pd.concat(neg_results)

# Define the custom order to place negative groups at the bottom.
custom_group_order = [
    'Train', 'Validation', 'Test'
]

custom_group_order_neg = [
    'Val neg', 'Test neg'
]

# Convert 'group' to a categorical type with the specified order.
summary_df['Group'] = pd.Categorical(summary_df['Group'], categories=custom_group_order, ordered=True)
summary_neg_df['Group'] = pd.Categorical(summary_neg_df['Group'], categories=custom_group_order_neg, ordered=True)


# Pivot so each model is a column
pivot_df = summary_df.pivot_table(
    index=['species', 'Group', 'Metric'],
    columns='model',
    values='Value'
)

pivot_neg_df = summary_neg_df.pivot_table(
    index=['species', 'Group', 'Metric'],
    columns='model',
    values='Value'
)

# Sort the index to maintain a logical order (will now use the custom group order)
pivot_df = pivot_df.sort_index(level=['species', 'Group', 'Metric'])
pivot_neg_df = pivot_neg_df.sort_index(level=['species', 'Group', 'Metric'])

pivot_df_reordered = pivot_df[model_list]
pivot_neg_df_reordered = pivot_neg_df[model_list]

# Display the final pivoted DataFrame
print("--- Final Pivoted Results ---")
display(pivot_df_reordered.style.format("{:.3f}"))
display(pivot_neg_df_reordered.style.format("{:.3f}"))

output_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/mouse_log_model_eval_table_FINAL.tsv'
# pivot_df_reordered.to_csv(output_filename, sep='\t', float_format='%.3f')

output_neg_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/mouse_log_model_neg_table_FINAL.tsv'
# pivot_neg_df_reordered.to_csv(output_neg_filename, sep='\t', float_format='%.3f')

# print(f'Results successfully saved to: {output_filename}')

--- Final Pivoted Results ---


Unnamed: 0_level_0,Unnamed: 1_level_0,model,bdbi7l3n,kf8188qf,cq45eb2s
species,Group,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
mouse,Train,Pearson,0.494,0.644,0.645
mouse,Train,Spearman,0.498,0.64,0.639
mouse,Validation,Pearson,0.483,0.584,0.597
mouse,Validation,Spearman,0.491,0.582,0.588
mouse,Test,Pearson,0.496,0.6,0.628
mouse,Test,Spearman,0.502,0.601,0.626


Unnamed: 0_level_0,Unnamed: 1_level_0,model,bdbi7l3n,kf8188qf,cq45eb2s
species,Group,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
mouse,Val neg,Avg Neg Prediction,0.619,0.46,0.429
mouse,Test neg,Avg Neg Prediction,0.609,0.446,0.424


In [3]:
# MOUSE TRAIN NEG
import pandas as pd
import scipy.stats

model_list = ['bdbi7l3n', '7vsdq5k2', 'wnfdrgcc', '8i7h7nsh', 'ph4wrpxu', 'im88hepv', 'mcf297qb', 'kf8188qf', 'cq45eb2s']


for model in model_list:
    print(model)
    model_dir = f'{model}_FINAL'

    if model=='im88hepv':
        pred_trainNeg = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_mouse_TRAIN_NEG.csv', header=None)
        
    else:
        pred_trainNeg = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_mouse_TRAIN_NEG.csv', header=None)

    mean_value = pred_trainNeg.mean().iloc[0]
    print(f'{mean_value:.3f}')
    


bdbi7l3n
0.612
7vsdq5k2
0.584
wnfdrgcc
0.648
8i7h7nsh
0.671
ph4wrpxu
0.674
im88hepv
0.646
mcf297qb
0.443
kf8188qf
0.403
cq45eb2s
0.401
