In [3]:
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
import numpy as np

def pearson_spearman(x, y):
    """Print the Pearson and Spearman correlation of ``x`` and ``y``.

    Each coefficient is printed with four decimals and its p-value with four
    significant digits. Nothing is returned.
    """
    r_value, r_pvalue = scipy.stats.pearsonr(x, y)
    print(f"Pearson correlation coefficient: {r_value:.4f}, p-value: {r_pvalue:.4g}")

    rho_value, rho_pvalue = scipy.stats.spearmanr(x, y)
    print(f"Spearman correlation coefficient: {rho_value:.4f}, p-value: {rho_pvalue:.4g}")

# Species names used by the evaluation cells to build per-species file paths.
species_list = [
    'macaque',
    'rat',
    'cow',
    'pig',
]

def mean_squared_error(x, y):
    """Return the mean of the element-wise squared differences of ``x`` and ``y``.

    Both arguments are converted with ``np.asarray`` before subtracting, so
    lists, tuples and pandas objects are accepted.
    """
    residuals = np.asarray(x) - np.asarray(y)
    return np.mean(residuals ** 2)

def format_value(metric_name, value):
    """Render ``value`` as text using a format chosen from the metric name.

    Metric names containing ``"_p"`` (e.g. ``Pearson_p``, ``Spearman_p``) are
    treated as p-values and written in scientific notation with two decimals;
    every other metric uses three significant digits.
    """
    format_spec = ".2e" if "_p" in metric_name else ".3g"
    return format(value, format_spec)

In [6]:
# 1 3 5 LOG MODELS
import pandas as pd
import scipy.stats

def correlate(mhc):
    """Compute correlation and error metrics for every positive evaluation group.

    The predictions and targets are not passed in: this function reads the
    notebook-level ``pred_*`` and ``doubled_*`` variables, so they must already
    be assigned for the species/model being scored.

    Args:
        mhc: Factor the Pearson and Spearman p-values are multiplied by
            (a Bonferroni-style multiple-hypothesis correction).

    Returns:
        A long-format ``pd.DataFrame`` with columns ``Group``, ``Metric`` and
        ``Value``. Each group contributes five rows, in this order:
        ``Pearson``, ``Pearson_p``, ``Spearman``, ``Spearman_p``, ``MSE``.
        Corrected p-values are capped at 1.0.
    """
    # The three lists are parallel: groups[i] labels the pair (preds[i], trues[i]).
    groups = ['Train', 'Validation', 'Test', 'Val2', 'Val3', 'Test2', 'Test3', 'Train Cow+Pig', 'Val Cow+Pig', 'Test Cow+Pig']
    preds = [pred_trainPos, pred_valPos, pred_testPos, pred_val2_df, pred_val3_df, pred_test2_df, pred_test3_df, pred_train4_df, pred_val4_df, pred_test4_df]
    trues = [doubled_trainPos, doubled_valPos, doubled_testPos, doubled_val2_df, doubled_val3_df, doubled_test2_df, doubled_test3_df, doubled_train4_df, doubled_val4_df, doubled_test4_df]

    rows = []
    for group, pred_df, true_df in zip(groups, preds, trues):
        # squeeze() collapses a single-column frame to one dimension for scipy.
        x = true_df.squeeze()
        y = pred_df.squeeze()
        pearson, pearson_p = scipy.stats.pearsonr(x, y)
        spearman, spearman_p = scipy.stats.spearmanr(x, y)

        metrics = {
            'Pearson': pearson,
            # A corrected p-value is still a probability, so clamp p * mhc at 1.
            # min() keeps its first argument when that is NaN, so NaN passes through.
            'Pearson_p': min(pearson_p * mhc, 1.0),
            'Spearman': spearman,
            'Spearman_p': min(spearman_p * mhc, 1.0),
            'MSE': mean_squared_error(x, y),
        }
        rows.extend(
            {'Group': group, 'Metric': metric, 'Value': value}
            for metric, value in metrics.items()
        )

    return pd.DataFrame(rows)

def negatives():
    """Return the mean prediction of each negative / reference group.

    Reads the notebook-level ``pred_*`` frames, so they must already be
    assigned for the species/model being summarised. For every frame the mean
    of its first column is reported.

    Returns:
        A ``pd.DataFrame`` with columns ``Group``, ``Metric`` and ``Value``;
        ``Metric`` is always ``'Avg Neg Prediction'``.
    """
    labelled_frames = [
        ('Train neg', pred_trainNeg),
        ('Val neg', pred_valNeg),
        ('Test neg', pred_testNeg),
        ('Val1 avg pred', pred_val1_df),
        ('Test1 avg pred', pred_test1_df),
        ('Train Cow+Pig Pred', pred_train5_df),
        ('Val Cow+Pig Pred', pred_val5_df),
        ('Test Cow+Pig Pred', pred_test5_df),
    ]

    # DataFrame.mean() gives one value per column; .iloc[0] takes the first column's mean.
    return pd.DataFrame([
        {'Group': label, 'Metric': 'Avg Neg Prediction', 'Value': frame.mean().iloc[0]}
        for label, frame in labelled_frames
    ])

# Factor passed to correlate() to scale the correlation p-values.
mhc = 200

# Accumulators: one long-format table is appended per (species, model) pair.
all_results, neg_results = [], []

species_list = [
    'macaque',
    'rat',
    'cow',
    'pig',
]
# Model run identifiers; each one has a '<id>_FINAL' output directory.
model_list = [
    'p1bd3srf',
    '5lesvftp',
    'ho7szpgy',
]

# Evaluate every (species, model) pair.
#
# correlate() and negatives() take no data arguments: they read the pred_* and
# doubled_* variables assigned below as notebook globals, so those variable
# names must not be renamed.
#
# Every label file is paired with twice as many prediction rows (hence the
# 2 * len(...) sizes and the doubled_* targets) - presumably two predictions
# per region, e.g. one per strand; TODO confirm against the prediction script.

# Root of the data tree; every input path below is built from it.
DATA_ROOT = '/home/azstephe/liverRegression/regression_liver/data'

# File-name stem of the negative set for each species. Species not listed here
# fall back to the rat files.
NEG_FILE_STEMS = {
    'macaque': 'nonMacaque_liver_andRat_andCow_andPig',
    'cow': 'nonCow_liver_andMacaque_andRat_andPig',
    'pig': 'nonPig_liver_andMacaque_andRat_andCow',
}
DEFAULT_NEG_FILE_STEM = 'nonRat_liver_andMacaque_andCow_andPig'


def _read_scores(path):
    """Read a whitespace-delimited, header-less file and return its fifth column."""
    # sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True.
    return pd.read_csv(path, header=None, sep=r'\s+').iloc[:, 4]


def _read_activations(model_dir, split_name):
    """Read the header-less prediction file ``activations_<split_name>.csv`` of one model."""
    return pd.read_csv(f'{DATA_ROOT}/model_outputs/{model_dir}/activations_{split_name}.csv', header=None)


def _repeat_each_twice(scores):
    """Return ``scores`` with every entry repeated back to back (a, a, b, b, ...)."""
    # The stable mergesort keeps the two copies of each original row adjacent.
    return pd.concat([scores, scores]).sort_index(kind='mergesort').reset_index(drop=True)


def _report_length_mismatch(label, predictions, expected_len, species, model):
    """Print a diagnostic when a prediction file does not have the expected row count."""
    if len(predictions) != expected_len:
        print(f"ERROR {label} ({species}, {model}): {len(predictions)} predictions "
              f"but {expected_len} rows expected from the label files")


for species in species_list:
    print(species)
    neg_stem = NEG_FILE_STEMS.get(species, DEFAULT_NEG_FILE_STEM)

    for model in model_list:
        print(model)
        model_dir = f'{model}_FINAL'

        # Pick the correction factor per model instead of overwriting the
        # notebook-level `mhc`, so the special case for 'kf8188qf' cannot leak
        # into the models and species processed after it.
        model_mhc = 100 if model == 'kf8188qf' else mhc

        #############################################################################
        # TRAIN: positives come first in the prediction file, negatives last
        negTrainPath = f'{DATA_ROOT}/splits/negatives/{neg_stem}_TRAIN_500bp.bed'

        pred_TRAIN = _read_activations(model_dir, f'{species}_TRAIN')

        trainPos = _read_scores(f'{DATA_ROOT}/splits/logPos/{species}_liver_TRAINONLY.narrowPeak')
        trainNeg = _read_scores(negTrainPath)

        trainPos_len = 2 * len(trainPos)
        trainNeg_len = 2 * len(trainNeg)
        _report_length_mismatch('TRAIN', pred_TRAIN, trainPos_len + trainNeg_len, species, model)

        doubled_trainPos = _repeat_each_twice(trainPos)
        doubled_trainNeg = _repeat_each_twice(trainNeg)

        pred_trainPos = pred_TRAIN.head(trainPos_len)
        pred_trainNeg = pred_TRAIN.tail(trainNeg_len)

        #############################################################################
        # VAL orthologs
        negValPath = f'{DATA_ROOT}/splits/negatives/{neg_stem}_VAL_500bp.bed'

        pred_VAL_ortho = _read_activations(model_dir, f'{species}_VAL_orthologs')

        valPos = _read_scores(f'{DATA_ROOT}/splits/logPos/{species}_liver_VAL.narrowPeak')
        valNeg = _read_scores(negValPath)

        valPos_len = 2 * len(valPos)
        valNeg_len = 2 * len(valNeg)
        _report_length_mismatch('VALORTHO', pred_VAL_ortho, valPos_len + valNeg_len, species, model)

        doubled_valPos = _repeat_each_twice(valPos)
        doubled_valNeg = _repeat_each_twice(valNeg)

        pred_valPos = pred_VAL_ortho.head(valPos_len)
        pred_valNeg = pred_VAL_ortho.tail(valNeg_len)

        #############################################################################
        # TEST orthologs
        pred_TEST_ortho = _read_activations(model_dir, f'{species}_TEST_orthologs')

        testPos = _read_scores(f'{DATA_ROOT}/test_splits/log_pos_LiuAll/{species}_liver_TEST_500bp.bed')
        testNeg = _read_scores(f'{DATA_ROOT}/test_splits/neg/{species}_liver_TEST_500bp.bed')

        testPos_len = 2 * len(testPos)
        testNeg_len = 2 * len(testNeg)
        _report_length_mismatch('TEST ORTHO', pred_TEST_ortho, testPos_len + testNeg_len, species, model)

        doubled_testPos = _repeat_each_twice(testPos)
        doubled_testNeg = _repeat_each_twice(testNeg)

        pred_testPos = pred_TEST_ortho.head(testPos_len)
        # Sanity output: mean prediction on the positive test set.
        print(pred_testPos.mean())

        pred_testNeg = pred_TEST_ortho.tail(testNeg_len)

        #############################################################################
        # VAL: the prediction file is val1, then val2, then val3
        pred_VAL = _read_activations(model_dir, f'{species}_VAL')

        val1_df = _read_scores(f'{DATA_ROOT}/val_splits/val1/{species}_liver_VAL_500bp.bed')
        val2_df = _read_scores(f'{DATA_ROOT}/splits/log_val2/{species}_liver_VAL.narrowPeak')
        val3_df = _read_scores(f'{DATA_ROOT}/splits/log_val3/{species}_liver_VAL.narrowPeak')

        val1_len = 2 * len(val1_df)
        val2_len = 2 * len(val2_df)
        val3_len = 2 * len(val3_df)
        _report_length_mismatch('VAL', pred_VAL, val1_len + val2_len + val3_len, species, model)

        doubled_val1_df = _repeat_each_twice(val1_df)
        doubled_val2_df = _repeat_each_twice(val2_df)
        doubled_val3_df = _repeat_each_twice(val3_df)

        pred_val1_df = pred_VAL.head(val1_len)
        pred_val2_df = pred_VAL.iloc[val1_len:val1_len + val2_len]
        pred_val3_df = pred_VAL.tail(val3_len)

        #############################################################################
        # TEST: the prediction file is test1, then test2, then test3
        pred_TEST = _read_activations(model_dir, f'{species}_TEST')

        test1_df = _read_scores(f'{DATA_ROOT}/test_splits/log_LiuAll_test1/{species}_liver_TEST_500bp.bed')
        test2_df = _read_scores(f'{DATA_ROOT}/test_splits/log_test2/{species}_liver_TEST_500bp.bed')
        test3_df = _read_scores(f'{DATA_ROOT}/test_splits/log_test3/{species}_liver_TEST_500bp.bed')

        test1_len = 2 * len(test1_df)
        test2_len = 2 * len(test2_df)
        test3_len = 2 * len(test3_df)
        _report_length_mismatch('TEST', pred_TEST, test1_len + test2_len + test3_len, species, model)

        doubled_test1_df = _repeat_each_twice(test1_df)
        doubled_test2_df = _repeat_each_twice(test2_df)
        doubled_test3_df = _repeat_each_twice(test3_df)

        pred_test1_df = pred_TEST.head(test1_len)
        pred_test2_df = pred_TEST.iloc[test1_len:test1_len + test2_len]
        pred_test3_df = pred_TEST.tail(test3_len)

        #############################################################################
        # cow+pig TRAIN (these inputs depend on the model only, not on the species)
        pred_cow_pig_TRAIN = _read_activations(model_dir, 'cow_pig_TRAIN')

        train4Pos_df = _read_scores(f'{DATA_ROOT}/splits/cow_pig/cow_pig_liver_pos_mouse_macaque_rat_closed_TRAIN_chromName_500bp.bed')
        train5Neg_df = _read_scores(f'{DATA_ROOT}/splits/cow_pig/cow_pig_liver_neg_mouse_macaque_rat_open_TRAIN_chromName_500bp.bed')

        train4_len = 2 * len(train4Pos_df)
        train5_len = 2 * len(train5Neg_df)
        _report_length_mismatch('COW+PIG TRAIN', pred_cow_pig_TRAIN, train4_len + train5_len, species, model)

        doubled_train4_df = _repeat_each_twice(train4Pos_df)
        doubled_train5_df = _repeat_each_twice(train5Neg_df)

        pred_train4_df = pred_cow_pig_TRAIN.head(train4_len)
        pred_train5_df = pred_cow_pig_TRAIN.tail(train5_len)

        #############################################################################
        # cow+pig VAL
        pred_cow_pig_VAL = _read_activations(model_dir, 'cow_pig_VAL')

        val4Pos_df = _read_scores(f'{DATA_ROOT}/splits/cow_pig/cow_pig_liver_pos_mouse_macaque_rat_closed_VAL_chromName_500bp.bed')
        val5Neg_df = _read_scores(f'{DATA_ROOT}/splits/cow_pig/cow_pig_liver_neg_mouse_macaque_rat_open_VAL_chromName_500bp.bed')

        val4_len = 2 * len(val4Pos_df)
        val5_len = 2 * len(val5Neg_df)
        _report_length_mismatch('COW+PIG VAL', pred_cow_pig_VAL, val4_len + val5_len, species, model)

        doubled_val4_df = _repeat_each_twice(val4Pos_df)
        doubled_val5_df = _repeat_each_twice(val5Neg_df)

        pred_val4_df = pred_cow_pig_VAL.head(val4_len)
        pred_val5_df = pred_cow_pig_VAL.tail(val5_len)

        #############################################################################
        # cow+pig TEST
        pred_cow_pig_TEST = _read_activations(model_dir, 'cow_pig_TEST')

        test4Pos_df = _read_scores(f'{DATA_ROOT}/test_splits/log_test4/cow_pig_liver_pos_TEST_500bp.bed')
        test5Neg_df = _read_scores(f'{DATA_ROOT}/test_splits/log_test5/cow_pig_liver_neg_TEST_500bp.bed')

        test4_len = 2 * len(test4Pos_df)
        test5_len = 2 * len(test5Neg_df)
        _report_length_mismatch('COW+PIG TEST', pred_cow_pig_TEST, test4_len + test5_len, species, model)

        doubled_test4_df = _repeat_each_twice(test4Pos_df)
        doubled_test5_df = _repeat_each_twice(test5Neg_df)

        pred_test4_df = pred_cow_pig_TEST.head(test4_len)
        pred_test5_df = pred_cow_pig_TEST.tail(test5_len)

        #############################################################################
        # Score the frames assigned above and tag the rows with their origin.
        corr_df = correlate(model_mhc)
        corr_df['species'] = species
        corr_df['model'] = model
        all_results.append(corr_df)

        neg_df = negatives()
        neg_df['species'] = species
        neg_df['model'] = model
        neg_results.append(neg_df)

# #############################################################################
# FINAL PROCESSING
# #############################################################################

# Stack the per-(species, model) tables into one long frame per table type.
summary_df = pd.concat(all_results)
summary_neg_df = pd.concat(neg_results)

# Row order of the groups within each species.
custom_group_order = [
    'Train', 'Validation', 'Test',
    'Val2', 'Val3', 'Test2', 'Test3',
    'Train Cow+Pig', 'Val Cow+Pig', 'Test Cow+Pig',
]
custom_group_order_neg = [
    'Train neg', 'Val neg', 'Test neg',
    'Val1 avg pred', 'Test1 avg pred',
    'Train Cow+Pig Pred', 'Val Cow+Pig Pred', 'Test Cow+Pig Pred',
]

# An ordered categorical makes sort_index() follow the lists above rather than
# alphabetical order.
summary_df['Group'] = pd.Categorical(summary_df['Group'], categories=custom_group_order, ordered=True)
summary_neg_df['Group'] = pd.Categorical(summary_neg_df['Group'], categories=custom_group_order_neg, ordered=True)

# One row per (species, Group, Metric), one column per model.
row_levels = ['species', 'Group', 'Metric']

pivot_df = (
    summary_df
    .pivot_table(index=row_levels, columns='model', values='Value')
    .sort_index(level=row_levels)
)
pivot_neg_df = (
    summary_neg_df
    .pivot_table(index=row_levels, columns='model', values='Value')
    .sort_index(level=row_levels)
)

# Put the model columns in model_list order and the metrics in reporting order.
metric_vars = ['Pearson', 'Pearson_p', 'Spearman', 'Spearman_p', 'MSE']
pivot_df_reordered = pivot_df[model_list].reindex(metric_vars, level='Metric')
pivot_neg_df_reordered = pivot_neg_df[model_list]

# Replace every number with its display string; the format depends on the
# metric named in the third index level.
metric_labels = pivot_df_reordered.index.get_level_values('Metric')
for col in list(pivot_df_reordered.columns):
    pivot_df_reordered[col] = [
        format_value(metric, value)
        for metric, value in zip(metric_labels, pivot_df_reordered[col])
    ]

display(pivot_df_reordered)

output_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/mouse_cow_pig_model_eval_table_FINAL_mse.tsv'
pivot_df_reordered.to_csv(output_filename, sep='\t', float_format='%.3f')

output_neg_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/neg_mouse_cow_pig_model_eval_table_FINAL_mse.tsv'
pivot_neg_df_reordered.to_csv(output_neg_filename, sep='\t', float_format='%.3f')

macaque
p1bd3srf
0    1.240048
dtype: float64
5lesvftp
0    1.052396
dtype: float64
ho7szpgy
0    1.188445
dtype: float64
rat
p1bd3srf
0    1.376158
dtype: float64
5lesvftp
0    1.178847
dtype: float64
ho7szpgy
0    1.302211
dtype: float64
cow
p1bd3srf
0    1.447919
dtype: float64
5lesvftp
0    1.247139
dtype: float64
ho7szpgy
0    1.395171
dtype: float64
pig
p1bd3srf
0    1.340672
dtype: float64
5lesvftp
0    1.1427
dtype: float64
ho7szpgy
0    1.291942
dtype: float64


Unnamed: 0_level_0,Unnamed: 1_level_0,model,p1bd3srf,5lesvftp,ho7szpgy
species,Group,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cow,Train,Pearson,0.494,0.458,0.47
cow,Train,Pearson_p,0.00e+00,0.00e+00,0.00e+00
cow,Train,Spearman,0.501,0.457,0.478
cow,Train,Spearman_p,0.00e+00,0.00e+00,0.00e+00
cow,Train,MSE,1.07,1.48,1.21
...,...,...,...,...,...
rat,Test Cow+Pig,Pearson,0.365,0.382,0.379
rat,Test Cow+Pig,Pearson_p,6.72e-04,2.10e-04,2.54e-04
rat,Test Cow+Pig,Spearman,0.389,0.425,0.425
rat,Test Cow+Pig,Spearman_p,1.27e-04,7.70e-06,7.56e-06


In [5]:
# 1 3 5 LOG MODELS
import pandas as pd
import scipy.stats

def correlate(mhc):
    """Compute correlation and error metrics for every positive evaluation group.

    The predictions and targets are not passed in: this function reads the
    notebook-level ``pred_*`` and ``doubled_*`` variables, so they must already
    be assigned for the species/model being scored.

    Args:
        mhc: Factor the Pearson and Spearman p-values are multiplied by
            (a Bonferroni-style multiple-hypothesis correction).

    Returns:
        A long-format ``pd.DataFrame`` with columns ``Group``, ``Metric`` and
        ``Value``. Each group contributes five rows, in this order:
        ``Pearson``, ``Pearson_p``, ``Spearman``, ``Spearman_p``, ``MSE``.
        Corrected p-values are capped at 1.0.
    """
    # The three lists are parallel: groups[i] labels the pair (preds[i], trues[i]).
    groups = ['Train', 'Validation', 'Test', 'Val2', 'Val3', 'Test2', 'Test3', 'Train Cow+Pig', 'Val Cow+Pig', 'Test Cow+Pig']
    preds = [pred_trainPos, pred_valPos, pred_testPos, pred_val2_df, pred_val3_df, pred_test2_df, pred_test3_df, pred_train4_df, pred_val4_df, pred_test4_df]
    trues = [doubled_trainPos, doubled_valPos, doubled_testPos, doubled_val2_df, doubled_val3_df, doubled_test2_df, doubled_test3_df, doubled_train4_df, doubled_val4_df, doubled_test4_df]

    rows = []
    for group, pred_df, true_df in zip(groups, preds, trues):
        # squeeze() collapses a single-column frame to one dimension for scipy.
        x = true_df.squeeze()
        y = pred_df.squeeze()
        pearson, pearson_p = scipy.stats.pearsonr(x, y)
        spearman, spearman_p = scipy.stats.spearmanr(x, y)

        metrics = {
            'Pearson': pearson,
            # A corrected p-value is still a probability, so clamp p * mhc at 1.
            # min() keeps its first argument when that is NaN, so NaN passes through.
            'Pearson_p': min(pearson_p * mhc, 1.0),
            'Spearman': spearman,
            'Spearman_p': min(spearman_p * mhc, 1.0),
            'MSE': mean_squared_error(x, y),
        }
        rows.extend(
            {'Group': group, 'Metric': metric, 'Value': value}
            for metric, value in metrics.items()
        )

    return pd.DataFrame(rows)

def negatives():
    """Return the mean prediction of each negative / reference group.

    Reads the notebook-level ``pred_*`` frames, so they must already be
    assigned for the species/model being summarised. For every frame the mean
    of its first column is reported.

    Returns:
        A ``pd.DataFrame`` with columns ``Group``, ``Metric`` and ``Value``;
        ``Metric`` is always ``'Avg Neg Prediction'``.
    """
    labelled_frames = [
        ('Train neg', pred_trainNeg),
        ('Val neg', pred_valNeg),
        ('Test neg', pred_testNeg),
        ('Val1 avg pred', pred_val1_df),
        ('Test1 avg pred', pred_test1_df),
        ('Train Cow+Pig Pred', pred_train5_df),
        ('Val Cow+Pig Pred', pred_val5_df),
        ('Test Cow+Pig Pred', pred_test5_df),
    ]

    # DataFrame.mean() gives one value per column; .iloc[0] takes the first column's mean.
    return pd.DataFrame([
        {'Group': label, 'Metric': 'Avg Neg Prediction', 'Value': frame.mean().iloc[0]}
        for label, frame in labelled_frames
    ])

# Factor passed to correlate() to scale the correlation p-values.
# NOTE(review): the loop below sets mhc = 100 when it reaches 'kf8188qf' and
# never restores it, so every model and species processed after that point is
# also scored with 100 - confirm whether that is intended.
mhc = 200

# Accumulators: one long-format table is appended per (species, model) pair.
all_results, neg_results = [], []

species_list = [
    'macaque',
    'rat',
    'cow',
    'pig',
]
# Model run identifiers; each one has a '<id>_FINAL' output directory.
model_list = [
    'bdbi7l3n',
    'kf8188qf',
    'cq45eb2s',
]

for species in species_list:
    print(species)
    for model in model_list:
        print(model)
        model_dir = f'{model}_FINAL'

        if model == 'kf8188qf':
            mhc = 100
        
        # Load and process all dfs so correlate() function can access them
        #############################################################################

        # load TRAIN DFs
        negTrainPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonRat_liver_andMacaque_andCow_andPig_TRAIN_500bp.bed'
        if species == 'macaque':
            negTrainPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonMacaque_liver_andRat_andCow_andPig_TRAIN_500bp.bed'
        elif species == 'cow':
            negTrainPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonCow_liver_andMacaque_andRat_andPig_TRAIN_500bp.bed'
        elif species == 'pig':
            negTrainPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonPig_liver_andMacaque_andRat_andCow_TRAIN_500bp.bed'
        
        pred_TRAIN = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TRAIN.csv', header=None)
        
        trainPos = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/logPos/{species}_liver_TRAINONLY.narrowPeak', header=None, delim_whitespace=True).iloc[:,4]
        trainNeg = pd.read_csv(negTrainPath, header=None, delim_whitespace=True).iloc[:,4]

        trainPos_len = 2*len(trainPos)
        trainNeg_len = 2*len(trainNeg)
        
        if len(pred_TRAIN) != trainPos_len+trainNeg_len:
            print(f"ERROR TRAIN ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_trainPos = pd.concat([trainPos, trainPos]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_trainNeg = pd.concat([trainNeg, trainNeg]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_trainPos = pred_TRAIN.head(trainPos_len)
        pred_trainNeg = pred_TRAIN.tail(trainNeg_len)

        #############################################################################

        negValPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonRat_liver_andMacaque_andCow_andPig_VAL_500bp.bed'
        if species == 'macaque':
            negValPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonMacaque_liver_andRat_andCow_andPig_VAL_500bp.bed'
        elif species == 'cow':
            negValPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonCow_liver_andMacaque_andRat_andPig_VAL_500bp.bed'
        elif species == 'pig':
            negValPath = f'/home/azstephe/liverRegression/regression_liver/data/splits/negatives/nonPig_liver_andMacaque_andRat_andCow_VAL_500bp.bed'
        
        # load VAL ORTHO DFs
        pred_VAL_ortho = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_VAL_orthologs.csv', header=None)
        
        valPos = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/logPos/{species}_liver_VAL.narrowPeak', header=None, delim_whitespace=True).iloc[:,4]
        valNeg = pd.read_csv(negValPath, header=None, delim_whitespace=True).iloc[:,4]
        
        valPos_len = 2*len(valPos)
        valNeg_len = 2*len(valNeg)
        
        if len(pred_VAL_ortho) != valPos_len+valNeg_len:
            print(f"ERROR VALORTHO ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_valPos = pd.concat([valPos, valPos]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_valNeg = pd.concat([valNeg, valNeg]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_valPos = pred_VAL_ortho.head(valPos_len)
        pred_valNeg = pred_VAL_ortho.tail(valNeg_len)

        #############################################################################

        # load TEST ORTHO DFs
        pred_TEST_ortho = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TEST_orthologs.csv', header=None)
        
        testPos = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_pos_LiuAll/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        testNeg = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/neg/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        testPos_len = 2*len(testPos)
        testNeg_len = 2*len(testNeg)
        
        if len(pred_TEST_ortho) != testPos_len+testNeg_len:
            print(f"ERROR TEST ORTHO ({species}, {model}): predictions are a different length than validation sets")
        doubled_testPos = pd.concat([testPos, testPos]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_testNeg = pd.concat([testNeg, testNeg]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_testPos = pred_TEST_ortho.head(testPos_len)
        print(pred_testPos.mean())
        
        pred_testNeg = pred_TEST_ortho.tail(testNeg_len)

        #############################################################################
        # load VAL DFs
        pred_VAL = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_VAL.csv', header=None)
        
        val1_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/val_splits/val1/{species}_liver_VAL_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        val2_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/log_val2/{species}_liver_VAL.narrowPeak', header=None, delim_whitespace=True).iloc[:,4]
        val3_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/log_val3/{species}_liver_VAL.narrowPeak', header=None, delim_whitespace=True).iloc[:,4]
        
        val1_len = 2*len(val1_df)
        val2_len = 2*len(val2_df)
        val3_len = 2*len(val3_df)
        
        if len(pred_VAL) != val1_len+val2_len+val3_len:
            print(f"ERROR VAL ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_val1_df = pd.concat([val1_df, val1_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_val2_df = pd.concat([val2_df, val2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_val3_df = pd.concat([val3_df, val3_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_val1_df = pred_VAL.head(val1_len)
        pred_val2_df = pred_VAL.iloc[val1_len:val1_len + val2_len]
        pred_val3_df = pred_VAL.tail(val3_len)

        #############################################################################
        # load TEST DFs
        pred_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TEST.csv', header=None)
        
        test1_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_LiuAll_test1/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test2_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test2/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test3_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test3/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        test1_len = 2*len(test1_df)
        test2_len = 2*len(test2_df)
        test3_len = 2*len(test3_df)
        
        if len(pred_TEST) != test1_len+test2_len+test3_len:
            print(f"ERROR TEST ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_test1_df = pd.concat([test1_df, test1_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test2_df = pd.concat([test2_df, test2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test3_df = pd.concat([test3_df, test3_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_test1_df = pred_TEST.head(test1_len)
        pred_test2_df = pred_TEST.iloc[test1_len:test1_len + test2_len]
        pred_test3_df = pred_TEST.tail(test3_len)

        #############################################################################
        # load cow+pig TRAIN DFs
        pred_cow_pig_TRAIN = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_cow_pig_TRAIN.csv', header=None)
        
        train4Pos_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/cow_pig/cow_pig_liver_pos_mouse_macaque_rat_closed_TRAIN_chromName_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        train5Neg_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/cow_pig/cow_pig_liver_neg_mouse_macaque_rat_open_TRAIN_chromName_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        train4_len = 2*len(train4Pos_df)
        train5_len = 2*len(train5Neg_df)
        
        if len(pred_cow_pig_TRAIN) != train4_len+train5_len:
            print(f"ERROR TEST ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_train4_df = pd.concat([train4Pos_df, train4Pos_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_train5_df = pd.concat([train5Neg_df, train5Neg_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_train4_df = pred_cow_pig_TRAIN.head(train4_len)
        pred_train5_df = pred_cow_pig_TRAIN.tail(train5_len)

        #############################################################################
        # load cow+pig VAL DFs
        pred_cow_pig_VAL = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_cow_pig_VAL.csv', header=None)
        
        val4Pos_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/cow_pig/cow_pig_liver_pos_mouse_macaque_rat_closed_VAL_chromName_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        val5Neg_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/cow_pig/cow_pig_liver_neg_mouse_macaque_rat_open_VAL_chromName_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        val4_len = 2*len(val4Pos_df)
        val5_len = 2*len(val5Neg_df)
        
        if len(pred_cow_pig_VAL) != val4_len+val5_len:
            print(f"ERROR TEST ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_val4_df = pd.concat([val4Pos_df, val4Pos_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_val5_df = pd.concat([val5Neg_df, val5Neg_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_val4_df = pred_cow_pig_VAL.head(val4_len)
        pred_val5_df = pred_cow_pig_VAL.tail(val5_len)
        
        #############################################################################

        # load TEST DFs
        pred_cow_pig_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_cow_pig_TEST.csv', header=None)
        
        test4Pos_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test4/cow_pig_liver_pos_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test5Neg_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test5/cow_pig_liver_neg_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        
        test4_len = 2*len(test4Pos_df)
        test5_len = 2*len(test5Neg_df)
        
        if len(pred_cow_pig_TEST) != test4_len+test5_len:
            print(f"ERROR TEST ({species}, {model}): predictions are a different length than validation sets")
        
        doubled_test4_df = pd.concat([test4Pos_df, test4Pos_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test5_df = pd.concat([test5Neg_df, test5Neg_df]).sort_index(kind='mergesort').reset_index(drop=True)
        
        pred_test4_df = pred_cow_pig_TEST.head(test4_len)
        pred_test5_df = pred_cow_pig_TEST.tail(test5_len)
        
        # Call the correlate function which now uses the globally available DFs
        corr_df = correlate(mhc)
        corr_df['species'] = species
        corr_df['model'] = model
        all_results.append(corr_df)

        neg_df = negatives()
        neg_df['species'] = species
        neg_df['model'] = model
        neg_results.append(neg_df)

# #############################################################################
# FINAL PROCESSING
# #############################################################################

# Collapse the per-(species, model) result frames into two long-format tables.
summary_df = pd.concat(all_results)
summary_neg_df = pd.concat(neg_results)

# Row order of the evaluation groups: correlation table first, negatives table second.
custom_group_order = [
    'Train', 'Validation', 'Test',
    'Val2', 'Val3', 'Test2', 'Test3',
    'Train Cow+Pig', 'Val Cow+Pig', 'Test Cow+Pig',
]
custom_group_order_neg = [
    'Train neg', 'Val neg', 'Test neg',
    'Val1 avg pred', 'Test1 avg pred',
    'Train Cow+Pig Pred', 'Val Cow+Pig Pred', 'Test Cow+Pig Pred',
]

# Ordered categoricals make the index sort below follow the lists above.
summary_df['Group'] = pd.Categorical(summary_df['Group'], categories=custom_group_order, ordered=True)
summary_neg_df['Group'] = pd.Categorical(summary_neg_df['Group'], categories=custom_group_order_neg, ordered=True)

# Wide layout: one column per model, rows keyed by (species, Group, Metric)
# and sorted on those three index levels.
pivot_df = summary_df.pivot_table(
    index=['species', 'Group', 'Metric'], columns='model', values='Value'
).sort_index(level=['species', 'Group', 'Metric'])
pivot_neg_df = summary_neg_df.pivot_table(
    index=['species', 'Group', 'Metric'], columns='model', values='Value'
).sort_index(level=['species', 'Group', 'Metric'])

# Model columns follow model_list; within each group the correlation metrics
# follow metric_vars.
metric_vars = ['Pearson', 'Pearson_p', 'Spearman', 'Spearman_p', 'MSE']
pivot_df_reordered = pivot_df[model_list].reindex(metric_vars, level='Metric')
pivot_neg_df_reordered = pivot_neg_df[model_list]

# Swap every number for its display string. The metric name is the third
# index level, which is what format_value uses to pick the number format.
for col in pivot_df_reordered.columns:
    pivot_df_reordered[col] = [
        format_value(metric, value)
        for metric, value in zip(pivot_df_reordered.index.get_level_values(2), pivot_df_reordered[col])
    ]

display(pivot_df_reordered)

# output_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/135log_model_eval_table_FINAL_mse.tsv'
# pivot_df_reordered.to_csv(output_filename, sep='\t', float_format='%.3f')

# output_neg_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/135log_model_neg_table_FINAL.tsv'
# pivot_neg_df_reordered.to_csv(output_neg_filename, sep='\t', float_format='%.3f')

# print(f'Results successfully saved to: {output_filename}')

macaque
bdbi7l3n
0    0.900561
dtype: float64
kf8188qf
0    1.44354
dtype: float64
cq45eb2s
0    1.358886
dtype: float64
rat
bdbi7l3n
0    0.939075
dtype: float64
kf8188qf
0    1.582231
dtype: float64
cq45eb2s
0    1.520884
dtype: float64
cow
bdbi7l3n
0    0.976135
dtype: float64
kf8188qf
0    1.572218
dtype: float64
cq45eb2s
0    1.47207
dtype: float64
pig
bdbi7l3n
0    0.934451
dtype: float64
kf8188qf
0    1.457113
dtype: float64
cq45eb2s
0    1.387212
dtype: float64


Unnamed: 0_level_0,Unnamed: 1_level_0,model,bdbi7l3n,kf8188qf,cq45eb2s
species,Group,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cow,Train,Pearson,0.406,0.429,0.518
cow,Train,Pearson_p,0.00e+00,0.00e+00,0.00e+00
cow,Train,Spearman,0.401,0.429,0.518
cow,Train,Spearman_p,0.00e+00,0.00e+00,0.00e+00
cow,Train,MSE,2.27,1.11,1.05
...,...,...,...,...,...
rat,Test Cow+Pig,Pearson,0.435,0.396,0.436
rat,Test Cow+Pig,Pearson_p,1.76e-06,3.60e-05,1.57e-06
rat,Test Cow+Pig,Spearman,0.456,0.433,0.475
rat,Test Cow+Pig,Spearman_p,2.88e-07,2.02e-06,4.70e-08


In [8]:
# 1 3 5 LOG MODELS MOUSE
import pandas as pd
import scipy.stats
# Factor every p-value is multiplied by in correlate().
# NOTE(review): presumably the number of hypotheses tested (Bonferroni) — confirm the value.
mhc = 200

def correlate():
    """Score predictions against truth for the Train, Validation and Test groups.

    Reads the module-level frames ``pred_trainPos`` / ``pred_valPos`` /
    ``pred_testPos`` and ``doubled_trainPos`` / ``doubled_valPos`` /
    ``doubled_testPos`` (set by the main loop) and the module-level ``mhc``.

    Returns:
        pd.DataFrame: long-format table with columns ``Group``, ``Metric`` and
        ``Value``; five rows per group (Pearson, Pearson_p, Spearman,
        Spearman_p, MSE). The two p-values are multiplied by ``mhc`` and
        capped at 1.0.
    """
    rows = []
    # Parallel lists: group label, predictions, matching ground truth.
    groups = ['Train', 'Validation', 'Test']
    preds = [pred_trainPos, pred_valPos, pred_testPos]
    trues = [doubled_trainPos, doubled_valPos, doubled_testPos]

    for group, pred_df, true_df in zip(groups, preds, trues):
        x = true_df.squeeze()
        y = pred_df.squeeze()
        pearson, pearson_p = scipy.stats.pearsonr(x, y)
        spearman, spearman_p = scipy.stats.spearmanr(x, y)
        mse = mean_squared_error(x, y)

        # A corrected p-value is still a probability, so it must not exceed 1.
        # The product goes first on purpose: min(nan, 1.0) returns nan, so an
        # undefined correlation stays visibly undefined instead of becoming 1.0.
        pearson_p = min(pearson_p * mhc, 1.0)
        spearman_p = min(spearman_p * mhc, 1.0)

        rows.append({'Group': group, 'Metric': 'Pearson', 'Value': pearson})
        rows.append({'Group': group, 'Metric': 'Pearson_p', 'Value': pearson_p})
        rows.append({'Group': group, 'Metric': 'Spearman', 'Value': spearman})
        rows.append({'Group': group, 'Metric': 'Spearman_p', 'Value': spearman_p})
        rows.append({'Group': group, 'Metric': 'MSE', 'Value': mse})
    return pd.DataFrame(rows)

def negatives():
    """Return the mean prediction on the validation and test negative sets.

    Reads the module-level frames ``pred_valNeg`` and ``pred_testNeg`` and
    takes the mean of their first column.

    Returns:
        pd.DataFrame: one row per negative set with columns ``Group``,
        ``Metric`` (always 'Avg Neg Prediction') and ``Value``.
    """
    mean_by_group = {
        'Val neg': pred_valNeg.mean().iloc[0],
        'Test neg': pred_testNeg.mean().iloc[0],
    }
    return pd.DataFrame([
        {'Group': group, 'Metric': 'Avg Neg Prediction', 'Value': negv}
        for group, negv in mean_by_group.items()
    ])

# --- Main Script ---
all_results = []
neg_results = []
species_list = ['mouse']
model_list = ['p1bd3srf', '5lesvftp', 'ho7szpgy']

#############################################################################
# Ground truth: column 4 of each peak file. None of these paths depend on the
# species/model loop variables, so the files are read once here instead of on
# every iteration.
# sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True.
trainPos = pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/splits/logPos/mouse_liver_TRAINONLY.narrowPeak', header=None, sep=r'\s+').iloc[:,4]
valPos = pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/splits/logPos/mouse_liver_VAL.narrowPeak', header=None, sep=r'\s+').iloc[:,4]
# NOTE(review): this path has no `liverRegression/` component, unlike every
# other path in this cell — confirm it points at the intended file.
valNeg = pd.read_csv('/home/azstephe/regression_liver/data/splits/negatives/nonMouse_liver_andRat_andCow_andPig_andMacaque_VAL_500bp.bed', header=None, sep=r'\s+').iloc[:,4]
testPos = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_pos/mouse_liver_TEST_500bp.bed', header=None, sep=r'\s+').iloc[:,4]
testNeg = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/neg/mouse_liver_TEST_500bp.bed', header=None, sep=r'\s+').iloc[:,4]

# The prediction files are treated as holding two rows per peak, so every
# expected length is twice the number of peaks.
trainPos_len = 2*len(trainPos)
valPos_len = 2*len(valPos)
valNeg_len = 2*len(valNeg)
testPos_len = 2*len(testPos)
testNeg_len = 2*len(testNeg)

# Repeat every truth row twice in place (a, a, b, b, ...): concatenating the
# series with itself and stable-sorting on the original index keeps the two
# copies of each row adjacent.
doubled_trainPos = pd.concat([trainPos, trainPos]).sort_index(kind='mergesort').reset_index(drop=True)
doubled_valPos = pd.concat([valPos, valPos]).sort_index(kind='mergesort').reset_index(drop=True)
doubled_valNeg = pd.concat([valNeg, valNeg]).sort_index(kind='mergesort').reset_index(drop=True)
doubled_testPos = pd.concat([testPos, testPos]).sort_index(kind='mergesort').reset_index(drop=True)
doubled_testNeg = pd.concat([testNeg, testNeg]).sort_index(kind='mergesort').reset_index(drop=True)

for species in species_list:
    for model in model_list:
        model_dir = f'{model}_FINAL'

        #############################################################################
        # TRAIN + VAL predictions. The file is sliced as
        # [train positives | val positives | val negatives].
        pred_TRAIN_VAL = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TRAIN_VAL.csv', header=None)

        # Only reported, not raised: a mismatch means the slices below are misaligned.
        if len(pred_TRAIN_VAL) != trainPos_len+valPos_len+valNeg_len:
            print(f"ERROR TRAIN ({species}, {model}): predictions are a different length than validation sets")

        pred_trainPos = pred_TRAIN_VAL.head(trainPos_len)
        pred_valPos = pred_TRAIN_VAL.iloc[trainPos_len:trainPos_len + valPos_len]
        pred_valNeg = pred_TRAIN_VAL.tail(valNeg_len)

        #############################################################################
        # TEST predictions. The file is sliced as [test positives | test negatives].
        pred_TEST_ortho = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TEST.csv', header=None)

        if len(pred_TEST_ortho) != testPos_len+testNeg_len:
            print(f"ERROR TEST ORTHO ({species}, {model}): predictions are a different length than validation sets")

        pred_testPos = pred_TEST_ortho.head(testPos_len)
        pred_testNeg = pred_TEST_ortho.tail(testNeg_len)

        # correlate() and negatives() read the module-level frames assigned above.
        corr_df = correlate()
        corr_df['species'] = species
        corr_df['model'] = model
        all_results.append(corr_df)

        neg_df = negatives()
        neg_df['species'] = species
        neg_df['model'] = model
        neg_results.append(neg_df)
        

# #############################################################################
# FINAL PROCESSING
# #############################################################################

# Collapse the per-(species, model) result frames into two long-format tables.
summary_df = pd.concat(all_results)
summary_neg_df = pd.concat(neg_results)

# Row order of the evaluation groups in each table.
custom_group_order = ['Train', 'Validation', 'Test']
custom_group_order_neg = ['Val neg', 'Test neg']

# Ordered categoricals make the index sort below follow the lists above.
summary_df['Group'] = pd.Categorical(summary_df['Group'], categories=custom_group_order, ordered=True)
summary_neg_df['Group'] = pd.Categorical(summary_neg_df['Group'], categories=custom_group_order_neg, ordered=True)

# Wide layout: one column per model, rows keyed by (species, Group, Metric)
# and sorted on those three index levels.
pivot_df = summary_df.pivot_table(
    index=['species', 'Group', 'Metric'], columns='model', values='Value'
).sort_index(level=['species', 'Group', 'Metric'])
pivot_neg_df = summary_neg_df.pivot_table(
    index=['species', 'Group', 'Metric'], columns='model', values='Value'
).sort_index(level=['species', 'Group', 'Metric'])

# Model columns follow model_list; within each group the correlation metrics
# follow metric_vars.
metric_vars = ['Pearson', 'Pearson_p', 'Spearman', 'Spearman_p', 'MSE']
pivot_df_reordered = pivot_df[model_list].reindex(metric_vars, level='Metric')
pivot_neg_df_reordered = pivot_neg_df[model_list]

# Swap every number for its display string. The metric name is the third
# index level, which is what format_value uses to pick the number format.
for col in pivot_df_reordered.columns:
    pivot_df_reordered[col] = [
        format_value(metric, value)
        for metric, value in zip(pivot_df_reordered.index.get_level_values(2), pivot_df_reordered[col])
    ]

display(pivot_df_reordered)

# Export both tables as TSV.
# NOTE(review): the evaluation table holds strings by this point, so
# float_format presumably only affects the negatives table — confirm.
output_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/mouse_mouse_cow_pig_model_eval_table_FINAL_mse.tsv'
pivot_df_reordered.to_csv(output_filename, sep='\t', float_format='%.3f')

output_neg_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/mouse_mouse_cow_pig_model_neg_table_FINAL.tsv'
pivot_neg_df_reordered.to_csv(output_neg_filename, sep='\t', float_format='%.3f')

# print(f'Results successfully saved to: {output_filename}')

Unnamed: 0_level_0,Unnamed: 1_level_0,model,p1bd3srf,5lesvftp,ho7szpgy
species,Group,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
mouse,Train,Pearson,0.605,0.584,0.581
mouse,Train,Pearson_p,0.0,0.0,0.0
mouse,Train,Spearman,0.6,0.581,0.575
mouse,Train,Spearman_p,0.0,0.0,0.0
mouse,Train,MSE,0.692,0.951,0.809
mouse,Validation,Pearson,0.551,0.544,0.555
mouse,Validation,Pearson_p,6.64e-317,1.59e-307,0.0
mouse,Validation,Spearman,0.549,0.543,0.548
mouse,Validation,Spearman_p,1.38e-313,2.14e-305,3.19e-312
mouse,Validation,MSE,0.784,1.01,0.859


In [15]:
# Reload two previously exported, tab-separated evaluation tables
# (read_table uses a tab separator by default).
mouse135 = pd.read_table('/home/azstephe/liverRegression/regression_liver/data/figs/tables/mouse_135log_model_eval_table_FINAL_mse.tsv')
all135 = pd.read_table('/home/azstephe/liverRegression/regression_liver/data/figs/tables/135log_model_eval_table_FINAL.tsv')


Unnamed: 0,species,Group,Metric,bdbi7l3n,kf8188qf,cq45eb2s
0,mouse,Train,Pearson,0.494,0.644,0.645
1,mouse,Train,Pearson_p,0.0,0.0,0.0
2,mouse,Train,Spearman,0.498,0.64,0.639
3,mouse,Train,Spearman_p,0.0,0.0,0.0
4,mouse,Train,MSE,1.39,0.521,0.622
5,mouse,Validation,Pearson,0.483,0.584,0.597
6,mouse,Validation,Pearson_p,3.49e-232,0.0,0.0
7,mouse,Validation,Spearman,0.491,0.582,0.588
8,mouse,Validation,Spearman_p,8.59e-242,0.0,0.0
9,mouse,Validation,MSE,1.42,0.638,0.699


In [None]:
# 1 3 5 BEST LOG MODELS FOLDCHANGE
import pandas as pd
import scipy.stats

def correlate(mhc):
    """Score predicted vs. true mouse-minus-other differences for Val2 and Test2.

    Reads the module-level frames ``val2_foldchange_df`` and
    ``test2_foldchange_df`` (built by the main loop) and uses their ``true``
    and ``pred`` columns. The ``avg_pred`` column is not used here.

    Args:
        mhc: factor each p-value is multiplied by before it is reported
            (presumably the number of hypotheses tested — confirm with caller).

    Returns:
        pd.DataFrame: long-format table with columns ``Group``, ``Metric`` and
        ``Value``; eight rows per group (same-sign count, total count,
        same-sign fraction, Pearson, Pearson P-Val, Spearman, Spearman P-Val,
        MSE). The two p-values are capped at 1.0 after multiplication.
    """
    rows = []
    groups = ['Val2', 'Test2']
    dfs = [val2_foldchange_df, test2_foldchange_df]

    for group, df in zip(groups, dfs):
        x = df['true'].squeeze()
        y = df['pred'].squeeze()
        pearson, pp = scipy.stats.pearsonr(x, y)
        spearman, ps = scipy.stats.spearmanr(x, y)

        mse = mean_squared_error(x, y)

        # Share of rows where prediction and truth agree on direction.
        # A NaN on either side compares unequal, so it counts as a disagreement.
        same_sign = np.sign(df['true']) == np.sign(df['pred'])
        num_ss = same_sign.sum()
        len_ss = len(df)
        perc_ss = num_ss / len_ss

        rows.append({'Group': group, 'Metric': 'Same Sign Count', 'Value': num_ss})
        rows.append({'Group': group, 'Metric': 'Total Count', 'Value': len_ss})
        rows.append({'Group': group, 'Metric': 'Same Sign %', 'Value': perc_ss})

        # A corrected p-value is still a probability, so it must not exceed 1.
        # The product goes first on purpose: min(nan, 1.0) returns nan, so an
        # undefined correlation stays visibly undefined instead of becoming 1.0.
        rows.append({'Group': group, 'Metric': 'Pearson', 'Value': pearson})
        rows.append({'Group': group, 'Metric': 'Pearson P-Val', 'Value': min(pp*mhc, 1.0)})
        rows.append({'Group': group, 'Metric': 'Spearman', 'Value': spearman})
        rows.append({'Group': group, 'Metric': 'Spearman P-Val', 'Value': min(ps*mhc, 1.0)})

        rows.append({'Group': group, 'Metric': 'MSE', 'Value': mse})

    return pd.DataFrame(rows)

# --- Main Script ---
# For every (species, model) pair, build the Val2 and Test2 tables of
# mouse-minus-other differences ("true" and "pred") and score them with correlate().
all_results = []
species_list = ['macaque', 'rat', 'cow', 'pig']
model_list = ['p1bd3srf', '5lesvftp', 'ho7szpgy']

for species in species_list:
    for model in model_list:
        # Factor passed to correlate() for scaling p-values.
        # NOTE(review): 'kf8188qf' is not in model_list above, so this branch
        # never fires in this cell — confirm whether it is a leftover.
        mhc = 200
        if model == 'kf8188qf':
            mhc = 100
        model_dir = f'{model}_FINAL'

        nonMouse_true = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/log/{species}_liver_pos_ALL.bed', sep="\t", header=None)
        mouse_true = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/log/mouse_liver_pos_ALL.bed', sep="\t", header=None)
        one_to_one_peaks_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/oneToOnePeaks/{species}_mouse.bed', header=None, sep='\t')
        one_to_one_peaks_VAL = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/oneToOnePeaks/{species}_mouse.bed', header=None, sep='\t')

        # Column 4 is joined against column 3 of the non-mouse tables and
        # column 14 against column 3 of the mouse tables in the merges below.
        peaks = one_to_one_peaks_VAL[[4, 14]]

        # NOTE(review): merged_df is not read anywhere else in this cell.
        merged_df = peaks.merge(nonMouse_true, how='left', left_on=4, right_on=3, suffixes=('', '_non'))

        # Aligned orthologous peaks
        merged_df = merged_df.merge(mouse_true, how='left', left_on=14, right_on=3, suffixes=('_NON', '_mouse'))

        #############################################################################
        # load MOUSE VAL DF

        pred_mouse_VAL = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_mouse_TRAIN_VAL.csv', header=None)

        mouse_train = pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/splits/logPos/mouse_liver_TRAINONLY.narrowPeak', sep="\t", header=None)
        mouse_val = pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/splits/logPos/mouse_liver_VAL.narrowPeak', sep="\t", header=None)

        # The prediction file is treated as holding two rows per peak.
        mouse_train_len = 2*len(mouse_train)
        mouse_val_len = 2*len(mouse_val)

        # Repeat every row twice in place (a, a, b, b, ...): the stable sort on
        # the original index keeps both copies of a row adjacent.
        doubled_mouse_val_df = pd.concat([mouse_val, mouse_val]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_mouse_val_df = doubled_mouse_val_df.rename(columns={0:'mouse_chr'})

        # The validation predictions are the rows that follow the training rows.
        pred_mouse_val_df = pred_mouse_VAL.iloc[mouse_train_len:mouse_train_len + mouse_val_len].reset_index(drop=True)
        pred_mouse_val_df = pd.concat([doubled_mouse_val_df.drop(columns=5), pred_mouse_val_df], axis=1)
        pred_mouse_val_df = pred_mouse_val_df.rename(columns={4:'mouse_true', 0:'mouse_pred'}).reset_index(drop=True)

        merged_mouse_val_true_pred = peaks.merge(pred_mouse_val_df, how='left', left_on=14, right_on=3)
        mouse_val_true_pred = merged_mouse_val_true_pred.rename(columns={3:'mouse_peak'}).drop([14], axis=1)

        #############################################################################
        # load non-mouse VAL2 DF

        pred_VAL = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_VAL.csv', header=None)

        # sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True.
        val1_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/val_splits/val1/{species}_liver_VAL_500bp.bed', header=None, sep=r'\s+').iloc[:,4]
        val2_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/log_val2/{species}_liver_VAL.narrowPeak', header=None, sep=r'\s+').rename(columns={0: 'mac_chr'})

        val1_len = 2*len(val1_df)
        val2_len = 2*len(val2_df)

        doubled_val2_df = pd.concat([val2_df, val2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        # The val2 predictions are the rows that follow the val1 rows.
        pred_val2_df = pred_VAL.iloc[val1_len:val1_len + val2_len].reset_index(drop=True)
        doubled_val2_combined = pd.concat([doubled_val2_df, pred_val2_df], axis=1)

        non_val2_true_pred = doubled_val2_combined.rename(columns={4: 'non_true', 0:'non_pred'})
        merged_non_val2_true_pred = peaks.merge(non_val2_true_pred, how='left', left_on=4, right_on=3).drop(14, axis=1)
        non_val2_true_pred = merged_non_val2_true_pred.rename(columns={3:'non_peak'}).drop(4, axis=1)
        #############################################################################

        # make VAL2 merged df
        # NOTE(review): axis=1 concat pairs rows by position, so this assumes the
        # mouse-side and non-mouse-side merges returned the same rows in the
        # same order — confirm (a left merge with unmatched peaks could differ).
        val2_foldchange_df = pd.concat([mouse_val_true_pred, non_val2_true_pred], axis=1)

        val2_foldchange_df['true'] = val2_foldchange_df['mouse_true']-val2_foldchange_df['non_true']
        val2_foldchange_df['pred'] = val2_foldchange_df['mouse_pred']-val2_foldchange_df['non_pred']

        # Mean of each consecutive (even, odd) row pair; needs an even row count.
        mouse_av = (val2_foldchange_df.loc[::2, 'mouse_pred'].values + val2_foldchange_df.loc[1::2, 'mouse_pred'].values) / 2
        mac_av = (val2_foldchange_df.loc[::2, 'non_pred'].values + val2_foldchange_df.loc[1::2, 'non_pred'].values) / 2

        # Add the averages back to the DataFrame as a new column
        # (written to even rows only; odd rows are left as NaN).
        val2_foldchange_df.loc[::2, 'mouse_pred_avg'] = mouse_av
        val2_foldchange_df.loc[::2, 'non_pred_avg'] = mac_av
        val2_foldchange_df['avg_pred'] = val2_foldchange_df['mouse_pred_avg']-val2_foldchange_df['non_pred_avg']

        #############################################################################
        # TIME FOR TEST2 FOLDCHANGE

        peaks = one_to_one_peaks_TEST[[4, 14]]

        # NOTE(review): merged_df is not read anywhere else in this cell.
        merged_df = peaks.merge(nonMouse_true, how='left', left_on=4, right_on=3, suffixes=('', '_non'))

        # Aligned orthologous peaks
        merged_df = merged_df.merge(mouse_true, how='left', left_on=14, right_on=3, suffixes=('_NON', '_mouse'))

        #############################################################################

        # load MOUSE TEST DF

        pred_mouse_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_mouse_TEST.csv', header=None)

        mouse_test = pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/test_splits/log_pos/mouse_liver_TEST_500bp.bed', sep="\t", header=None)

        mouse_test_len = 2*len(mouse_test)

        doubled_mouse_test_df = pd.concat([mouse_test, mouse_test]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_mouse_test_df = doubled_mouse_test_df.rename(columns={0:'mouse_chr'})

        # The test positives are the first rows of the mouse TEST prediction file.
        pred_mouse_test_df = pred_mouse_TEST.head(mouse_test_len).reset_index(drop=True)
        pred_mouse_test_df = pd.concat([doubled_mouse_test_df.drop(columns=5), pred_mouse_test_df], axis=1)
        pred_mouse_test_df = pred_mouse_test_df.rename(columns={4:'mouse_true', 0:'mouse_pred'}).reset_index(drop=True)

        merged_mouse_test_true_pred = peaks.merge(pred_mouse_test_df, how='left', left_on=14, right_on=3)
        mouse_test_true_pred = merged_mouse_test_true_pred.rename(columns={3:'mouse_peak'}).drop([14], axis=1)

        #############################################################################
        # load nonmouse TEST2 DF

        pred_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TEST.csv', header=None)

        # sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True.
        test1_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_LiuAll_test1/{species}_liver_TEST_500bp.bed', header=None, sep=r'\s+').iloc[:,4]
        test2_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test2/{species}_liver_TEST_500bp.bed', header=None, sep=r'\s+').rename(columns={0: 'mac_chr'})

        test1_len = 2*len(test1_df)
        test2_len = 2*len(test2_df)

        doubled_test2_df = pd.concat([test2_df, test2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        # The test2 predictions are the rows that follow the test1 rows.
        pred_test2_df = pred_TEST.iloc[test1_len:test1_len + test2_len].reset_index(drop=True)
        doubled_test2_combined = pd.concat([doubled_test2_df, pred_test2_df], axis=1)

        non_test2_true_pred = doubled_test2_combined.rename(columns={4: 'non_true', 0:'non_pred'})
        merged_non_test2_true_pred = peaks.merge(non_test2_true_pred, how='left', left_on=4, right_on=3).drop(14, axis=1)
        non_test2_true_pred = merged_non_test2_true_pred.rename(columns={3:'non_peak'}).drop(4, axis=1)

        #############################################################################

        # make TEST2 merged df
        # NOTE(review): same positional-alignment assumption as the VAL2 concat above.
        test2_foldchange_df = pd.concat([mouse_test_true_pred, non_test2_true_pred], axis=1)

        test2_foldchange_df['true'] = test2_foldchange_df['mouse_true']-test2_foldchange_df['non_true']
        test2_foldchange_df['pred'] = test2_foldchange_df['mouse_pred']-test2_foldchange_df['non_pred']

        # Mean of each consecutive (even, odd) row pair; needs an even row count.
        mouse_av = (test2_foldchange_df.loc[::2, 'mouse_pred'].values + test2_foldchange_df.loc[1::2, 'mouse_pred'].values) / 2
        non_av = (test2_foldchange_df.loc[::2, 'non_pred'].values + test2_foldchange_df.loc[1::2, 'non_pred'].values) / 2

        # Add the averages back to the DataFrame as a new column
        # (written to even rows only; odd rows are left as NaN).
        test2_foldchange_df.loc[::2, 'mouse_pred_avg'] = mouse_av
        test2_foldchange_df.loc[::2, 'non_pred_avg'] = non_av
        test2_foldchange_df['avg_pred'] = test2_foldchange_df['mouse_pred_avg']-test2_foldchange_df['non_pred_avg']

        # correlate() reads val2_foldchange_df / test2_foldchange_df from module scope.
        corr_df = correlate(mhc)
        corr_df['Species'] = species
        corr_df['model'] = model
        all_results.append(corr_df)
        


#############################################################################

# Stack the per-(species, model) metric tables into one long-format frame.
summary_df = pd.concat(all_results)

# Row order for groups and, within a group, for metrics.
custom_group_order = ['Val2', 'Test2']
custom_metric_order = [
    'Same Sign Count', 'Total Count', 'Same Sign %',
    'Pearson', 'Pearson P-Val',
    'Spearman', 'Spearman P-Val',
    'MSE',
]

# Ordered categoricals make the sort below follow the lists above.
summary_df['Group'] = pd.Categorical(summary_df['Group'], categories=custom_group_order, ordered=True)
summary_df['Metric'] = pd.Categorical(summary_df['Metric'], categories=custom_metric_order, ordered=True)

# Wide layout with one column per model: the keys come back as ordinary
# columns, rows are sorted on them, and model columns follow model_list.
pivot_df = (
    summary_df
    .pivot_table(index=["Species", "Group", "Metric"], columns="model", values="Value")
    .reset_index()
    .sort_values(by=["Species", "Group", "Metric"])
)
pivot_df = pivot_df[["Species", "Group", "Metric"] + model_list]
# pivot_df_reordered = pivot_df[model_list]

def format_value(metric, value):
    """Format a metric value for display.

    P-value metrics are rendered in scientific notation with two decimals;
    every other metric is rendered with three significant figures.

    This definition replaces the notebook's earlier ``format_value``, so it
    recognises both p-value naming schemes used in the notebook:
    ``'... P-Val'`` (this cell) and ``'..._p'`` (e.g. ``'Pearson_p'``).
    Re-running an earlier table cell after this one therefore still formats
    its p-values correctly.

    Args:
        metric: metric name, e.g. ``'Pearson P-Val'``, ``'Spearman_p'``, ``'MSE'``.
        value: numeric value to format.

    Returns:
        str: the formatted value.
    """
    if "P-Val" in metric or "_p" in metric:
        return f"{value:.2e}"   # scientific notation, 3 sig figs
    return f"{value:.3g}"       # regular decimal, 3 sig figs

# Build a string-formatted copy for display/export; pivot_df keeps the numbers.
pivot_df_display = pivot_df.copy()
for col in model_list:  # each model column
    # The metric name on each row decides the number format.
    pivot_df_display[col] = [
        format_value(metric, value)
        for metric, value in zip(pivot_df["Metric"], pivot_df[col])
    ]

display(pivot_df_display)

output_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/mouse_cow_pig_model_foldchange_table_FINAL_mse.tsv'
pivot_df_display.to_csv(output_filename, sep='\t')

# print(f'Results successfully saved to: {output_filename}')
