In [2]:
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
import numpy as np

def pearson_spearman(x, y):
    pearson_corr, pearson_p_value = scipy.stats.pearsonr(x, y)
    print(f"Pearson correlation coefficient: {pearson_corr:.3f}, p-value: {pearson_p_value:.3g}")

    spearman_corr, spearman_p_value = scipy.stats.spearmanr(x, y)
    print(f"Spearman correlation coefficient: {spearman_corr:.3f}, p-value: {spearman_p_value:.3g}")
    
def collapse_if_identical(lst):
    return lst[0] if all(x == lst[0] for x in lst) else lst
    

# takes in list and returns peak with largest signal
def keep_largest_signal(lst, qn_df):
    if isinstance(lst, list):
        maxPeakName = ''
        maxPeakStrength = 0
        for peak in lst:
            matching_rows = qn_df[qn_df[3] == peak]
            if not matching_rows.empty:
                signal = matching_rows[4].iloc[0]
                if signal > maxPeakStrength:
                    maxPeakStrength = signal
                    maxPeakName = peak
        return maxPeakName
    else: 
        return lst

# def overlap(start1, end1, start2, end2):
#     return max(0, min(end1, end2)-max(start1, start2))

def get_biggest_overlap(lst, col, df):
    maxOverlap = 0
    maxPeakOverlap = ""
    for peak in lst:
        row = df[df[col] == peak].iloc[0] #iloc bc possible to return multiple rows (never will based on th
        overlap = max(0, min(row[2], row[12])-max(row[1], row[11]))
        if overlap > maxOverlap:
            maxOverlap = overlap
            maxPeakOverlap = peak
    return peak

def mean_squared_error(x, y):
    x = np.asarray(x)
    y = np.asarray(y)
    return np.mean((x - y) ** 2)

In [26]:
# EXAMPLE MAKE ONE_TO_ONE_PEAKS THEN SAVE IT

species = 'macaque'
both_df = pd.read_csv(f"/home/azstephe/liverRegression/regression_liver/data/splits/{species}Mouse/{species}ToMouse_liver_{species}Enhancer_mouseEnhancer_val_wawb.narrowPeak", sep="\t", header=None)
nonMouse_true = pd.read_csv(f"/home/azstephe/liverRegression/regression_liver/data/log/{species}_liver_pos_ALL.bed", sep="\t", header=None)
mouse_true = pd.read_csv("/home/azstephe/liverRegression/regression_liver/data/log/mouse_liver_pos_ALL.bed", sep="\t", header=None)

unique1_2 = both_df[~both_df.duplicated(subset=[1, 2], keep=False)] # entries with unique mac mapped to mouse start and end
all_unique = unique1_2[~unique1_2.duplicated(subset=[11,12], keep=False)]

#### COL1,2 DUPLICATES

duplicates3 = both_df[both_df.duplicated(subset=[1, 2], keep=False)] # entries with duplicated mac mapped to mouse start and end 
grouped_dups3 = duplicates3.groupby([1, 2])[3].apply(list).reset_index(name='col3')

# grouped dups: start | end | [peaks with these endpoints]
grouped_dups3['col3'] = grouped_dups3['col3'].apply(collapse_if_identical) 

grouped_dups3['col3'] = grouped_dups3['col3'].apply(lambda x: keep_largest_signal(x, nonMouse_true)) # redundant endpoints now map to 1 peak in col3
grouped_dups3 = grouped_dups3[grouped_dups3['col3'] != ''] # remove macaque peaks that aren't large enough

# df of duplicated col1,2 with strongest peak
keep_strongestcol3 = duplicates3.merge(grouped_dups3[['col3']], left_on=3, right_on='col3', how='inner').drop('col3', axis=1) # keeps the strongest signal in col3 for redundant endpoints

#### COL11,12 DUPLICATES

grouped_dups13 = keep_strongestcol3.groupby([11, 12])[13].apply(list).reset_index(name='col13')
grouped_dups13['col13'] = grouped_dups13['col13'].apply(collapse_if_identical)

grouped_dups13['col13'] = grouped_dups13['col13'].apply(lambda x: keep_largest_signal(x, mouse_true))
grouped_dups13 = grouped_dups13[grouped_dups13['col13'] != '']

keep_strongestcol13 = keep_strongestcol3.merge(grouped_dups13[['col13']], left_on=13, right_on='col13', how='inner').drop('col13', axis=1)

unique_endpoints = keep_strongestcol13 #rows with unique endpoints from the duplicated endpoints set

####

# col3 peakname duplicates with different endpoints
still_dups_col3 = unique_endpoints[unique_endpoints.duplicated(subset=[3], keep=False)]

# col3peaks | [col13 peaks intersecting col3 peak]
grouped_dcol3 = still_dups_col3.groupby(3)[13].apply(list).reset_index(name='col13')

# get the col13 peak with most overlap of col3
grouped_dcol3['col13'] = grouped_dcol3['col13'].apply(lambda x: get_biggest_overlap(x, 13, still_dups_col3))

merged3 = still_dups_col3.merge(grouped_dcol3[[3, 'col13']], left_on=[3, 13], right_on=[3, 'col13'], how='left', indicator=True)

remove3 = merged3[merged3['_merge'] == 'left_only'].drop(columns=['_merge']) # col13 is what we want to remove

unique_endpoints_subset = unique_endpoints.iloc[:,:20]
remove3_subset = remove3.iloc[:,:20]

# all col3 entries unique
unique3 = unique_endpoints[~unique_endpoints_subset.apply(tuple, axis=1).isin(remove3_subset.apply(tuple, axis=1))] 

####

# col13 peakname duplicates with different endpoints
still_dups_col13 = unique3[unique3.duplicated(subset=[13], keep=False)]

# col13peaks | [col3 peaks intersecting col13 peak]
grouped_dcol13 = still_dups_col13.groupby(13)[3].apply(list).reset_index(name='col3')

# get the col3 peak with most overlap of col13
grouped_dcol13['col3'] = grouped_dcol13['col3'].apply(lambda x: get_biggest_overlap(x, 3, still_dups_col13))

merged13 = still_dups_col13.merge(grouped_dcol13[[13, 'col3']], left_on=[3, 13], right_on=['col3', 13], how='left', indicator=True)

remove13 = merged13[merged13['_merge'] == 'left_only'].drop(columns=['_merge'])

unique3_subset = unique3.iloc[:, :20]
remove13_subset = remove13.iloc[:, :20]

# Identify rows in u3 that are NOT in remove3
filtered_peaks_unique = unique3[~unique3_subset.apply(tuple, axis=1).isin(remove13_subset.apply(tuple, axis=1))]

full_unique = pd.concat([all_unique, filtered_peaks_unique])

# Sort by column '1'
one_to_one_peaks = full_unique.sort_values(by=1).reset_index(drop=True)
one_to_one_peaks.to_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/oneToOnePeaks_val/{species}_mouse.bed', header=None, sep='\t')
one_to_one_peaks
##############################



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,chr9,6668262,6668828,peak24458,-1,.,-1,-1,-1,370,chr9,6668246,6669123,peak20464,1000,.,10.41801,106.98440,104.00175,372
1,chr9,7603389,7603594,peak64793,-1,.,-1,-1,-1,114,chr9,7603184,7603794,peak20478,1000,.,18.10567,247.44370,243.99432,315
2,chr9,7914492,7914988,peak729,-1,.,-1,-1,-1,141,chr9,7914173,7914971,peak20484,1000,.,15.21739,160.17427,156.98676,531
3,chr8,8733638,8734157,peak30058,-1,.,-1,-1,-1,379,chr8,8733751,8734148,peak19418,1000,.,12.36603,110.58308,107.58508,270
4,chr8,8992445,8993220,peak5261,-1,.,-1,-1,-1,506,chr8,8992509,8993074,peak19424,1000,.,6.54450,48.63282,45.97403,451
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255,chr8,126824543,126825031,peak57495,-1,.,-1,-1,-1,266,chr8,126824525,126825188,peak20431,1000,.,6.61796,38.65833,36.08059,314
256,chr8,126838305,126839235,peak50766,-1,.,-1,-1,-1,182,chr8,126839044,126839256,peak20435,829,.,5.00771,21.68738,19.29232,109
257,chr8,126849473,126850187,peak58047,-1,.,-1,-1,-1,560,chr8,126849637,126850309,peak20437,836,.,4.81874,20.20660,17.83258,402
258,chr8,126920866,126921347,peak14712,-1,.,-1,-1,-1,189,chr8,126920865,126921188,peak20441,1000,.,6.42499,33.74764,31.21361,172


In [46]:
species = 'macaque'
model = 'bdbi7l3n'

nonMouse_true = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/log/{species}_liver_pos_ALL.bed', sep="\t", header=None)
mouse_true = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/log/mouse_liver_pos_ALL.bed', sep="\t", header=None)

one_to_one_peaks = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/oneToOnePeaks/{species}_mouse.bed', header=None, sep='\t')
pred_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model}/activations_{species}_TEST.csv', header=None)
val1_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/amy_test1/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
# val2_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/val2/{species}_liver_VAL.narrowPeak', header=None, delim_whitespace=True).iloc[:,4]
val3_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test3/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]

val2_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test2/{species}_liver_TEST_500bp.bed', header=None, sep="\t").rename(columns={0: 'mac_chr'})

val1_len = 2*len(val1_df)
val2_len = 2*len(val2_df)

####################### make the ____ doubled predicted and true df
peaks = one_to_one_peaks[[4, 14]]

merged_df = peaks.merge(nonMouse_true, how='left', left_on=4, right_on=3, suffixes=('', '_hello'))

# Merge the result with df3 on col13 and df3['col3']
merged_df = merged_df.merge(mouse_true, how='left', left_on=14, right_on=3, suffixes=('_MAC', '_mouse'))

# Subtract col4 values from df2 and df3
# merged_df['result'] = merged_df['4_mac'] - merged_df['4_mouse']
# res = merged_df[['4', 14, '4_mac', '4_mouse', 'result']]
# nan = res[res.isna().any(axis=1)]

# nan

####################### 

doubled_val2_df = pd.concat([val2_df, val2_df]).sort_index(kind='mergesort').reset_index(drop=True)

pred_val2_df = pred_df.iloc[val1_len:val1_len + val2_len].reset_index(drop=True)

doubled_combined = pd.concat([doubled_val2_df, pred_val2_df], axis=1)

non_val2_real_pred = doubled_combined.rename(columns={4: 'mac_true', 0:'mac_pred'})

merged_mac_real_pred = peaks.merge(non_val2_real_pred, how='left', left_on=4, right_on=3).drop(14, axis=1)
# m_mac_pred = merged_mac_real_pred[~merged_mac_real_pred[3].isin(nan['3_mac'])].reset_index(drop=True)
m_mac_pred = merged_mac_real_pred.rename(columns={3:'mac_peak'}).drop(4, axis=1)

####################### make the mouse doubled predicted and true df

mouse_pred = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model}/activations_mouse_TEST.csv', sep="\t", header=None)
mouse_real_neg=pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/test_splits/neg/mouse_liver_TEST_500bp.bed', sep="\t", header=None)
mouse_real_pos=pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/test_splits/log_pos/mouse_liver_TEST_500bp.bed', sep="\t", header=None)

real_neg_len = 2*len(mouse_real_neg)
real_pos_len = 2*len(mouse_real_pos)

mouse_pred_pos = mouse_pred.head(real_pos_len).reset_index(drop=True)

doubled_mouse_real_pos = pd.concat([mouse_real_pos, mouse_real_pos]).sort_index(kind='mergesort').reset_index(drop=True)


doubled_mouse_real_pos = doubled_mouse_real_pos.rename(columns={0:'mouse_chr'})
mouse_pos_real_pred = pd.concat([doubled_mouse_real_pos.drop(columns=5), mouse_pred_pos], axis=1)
mouse_pos_real_pred = mouse_pos_real_pred.rename(columns={4:'mouse_true', 0:'mouse_pred'}).reset_index(drop=True)


merged_mouse_pred = peaks.merge(mouse_pos_real_pred, how='left', left_on=14, right_on=3)

m_mouse_pred = merged_mouse_pred.rename(columns={3:'mouse_peak'}).drop([14], axis=1)

####################### make bigboss mouse + query predicted and real info

bigboss = pd.concat([m_mouse_pred, m_mac_pred], axis=1)

bigboss['true'] = bigboss['mouse_true']-bigboss['mac_true']
bigboss['pred'] = bigboss['mouse_pred']-bigboss['mac_pred']

mouse_av = (bigboss.loc[::2, 'mouse_pred'].values + bigboss.loc[1::2, 'mouse_pred'].values) / 2
mac_av = (bigboss.loc[::2, 'mac_pred'].values + bigboss.loc[1::2, 'mac_pred'].values) / 2

# Add the averages back to the DataFrame as a new column
bigboss.loc[::2, 'mouse_pred_avg'] = mouse_av  # Assign averages to even indices only
bigboss.loc[::2, 'mac_pred_avg'] = mac_av 
bigboss['avg_pred'] = bigboss['mouse_pred_avg']-bigboss['mac_pred_avg']


same_sign = np.sign(bigboss['true']) == np.sign(bigboss['pred'])
print(f'# of same sign for true difference and predicted difference: {same_sign.sum()} / {len(bigboss)} = {same_sign.sum() / len(bigboss)} %')

same_sign = np.sign(bigboss.loc[::2]['true']) == np.sign(bigboss.loc[::2]['avg_pred'])
print(f'# of same sign for true difference and averaged predicted difference: {same_sign.sum()} / {len(bigboss) / 2} = {same_sign.sum() / (len(bigboss) / 2)}%')

x = bigboss['true']
y = bigboss['pred']

print(pearson_spearman(x, y))

# of same sign for true difference and predicted difference: 466 / 798 = 0.5839598997493735 %
# of same sign for true difference and averaged predicted difference: 240 / 399.0 = 0.6015037593984962%
Pearson correlation coefficient: 0.212, p-value: 1.37e-09
Spearman correlation coefficient: 0.202, p-value: 8.24e-09
None


In [48]:
a = non_val2_real_pred

In [7]:
# 5 BEST LOG MODELS FOLDCHANGE
import pandas as pd
import scipy.stats

mhc = 200
def correlate():
    rows = []
    # Lists for correlation calculations
    groups = ['Val2', 'Test2']
    dfs = [val2_foldchange_df, test2_foldchange_df]

    # Calculate correlations
    for group, df in zip(groups, dfs):
        x = df['true'].squeeze()
        y = df['pred'].squeeze()
        pearson, pp = scipy.stats.pearsonr(x, y)
        spearman, ps = scipy.stats.spearmanr(x, y)

        mse = mean_squared_error(x, y)

        same_sign = np.sign(df['true']) == np.sign(df['pred'])
        num_ss = same_sign.sum()
        len_ss = len(df)
        perc_ss = num_ss / len_ss
        
        same_sign_avg = np.sign(df.loc[::2]['true']) == np.sign(df.loc[::2]['avg_pred'])
        num_ssa = same_sign_avg.sum()
        len_ssa = len(df) / 2
        perc_ssa = num_ssa / len_ssa
        
        rows.append({'Group': group, 'Metric': 'Same Sign Count', 'Value': num_ss})
        rows.append({'Group': group, 'Metric': 'Total Count', 'Value': len_ss})
        rows.append({'Group': group, 'Metric': 'Same Sign %', 'Value': perc_ss})
     
        rows.append({'Group': group, 'Metric': 'Pearson', 'Value': pearson})
        rows.append({'Group': group, 'Metric': 'Pearson P-Val', 'Value': pp*mhc})
        rows.append({'Group': group, 'Metric': 'Spearman', 'Value': spearman})
        rows.append({'Group': group, 'Metric': 'Spearman P-Val', 'Value': ps*mhc})
        rows.append({'Group': group, 'Metric': 'MSE', 'Value': mse})
        
    return pd.DataFrame(rows)

# --- Main Script ---
all_results = []
species_list = ['macaque', 'rat', 'cow', 'pig']
model_list = ['bdbi7l3n', '7vsdq5k2', 'wnfdrgcc', '8i7h7nsh', 'ph4wrpxu']

for species in species_list:
    for model in model_list:
        model_dir = f'{model}_FINAL'

        nonMouse_true = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/log/{species}_liver_pos_ALL.bed', sep="\t", header=None)
        mouse_true = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/log/mouse_liver_pos_ALL.bed', sep="\t", header=None)
        one_to_one_peaks_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/oneToOnePeaks/{species}_mouse.bed', header=None, sep='\t')
        one_to_one_peaks_VAL = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/oneToOnePeaks/{species}_mouse.bed', header=None, sep='\t')

        peaks = one_to_one_peaks_VAL[[4, 14]]

        merged_df = peaks.merge(nonMouse_true, how='left', left_on=4, right_on=3, suffixes=('', '_non'))
        
        # Aligned orthologous peaks
        merged_df = merged_df.merge(mouse_true, how='left', left_on=14, right_on=3, suffixes=('_NON', '_mouse'))

        #############################################################################
        # load MOUSE VAL DF

        pred_mouse_VAL = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_mouse_TRAIN_VAL.csv', header=None)
        
        mouse_train = pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/splits/logPos/mouse_liver_TRAINONLY.narrowPeak', sep="\t", header=None)
        mouse_val = pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/splits/logPos/mouse_liver_VAL.narrowPeak', sep="\t", header=None)
        
        mouse_train_len = 2*len(mouse_train)
        mouse_val_len = 2*len(mouse_val)

        doubled_mouse_val_df = pd.concat([mouse_val, mouse_val]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_mouse_val_df = doubled_mouse_val_df.rename(columns={0:'mouse_chr'})
    
        pred_mouse_val_df = pred_mouse_VAL.iloc[mouse_train_len:mouse_train_len + mouse_val_len].reset_index(drop=True)
        pred_mouse_val_df = pd.concat([doubled_mouse_val_df.drop(columns=5), pred_mouse_val_df], axis=1)
        pred_mouse_val_df = pred_mouse_val_df.rename(columns={4:'mouse_true', 0:'mouse_pred'}).reset_index(drop=True)
        
        merged_mouse_val_true_pred = peaks.merge(pred_mouse_val_df, how='left', left_on=14, right_on=3)
        mouse_val_true_pred = merged_mouse_val_true_pred.rename(columns={3:'mouse_peak'}).drop([14], axis=1)

        #############################################################################
        # load non-mouse VAL2 DF

        pred_VAL = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_VAL.csv', header=None)
        
        val1_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/val_splits/val1/{species}_liver_VAL_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        val2_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/log_val2/{species}_liver_VAL.narrowPeak', header=None, delim_whitespace=True).rename(columns={0: 'mac_chr'})
        
        val1_len = 2*len(val1_df)
        val2_len = 2*len(val2_df)
        
        doubled_val2_df = pd.concat([val2_df, val2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        pred_val2_df = pred_VAL.iloc[val1_len:val1_len + val2_len].reset_index(drop=True)
        doubled_val2_combined = pd.concat([doubled_val2_df, pred_val2_df], axis=1)

        non_val2_true_pred = doubled_val2_combined.rename(columns={4: 'non_true', 0:'non_pred'})
        merged_non_val2_true_pred = peaks.merge(non_val2_true_pred, how='left', left_on=4, right_on=3).drop(14, axis=1)
        non_val2_true_pred = merged_non_val2_true_pred.rename(columns={3:'non_peak'}).drop(4, axis=1)
        #############################################################################

        # make VAL2 merged df
        val2_foldchange_df = pd.concat([mouse_val_true_pred, non_val2_true_pred], axis=1)
        
        val2_foldchange_df['true'] = val2_foldchange_df['mouse_true']-val2_foldchange_df['non_true']
        val2_foldchange_df['pred'] = val2_foldchange_df['mouse_pred']-val2_foldchange_df['non_pred']
        
        mouse_av = (val2_foldchange_df.loc[::2, 'mouse_pred'].values + val2_foldchange_df.loc[1::2, 'mouse_pred'].values) / 2
        mac_av = (val2_foldchange_df.loc[::2, 'non_pred'].values + val2_foldchange_df.loc[1::2, 'non_pred'].values) / 2
        
        # Add the averages back to the DataFrame as a new column
        val2_foldchange_df.loc[::2, 'mouse_pred_avg'] = mouse_av
        val2_foldchange_df.loc[::2, 'non_pred_avg'] = mac_av 
        val2_foldchange_df['avg_pred'] = val2_foldchange_df['mouse_pred_avg']-val2_foldchange_df['non_pred_avg']

        #############################################################################
        # TIME FOR TEST2 FOLDCHANGE

        peaks = one_to_one_peaks_TEST[[4, 14]]

        merged_df = peaks.merge(nonMouse_true, how='left', left_on=4, right_on=3, suffixes=('', '_non'))
        
        # Aligned orthologous peaks
        merged_df = merged_df.merge(mouse_true, how='left', left_on=14, right_on=3, suffixes=('_NON', '_mouse'))

        #############################################################################
        
        # load MOUSE TEST DF

        pred_mouse_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_mouse_TEST.csv', header=None)
        
        mouse_test = pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/test_splits/log_pos/mouse_liver_TEST_500bp.bed', sep="\t", header=None)
        
        mouse_test_len = 2*len(mouse_test)

        doubled_mouse_test_df = pd.concat([mouse_test, mouse_test]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_mouse_test_df = doubled_mouse_test_df.rename(columns={0:'mouse_chr'})
    
        pred_mouse_test_df = pred_mouse_TEST.head(mouse_test_len).reset_index(drop=True)
        pred_mouse_test_df = pd.concat([doubled_mouse_test_df.drop(columns=5), pred_mouse_test_df], axis=1)
        pred_mouse_test_df = pred_mouse_test_df.rename(columns={4:'mouse_true', 0:'mouse_pred'}).reset_index(drop=True)
        
        merged_mouse_test_true_pred = peaks.merge(pred_mouse_test_df, how='left', left_on=14, right_on=3)
        mouse_test_true_pred = merged_mouse_test_true_pred.rename(columns={3:'mouse_peak'}).drop([14], axis=1)

        #############################################################################
        # load nonmouse TEST2 DF
        
        pred_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TEST.csv', header=None)
        
        test1_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_LiuAll_test1/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test2_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test2/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).rename(columns={0: 'mac_chr'})
        
        test1_len = 2*len(test1_df)
        test2_len = 2*len(test2_df)
        
        doubled_test2_df = pd.concat([test2_df, test2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        pred_test2_df = pred_TEST.iloc[test1_len:test1_len + test2_len].reset_index(drop=True)
        doubled_test2_combined = pd.concat([doubled_test2_df, pred_test2_df], axis=1)

        non_test2_true_pred = doubled_test2_combined.rename(columns={4: 'non_true', 0:'non_pred'})
        merged_non_test2_true_pred = peaks.merge(non_test2_true_pred, how='left', left_on=4, right_on=3).drop(14, axis=1)
        non_test2_true_pred = merged_non_test2_true_pred.rename(columns={3:'non_peak'}).drop(4, axis=1)

        #############################################################################

        # make TEST2 merged df
        test2_foldchange_df = pd.concat([mouse_test_true_pred, non_test2_true_pred], axis=1)
        
        test2_foldchange_df['true'] = test2_foldchange_df['mouse_true']-test2_foldchange_df['non_true']
        test2_foldchange_df['pred'] = test2_foldchange_df['mouse_pred']-test2_foldchange_df['non_pred']
        
        mouse_av = (test2_foldchange_df.loc[::2, 'mouse_pred'].values + test2_foldchange_df.loc[1::2, 'mouse_pred'].values) / 2
        non_av = (test2_foldchange_df.loc[::2, 'non_pred'].values + test2_foldchange_df.loc[1::2, 'non_pred'].values) / 2
        
        # Add the averages back to the DataFrame as a new column
        test2_foldchange_df.loc[::2, 'mouse_pred_avg'] = mouse_av
        test2_foldchange_df.loc[::2, 'non_pred_avg'] = non_av 
        test2_foldchange_df['avg_pred'] = test2_foldchange_df['mouse_pred_avg']-test2_foldchange_df['non_pred_avg']
        
        corr_df = correlate()
        corr_df['Species'] = species
        corr_df['model'] = model
        all_results.append(corr_df)
        


#############################################################################

summary_df = pd.concat(all_results)

custom_group_order = [ 'Val2', 'Test2' ]
custom_metric_order = ['Same Sign Count', 'Total Count', 'Same Sign %', 'Pearson', 'Pearson P-Val', 'Spearman', 'Spearman P-Val', 'MSE']

# # Convert 'group' to a categorical type with the specified order.
summary_df['Group'] = pd.Categorical(summary_df['Group'], categories=custom_group_order, ordered=True)
summary_df['Metric'] = pd.Categorical(summary_df['Metric'], categories=custom_metric_order, ordered=True)


pivot_df = summary_df.pivot_table(
    index=["Species", "Group", "Metric"],
    columns="model",
    values="Value"
).reset_index()

pivot_df = pivot_df.sort_values(by=["Species", "Group", "Metric"])

pivot_df = pivot_df[["Species", "Group", "Metric"] + model_list]
# pivot_df_reordered = pivot_df[model_list]

def format_value(metric, value):
    """Format values depending on whether it's a P-Val metric or not."""
    if "P-Val" in metric:
        return f"{value:.2e}"   # scientific notation, 3 sig figs
    else:
        return f"{value:.3g}"   # regular decimal, 3 sig figs

# Apply formatting to a copy just for display/export
pivot_df_display = pivot_df.copy()
for col in model_list:  # each model column
    pivot_df_display[col] = pivot_df_display.apply(
        lambda row: format_value(row["Metric"], row[col]),
        axis=1
    )

display(pivot_df_display)

output_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/log_model_foldchange_table_FINAL_mse.tsv'
pivot_df_display.to_csv(output_filename, sep='\t')

# print(f'Results successfully saved to: {output_filename}')


model,Species,Group,Metric,bdbi7l3n,7vsdq5k2,wnfdrgcc,8i7h7nsh,ph4wrpxu
0,cow,Val2,Same Sign Count,200,184,184,183,181
1,cow,Val2,Total Count,330,330,330,330,330
2,cow,Val2,Same Sign %,0.606,0.558,0.558,0.555,0.548
3,cow,Val2,Pearson,0.259,0.286,0.273,0.25,0.273
4,cow,Val2,Pearson P-Val,3.80e-04,2.63e-05,9.35e-05,8.60e-04,9.73e-05
...,...,...,...,...,...,...,...,...
59,rat,Test2,Pearson,0.177,0.155,0.151,0.135,0.13
60,rat,Test2,Pearson P-Val,2.53e-09,7.42e-07,1.62e-06,5.33e-05,1.57e-04
61,rat,Test2,Spearman,0.153,0.145,0.138,0.124,0.119
62,rat,Test2,Spearman P-Val,1.17e-06,6.92e-06,2.76e-05,5.05e-04,1.23e-03


In [9]:
# 1 3 5 BEST LOG MODELS FOLDCHANGE
import pandas as pd
import scipy.stats

def correlate(mhc):
    rows = []
    # Lists for correlation calculations
    groups = ['Val2', 'Test2']
    dfs = [val2_foldchange_df, test2_foldchange_df]

    # Calculate correlations
    for group, df in zip(groups, dfs):
        x = df['true'].squeeze()
        y = df['pred'].squeeze()
        pearson, pp = scipy.stats.pearsonr(x, y)
        spearman, ps = scipy.stats.spearmanr(x, y)

        mse = mean_squared_error(x, y)
        
        same_sign = np.sign(df['true']) == np.sign(df['pred'])
        num_ss = same_sign.sum()
        len_ss = len(df)
        perc_ss = num_ss / len_ss
        
        same_sign_avg = np.sign(df.loc[::2]['true']) == np.sign(df.loc[::2]['avg_pred'])
        num_ssa = same_sign_avg.sum()
        len_ssa = len(df) / 2
        perc_ssa = num_ssa / len_ssa
        
        rows.append({'Group': group, 'Metric': 'Same Sign Count', 'Value': num_ss})
        rows.append({'Group': group, 'Metric': 'Total Count', 'Value': len_ss})
        # print(f'total count: {len_ss}')
        rows.append({'Group': group, 'Metric': 'Same Sign %', 'Value': perc_ss})
     
        rows.append({'Group': group, 'Metric': 'Pearson', 'Value': pearson})
        rows.append({'Group': group, 'Metric': 'Pearson P-Val', 'Value': pp*mhc})
        rows.append({'Group': group, 'Metric': 'Spearman', 'Value': spearman})
        rows.append({'Group': group, 'Metric': 'Spearman P-Val', 'Value': ps*mhc})

        rows.append({'Group': group, 'Metric': 'MSE', 'Value': mse})
        
    return pd.DataFrame(rows)

# --- Main Script ---
all_results = []
species_list = ['macaque', 'rat', 'cow', 'pig']
model_list = ['bdbi7l3n', 'kf8188qf', 'cq45eb2s']

for species in species_list:
    for model in model_list:
        mhc = 200
        if model == 'kf8188qf':
            mhc = 100
        model_dir = f'{model}_FINAL'

        nonMouse_true = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/log/{species}_liver_pos_ALL.bed', sep="\t", header=None)
        mouse_true = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/log/mouse_liver_pos_ALL.bed', sep="\t", header=None)
        one_to_one_peaks_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/oneToOnePeaks/{species}_mouse.bed', header=None, sep='\t')
        one_to_one_peaks_VAL = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/oneToOnePeaks/{species}_mouse.bed', header=None, sep='\t')

        peaks = one_to_one_peaks_VAL[[4, 14]]

        merged_df = peaks.merge(nonMouse_true, how='left', left_on=4, right_on=3, suffixes=('', '_non'))
        
        # Aligned orthologous peaks
        merged_df = merged_df.merge(mouse_true, how='left', left_on=14, right_on=3, suffixes=('_NON', '_mouse'))

        #############################################################################
        # load MOUSE VAL DF

        pred_mouse_VAL = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_mouse_TRAIN_VAL.csv', header=None)
        
        mouse_train = pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/splits/logPos/mouse_liver_TRAINONLY.narrowPeak', sep="\t", header=None)
        mouse_val = pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/splits/logPos/mouse_liver_VAL.narrowPeak', sep="\t", header=None)
        
        mouse_train_len = 2*len(mouse_train)
        mouse_val_len = 2*len(mouse_val)

        doubled_mouse_val_df = pd.concat([mouse_val, mouse_val]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_mouse_val_df = doubled_mouse_val_df.rename(columns={0:'mouse_chr'})
    
        pred_mouse_val_df = pred_mouse_VAL.iloc[mouse_train_len:mouse_train_len + mouse_val_len].reset_index(drop=True)
        pred_mouse_val_df = pd.concat([doubled_mouse_val_df.drop(columns=5), pred_mouse_val_df], axis=1)
        pred_mouse_val_df = pred_mouse_val_df.rename(columns={4:'mouse_true', 0:'mouse_pred'}).reset_index(drop=True)
        
        merged_mouse_val_true_pred = peaks.merge(pred_mouse_val_df, how='left', left_on=14, right_on=3)
        mouse_val_true_pred = merged_mouse_val_true_pred.rename(columns={3:'mouse_peak'}).drop([14], axis=1)

        #############################################################################
        # load non-mouse VAL2 DF

        pred_VAL = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_VAL.csv', header=None)
        
        val1_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/val_splits/val1/{species}_liver_VAL_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        val2_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/log_val2/{species}_liver_VAL.narrowPeak', header=None, delim_whitespace=True).rename(columns={0: 'mac_chr'})
        
        val1_len = 2*len(val1_df)
        val2_len = 2*len(val2_df)
        
        doubled_val2_df = pd.concat([val2_df, val2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        pred_val2_df = pred_VAL.iloc[val1_len:val1_len + val2_len].reset_index(drop=True)
        doubled_val2_combined = pd.concat([doubled_val2_df, pred_val2_df], axis=1)

        non_val2_true_pred = doubled_val2_combined.rename(columns={4: 'non_true', 0:'non_pred'})
        merged_non_val2_true_pred = peaks.merge(non_val2_true_pred, how='left', left_on=4, right_on=3).drop(14, axis=1)
        non_val2_true_pred = merged_non_val2_true_pred.rename(columns={3:'non_peak'}).drop(4, axis=1)
        #############################################################################

        # make VAL2 merged df
        val2_foldchange_df = pd.concat([mouse_val_true_pred, non_val2_true_pred], axis=1)
        
        val2_foldchange_df['true'] = val2_foldchange_df['mouse_true']-val2_foldchange_df['non_true']
        val2_foldchange_df['pred'] = val2_foldchange_df['mouse_pred']-val2_foldchange_df['non_pred']
        
        mouse_av = (val2_foldchange_df.loc[::2, 'mouse_pred'].values + val2_foldchange_df.loc[1::2, 'mouse_pred'].values) / 2
        mac_av = (val2_foldchange_df.loc[::2, 'non_pred'].values + val2_foldchange_df.loc[1::2, 'non_pred'].values) / 2
        
        # Add the averages back to the DataFrame as a new column
        val2_foldchange_df.loc[::2, 'mouse_pred_avg'] = mouse_av
        val2_foldchange_df.loc[::2, 'non_pred_avg'] = mac_av 
        val2_foldchange_df['avg_pred'] = val2_foldchange_df['mouse_pred_avg']-val2_foldchange_df['non_pred_avg']

        #############################################################################
        # TIME FOR TEST2 FOLDCHANGE

        peaks = one_to_one_peaks_TEST[[4, 14]]

        merged_df = peaks.merge(nonMouse_true, how='left', left_on=4, right_on=3, suffixes=('', '_non'))
        
        # Aligned orthologous peaks
        merged_df = merged_df.merge(mouse_true, how='left', left_on=14, right_on=3, suffixes=('_NON', '_mouse'))

        #############################################################################
        
        # load MOUSE TEST DF

        pred_mouse_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_mouse_TEST.csv', header=None)
        
        mouse_test = pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/test_splits/log_pos/mouse_liver_TEST_500bp.bed', sep="\t", header=None)
        
        mouse_test_len = 2*len(mouse_test)

        doubled_mouse_test_df = pd.concat([mouse_test, mouse_test]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_mouse_test_df = doubled_mouse_test_df.rename(columns={0:'mouse_chr'})
    
        pred_mouse_test_df = pred_mouse_TEST.head(mouse_test_len).reset_index(drop=True)
        pred_mouse_test_df = pd.concat([doubled_mouse_test_df.drop(columns=5), pred_mouse_test_df], axis=1)
        pred_mouse_test_df = pred_mouse_test_df.rename(columns={4:'mouse_true', 0:'mouse_pred'}).reset_index(drop=True)
        
        merged_mouse_test_true_pred = peaks.merge(pred_mouse_test_df, how='left', left_on=14, right_on=3)
        mouse_test_true_pred = merged_mouse_test_true_pred.rename(columns={3:'mouse_peak'}).drop([14], axis=1)

        #############################################################################
        # load nonmouse TEST2 DF
        
        pred_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TEST.csv', header=None)
        
        test1_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_LiuAll_test1/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test2_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test2/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).rename(columns={0: 'mac_chr'})
        
        test1_len = 2*len(test1_df)
        test2_len = 2*len(test2_df)
        
        doubled_test2_df = pd.concat([test2_df, test2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        pred_test2_df = pred_TEST.iloc[test1_len:test1_len + test2_len].reset_index(drop=True)
        doubled_test2_combined = pd.concat([doubled_test2_df, pred_test2_df], axis=1)

        non_test2_true_pred = doubled_test2_combined.rename(columns={4: 'non_true', 0:'non_pred'})
        merged_non_test2_true_pred = peaks.merge(non_test2_true_pred, how='left', left_on=4, right_on=3).drop(14, axis=1)
        non_test2_true_pred = merged_non_test2_true_pred.rename(columns={3:'non_peak'}).drop(4, axis=1)

        #############################################################################

        # make TEST2 merged df
        test2_foldchange_df = pd.concat([mouse_test_true_pred, non_test2_true_pred], axis=1)
        
        test2_foldchange_df['true'] = test2_foldchange_df['mouse_true']-test2_foldchange_df['non_true']
        test2_foldchange_df['pred'] = test2_foldchange_df['mouse_pred']-test2_foldchange_df['non_pred']
        
        mouse_av = (test2_foldchange_df.loc[::2, 'mouse_pred'].values + test2_foldchange_df.loc[1::2, 'mouse_pred'].values) / 2
        non_av = (test2_foldchange_df.loc[::2, 'non_pred'].values + test2_foldchange_df.loc[1::2, 'non_pred'].values) / 2
        
        # Add the averages back to the DataFrame as a new column
        test2_foldchange_df.loc[::2, 'mouse_pred_avg'] = mouse_av
        test2_foldchange_df.loc[::2, 'non_pred_avg'] = non_av 
        test2_foldchange_df['avg_pred'] = test2_foldchange_df['mouse_pred_avg']-test2_foldchange_df['non_pred_avg']
        
        corr_df = correlate(mhc)
        corr_df['Species'] = species
        corr_df['model'] = model
        all_results.append(corr_df)
        


#############################################################################

summary_df = pd.concat(all_results)

custom_group_order = [ 'Val2', 'Test2' ]
custom_metric_order = ['Same Sign Count', 'Total Count', 'Same Sign %', 'Pearson', 'Pearson P-Val', 'Spearman', 'Spearman P-Val', 'MSE']

# # Convert 'group' to a categorical type with the specified order.
summary_df['Group'] = pd.Categorical(summary_df['Group'], categories=custom_group_order, ordered=True)
summary_df['Metric'] = pd.Categorical(summary_df['Metric'], categories=custom_metric_order, ordered=True)


pivot_df = summary_df.pivot_table(
    index=["Species", "Group", "Metric"],
    columns="model",
    values="Value"
).reset_index()

pivot_df = pivot_df.sort_values(by=["Species", "Group", "Metric"])

pivot_df = pivot_df[["Species", "Group", "Metric"] + model_list]
# pivot_df_reordered = pivot_df[model_list]

def format_value(metric, value):
    """Format values depending on whether it's a P-Val metric or not."""
    if "P-Val" in metric:
        return f"{value:.2e}"   # scientific notation, 3 sig figs
    else:
        return f"{value:.3g}"   # regular decimal, 3 sig figs

# Apply formatting to a copy just for display/export
pivot_df_display = pivot_df.copy()
for col in model_list:  # each model column
    pivot_df_display[col] = pivot_df_display.apply(
        lambda row: format_value(row["Metric"], row[col]),
        axis=1
    )

display(pivot_df_display)

output_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/135log_model_foldchange_table_FINAL_mse.tsv'
pivot_df_display.to_csv(output_filename, sep='\t')

# print(f'Results successfully saved to: {output_filename}')


model,Species,Group,Metric,bdbi7l3n,kf8188qf,cq45eb2s
0,cow,Val2,Same Sign Count,200,211,193
1,cow,Val2,Total Count,330,330,330
2,cow,Val2,Same Sign %,0.606,0.639,0.585
3,cow,Val2,Pearson,0.259,0.355,0.274
4,cow,Val2,Pearson P-Val,3.80e-04,3.28e-09,8.33e-05
...,...,...,...,...,...,...
59,rat,Test2,Pearson,0.177,0.293,0.285
60,rat,Test2,Pearson P-Val,2.53e-09,5.84e-28,5.56e-26
61,rat,Test2,Spearman,0.153,0.281,0.262
62,rat,Test2,Spearman P-Val,1.17e-06,1.62e-25,1.10e-21


In [13]:
# 2KB MODEL FOLDCHANGE
import pandas as pd
import scipy.stats

mhc = 100

def correlate():
    rows = []
    # Lists for correlation calculations
    groups = ['Test2']
    dfs = [test2_foldchange_df]

    # Calculate correlations
    for group, df in zip(groups, dfs):
        x = df['true'].squeeze()
        y = df['pred'].squeeze()
        pearson, pp = scipy.stats.pearsonr(x, y)
        spearman, ps = scipy.stats.spearmanr(x, y)

        mse = mean_squared_error(x, y)

        same_sign = np.sign(df['true']) == np.sign(df['pred'])
        num_ss = same_sign.sum()
        len_ss = len(df)
        perc_ss = num_ss / len_ss
        
        same_sign_avg = np.sign(df.loc[::2]['true']) == np.sign(df.loc[::2]['avg_pred'])
        num_ssa = same_sign_avg.sum()
        len_ssa = len(df) / 2
        perc_ssa = num_ssa / len_ssa
        
        rows.append({'Group': group, 'Metric': 'Same Sign Count', 'Value': num_ss})
        rows.append({'Group': group, 'Metric': 'Total Count', 'Value': len_ss})
        # print(f'total count: {len_ss}')
        rows.append({'Group': group, 'Metric': 'Same Sign %', 'Value': perc_ss})
     
        rows.append({'Group': group, 'Metric': 'Pearson', 'Value': pearson})
        rows.append({'Group': group, 'Metric': 'Pearson P-Val', 'Value': pp*mhc})
        rows.append({'Group': group, 'Metric': 'Spearman', 'Value': spearman})
        rows.append({'Group': group, 'Metric': 'Spearman P-Val', 'Value': ps*mhc})
        rows.append({'Group': group, 'Metric': 'MSE', 'Value': mse})
        
    return pd.DataFrame(rows)

# --- Main Script ---
all_results = []
species_list = ['macaque', 'rat', 'cow', 'pig']
model_list = ['im88hepv']

for species in species_list:
    for model in model_list:
        model_dir = f'{model}_FINAL'

        nonMouse_true = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/log/{species}_liver_pos_ALL.bed', sep="\t", header=None)
        mouse_true = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/log/mouse_liver_pos_ALL.bed', sep="\t", header=None)
        one_to_one_peaks_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/oneToOnePeaks/{species}_mouse.bed', header=None, sep='\t')

        peaks = one_to_one_peaks_TEST[[4, 14]]

        merged_df = peaks.merge(nonMouse_true, how='left', left_on=4, right_on=3, suffixes=('', '_non'))
        
        # Aligned orthologous peaks
        merged_df = merged_df.merge(mouse_true, how='left', left_on=14, right_on=3, suffixes=('_NON', '_mouse'))

        #############################################################################
        
        # load MOUSE TEST DF

        pred_mouse_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_mouse_TEST.csv', header=None)
        
        mouse_test = pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/test_splits_2kb/log_pos/mouse_liver_TEST_500bp.bed', sep="\t", header=None)
        
        mouse_test_len = 2*len(mouse_test)

        doubled_mouse_test_df = pd.concat([mouse_test, mouse_test]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_mouse_test_df = doubled_mouse_test_df.rename(columns={0:'mouse_chr'})
    
        pred_mouse_test_df = pred_mouse_TEST.head(mouse_test_len).reset_index(drop=True)
        pred_mouse_test_df = pd.concat([doubled_mouse_test_df, pred_mouse_test_df], axis=1)
        pred_mouse_test_df = pred_mouse_test_df.rename(columns={4:'mouse_true', 0:'mouse_pred'}).reset_index(drop=True)
        
        merged_mouse_test_true_pred = peaks.merge(pred_mouse_test_df, how='left', left_on=14, right_on=3)
        mouse_test_true_pred = merged_mouse_test_true_pred.rename(columns={3:'mouse_peak'}).drop([14], axis=1)

        #############################################################################
        # load nonmouse TEST2 DF
        
        pred_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TEST.csv', header=None)
        
        test1_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits_2kb/log_LiuAll_test1/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test2_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits_2kb/log_test2/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).rename(columns={0: 'mac_chr'})
        
        test1_len = 2*len(test1_df)
        test2_len = 2*len(test2_df)
        
        doubled_test2_df = pd.concat([test2_df, test2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        pred_test2_df = pred_TEST.iloc[test1_len:test1_len + test2_len].reset_index(drop=True)
        doubled_test2_combined = pd.concat([doubled_test2_df, pred_test2_df], axis=1)

        non_test2_true_pred = doubled_test2_combined.rename(columns={4: 'non_true', 0:'non_pred'})
        merged_non_test2_true_pred = peaks.merge(non_test2_true_pred, how='left', left_on=4, right_on=3).drop(14, axis=1)
        non_test2_true_pred = merged_non_test2_true_pred.rename(columns={3:'non_peak'}).drop(4, axis=1)

        #############################################################################

        # make TEST2 merged df
        test2_foldchange_df = pd.concat([mouse_test_true_pred, non_test2_true_pred], axis=1)
        
        test2_foldchange_df['true'] = test2_foldchange_df['mouse_true']-test2_foldchange_df['non_true']
        test2_foldchange_df['pred'] = test2_foldchange_df['mouse_pred']-test2_foldchange_df['non_pred']
        
        mouse_av = (test2_foldchange_df.loc[::2, 'mouse_pred'].values + test2_foldchange_df.loc[1::2, 'mouse_pred'].values) / 2
        non_av = (test2_foldchange_df.loc[::2, 'non_pred'].values + test2_foldchange_df.loc[1::2, 'non_pred'].values) / 2
        
        # Add the averages back to the DataFrame as a new column
        test2_foldchange_df.loc[::2, 'mouse_pred_avg'] = mouse_av
        test2_foldchange_df.loc[::2, 'non_pred_avg'] = non_av 
        test2_foldchange_df['avg_pred'] = test2_foldchange_df['mouse_pred_avg']-test2_foldchange_df['non_pred_avg']
        
        corr_df = correlate()
        corr_df['Species'] = species
        corr_df['model'] = model
        all_results.append(corr_df)
        


#############################################################################

summary_df = pd.concat(all_results)

custom_metric_order = ['Same Sign Count', 'Total Count', 'Same Sign %', 'Pearson', 'Pearson P-Val', 'Spearman', 'Spearman P-Val', 'MSE']

summary_df['Metric'] = pd.Categorical(summary_df['Metric'], categories=custom_metric_order, ordered=True)

pivot_df = summary_df.pivot_table(
    index=["Species", "Group", "Metric"],
    columns="model",
    values="Value"
).reset_index()

pivot_df = pivot_df.sort_values(by=["Species", "Group", "Metric"])

pivot_df = pivot_df[["Species", "Group", "Metric"] + model_list]
# pivot_df_reordered = pivot_df[model_list]

def format_value(metric, value):
    """Format values depending on whether it's a P-Val metric or not."""
    if "P-Val" in metric:
        return f"{value:.2e}"   # scientific notation, 3 sig figs
    else:
        return f"{value:.3g}"   # regular decimal, 3 sig figs

# Apply formatting to a copy just for display/export
pivot_df_display = pivot_df.copy()
for col in model_list:  # each model column
    pivot_df_display[col] = pivot_df_display.apply(
        lambda row: format_value(row["Metric"], row[col]),
        axis=1
    )

pivot_df_2kb = pivot_df_display
display(pivot_df_display)

output_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/2kb_log_model_foldchange_table_FINAL_mse.tsv'
pivot_df_2kb.to_csv(output_filename, sep='\t')

# print(f'Results successfully saved to: {output_filename}')


model,Species,Group,Metric,im88hepv
0,cow,Test2,Same Sign Count,327.0
1,cow,Test2,Total Count,618.0
2,cow,Test2,Same Sign %,0.529
3,cow,Test2,Pearson,0.199
4,cow,Test2,Pearson P-Val,5.93e-05
5,cow,Test2,Spearman,0.226
6,cow,Test2,Spearman P-Val,1.38e-06
7,cow,Test2,MSE,0.382
8,macaque,Test2,Same Sign Count,461.0
9,macaque,Test2,Total Count,798.0


In [63]:
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
import numpy as np

species = 'pig'

both_df = pd.read_csv(f"/home/azstephe/liverRegression/regression_liver/data/splits/{species}Mouse/{species}ToMouse_liver_{species}Enhancer_mouseEnhancer_test_wawb.narrowPeak", sep="\t", header=None)
mac_qn = pd.read_csv(f"/home/azstephe/liverRegression/regression_liver/data/ladder_qn/{species}_liver_pos_ALL.bed", sep="\t", header=None)
mouse_qn = pd.read_csv(f"/home/azstephe/liverRegression/regression_liver/data/ladder_qn/mouse_liver_pos_ALL.bed", sep="\t", header=None)

unique1_2 = both_df[~both_df.duplicated(subset=[1, 2], keep=False)] # entries with unique mac mapped to mouse start and end
all_unique = unique1_2[~unique1_2.duplicated(subset=[11,12], keep=False)]

#### FUNCTIONS

def pearson_spearman(x, y):
    pearson_corr, pearson_p_value = scipy.stats.pearsonr(x, y)
    print(f"Pearson correlation coefficient: {pearson_corr:.4f}, p-value: {pearson_p_value:.4g}")

    spearman_corr, spearman_p_value = scipy.stats.spearmanr(x, y)
    print(f"Spearman correlation coefficient: {spearman_corr:.4f}, p-value: {spearman_p_value:.4g}")
    
def collapse_if_identical(lst):
    return lst[0] if all(x == lst[0] for x in lst) else lst
    

# takes in list and returns peak with largest signal
def keep_largest_signal(lst, qn_df):
    if isinstance(lst, list):
        maxPeakName = ''
        maxPeakStrength = 0
        for peak in lst:
            matching_rows = qn_df[qn_df[3] == peak]
            if not matching_rows.empty:
                signal = matching_rows[4].iloc[0]
                if signal > maxPeakStrength:
                    maxPeakStrength = signal
                    maxPeakName = peak
        return maxPeakName
    else: 
        return lst

# def overlap(start1, end1, start2, end2):
#     return max(0, min(end1, end2)-max(start1, start2))

def get_biggest_overlap(lst, col, df):
    maxOverlap = 0
    maxPeakOverlap = ""
    for peak in lst:
        row = df[df[col] == peak].iloc[0] #iloc bc possible to return multiple rows (never will based on th
        overlap = max(0, min(row[2], row[12])-max(row[1], row[11]))
        if overlap > maxOverlap:
            maxOverlap = overlap
            maxPeakOverlap = peak
    return peak

#### COL1,2 DUPLICATES

duplicates3 = both_df[both_df.duplicated(subset=[1, 2], keep=False)] # entries with duplicated mac mapped to mouse start and end 
grouped_dups3 = duplicates3.groupby([1, 2])[3].apply(list).reset_index(name='col3')

# grouped dups: start | end | [peaks with these endpoints]
grouped_dups3['col3'] = grouped_dups3['col3'].apply(collapse_if_identical) 

grouped_dups3['col3'] = grouped_dups3['col3'].apply(lambda x: keep_largest_signal(x, mac_qn)) # redundant endpoints now map to 1 peak in col3
grouped_dups3 = grouped_dups3[grouped_dups3['col3'] != ''] # remove macaque peaks that aren't large enough

# df of duplicated col1,2 with strongest peak
keep_strongestcol3 = duplicates3.merge(grouped_dups3[['col3']], left_on=3, right_on='col3', how='inner').drop('col3', axis=1) # keeps the strongest signal in col3 for redundant endpoints

#### COL11,12 DUPLICATES

grouped_dups13 = keep_strongestcol3.groupby([11, 12])[13].apply(list).reset_index(name='col13')
grouped_dups13['col13'] = grouped_dups13['col13'].apply(collapse_if_identical)

grouped_dups13['col13'] = grouped_dups13['col13'].apply(lambda x: keep_largest_signal(x, mouse_qn))
grouped_dups13 = grouped_dups13[grouped_dups13['col13'] != '']

keep_strongestcol13 = keep_strongestcol3.merge(grouped_dups13[['col13']], left_on=13, right_on='col13', how='inner').drop('col13', axis=1)

unique_endpoints = keep_strongestcol13 #rows with unique endpoints from the duplicated endpoints set

####

# col3 peakname duplicates with different endpoints
still_dups_col3 = unique_endpoints[unique_endpoints.duplicated(subset=[3], keep=False)]

# col3peaks | [col13 peaks intersecting col3 peak]
grouped_dcol3 = still_dups_col3.groupby(3)[13].apply(list).reset_index(name='col13')

# get the col13 peak with most overlap of col3
grouped_dcol3['col13'] = grouped_dcol3['col13'].apply(lambda x: get_biggest_overlap(x, 13, still_dups_col3))

merged3 = still_dups_col3.merge(grouped_dcol3[[3, 'col13']], left_on=[3, 13], right_on=[3, 'col13'], how='left', indicator=True)

remove3 = merged3[merged3['_merge'] == 'left_only'].drop(columns=['_merge']) # col13 is what we want to remove

unique_endpoints_subset = unique_endpoints.iloc[:,:20]
remove3_subset = remove3.iloc[:,:20]

# all col3 entries unique
unique3 = unique_endpoints[~unique_endpoints_subset.apply(tuple, axis=1).isin(remove3_subset.apply(tuple, axis=1))] 

####

# col13 peakname duplicates with different endpoints
still_dups_col13 = unique3[unique3.duplicated(subset=[13], keep=False)]

# col13peaks | [col3 peaks intersecting col13 peak]
grouped_dcol13 = still_dups_col13.groupby(13)[3].apply(list).reset_index(name='col3')

# get the col3 peak with most overlap of col13
grouped_dcol13['col3'] = grouped_dcol13['col3'].apply(lambda x: get_biggest_overlap(x, 3, still_dups_col13))

merged13 = still_dups_col13.merge(grouped_dcol13[[13, 'col3']], left_on=[3, 13], right_on=['col3', 13], how='left', indicator=True)

remove13 = merged13[merged13['_merge'] == 'left_only'].drop(columns=['_merge'])

unique3_subset = unique3.iloc[:, :20]
remove13_subset = remove13.iloc[:, :20]

# Identify rows in u3 that are NOT in remove3
filtered_peaks_unique = unique3[~unique3_subset.apply(tuple, axis=1).isin(remove13_subset.apply(tuple, axis=1))]

full_unique = pd.concat([all_unique, filtered_peaks_unique])

# Sort by column '1'
one_to_one_peaks = full_unique.sort_values(by=1).reset_index(drop=True)

one_to_one_peaks.to_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/oneToOnePeaks_eqn/{species}_mouse_test.bed', header=None, sep='\t')

one_to_one_peaks

##############################



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,chr2,3229118,3229378,peak3020,-1,.,-1,-1,-1,151,chr2,3228532,3229401,peak11427,1000,.,16.724,152.951,149.789,727
1,chr2,4222111,4222486,peak3028,-1,.,-1,-1,-1,272,chr2,4222038,4222501,peak11441,868,.,6.236,32.050,29.533,356
2,chr1,4543950,4544442,peak13844,-1,.,-1,-1,-1,289,chr1,4543909,4544544,peak22,1000,.,10.428,103.923,100.955,398
3,chr2,5519359,5519700,peak3179,-1,.,-1,-1,-1,199,chr2,5519128,5519647,peak11454,983,.,5.291,23.969,21.543,364
4,chr2,6072172,6072362,peak3200,-1,.,-1,-1,-1,106,chr2,6072019,6072463,peak11460,1000,.,10.449,99.040,96.094,249
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,chr1,194995147,194995697,peak19951,-1,.,-1,-1,-1,211,chr1,194995180,194995620,peak1965,1000,.,8.096,68.960,66.170,196
268,chr1,195016830,195017261,peak19954,-1,.,-1,-1,-1,137,chr1,195017180,195017466,peak1968,1000,.,5.099,40.066,37.475,169
269,chr1,195018208,195018410,peak19955,-1,.,-1,-1,-1,104,chr1,195017987,195018471,peak1969,1000,.,11.097,148.403,145.257,336
270,chr1,195033815,195034284,peak19958,-1,.,-1,-1,-1,315,chr1,195033987,195034343,peak1970,1000,.,12.148,108.731,105.741,201


In [12]:
# EQN MODEL FOLDCHANGE
import pandas as pd
import scipy.stats

mhc = 300

def correlate():
    rows = []
    # Lists for correlation calculations
    groups = ['Test2']
    dfs = [test2_foldchange_df]

    # Calculate correlations
    for group, df in zip(groups, dfs):
        x = df['true'].squeeze()
        y = df['pred'].squeeze()
        pearson, pp = scipy.stats.pearsonr(x, y)
        spearman, ps = scipy.stats.spearmanr(x, y)

        mse = mean_squared_error(x, y)
        
        same_sign = np.sign(df['true']) == np.sign(df['pred'])
        num_ss = same_sign.sum()
        len_ss = len(df)
        perc_ss = num_ss / len_ss
        
        same_sign_avg = np.sign(df.loc[::2]['true']) == np.sign(df.loc[::2]['avg_pred'])
        num_ssa = same_sign_avg.sum()
        len_ssa = len(df) / 2
        perc_ssa = num_ssa / len_ssa
        
        rows.append({'Group': group, 'Metric': 'Same Sign Count', 'Value': num_ss})
        rows.append({'Group': group, 'Metric': 'Total Count', 'Value': len_ss})
        # print(f'total count: {len_ss}')
        rows.append({'Group': group, 'Metric': 'Same Sign %', 'Value': perc_ss})
     
        rows.append({'Group': group, 'Metric': 'Pearson', 'Value': pearson})
        rows.append({'Group': group, 'Metric': 'Pearson P-Val', 'Value': pp*mhc})
        rows.append({'Group': group, 'Metric': 'Spearman', 'Value': spearman})
        rows.append({'Group': group, 'Metric': 'Spearman P-Val', 'Value': ps*mhc})
        rows.append({'Group': group, 'Metric': 'MSE', 'Value': mse})
        
    return pd.DataFrame(rows)

# --- Main Script ---
all_results = []
species_list = ['macaque', 'rat', 'cow', 'pig']
model_list = ['mcf297qb', '7l12zan1']

for species in species_list:
    for model in model_list:
        model_dir = f'{model}_FINAL'

        nonMouse_true = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/log/{species}_liver_pos_ALL.bed', sep="\t", header=None)
        mouse_true = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/log/mouse_liver_pos_ALL.bed', sep="\t", header=None)
        one_to_one_peaks_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/oneToOnePeaks_eqn/{species}_mouse_test.bed', header=None, sep='\t')

        peaks = one_to_one_peaks_TEST[[4, 14]]

        merged_df = peaks.merge(nonMouse_true, how='left', left_on=4, right_on=3, suffixes=('', '_non'))
        
        # Aligned orthologous peaks
        merged_df = merged_df.merge(mouse_true, how='left', left_on=14, right_on=3, suffixes=('_NON', '_mouse'))

        #############################################################################
        
        # load MOUSE TEST DF

        pred_mouse_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_mouse_TEST.csv', header=None)
        
        mouse_test = pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/test_splits_eqn/log_pos/mouse_liver_TEST_500bp.bed', sep="\t", header=None)
        
        mouse_test_len = 2*len(mouse_test)

        doubled_mouse_test_df = pd.concat([mouse_test, mouse_test]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_mouse_test_df = doubled_mouse_test_df.rename(columns={0:'mouse_chr'})
    
        pred_mouse_test_df = pred_mouse_TEST.head(mouse_test_len).reset_index(drop=True)
        pred_mouse_test_df = pd.concat([doubled_mouse_test_df, pred_mouse_test_df], axis=1)
        pred_mouse_test_df = pred_mouse_test_df.rename(columns={4:'mouse_true', 0:'mouse_pred'}).reset_index(drop=True)
        
        merged_mouse_test_true_pred = peaks.merge(pred_mouse_test_df, how='left', left_on=14, right_on=3)
        mouse_test_true_pred = merged_mouse_test_true_pred.rename(columns={3:'mouse_peak'}).drop([14], axis=1)

        #############################################################################
        # load nonmouse TEST2 DF
        
        pred_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TEST.csv', header=None)
        
        test1_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_LiuAll_test1/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test2_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits_eqn/log_test2/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).rename(columns={0: 'mac_chr'})
        
        test1_len = 2*len(test1_df)
        test2_len = 2*len(test2_df)
        
        doubled_test2_df = pd.concat([test2_df, test2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        pred_test2_df = pred_TEST.iloc[test1_len:test1_len + test2_len].reset_index(drop=True)
        doubled_test2_combined = pd.concat([doubled_test2_df, pred_test2_df], axis=1)

        non_test2_true_pred = doubled_test2_combined.rename(columns={4: 'non_true', 0:'non_pred'})
        merged_non_test2_true_pred = peaks.merge(non_test2_true_pred, how='left', left_on=4, right_on=3).drop(14, axis=1)
        non_test2_true_pred = merged_non_test2_true_pred.rename(columns={3:'non_peak'}).drop(4, axis=1)

        #############################################################################

        # make TEST2 merged df
        test2_foldchange_df = pd.concat([mouse_test_true_pred, non_test2_true_pred], axis=1)
        
        test2_foldchange_df['true'] = test2_foldchange_df['mouse_true']-test2_foldchange_df['non_true']
        test2_foldchange_df['pred'] = test2_foldchange_df['mouse_pred']-test2_foldchange_df['non_pred']
        
        mouse_av = (test2_foldchange_df.loc[::2, 'mouse_pred'].values + test2_foldchange_df.loc[1::2, 'mouse_pred'].values) / 2
        non_av = (test2_foldchange_df.loc[::2, 'non_pred'].values + test2_foldchange_df.loc[1::2, 'non_pred'].values) / 2
        
        # Add the averages back to the DataFrame as a new column
        test2_foldchange_df.loc[::2, 'mouse_pred_avg'] = mouse_av
        test2_foldchange_df.loc[::2, 'non_pred_avg'] = non_av 
        test2_foldchange_df['avg_pred'] = test2_foldchange_df['mouse_pred_avg']-test2_foldchange_df['non_pred_avg']
        
        corr_df = correlate()
        corr_df['Species'] = species
        corr_df['model'] = model
        all_results.append(corr_df)
        


#############################################################################

summary_df = pd.concat(all_results)

custom_metric_order = ['Same Sign Count', 'Total Count', 'Same Sign %', 'Pearson', 'Pearson P-Val', 'Spearman', 'Spearman P-Val', 'MSE']

summary_df['Metric'] = pd.Categorical(summary_df['Metric'], categories=custom_metric_order, ordered=True)

pivot_df = summary_df.pivot_table(
    index=["Species", "Group", "Metric"],
    columns="model",
    values="Value"
).reset_index()

pivot_df = pivot_df.sort_values(by=["Species", "Group", "Metric"])

pivot_df = pivot_df[["Species", "Group", "Metric"] + model_list]
# pivot_df_reordered = pivot_df[model_list]

def format_value(metric, value):
    """Format values depending on whether it's a P-Val metric or not."""
    if "P-Val" in metric:
        return f"{value:.2e}"   # scientific notation, 3 sig figs
    else:
        return f"{value:.3g}"   # regular decimal, 3 sig figs

# Apply formatting to a copy just for display/export
pivot_df_display = pivot_df.copy()
for col in model_list:  # each model column
    pivot_df_display[col] = pivot_df_display.apply(
        lambda row: format_value(row["Metric"], row[col]),
        axis=1
    )

pivot_df_eqn = pivot_df_display
display(pivot_df_eqn)

output_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/eqn_log_model_foldchange_table_FINAL_mse.tsv'
pivot_df_eqn.to_csv(output_filename, sep='\t')

# print(f'Results successfully saved to: {output_filename}')


model,Species,Group,Metric,mcf297qb,7l12zan1
0,cow,Test2,Same Sign Count,380.0,363.0
1,cow,Test2,Total Count,618.0,618.0
2,cow,Test2,Same Sign %,0.615,0.587
3,cow,Test2,Pearson,0.202,0.221
4,cow,Test2,Pearson P-Val,0.000126,8.11e-06
5,cow,Test2,Spearman,0.225,0.226
6,cow,Test2,Spearman P-Val,4.83e-06,3.88e-06
7,cow,Test2,MSE,0.53,0.328
8,macaque,Test2,Same Sign Count,446.0,455.0
9,macaque,Test2,Total Count,798.0,798.0


In [30]:
# EXAMPLE MAKE ONE_TO_ONE_PEAKS THEN SAVE IT

species = 'macaque'
both_df = pd.read_csv(f"/home/azstephe/liverRegression/regression_liver/data/splits/{species}Mouse/{species}ToMouse_liver_{species}Enhancer_mouseEnhancer_test_wawb.narrowPeak", sep="\t", header=None)
# both_df = pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/mapped/macaqueToMouse_liver_macaqueEnhancer_mouseEnhancer_wawb.narrowPeak', sep="\t", header=None)
nonMouse_true = pd.read_csv(f"/home/azstephe/liverRegression/regression_liver/data/sorted_log_20615/quantile_norm/{species}_liver_pos_ALL.bed", sep="\t", header=None)
mouse_true = pd.read_csv("/home/azstephe/liverRegression/regression_liver/data/sorted_log_20615/quantile_norm/mouse_liver_pos_ALL.bed", sep="\t", header=None)

unique1_2 = both_df[~both_df.duplicated(subset=[1, 2], keep=False)] # entries with unique mac mapped to mouse start and end
all_unique = unique1_2[~unique1_2.duplicated(subset=[11,12], keep=False)]

#### COL1,2 DUPLICATES

duplicates3 = both_df[both_df.duplicated(subset=[1, 2], keep=False)] # entries with duplicated mac mapped to mouse start and end 
grouped_dups3 = duplicates3.groupby([1, 2])[3].apply(list).reset_index(name='col3')

# grouped dups: start | end | [peaks with these endpoints]
grouped_dups3['col3'] = grouped_dups3['col3'].apply(collapse_if_identical) 

grouped_dups3['col3'] = grouped_dups3['col3'].apply(lambda x: keep_largest_signal(x, nonMouse_true)) # redundant endpoints now map to 1 peak in col3
grouped_dups3 = grouped_dups3[grouped_dups3['col3'] != ''] # remove macaque peaks that aren't large enough

# df of duplicated col1,2 with strongest peak
keep_strongestcol3 = duplicates3.merge(grouped_dups3[['col3']], left_on=3, right_on='col3', how='inner').drop('col3', axis=1) # keeps the strongest signal in col3 for redundant endpoints

#### COL11,12 DUPLICATES

grouped_dups13 = keep_strongestcol3.groupby([11, 12])[13].apply(list).reset_index(name='col13')
grouped_dups13['col13'] = grouped_dups13['col13'].apply(collapse_if_identical)

grouped_dups13['col13'] = grouped_dups13['col13'].apply(lambda x: keep_largest_signal(x, mouse_true))
grouped_dups13 = grouped_dups13[grouped_dups13['col13'] != '']

keep_strongestcol13 = keep_strongestcol3.merge(grouped_dups13[['col13']], left_on=13, right_on='col13', how='inner').drop('col13', axis=1)

unique_endpoints = keep_strongestcol13 #rows with unique endpoints from the duplicated endpoints set

####

# col3 peakname duplicates with different endpoints
still_dups_col3 = unique_endpoints[unique_endpoints.duplicated(subset=[3], keep=False)]

# col3peaks | [col13 peaks intersecting col3 peak]
grouped_dcol3 = still_dups_col3.groupby(3)[13].apply(list).reset_index(name='col13')

# get the col13 peak with most overlap of col3
grouped_dcol3['col13'] = grouped_dcol3['col13'].apply(lambda x: get_biggest_overlap(x, 13, still_dups_col3))

merged3 = still_dups_col3.merge(grouped_dcol3[[3, 'col13']], left_on=[3, 13], right_on=[3, 'col13'], how='left', indicator=True)

remove3 = merged3[merged3['_merge'] == 'left_only'].drop(columns=['_merge']) # col13 is what we want to remove

unique_endpoints_subset = unique_endpoints.iloc[:,:20]
remove3_subset = remove3.iloc[:,:20]

# all col3 entries unique
unique3 = unique_endpoints[~unique_endpoints_subset.apply(tuple, axis=1).isin(remove3_subset.apply(tuple, axis=1))] 

####

# col13 peakname duplicates with different endpoints
still_dups_col13 = unique3[unique3.duplicated(subset=[13], keep=False)]

# col13peaks | [col3 peaks intersecting col13 peak]
grouped_dcol13 = still_dups_col13.groupby(13)[3].apply(list).reset_index(name='col3')

# get the col3 peak with most overlap of col13
grouped_dcol13['col3'] = grouped_dcol13['col3'].apply(lambda x: get_biggest_overlap(x, 3, still_dups_col13))

merged13 = still_dups_col13.merge(grouped_dcol13[[13, 'col3']], left_on=[3, 13], right_on=['col3', 13], how='left', indicator=True)

remove13 = merged13[merged13['_merge'] == 'left_only'].drop(columns=['_merge'])

unique3_subset = unique3.iloc[:, :20]
remove13_subset = remove13.iloc[:, :20]

# Identify rows in u3 that are NOT in remove3
filtered_peaks_unique = unique3[~unique3_subset.apply(tuple, axis=1).isin(remove13_subset.apply(tuple, axis=1))]

full_unique = pd.concat([all_unique, filtered_peaks_unique])

# Sort by column '1'
one_to_one_peaks = full_unique.sort_values(by=1).reset_index(drop=True)
one_to_one_peaks.to_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/oneToOnePeaks_qn/{species}_mouse1.bed', header=None, sep='\t')
one_to_one_peaks
##############################



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,chr2,3228507,3228816,peak55529,-1,.,-1,-1,-1,119,chr2,3228532,3229401,peak11427,1000,.,16.72388,152.95140,149.78862,727
1,chr2,3799568,3800261,peak62146,-1,.,-1,-1,-1,632,chr2,3799767,3800338,peak11438,817,.,4.91323,20.94289,18.55824,208
2,chr1,4571620,4572258,peak33144,-1,.,-1,-1,-1,154,chr1,4571316,4572253,peak23,1000,.,11.87366,175.62753,172.38829,420
3,chr2,5249965,5250352,peak29753,-1,.,-1,-1,-1,173,chr2,5250005,5250415,peak11452,746,.,6.23602,32.05004,29.53300,120
4,chr2,5749258,5749747,peak63648,-1,.,-1,-1,-1,364,chr2,5749265,5749600,peak11459,1000,.,7.74779,46.30035,43.65820,169
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388,chr1,194995146,194995539,peak36835,-1,.,-1,-1,-1,203,chr1,194995180,194995620,peak1965,1000,.,8.09555,68.96048,66.16955,196
389,chr1,195016706,195017386,peak33495,-1,.,-1,-1,-1,203,chr1,195017180,195017466,peak1968,1000,.,5.09901,40.06577,37.47466,169
390,chr1,195018117,195018471,peak12739,-1,.,-1,-1,-1,185,chr1,195017987,195018471,peak1969,1000,.,11.09704,148.40295,145.25659,336
391,chr1,195034034,195034269,peak77700,-1,.,-1,-1,-1,151,chr1,195033987,195034343,peak1970,1000,.,12.14839,108.73130,105.74149,201


In [15]:
# QN MODEL FOLDCHANGE
import pandas as pd
import scipy.stats

mhc = 200

def correlate():
    rows = []
    # Lists for correlation calculations
    groups = ['Test2']
    dfs = [test2_foldchange_df.dropna(subset=['non_peak'])]

    # Calculate correlations
    for group, df in zip(groups, dfs):
        x = df['true'].squeeze()
        y = df['pred'].squeeze()
        pearson, pp = scipy.stats.pearsonr(x, y)
        spearman, ps = scipy.stats.spearmanr(x, y)

        mse = mean_squared_error(x, y)

        same_sign = np.sign(df['true']) == np.sign(df['pred'])
        num_ss = same_sign.sum()
        len_ss = len(df)
        perc_ss = num_ss / len_ss
        
        # same_sign_avg = np.sign(df.loc[::2]['true']) == np.sign(df.loc[::2]['avg_pred'])
        # num_ssa = same_sign_avg.sum()
        # len_ssa = len(df) / 2
        # perc_ssa = num_ssa / len_ssa
        
        rows.append({'Group': group, 'Metric': 'Same Sign Count', 'Value': num_ss})
        rows.append({'Group': group, 'Metric': 'Total Count', 'Value': len_ss})
        # print(f'total count: {len_ss}')
        rows.append({'Group': group, 'Metric': 'Same Sign %', 'Value': perc_ss})
     
        rows.append({'Group': group, 'Metric': 'Pearson', 'Value': pearson})
        rows.append({'Group': group, 'Metric': 'Pearson P-Val', 'Value': pp*mhc})
        rows.append({'Group': group, 'Metric': 'Spearman', 'Value': spearman})
        rows.append({'Group': group, 'Metric': 'Spearman P-Val', 'Value': ps*mhc})
        rows.append({'Group': group, 'Metric': 'MSE', 'Value': mse})
        
        
    return pd.DataFrame(rows)

# --- Main Script ---
all_results = []
species_list = ['macaque', 'rat', 'cow', 'pig']
# species_list = ['macaque']

model_list = ['bazc1enn']

for species in species_list:
    for model in model_list:
        model_dir = f'{model}_FINAL'

        nonMouse_true = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/sorted_log_20615/quantile_norm/{species}_liver_pos_ALL.bed', sep="\t", header=None)
        mouse_true = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/sorted_log_20615/quantile_norm/mouse_liver_pos_ALL.bed', sep="\t", header=None)
        one_to_one_peaks_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/oneToOnePeaks_qn/{species}_mouse.bed', header=None, sep='\t')

        peaks = one_to_one_peaks_TEST[[4, 14]]

        merged_df = peaks.merge(nonMouse_true, how='left', left_on=4, right_on=3, suffixes=('', '_non'))
        
        # Aligned orthologous peaks
        merged_df = merged_df.merge(mouse_true, how='left', left_on=14, right_on=3, suffixes=('_NON', '_mouse'))

        #############################################################################
        
        # load MOUSE TEST DF

        pred_mouse_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_mouse_TEST.csv', header=None)
        
        mouse_test = pd.read_csv('/home/azstephe/liverRegression/regression_liver/data/sorted_log_20615/splits/mousePos/mouse_liver_TEST.narrowPeak', sep="\t", header=None)
        
        mouse_test_len = 2*len(mouse_test)

        doubled_mouse_test_df = pd.concat([mouse_test, mouse_test]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_mouse_test_df = doubled_mouse_test_df.rename(columns={0:'mouse_chr'})
    
        pred_mouse_test_df = pred_mouse_TEST.head(mouse_test_len).reset_index(drop=True)
        pred_mouse_test_df = pd.concat([doubled_mouse_test_df, pred_mouse_test_df], axis=1)
        pred_mouse_test_df = pred_mouse_test_df.rename(columns={4:'mouse_true', 0:'mouse_pred'}).reset_index(drop=True)
        
        merged_mouse_test_true_pred = peaks.merge(pred_mouse_test_df, how='left', left_on=14, right_on=3)
        mouse_test_true_pred = merged_mouse_test_true_pred.rename(columns={3:'mouse_peak'}).drop([14], axis=1)

        #############################################################################
        # load nonmouse TEST2 DF
        
        pred_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/model_outputs/{model_dir}/activations_{species}_TEST.csv', header=None)
        
        test1_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_LiuAll_test1/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).iloc[:,4]
        test2_df = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits_qn/log_test2/{species}_liver_TEST_500bp.bed', header=None, delim_whitespace=True).rename(columns={0: 'mac_chr'})
        
        test1_len = 2*len(test1_df)
        test2_len = 2*len(test2_df)
        
        doubled_test2_df = pd.concat([test2_df, test2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        pred_test2_df = pred_TEST.iloc[test1_len:test1_len + test2_len].reset_index(drop=True)
        doubled_test2_combined = pd.concat([doubled_test2_df, pred_test2_df], axis=1)

        non_test2_true_pred = doubled_test2_combined.rename(columns={4: 'non_true', 0:'non_pred'})
        merged_non_test2_true_pred_na = peaks.merge(non_test2_true_pred, how='left', left_on=4, right_on=3).drop(14, axis=1)
        merged_non_test2_true_pred = merged_non_test2_true_pred_na.dropna(subset=[1])
        non_test2_true_pred = merged_non_test2_true_pred.rename(columns={3:'non_peak'}).drop(4, axis=1)

        #############################################################################

        # make TEST2 merged df
        test2_foldchange_df = pd.concat([mouse_test_true_pred, non_test2_true_pred], axis=1)
        
        test2_foldchange_df['true'] = test2_foldchange_df['mouse_true']-test2_foldchange_df['non_true']
        test2_foldchange_df['pred'] = test2_foldchange_df['mouse_pred']-test2_foldchange_df['non_pred']
        
        mouse_av = (test2_foldchange_df.loc[::2, 'mouse_pred'].values + test2_foldchange_df.loc[1::2, 'mouse_pred'].values) / 2
        non_av = (test2_foldchange_df.loc[::2, 'non_pred'].values + test2_foldchange_df.loc[1::2, 'non_pred'].values) / 2
        
        # Add the averages back to the DataFrame as a new column
        test2_foldchange_df.loc[::2, 'mouse_pred_avg'] = mouse_av
        test2_foldchange_df.loc[::2, 'non_pred_avg'] = non_av 
        test2_foldchange_df['avg_pred'] = test2_foldchange_df['mouse_pred_avg']-test2_foldchange_df['non_pred_avg']
        
        corr_df = correlate()
        corr_df['Species'] = species
        corr_df['model'] = model
        all_results.append(corr_df)
        


#############################################################################

summary_df = pd.concat(all_results)

custom_metric_order = ['Same Sign Count', 'Total Count', 'Same Sign %', 'Pearson', 'Pearson P-Val', 'Spearman', 'Spearman P-Val', 'MSE']

summary_df['Metric'] = pd.Categorical(summary_df['Metric'], categories=custom_metric_order, ordered=True)

pivot_df = summary_df.pivot_table(
    index=["Species", "Group", "Metric"],
    columns="model",
    values="Value"
).reset_index()

pivot_df = pivot_df.sort_values(by=["Species", "Group", "Metric"])

pivot_df = pivot_df[["Species", "Group", "Metric"] + model_list]
# pivot_df_reordered = pivot_df[model_list]

def format_value(metric, value):
    """Format values depending on whether it's a P-Val metric or not."""
    if "P-Val" in metric:
        return f"{value:.2e}"   # scientific notation, 3 sig figs

    elif "Count" in metric:
        return f"{value:.0f}" 
    else:
        return f"{value:.3f}"   # regular decimal, 3 sig figs

# Apply formatting to a copy just for display/export
pivot_df_display = pivot_df.copy()
for col in model_list:  # each model column
    pivot_df_display[col] = pivot_df_display.apply(
        lambda row: format_value(row["Metric"], row[col]),
        axis=1
    )

pivot_df_qn = pivot_df_display
display(pivot_df_qn)

output_filename = '/home/azstephe/liverRegression/regression_liver/data/figs/tables/qn_log_model_foldchange_table_FINAL_mse.tsv'
pivot_df_qn.to_csv(output_filename, sep='\t')

# print(f'Results successfully saved to: {output_filename}')

model,Species,Group,Metric,bazc1enn
0,cow,Test2,Same Sign Count,332.0
1,cow,Test2,Total Count,584.0
2,cow,Test2,Same Sign %,0.568
3,cow,Test2,Pearson,0.302
4,cow,Test2,Pearson P-Val,1.72e-11
5,cow,Test2,Spearman,0.277
6,cow,Test2,Spearman P-Val,1.81e-09
7,cow,Test2,MSE,0.722
8,macaque,Test2,Same Sign Count,397.0
9,macaque,Test2,Total Count,670.0


In [33]:
test2_foldchange_df.dropna(subset=['non_peak'])

Unnamed: 0,4,mouse_chr,1,2,mouse_peak,mouse_true,5,mouse_pred,mac_chr,1.1,2.1,non_peak,non_true,non_pred,true,pred,mouse_pred_avg,non_pred_avg,avg_pred
0,peak70463,chr2,3229009,3229509,peak11427,2.982887,,2.073398,,,,,,,,,2.139906,,
1,peak70463,chr2,3229009,3229509,peak11427,2.982887,,2.206415,chr9,14586550.0,14587050.0,peak62146,1.878954,1.191091,1.103933,1.015324,,,
2,peak62146,chr2,3799725,3800225,peak11438,2.042950,,0.865723,chr9,14586550.0,14587050.0,peak62146,1.878954,1.017634,0.163996,-0.151911,0.777564,1.119267,-0.341703
3,peak62146,chr2,3799725,3800225,peak11438,2.042950,,0.689404,chr8,52281336.0,52281836.0,peak33144,2.352545,1.220899,-0.309595,-0.531495,,,
4,peak33144,chr1,4571486,4571986,peak23,2.692907,,1.605112,chr8,52281336.0,52281836.0,peak33144,2.352545,1.392450,0.340362,0.212662,1.672422,1.274834,0.397588
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
781,peak12739,chr1,195018073,195018573,peak1969,2.634955,,0.876024,,,,,,,,,,,
782,peak77700,chr1,195033938,195034438,peak1970,2.711460,,1.023052,,,,,,,,,1.153953,,
783,peak77700,chr1,195033938,195034438,peak1970,2.711460,,1.284853,,,,,,,,,,,
784,peak67562,chr1,195040379,195040879,peak1972,2.646217,,0.897482,,,,,,,,,0.867262,,


In [28]:
one_to_one_peaks_TEST = pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/splits/oneToOnePeaks_qn/{species}_mouse.bed', header=None, sep='\t')
one_to_one_peaks_TEST

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0,chr2,3228939,3229355,peak70463,-1,.,-1,-1,-1,...,chr2,3228532,3229401,peak11427,1000,.,16.72388,152.95140,149.78862,727
1,1,chr2,3799568,3800261,peak62146,-1,.,-1,-1,-1,...,chr2,3799767,3800338,peak11438,817,.,4.91323,20.94289,18.55824,208
2,2,chr1,4571620,4572258,peak33144,-1,.,-1,-1,-1,...,chr1,4571316,4572253,peak23,1000,.,11.87366,175.62753,172.38829,420
3,3,chr2,5249965,5250352,peak29753,-1,.,-1,-1,-1,...,chr2,5250005,5250415,peak11452,746,.,6.23602,32.05004,29.53300,120
4,4,chr2,5749258,5749747,peak63648,-1,.,-1,-1,-1,...,chr2,5749265,5749600,peak11459,1000,.,7.74779,46.30035,43.65820,169
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388,388,chr1,194995146,194995539,peak36835,-1,.,-1,-1,-1,...,chr1,194995180,194995620,peak1965,1000,.,8.09555,68.96048,66.16955,196
389,389,chr1,195016706,195017386,peak33495,-1,.,-1,-1,-1,...,chr1,195017180,195017466,peak1968,1000,.,5.09901,40.06577,37.47466,169
390,390,chr1,195018117,195018471,peak12739,-1,.,-1,-1,-1,...,chr1,195017987,195018471,peak1969,1000,.,11.09704,148.40295,145.25659,336
391,391,chr1,195034034,195034269,peak77700,-1,.,-1,-1,-1,...,chr1,195033987,195034343,peak1970,1000,.,12.14839,108.73130,105.74149,201


In [34]:
test2_foldchange_df.dropna(subset=['non_peak'])

Unnamed: 0,4,mouse_chr,1,2,mouse_peak,mouse_true,5,mouse_pred,mac_chr,1.1,2.1,non_peak,non_true,non_pred,true,pred,mouse_pred_avg,non_pred_avg,avg_pred
1,peak70463,chr2,3229009,3229509,peak11427,2.982887,,2.206415,chr9,14586550.0,14587050.0,peak62146,1.878954,1.191091,1.103933,1.015324,,,
2,peak62146,chr2,3799725,3800225,peak11438,2.042950,,0.865723,chr9,14586550.0,14587050.0,peak62146,1.878954,1.017634,0.163996,-0.151911,0.777564,1.119267,-0.341703
3,peak62146,chr2,3799725,3800225,peak11438,2.042950,,0.689404,chr8,52281336.0,52281836.0,peak33144,2.352545,1.220899,-0.309595,-0.531495,,,
4,peak33144,chr1,4571486,4571986,peak23,2.692907,,1.605112,chr8,52281336.0,52281836.0,peak33144,2.352545,1.392450,0.340362,0.212662,1.672422,1.274834,0.397588
5,peak33144,chr1,4571486,4571986,peak23,2.692907,,1.739732,chr9,12950937.0,12951437.0,peak29753,2.384263,1.157217,0.308644,0.582515,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
720,peak57391,chr1,177495124,177495624,peak1702,2.270858,,0.617959,chr1,158585117.0,158585617.0,peak44813,2.119905,2.235281,0.150952,-1.617322,0.612699,1.482257,-0.869558
721,peak57391,chr1,177495124,177495624,peak1702,2.270858,,0.607438,chr1,158733525.0,158734025.0,peak36835,2.136445,0.729233,0.134413,-0.121794,,,
722,peak67300,chr1,177684857,177685357,peak1706,2.625862,,0.241606,chr1,158733525.0,158734025.0,peak36835,2.136445,0.452434,0.489417,-0.210828,0.296316,,
724,peak25128,chr1,177702799,177703299,peak1710,2.616382,,2.079636,chr1,158759173.0,158759673.0,peak12739,2.384171,1.245446,0.232211,0.834190,2.131734,1.064380,1.067354


In [31]:
df_cleaned = merged_non_test2_true_pred.dropna(subset=[1])
df_cleaned

Unnamed: 0,4,mac_chr,1,2,3,non_true,non_pred
1,peak62146,chr9,14586550.0,14587050.0,peak62146,1.878954,1.191091
2,peak62146,chr9,14586550.0,14587050.0,peak62146,1.878954,1.017634
3,peak33144,chr8,52281336.0,52281836.0,peak33144,2.352545,1.220899
4,peak33144,chr8,52281336.0,52281836.0,peak33144,2.352545,1.392450
5,peak29753,chr9,12950937.0,12951437.0,peak29753,2.384263,1.157217
...,...,...,...,...,...,...,...
720,peak44813,chr1,158585117.0,158585617.0,peak44813,2.119905,2.235281
721,peak36835,chr1,158733525.0,158734025.0,peak36835,2.136445,0.729233
722,peak36835,chr1,158733525.0,158734025.0,peak36835,2.136445,0.452434
724,peak12739,chr1,158759173.0,158759673.0,peak12739,2.384171,1.245446
