In [1]:
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
import numpy as np

def correlations(pred_df, true_df, group):
    """Print the Pearson and Spearman correlation between true and predicted values.

    Both inputs are reduced with ``.squeeze()`` before being passed to SciPy, so
    a single-column DataFrame is handled as a 1-D Series.

    Args:
        pred_df: Predicted values (any pandas object with ``.squeeze()``).
        true_df: True values, paired element-by-element with ``pred_df``.
        group: Label inserted into the printed lines to identify the data set.

    Returns:
        None. Two lines go to stdout, each showing the coefficient with four
        decimals and its p-value with four significant digits.
    """
    truth, predicted = true_df.squeeze(), pred_df.squeeze()

    r_pearson, p_pearson = scipy.stats.pearsonr(truth, predicted)
    print(f"Pearson correlation coefficient for {group}: {r_pearson:.4f}, p-test: {p_pearson:.4g}")

    r_spearman, p_spearman = scipy.stats.spearmanr(truth, predicted)
    print(f"Spearman correlation coefficient for {group}: {r_spearman:.4f}, p-test: {p_spearman:.4g}")
def scatter(pred_df, true_df, title, xlabel, ylabel, lim=(0, 4)):
    """Scatter predicted against true values with a dashed y = x reference line.

    Args:
        pred_df: Predicted values, plotted on the y axis (squeezed to 1-D).
        true_df: True values, plotted on the x axis (squeezed to 1-D).
        title: Figure title.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
        lim: ``(low, high)`` range applied to both axes and to the reference
            line. Defaults to ``(0, 4)``, the range that was previously
            hard-coded. Points outside this range are not visible.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    x = true_df.squeeze()
    y = pred_df.squeeze()
    low, high = lim

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.set_xlim(low, high)
    ax.set_ylim(low, high)
    ax.plot([low, high], [low, high], color='#015088', linestyle='--', label='y = x')
    # Optional cream background, kept for reference:
    # fig.set_facecolor('#f3f0dfff')
    # ax.set_facecolor('#f3f0dfff')
    ax.scatter(x, y, alpha=0.2)
    ax.set_xlabel(f'{xlabel}')
    ax.set_ylabel(f'{ylabel}')
    ax.set_title(f'{title}')
    ax.grid(True)
    # The reference line carries a label; without a legend it was never displayed.
    ax.legend()
    plt.show()

def histogram(pred_df, title, xlabel):
    """Show a 50-bin histogram of ``pred_df`` with the x axis fixed to [0, 4].

    Args:
        pred_df: Values to bin; passed straight to matplotlib's ``hist``.
        title: Plot title.
        xlabel: Label for the x axis (the y axis is always labelled 'Count').

    Returns:
        None. Draws on the current axes and calls ``plt.show()``.
    """
    axes = plt.gca()
    axes.hist(pred_df, bins=50)
    axes.set_xlim(0, 4)
    axes.set_xlabel(f'{xlabel}')
    axes.set_ylabel('Count')
    axes.set_title(f'{title}')
    plt.show()

def add_stats(df):
    """Add per-row ``mean``, ``range`` and ``std`` columns to ``df`` in place.

    All three statistics are computed from the same snapshot of the data
    columns. Previously ``mean`` was written into ``df`` first, so ``std`` was
    taken over the data plus the freshly added ``mean`` and ``range`` columns
    and came out wrong. Columns already named ``mean``, ``range`` or ``std``
    are excluded from the snapshot, so calling the function twice gives the
    same result as calling it once.

    Args:
        df: DataFrame whose remaining columns are the per-row samples. It is
            modified in place.

    Returns:
        None.
    """
    derived = ['mean', 'range', 'std']
    # Boolean mask rather than a label list so duplicate column labels survive.
    samples = df.loc[:, ~df.columns.isin(derived)]

    row_mean = samples.mean(axis=1)
    row_range = samples.max(axis=1) - samples.min(axis=1)
    row_std = samples.std(axis=1)

    df['mean'] = row_mean
    df['range'] = row_range
    df['std'] = row_std

def do_scatter(x, y, xlabel, ylabel, title, combined):
    """Scatter ``y`` against ``x`` with each point coloured by ``combined['std']``.

    Args:
        x: Values for the x axis.
        y: Values for the y axis.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
        title: Plot title.
        combined: Frame providing a ``'std'`` column, one value per plotted
            point and in the same row order as ``x`` and ``y``.

    Returns:
        None. Draws on the current figure and calls ``plt.show()``.
    """
    # Draw the points once. An earlier uncoloured scatter of the same points sat
    # underneath this layer and tinted the colour map.
    sc = plt.scatter(x, y, c=combined['std'], cmap='coolwarm', alpha=0.6)  # 'coolwarm' goes from blue to red
    plt.colorbar(sc, label='Standard Deviation')  # adds a color bar legend
    plt.xlabel(f'{xlabel}')
    plt.ylabel(f'{ylabel}')
    plt.title(f'{title}')
    plt.grid(True)
    plt.show()
    
def bayesian_scatter(combined, xaxis, yaxis):
    """Plot ``yaxis`` against ``xaxis`` for every row, then once per label.

    Four figures are produced through ``do_scatter``: one for the whole frame
    ('predicted all'), then one each for the rows whose ``'label'`` column
    equals 'val1', 'val2' and 'val3'.

    Args:
        combined: Frame containing the two plotted columns and a ``'label'``
            column; ``do_scatter`` additionally reads its ``'std'`` column.
        xaxis: Name of the column plotted on the x axis (also the axis label).
        yaxis: Name of the column plotted on the y axis (also the axis label).

    Returns:
        None.
    """
    x_col, y_col = f'{xaxis}', f'{yaxis}'

    do_scatter(combined[x_col], combined[y_col], x_col, y_col, 'predicted all', combined)

    for split in ('val1', 'val2', 'val3'):
        subset = combined[combined['label'] == split]
        do_scatter(subset[x_col], subset[y_col], x_col, y_col, f'predicted {split}', subset)

In [None]:
# nonmouse log train, val2, val3, test2, test3, test orthologs, val orthologs?

def correlate():
    """Print correlations for the splits currently selected for reporting.

    Takes no arguments: the prediction / truth frames are read from notebook
    globals that the driver loop assigns before each call. Entries that are
    commented out in the table are switched off.
    """
    reported = (
        # (pred_val2_df, doubled_val2_df, 'val2'),
        (pred_test2_df, doubled_test2_df, 'test2'),
        # (pred_val3_df, doubled_val3_df, 'val3'),
        (pred_test3_df, doubled_test3_df, 'test3'),
        # (pred_pos_df, doubled_pos_df, 'mouse test orthologs'),
    )
    for predictions, targets, split_name in reported:
        correlations(predictions, targets, split_name)
    
# Species and model run IDs to sweep; swap in one of the commented lists to narrow the run.
species_list = ['macaque', 'rat', 'cow', 'pig']
# species_list = ['cow', 'pig']
# species_list = ['cow']
# model_list = ['bdbi7l3n', '7vsdq5k2', 'wnfdrgcc', '8i7h7nsh', 'ph4wrpxu']
# model_list = ['bdbi7l3n']
model_list = ['bdbi7l3n', '7vsdq5k2', 'wnfdrgcc', '8i7h7nsh']

# Root of the data tree; every input path below is built from it.
DATA_DIR = '/home/azstephe/liverRegression/regression_liver/data'


def _read_targets(path):
    """Return column index 4 of a header-less, whitespace-delimited peak file."""
    # sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True.
    return pd.read_csv(path, header=None, sep=r'\s+').iloc[:, 4]


def _double_rows(values):
    """Repeat every row twice in place (a, a, b, b, ...) and renumber from 0.

    The stable mergesort on the duplicated index keeps the two copies of each
    row adjacent and in file order.
    """
    return pd.concat([values, values]).sort_index(kind='mergesort').reset_index(drop=True)


for species in species_list:
    print(f'\n{species}')
    for model in model_list:
        print(f'\n{model}')

        # ---- test splits: predictions are stacked test1, test2, test3 ----
        pred_df = pd.read_csv(f'{DATA_DIR}/model_outputs/{model}_LiuAll/activations_{species}_TEST.csv', header=None)

        test1_df = _read_targets(f'{DATA_DIR}/test_splits/amy_test1/{species}_liver_TEST_500bp.bed')
        test2_df = _read_targets(f'{DATA_DIR}/test_splits/log_LiuAll_test2/{species}_liver_TEST_500bp.bed')
        test3_df = _read_targets(f'{DATA_DIR}/test_splits/log_LiuAll_test3/{species}_liver_TEST_500bp.bed')

        # Two prediction rows are expected per peak
        # (presumably forward + reverse complement -- TODO confirm).
        test1_len = 2 * len(test1_df)
        test2_len = 2 * len(test2_df)
        test3_len = 2 * len(test3_df)

        # Input sanity check. Reported but not fatal, as before; a mismatch means
        # the slices below may be misaligned with their targets.
        if len(pred_df) != test1_len + test2_len + test3_len:
            print("ERROR1: predictions are a different length than test sets")

        doubled_test1_df = _double_rows(test1_df)
        doubled_test2_df = _double_rows(test2_df)
        doubled_test3_df = _double_rows(test3_df)

        pred_test1_df = pred_df.head(test1_len)
        pred_test2_df = pred_df.iloc[test1_len:test1_len + test2_len]
        pred_test3_df = pred_df.tail(test3_len)

        # print(f'average test1 prediction: {pred_test1_df.mean()}')

        # ---- ortholog test set: predictions are stacked positives, then negatives ----
        pred_orthologs_df = pd.read_csv(f'{DATA_DIR}/model_outputs/{model}_LiuAll/activations_{species}_TEST_orthologs.csv', header=None)

        pos_df = _read_targets(f'{DATA_DIR}/test_splits/log_pos_LiuAll/{species}_liver_TEST_500bp.bed')
        neg_df = _read_targets(f'{DATA_DIR}/test_splits/neg/{species}_liver_TEST_500bp.bed')

        neg_len = 2 * len(neg_df)
        pos_len = 2 * len(pos_df)

        # Input sanity check (non-fatal).
        if len(pred_orthologs_df) != neg_len + pos_len:
            print("ERROR2: predictions are a different length than ortholog test sets")

        doubled_neg_df = _double_rows(neg_df)
        doubled_pos_df = _double_rows(pos_df)

        pred_pos_df = pred_orthologs_df.head(pos_len)
        pred_neg_df = pred_orthologs_df.tail(neg_len)

        # print(f'average neg test prediction: {pred_neg_df.mean()}')

        # File name of the species-specific negative validation set. It is not read
        # in this cell; the assignment is kept in case other cells use `neg`.
        neg = 'nonMacaque_liver_andRat_andCow_andPig_VAL_500bp.bed'
        if species == 'rat':
            neg = 'nonRat_liver_andMacaque_andCow_andPig_VAL_500bp.bed'
        elif species == 'cow':
            neg = 'nonCow_liver_andMacaque_andRat_andPig_VAL_500bp.bed'
        elif species == 'pig':
            neg = 'nonPig_liver_andMacaque_andRat_andCow_VAL_500bp.bed'

        # ---- validation splits: predictions are stacked val1, val2, val3 ----
        # NOTE: `pred_df` is rebound here; the test slices above keep their own data.
        pred_df = pd.read_csv(f'{DATA_DIR}/model_outputs/{model}/activations_{species}_VAL.csv', header=None)

        val1_df = _read_targets(f'{DATA_DIR}/splits/val1/{species}_liver_VAL.narrowPeak')
        val2_df = _read_targets(f'{DATA_DIR}/splits/log_val2/{species}_liver_VAL.narrowPeak')
        val3_df = _read_targets(f'{DATA_DIR}/splits/log_val3/{species}_liver_VAL.narrowPeak')

        val1_len = 2 * len(val1_df)
        val2_len = 2 * len(val2_df)
        val3_len = 2 * len(val3_df)

        # Input sanity check (non-fatal).
        if len(pred_df) != val1_len + val2_len + val3_len:
            print("ERROR: predictions are a different length than validation sets")

        doubled_val1_df = _double_rows(val1_df)
        doubled_val2_df = _double_rows(val2_df)
        doubled_val3_df = _double_rows(val3_df)

        pred_val1_df = pred_df.head(val1_len)
        pred_val2_df = pred_df.iloc[val1_len:val1_len + val2_len]
        pred_val3_df = pred_df.tail(val3_len)

        doubled_combined = pd.concat([doubled_val1_df, doubled_val2_df, doubled_val3_df])

        # print(f'average val1 prediction: {pred_val1_df.mean()}')

        # correlate() reads the pred_* / doubled_* globals assigned above.
        correlate()



        # scatter(pred_test2_df, doubled_test2_df, 'True vs predicted for test2', 'True', 'Predicted')
        # scatter(pred_test3_df, doubled_test3_df, 'True vs predicted for test3', 'True', 'Predicted')
        # scatter(pred_pos_df, doubled_pos_df, 'True vs predicted for orthologs', 'True', 'Predicted')


        # histogram(pred_test1_df, 'test1 predictions', 'Predicted signal')
        # histogram(pred_test2_df, 'test2 predictions', 'Predicted signal')
        # histogram(pred_test3_df, 'test3 predictions', 'Predicted signal')
    
        # histogram(pred_neg_df, 'negative test predictions', 'Predicted signal')
        # histogram(pred_pos_df, 'positive test predictions', 'Predicted signal')


macaque

bdbi7l3n
Pearson correlation coefficient for test2: 0.2895, p-test: 1.114e-19
Spearman correlation coefficient for test2: 0.2737, p-test: 1.119e-17
Pearson correlation coefficient for test3: 0.3420, p-test: 3.084e-70
Spearman correlation coefficient for test3: 0.3690, p-test: 2.539e-82

7vsdq5k2
Pearson correlation coefficient for test2: 0.2824, p-test: 9.166e-19
Spearman correlation coefficient for test2: 0.2789, p-test: 2.492e-18
Pearson correlation coefficient for test3: 0.3876, p-test: 2.35e-91
Spearman correlation coefficient for test3: 0.4110, p-test: 1.485e-103

wnfdrgcc
Pearson correlation coefficient for test2: 0.3053, p-test: 8.266e-22
Spearman correlation coefficient for test2: 0.2986, p-test: 6.871e-21
Pearson correlation coefficient for test3: 0.4113, p-test: 9.851e-104
Spearman correlation coefficient for test3: 0.4308, p-test: 1.072e-114

8i7h7nsh
Pearson correlation coefficient for test2: 0.2867, p-test: 2.581e-19
Spearman correlation coefficient for test2: 0.

In [None]:
# macaque all
species_list = ['macaque']
model_list = ['bdbi7l3n']

# Root of the data tree; every input path below is built from it.
DATA_DIR = '/home/azstephe/liverRegression/regression_liver/data'
# Peak files are header-less and whitespace-delimited. sep=r'\s+' is the
# documented equivalent of the deprecated delim_whitespace=True.
PEAK_READ_OPTS = dict(header=None, sep=r'\s+')

for species in species_list:
    print(f'\n{species}')
    for model in model_list:
        print(f'\n{model}')

        # ---- full positive set, and train + val stacked in that order ----
        pred_df = pd.read_csv(f'{DATA_DIR}/model_outputs/{model}/activations_{species}_ALL.csv', header=None)
        pred_train_val_df = pd.read_csv(f'{DATA_DIR}/model_outputs/{model}/activations_{species}_TRAIN_VAL.csv', header=None)

        # Column index 4 holds the target value. The paths follow `species`
        # rather than a hard-coded 'macaque' so they always match the predictions.
        all_df = pd.read_csv(f'{DATA_DIR}/log/{species}_liver_pos_ALL.bed', **PEAK_READ_OPTS).iloc[:, 4]
        train_df = pd.read_csv(f'{DATA_DIR}/splits/logPos/{species}_liver_TRAINONLY.narrowPeak', **PEAK_READ_OPTS).iloc[:, 4]
        val_df = pd.read_csv(f'{DATA_DIR}/splits/logPos/{species}_liver_VAL.narrowPeak', **PEAK_READ_OPTS).iloc[:, 4]

        # Two prediction rows are expected per peak
        # (presumably forward + reverse complement -- TODO confirm).
        all_len = 2 * len(all_df)
        train_len = 2 * len(train_df)
        val_len = 2 * len(val_df)

        # Input sanity checks. Reported but not fatal, as before.
        if len(pred_df) != all_len:
            print("ERROR: ALL predictions are a different length than the full peak set")

        if len(pred_train_val_df) != train_len + val_len:
            print("ERROR: TRAIN_VAL predictions are a different length than the train + val sets")

        # Duplicate each target row in place (a, a, b, b, ...); the stable sort
        # keeps the two copies adjacent and in file order.
        doubled_all_df = pd.concat([all_df, all_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_train_df = pd.concat([train_df, train_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_val_df = pd.concat([val_df, val_df]).sort_index(kind='mergesort').reset_index(drop=True)

        doubled_train_val_df = pd.concat([doubled_train_df, doubled_val_df]).reset_index(drop=True)

        correlations(pred_df, doubled_all_df, f'{species} all')
        correlations(pred_train_val_df, doubled_train_val_df, f'{species} train and val')
        correlations(pred_train_val_df.head(train_len), doubled_train_df, f'{species} train')
        correlations(pred_train_val_df.tail(val_len), doubled_val_df, f'{species} val')

        # ---- ortholog test set: predictions are stacked positives, then negatives ----
        pred_orthologs_df = pd.read_csv(f'{DATA_DIR}/model_outputs/{model}/activations_{species}_TEST_orthologs.csv', header=None)

        pos_df = pd.read_csv(f'{DATA_DIR}/test_splits/log_pos/{species}_liver_TEST_500bp.bed', **PEAK_READ_OPTS).iloc[:, 4]
        neg_df = pd.read_csv(f'{DATA_DIR}/test_splits/neg/{species}_liver_TEST_500bp.bed', **PEAK_READ_OPTS).iloc[:, 4]

        neg_len = 2 * len(neg_df)
        pos_len = 2 * len(pos_df)

        # Input sanity check (non-fatal).
        if len(pred_orthologs_df) != neg_len + pos_len:
            print("ERROR: ortholog predictions are a different length than the pos + neg test sets")

        doubled_neg_df = pd.concat([neg_df, neg_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_pos_df = pd.concat([pos_df, pos_df]).sort_index(kind='mergesort').reset_index(drop=True)

        pred_pos_df = pred_orthologs_df.head(pos_len)
        pred_neg_df = pred_orthologs_df.tail(neg_len)

        correlations(pred_pos_df, doubled_pos_df, f'{species} test')


        # scatter(pred_test2_df, doubled_test2_df, 'True vs predicted for test2', 'True', 'Predicted')
        # scatter(pred_test3_df, doubled_test3_df, 'True vs predicted for test3', 'True', 'Predicted')
        # scatter(pred_pos_df, doubled_pos_df, 'True vs predicted for orthologs', 'True', 'Predicted')


        # histogram(pred_test1_df, 'test1 predictions', 'Predicted signal')
        # histogram(pred_test2_df, 'test2 predictions', 'Predicted signal')
        # histogram(pred_test3_df, 'test3 predictions', 'Predicted signal')
    
        # histogram(pred_neg_df, 'negative test predictions', 'Predicted signal')
        # histogram(pred_pos_df, 'positive test predictions', 'Predicted signal')
