In [44]:
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
import numpy as np

def correlations(pred_df, true_df, group):
    """Print the Pearson and Spearman correlation between true and predicted values.

    Args:
        pred_df: Predicted values; ``.squeeze()`` is applied before use.
        true_df: True values; ``.squeeze()`` is applied before use.
        group: Label interpolated into both printed lines.

    Returns:
        None. One line per statistic is written to stdout, Pearson first.
    """
    truth, predicted = true_df.squeeze(), pred_df.squeeze()

    # Both scipy calls return (statistic, p-value); the true values go first.
    r_value, r_pvalue = scipy.stats.pearsonr(truth, predicted)
    print(f"Pearson correlation coefficient for {group}: {r_value:.4f}, p-test: {r_pvalue:.4g}")

    rho_value, rho_pvalue = scipy.stats.spearmanr(truth, predicted)
    print(f"Spearman correlation coefficient for {group}: {rho_value:.4f}, p-test: {rho_pvalue:.4g}")
def scatter(pred_df, true_df, title, xlabel, ylabel):
    """Scatter predicted against true values on fixed 0-4 axes with a y = x guide.

    Args:
        pred_df: Predicted values, plotted on the y axis (squeezed before use).
        true_df: True values, plotted on the x axis (squeezed before use).
        title: Figure title.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.

    Returns:
        None. A new 8x6 figure is created and shown.
    """
    x = true_df.squeeze()
    y = pred_df.squeeze()

    plt.figure(figsize=(8, 6))
    plt.xlim(0, 4)
    plt.ylim(0, 4)
    # Dashed diagonal marks perfect agreement between prediction and truth.
    plt.plot([0, 4], [0, 4], color='#015088', linestyle='--', label='y = x')
    # plt.gcf().set_facecolor('#f3f0dfff')
    # plt.gca().set_facecolor('#f3f0dfff')
    plt.scatter(x, y, alpha=0.2)
    plt.xlabel(f'{xlabel}')
    plt.ylabel(f'{ylabel}')
    plt.title(f'{title}')
    # Without a legend the 'y = x' label set on the guide line is never displayed.
    plt.legend()
    plt.grid(True)
    plt.show()

def histogram(pred_df, title, xlabel):
    """Show a 50-bin histogram of ``pred_df`` with the x axis fixed to 0-4.

    Args:
        pred_df: Values to bin.
        title: Plot title.
        xlabel: Label for the x axis; the y axis is always labelled 'Count'.

    Returns:
        None. The plot is drawn on the current axes and shown.
    """
    axes = plt.gca()
    axes.hist(pred_df, bins=50)  # , color='#015088'
    axes.set_xlim(0, 4)
    # axes.figure.set_facecolor('#f3f0dfff')
    # axes.set_facecolor('#f3f0dfff')
    axes.set_title(f'{title}')
    axes.set_xlabel(f'{xlabel}')
    axes.set_ylabel('Count')
    plt.show()

def add_stats(df):
    """Append per-row 'mean', 'range' and 'std' columns to ``df`` in place.

    All three statistics are computed from the same set of value columns: every
    column of ``df`` except 'mean', 'range' and 'std' themselves. Computing them
    before any is assigned keeps the derived columns out of each other's
    calculation, and makes a second call on the same frame give the same result.

    Args:
        df: DataFrame whose rows are summarised; it is modified in place.

    Returns:
        None.
    """
    stat_cols = ('mean', 'range', 'std')
    value_cols = [col for col in df.columns if col not in stat_cols]
    values = df[value_cols]

    row_mean = values.mean(axis=1)
    row_range = values.max(axis=1) - values.min(axis=1)
    row_std = values.std(axis=1)

    df['mean'] = row_mean
    df['range'] = row_range
    df['std'] = row_std

def do_scatter(x, y, xlabel, ylabel, title, combined):
    """Scatter ``y`` against ``x`` with points coloured by ``combined['std']``.

    Args:
        x: Values for the x axis.
        y: Values for the y axis.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
        title: Plot title.
        combined: Frame providing the 'std' column used for point colour; its
            rows must correspond one-to-one with ``x`` and ``y``.

    Returns:
        None. The plot is shown.
    """
    # Draw the points once only: an extra uncoloured scatter underneath would
    # show through the alpha=0.6 markers and distort the colour scale.
    sc = plt.scatter(x, y, c=combined['std'], cmap='coolwarm', alpha=0.6)  # 'coolwarm' goes from blue to red
    plt.colorbar(sc, label='Standard Deviation')  # adds a color bar legend
    plt.xlabel(f'{xlabel}')
    plt.ylabel(f'{ylabel}')
    plt.title(f'{title}')
    plt.grid(True)
    plt.show()
    
def bayesian_scatter(combined, xaxis, yaxis, labels=None):
    """Plot ``yaxis`` against ``xaxis`` for all rows, then once per split label.

    Args:
        combined: Frame containing the ``xaxis`` and ``yaxis`` columns plus the
            'label' and 'std' columns ('std' is read by ``do_scatter``).
        xaxis: Name of the column plotted on the x axis.
        yaxis: Name of the column plotted on the y axis.
        labels: Optional iterable of 'label' values to plot individually, in
            order. Defaults to the non-null labels present in ``combined`` in
            order of first appearance, so frames labelled 'val1'..'val3' and
            frames labelled 'test1'..'test3' both get per-split plots.

    Returns:
        None. One plot is shown for the full frame and one per label.
    """
    do_scatter(combined[f'{xaxis}'], combined[f'{yaxis}'], f'{xaxis}', f'{yaxis}', 'predicted all', combined)

    if labels is None:
        labels = [label for label in combined['label'].unique() if pd.notna(label)]

    for label in labels:
        subset = combined[combined['label'] == label]
        do_scatter(subset[f'{xaxis}'], subset[f'{yaxis}'], f'{xaxis}', f'{yaxis}', f'predicted {label}', subset)

In [32]:
def correlate():
    """Print correlations for val2, test2, val3, test3 and the ortholog positives.

    Takes no arguments: the ``pred_*_df`` / ``doubled_*_df`` names are looked up
    as notebook globals when the function is called.
    """
    correlations(pred_df=pred_val2_df, true_df=doubled_val2_df, group='val2')
    correlations(pred_df=pred_test2_df, true_df=doubled_test2_df, group='test2')
    correlations(pred_df=pred_val3_df, true_df=doubled_val3_df, group='val3')
    correlations(pred_df=pred_test3_df, true_df=doubled_test3_df, group='test3')

    # Blank lines separate the per-split results from the ortholog result.
    print('\n')
    correlations(pred_df=pred_pos_df, true_df=doubled_pos_df, group='mouse orthologs')
    
species_list = ['macaque', 'rat', 'cow', 'pig']
# species_list = ['rat']
model_list = ['bdbi7l3n', '7vsdq5k2']

# Root of the data tree; every file read in this cell lives under it.
DATA_DIR = '/home/azstephe/liverRegression/regression_liver/data'

for species in species_list:
    print(f'\n{species}')
    for model in model_list:
        print(f'\n{model}')

        # ---- TEST split ----
        pred_df = pd.read_csv(f'{DATA_DIR}/model_outputs/{model}/activations_{species}_TEST.csv', header=None)

        # Column 4 of each BED file is the value compared against the predictions.
        # sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
        test1_df = pd.read_csv(f'{DATA_DIR}/test_splits/amy_test1/{species}_liver_TEST_500bp.bed', header=None, sep=r'\s+').iloc[:, 4]
        test2_df = pd.read_csv(f'{DATA_DIR}/test_splits/log_test2/{species}_liver_TEST_500bp.bed', header=None, sep=r'\s+').iloc[:, 4]
        test3_df = pd.read_csv(f'{DATA_DIR}/test_splits/log_test3/{species}_liver_TEST_500bp.bed', header=None, sep=r'\s+').iloc[:, 4]

        # The prediction file is expected to hold two rows per BED row.
        test1_len = 2 * len(test1_df)
        test2_len = 2 * len(test2_df)
        test3_len = 2 * len(test3_df)

        # input sanity check
        if len(pred_df) != test1_len + test2_len + test3_len:
            print("ERROR: predictions are a different length than test sets")

        # Repeat every true value twice in place (v0, v0, v1, v1, ...) so the
        # truth lines up row-for-row with the doubled predictions.
        # NOTE(review): presumably the two rows are a sequence and its reverse complement - confirm.
        doubled_test1_df = pd.concat([test1_df, test1_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test2_df = pd.concat([test2_df, test2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test3_df = pd.concat([test3_df, test3_df]).sort_index(kind='mergesort').reset_index(drop=True)

        # Predictions are sliced in file order: test1, then test2, then test3.
        pred_test1_df = pred_df.head(test1_len)
        pred_test2_df = pred_df.iloc[test1_len:test1_len + test2_len]
        pred_test3_df = pred_df.tail(test3_len)

        # print(f'average test1 prediction: {pred_test1_df.mean()}')

        # ---- TEST orthologs: positives are sliced from the head, negatives from the tail ----
        pred_orthologs_df = pd.read_csv(f'{DATA_DIR}/model_outputs/{model}/activations_{species}_TEST_orthologs.csv', header=None)

        neg_df = pd.read_csv(f'{DATA_DIR}/test_splits/neg/{species}_liver_TEST_500bp.bed', header=None, sep=r'\s+').iloc[:, 4]
        pos_df = pd.read_csv(f'{DATA_DIR}/test_splits/log_pos/{species}_liver_TEST_500bp.bed', header=None, sep=r'\s+').iloc[:, 4]

        neg_len = 2 * len(neg_df)
        pos_len = 2 * len(pos_df)

        # input sanity check
        if len(pred_orthologs_df) != neg_len + pos_len:
            print("ERROR: predictions are a different length than test sets")

        doubled_neg_df = pd.concat([neg_df, neg_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_pos_df = pd.concat([pos_df, pos_df]).sort_index(kind='mergesort').reset_index(drop=True)

        pred_pos_df = pred_orthologs_df.head(pos_len)
        pred_neg_df = pred_orthologs_df.tail(neg_len)

        # print(f'average neg test prediction: {pred_neg_df.mean()}')

        # NOTE(review): `neg` is chosen per species but never read in this cell -
        # confirm whether a validation-negatives load is missing.
        neg = 'nonMacaque_liver_andRat_andCow_andPig_VAL_500bp.bed'
        if species == 'rat':
            neg = 'nonRat_liver_andMacaque_andCow_andPig_VAL_500bp.bed'
        elif species == 'cow':
            neg = 'nonCow_liver_andMacaque_andRat_andPig_VAL_500bp.bed'
        elif species == 'pig':
            neg = 'nonPig_liver_andMacaque_andRat_andCow_VAL_500bp.bed'

        # ---- VAL split (rebinds pred_df to the validation predictions) ----
        pred_df = pd.read_csv(f'{DATA_DIR}/model_outputs/{model}/activations_{species}_VAL.csv', header=None)

        val1_df = pd.read_csv(f'{DATA_DIR}/splits/val1/{species}_liver_VAL.narrowPeak', header=None, sep=r'\s+').iloc[:, 4]
        val2_df = pd.read_csv(f'{DATA_DIR}/splits/log_val2/{species}_liver_VAL.narrowPeak', header=None, sep=r'\s+').iloc[:, 4]
        val3_df = pd.read_csv(f'{DATA_DIR}/splits/log_val3/{species}_liver_VAL.narrowPeak', header=None, sep=r'\s+').iloc[:, 4]

        val1_len = 2 * len(val1_df)
        val2_len = 2 * len(val2_df)
        val3_len = 2 * len(val3_df)

        # input sanity check
        if len(pred_df) != val1_len + val2_len + val3_len:
            print("ERROR: predictions are a different length than validation sets")

        doubled_val1_df = pd.concat([val1_df, val1_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_val2_df = pd.concat([val2_df, val2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_val3_df = pd.concat([val3_df, val3_df]).sort_index(kind='mergesort').reset_index(drop=True)

        pred_val1_df = pred_df.head(val1_len)
        pred_val2_df = pred_df.iloc[val1_len:val1_len + val2_len]
        pred_val3_df = pred_df.tail(val3_len)

        doubled_combined = pd.concat([doubled_val1_df, doubled_val2_df, doubled_val3_df])

        # print(f'average val1 prediction: {pred_val1_df.mean()}')

        # correlate() reads the pred_* / doubled_* globals assigned above.
        correlate()



        # scatter(pred_test2_df, doubled_test2_df, 'True vs predicted for test2', 'True', 'Predicted')
        # scatter(pred_test3_df, doubled_test3_df, 'True vs predicted for test3', 'True', 'Predicted')
        # scatter(pred_pos_df, doubled_pos_df, 'True vs predicted for orthologs', 'True', 'Predicted')


        # histogram(pred_test1_df, 'test1 predictions', 'Predicted signal')
        # histogram(pred_test2_df, 'test2 predictions', 'Predicted signal')
        # histogram(pred_test3_df, 'test3 predictions', 'Predicted signal')
    
        # histogram(pred_neg_df, 'negative test predictions', 'Predicted signal')
        # histogram(pred_pos_df, 'positive test predictions', 'Predicted signal')



macaque

bdbi7l3n
Pearson correlation coefficient for val2: 0.4373, p-test: 2.669e-32
Spearman correlation coefficient for val2: 0.4312, p-test: 2.358e-31
Pearson correlation coefficient for test2: 0.2895, p-test: 1.114e-19
Spearman correlation coefficient for test2: 0.2737, p-test: 1.119e-17
Pearson correlation coefficient for val3: 0.3299, p-test: 2.59e-46
Spearman correlation coefficient for val3: 0.3453, p-test: 7.484e-51
Pearson correlation coefficient for test3: 0.3354, p-test: 9.074e-64
Spearman correlation coefficient for test3: 0.3595, p-test: 1.174e-73


Pearson correlation coefficient for mouse orthologs: 0.3266, p-test: 1.034e-130
Spearman correlation coefficient for mouse orthologs: 0.3461, p-test: 1.444e-147

7vsdq5k2
Pearson correlation coefficient for val2: 0.3985, p-test: 1.267e-26
Spearman correlation coefficient for val2: 0.4231, p-test: 3.885e-30
Pearson correlation coefficient for test2: 0.2824, p-test: 9.166e-19
Spearman correlation coefficient for test2: 0.2789,

In [35]:
def correlate():
    """Print correlations for the train, val and test positives.

    Takes no arguments: the ``pred_*_df`` / ``doubled_*_df`` names are looked up
    as notebook globals when the function is called.
    """
    correlations(pred_df=pred_train_df, true_df=doubled_train_df, group='train, pos only')
    correlations(pred_df=pred_val_df, true_df=doubled_val_df, group='val, pos only')
    correlations(pred_df=pred_pos_df, true_df=doubled_pos_df, group='test, pos only')
    
model_list = ['bdbi7l3n', '7vsdq5k2']
species = 'mouse'

# Root of the data tree used by every read below except the validation negatives.
DATA_DIR = '/home/azstephe/liverRegression/regression_liver/data'

for model in model_list:
    print(f'\n{model}')

    # ---- TEST: positives are sliced from the head, negatives from the tail ----
    pred_df = pd.read_csv(f'{DATA_DIR}/model_outputs/{model}/activations_{species}_TEST.csv', header=None)

    # Column 4 of each BED/narrowPeak file is the value compared against the predictions.
    # sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
    pos_df = pd.read_csv(f'{DATA_DIR}/test_splits/log_pos/mouse_liver_TEST_500bp.bed', header=None, sep=r'\s+').iloc[:, 4]
    neg_df = pd.read_csv(f'{DATA_DIR}/test_splits/neg/mouse_liver_TEST_500bp.bed', header=None, sep=r'\s+').iloc[:, 4]

    # The prediction file is expected to hold two rows per BED row.
    pos_len = 2 * len(pos_df)
    neg_len = 2 * len(neg_df)

    # input sanity check
    if len(pred_df) != pos_len + neg_len:
        print("ERROR: predictions are a different length than test sets")

    # Repeat every true value twice in place (v0, v0, v1, v1, ...) so the truth
    # lines up row-for-row with the doubled predictions.
    doubled_pos_df = pd.concat([pos_df, pos_df]).sort_index(kind='mergesort').reset_index(drop=True)
    doubled_neg_df = pd.concat([neg_df, neg_df]).sort_index(kind='mergesort').reset_index(drop=True)

    pred_pos_df = pred_df.head(pos_len)
    pred_neg_df = pred_df.tail(neg_len)

    print(f'average neg test prediction: {pred_neg_df.mean()}')

    # ---- VAL + TRAIN: sliced as negatives, then val positives, then train positives ----
    # This file is read tab-separated, unlike the other prediction files in this cell.
    pred_val_train_mouse = pd.read_csv(f'{DATA_DIR}/model_outputs/{model}/activations_{species}_VAL_TRAIN.csv', header=None, sep='\t')

    # NOTE(review): this path has no 'liverRegression/' component, unlike every
    # other path in the cell - confirm it is the intended file.
    neg_df = pd.read_csv('/home/azstephe/regression_liver/data/splits/negatives/nonMouse_liver_andRat_andCow_andPig_andMacaque_VAL_500bp.bed', header=None, sep=r'\s+').iloc[:, 4]
    val_df = pd.read_csv(f'{DATA_DIR}/splits/logPos/{species}_liver_VAL.narrowPeak', header=None, sep=r'\s+').iloc[:, 4]
    train_df = pd.read_csv(f'{DATA_DIR}/splits/logPos/{species}_liver_TRAINONLY.narrowPeak', header=None, sep=r'\s+').iloc[:, 4]

    # val1/val2/val3 here are the negative, val and train segment lengths.
    val1_len = 2 * len(neg_df)
    val2_len = 2 * len(val_df)
    val3_len = 2 * len(train_df)

    # input sanity check
    if len(pred_val_train_mouse) != val1_len + val2_len + val3_len:
        print("ERROR: predictions are a different length than validation sets")

    doubled_neg_df = pd.concat([neg_df, neg_df]).sort_index(kind='mergesort').reset_index(drop=True)
    doubled_val_df = pd.concat([val_df, val_df]).sort_index(kind='mergesort').reset_index(drop=True)
    doubled_train_df = pd.concat([train_df, train_df]).sort_index(kind='mergesort').reset_index(drop=True)

    pred_neg_df = pred_val_train_mouse.head(val1_len)
    pred_val_df = pred_val_train_mouse.iloc[val1_len:val1_len + val2_len]
    pred_train_df = pred_val_train_mouse.tail(val3_len)

    pred_neg_pos_val_mouse = pd.concat([pred_neg_df, pred_val_df]).reset_index(drop=True)

    doubled_combined_val_train = pd.concat([doubled_neg_df, doubled_val_df, doubled_train_df]).reset_index(drop=True)
    # NOTE(review): this stacks truth and predictions as rows; a side-by-side
    # frame would need axis=1. The result is not read in this cell - confirm intent.
    mouse_combined = pd.concat([doubled_combined_val_train, pred_val_train_mouse])
    print(f'average neg val prediction: {pred_neg_df.mean()}')

    # correlate() reads the pred_* / doubled_* globals assigned above.
    correlate()



        # scatter(pred_test2_df, doubled_test2_df, 'True vs predicted for test2', 'True', 'Predicted')
        # scatter(pred_test3_df, doubled_test3_df, 'True vs predicted for test3', 'True', 'Predicted')
        # scatter(pred_pos_df, doubled_pos_df, 'True vs predicted for orthologs', 'True', 'Predicted')


        # histogram(pred_test1_df, 'test1 predictions', 'Predicted signal')
        # histogram(pred_test2_df, 'test2 predictions', 'Predicted signal')
        # histogram(pred_test3_df, 'test3 predictions', 'Predicted signal')
    
        # histogram(pred_neg_df, 'negative test predictions', 'Predicted signal')
        # histogram(pred_pos_df, 'positive test predictions', 'Predicted signal')



bdbi7l3n
average neg test prediction: 0    0.609153
dtype: float64
average neg val prediction: 0    0.61926
dtype: float64
Pearson correlation coefficient for train, pos only: 0.4937, p-test: 0
Spearman correlation coefficient for train, pos only: 0.4982, p-test: 0
Pearson correlation coefficient for val, pos only: 0.4827, p-test: 1.745e-234
Spearman correlation coefficient for val, pos only: 0.4913, p-test: 4.294e-244
Pearson correlation coefficient for test, pos only: 0.4958, p-test: 0
Spearman correlation coefficient for test, pos only: 0.5022, p-test: 0

7vsdq5k2
average neg test prediction: 0    0.590392
dtype: float64
average neg val prediction: 0    0.599513
dtype: float64
Pearson correlation coefficient for train, pos only: 0.4967, p-test: 0
Spearman correlation coefficient for train, pos only: 0.5094, p-test: 0
Pearson correlation coefficient for val, pos only: 0.4574, p-test: 1.128e-207
Spearman correlation coefficient for val, pos only: 0.4768, p-test: 5.493e-228
Pearson co

In [36]:
# non mouse bayesian
def correlate():
    """Print correlations for val2, test2, val3, test3 and the ortholog positives.

    Takes no arguments: the ``pred_*_df`` / ``doubled_*_df`` names are looked up
    as notebook globals when the function is called.
    """
    correlations(pred_df=pred_val2_df, true_df=doubled_val2_df, group='val2')
    correlations(pred_df=pred_test2_df, true_df=doubled_test2_df, group='test2')
    correlations(pred_df=pred_val3_df, true_df=doubled_val3_df, group='val3')
    correlations(pred_df=pred_test3_df, true_df=doubled_test3_df, group='test3')

    # Blank lines separate the per-split results from the ortholog result.
    print('\n')
    correlations(pred_df=pred_pos_df, true_df=doubled_pos_df, group='mouse orthologs')
    
species_list = ['macaque', 'rat', 'cow', 'pig']
# species_list = ['rat']
model_list = ['7vsdq5k2']

# Root of the data tree; every file read in this cell lives under it.
DATA_DIR = '/home/azstephe/liverRegression/regression_liver/data'

for species in species_list:
    print(f'\n{species}')
    for model in model_list:
        print(f'\n{model}')

        # ---- TEST split, read from the '{model}_bayesian' output directory ----
        # NOTE(review): only this file comes from '{model}_bayesian'; the ortholog
        # and VAL predictions below come from '{model}'. It is also read with the
        # default comma separator here - confirm both are intended.
        pred_df = pd.read_csv(f'{DATA_DIR}/model_outputs/{model}_bayesian/activations_{species}_TEST.csv', header=None)

        # Column 4 of each BED file is the value compared against the predictions.
        # sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
        test1_df = pd.read_csv(f'{DATA_DIR}/test_splits/amy_test1/{species}_liver_TEST_500bp.bed', header=None, sep=r'\s+').iloc[:, 4]
        test2_df = pd.read_csv(f'{DATA_DIR}/test_splits/log_test2/{species}_liver_TEST_500bp.bed', header=None, sep=r'\s+').iloc[:, 4]
        test3_df = pd.read_csv(f'{DATA_DIR}/test_splits/log_test3/{species}_liver_TEST_500bp.bed', header=None, sep=r'\s+').iloc[:, 4]

        # The prediction file is expected to hold two rows per BED row.
        test1_len = 2 * len(test1_df)
        test2_len = 2 * len(test2_df)
        test3_len = 2 * len(test3_df)

        # input sanity check
        if len(pred_df) != test1_len + test2_len + test3_len:
            print("ERROR: predictions are a different length than test sets")

        # Repeat every true value twice in place (v0, v0, v1, v1, ...) so the
        # truth lines up row-for-row with the doubled predictions.
        doubled_test1_df = pd.concat([test1_df, test1_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test2_df = pd.concat([test2_df, test2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_test3_df = pd.concat([test3_df, test3_df]).sort_index(kind='mergesort').reset_index(drop=True)

        # Predictions are sliced in file order: test1, then test2, then test3.
        pred_test1_df = pred_df.head(test1_len)
        pred_test2_df = pred_df.iloc[test1_len:test1_len + test2_len]
        pred_test3_df = pred_df.tail(test3_len)

        # print(f'average test1 prediction: {pred_test1_df.mean()}')

        # ---- TEST orthologs: positives are sliced from the head, negatives from the tail ----
        pred_orthologs_df = pd.read_csv(f'{DATA_DIR}/model_outputs/{model}/activations_{species}_TEST_orthologs.csv', header=None)

        neg_df = pd.read_csv(f'{DATA_DIR}/test_splits/neg/{species}_liver_TEST_500bp.bed', header=None, sep=r'\s+').iloc[:, 4]
        pos_df = pd.read_csv(f'{DATA_DIR}/test_splits/log_pos/{species}_liver_TEST_500bp.bed', header=None, sep=r'\s+').iloc[:, 4]

        neg_len = 2 * len(neg_df)
        pos_len = 2 * len(pos_df)

        # input sanity check
        if len(pred_orthologs_df) != neg_len + pos_len:
            print("ERROR: predictions are a different length than test sets")

        doubled_neg_df = pd.concat([neg_df, neg_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_pos_df = pd.concat([pos_df, pos_df]).sort_index(kind='mergesort').reset_index(drop=True)

        pred_pos_df = pred_orthologs_df.head(pos_len)
        pred_neg_df = pred_orthologs_df.tail(neg_len)

        # print(f'average neg test prediction: {pred_neg_df.mean()}')

        # NOTE(review): `neg` is chosen per species but never read in this cell -
        # confirm whether a validation-negatives load is missing.
        neg = 'nonMacaque_liver_andRat_andCow_andPig_VAL_500bp.bed'
        if species == 'rat':
            neg = 'nonRat_liver_andMacaque_andCow_andPig_VAL_500bp.bed'
        elif species == 'cow':
            neg = 'nonCow_liver_andMacaque_andRat_andPig_VAL_500bp.bed'
        elif species == 'pig':
            neg = 'nonPig_liver_andMacaque_andRat_andCow_VAL_500bp.bed'

        # ---- VAL split (rebinds pred_df to the validation predictions) ----
        pred_df = pd.read_csv(f'{DATA_DIR}/model_outputs/{model}/activations_{species}_VAL.csv', header=None)

        val1_df = pd.read_csv(f'{DATA_DIR}/splits/val1/{species}_liver_VAL.narrowPeak', header=None, sep=r'\s+').iloc[:, 4]
        val2_df = pd.read_csv(f'{DATA_DIR}/splits/log_val2/{species}_liver_VAL.narrowPeak', header=None, sep=r'\s+').iloc[:, 4]
        val3_df = pd.read_csv(f'{DATA_DIR}/splits/log_val3/{species}_liver_VAL.narrowPeak', header=None, sep=r'\s+').iloc[:, 4]

        val1_len = 2 * len(val1_df)
        val2_len = 2 * len(val2_df)
        val3_len = 2 * len(val3_df)

        # input sanity check
        if len(pred_df) != val1_len + val2_len + val3_len:
            print("ERROR: predictions are a different length than validation sets")

        doubled_val1_df = pd.concat([val1_df, val1_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_val2_df = pd.concat([val2_df, val2_df]).sort_index(kind='mergesort').reset_index(drop=True)
        doubled_val3_df = pd.concat([val3_df, val3_df]).sort_index(kind='mergesort').reset_index(drop=True)

        pred_val1_df = pred_df.head(val1_len)
        pred_val2_df = pred_df.iloc[val1_len:val1_len + val2_len]
        pred_val3_df = pred_df.tail(val3_len)

        doubled_combined = pd.concat([doubled_val1_df, doubled_val2_df, doubled_val3_df])

        # print(f'average val1 prediction: {pred_val1_df.mean()}')

        # correlate() reads the pred_* / doubled_* globals assigned above.
        correlate()



        # scatter(pred_test2_df, doubled_test2_df, 'True vs predicted for test2', 'True', 'Predicted')
        # scatter(pred_test3_df, doubled_test3_df, 'True vs predicted for test3', 'True', 'Predicted')
        # scatter(pred_pos_df, doubled_pos_df, 'True vs predicted for orthologs', 'True', 'Predicted')


        # histogram(pred_test1_df, 'test1 predictions', 'Predicted signal')
        # histogram(pred_test2_df, 'test2 predictions', 'Predicted signal')
        # histogram(pred_test3_df, 'test3 predictions', 'Predicted signal')
    
        # histogram(pred_neg_df, 'negative test predictions', 'Predicted signal')
        # histogram(pred_pos_df, 'positive test predictions', 'Predicted signal')



macaque

bdbi7l3n_bayesian


FileNotFoundError: [Errno 2] No such file or directory: '/home/azstephe/liverRegression/regression_liver/data/model_outputs/bdbi7l3n_bayesian/activations_macaque_TEST_orthologs.csv'

In [45]:
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
import numpy as np

species = 'macaque'
model = '7vsdq5k2'

# Root of the data tree; every file read in this cell lives under it.
DATA_DIR = '/home/azstephe/liverRegression/regression_liver/data'

# load all the DFs
# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
pred_baye = pd.read_csv(f'{DATA_DIR}/model_outputs/{model}_bayesian/activations_{species}_TEST.csv', header=None, sep=r'\s+')
add_stats(pred_baye)  # adds 'mean', 'range' and 'std' columns in place

# Column 4 of each BED file is used as the true value.
test1_df = pd.read_csv(f'{DATA_DIR}/test_splits/amy_test1/{species}_liver_TEST_500bp.bed', header=None, sep=r'\s+').iloc[:, 4]
test1_df = test1_df.replace(-1, 0)  # test1 values of -1 are treated as 0
test2_df = pd.read_csv(f'{DATA_DIR}/test_splits/log_test2/{species}_liver_TEST_500bp.bed', header=None, sep=r'\s+').iloc[:, 4]
test3_df = pd.read_csv(f'{DATA_DIR}/test_splits/log_test3/{species}_liver_TEST_500bp.bed', header=None, sep=r'\s+').iloc[:, 4]

# The prediction file is expected to hold two rows per BED row.
test1_len = 2 * len(test1_df)
test2_len = 2 * len(test2_df)
test3_len = 2 * len(test3_df)

# input sanity check
if len(pred_baye) != test1_len + test2_len + test3_len:
    print("ERROR: predictions are a different length than test sets")

# Repeat every true value twice in place (v0, v0, v1, v1, ...) so the truth
# lines up row-for-row with the doubled predictions.
doubled_test1_df = pd.concat([test1_df, test1_df]).sort_index(kind='mergesort').reset_index(drop=True)
doubled_test2_df = pd.concat([test2_df, test2_df]).sort_index(kind='mergesort').reset_index(drop=True)
doubled_test3_df = pd.concat([test3_df, test3_df]).sort_index(kind='mergesort').reset_index(drop=True)

baye_test1_df = pred_baye.head(test1_len)
baye_test2_df = pred_baye.iloc[test1_len:test1_len + test2_len]
baye_test3_df = pred_baye.tail(test3_len)

doubled_combined = pd.concat([doubled_test1_df, doubled_test2_df, doubled_test3_df])
doubled_combined = doubled_combined.reset_index(drop=True)

# Only pred_baye is re-indexed here: this cell never loads a `pred_df`, so
# touching that name would depend on state left behind by another cell.
pred_baye = pred_baye.reset_index(drop=True)

doubled_combined = doubled_combined.to_frame(name='true')

# .loc slices are label-based and include both end points, hence the "- 1".
doubled_combined.loc[0:test1_len - 1, 'label'] = 'test1'
doubled_combined.loc[test1_len:test1_len + test2_len - 1, 'label'] = 'test2'
doubled_combined.loc[test1_len + test2_len:, 'label'] = 'test3'

# Predictions (plus their stats) and the labelled truth, side by side per row.
combined = pd.concat([pred_baye, doubled_combined], axis=1)

# NOTE(review): assumes the prediction columns are labelled exactly 0..63 -
# confirm against the activations file.
squared_errors = (combined.loc[:, 0:63].subtract(combined['true'], axis=0)) ** 2
combined['mse'] = squared_errors.mean(axis=1)
combined['mean_diff'] = abs(combined['mean'] - combined['true'])

bayesian_scatter(combined, 'mean', 'std')
bayesian_scatter(combined, 'mean', 'mse')
bayesian_scatter(combined, 'std', 'mse')

correlations(combined[combined['label']=='test2']['std'], combined[combined['label']=='test2']['mse'], 'test2 macaque')
correlations(combined[combined['label']=='test3']['std'], combined[combined['label']=='test3']['mse'], 'test3 macaque')
bayesian_scatter(combined, 'mean_diff', 'mse')
bayesian_scatter(combined, 'mean_diff', 'std')

# threshold_correlate(combined, 'std', 0.3)
# print('\n')
# threshold_correlate(combined, 'std', 0.4)
# threshold_scatter(combined, 'std', 0.2)

NameError: name 'do_scatter' is not defined

In [43]:
# Preview the raw log_test3 BED file for the current `species`.
# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
pd.read_csv(f'/home/azstephe/liverRegression/regression_liver/data/test_splits/log_test3/{species}_liver_TEST_500bp.bed', header=None, sep=r'\s+')


Unnamed: 0,0,1,2,3,4
0,chr1,134484347,134484847,peak10605,2.718037
1,chr1,135389947,135390447,peak72645,1.816486
2,chr1,136913858,136914358,peak78586,1.616586
3,chr1,136927092,136927592,peak51393,2.061906
4,chr1,136988427,136988927,peak44530,2.097141
...,...,...,...,...,...
1187,chr9,25033115,25033615,peak73344,1.760068
1188,chr9,26594127,26594627,peak15914,2.700721
1189,chr9,26594644,26595144,peak87593,1.435739
1190,chr9,41267864,41268364,peak61898,2.040823
