In [1]:
import os
import scipy
import pandas as pd
from pprint import pprint
from scipy.stats import wilcoxon
from itertools import combinations, product
from sanitize_ml_labels import sanitize_ml_labels

In [2]:
# Presumably the output directory for this notebook's results; no visible cell
# writes into it, so confirm against the rest of the notebook.
RESULT_PATH = "./correlation/"
# exist_ok=True lets this cell be re-run without failing once the directory exists.
os.makedirs(RESULT_PATH, exist_ok=True)

In [3]:
def sanitize_df(df):
    """Run the labels of ``df`` through ``sanitize_ml_labels`` and return ``df``.

    Three parts of the frame are rewritten in place:

    * the column labels;
    * the index level names, where a name ending in a digit additionally
      loses that last character;
    * the values of every column whose dtype is ``object``.

    The frame passed in is mutated and the same object is returned.
    NOTE(review): index names must be non-empty strings after sanitisation,
    because the last character of each name is inspected.
    """
    df.columns = sanitize_ml_labels(df.columns)

    index_names = []
    for name in sanitize_ml_labels(df.index.names):
        # Drop a single trailing digit from the level name, if present.
        if name[-1].isdigit():
            name = name[:-1]
        index_names.append(name)
    df.index.names = index_names

    object_columns = df.columns[df.dtypes == object]
    for column in object_columns:
        df[column] = sanitize_ml_labels(df[column])

    return df

In [4]:
# Collect the report files in a reproducible order. os.listdir returns entries
# in arbitrary order, and the analysis further down pairs rows by position,
# so the concatenation order must not depend on the filesystem.
REPORTS_PATH = "./reports/"
report_files = sorted(
    file
    for file in os.listdir(REPORTS_PATH)
    # Skip hidden entries and sub-directories (e.g. a notebook checkpoint
    # folder): handing them to read_csv would abort the whole load.
    if not file.startswith(".")
    and os.path.isfile(os.path.join(REPORTS_PATH, file))
)

# One frame per report, stacked vertically. Each file keeps its own index
# (first CSV column), so index values repeat across files.
df = pd.concat([
    pd.read_csv(os.path.join(REPORTS_PATH, file), index_col=0)
    for file in report_files
])
df = df.drop("dataset", axis=1)
# Rows from the "biological validation" run type are excluded from the analysis.
df = df[df.run_type != "biological validation"]

In [5]:
# Model identifiers look like "cae_500": the second "_"-separated token is
# parsed as the integer window size. A plain list is passed so the values are
# assigned by position (the index contains repeated labels).
df = df.assign(
    window_size=[
        int(model_name.split("_")[1])
        for model_name in df.model.values
    ]
)

In [6]:
# Model identifiers look like "cae_500": the first "_"-separated token is kept
# as the model type. A plain list is passed so the values are assigned by
# position (the index contains repeated labels).
df = df.assign(
    model_type=[
        model_name.split("_")[0]
        for model_name in df.model.values
    ]
)

In [7]:
# Show the combined table to check the derived window_size / model_type columns.
df

Unnamed: 0,model,trained_on,task,target,run_type,roc_auc_score,average_precision_score,accuracy_score,window_size,model_type
0,cae_500,multivariate_gaps_with_weight_2,reconstruction,all_nucleotides,multivariate gaps test,0.882948,0.778071,0.640812,500,cae
1,cae_500,multivariate_gaps_with_weight_2,reconstruction,adenine,multivariate gaps test,0.868955,0.777267,0.828147,500,cae
2,cae_500,multivariate_gaps_with_weight_2,reconstruction,cytosine,multivariate gaps test,0.891547,0.774009,0.883621,500,cae
3,cae_500,multivariate_gaps_with_weight_2,reconstruction,thymine,multivariate gaps test,0.881032,0.803004,0.838942,500,cae
4,cae_500,multivariate_gaps_with_weight_2,reconstruction,guanine,multivariate gaps test,0.879461,0.743368,0.872183,500,cae
...,...,...,...,...,...,...,...,...,...,...
5,cae_200,single_gap_with_weight_10,gap_filling,all_nucleotides,single gap train,0.668916,0.399340,0.389316,200,cae
6,cae_200,single_gap_with_weight_10,gap_filling,adenine,single gap train,0.636148,0.417454,0.707900,200,cae
7,cae_200,single_gap_with_weight_10,gap_filling,cytosine,single gap train,0.686190,0.368543,0.801729,200,cae
8,cae_200,single_gap_with_weight_10,gap_filling,thymine,single gap train,0.631204,0.413160,0.708584,200,cae


# CAE model analysis

In [8]:
# Keep only the rows whose model_type is "cae" for this section.
is_cae = df.model_type == "cae"
df_cae = df.loc[is_cae]

In [9]:
def analyze(df, target=None):
    """Print correlations between gap-filling and reconstruction scores.

    The rows of ``df`` are split on the ``task`` column into the
    ``"gap_filling"`` and ``"reconstruction"`` subsets. For the accuracy,
    average-precision and ROC-AUC columns, the Pearson correlation of the two
    subsets is printed first, followed by the Spearman correlation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``task``, ``accuracy_score``,
        ``average_precision_score`` and ``roc_auc_score``; ``target`` is
        needed as well when a target filter is requested.
    target : optional
        When truthy, only rows whose ``target`` column equals this value are
        analysed. ``None`` (the default) analyses every row.

    Returns
    -------
    None
        Results are written to standard output only.

    Notes
    -----
    The two subsets are handed to scipy as-is, so observations are paired by
    row position, not by any key. NOTE(review): this is only meaningful if
    both subsets have the same length and matching row order; confirm
    against how the report files are laid out.
    """
    if target:
        df = df[df.target == target]

    gap_rows = df[df.task == "gap_filling"]
    rec_rows = df[df.task == "reconstruction"]

    # (label, gap-filling scores, reconstruction scores), in print order.
    score_pairs = (
        ("ACC ", gap_rows.accuracy_score, rec_rows.accuracy_score),
        ("PRC ", gap_rows.average_precision_score, rec_rows.average_precision_score),
        ("ROC ", gap_rows.roc_auc_score, rec_rows.roc_auc_score),
    )

    for heading, function_name in (("Pearson", "pearsonr"), ("Spearmanr", "spearmanr")):
        print(heading)
        correlate = getattr(scipy.stats, function_name)
        for label, gap_scores, rec_scores in score_pairs:
            print(label, correlate(gap_scores, rec_scores))
    

### Gap filling and reconstruction correlation

In [10]:
# First the correlation report over every CAE row ...
print("General")
analyze(df_cae)
# ... then the same report restricted to each distinct value of the target column.
for target in df_cae.target.unique():
    print("\n\n" + target)
    analyze(df_cae, target)

General
Pearson
ACC  (0.4904552499820177, 3.468122999737208e-23)
PRC  (-0.45900615308634674, 3.682306051689224e-20)
ROC  (-0.13386658432532747, 0.011004367119300848)
Spearmanr
ACC  SpearmanrResult(correlation=0.2307813341840152, pvalue=9.716279355216207e-06)
PRC  SpearmanrResult(correlation=-0.40026389092508424, pvalue=2.771308564091817e-15)
ROC  SpearmanrResult(correlation=-0.06288731651735996, pvalue=0.2339553543523421)


all_nucleotides
Pearson
ACC  (-0.1457585832073195, 0.22182346684359103)
PRC  (-0.5901878473894278, 4.868158280856321e-08)
ROC  (-0.2211255812203224, 0.061949905177918506)
Spearmanr
ACC  SpearmanrResult(correlation=-0.21345424143031713, pvalue=0.07180839206730805)
PRC  SpearmanrResult(correlation=-0.4721525500032157, pvalue=2.824305207687442e-05)
ROC  SpearmanrResult(correlation=-0.11306193324329541, pvalue=0.34434821652479164)


adenine
Pearson
ACC  (-0.38979842899745487, 0.0007128160467629724)
PRC  (-0.6662114886158754, 1.6944057135936033e-10)
ROC  (-0.476645220788

### Window size and performance correlation

In [11]:
# Reconstruction rows of the CAE models: correlate window size with each score.
rec = df_cae[df_cae.task == "reconstruction"]
metric_series = (
    ("ACC ", rec.accuracy_score),
    ("PRC ", rec.average_precision_score),
    ("ROC ", rec.roc_auc_score),
)
# The three Pearson results are printed first, then the three Spearman results.
for correlation_test in (scipy.stats.pearsonr, scipy.stats.spearmanr):
    for metric_label, metric_values in metric_series:
        print(metric_label, correlation_test(rec.window_size, metric_values))

ACC  (-0.7270641502002068, 1.972939630077375e-60)
PRC  (-0.9759434088015936, 6.603103753455906e-239)
ROC  (-0.9837087548965502, 6.60921784027039e-269)
ACC  SpearmanrResult(correlation=-0.8408383215172114, pvalue=1.8466236226194634e-97)
PRC  SpearmanrResult(correlation=-0.9428126789836747, pvalue=6.85347982782517e-173)
ROC  SpearmanrResult(correlation=-0.9428126789836747, pvalue=6.85347982782517e-173)


In [12]:
# Gap-filling rows of the CAE models: correlate window size with each score.
gaps = df_cae[df_cae.task == "gap_filling"]
metric_series = (
    ("ACC ", gaps.accuracy_score),
    ("PRC ", gaps.average_precision_score),
    ("ROC ", gaps.roc_auc_score),
)
# The three Pearson results are printed first, then the three Spearman results.
for correlation_test in (scipy.stats.pearsonr, scipy.stats.spearmanr):
    for metric_label, metric_values in metric_series:
        print(metric_label, correlation_test(gaps.window_size, metric_values))

ACC  (0.05754402533024341, 0.2761862886984938)
PRC  (0.5815789025984608, 5.805660440624179e-34)
ROC  (0.22783844582930485, 1.2686414373432279e-05)
ACC  SpearmanrResult(correlation=0.22221581060900633, pvalue=2.0914099376175878e-05)
PRC  SpearmanrResult(correlation=0.5366502863395618, pvalue=3.0668739255532884e-28)
ROC  SpearmanrResult(correlation=0.1510464479455096, pvalue=0.004073209040056278)


# CNN analysis

In [13]:
# Keep only the rows whose model_type is "cnn" for this section.
is_cnn = df.model_type == "cnn"
df_cnn = df.loc[is_cnn]

### Window size and performance correlation

In [14]:
# Gap-filling rows of the CNN models: correlate window size with each score.
gaps = df_cnn[df_cnn.task == "gap_filling"]
metric_series = (
    ("ACC ", gaps.accuracy_score),
    ("PRC ", gaps.average_precision_score),
    ("ROC ", gaps.roc_auc_score),
)
# The three Pearson results are printed first, then the three Spearman results.
for correlation_test in (scipy.stats.pearsonr, scipy.stats.spearmanr):
    for metric_label, metric_values in metric_series:
        print(metric_label, correlation_test(gaps.window_size, metric_values))

ACC  (0.02621780038693372, 0.776223122135287)
PRC  (0.028982130532245944, 0.7533491853906168)
ROC  (0.04773408034671381, 0.6046517145993185)
ACC  SpearmanrResult(correlation=0.0026517563176666455, pvalue=0.9770683378676717)
PRC  SpearmanrResult(correlation=-0.11490884190242913, pvalue=0.21139742045523957)
ROC  SpearmanrResult(correlation=-0.02622278699824665, pvalue=0.7761816799226504)
