In [68]:
import os
import scipy
import pandas as pd
from pprint import pprint
from scipy.stats import wilcoxon
from itertools import combinations, product
from sanitize_ml_labels import sanitize_ml_labels

In [4]:
# Directory intended for this notebook's results. It is created if missing;
# exist_ok=True keeps the cell from failing when it is run again.
RESULT_PATH = "./correlation/"
os.makedirs(RESULT_PATH, exist_ok=True)

In [9]:
def sanitize_df(df):
    """Sanitize the labels of ``df`` in place and return the same frame.

    Column labels, index level names and the values of every object-dtype
    column are passed through ``sanitize_ml_labels``. An index level name
    whose sanitized form ends in a digit has that last character removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean up; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    df.columns = sanitize_ml_labels(df.columns)

    # Strip a single trailing digit from each sanitized index level name.
    index_names = []
    for name in sanitize_ml_labels(df.index.names):
        if name[-1].isdigit():
            name = name[:-1]
        index_names.append(name)
    df.index.names = index_names

    # Only object-dtype columns have their values sanitized.
    object_columns = df.columns[df.dtypes == object]
    for column in object_columns:
        df[column] = sanitize_ml_labels(df[column])

    return df

In [72]:
# Folder whose files are loaded as CSV reports.
REPORTS_PATH = "./reports/"

# Load every report into one frame.
# os.listdir returns names in arbitrary order, so they are sorted to make the
# row order of the concatenated frame reproducible across machines and runs
# (later cells pair rows by position). Entries that are not regular files,
# e.g. a .ipynb_checkpoints folder, are skipped because read_csv cannot open
# them.
df = pd.concat([
    pd.read_csv(os.path.join(REPORTS_PATH, file), index_col=0)
    for file in sorted(os.listdir(REPORTS_PATH))
    if os.path.isfile(os.path.join(REPORTS_PATH, file))
])
# Drop the "dataset" column.
df = df.drop("dataset", axis=1)
# Keep every run except those labelled "biological validation".
df = df[df.run_type != "biological validation"]

In [77]:
# Show the concatenated reports table.
df

Unnamed: 0,model,trained_on,task,target,run_type,roc_auc_score,average_precision_score,accuracy_score
0,cae_500,multivariate_gaps,reconstruction,all_nucleotides,multivariate gaps test,0.894521,0.788934,0.653716
1,cae_500,multivariate_gaps,reconstruction,adenine,multivariate gaps test,0.876964,0.777225,0.820360
2,cae_500,multivariate_gaps,reconstruction,cytosine,multivariate gaps test,0.910102,0.801123,0.886691
3,cae_500,multivariate_gaps,reconstruction,thymine,multivariate gaps test,0.891502,0.812252,0.841135
4,cae_500,multivariate_gaps,reconstruction,guanine,multivariate gaps test,0.890247,0.751566,0.867721
...,...,...,...,...,...,...,...,...
5,cae_200,multivariate_gaps,gap_filling,all_nucleotides,single gap train,0.651315,0.370335,0.371709
6,cae_200,multivariate_gaps,gap_filling,adenine,single gap train,0.605114,0.377970,0.697266
7,cae_200,multivariate_gaps,gap_filling,cytosine,single gap train,0.671961,0.339063,0.794883
8,cae_200,multivariate_gaps,gap_filling,thymine,single gap train,0.620819,0.403958,0.687500


In [74]:
# Restrict the analysis to the three CAE models.
df = df[df.model.isin(["cae_200", "cae_500", "cae_1000"])]

In [94]:
def analyze(df, target=None):
    """Print correlations between gap-filling and reconstruction metrics.

    The rows of ``df`` are split by ``task`` into "gap_filling" and
    "reconstruction" subsets. For each of the three metric columns the
    Pearson and Spearman correlations between the two subsets are printed.

    The two subsets are handed to scipy as-is, so values are paired by
    position: the i-th gap-filling row is compared with the i-th
    reconstruction row. Both subsets must therefore have the same length and
    the same row ordering.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``task``, ``target``, ``accuracy_score``,
        ``average_precision_score`` and ``roc_auc_score``.
    target : str, optional
        When truthy, only rows whose ``target`` equals this value are used.

    Returns
    -------
    None
        Results are written to stdout only.
    """
    subset = df[df.target == target] if target else df
    gap_rows = subset[subset.task == "gap_filling"]
    rec_rows = subset[subset.task == "reconstruction"]

    # (printed label, metric column) in the order they are reported.
    metrics = (
        ("ACC ", "accuracy_score"),
        ("PRC ", "average_precision_score"),
        ("ROC ", "roc_auc_score"),
    )
    # (printed heading, correlation function) in the order they are reported.
    correlations = (
        ("Pearson", scipy.stats.pearsonr),
        ("Spearmanr", scipy.stats.spearmanr),
    )

    for heading, correlate in correlations:
        print(heading)
        for label, column in metrics:
            print(label, correlate(gap_rows[column], rec_rows[column]))
    

In [95]:
# Split the runs by task, then show the covariance matrix of the two accuracy
# columns. The values are paired by row position; raw arrays are passed so
# pandas does not try to align the two series on their index.
gaps = df.loc[df.task == "gap_filling"]
rec = df.loc[df.task == "reconstruction"]
pd.DataFrame(
    {
        "acc1": gaps.accuracy_score.values,
        "acc2": rec.accuracy_score.values,
    }
).cov()

Unnamed: 0,acc1,acc2
acc1,0.026507,0.011225
acc2,0.011225,0.020385


In [96]:
# First report: correlations over all targets pooled together.
print("General")
analyze(df)
# Then the same report once per distinct value of the `target` column.
for target in df.target.unique():
    print("\n\n" + target)  # two blank lines separate consecutive reports
    analyze(df, target)

General
Pearson
ACC  (0.482895338644168, 2.326397636045051e-08)
PRC  (-0.5134980133064615, 2.0039036190860594e-09)
ROC  (-0.20785939520592472, 0.02272097597082698)
Spearmanr
ACC  SpearmanrResult(correlation=0.19855545523994717, pvalue=0.02970560894212244)
PRC  SpearmanrResult(correlation=-0.4071463296062226, pvalue=3.931990884368521e-06)
ROC  SpearmanrResult(correlation=-0.09129800680602819, pvalue=0.3213319885539175)


all_nucleotides
Pearson
ACC  (-0.08501791437050782, 0.6928553591011505)
PRC  (-0.6566630954882765, 0.0004915147884109693)
ROC  (-0.15275313758830594, 0.47610295879266495)
Spearmanr
ACC  SpearmanrResult(correlation=-0.10695652173913042, pvalue=0.6188809974331805)
PRC  SpearmanrResult(correlation=-0.38956521739130434, pvalue=0.059875479374870595)
ROC  SpearmanrResult(correlation=0.07826086956521738, pvalue=0.7162392905010877)


adenine
Pearson
ACC  (-0.6960337936053477, 0.00015864120971077322)
PRC  (-0.8182573084594442, 1.034175641023154e-06)
ROC  (-0.7151842408931849, 8.