In [1]:
from pathlib import Path

import polars as pl

from udonpred_benchmarking.constants import ALPHAFOLD_DISORDER_THRESHOLD, DATA_DIR, PROJECT_DIR, TRIZOD_THRESHOLD
from udonpred_benchmarking.result_parsing.caid import read_multi_caid_file
from udonpred_benchmarking.result_parsing.fasta import read_annotated_fasta_file

In [2]:
# Directory holding one CAID-formatted prediction file per method.
method_result_directory = PROJECT_DIR / "predictions/caid3/v3"


def _read_thresholded(file_name, method, threshold):
    """Read a CAID file that ships no binary scores.

    `threshold` is forwarded to the reader as `binary_threshold`, together
    with `has_binary_scores=False`.
    """
    return read_multi_caid_file(
        method_result_directory / file_name,
        method,
        has_binary_scores=False,
        binary_threshold=threshold,
    )


# NOTE: several of these files, UdonPred in particular, lack predictions for
# some entries.
punch2_df = read_multi_caid_file(method_result_directory / "PUNCH2.caid", "PUNCH2")
udonpred_df = _read_thresholded("UdonPred-combined.caid", "UdonPred", TRIZOD_THRESHOLD)
seth_df = read_multi_caid_file(method_result_directory / "SETH-1.caid", "SETH")
af2_rsa_df = _read_thresholded("AlphaFold-rsa.caid", "AlphaFold2-RSA", ALPHAFOLD_DISORDER_THRESHOLD)
af2_plddt_df = _read_thresholded("AlphaFold-pLDDT.caid", "AlphaFold2-pLDDT", ALPHAFOLD_DISORDER_THRESHOLD)
af3_rsa_df = _read_thresholded("AlphaFold3-rsa.caid", "AlphaFold3-RSA", ALPHAFOLD_DISORDER_THRESHOLD)
af3_plddt_df = _read_thresholded("AlphaFold3-pLDDT.caid", "AlphaFold3-pLDDT", ALPHAFOLD_DISORDER_THRESHOLD)
fldpnn_df = read_multi_caid_file(method_result_directory / "flDPnn.caid", "flDPnn")

# Key columns shared by every per-method frame.
join_columns = ["protein", "position", "residue"]

# PUNCH2 is the left-most frame: each remaining method is left-joined onto it
# in turn, so keys that are absent from PUNCH2 never enter `pred_df`.
pred_df = punch2_df
for method_df in (
    udonpred_df,
    af2_rsa_df,
    af2_plddt_df,
    af3_rsa_df,
    af3_plddt_df,
    seth_df,
    fldpnn_df,
):
    pred_df = pred_df.join(method_df, on=join_columns, how="left")
pred_df

protein,position,residue,PUNCH2_continuous,PUNCH2_binary,UdonPred_continuous,UdonPred_binary,AlphaFold2-RSA_continuous,AlphaFold2-RSA_binary,AlphaFold2-pLDDT_continuous,AlphaFold2-pLDDT_binary,AlphaFold3-RSA_continuous,AlphaFold3-RSA_binary,AlphaFold3-pLDDT_continuous,AlphaFold3-pLDDT_binary,SETH_continuous,SETH_binary,flDPnn_continuous,flDPnn_binary
str,i64,str,f64,i64,f64,i32,f64,i32,f64,i32,f64,i32,f64,i32,f64,i64,f64,i64
"""DP02732""",1,"""M""",0.958,1,0.497,1,0.86,1,0.621,1,0.844,1,0.684,1,0.597,1,0.102,0
"""DP02732""",2,"""E""",0.961,1,0.556,1,0.865,1,0.676,1,0.847,1,0.62,1,0.569,1,0.11,0
"""DP02732""",3,"""D""",0.961,1,0.563,1,0.868,1,0.653,1,0.849,1,0.604,1,0.566,1,0.118,0
"""DP02732""",4,"""L""",0.958,1,0.554,1,0.869,1,0.665,1,0.851,1,0.586,1,0.584,1,0.104,0
"""DP02732""",5,"""V""",0.959,1,0.499,1,0.875,1,0.658,1,0.857,1,0.546,1,0.603,1,0.123,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""DP04416""",93,"""T""",0.629,1,0.993,1,0.609,1,0.332,1,0.606,1,0.199,0,0.521,1,0.579,1
"""DP04416""",94,"""Y""",0.634,1,0.797,1,0.61,1,0.404,1,0.609,1,0.185,0,0.557,1,0.649,1
"""DP04416""",95,"""T""",0.654,1,0.869,1,0.597,1,0.399,1,0.589,1,0.255,0,0.63,1,0.608,1
"""DP04416""",96,"""G""",0.64,1,0.841,1,0.598,1,0.468,1,0.585,1,0.306,0,0.715,1,0.552,1


In [21]:
# Reference annotations, loaded under the method name "CAID3".
# NOTE(review): this is an absolute, machine-specific path; consider deriving
# it from a project-level constant so the notebook runs on other machines.
reference_fasta_path = Path(
    "/home/julius/projects/University/Publications/UdonPred/caid2_reference/data/output/references/disorder_pdb.fasta"
)
reference_df = read_annotated_fasta_file(
    reference_fasta_path,
    contains_sequence=True,
    contains_continuous_scores=False,
    contains_binary_scores=True,
    null_value="-",
    method_name="CAID3",
)

# Attach the predictions by (protein, position); no `how` is passed, so the
# library's default join strategy applies. Afterwards only rows without a
# reference label are discarded -- rows with missing predictions are kept.
residue_df = reference_df.join(pred_df, on=["protein", "position"]).drop_nulls("CAID3_binary")
residue_df

protein,position,CAID3_binary,residue,PUNCH2_continuous,PUNCH2_binary,UdonPred_continuous,UdonPred_binary,AlphaFold2-RSA_continuous,AlphaFold2-RSA_binary,AlphaFold2-pLDDT_continuous,AlphaFold2-pLDDT_binary,AlphaFold3-RSA_continuous,AlphaFold3-RSA_binary,AlphaFold3-pLDDT_continuous,AlphaFold3-pLDDT_binary,SETH_continuous,SETH_binary,flDPnn_continuous,flDPnn_binary
str,i16,i8,str,f64,i64,f64,i32,f64,i32,f64,i32,f64,i32,f64,i32,f64,i64,f64,i64
"""DP02732""",1547,1,"""L""",0.612,1,0.548,1,0.792,1,0.634,1,0.811,1,0.577,1,0.599,1,0.003,0
"""DP02732""",1548,1,"""E""",0.587,1,0.457,1,0.79,1,0.7,1,0.809,1,0.535,1,0.58,1,0.002,0
"""DP02732""",1549,1,"""P""",0.566,1,0.423,1,0.783,1,0.604,1,0.801,1,0.577,1,0.565,1,0.002,0
"""DP02732""",1550,1,"""P""",0.54,1,0.359,0,0.778,1,0.665,1,0.798,1,0.581,1,0.532,1,0.001,0
"""DP02732""",1551,1,"""L""",0.506,1,0.37,0,0.778,1,0.694,1,0.81,1,0.569,1,0.503,1,0.001,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""DP04416""",93,1,"""T""",0.629,1,0.993,1,0.609,1,0.332,1,0.606,1,0.199,0,0.521,1,0.579,1
"""DP04416""",94,1,"""Y""",0.634,1,0.797,1,0.61,1,0.404,1,0.609,1,0.185,0,0.557,1,0.649,1
"""DP04416""",95,1,"""T""",0.654,1,0.869,1,0.597,1,0.399,1,0.589,1,0.255,0,0.63,1,0.608,1
"""DP04416""",96,1,"""G""",0.64,1,0.841,1,0.598,1,0.468,1,0.585,1,0.306,0,0.715,1,0.552,1


### Next steps
- generate missing predictions
    - UdonPred
    - AF3
- investigate slightly off negative/undefined residues
- try to find the right flDPnn version

In [22]:
# Persist the merged reference/prediction table as CSV under DATA_DIR.
per_residue_csv_path = DATA_DIR / "caid3/per_residue_predictions.csv"
residue_df.write_csv(per_residue_csv_path)