In [5]:
from pathlib import Path

import polars as pl

from udonpred_benchmarking.constants import (
    ALPHAFOLD_DISORDER_THRESHOLD,
    DATA_DIR, PROJECT_DIR,
    SETH_THRESHOLD, TRIZOD_THRESHOLD
)
from udonpred_benchmarking.postprocessing import postprocess_udonpred_df
from udonpred_benchmarking.result_parsing.alphafold_disorder import prepare_af2_files, prepare_af3_files
from udonpred_benchmarking.result_parsing.caid import read_caid_files, read_multi_caid_file
from udonpred_benchmarking.result_parsing.fasta import read_annotated_fasta_file
from udonpred_benchmarking.result_parsing.fldpnn import read_fldpnn_file
from udonpred_benchmarking.result_parsing.odinpred import prepare_odinpred_files, read_odinpred_files

In [2]:
# Base directory (under PROJECT_DIR) containing the `predictions/trizod` outputs;
# the loading cells below build all of their input paths from it.
method_result_directory = PROJECT_DIR / "predictions/trizod"

### Regular CAID files (UdonPred, PUNCH2, PUNCH2-light)

In [3]:
# Locations of the two PUNCH2 result folders.
punch2_light_dir = method_result_directory / "PUNCH2-light/results/disorder"
punch2_dir = method_result_directory / "PUNCH2/results/disorder"

# Read each folder with the CAID reader. The second argument is presumably the
# method label used to prefix the resulting score columns — confirm in
# read_caid_files.
punch2_light_df = read_caid_files(punch2_light_dir, "PUNCH2-light")
punch2_df = read_caid_files(punch2_dir, "PUNCH2")

In [22]:
# Read the UdonPred results and hand them directly to the UdonPred-specific
# post-processing step. The reader is told that the files carry no binary
# scores and is given TRIZOD_THRESHOLD (presumably to derive them — confirm in
# read_caid_files).
udonpred_df = postprocess_udonpred_df(
    read_caid_files(
        method_result_directory / "UdonPred/disorder",
        "UdonPred",
        has_binary_scores=False,
        binary_threshold=TRIZOD_THRESHOLD,
    )
)

### SETH

In [7]:
# Reusable polars expressions for the rescaling below. min()/max() are taken
# over the whole column (no grouping), i.e. across all proteins at once.
seth_score = pl.col("SETH_continuous")
seth_score_range = seth_score.max() - seth_score.min()

# Read the SETH predictions, then replace the continuous score by its inverted
# min-max scaling: 1 - (x - min) / (max - min). Only "SETH_continuous" is
# overwritten here; no other column is recomputed by this step.
# Author's note carried over from the original cell:
#   new threshold: < (1 - 0.621)
# NOTE(review): presumably the inversion aligns SETH's score direction with the
# other predictors — confirm.
seth_df = read_annotated_fasta_file(
    method_result_directory / "SETH/SETH_trizod_test_set.fasta",
    contains_sequence=False,
    method_name="SETH",
    separator=", ",
    binary_threshold=SETH_THRESHOLD,
).with_columns(
    (1 - (seth_score - seth_score.min()) / seth_score_range).alias("SETH_continuous")
)

protein,position,SETH_continuous,SETH_binary
str,i64,f64,i32
"""17602_1_1_1""",1,0.64627,0
"""17602_1_1_1""",2,0.594339,0
"""17602_1_1_1""",3,0.569543,0
"""17602_1_1_1""",4,0.533415,0
"""17602_1_1_1""",5,0.461051,0
…,…,…,…
"""15973_1_1_1""",11,0.530063,0
"""15973_1_1_1""",12,0.51261,0
"""15973_1_1_1""",13,0.494732,0
"""15973_1_1_1""",14,0.574651,0


### AlphaFold

#### AlphaFold2-Disorder

In [6]:
# One-off preparation step, kept for reference only (not executed in this cell).
# It is written with `method_result_directory`, the base path defined in this
# notebook; the earlier snippet used `method_result_path`, which is not defined
# here.
# NOTE(review): the FASTA location below is specific to one machine's home
# directory — point it at a project-relative path before re-enabling.
#
# prepare_af2_files(
#     method_result_directory / "af2/af2_output",
#     method_result_directory / "af2/renamed_pdb_files",
#     Path.home() / "projects/University/2023W/PP2/Project/data/TriZOD_test_set.fasta",
# )

# AlphaFold2-based disorder scores from the `disorder-25.dat` output file,
# loaded under the "AlphaFold2-RSA" label. The file is declared to have no
# binary scores, and ALPHAFOLD_DISORDER_THRESHOLD is passed as the threshold.
af2_rsa_df = read_multi_caid_file(
    method_result_directory / "af2/af_disorder_output/disorder-25.dat",
    "AlphaFold2-RSA",
    has_binary_scores=False,
    binary_threshold=ALPHAFOLD_DISORDER_THRESHOLD,
)

# Same reader and settings for the `disorder.dat` output file, loaded under the
# "AlphaFold2-pLDDT" label.
af2_plddt_df = read_multi_caid_file(
    method_result_directory / "af2/af_disorder_output/disorder.dat",
    "AlphaFold2-pLDDT",
    has_binary_scores=False,
    binary_threshold=ALPHAFOLD_DISORDER_THRESHOLD,
)

#### AlphaFold3-Disorder

In [7]:
# One-off preparation step, left disabled. Written with `method_result_directory`,
# the base path defined in this notebook (the earlier comment used
# `method_result_path`, which is not defined here):
# prepare_af3_files(method_result_directory / "af3/af3_output", method_result_directory / "af3/mmcif_files")

# Keyword arguments shared by both AlphaFold3 reads: tab-separated input, no
# binary scores in the file, ALPHAFOLD_DISORDER_THRESHOLD as the threshold.
af3_read_options = {
    "separator": "\t",
    "has_binary_scores": False,
    "binary_threshold": ALPHAFOLD_DISORDER_THRESHOLD,
}

# `output_disorder-25.dat` is loaded under the "AlphaFold3-RSA" label ...
af3_rsa_df = read_multi_caid_file(
    method_result_directory / "af3/af_disorder_output/output_disorder-25.dat",
    "AlphaFold3-RSA",
    **af3_read_options,
)

# ... and `output_disorder.dat` under the "AlphaFold3-pLDDT" label.
af3_plddt_df = read_multi_caid_file(
    method_result_directory / "af3/af_disorder_output/output_disorder.dat",
    "AlphaFold3-pLDDT",
    **af3_read_options,
)

### flDPnn

In [8]:
# flDPnn results are stored in a single CSV file; read it with the dedicated reader.
fldpnn_results_path = method_result_directory / "flDPnn/fldpnn_results.csv"
fldpnn_df = read_fldpnn_file(fldpnn_results_path)

### ODiNPred

In [9]:
# Folder holding the ODiNPred output; used by both the (disabled) preparation
# step and the reader below.
odinpred_output_path = method_result_directory / "ODiNPred"
# One-off preparation step, left disabled. Presumably it has to be run once on
# fresh ODiNPred output before the reader can be used — TODO confirm.
# prepare_odinpred_files(odinpred_output_path)
odinpred_df = read_odinpred_files(odinpred_output_path)

### Merging

In [10]:
# Key columns shared by every predictor frame except SETH.
join_columns = ["protein", "position", "residue"]

# Start from UdonPred and join the remaining predictors one after another, in a
# fixed order so the column order of the result is stable. `join` is called
# without `how`, so polars' default join strategy applies (inner at the time of
# writing): a residue is kept only if it is present in every frame.
pred_df = udonpred_df
for other_pred_df in (
    punch2_df,
    punch2_light_df,
    af2_rsa_df,
    af2_plddt_df,
    af3_rsa_df,
    af3_plddt_df,
    fldpnn_df,
    odinpred_df,
):
    pred_df = pred_df.join(other_pred_df, on=join_columns)

# SETH is joined on protein and position only (the first two key columns); it
# was read with contains_sequence=False, so presumably it has no residue column.
pred_df = pred_df.join(seth_df, on=join_columns[:2])

In [23]:
# Columns taken from the TriZOD table; everything else in the CSV is ignored.
trizod_columns = ["ID", "seq_index", "seq", "k", "zscores", "pscores"]

zscore = pl.col("zscores")

# Four-way label derived from the z-score: < 3, < 8, < 11, and everything else.
# NOTE(review): a null z-score makes every `when` predicate null, which polars
# does not treat as a match, so such a row would fall through to "ordered".
# Only null `pscores` rows are dropped below — confirm that no null z-scores
# can remain at this point.
zscore_category_expr = (
    pl.when(zscore < 3).then(pl.lit("fully_disordered"))
    .when(zscore < 8).then(pl.lit("fractionally_ordered"))
    .when(zscore < 11).then(pl.lit("flexible_loops"))
    .otherwise(pl.lit("ordered"))
    .alias("zscore_category")
)

# Binary label from the p-score: 0 when pscores <= TRIZOD_THRESHOLD, otherwise 1.
pscores_binary_expr = (
    pl.when(pl.col("pscores") <= TRIZOD_THRESHOLD)
    .then(0)
    .otherwise(1)
    .alias("pscores_binary")
)

# Attach the merged predictions to the TriZOD rows (matched on protein ID and
# sequence position), cast the z-scores to float, drop rows without a p-score,
# add both derived labels, and finally switch to the protein / position /
# residue naming. The predictors' own "residue" column is dropped first so that
# the TriZOD "seq" column can take over that name.
residue_df = (
    pl.read_csv(method_result_directory / "TriZOD/unfiltered.csv", columns=trizod_columns)
    .join(pred_df, left_on=["ID", "seq_index"], right_on=["protein", "position"])
    .with_columns(zscore.cast(pl.Float64))
    .drop_nulls("pscores")
    .with_columns(zscore_category_expr, pscores_binary_expr)
    .drop("residue")
    .rename({"ID": "protein", "seq_index": "position", "seq": "residue"})
)
residue_df

ID,seq_index,seq,k,zscores,pscores,UdonPred_continuous,UdonPred_binary,PUNCH2_continuous,PUNCH2_binary,PUNCH2-light_continuous,PUNCH2-light_binary,AlphaFold2-RSA_continuous,AlphaFold2-RSA_binary,AlphaFold2-pLDDT_continuous,AlphaFold2-pLDDT_binary,AlphaFold3-RSA_continuous,AlphaFold3-RSA_binary,AlphaFold3-pLDDT_continuous,AlphaFold3-pLDDT_binary,flDPnn_continuous,flDPnn_binary,ODiNPred_continuous,ODiNPred_binary,SETH_continuous,SETH_binary,zscore_category,pscores_binary
str,i64,str,i64,f64,f64,f64,i32,f64,i64,f64,i64,f64,i32,f64,i32,f64,i32,f64,i32,f64,i8,f64,i32,f64,i32,str,i32
"""30161_1_1_1""",2,"""I""",14,11.1609,0.0947,0.122,0,0.175,0,0.094,0,0.282,0,0.277,0,0.265,0,0.036,0,0.043,0,0.0594,0,0.268171,0,"""ordered""",0
"""30161_1_1_1""",3,"""R""",21,14.1291,0.0736,0.088667,0,0.105,0,0.041,0,0.267,0,0.161,0,0.249,0,0.028,0,0.041,0,0.0511,0,0.104289,0,"""ordered""",0
"""30161_1_1_1""",4,"""T""",21,15.1705,0.0332,0.070333,0,0.076,0,0.026,0,0.277,0,0.109,0,0.256,0,0.016,0,0.032,0,0.0482,0,0.033521,0,"""ordered""",0
"""30161_1_1_1""",5,"""I""",21,14.728,0.0497,0.055333,0,0.063,0,0.019,0,0.27,0,0.088,0,0.253,0,0.013,0,0.024,0,0.0329,0,0.011706,0,"""ordered""",0
"""30161_1_1_1""",6,"""L""",21,15.1002,0.0358,0.048333,0,0.06,0,0.018,0,0.27,0,0.078,0,0.253,0,0.012,0,0.016,0,0.0326,0,0.009844,0,"""ordered""",0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""15582_1_1_1""",10,"""T""",15,13.043,0.0248,0.143333,0,0.119,0,0.098,0,0.716,1,0.109,0,0.663,1,0.03,0,0.631,1,0.0,0,0.372991,0,"""ordered""",0
"""15582_1_1_1""",11,"""S""",15,12.8633,0.0323,0.209667,0,0.179,0,0.117,0,0.711,1,0.094,0,0.665,1,0.03,0,0.658,1,0.0,0,0.432372,1,"""ordered""",0
"""15582_1_1_1""",12,"""L""",13,11.8975,0.0365,0.283667,0,0.266,0,0.163,0,0.7,1,0.102,0,0.658,1,0.024,0,0.703,1,0.0,0,0.462488,1,"""ordered""",0
"""15582_1_1_1""",13,"""T""",11,9.8587,0.098,0.283333,0,0.397,1,0.259,0,0.688,1,0.086,0,0.646,1,0.027,0,0.703,1,0.0,0,0.5539,1,"""flexible_loops""",0


In [15]:
# Persist the merged per-residue table as CSV under DATA_DIR.
per_residue_csv_path = DATA_DIR / "trizod/per_residue_predictions.csv"

# Create the target folder first so that a fresh checkout (where it may not
# exist yet) does not fail at the very last step of the notebook.
# NOTE(review): assumes DATA_DIR is a pathlib.Path, as its use with `/` suggests.
per_residue_csv_path.parent.mkdir(parents=True, exist_ok=True)

residue_df.write_csv(per_residue_csv_path)