In [1]:
import os
import re
import numpy as np
import pandas as pd
from glob import glob

import src as sp

In [2]:
# parameters
# File lists per entry: "wt" holds the reference structures, every other entry
# holds the `*_rank_001_*.pdb` files found under that method's
# `alphafold_models_ss` directory.
filepaths = {
    "wt": glob("benchmark_data/wt/monomers/*.pdb"),
    "baseline": glob("benchmark_data/wt/monomers/alphafold_models_ss/*_rank_001_*.pdb"),
    "carbonara_maxseqid": glob("benchmark_data/carbonara/monomers/maxseqid/alphafold_models_ss/*_rank_001_*.pdb"),
    "carbonara_minseqid": glob("benchmark_data/carbonara/monomers/minseqid/alphafold_models_ss/*_rank_001_*.pdb"),
    "carbonara_minseqsim": glob("benchmark_data/carbonara/monomers/minseqsim/alphafold_models_ss/*_rank_001_*.pdb"),
    "mpnn": glob("benchmark_data/mpnn/monomers/alphafold_models_ss/*_rank_001_*.pdb"),
    "esm": glob("benchmark_data/esm/monomers/alphafold_models_ss/*_rank_001_*.pdb"),
}
# entries of `filepaths` that are scored against "wt"
keys = ["baseline", "carbonara_maxseqid", "carbonara_minseqid", "carbonara_minseqsim", "mpnn", "esm"]

# read structures
# fp_map[entry][sid] -> file path, where sid is the first match in the path of
# "/" followed by four characters from [0-9A-Z] and an optional "_<upper-case letter>".
fp_map = {}
for key in filepaths:
    fp_map[key] = {}
    for fp in filepaths[key]:
        sid = re.search(r'/([0-9A-Z]{4}(_[A-Z])?)', fp)[1]
        fp_map[key][sid] = fp

# evaluate structures
results = []
for key in keys:
    for sid in fp_map[key]:
        # only models whose sid also exists among the references can be scored
        if sid not in fp_map["wt"]:
            continue

        # get structures pair
        fp_ref = fp_map["wt"][sid]
        fp = fp_map[key][sid]

        # temporary pdb files without sequence information and backbone only:
        # both structures go through the same steps (backbone extraction, every
        # residue name overwritten with "GLY", split by chain) before scoring
        for src_path, tmp_path in ((fp_ref, "/tmp/ref.pdb"), (fp, "/tmp/mod.pdb")):
            structure = sp.extract_backbone(sp.read_pdb(src_path))
            structure['resname'][:] = "GLY"
            sp.save_pdb(sp.split_by_chain(structure), tmp_path)

        # run tmscore on the temporary files (model first, reference second);
        # the context manager closes the pipe once the output has been read
        command = "tmscore/TMscore /tmp/mod.pdb /tmp/ref.pdb"
        with os.popen(command) as pipe:
            output = pipe.read()

        # parse output; the dot is escaped so that only a decimal number matches
        tm_match = re.search(r'TM-score    = ([0-9]\.[0-9]*)', output)
        if tm_match is None:
            # fail with the offending pair instead of an opaque TypeError on None
            raise RuntimeError(
                "no TM-score found in TMscore output for sid={} method={} "
                "(model={}, reference={}): {!r}".format(sid, key, fp, fp_ref, output)
            )
        tmscore = float(tm_match[1])

        # store results
        results.append({
            "method": key,
            'sid': sid,
            'tmscore': tmscore,
        })

# pack results
dfm = pd.DataFrame(results).sort_values(["sid", "method"])
# keep only the sids that have one row for every method in `keys`
dfm = dfm.groupby("sid").filter(lambda x: len(x)==len(keys))
dfm.to_csv("results/monomers_structure_ss_tmscore.csv", index=False)
dfm

Unnamed: 0,method,sid,tmscore
5,baseline,1ABO_A,0.2348
147,carbonara_maxseqid,1ABO_A,0.9334
289,carbonara_minseqid,1ABO_A,0.3004
431,carbonara_minseqsim,1ABO_A,0.4004
715,esm,1ABO_A,0.8388
...,...,...,...
261,carbonara_maxseqid,6R3C_A,0.7052
403,carbonara_minseqid,6R3C_A,0.8355
545,carbonara_minseqsim,6R3C_A,0.7967
829,esm,6R3C_A,0.5381


In [3]:
# parameters
# File lists per entry: "wt" holds the reference structures, every other entry
# holds the `*_rank_001_*.pdb` files found under that method's
# `alphafold_models_ss` directory. No "baseline" entry is scored for dimers.
filepaths = {
    "wt": glob("benchmark_data/wt/dimers/*.pdb"),
    "carbonara_maxseqid": glob("benchmark_data/carbonara/dimers/maxseqid/alphafold_models_ss/*_rank_001_*.pdb"),
    "carbonara_minseqid": glob("benchmark_data/carbonara/dimers/minseqid/alphafold_models_ss/*_rank_001_*.pdb"),
    "carbonara_minseqsim": glob("benchmark_data/carbonara/dimers/minseqsim/alphafold_models_ss/*_rank_001_*.pdb"),
    "mpnn": glob("benchmark_data/mpnn/dimers/alphafold_models_ss/*_rank_001_*.pdb"),
    "esm": glob("benchmark_data/esm/dimers/alphafold_models_ss/*_rank_001_*.pdb"),
}
# entries of `filepaths` that are scored against "wt"
keys = ["carbonara_maxseqid", "carbonara_minseqid", "carbonara_minseqsim", "mpnn", "esm"]

# read structures
# fp_map[entry][sid] -> file path, where sid is the first match in the path of
# "/" followed by four characters from [0-9A-Z] and an optional "_<upper-case letter>".
fp_map = {}
for key in filepaths:
    fp_map[key] = {}
    for fp in filepaths[key]:
        sid = re.search(r'/([0-9A-Z]{4}(_[A-Z])?)', fp)[1]
        fp_map[key][sid] = fp

# evaluate structures
results = []
for key in keys:
    for sid in fp_map[key]:
        # references are looked up by the part of the sid before the first "_"
        pdbid = sid.split('_')[0]
        if pdbid not in fp_map["wt"]:
            continue

        # get structures pair
        fp_ref = fp_map["wt"][pdbid]
        fp = fp_map[key][sid]

        # temporary pdb files without sequence information and backbone only:
        # both structures go through the same steps (backbone extraction, every
        # residue name overwritten with "GLY", split by chain) before scoring
        for src_path, tmp_path in ((fp_ref, "/tmp/ref.pdb"), (fp, "/tmp/mod.pdb")):
            structure = sp.extract_backbone(sp.read_pdb(src_path))
            structure['resname'][:] = "GLY"
            sp.save_pdb(sp.split_by_chain(structure), tmp_path)

        # run tmscore on the temporary files (model first, reference second);
        # the context manager closes the pipe once the output has been read
        command = "tmscore/TMscore -c -ter 0 /tmp/mod.pdb /tmp/ref.pdb"
        with os.popen(command) as pipe:
            output = pipe.read()

        # parse output; the dot is escaped so that only a decimal number matches
        tm_match = re.search(r'TM-score    = ([0-9]\.[0-9]*)', output)
        if tm_match is None:
            # fail with the offending pair instead of an opaque TypeError on None
            raise RuntimeError(
                "no TM-score found in TMscore output for sid={} method={} "
                "(model={}, reference={}): {!r}".format(sid, key, fp, fp_ref, output)
            )
        tmscore = float(tm_match[1])

        # store results
        results.append({
            "method": key,
            'sid': sid,
            'tmscore': tmscore,
        })

# pack results
dfd = pd.DataFrame(results).sort_values(["sid", "method"])
# keep only the sids that have one row for every method in `keys`
dfd = dfd.groupby("sid").filter(lambda x: len(x)==len(keys))
dfd.to_csv("results/dimers_structure_ss_tmscore.csv", index=False)
dfd

Unnamed: 0,method,sid,tmscore
38,carbonara_maxseqid,1DAN_T,0.2813
100,carbonara_minseqid,1DAN_T,0.2610
162,carbonara_minseqsim,1DAN_T,0.4241
286,esm,1DAN_T,0.2352
224,mpnn,1DAN_T,0.4954
...,...,...,...
15,carbonara_maxseqid,6PNW_B,0.1879
77,carbonara_minseqid,6PNW_B,0.3002
139,carbonara_minseqsim,6PNW_B,0.2414
263,esm,6PNW_B,0.2485


In [4]:
# parameters
# File lists per entry: "wt" holds the reference structures, every other entry
# holds the `*_rank_001_*.pdb` files found under that option's
# `alphafold_models_msa` directory.
filepaths = {
    "wt": glob("benchmark_data/wt/monomers/*.pdb"),
    "carbonara_maxseqid": glob("benchmark_data/carbonara/monomers/maxseqid/alphafold_models_msa/*_rank_001_*.pdb"),
    "carbonara_minseqid": glob("benchmark_data/carbonara/monomers/minseqid/alphafold_models_msa/*_rank_001_*.pdb"),
    "carbonara_minseqsim": glob("benchmark_data/carbonara/monomers/minseqsim/alphafold_models_msa/*_rank_001_*.pdb"),
}
# entries of `filepaths` that are scored against "wt"
keys = ["carbonara_maxseqid", "carbonara_minseqid", "carbonara_minseqsim"]

# read structures
# fp_map[entry][sid] -> file path, where sid is the first match in the path of
# "/" followed by four characters from [0-9A-Z] and an optional "_<upper-case letter>".
fp_map = {}
for key in filepaths:
    fp_map[key] = {}
    for fp in filepaths[key]:
        sid = re.search(r'/([0-9A-Z]{4}(_[A-Z])?)', fp)[1]
        fp_map[key][sid] = fp

# evaluate structures
results = []
for key in keys:
    for sid in fp_map[key]:
        # only models whose sid also exists among the references can be scored
        if sid not in fp_map["wt"]:
            continue

        # get structures pair
        fp_ref = fp_map["wt"][sid]
        fp = fp_map[key][sid]

        # temporary pdb files without sequence information and backbone only:
        # both structures go through the same steps (backbone extraction, every
        # residue name overwritten with "GLY", split by chain) before scoring
        for src_path, tmp_path in ((fp_ref, "/tmp/ref.pdb"), (fp, "/tmp/mod.pdb")):
            structure = sp.extract_backbone(sp.read_pdb(src_path))
            structure['resname'][:] = "GLY"
            sp.save_pdb(sp.split_by_chain(structure), tmp_path)

        # run tmscore on the temporary files (model first, reference second);
        # the context manager closes the pipe once the output has been read
        command = "tmscore/TMscore /tmp/mod.pdb /tmp/ref.pdb"
        with os.popen(command) as pipe:
            output = pipe.read()

        # parse output; the dot is escaped so that only a decimal number matches
        tm_match = re.search(r'TM-score    = ([0-9]\.[0-9]*)', output)
        if tm_match is None:
            # fail with the offending pair instead of an opaque TypeError on None
            raise RuntimeError(
                "no TM-score found in TMscore output for sid={} method={} "
                "(model={}, reference={}): {!r}".format(sid, key, fp, fp_ref, output)
            )
        tmscore = float(tm_match[1])

        # store results
        results.append({
            "method": key,
            'sid': sid,
            'tmscore': tmscore,
        })

# pack results
dfo = pd.DataFrame(results).sort_values(["sid", "method"])
# keep only the sids that have one row for every method in `keys`
dfo = dfo.groupby("sid").filter(lambda x: len(x)==len(keys))
dfo.to_csv("results/monomers_options_structure_msa_tmscore.csv", index=False)
dfo

Unnamed: 0,method,sid,tmscore
5,carbonara_maxseqid,1ABO_A,0.9391
147,carbonara_minseqid,1ABO_A,0.9033
289,carbonara_minseqsim,1ABO_A,0.9125
124,carbonara_maxseqid,1ABQ_A,0.9192
266,carbonara_minseqid,1ABQ_A,0.8755
...,...,...,...
190,carbonara_minseqid,6PNW_B,0.0000
332,carbonara_minseqsim,6PNW_B,0.0000
119,carbonara_maxseqid,6R3C_A,0.8910
261,carbonara_minseqid,6R3C_A,0.8298


In [5]:
# parameters
# File lists per entry: "wt" holds the reference structures, every other entry
# holds the `*_rank_001_*.pdb` files found under that option's
# `alphafold_models_msa` directory.
filepaths = {
    "wt": glob("benchmark_data/wt/dimers/*.pdb"),
    "carbonara_maxseqid": glob("benchmark_data/carbonara/dimers/maxseqid/alphafold_models_msa/*_rank_001_*.pdb"),
    "carbonara_minseqid": glob("benchmark_data/carbonara/dimers/minseqid/alphafold_models_msa/*_rank_001_*.pdb"),
    "carbonara_minseqsim": glob("benchmark_data/carbonara/dimers/minseqsim/alphafold_models_msa/*_rank_001_*.pdb"),
}
# entries of `filepaths` that are scored against "wt"
keys = ["carbonara_maxseqid", "carbonara_minseqid", "carbonara_minseqsim"]

# read structures
# fp_map[entry][sid] -> file path, where sid is the first match in the path of
# "/" followed by four characters from [0-9A-Z] and an optional "_<upper-case letter>".
fp_map = {}
for key in filepaths:
    fp_map[key] = {}
    for fp in filepaths[key]:
        sid = re.search(r'/([0-9A-Z]{4}(_[A-Z])?)', fp)[1]
        fp_map[key][sid] = fp

# evaluate structures
results = []
for key in keys:
    for sid in fp_map[key]:
        # references are looked up by the part of the sid before the first "_"
        pdbid = sid.split('_')[0]
        if pdbid not in fp_map["wt"]:
            continue

        # get structures pair
        fp_ref = fp_map["wt"][pdbid]
        fp = fp_map[key][sid]

        # temporary pdb files without sequence information and backbone only:
        # both structures go through the same steps (backbone extraction, every
        # residue name overwritten with "GLY", split by chain) before scoring
        for src_path, tmp_path in ((fp_ref, "/tmp/ref.pdb"), (fp, "/tmp/mod.pdb")):
            structure = sp.extract_backbone(sp.read_pdb(src_path))
            structure['resname'][:] = "GLY"
            sp.save_pdb(sp.split_by_chain(structure), tmp_path)

        # run tmscore on the temporary files (model first, reference second);
        # the context manager closes the pipe once the output has been read
        command = "tmscore/TMscore -c -ter 0 /tmp/mod.pdb /tmp/ref.pdb"
        with os.popen(command) as pipe:
            output = pipe.read()

        # parse output; the dot is escaped so that only a decimal number matches
        tm_match = re.search(r'TM-score    = ([0-9]\.[0-9]*)', output)
        if tm_match is None:
            # fail with the offending pair instead of an opaque TypeError on None
            raise RuntimeError(
                "no TM-score found in TMscore output for sid={} method={} "
                "(model={}, reference={}): {!r}".format(sid, key, fp, fp_ref, output)
            )
        tmscore = float(tm_match[1])

        # store results
        results.append({
            "method": key,
            'sid': sid,
            'tmscore': tmscore,
        })

# pack results
# NOTE(review): `dfd` is also the name of the dimers `alphafold_models_ss`
# results built earlier in this notebook; this assignment overwrites it.
dfd = pd.DataFrame(results).sort_values(["sid", "method"])
# keep only the sids that have one row for every method in `keys`
dfd = dfd.groupby("sid").filter(lambda x: len(x)==len(keys))
# write the dimer frame computed in this cell (previously `dfo`, the monomer
# frame from the preceding cell, was saved under the dimers file name)
dfd.to_csv("results/dimers_options_structure_msa_tmscore.csv", index=False)
dfd

Unnamed: 0,method,sid,tmscore
38,carbonara_maxseqid,1DAN_T,0.9744
100,carbonara_minseqid,1DAN_T,0.9594
162,carbonara_minseqsim,1DAN_T,0.6444
10,carbonara_maxseqid,1DAN_U,0.9766
72,carbonara_minseqid,1DAN_U,0.9570
...,...,...,...
94,carbonara_minseqid,6PNW_A,0.4752
156,carbonara_minseqsim,6PNW_A,0.2038
15,carbonara_maxseqid,6PNW_B,0.4859
77,carbonara_minseqid,6PNW_B,0.4875
