In [1]:
# collect all ids

from pathlib import Path

directory_path = Path("failed_logs")

# One entry per distinct file stem: files that share a stem collapse into a single id.
ids = {entry.stem for entry in directory_path.iterdir() if entry.is_file()}

len(ids)

57

In [2]:
# Jobs whose `.err` log contains the text 'IndexError' on at least one line.
index_error = set()
for job_id in ids:  # named `job_id` so the builtin `id` is not shadowed for the rest of the kernel session
    with open("failed_logs/" + job_id + '.err') as log:
        # any() stops reading at the first matching line; a single hit is enough to classify the job
        if any('IndexError' in line for line in log):
            index_error.add(job_id)
print(len(index_error))
print(index_error)

19
{'PDBBdock_26886092_4595', 'PDBBdock_26886092_6230', 'PDBBdock_26886092_1246', 'PDBBdock_26886092_2703', 'PoBudock_26886809_319', 'PDBBdock_26886092_4552', 'Astdock_26886808_59', 'PoBudock_26886809_196', 'PDBBdock_26886092_5203', 'PDBBdock_26886092_7288', 'PDBBdock_26886092_7390', 'PDBBdock_26886092_8654', 'PDBBdock_26886092_10212', 'PDBBdock_26886092_5132', 'PDBBdock_26886092_6261', 'PoBudock_26886809_54', 'PDBBdock_26886092_9298', 'PDBBdock_26886092_4478', 'PDBBdock_26886092_5135'}


In [3]:
# Jobs whose `.err` log contains the text 'OSError' on at least one line.
oserror = set()
for job_id in ids:  # avoid `id` as a loop name: it would replace the builtin at notebook scope
    with open("failed_logs/" + job_id + '.err') as log:
        # stop scanning the file as soon as one line matches
        if any('OSError' in line for line in log):
            oserror.add(job_id)
print(len(oserror))
print(oserror)

8
{'PDBBdock_26886092_10714', 'PoBudock_26886809_413', 'PoBudock_26886809_85', 'PoBudock_26886809_386', 'PoBudock_26886809_365', 'PoBudock_26886809_144', 'PoBudock_26886809_106', 'PoBudock_26886809_217'}


In [4]:
# `rosetta_errors` is the running union of several categories; this cell starts it
# with the jobs whose `.err` log contains 'Unrecognized residue'.
rosetta_errors = set()
unrec_res = set()
for job_id in ids:  # `job_id` rather than `id`, which would shadow the builtin
    with open("failed_logs/" + job_id + '.err') as log:
        # one matching line is enough, so stop reading at the first hit
        if any('Unrecognized residue' in line for line in log):
            unrec_res.add(job_id)
rosetta_errors.update(unrec_res)
# prints: size of the union so far, size of this category
print(len(rosetta_errors), len(unrec_res))

16 16


In [5]:
# Jobs whose `.err` log contains 'with variant'; they are merged into `rosetta_errors`.
variant = set()
for job_id in ids:  # `job_id` rather than `id`, which would shadow the builtin
    with open("failed_logs/" + job_id + '.err') as log:
        # short-circuits on the first matching line
        if any('with variant' in line for line in log):
            variant.add(job_id)
rosetta_errors.update(variant)
# prints: size of the union so far, size of this category
print(len(rosetta_errors), len(variant))
print(variant)

17 1
{'PoBudock_26886809_272'}


In [6]:
# Jobs whose `.err` log contains 'disulfide'; they are merged into `rosetta_errors`.
disulfide = set()
for job_id in ids:  # `job_id` rather than `id`, which would shadow the builtin
    with open("failed_logs/" + job_id + '.err') as log:
        # short-circuits on the first matching line
        if any('disulfide' in line for line in log):
            disulfide.add(job_id)
rosetta_errors.update(disulfide)
# prints: size of the union so far, size of this category
print(len(rosetta_errors), len(disulfide))

19 2


In [7]:
# Jobs whose `.err` log contains the literal text 'missing_atom'; they are merged into `rosetta_errors`.
missing_atom = set()
for job_id in ids:  # `job_id` rather than `id`, which would shadow the builtin
    with open("failed_logs/" + job_id + '.err') as log:
        # short-circuits on the first matching line
        if any('missing_atom' in line for line in log):
            missing_atom.add(job_id)
rosetta_errors.update(missing_atom)
# prints: size of the union so far, size of this category
print(len(rosetta_errors), len(missing_atom))

24 5


In [8]:
# Jobs whose `.err` log contains either 'Out Of Memory' or 'Bus error'.
oom = set()
for job_id in ids:  # `job_id` rather than `id`, which would shadow the builtin
    with open("failed_logs/" + job_id + '.err') as log:
        # either marker on any line classifies the job; stop at the first hit
        if any('Out Of Memory' in line or 'Bus error' in line for line in log):
            oom.add(job_id)
print(len(oom))
print(oom)

0
set()


In [9]:
# Jobs whose `.err` log contains the text 'KeyError' on at least one line.
keyerror = set()
for job_id in ids:  # `job_id` rather than `id`, which would shadow the builtin
    with open("failed_logs/" + job_id + '.err') as log:
        # short-circuits on the first matching line
        if any('KeyError' in line for line in log):
            keyerror.add(job_id)
print(len(keyerror))
print(keyerror)

5
{'PoBudock_26886809_412', 'PoBudock_26886809_30', 'PoBudock_26886809_252', 'PoBudock_26886809_287', 'PoBudock_26886809_11'}


In [10]:
# Pool every job that matched one of the categories, compare the pool size with the
# total, and display the jobs that no category accounts for.
currently_collected = set() | keyerror | oom | rosetta_errors | oserror | index_error
print(len(currently_collected), len(ids))
ids - currently_collected

56 57


{'PoBudock_26886809_201'}

In [11]:
# Print the size of each category, one "<label> <count>" line per category.
for label, members in (
    ('keyerror', keyerror),
    ('oom', oom),
    ('rosetta_errors', rosetta_errors),
    ('oserror', oserror),
    ('index_error', index_error),
):
    print(label, len(members))

keyerror 5
oom 0
rosetta_errors 24
oserror 8
index_error 19


In [12]:
# For every job whose id contains 'PDBBdock', take the last whitespace-separated token
# of the first line in its `.out` log that contains 'selected complex'.
known_failed_pdbs = []
for job_id in ids:  # `job_id` rather than `id`, which would shadow the builtin
    if 'PDBBdock' not in job_id:
        continue
    with open("failed_logs/" + job_id + '.out') as log:
        for line in log:
            if 'selected complex' in line:
                # `line` stays a str; the token list is used directly instead of rebinding it
                known_failed_pdbs.append(line.strip().split()[-1])
                break  # only the first matching line is used
print(known_failed_pdbs)

['2xys', '3zdh', '3c79', '1uv6', '3u8k', '2ymd', '6qri', '5afk', '5afl', '5lxb', '5oaj', '2yme', '3zdg', '6ezq', '3u8n', '4dbm']


In [13]:
from data import PDBRinterface

# Build the interface over the 'pdbbind_h5/' results and display how many entries it lists.
pdbbind_results_dir = 'pdbbind_h5/'
results = PDBRinterface(pdbbind_results_dir)
len(results.results)

11460

In [14]:
# Spot-check a single entry to see which `type` labels its docking table carries.
res = results.get_result('4j3m')
relax_rows = res.relax_df
# NOTE: a cell only displays its last expression, so this count is evaluated but never shown.
len(relax_rows[relax_rows.type == 'crystal'])
res.docking_df.type.unique()

array(['crystal_docking_perturb', 'crystal_docking_std',
       'relax_docking_perturb', 'relax_docking_std',
       'apo_relax_docking_perturb', 'apo_relax_docking_std'], dtype=object)

In [15]:
from tqdm import tqdm

failed_pdbs = []
progresses = {}

# One row per progress slot: (attribute of the result holding the table,
# value of that table's `type` column, row count the slot is divided by).
# A slot value of 1.0 therefore means exactly that many rows are present.
progress_stages = [
    ('relax_df', 'crystal', 1),
    ('relax_df', 'relax', 10),
    ('relax_df', 'apo_relax', 10),
    ('docking_df', 'crystal_docking_perturb', 300),
    ('docking_df', 'crystal_docking_std', 300),
    ('docking_df', 'relax_docking_perturb', 300),
    ('docking_df', 'relax_docking_std', 300),
    ('docking_df', 'apo_relax_docking_perturb', 300),
    ('docking_df', 'apo_relax_docking_std', 300),
]

for pdb in tqdm(results.results):
    try:
        res = results.get_result(pdb, False)
    except Exception:
        # `Exception`, not a bare `except:`, so that interrupting this long loop
        # (KeyboardInterrupt) or a SystemExit is not silently recorded as a failed entry.
        failed_pdbs.append(pdb)
        continue

    progress = []
    for frame_name, stage, expected_rows in progress_stages:
        frame = getattr(res, frame_name)
        progress.append(len(frame[frame.type == stage]) / expected_rows)
    progresses[pdb] = progress

print(failed_pdbs)
progresses
    

100%|██████████| 11460/11460 [09:49<00:00, 19.44it/s]

['6qri']





{'4j3m': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 '4eo6': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 '6n79': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 '1txr': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 '4j28': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 '1pbq': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 '5l9o': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 '2vh6': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 '5ab9': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 '4lw1': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 '2jj3': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 '4o7b': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 '3su1': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 '2ybu': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 '5d3l': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 '4qf7': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 '1w1p': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 '4der': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0

In [16]:
# Per progress slot, count how many entries are below 1.0, and print each such entry.
# The slot count is taken from whichever entry comes first instead of from a
# hard-coded key, so this cell still runs when a particular entry failed to load
# (or when `progresses` is empty).
first_progress = next(iter(progresses.values()), [])
base = [0] * len(first_progress)
for key, progress in progresses.items():
    lagging_slots = [slot for slot in range(len(base)) if progress[slot] < 1.0]
    for slot in lagging_slots:
        base[slot] += 1
    if lagging_slots:
        print(key, progress)

base

3zdh [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]
5lxb [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]
3u8k [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]
3zdg [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]
6ezq [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]
4dbm [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]
3u8n [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]
2yme [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]
2xys [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]
5oaj [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]
1uv6 [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]
5afk [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]
5afl [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]
3c79 [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]
2ymd [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]


[0, 0, 0, 0, 0, 11, 11, 15, 15]

In [17]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# automatically before each cell executes.
%load_ext autoreload
%autoreload 2

In [18]:
from data import PDBRinterface, Result
from tqdm import tqdm

results_ast = PDBRinterface('astex_h5/')
print(len(results_ast.results))

failed_pdbs_ast = []
progresses_ast = {}

# One row per progress slot: (attribute of the result holding the table,
# value of that table's `type` column, row count the slot is divided by).
progress_stages = [
    ('relax_df', 'crystal', 1),
    ('relax_df', 'relax', 10),
    ('relax_df', 'apo_relax', 10),
    ('docking_df', 'crystal_docking_perturb', 300),
    ('docking_df', 'crystal_docking_std', 300),
    ('docking_df', 'relax_docking_perturb', 300),
    ('docking_df', 'relax_docking_std', 300),
    ('docking_df', 'apo_relax_docking_perturb', 300),
    ('docking_df', 'apo_relax_docking_std', 300),
]

for pdb in tqdm(results_ast.results):
    try:
        res = results_ast.get_result(pdb, False)
    except Exception:
        # `Exception`, not a bare `except:`, so KeyboardInterrupt / SystemExit
        # propagate instead of being counted as a failed entry.
        failed_pdbs_ast.append(pdb)
        continue

    progress = []
    for frame_name, stage, expected_rows in progress_stages:
        frame = getattr(res, frame_name)
        progress.append(len(frame[frame.type == stage]) / expected_rows)
    progresses_ast[pdb] = progress

print(failed_pdbs_ast)

# Per progress slot, the number of entries whose value is below 1.0.
base = [0] * len(progress_stages)
for progress in progresses_ast.values():
    for slot, fraction in enumerate(progress):
        if fraction < 1.0:
            base[slot] += 1

base

85


100%|██████████| 85/85 [00:03<00:00, 22.63it/s]

[]





[0, 0, 0, 1, 1, 1, 1, 1, 1]

In [19]:
results_pobu = PDBRinterface('posebuster_h5/')
print(len(results_pobu.results))

failed_pdbs_pobu = []
progresses_pobu = {}

# One row per progress slot: (attribute of the result holding the table,
# value of that table's `type` column, row count the slot is divided by).
progress_stages = [
    ('relax_df', 'crystal', 1),
    ('relax_df', 'relax', 10),
    ('relax_df', 'apo_relax', 10),
    ('docking_df', 'crystal_docking_perturb', 300),
    ('docking_df', 'crystal_docking_std', 300),
    ('docking_df', 'relax_docking_perturb', 300),
    ('docking_df', 'relax_docking_std', 300),
    ('docking_df', 'apo_relax_docking_perturb', 300),
    ('docking_df', 'apo_relax_docking_std', 300),
]

for pdb in tqdm(results_pobu.results):
    try:
        res = results_pobu.get_result(pdb, False)
    except Exception:
        # `Exception`, not a bare `except:`, so KeyboardInterrupt / SystemExit
        # propagate instead of being counted as a failed entry.
        failed_pdbs_pobu.append(pdb)
        continue

    progress = []
    for frame_name, stage, expected_rows in progress_stages:
        frame = getattr(res, frame_name)
        progress.append(len(frame[frame.type == stage]) / expected_rows)
    progresses_pobu[pdb] = progress

print(len(failed_pdbs_pobu))

# Per progress slot, the number of entries whose value is below 1.0.
base = [0] * len(progress_stages)
for progress in progresses_pobu.values():
    for slot, fraction in enumerate(progress):
        if fraction < 1.0:
            base[slot] += 1

base

428


100%|██████████| 428/428 [00:18<00:00, 22.53it/s]

29





[2, 4, 6, 8, 8, 9, 10, 11, 11]

In [24]:
directory_path = Path("failed_logs")

# Group the id found inside each `.out` log by the job-name prefix of the file
# (the part of the stem before the first underscore).
# NOTE: this rebinds `ids`, which the first cell builds as a set of file stems, to a
# dict of lists. Cells that iterate `ids` as that set would see these three prefixes
# instead if they are re-run after this cell.
ids = {
    'PoBudock' : [],
    'PDBBdock' : [],
    'Astdock' : [],
}

for file_path in directory_path.iterdir():
    if not (file_path.is_file() and file_path.suffix == '.out'):
        continue
    name = file_path.stem.split('_')[0]
    with open(file_path) as file:
        # Last token of the first line containing 'Process '; None when no line matches.
        # Resolving it per file keeps a value from the previous file from being reused.
        pdbid = next(
            (line.strip().split()[-1] for line in file if 'Process ' in line),
            None,
        )
    if pdbid is None:
        print('no "Process " line found in', file_path.name)
        continue
    # setdefault: a prefix outside the three listed above gets its own list instead of a KeyError
    ids.setdefault(name, []).append(pdbid)

for key in ids:
    print(key)
    for pdbids in ids[key]:
        print(pdbids)

PoBudock
8CI0_8EL
8C3N_ADP
8BPL
6VTA_AKN
5SAK_ZRY
7B2C_TP7
7WUX_6OI
7OKF_VH5
7UMW_NAD
7PA4
8AP0_PRP
6YSP_PAL
7V3N_AKG
8EYE_X4I
7T0D_FPP
7L7C_XQ1
7OP9_06K
7M31_TDR
8EX2_Q2Q
7D6O_MTE
7TSF_H4B
7ZZW_KKW
7Q5I_I0F
7SUC_COM
7VB8_STL
7TOM_5AD
7NML_I7B
7RKW_5TV
7NR8_UOE
8CGC_LMR
7Q25_8J9
8F4J_PHO
7QHL_D5P
7ZTL_BCN
7AMC_73B
7ZXV_45D
7QE4_NGA
7VC5_9SF
7FRX_O88
7MSR_DCA
PDBBdock
3c79
4dbm
3zdh
2ymd
5lxb
3zdg
3u8k
1uv6
2xys
5afk
5afl
6qri
2yme
5oaj
3u8n
6ezq
Astdock
1TZ8_DES
