In [1]:
import os
import glob
import tqdm
import pandas as pd
import math

In [2]:
# --- Rosetta installations: one binary directory per machine ("PARIS" and "SOUTH") ---
ROSETTA_BIN_DIR_PARIS = "/home/ml/novozymes-prediction/resources/rosetta/rosetta_bin_linux_2021.16.61629_bundle/main/source/bin/"
ROSETTA_BIN_DIR_SOUTH = "/home/tom/Documents/Kaggle/novozymes-prediction/resources/rosetta_bin_linux_2021.16.61629_bundle/main/source/bin/"
# Both directories end with "/", so plain concatenation yields the full path to the relax binary.
RELAX_BIN_PARIS = ROSETTA_BIN_DIR_PARIS + "relax.static.linuxgccrelease"
RELAX_BIN_SOUTH = ROSETTA_BIN_DIR_SOUTH + "relax.static.linuxgccrelease"

# --- Worker counts per machine; THREADS is the total number of worker slots ---
THREADS_PARIS, THREADS_SOUTH, THREADS_GCP = 12, 8, 32
THREADS = sum((THREADS_PARIS, THREADS_SOUTH, THREADS_GCP))

# --- Command-line flags appended to every relax invocation ---
MAX_CYCLES = 10000
ROSETTA_PARAMETERS = " ".join(
    (
        "-relax:constrain_relax_to_start_coords",
        "-out:suffix _relaxed",
        "-out:no_nstruct_label",
        "-relax:ramp_constraints false",
        f"-default_max_cycles {MAX_CYCLES}",
    )
)

# --- Input dataset (CSV with an `alphafold_path` column), relative to this notebook ---
DATASET_PATH = "../data/main_dataset_creation/outputs/all_v2/dataset_with_alphafold_paths.csv"

In [4]:
# Load the dataset and list every distinct value of its `alphafold_path` column.
df = pd.read_csv(DATASET_PATH)
alphafold_paths = df.alphafold_path.unique().tolist()

# Names of the structures that already have a relaxed PDB on disk.
# `relaxed_suffix` must match the `-out:suffix` value in ROSETTA_PARAMETERS. Cutting the file
# stem at that suffix (instead of at the first "_") keeps structure names that themselves
# contain an underscore intact, so they are not queued again on every run.
# NOTE: without recursive=True, "**" behaves like "*", so only PDB files located exactly one
# directory below ./relaxed_pdb are matched.
relaxed_suffix = "_relaxed"
already_computed_relaxed = [
    os.path.splitext(os.path.basename(relaxed_pdb))[0].split(relaxed_suffix)[0]
    for relaxed_pdb in glob.glob("./relaxed_pdb/**/*.pdb")
]
# Set for O(1) membership tests in the loop below.
already_computed_names = set(already_computed_relaxed)

# unique() keeps missing values (NaN/None) as entries; drop them explicitly instead of
# assuming exactly one of them is present.
valid_alphafold_paths = [path for path in alphafold_paths if not pd.isna(path)]

# Queue every structure whose file stem has no relaxed counterpart yet.
to_compute = []
for alphafold_path in valid_alphafold_paths:
    name, _ = os.path.splitext(os.path.basename(alphafold_path))
    if name not in already_computed_names:
        to_compute.append(alphafold_path)

# Report progress against the dataset itself: structures done / structures referenced.
n_already_computed = len(valid_alphafold_paths) - len(to_compute)
print(f"found {n_already_computed}/{len(valid_alphafold_paths)} already computed relaxed pdb")
print(f"to_compute: len: {len(to_compute)}, {to_compute[:3]}")

found 478/478 already computed relaxed pdb
to_compute: len: 0, []


In [7]:
# Distribute the pending structures round-robin over THREADS worker slots and generate one
# bash script per slot (bash_script_<SITE>_<slot>.sh). For each structure the script gets two
# lines: the relax command (stdout redirected to <name>.log) and a `mv` that renames
# score_relaxed.sc to <name>_score_relaxed.sc.
#
# NOTE(review): every slot >= THREADS_PARIS is labelled "SOUTH" and uses RELAX_BIN_SOUTH,
# which includes the THREADS_GCP share of THREADS -- confirm this is intended.
# NOTE(review): all scripts use the same `score_relaxed.sc` file name; if several scripts run
# concurrently in one working directory they may interfere -- confirm.
commands_per_script = {}
for i, alphafold_path in enumerate(to_compute):
    slot = i % THREADS
    runs_on_paris = slot < THREADS_PARIS
    script_suffix = f"{'PARIS' if runs_on_paris else 'SOUTH'}_{slot}"
    relax_bin = RELAX_BIN_PARIS if runs_on_paris else RELAX_BIN_SOUTH
    try:
        name, _ = os.path.splitext(alphafold_path.split("/")[-1])
        cmd = f"{relax_bin} -in:file:s .{alphafold_path} {ROSETTA_PARAMETERS} > {name}.log"
    except Exception as e:
        # Report the offending entry itself: `name` may not be bound when this fails.
        print(f"exception raised for {alphafold_path}: {e}")
        continue
    commands_per_script.setdefault(script_suffix, []).extend(
        [cmd, f"mv score_relaxed.sc {name}_score_relaxed.sc"]
    )

# Write each script exactly once, truncating any previous version, so re-running this cell
# does not append duplicate commands to scripts left over from an earlier run.
# I/O errors are deliberately not swallowed: a script that could not be written would
# otherwise mean silently missing jobs.
for script_suffix, commands in commands_per_script.items():
    with open(f"bash_script_{script_suffix}.sh", "w") as f:
        f.write("\n".join(commands) + "\n")


In [6]:
# Preview of the relax command generated for a single structure on the SOUTH machine.
# Note: this sample path has no leading "./", so the "." prefix produces ".data/..." here.
alphafold_path = "data/main_dataset_creation/3D_structures/alphafold/AF70.pdb"
" ".join([RELAX_BIN_SOUTH, "-in:file:s", "." + alphafold_path, ROSETTA_PARAMETERS, ">", "AF70.log"])


'/home/tom/Documents/Kaggle/novozymes-prediction/resources/rosetta_bin_linux_2021.16.61629_bundle/main/source/bin/relax.static.linuxgccrelease -in:file:s .data/main_dataset_creation/3D_structures/alphafold/AF70.pdb -relax:constrain_relax_to_start_coords -out:suffix _relaxed -out:no_nstruct_label -relax:ramp_constraints false -default_max_cycles 10000 > AF70.log'

In [8]:
# Launcher for the PARIS machine: one backgrounded `bash` call per PARIS worker script
# (slots 0 .. THREADS_PARIS - 1).
paris_launch_lines = [f"bash bash_script_PARIS_{slot}.sh & \n" for slot in range(THREADS_PARIS)]
with open("main_bash_script_PARIS.sh", "w+") as launcher:
    launcher.writelines(paris_launch_lines)

# Launcher for the SOUTH machine, currently disabled. SOUTH script numbers continue after
# the PARIS slots, hence the THREADS_PARIS offset.
# with open("main_bash_script_SOUTH.sh", "w+") as f:
#     for i in range(THREADS_SOUTH):
#         f.write(f"bash bash_script_SOUTH_{i+THREADS_PARIS}.sh & \n")
