In [1]:
import os
import glob
import tqdm
import pandas as pd
import math

In [6]:
ROSETTA_BIN_DIR_PARIS = "/home/ml/novozymes-prediction/resources/rosetta/rosetta_bin_linux_2021.16.61629_bundle/main/source/bin/"
ROSETTA_BIN_DIR_SOUTH = "/home/tom/Documents/Kaggle/novozymes-prediction/resources/rosetta_bin_linux_2021.16.61629_bundle/main/source/bin/"
RELAX_BIN_PARIS = f"{ROSETTA_BIN_DIR_PARIS}relax.static.linuxgccrelease"
RELAX_BIN_SOUTH = f"{ROSETTA_BIN_DIR_SOUTH}relax.static.linuxgccrelease"
THREADS_PARIS = 10
THREADS_SOUTH = 0
THREADS_GCP = 0
THREADS = THREADS_PARIS+THREADS_SOUTH+THREADS_GCP

MAX_CYCLES = 10000
# -default_max_cycles {MAX_CYCLES}
ROSETTA_PARAMETERS = f"-relax:constrain_relax_to_start_coords -out:suffix _relaxed -out:no_nstruct_label -relax:ramp_constraints false -default_max_cycles {MAX_CYCLES}"

DATASET_PATH = "../data/main_dataset_creation/outputs/merged/dataset_with_3D_paths.csv"
COMPUTE_SUBMISSION_DELETION = False

In [7]:
df = pd.read_csv(DATASET_PATH)
alphafold_paths = df.alphafold_path.unique().tolist()

already_computed_relaxed = [os.path.splitext(x.split('/')[-1])[0].split('_')[0] for x in glob.glob("./relaxed_pdb/**/*.pdb")]
to_compute = []
for alphafold_path in alphafold_paths:
    if (type(alphafold_path) == type(0.0)) and (math.isnan(alphafold_path)):
        continue
    name, _ = os.path.splitext(alphafold_path.split("/")[-1])
    if name not in already_computed_relaxed:
        to_compute.append(alphafold_path)

# alphafold_paths contains one "nan" element
print(f"found {len(already_computed_relaxed)}/{len(alphafold_paths)-1} already computed relaxed pdb")
print(f"to_compute: len: {len(to_compute)}, {to_compute[:3]}")

found 9315/467 already computed relaxed pdb
to_compute: len: 10, ['./data/main_dataset_creation/3D_structures/alphafold/P00766.pdb', './data/main_dataset_creation/3D_structures/alphafold/Q9QZH4.pdb', './data/main_dataset_creation/3D_structures/alphafold/A0A502FPX5.pdb']


In [8]:
for i, alphafold_path in enumerate(to_compute):
    script_suffix = f"{'PARIS' if (i % THREADS)<THREADS_PARIS else 'SOUTH'}_{i % THREADS}"
    with open(f"bash_script_{script_suffix}.sh", "a+") as f:
        try:
            name, _ = os.path.splitext(alphafold_path.split("/")[-1])
            if (i % THREADS) < THREADS_PARIS:
                cmd = f"{RELAX_BIN_PARIS} -in:file:s .{alphafold_path} {ROSETTA_PARAMETERS} > {name}.log"
            else:
                cmd = f"{RELAX_BIN_SOUTH} -in:file:s .{alphafold_path} {ROSETTA_PARAMETERS} > {name}.log"
                
            f.write(cmd)
            f.write("\n")
            f.write(f"mv score_relaxed.sc {name}_score_relaxed.sc")
            f.write("\n")
        except Exception as e:
            print(f"exception raised for {name}: {e}")


In [9]:
if THREADS_PARIS>0:
    with open("main_bash_script_PARIS.sh", "w+") as f:
        for i in range(THREADS_PARIS):
            f.write(f"bash bash_script_PARIS_{i}.sh & \n") 

if THREADS_SOUTH>0:
    with open("main_bash_script_SOUTH.sh", "w+") as f:
        for i in range(THREADS_SOUTH):
            f.write(f"bash bash_script_SOUTH_{i+THREADS_PARIS}.sh & \n")


In [9]:
if COMPUTE_SUBMISSION_DELETION:
    df = pd.read_csv("../data/processed_test.csv")
    # deletion is _, so we look for path such as A23__unrelaxed_rank_1.pdb
    alphafold_paths = glob.glob(
        "../data/main_dataset_creation/3D_structures/all_test_alphafold/*__*.pdb")

    already_computed_relaxed = [os.path.splitext(
        x.split('/')[-1])[0].split('_')[0] for x in glob.glob("./relaxed_pdb/**/AF70_*.pdb")]
    to_compute = []
    for alphafold_path in alphafold_paths:
        name, _ = os.path.splitext(alphafold_path.split("/")[-1])
        name = name.split('_')[0]
        name = "AF70_alphafold_"+name
        if name not in already_computed_relaxed:
            to_compute.append(alphafold_path)

    # alphafold_paths contains one "nan" element
    print(
        f"found {len(already_computed_relaxed)}/{len(alphafold_paths)-1} already computed relaxed pdb")
    print(f"to_compute: len: {len(to_compute)}, {to_compute[:3]}")

    for i, alphafold_path in enumerate(to_compute):
        script_suffix = f"{'PARIS' if (i % THREADS)<THREADS_PARIS else 'SOUTH'}_{i % THREADS}"
        with open(f"bash_script_{script_suffix}.sh", "a+") as f:
            try:
                name, _ = os.path.splitext(alphafold_path.split("/")[-1])
                name = name.split('_')[0]
                name = "AF70_alphafold_"+name
                if (i % THREADS) < THREADS_PARIS:
                    cmd = " ".join([f"{RELAX_BIN_PARIS}",
                                    f"-in:file:s {alphafold_path}",
                                    "-out:path ./relaxed_pdb/AF70_alphafold/",
                                    f"{ROSETTA_PARAMETERS} > {name}.log"])
                else:
                    cmd = " ".join([f"{RELAX_BIN_SOUTH}",
                                    f"-in:file:s {alphafold_path}",
                                    "-out:path ./relaxed_pdb/AF70_alphafold/",
                                    f"{ROSETTA_PARAMETERS} > {name}.log"])
                f.write(cmd)
                f.write("\n")
                f.write(f"mv score_relaxed.sc {name}_score_relaxed.sc")
                f.write("\n")
            except Exception as e:
                print(f"exception raised for {name}: {e}")


found 2337/76 already computed relaxed pdb
to_compute: len: 77, ['../data/main_dataset_creation/3D_structures/all_test_alphafold/Q38__unrelaxed_rank_1_model_3.pdb', '../data/main_dataset_creation/3D_structures/all_test_alphafold/P28__unrelaxed_rank_1_model_3.pdb', '../data/main_dataset_creation/3D_structures/all_test_alphafold/K185__unrelaxed_rank_1_model_3.pdb']
