In [1]:
# format to specify is the one from thermonet
# needs to bee: P03958A 263 W F
# ie: uniprot+mutated_chain mutation_position wild_aa mutated_aa

In [2]:
import pandas as pd
import os
from glob import glob

In [3]:
DATASET_INPUT = "../data/main_dataset_creation/outputs/all_v2_1/dataset_with_alphafold_paths.csv"
COMPUTE_MUTATIONS_LISTS = False
GET_ALREADY_COMPUTED = True

ROSETTA_BIN_DIR_PARIS = "/home/ml/novozymes-prediction/resources/rosetta/rosetta_bin_linux_2021.16.61629_bundle/main/source/bin/"
ROSETTA_BIN_DIR_SOUTH = "/home/tom/Documents/Kaggle/novozymes-prediction/resources/rosetta_bin_linux_2021.16.61629_bundle/main/source/bin/"
RELAX_BIN_PARIS = f"{ROSETTA_BIN_DIR_PARIS}relax.static.linuxgccrelease"
RELAX_BIN_SOUTH = f"{ROSETTA_BIN_DIR_SOUTH}relax.static.linuxgccrelease"
THREADS_PARIS = 7
THREADS_SOUTH = 6
PARIS_RATIO = 0.75
THREADS = THREADS_PARIS+THREADS_SOUTH


In [4]:
df = pd.read_csv(DATASET_INPUT)
df = df[~(df.alphafold_path.isna())]
relaxed_pdbs = glob("relaxed_pdb/**/*_relaxed*_relaxed.pdb")
print(f"found {len(relaxed_pdbs)} relaxed pdbs")

found 2275 relaxed pdbs


In [5]:
print(len(df.uniprot.unique()))
print(len(df.alphafold_path.unique()))

481
478


In [30]:
if COMPUTE_MUTATIONS_LISTS:
    for relaxed_pdb in relaxed_pdbs:
        alphafold_name = os.path.splitext(
            relaxed_pdb.split('/')[-1])[0].split('_')[0]
        alphafold_path = f"./data/main_dataset_creation/3D_structures/alphafold/{alphafold_name}.pdb"
        subdf = df[df.alphafold_path.eq(alphafold_path)]

        # create the mutation list from the sub df that contains the alphafold_path
        with open(f"mutations/{alphafold_name}.txt", "w+") as f:
            for _, row in subdf.iterrows():
                # we add 1 to the position as we index starting at 0 and rosetta at 1
                line = f"{alphafold_name}{row['mutated_chain']} {int(row['mutation_position']+1)} {row['wild_aa']} {row['mutated_aa']}"
                f.write(line)
                f.write("\n")


In [6]:
# split mutations lists between CPUs
mutations_lists = glob("mutations/*.txt")
print(f"{len(mutations_lists)}, Paris to: {int(len(mutations_lists)*PARIS_RATIO)}")
mutations_PARIS = mutations_lists[:int(len(mutations_lists)*PARIS_RATIO)]
mutations_SOUTH = mutations_lists[int(len(mutations_lists)*PARIS_RATIO):]


451, Paris to: 338


In [10]:
if GET_ALREADY_COMPUTED:
    already_computed = glob("relaxed_pdb/**/*_relaxed*_relaxed.pdb")
    print(len(already_computed))
    print(already_computed[:5])
    total_to_compute = 0
    for mutations_list in mutations_SOUTH[2:]:
        name, _ = os.path.splitext(mutations_list.split("/")[-1])
        num_lines = sum(1 for line in open(mutations_list))
        computed = glob(f'relaxed_pdb/**/{name}_relaxed*_relaxed.pdb')
        print(f"for {name} num_lines: {num_lines} computed: {len(computed)}")
        print(computed)
        total_to_compute += num_lines
        break
    print(total_to_compute)


917
['relaxed_pdb/P00592_relaxed/P00592_relaxed_N89D_relaxed.pdb', 'relaxed_pdb/P00592_relaxed/P00592_relaxed_H70Q_relaxed.pdb', 'relaxed_pdb/P00592_relaxed/P00592_relaxed_H70N_relaxed.pdb', 'relaxed_pdb/P00592_relaxed/P00592_relaxed_H70K_relaxed.pdb', 'relaxed_pdb/P24297_relaxed/P24297_relaxed_L33V_relaxed.pdb']
for P14679 num_lines: 4 computed: 0
[]
4


In [31]:
# create bash scripts
for i, mutations_list in enumerate(mutations_PARIS):
    script_suffix = f"PARIS_{i % THREADS_PARIS}"
    name, _ = os.path.splitext(mutations_list.split("/")[-1])
    with open(f"mutations_{script_suffix}.sh", "a+") as f:
        cmd = f"python3 rosetta_relax.py --rosetta-bin {RELAX_BIN_PARIS} -l {mutations_list} --base-dir ./relaxed_pdb/ > {name}.log"
        f.write(cmd)
        f.write("\n")

for i, mutations_list in enumerate(mutations_SOUTH):
    name, _ = os.path.splitext(mutations_list.split("/")[-1])
    script_suffix = f"SOUTH_{i % THREADS_SOUTH}"
    with open(f"mutations_{script_suffix}.sh", "a+") as f:
        cmd = f"python3 rosetta_relax.py --rosetta-bin {RELAX_BIN_SOUTH} -l {mutations_list} --base-dir ./relaxed_pdb/ > {name}.log"
        f.write(cmd)
        f.write("\n")


451, Paris to: 338


In [32]:
with open("main_mutations_PARIS.sh", "w+") as f:
    for i in range(THREADS_PARIS):
        f.write(f"bash mutations_PARIS_{i}.sh & \n")

with open("main_mutations_SOUTH.sh", "w+") as f:
    for i in range(THREADS_SOUTH):
        f.write(f"bash mutations_SOUTH_{i}.sh & \n")
