In [11]:
# The mutation-list format to produce is the one used by ThermoNet.
# Each line needs to be: P03958A 263 W F
# i.e.: uniprot+mutated_chain mutation_position wild_aa mutated_aa

In [12]:
import pandas as pd
import os
from glob import glob

In [13]:
# --- Input data ---------------------------------------------------------
# CSV read into `df`; it must provide at least the columns listed in
# SUBSET_DUPLICATES_NO_PH plus `alphafold_path`, `ddG` and `mutated_chain`.
DATASET_INPUT = "../data/main_dataset_creation/outputs/all_v2_2/dataset_with_alphafold_paths.csv"

# Columns that identify one mutation when dropping duplicated rows.
SUBSET_DUPLICATES_NO_PH = [
    "uniprot",
    "wild_aa",
    "mutation_position",
    "mutated_aa",
    "sequence",
]

# --- Run switches -------------------------------------------------------
CLEAN_MUTATIONS = True              # delete everything in ./mutations/ first
COMPUTE_NEW_MUTATIONS_LISTS = True  # (re)write the mutations/<name>.txt lists
GET_ALREADY_COMPUTED = False        # print counts of existing results / pending lines

# --- Rosetta binaries for the two setups (PARIS and SOUTH) --------------
# Both directories end with "/", so the binary name is appended directly.
ROSETTA_BIN_DIR_PARIS = "/home/ml/novozymes-prediction/resources/rosetta/rosetta_bin_linux_2021.16.61629_bundle/main/source/bin/"
ROSETTA_BIN_DIR_SOUTH = "/home/tom/Documents/Kaggle/novozymes-prediction/resources/rosetta_bin_linux_2021.16.61629_bundle/main/source/bin/"
RELAX_BIN_PARIS = ROSETTA_BIN_DIR_PARIS + "relax.static.linuxgccrelease"
RELAX_BIN_SOUTH = ROSETTA_BIN_DIR_SOUTH + "relax.static.linuxgccrelease"

# --- Work distribution --------------------------------------------------
# Number of worker scripts generated for each setup.
THREADS_PARIS = 12
THREADS_SOUTH = 8
THREADS = THREADS_PARIS + THREADS_SOUTH
# Fraction of the mutation-list files assigned to PARIS; the rest go to SOUTH.
PARIS_RATIO = 0.5



In [14]:
# Load the dataset, keep only the rows that have both a structure path and a
# ddG value, then keep the first row of every duplicated mutation.
df = pd.read_csv(DATASET_INPUT)
usable_rows = df.alphafold_path.notna() & df.ddG.notna()
df = df.loc[usable_rows].drop_duplicates(subset=SUBSET_DUPLICATES_NO_PH)
print(len(df))

5704


In [15]:
# Number of distinct proteins, then number of distinct structure files.
# dropna=False makes nunique() count exactly what len(unique()) counts.
print(df["uniprot"].nunique(dropna=False))
print(df["alphafold_path"].nunique(dropna=False))

297
294


In [16]:
# When cleaning is requested, delete every entry found in ./mutations/ so the
# lists written later (in append mode) start from scratch.
stale_entries = glob("./mutations/*") if CLEAN_MUTATIONS else []
for stale_entry in stale_entries:
    os.remove(stale_entry)


In [17]:
# Find files with "CRASH" in their name one folder below relaxed_pdb/
# (glob is called without recursive=True, so "**" matches a single level)
# and remember the name of the folder each one sits in.
crash_markers = glob("relaxed_pdb/**/*CRASH*")
crashed = [marker.rsplit('/', 2)[-2] for marker in crash_markers]
print(len(crashed))
print(crashed[:5])


17
['Q04206_relaxed', 'P0306_relaxed', 'P38398_relaxed', 'P21695_relaxed', 'P61769_relaxed']


In [18]:
if COMPUTE_NEW_MUTATIONS_LISTS:
    # File names (without folders) of the mutant structures already on disk.
    already_computed = [n.split('/')[-1]
                        for n in glob("relaxed_pdb/**/*_relaxed*_relaxed.pdb")]
    # Sets make the membership tests in the loops below O(1) instead of
    # scanning a list once per row.
    computed_names = set(already_computed)
    crashed_names = set(crashed)
    total_to_compute = 0

    # open() does not create missing parent folders, so make sure the output
    # folder exists before the first list is written.
    os.makedirs("mutations", exist_ok=True)

    # One group per structure file; sort=False keeps the groups in order of
    # first appearance, i.e. the order df.alphafold_path.unique() would give.
    for alphafold_path, subdf in df.groupby("alphafold_path", sort=False):
        alphafold_name = os.path.splitext(
            alphafold_path.split('/')[-1])[0]
        # Skip structures whose relaxed folder contains a CRASH file.
        if f"{alphafold_name}_relaxed" in crashed_names:
            continue

        # Collect the lines for this structure, then write them in one go
        # instead of re-opening the file for every row.
        pending_lines = []
        for _, row in subdf.iterrows():
            # we add 1 to the position as we index starting at 0 and rosetta at 1
            position = int(row['mutation_position'] + 1)
            output_name = f"{alphafold_name}_relaxed_{row['wild_aa']}{position}{row['mutated_aa']}_relaxed.pdb"
            if output_name not in computed_names:
                pending_lines.append(
                    f"{alphafold_name}{row['mutated_chain']} {position} {row['wild_aa']} {row['mutated_aa']}\n")

        if pending_lines:
            # Append mode is kept on purpose: two structure paths that share a
            # base name still end up in the same list file.
            with open(f"mutations/{alphafold_name}.txt", "a+") as f:
                f.writelines(pending_lines)
            total_to_compute += len(pending_lines)
    print(total_to_compute)


1445


In [19]:
# split mutations lists between CPUs
# glob() returns files in a file-system dependent order; sorting makes the
# PARIS/SOUTH partition identical on every run and on every machine.
mutations_lists = sorted(glob("mutations/*.txt"))
# Index of the first list handed to SOUTH (everything before it goes to PARIS).
paris_count = int(len(mutations_lists) * PARIS_RATIO)
print(f"{len(mutations_lists)}, Paris to: {paris_count}")
mutations_PARIS = mutations_lists[:paris_count]
mutations_SOUTH = mutations_lists[paris_count:]

# Sanity check: show on which side one specific list ended up.
print("mutations/P06654.txt" in mutations_PARIS)
print("mutations/P06654.txt" in mutations_SOUTH)


26, Paris to: 13
False
True


In [20]:
if GET_ALREADY_COMPUTED:
    # Mutant structures already present below relaxed_pdb/.
    already_computed = glob("relaxed_pdb/**/*_relaxed*_relaxed.pdb")
    print(len(already_computed))
    print(already_computed[:5])

    # Total number of lines (one mutation per line) in the PARIS lists only.
    total_to_compute = 0
    for mutations_list in mutations_PARIS:
        # The context manager closes each list file after counting its lines.
        with open(mutations_list) as list_file:
            total_to_compute += sum(1 for _ in list_file)
    print(total_to_compute)


In [21]:
# create bash scripts

def _write_worker_scripts(site, mutation_lists, n_workers, relax_bin):
    """Write one bash script per worker, spreading the lists round-robin.

    Args:
        site: label used in the script file names (``mutations_<site>_<k>.sh``).
        mutation_lists: paths of the mutation-list files to process.
        n_workers: number of scripts to write; list ``i`` goes to script
            ``i % n_workers``.
        relax_bin: path of the Rosetta relax binary passed to
            ``rosetta_relax.py``.

    Every script is opened in write mode, so re-running the cell replaces the
    previous content instead of appending duplicated commands. A script is
    written for every worker, even when it receives no list, so the launcher
    never calls a missing or outdated file.
    """
    commands_per_worker = [[] for _ in range(n_workers)]
    for i, mutations_list in enumerate(mutation_lists):
        # The log file is named after the list file, without its extension.
        name, _ = os.path.splitext(mutations_list.split("/")[-1])
        commands_per_worker[i % n_workers].append(
            f"python3 rosetta_relax.py --rosetta-bin {relax_bin} -l {mutations_list} --base-dir ./relaxed_pdb/ > {name}.log\n")

    for worker, commands in enumerate(commands_per_worker):
        with open(f"mutations_{site}_{worker}.sh", "w") as f:
            f.writelines(commands)


_write_worker_scripts("PARIS", mutations_PARIS, THREADS_PARIS, RELAX_BIN_PARIS)
_write_worker_scripts("SOUTH", mutations_SOUTH, THREADS_SOUTH, RELAX_BIN_SOUTH)


In [22]:
# Launcher scripts: one line per worker script, each started in the
# background with "&".
with open("main_mutations_PARIS.sh", "w+") as launcher:
    launcher.writelines(
        f"bash mutations_PARIS_{worker}.sh & \n"
        for worker in range(THREADS_PARIS)
    )

with open("main_mutations_SOUTH.sh", "w+") as launcher:
    launcher.writelines(
        f"bash mutations_SOUTH_{worker}.sh & \n"
        for worker in range(THREADS_SOUTH)
    )
