In [12]:
# format to specify is the one from thermonet
# needs to bee: P03958A 263 W F
# ie: uniprot+mutated_chain mutation_position wild_aa mutated_aa

In [13]:
import pandas as pd
import numpy as np
import os
from glob import glob

In [14]:
DATASET_INPUT = "../data/main_dataset_creation/outputs/merged/dataset_with_3D_paths.csv"
SUBMISSION_INPUT = "../data/processed_test.csv"
COMPUTE_NEW_MUTATIONS_LISTS = True
CLEAN_MUTATIONS = True
GET_ALREADY_COMPUTED = False

ROSETTA_BIN_DIR_PARIS = "/home/ml/novozymes-prediction/resources/rosetta/rosetta_bin_linux_2021.16.61629_bundle/main/source/bin/"
ROSETTA_BIN_DIR_GCP = "/home/jupyter/novozymes-prediction/resources/rosetta_bin_linux_2021.16.61629_bundle/main/source/bin/"
ROSETTA_BIN_DIR_SOUTH = "/home/tom/Documents/Kaggle/novozymes-prediction/resources/rosetta_bin_linux_2021.16.61629_bundle/main/source/bin/"
RELAX_BIN_PARIS = f"{ROSETTA_BIN_DIR_PARIS}relax.static.linuxgccrelease"
RELAX_BIN_GCP = f"{ROSETTA_BIN_DIR_GCP}relax.static.linuxgccrelease"
RELAX_BIN_SOUTH = f"{ROSETTA_BIN_DIR_SOUTH}relax.static.linuxgccrelease"
THREADS_GCP = 32 # 32 max
THREADS_PARIS = 0 # 12 max
THREADS_SOUTH = 6 # 8 max
THREADS = THREADS_PARIS+THREADS_SOUTH+THREADS_GCP
GCP_RATIO = THREADS_GCP/THREADS
PARIS_RATIO = THREADS_PARIS/THREADS
DDG_ONLY = False
SUBSET_DUPLICATES_NO_PH = ["uniprot", "wild_aa", "mutation_position",
                           "mutated_aa", "sequence"]
MAX_PROTEIN_SIZE = 600
MAX_CONSECUTIVE_TASKS = 100

COMPUTE_SUBMISSION = False

In [15]:
df_train = pd.read_csv(DATASET_INPUT)
df_train = df_train[~(df_train.alphafold_path.isna())]
if DDG_ONLY:
    df_train = df_train[~(df_train.ddG.isna())]
df_train.drop_duplicates(subset=SUBSET_DUPLICATES_NO_PH, inplace=True)
df_train["already_computed"] = False
df_train = df_train[SUBSET_DUPLICATES_NO_PH +
                    ["already_computed", "alphafold_path", "mutated_chain"]]
# limit to protein with a sequence not too long
df_train["length"] = df_train.sequence.str.len()
df_train = df_train[df_train["length"].le(MAX_PROTEIN_SIZE)]

if COMPUTE_SUBMISSION:
    df_test = pd.read_csv(SUBMISSION_INPUT)
    df_test.drop_duplicates(subset=SUBSET_DUPLICATES_NO_PH, inplace=True)
    df_test["already_computed"] = False
    df_test = df_test[SUBSET_DUPLICATES_NO_PH +
                    ["already_computed", "alphafold_path", "mutated_chain"]]
    #rm deletion
    df_test = df_test[~(df_test.mutated_aa.eq('-'))]   
    df = pd.concat([df_train, df_test])
else:
    df = df_train
print(len(df))

8165


In [16]:
print(len(df.uniprot.unique()))
print(len(df.alphafold_path.unique()))

406
404


In [17]:
if CLEAN_MUTATIONS:
    for path in glob("./mutations/*"):
        os.remove(path)


In [18]:
crashed_backup = ['Q15562_relaxed', 'Q04206_relaxed', 'P0306_relaxed', 'P38398_relaxed', 'P21695_relaxed',
                  'P61769_relaxed', 'P01051_relaxed', 'P0CG63_relaxed', 'P25963_relaxed',
                  'A8T655_relaxed', 'P01053_relaxed', 'P06876_relaxed', 'P04080_relaxed', 'P01308_relaxed',
                  'Q8NBP7_relaxed', 'Q03026_relaxed', 'P03958_relaxed', 'P07320_relaxed']
crashed = glob("relaxed_pdb/**/*CRASH*")
crashed = [n.split('/')[-2] for n in crashed]
crashed.pop(crashed.index("AF70_relaxed"))
print(len(crashed))
print(crashed)


23
['Q15562_relaxed', 'Q04206_relaxed', 'P36075_relaxed', 'P0306_relaxed', 'P38398_relaxed', 'P21695_relaxed', 'O25949_relaxed', 'P61769_relaxed', 'P01051_relaxed', 'P0CG63_relaxed', 'P25963_relaxed', 'A8T655_relaxed', 'P01053_relaxed', 'P0C0Y9_relaxed', 'P06876_relaxed', 'P01837_relaxed', 'P00766_relaxed', 'P04080_relaxed', 'P01308_relaxed', 'Q8NBP7_relaxed', 'Q03026_relaxed', 'P03958_relaxed', 'P07320_relaxed']


In [19]:
if COMPUTE_NEW_MUTATIONS_LISTS:
    already_computed = glob("relaxed_pdb/**/*_relaxed*_relaxed.pdb")
    
    already_computed = [n.split('/')[-1] for n in already_computed]
    total_to_compute = 0

    for alphafold_path in df.alphafold_path.unique():
        alphafold_name = os.path.splitext(
            alphafold_path.split('/')[-1])[0]
        if f"{alphafold_name}_relaxed" in crashed:
            continue
            
        related_df = df[df.alphafold_path.eq(alphafold_path)]

        def rm_already_computed(row):
            alphafold_name = os.path.splitext(
                row["alphafold_path"].split('/')[-1])[0]
            output_name = f"{alphafold_name}_relaxed_{row['wild_aa']}{int(row['mutation_position']+1)}{row['mutated_aa']}_relaxed.pdb"
            if output_name in already_computed:
                row["already_computed"] = True
            return row

        related_df = related_df.apply(rm_already_computed, axis=1)
        related_df = related_df[~(related_df["already_computed"])]

        if len(related_df) > MAX_CONSECUTIVE_TASKS:
            # in the case of a protein with a lot of related mutation we split it into THREADS sub_subdf
            number_splits = min(THREADS, len(related_df)//MAX_CONSECUTIVE_TASKS)
            subdf_list = [related_df.iloc[index, :]
                          for index in np.array_split(range(len(related_df)), number_splits)]
            print(f"divided {alphafold_name} related_df into {len(subdf_list)} subdf")
        else:
            subdf_list = [related_df]
        
        for i, subdf in enumerate(subdf_list):
            # create the mutation list from the sub df that contains the alphafold_path
            for _, row in subdf.iterrows():
                # we add 1 to the position as we index starting at 0 and rosetta at 1
                with open(f"mutations/{alphafold_name}_{i}.txt", "a+") as f:
                    line = f"{alphafold_name}{row['mutated_chain']} {int(row['mutation_position']+1)} {row['wild_aa']} {row['mutated_aa']}"
                    f.write(line)
                    f.write("\n")
                total_to_compute += 1
    print(total_to_compute)


divided P06654 related_df into 1 subdf
divided P06241 related_df into 1 subdf
divided P00720 related_df into 1 subdf
divided D4Z2G1 related_df into 1 subdf
divided P63096 related_df into 1 subdf
1149


In [20]:
# split mutations lists between CPUs
mutations_lists = glob("mutations/*.txt")
GCP_index = int(len(mutations_lists)*GCP_RATIO)
PARIS_index = GCP_index+int(len(mutations_lists)*PARIS_RATIO)
mutations_GCP = mutations_lists[:GCP_index]
mutations_PARIS = mutations_lists[GCP_index:PARIS_index]
mutations_SOUTH = mutations_lists[PARIS_index:]
print(f"{len(mutations_lists)}, {GCP_index=}, {PARIS_index=}")


64, GCP_index=53, PARIS_index=53


In [21]:
# create bash scripts
for i, mutations_list in enumerate(mutations_GCP):
    script_suffix = f"GCP_{i % THREADS_GCP}"
    name, _ = os.path.splitext(mutations_list.split("/")[-1])
    with open(f"mutations_{script_suffix}.sh", "a+") as f:
        cmd = f"python3 rosetta_relax.py --rosetta-bin {RELAX_BIN_GCP} -l {mutations_list} --base-dir ./relaxed_pdb/ > {name}.log"
        f.write(cmd)
        f.write("\n")

for i, mutations_list in enumerate(mutations_PARIS):
    script_suffix = f"PARIS_{i % THREADS_PARIS}"
    name, _ = os.path.splitext(mutations_list.split("/")[-1])
    with open(f"mutations_{script_suffix}.sh", "a+") as f:
        cmd = f"python3 rosetta_relax.py --rosetta-bin {RELAX_BIN_PARIS} -l {mutations_list} --base-dir ./relaxed_pdb/ > {name}.log"
        f.write(cmd)
        f.write("\n")

for i, mutations_list in enumerate(mutations_SOUTH):
    name, _ = os.path.splitext(mutations_list.split("/")[-1])
    script_suffix = f"SOUTH_{i % THREADS_SOUTH}"
    with open(f"mutations_{script_suffix}.sh", "a+") as f:
        cmd = f"python3 rosetta_relax.py --rosetta-bin {RELAX_BIN_SOUTH} -l {mutations_list} --base-dir ./relaxed_pdb/ > {name}.log"
        f.write(cmd)
        f.write("\n")


In [22]:
with open("main_mutations_GCP.sh", "w+") as f:
    for i in range(THREADS_GCP):
        f.write(f"bash mutations_GCP_{i}.sh & \n")

with open("main_mutations_PARIS.sh", "w+") as f:
    for i in range(THREADS_PARIS):
        f.write(f"bash mutations_PARIS_{i}.sh & \n")

with open("main_mutations_SOUTH.sh", "w+") as f:
    for i in range(THREADS_SOUTH):
        f.write(f"bash mutations_SOUTH_{i}.sh & \n")
