In [1]:
# The variant-list format is a custom one (we tweaked gends.py from ThermoNet a bit).
# Each line needs to be (space-separated): wt_pdb_path pos mt_pdb_path
# i.e.: relaxed_wild_3D_path mutation_position+1 relaxed_mutated_3D_path

In [2]:
import pandas as pd
import numpy as np
import os
from glob import glob


In [6]:
# --- Input tables ------------------------------------------------------------
DATASET_INPUT = "../data/main_dataset_creation/outputs/all_v3/dataset_with_all_features.csv"
SUBMISSION_INPUT = "../data/main_dataset_creation/outputs/all_v3/submission_all_features_filled_nan.csv"

# --- Run switches ------------------------------------------------------------
# True: only the deletion rows (mutated_aa == '-') of the submission table are
# processed; False: training + submission rows, deletions excluded.
DELETION_ONLY = True
# True: write the per-sub-frame variant lists consumed by gends.py.
WRITE_VARIANT = True
# NOTE(review): not referenced by any cell visible here - confirm it is still used.
COMPUTE_NEW_MUTATIONS_LISTS = True

# --- Work splitting ----------------------------------------------------------
# Number of gends_PARIS_<i>.sh scripts generated, and upper bound on the number
# of chunks a single uniprot group is split into.
THREADS_PARIS = 6
# Uniprot groups with at least this many rows are split into several chunks.
MAX_SUBDF_SIZE = 100

# Columns used as the key when dropping duplicated mutations.
SUBSET_DUPLICATES_NO_PH = [
    "uniprot",
    "wild_aa",
    "mutation_position",
    "mutated_aa",
    "sequence",
]


In [7]:
# Build `df`, the table of mutations whose structures will be fed to gends.py.
#
# The frames are built with non-mutating method chains: the previous version
# called drop_duplicates(inplace=True) and assigned a column on frames obtained
# by boolean filtering, which is the pattern that triggers pandas'
# SettingWithCopyWarning.

# Columns kept for both the training and the submission tables.
_KEPT_COLUMNS = SUBSET_DUPLICATES_NO_PH + [
    "already_computed",
    "alphafold_path",
    "relaxed_wild_3D_path",
    "relaxed_mutated_3D_path",
]

if not DELETION_ONLY:
    # Training rows: need both an AlphaFold structure and a ddG value.
    df_train = (
        pd.read_csv(DATASET_INPUT)
        .dropna(subset=["alphafold_path", "ddG"])
        .drop_duplicates(subset=SUBSET_DUPLICATES_NO_PH)
        .assign(already_computed=False)
        [_KEPT_COLUMNS]
    )

    # Submission rows: same treatment, then the deletions ('-') are removed.
    df_test = (
        pd.read_csv(SUBMISSION_INPUT)
        .drop_duplicates(subset=SUBSET_DUPLICATES_NO_PH)
        .assign(already_computed=False)
        [_KEPT_COLUMNS]
    )
    df_test = df_test[~df_test.mutated_aa.eq('-')]

    # Original row labels are kept (no ignore_index), as before.
    df = pd.concat([df_train, df_test])
    print(len(df))
else:
    # We only look at the deletion mutations;
    # for now they are only in the submission dataset.
    deletion_df = pd.read_csv(SUBMISSION_INPUT)
    deletion_df = deletion_df[deletion_df.mutated_aa.eq('-')]
    df = deletion_df
    print(len(df))


77


In [8]:
# Split `df` into per-uniprot sub-frames (one variant list per sub-frame).
# Groups with at least MAX_SUBDF_SIZE rows are cut into several chunks, with
# at most THREADS_PARIS chunks per uniprot.

unique_uniprot = df.uniprot.unique()
all_uniprot_dfs = []

for uniprot in unique_uniprot:
    uniprot_df = df[df.uniprot.eq(uniprot)]
    n_rows = len(uniprot_df)

    # Small group: keep it whole.
    if n_rows < MAX_SUBDF_SIZE:
        all_uniprot_dfs.append(uniprot_df.copy())
        continue

    # Large group: n_rows >= MAX_SUBDF_SIZE here, so there is at least 1 split.
    number_splits = min(THREADS_PARIS, n_rows // MAX_SUBDF_SIZE)
    print(f"splitting {uniprot} in {number_splits} splits")
    for positions in np.array_split(np.arange(n_rows), number_splits):
        all_uniprot_dfs.append(uniprot_df.iloc[positions, :].copy())

print("len(all_uniprot_dfs)", len(all_uniprot_dfs))


def append_to_variants(row, k):
    """Append one variant line for `row` to the variant list of sub-frame `k`.

    The line written is ``"<wt_pdb_path> <pos> <mt_pdb_path>"`` where ``pos``
    is ``mutation_position + 1`` (presumably converting a 0-based position to
    the 1-based one expected by gends.py - TODO confirm).

    Args:
        row: a mapping-like row providing "uniprot", "relaxed_wild_3D_path",
            "mutation_position" and "relaxed_mutated_3D_path".
        k: index of the sub-frame, used in the output file name.

    Returns:
        `row`, unchanged (so the function can be used with DataFrame.apply).
    """
    # Deletion runs get their own files so they never mix with regular ones.
    prefix = "deletion_" if DELETION_ONLY else ""
    output_path = f"./gends_input/{prefix}{row['uniprot']}_{k}_variants.txt"

    # open() does not create missing parent directories; without this the
    # first write fails with FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    wt_pdb_path = row["relaxed_wild_3D_path"]
    pos = int(row["mutation_position"] + 1)
    mt_pdb_path = row["relaxed_mutated_3D_path"]

    # Append mode: every row of the same sub-frame goes to the same file.
    with open(output_path, "a") as f:
        f.write(f"{wt_pdb_path} {pos} {mt_pdb_path}\n")
    return row


# Write one variant line per row, sub-frame by sub-frame.
# NOTE: append_to_variants appends to its output file, so running this cell
# again without clearing ./gends_input adds the same lines a second time.
if WRITE_VARIANT:
    for k, subdf in enumerate(all_uniprot_dfs):
        for _, variant_row in subdf.iterrows():
            append_to_variants(variant_row, k)
    print("wrote variants")


len(all_uniprot_dfs) 1
./gends_input/deletion_AF70_0_variants.txt
./gends_input/deletion_AF70_0_variants.txt
./gends_input/deletion_AF70_0_variants.txt
./gends_input/deletion_AF70_0_variants.txt
./gends_input/deletion_AF70_0_variants.txt
./gends_input/deletion_AF70_0_variants.txt
./gends_input/deletion_AF70_0_variants.txt
./gends_input/deletion_AF70_0_variants.txt
./gends_input/deletion_AF70_0_variants.txt
./gends_input/deletion_AF70_0_variants.txt
./gends_input/deletion_AF70_0_variants.txt
./gends_input/deletion_AF70_0_variants.txt
./gends_input/deletion_AF70_0_variants.txt
./gends_input/deletion_AF70_0_variants.txt
./gends_input/deletion_AF70_0_variants.txt
./gends_input/deletion_AF70_0_variants.txt
./gends_input/deletion_AF70_0_variants.txt
./gends_input/deletion_AF70_0_variants.txt
./gends_input/deletion_AF70_0_variants.txt
./gends_input/deletion_AF70_0_variants.txt
./gends_input/deletion_AF70_0_variants.txt
./gends_input/deletion_AF70_0_variants.txt
./gends_input/deletion_AF70_0_v

The following scripts need to be run in the `thermonet` conda environment:

> conda activate thermonet

In [9]:
# Sanity check: is a given gends output file found by the glob pattern?
already_computed = glob("./gends_output/*_stacked_16_1*")

uniprot, i = "P03050", 29
output = f"./gends_output/{uniprot}_{i}_stacked_16_1_direct.npy"

# Last expression of the cell: displayed as True/False.
output in already_computed

True

In [10]:
# Generate the shell scripts that run gends.py on every sub-frame whose
# features have not been computed yet.
#
# Commands are distributed round-robin over THREADS_PARIS scripts
# (../gends_PARIS_<i>.sh), and ../main_gends_PARIS.sh launches them all in the
# background.
#
# Every per-thread script is overwritten (not appended to) on each run, so
# that re-running this cell neither duplicates commands nor leaves stale
# commands from a previous run in a script that main_gends_PARIS.sh executes.
# A thread with no work gets an empty script.

already_computed = glob(
    "./gends_output/*_stacked_16_1*")
# Set copy for O(1) membership tests; `already_computed` itself stays a list.
_computed_outputs = set(already_computed)

# Deletion runs read/write files carrying a "deletion_" prefix.
_prefix = "deletion_" if DELETION_ONLY else ""

cmd_list = []
for k, subdf in enumerate(all_uniprot_dfs):
    # All rows of a sub-frame share the same uniprot.
    uniprot = subdf.iloc[0, :]["uniprot"]
    _stem = f"{_prefix}{uniprot}_{k}"

    output_check = f"./gends_output/{_stem}_stacked_16_1_direct.npy"
    if output_check in _computed_outputs:
        continue

    input_arg = f"-i ./compute_mutated_structures/gends_input/{_stem}_variants.txt"
    output_arg = f"-o ./compute_mutated_structures/gends_output/{_stem}_stacked_16_1"
    cmd = " ".join([
        "/home/ml/novozymes-prediction/resources/ThermoNet/ThermoNet/gends.py",
        input_arg,
        output_arg,
        "-p ./ --boxsize 16 --voxelsize 1",
        "\n"
        ])
    cmd_list.append(cmd)

# Command number n goes to script n % THREADS_PARIS; each file is opened once.
for i in range(THREADS_PARIS):
    script_suffix = f"PARIS_{i}"
    with open(f"../gends_{script_suffix}.sh", "w") as f:
        f.writelines(cmd_list[i::THREADS_PARIS])

with open("../main_gends_PARIS.sh", "w") as f:
    for i in range(THREADS_PARIS):
        f.write(f"bash gends_PARIS_{i}.sh & \n")


In [None]:
# Reminder shown for deletion-only runs: per the messages below, one structure
# raised an error and was removed by hand.
if DELETION_ONLY:
    for message in (
        "error with:",
        "D201__unrelaxed_rank_1_model_3_relaxed",
        "for now removed it manually",
    ):
        print(message)


## Predict ddG using ThermoNet model
Because installing htmd in the `thermonet` conda environment caused headaches, we preferred to install TensorFlow/Keras in a distinct conda environment.

Make sure you are in that environment for the next step:
> conda activate tensorflow

In [45]:
# Generate ../thermonet_all.sh: one predict.py call per sub-frame, per ensemble
# member and per direction (direct / reversed).
#
# Changes compared with the previous version of this cell:
# - file names honour DELETION_ONLY with the same "deletion_" prefix as the
#   gends input/output files; previously a deletion run looked for
#   un-prefixed feature files.
#   NOTE(review): prediction files of deletion runs are now prefixed too -
#   confirm that whatever reads ./thermonet_predictions expects this.
# - the script is overwritten in a single write instead of being re-opened in
#   append mode for every command, so re-running the cell does not duplicate
#   commands.

# Number of ThermoNet ensemble members (ThermoNet_ensemble_member_1..10.h5).
_N_ENSEMBLE_MEMBERS = 10
_THERMONET_DIR = "/home/ml/novozymes-prediction/resources/ThermoNet"
_prefix = "deletion_" if DELETION_ONLY else ""

already_computed = glob(
    "./gends_output/*_stacked_16_1*")
# Set copy for O(1) membership tests; `already_computed` itself stays a list.
_computed_outputs = set(already_computed)

cmd_list = []
for k, subdf in enumerate(all_uniprot_dfs):
    # All rows of a sub-frame share the same uniprot.
    uniprot = subdf.iloc[0, :]["uniprot"]
    _stem = f"{_prefix}{uniprot}_{k}"
    output = f"./gends_output/{_stem}_stacked_16_1_direct.npy"

    # Only the "direct" feature file is checked, as before.
    if output not in _computed_outputs:
        print(f"{_stem} has no computed features yet")
        continue

    for j in range(_N_ENSEMBLE_MEMBERS):
        # Same order as before: direct then reversed for each ensemble member.
        for _direction in ("direct", "reversed"):
            cmd_list.append(" ".join([
                f"{_THERMONET_DIR}/ThermoNet/predict.py",
                f"-x ./compute_mutated_structures/gends_output/{_stem}_stacked_16_1_{_direction}.npy",
                f"-m {_THERMONET_DIR}/models/ThermoNet_ensemble_member_{j+1}.h5",
                f"-o ./compute_mutated_structures/thermonet_predictions/{_stem}_{_direction}_prediction_{j+1}.txt",
                "\n"
            ]))

with open("../thermonet_all.sh", "w") as f:
    f.writelines(cmd_list)


P02751_70 has no computed features yet
O60885_260 has no computed features yet
