# Compute Gends
This notebook prepares the inputs of `gends.py`, which are needed for the voxel representation computation.

In [36]:
# the input format is a custom one (we tweaked gends.py from thermonet a bit)
# each line needs to be, space-separated: wt_pdb_path pos mt_pdb_path
# i.e.: relaxed_wild_3D_path mutation_position+1 relaxed_mutated_3D_path

In [37]:
import pandas as pd
import numpy as np
import re
import os
from glob import glob


### Config

In [38]:
# --- Input tables -----------------------------------------------------------
# CSV read as the training dataset by the loading cell
DATASET_INPUT = "../data/main_dataset_creation/outputs/merged/dataset_with_3D_paths.csv"
# CSV read as the submission dataset (source of the test rows and of the deletions)
SUBMISSION_INPUT = "../data/main_dataset_creation/outputs/merged/dataset_with_features_path.csv"

# columns used as the key when dropping duplicated mutations
# NOTE(review): "NO_PH" presumably means the pH column is left out of the key - confirm
SUBSET_DUPLICATES_NO_PH = [
    "uniprot",
    "wild_aa",
    "mutation_position",
    "mutated_aa",
    "sequence",
]

# --- Parallelism / chunking -------------------------------------------------
# number of gends_PARIS_<i>.sh scripts; also caps the number of splits per uniprot
THREADS_PARIS = 6
# a uniprot with at least this many rows is split into several chunks
MAX_SUBDF_SIZE = 100

# --- Run bookkeeping --------------------------------------------------------
# chunk ids are NEW_RUN*1000+k when KEEP_ALREADY_COMPUTED is set
NEW_RUN = 1
# when set, variants listed in ./gends_input_backup/*_variants.txt are skipped
KEEP_ALREADY_COMPUTED = True
# when False, no variant file is written (the variant cell is a dry run)
WRITE_VARIANT = False

# --- Row selection ----------------------------------------------------------
# keep only training rows that have a ddG value
DDG_ONLY = False
# work only on the deletion rows ('-' as mutated_aa) of the submission dataset
DELETION_ONLY = False
# when set, the submission rows (deletions excluded) are added to the training rows
COMPUTE_DELETION = False

# --- Flags not referenced by the visible cells ------------------------------
COMPUTE_NEW_MUTATIONS_LISTS = True
REMOVE_NAN_IN_VARIANT = True
THERMONET_PREDICTION = False


### Load Dataset

In [39]:
# Load the rows whose structures will be turned into gends inputs.
#
# Every step returns a new frame: nothing is mutated in place on a filtered
# slice (no `inplace=True`, no column assignment on a slice), so pandas does
# not emit SettingWithCopyWarning and the cell gives the same result on re-run.

# columns kept for the variant-list generation
KEPT_COLUMNS = SUBSET_DUPLICATES_NO_PH + [
    "already_computed", "alphafold_path",
    "relaxed_wild_3D_path", "relaxed_mutated_3D_path"]

if not DELETION_ONLY:
    df_train = pd.read_csv(DATASET_INPUT)
    df_train = df_train[df_train["alphafold_path"].notna()]
    if DDG_ONLY:
        df_train = df_train[df_train["ddG"].notna()]
    df_train = (
        df_train
        # de-duplicate first, THEN drop rows without structures: the order
        # decides which duplicate survives, so it must not be swapped
        .drop_duplicates(subset=SUBSET_DUPLICATES_NO_PH)
        .assign(already_computed=False)
        # we remove rows without relaxed_wild_3D_path or relaxed_mutated_3D_path
        .dropna(subset=["relaxed_wild_3D_path", "relaxed_mutated_3D_path"])
        # keeping only columns of interest
        .loc[:, KEPT_COLUMNS]
    )

    if COMPUTE_DELETION:
        # NOTE(review): despite the flag name, this branch adds the submission
        # rows and removes the deletions ('-'). Rows with missing structure
        # paths are not filtered here, unlike df_train - confirm it is intended.
        df_test = (
            pd.read_csv(SUBMISSION_INPUT)
            .drop_duplicates(subset=SUBSET_DUPLICATES_NO_PH)
            .assign(already_computed=False)
            .loc[:, KEPT_COLUMNS]
        )
        # rm deletion
        df_test = df_test[~df_test["mutated_aa"].eq("-")]

        df = pd.concat([df_train, df_test])
    else:
        df = df_train
else:
    # we only look at the deletion mutation
    # for now they are only in submission dataset
    deletion_df = pd.read_csv(SUBMISSION_INPUT)
    deletion_df = deletion_df[deletion_df["mutated_aa"].eq("-")]
    df = deletion_df

print(len(df))


8551


In [40]:
# Collect the variant lines already listed in the backup folder, so that the
# same variant is not queued a second time.

already_computed_variants = []
if KEEP_ALREADY_COMPUTED:
    for backup_path in glob("./gends_input_backup/*_variants.txt"):
        with open(backup_path) as backup_file:
            # one variant per line; surrounding whitespace/newline is dropped
            already_computed_variants.extend(
                raw_line.strip() for raw_line in backup_file)

print(len(already_computed_variants))
print(already_computed_variants[:3])


7994
['./compute_mutated_structures/relaxed_pdb/P01593_relaxed/P01593_relaxed.pdb 98 ./compute_mutated_structures/relaxed_pdb/P01593_relaxed/P01593_relaxed_S98T_relaxed.pdb', './compute_mutated_structures/relaxed_pdb/P01593_relaxed/P01593_relaxed.pdb 15 ./compute_mutated_structures/relaxed_pdb/P01593_relaxed/P01593_relaxed_L15P_relaxed.pdb', './compute_mutated_structures/relaxed_pdb/P01593_relaxed/P01593_relaxed.pdb 29 ./compute_mutated_structures/relaxed_pdb/P01593_relaxed/P01593_relaxed_S29N_relaxed.pdb']


In [41]:
# Build the chunks from which the variant lists are created.
# The rows are grouped by uniprot, and big uniprots are cut into several
# chunks (in order to speed up the computation).

unique_uniprot = df["uniprot"].unique()
all_uniprot_dfs = []

for uniprot in unique_uniprot:
    uniprot_df = df[df["uniprot"].eq(uniprot)]
    n_rows = len(uniprot_df)

    if n_rows < MAX_SUBDF_SIZE:
        # small enough: the whole uniprot is a single chunk
        all_uniprot_dfs.append(uniprot_df.copy())
        continue

    # never more chunks than threads, and at least one chunk
    number_splits = min(THREADS_PARIS, n_rows//MAX_SUBDF_SIZE)
    print(f"splitting {uniprot} in {number_splits} splits")
    # split the row positions into near-equal contiguous runs
    for positions in np.array_split(range(n_rows), number_splits):
        all_uniprot_dfs.append(uniprot_df.iloc[positions, :].copy())

print("len(all_uniprot_dfs)", len(all_uniprot_dfs))


def append_to_variants(row, uniprot_variant_id, infos):
    """Append the gends input line of one mutation to its chunk's variant file.

    The line written is ``"<wt_pdb_path> <pos> <mt_pdb_path>"`` where ``pos`` is
    ``mutation_position + 1``. Nothing is written when the module-level flag
    ``WRITE_VARIANT`` is false, or when the line is already present in the
    module-level ``already_computed_variants``.

    Args:
        row: mapping/Series with the keys ``uniprot``, ``relaxed_wild_3D_path``,
            ``mutation_position`` and ``relaxed_mutated_3D_path``.
        uniprot_variant_id: chunk identifier used in the output file name.
        infos: dict whose ``"count"`` entry is incremented for each line written.

    Returns:
        ``row`` unchanged (so the function can be used with ``DataFrame.apply``).
    """
    # deletion runs get their own file-name prefix
    prefix = "deletion_" if DELETION_ONLY else ""
    output_path = f"./gends_input/{prefix}{row['uniprot']}_{uniprot_variant_id}_variants.txt"

    wt_pdb_path = row["relaxed_wild_3D_path"]
    pos = int(row["mutation_position"]+1)
    mt_pdb_path = row["relaxed_mutated_3D_path"]
    variant_line = f"{wt_pdb_path} {pos} {mt_pdb_path}"

    # Check the cheap flag first: the membership test scans the whole list of
    # already computed variants and is pointless when nothing will be written.
    if WRITE_VARIANT and variant_line not in already_computed_variants:
        # create ./gends_input on demand instead of failing on a fresh checkout
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, "a") as f:
            f.write(f"{variant_line}\n")
        infos["count"] += 1
    return row


# Walk over every chunk and hand each of its rows to append_to_variants.
infos = {"count": 0}
# chunk ids are shifted by NEW_RUN*1000 when the already computed runs are kept
id_offset = NEW_RUN*1000 if KEEP_ALREADY_COMPUTED else 0
for k, subdf in enumerate(all_uniprot_dfs):
    uniprot_variant_id = id_offset + k
    # row-wise apply, used only for its side effect (appending to the variant file)
    subdf.apply(append_to_variants, axis=1, args=(uniprot_variant_id, infos))
print(f"wrote {infos['count']} variants")


splitting P06654 in 6 splits
splitting P0ABQ4 in 1 splits
splitting P00648 in 1 splits
splitting P61626 in 1 splits
splitting P00644 in 6 splits
splitting P07751 in 1 splits
splitting P00720 in 3 splits
splitting D4Z2G1 in 1 splits
splitting P63096 in 3 splits
splitting Q93D82 in 1 splits
len(all_uniprot_dfs) 455
wrote 0 variants


The scripts written by the next cell need to be executed in the thermonet conda environment:

> conda activate thermonet

In [44]:
# make sure to not compute twice the same voxel representation
already_computed = glob(
    "./gends_output/*_stacked_16_1*")
print(f"{len(already_computed)=}")
# set copy of the list: O(1) membership test for every variant file below
already_computed_outputs = set(already_computed)

GENDS_SCRIPT = "/home/ml/novozymes-prediction/resources/ThermoNet/ThermoNet/gends.py"
VARIANTS_SUFFIX = "_variants.txt"

cmd_list = []
# sorted: glob returns files in arbitrary order; sorting makes the assignment
# of commands to threads reproducible from one run to the next
for variant_path in sorted(glob("./gends_input/*_variants.txt")):
    # "<name>_variants.txt" -> "<name>" (deletion files carry their
    # "deletion_" prefix inside <name>, so both kinds of run share one scheme)
    name = os.path.basename(variant_path)[:-len(VARIANTS_SUFFIX)]
    output_check = f"./gends_output/{name}_stacked_16_1_direct.npy"
    input_arg = f"-i ./compute_mutated_structures/gends_input/{name}_variants.txt"
    output_arg = f"-o ./compute_mutated_structures/gends_output/{name}_stacked_16_1"

    if output_check in already_computed_outputs:
        # the voxel representation has already been computed
        continue

    # we add the command to the list of commands to be executed
    cmd = " ".join([
        GENDS_SCRIPT,
        input_arg,
        output_arg,
        "-p ./ --boxsize 16 --voxelsize 1",
        "\n"
        ])
    cmd_list.append(cmd)


# split the commands in several bash files for multi-threading
# (command i goes to script i % THREADS_PARIS).
# Each script is rewritten from scratch, so re-running this cell neither
# duplicates the pending commands nor keeps commands of a previous run.
# All THREADS_PARIS scripts are written, possibly empty, because the main
# script below launches every one of them.
for i in range(THREADS_PARIS):
    script_suffix = f"PARIS_{i}"
    with open(f"../gends_{script_suffix}.sh", "w") as f:
        f.writelines(cmd_list[i::THREADS_PARIS])

# launcher: starts every per-thread script in the background
with open("../main_gends_PARIS.sh", "w") as f:
    for i in range(THREADS_PARIS):
        f.write(f"bash gends_PARIS_{i}.sh & \n")


len(already_computed)=1206


In [45]:
# Reminder printed for deletion runs: the entry named below has to be removed by hand.
if DELETION_ONLY:
    print(
        "error with:",
        "D201__unrelaxed_rank_1_model_3_relaxed",
        "for now remove it manually",
        sep="\n",
    )
