In [15]:
# the format to specify is a custom one (we tweaked gends.py from thermonet a bit)
# it needs to be: wt_pdb_path, pos, mt_pdb_path
# i.e.: relaxed_wild_3D_path mutation_position+1 relaxed_mutated_3D_path

In [16]:
import pandas as pd
import numpy as np
import os
from glob import glob


In [17]:
# Input tables: the training dataset and the submission (test) set.
DATASET_INPUT = "../data/main_dataset_creation/outputs/all_v3/dataset_with_all_features.csv"
SUBMISSION_INPUT = "../data/main_dataset_creation/outputs/all_v3/submission_all_features_filled_nan.csv"

# NOTE(review): not read by any of the cells visible here -- confirm it is still needed.
COMPUTE_NEW_MUTATIONS_LISTS = True

# Number of gends shell scripts generated for parallel runs; also caps how many
# chunks a single protein is split into.
THREADS_PARIS = 6

# Proteins with at least this many rows are split into several chunks.
MAX_SUBDF_SIZE = 100

# Columns identifying one mutation regardless of pH; used to drop duplicates.
SUBSET_DUPLICATES_NO_PH = [
    "uniprot",
    "wild_aa",
    "mutation_position",
    "mutated_aa",
    "sequence",
]


In [18]:
# Columns carried forward: the mutation identity plus the structure paths.
kept_columns = SUBSET_DUPLICATES_NO_PH + [
    "already_computed", "alphafold_path", "relaxed_wild_3D_path", "relaxed_mutated_3D_path"
]

# Training set: keep only rows that have both an alphafold structure and a ddG value,
# then collapse duplicated mutations.
train_raw = pd.read_csv(DATASET_INPUT)
has_structure_and_ddg = train_raw.alphafold_path.notna() & train_raw.ddG.notna()
df_train = (
    train_raw[has_structure_and_ddg]
    .drop_duplicates(subset=SUBSET_DUPLICATES_NO_PH)
    .assign(already_computed=False)
    .loc[:, kept_columns]
)

# Submission set: same de-duplication and column selection, no ddG/structure filter.
df_test = (
    pd.read_csv(SUBMISSION_INPUT)
    .drop_duplicates(subset=SUBSET_DUPLICATES_NO_PH)
    .assign(already_computed=False)
    .loc[:, kept_columns]
)
# rm deletion
df_test = df_test[df_test.mutated_aa.ne('-')]


df = pd.concat([df_train, df_test])
print(len(df))


7921


In [19]:
# Build the work units that will each become one variants file in gends_input:
# one frame per uniprot, with large proteins cut into several chunks.
all_uniprot_dfs = []
unique_uniprot = df.uniprot.unique()

for uniprot in unique_uniprot:
    uniprot_df = df[df["uniprot"] == uniprot]
    n_rows = len(uniprot_df)

    # Small proteins are kept whole.
    if n_rows < MAX_SUBDF_SIZE:
        all_uniprot_dfs.append(uniprot_df.copy())
        continue

    # Large proteins: at most THREADS_PARIS chunks of roughly equal size.
    number_splits = min(THREADS_PARIS, n_rows // MAX_SUBDF_SIZE)
    print(f"splitting {uniprot} in {number_splits} splits")
    for positions in np.array_split(np.arange(n_rows), number_splits):
        all_uniprot_dfs.append(uniprot_df.iloc[positions].copy())

print("len(all_uniprot_dfs)", len(all_uniprot_dfs))


def append_to_variants(row, k):
    """Append one variant line for `row` to its gends input file.

    The line is written to ``./gends_input/{uniprot}_{k}_variants.txt`` in the
    custom format ``wt_pdb_path pos mt_pdb_path``, where ``pos`` is
    ``mutation_position + 1``.

    Args:
        row: mapping-like record (e.g. a DataFrame row) providing "uniprot",
            "relaxed_wild_3D_path", "mutation_position" and
            "relaxed_mutated_3D_path".
        k: index of the sub-dataframe the row belongs to; part of the file name.

    Returns:
        The unchanged `row`, so the function can be used with `DataFrame.apply`.
    """
    # Create the output directory on first use so a fresh checkout does not
    # fail with FileNotFoundError.
    os.makedirs("./gends_input", exist_ok=True)

    wt_pdb_path = row["relaxed_wild_3D_path"]
    pos = int(row["mutation_position"] + 1)
    mt_pdb_path = row["relaxed_mutated_3D_path"]

    # Append-only: the file is never read here, so plain "a" is enough.
    with open(f"./gends_input/{row['uniprot']}_{k}_variants.txt", "a") as f:
        f.write(f"{wt_pdb_path} {pos} {mt_pdb_path}\n")
    return row

# Write one variants file per sub-dataframe.
# append_to_variants opens its file in append mode, so any file left over from a
# previous run is removed first; otherwise re-running this cell would duplicate
# every variant line.
for k, subdf in enumerate(all_uniprot_dfs):
    if subdf.empty:
        # Nothing to write for this index.
        continue
    # Must match the file name built inside append_to_variants.
    variants_path = f"./gends_input/{subdf.iloc[0]['uniprot']}_{k}_variants.txt"
    if os.path.exists(variants_path):
        os.remove(variants_path)
    for _, row in subdf.iterrows():
        append_to_variants(row, k)
print("done")


splitting P06654 in 6 splits
splitting P0ABQ4 in 1 splits
splitting P00648 in 1 splits
splitting P00644 in 6 splits
splitting P07751 in 1 splits
splitting P00720 in 1 splits
splitting AF70 in 6 splits
len(all_uniprot_dfs) 307
done


The following scripts need to be run in the thermonet conda environment:

> conda activate thermonet

In [20]:
# Generate one gends shell script per thread plus a main script launching them all.
# Commands are collected first and each script is written once in "w" mode, so
# re-running this cell overwrites the scripts instead of appending duplicates.
commands_per_thread = [[] for _ in range(THREADS_PARIS)]

for i, subdf in enumerate(all_uniprot_dfs):
    if subdf.empty:
        # No rows, hence no uniprot to read for this index.
        continue
    uniprot = subdf.iloc[0, :]["uniprot"]
    cmd = " ".join([
        "/home/ml/novozymes-prediction/resources/ThermoNet/ThermoNet/gends.py",
        f"-i ./compute_mutated_structures/gends_input/{uniprot}_{i}_variants.txt",
        f"-o ./compute_mutated_structures/gends_output/{uniprot}_{i}_stacked_16_1",
        "-p ./ --boxsize 16 --voxelsize 1",
        "\n"
        ])
    # Round-robin assignment of sub-dataframes to threads.
    commands_per_thread[i % THREADS_PARIS].append(cmd)

# Every thread gets a script (possibly empty) because the main script calls all of them.
for thread_index, commands in enumerate(commands_per_thread):
    script_suffix = f"PARIS_{thread_index}"
    with open(f"../gends_{script_suffix}.sh", "w") as f:
        f.writelines(commands)

with open("../main_gends_PARIS.sh", "w") as f:
    for thread_index in range(THREADS_PARIS):
        f.write(f"bash gends_PARIS_{thread_index}.sh & \n")
