# Split voxel features
In order to speed up computing we computed the voxel features by batch

We now need to split those and add the infos the the dataset

We already have relaxed_wild_3D_path and relaxed_mutated_3D_path and now want voxel_direct_path and voxel_reversed_path as well


In [47]:
import numpy as np
import pandas as pd
from glob import glob
import os
import re


In [48]:
DATASET_INPUT = "../data/main_dataset_creation/outputs/merged/dataset_with_3D_paths.csv"
SUBMISSION_INPUT = "../data/main_dataset_creation/outputs/all_v3/submission_all_features_filled_nan.csv"

COMPUTE_SUBMISSION = False
REMOVE_EMPTY_VOXELS = True

DATASET_OUTPUT = "../data/main_dataset_creation/outputs/merged/dataset_with_voxel.csv"
SUBMISSION_OUTPUT = "../data/main_dataset_creation/outputs/all_v3/submission_with_voxel.csv"


In [49]:
train_df = pd.read_csv(DATASET_INPUT)
if COMPUTE_SUBMISSION:
    submission_df = pd.read_csv(SUBMISSION_INPUT)


In [50]:
def split_features(name: str, base_dir="./", output_dir="./splitted_voxel_features/", errors={}):
    """
    function that creates a df identification infos and the avg prediction values of thermonet
    (both direct and reversed)

    returns:
    a pandas.DataFrame which has the columns:
    ['wild_path', 'position', 'mutated_path',
        'direct_thermonet', 'reversed_thermonet']
    """
    uniprot = name.split("_")[0]
    variant_path = base_dir+"gends_input/"+name+"_variants.txt"
    base_features_path = base_dir+"gends_output/"+name

    # first we get the variant list in a df, in order to know to which mutations each value corresponds to
    variant_df = pd.read_csv(variant_path,
                             names=["wild_path", "position", "mutated_path"],
                             sep=' ')
    variant_df.position = variant_df.position.apply(lambda x: x-1).astype(int)
    variant_df = pd.concat([variant_df, pd.DataFrame(columns=["direct_voxel_path",
                                                              "reversed_voxel_path"])],
                           axis=1)

    # then we load the features from the gends outputs
    try:
        direct_features = np.load(
            base_features_path+"_stacked_16_1_direct.npy")
        reversed_features = np.load(
            base_features_path+"_stacked_16_1_reversed.npy")
    except Exception as e:
        print(f"Exception raised for {name}, {base_features_path}: {e}")
        print(f"not adding features for {name}")
        return pd.DataFrame(), errors
    if len(variant_df) != direct_features.shape[0]:
        if REMOVE_EMPTY_VOXELS:
            os.remove(base_features_path+"_stacked_16_1_direct.npy")
            os.remove(base_features_path+"_stacked_16_1_reversed.npy")
            print(f"rm 2 bad shape voxels: {base_features_path+'_stacked_16_1_direct.npy'} and {base_features_path+'_stacked_16_1_reversed.npy'}")

        errors[name] = {
            "name" : name, 
            "base_features_path" : base_features_path, 
            "len" : len(variant_df), 
            "direct_features" : direct_features.shape[0]}
        print(
            f"ERROR for {name}, {base_features_path}, {len(variant_df)=} != {direct_features.shape[0]=}")
        return pd.DataFrame(), errors
    for i, row in variant_df.iterrows():
        # ./compute_mutated_structures/relaxed_pdb/P03050_relaxed/P03050_relaxed_P8A_relaxed.pdb
        # => P8A
        mutated_path = row["mutated_path"]
        try:
            result = re.search('_relaxed_(.*)_relaxed', mutated_path)
            mutation_code = result.group(1)
        except:
            # case where we have: compute_mutated_structures/relaxed_pdb/AF70_alphafold/K212__unrelaxed_rank_1_model_3_relaxed.pdb
            result = re.search('_alphafold/(.*)_unrelaxed', mutated_path)
            mutation_code = result.group(1)

        direct = direct_features[i]
        reversed = reversed_features[i]
        direct_path = output_dir+f"{uniprot}_{mutation_code}_direct"
        reversed_path = output_dir+f"{uniprot}_{mutation_code}_reversed"

        variant_df.loc[i, "direct_voxel_path"] = direct_path
        variant_df.loc[i, "reversed_voxel_path"] = reversed_path
        np.save(direct_path, direct)
        np.save(reversed_path, reversed)

    return variant_df, errors


def update_main_df(row, main_df: pd.DataFrame):
    # we get mutated_path as a unique protein+mutation identifier
    # but multiple record could have the same mutation on the same protein
    # for example same mutation at different pH

    mutated_path = row["mutated_path"]
    main_df.loc[
        (main_df.relaxed_mutated_3D_path.eq(mutated_path)),
        "direct_voxel_path"
    ] = row["direct_voxel_path"]
    main_df.loc[
        (main_df.relaxed_mutated_3D_path.eq(mutated_path)),
        "reversed_voxel_path"
    ] = row["reversed_voxel_path"]

    return row


def split_voxel_features(main_df: pd.DataFrame, base_dir="./", errors={}):
    main_df = pd.concat([main_df, pd.DataFrame(columns=["direct_voxel_path",
                                                        "reversed_voxel_path"])],
                        axis=1)

    all_variants = glob(
        f"{base_dir}gends_input/*_variants.txt")
    all_names = [v.split('/')[-1].split("_variants.txt")[0]
                 for v in all_variants]
    for name in all_names:
        variant_df, errors = split_features(
            name, base_dir=base_dir, errors=errors)
        variant_df.apply(lambda row: update_main_df(row, main_df), axis=1)

    return main_df, errors


In [51]:
train_df, errors = split_voxel_features(train_df)
print(errors)
# train_df.to_csv(DATASET_OUTPUT, index=False)
if COMPUTE_SUBMISSION:
    submission_df = split_voxel_features(submission_df)
    submission_df.to_csv(SUBMISSION_OUTPUT, index=False)


Exception raised for P17350_1347, ./gends_output/P17350_1347: [Errno 2] No such file or directory: './gends_output/P17350_1347_stacked_16_1_direct.npy'
not adding features for P17350_1347
Exception raised for P60175_1423, ./gends_output/P60175_1423: [Errno 2] No such file or directory: './gends_output/P60175_1423_stacked_16_1_direct.npy'
not adding features for P60175_1423
Exception raised for P01764_1379, ./gends_output/P01764_1379: [Errno 2] No such file or directory: './gends_output/P01764_1379_stacked_16_1_direct.npy'
not adding features for P01764_1379
Exception raised for P47189_1371, ./gends_output/P47189_1371: [Errno 2] No such file or directory: './gends_output/P47189_1371_stacked_16_1_direct.npy'
not adding features for P47189_1371
Exception raised for P21873_1215, ./gends_output/P21873_1215: [Errno 2] No such file or directory: './gends_output/P21873_1215_stacked_16_1_direct.npy'
not adding features for P21873_1215
Exception raised for R9S082_1311, ./gends_output/R9S082_1311

In [52]:
train_df = pd.read_csv(DATASET_OUTPUT)
if COMPUTE_SUBMISSION:
    submission_df = pd.read_csv(SUBMISSION_OUTPUT)


FileNotFoundError: [Errno 2] No such file or directory: '../data/main_dataset_creation/outputs/merged/dataset_with_voxel.csv'

In [None]:
train_df = train_df[~(train_df.direct_voxel_path.isna())]
train_df.to_csv(DATASET_OUTPUT, index=False)


In [None]:
train_df = pd.read_csv(DATASET_OUTPUT)
train_df.direct_voxel_path.isna().sum()

0

In [None]:
import re
s = "compute_mutated_structures/relaxed_pdb/AF70_alphafold/K212__unrelaxed_rank_1_model_3_relaxed.pdb"
result = re.search('_alphafold/(.*)_unrelaxed', s)
result.group(1)

'K212_'