This notebook tests the different feature computations. Because they can take a long time to compute, a multiprocessing script is also available.
However, the ESM features use CUDA, so they cannot be multiprocessed (we don't have multiple GPUs yet ;D). They can be computed in this notebook instead; just make sure to change `DATASET_OUTPUT_PATH`.

In [1]:
import pandas as pd

from utils.features_biopython import add_structure_infos, add_protein_analysis
from utils.demask_features import add_demask_predictions
from utils.file_utils import open_json
from utils.infos_translation import aa_char2int
from utils.esm_features import add_esm_features
from utils.rosetta_features import add_rosetta_scores

# --- Feature switches: each one gates a single feature cell below ---
ADD_ROSETTA_SCORES = True
ADD_ESM_FEATURES = True
ADD_STRUCTURE_INFOS = False
ADD_PROTEIN_ANALYSIS = False
ADD_DEMASK_PREDICTIONS = False

# --- Data-handling switches ---
START_FRESH = True  # True: read DATASET_INPUT_PATH; False: read DATASET_OUTPUT_PATH
CLEAN_DF = True  # drop rows that miss essential values
ONLY_DDG = False  # when cleaning, also drop rows without ddG (also forwarded to the ESM step)
CONVERT_MUTATION_TO_INT = False  # add integer-encoded wild/mutated amino-acid columns
SAVE_NEW_CSV = True  # write the resulting frame to DATASET_OUTPUT_PATH

# --- Dataset locations ---
NAME = "merged"
DATASET_DIR = f"./data/main_dataset_creation/outputs/{NAME}/"
DATASET_INPUT_PATH = f"{DATASET_DIR}dataset_with_voxel.csv"
DATASET_OUTPUT_PATH = f"{DATASET_DIR}dataset_with_esm_rosetta.csv"




In [2]:
# Start from the input dataset, or resume from a previously written output file.
source_csv = DATASET_INPUT_PATH if START_FRESH else DATASET_OUTPUT_PATH
df = pd.read_csv(source_csv)

# df.head(2)

In [3]:
if CLEAN_DF:
    # Columns whose value must be present for a row to be kept.
    essential_columns = [
        "wild_aa",
        "mutation_position",
        "mutated_aa",
        "sequence",
        "alphafold_path",
        "relaxed_wild_3D_path",
        "relaxed_mutated_3D_path",
    ]

    print(len(df))  # row count before cleaning
    if ONLY_DDG:
        # keep only rows that carry a ddG value
        df = df.dropna(subset=["ddG"])
    # drop every row missing at least one essential value
    df = df.dropna(subset=essential_columns)
    print(len(df))  # row count after cleaning
    # print(df.isna().sum().to_dict())

# df

10583
10583


In [4]:
if CONVERT_MUTATION_TO_INT:
    # Source amino-acid column -> integer-encoded column to create from it.
    int_column_for = {
        "wild_aa": "wild_aa_int",
        "mutated_aa": "mutated_aa_int",
    }
    for char_column, int_column in int_column_for.items():
        # Look each amino-acid character up in aa_char2int (unknown characters raise).
        df[int_column] = df[char_column].apply(lambda aa: aa_char2int[aa])


In [5]:
# Optional step, gated by ADD_ROSETTA_SCORES: pass the frame through
# utils.rosetta_features.add_rosetta_scores and rebind df to its return value.
# NOTE(review): presumably this is what produces the alphafold_* score columns
# (e.g. alphafold_total_score) seen in the saved output — confirm in the helper.
if ADD_ROSETTA_SCORES:
    df = add_rosetta_scores(df)

In [6]:
# Optional step, gated by ADD_STRUCTURE_INFOS (utils.features_biopython.add_structure_infos):
# - compute_sasa=True, compute_depth=True: add residue depth, SASA and C-alpha depth,
#   computed from the AlphaFold PDB file
# - compute_dssp=True: add per-residue DSSP information (RSA etc.)
# - compute_bfactor=True: NOTE(review): presumably adds a B-factor feature — confirm in the helper
if ADD_STRUCTURE_INFOS:
    df = add_structure_infos(df, compute_sasa=True,
                             compute_depth=True, compute_dssp=True, compute_bfactor=True)

# Throughput observed by the author (s/it = seconds per iteration, it/s = iterations per second):
#   sasa:    2 s/it
#   depth:   20+ s/it
#   dssp:    1.15 it/s
#   bfactor: 1.3 it/s
#   all:     22.5 s/it

In [7]:
# Optional step, gated by ADD_PROTEIN_ANALYSIS: pass the frame through
# utils.features_biopython.add_protein_analysis and rebind df to its return value.
# NOTE(review): the exact columns added are defined in the helper — not visible here.
if ADD_PROTEIN_ANALYSIS:
    df = add_protein_analysis(df)


In [8]:
# Optional step, gated by ADD_DEMASK_PREDICTIONS: pass the frame through
# utils.demask_features.add_demask_predictions and rebind df to its return value.
# NOTE(review): the exact columns added are defined in the helper — not visible here.
if ADD_DEMASK_PREDICTIONS:
    df = add_demask_predictions(df)
    

In [9]:
# Optional step, gated by ADD_ESM_FEATURES: pass the frame through
# utils.esm_features.add_esm_features and rebind df to its return value.
# This step uses CUDA, which is why it is run from this notebook rather than
# from the multiprocessing script.
# - use_saved_embeddings=False: NOTE(review): presumably forces the embeddings to be
#   recomputed instead of loaded from a cache — confirm in the helper.
# - only_ddg=ONLY_DDG: forwards the same ddG-only switch used by the cleaning cell.
if ADD_ESM_FEATURES:
    df = add_esm_features(df, use_saved_embeddings=False, only_ddg=ONLY_DDG)

loaded model
cuda
Extracting embeddings from proteins...


432it [01:45,  4.10it/s]


cuda


In [10]:
# Missing-value count per column, computed once and reused for both printouts.
missing_per_column = df.isna().sum()
print(missing_per_column.to_dict())

if SAVE_NEW_CSV:
    # Persist the enriched dataset (without the index), then show a short summary.
    df.to_csv(DATASET_OUTPUT_PATH, index=False)
    print(missing_per_column)
    print(df.head())


{'AlphaFoldDB': 1499, 'Tm': 6976, 'alphafold_decoy': 10583, 'alphafold_dslf_fa13': 0, 'alphafold_fa_atr': 0, 'alphafold_fa_dun': 0, 'alphafold_fa_elec': 0, 'alphafold_fa_intra_rep': 0, 'alphafold_fa_intra_sol_xover4': 0, 'alphafold_fa_rep': 0, 'alphafold_fa_sol': 0, 'alphafold_hbond_bb_sc': 0, 'alphafold_hbond_lr_bb': 0, 'alphafold_hbond_sc': 0, 'alphafold_hbond_sr_bb': 0, 'alphafold_linear_chainbreak': 0, 'alphafold_lk_ball_wtd': 0, 'alphafold_omega': 0, 'alphafold_overlap_chainbreak': 0, 'alphafold_p_aa_pp': 0, 'alphafold_path': 0, 'alphafold_pro_close': 0, 'alphafold_rama_prepro': 0, 'alphafold_ref': 0, 'alphafold_total_score': 0, 'alphafold_yhh_planarity': 0, 'chain_end': 0, 'chain_start': 0, 'dTm': 5812, 'dataset_source': 0, 'ddG': 2136, 'direct_voxel_path': 0, 'esm_mutation_entropy': 37, 'esm_mutation_probability': 37, 'esm_pca_local_0': 37, 'esm_pca_local_1': 37, 'esm_pca_local_10': 37, 'esm_pca_local_11': 37, 'esm_pca_local_12': 37, 'esm_pca_local_13': 37, 'esm_pca_local_14': 3