In [1]:
import pandas as pd

from utils.features_biopython import add_structure_infos, add_protein_analysis
from utils.demask_features import add_demask_predictions
from utils.file_utils import open_json
from utils.infos_translation import aa_char2int
from utils.esm_features import add_esm_features

# --- Dataset location ---------------------------------------------------------
NAME = "all_v3"
DATASET_DIR = f"./data/main_dataset_creation/outputs/{NAME}/"
DATASET_INPUT_PATH = DATASET_DIR + "dataset_with_3D_paths.csv"
DATASET_OUTPUT_PATH = DATASET_DIR + "dataset_with_features.csv"

# --- Loading / cleaning -------------------------------------------------------
START_FRESH = False  # True: read DATASET_INPUT_PATH; False: read DATASET_OUTPUT_PATH
CLEAN_DF = True  # drop rows that miss one of the essential columns
ONLY_DDG = True  # while cleaning, also drop rows without a ddG value
CONVERT_MUTATION_TO_INT = False  # add the wild_aa_int / mutated_aa_int columns

# --- Feature steps (each flag gates the matching add_* call below) ------------
ADD_STRUCTURE_INFOS = True
ADD_PROTEIN_ANALYSIS = False
ADD_DEMASK_PREDICTIONS = False
ADD_ESM_FEATURES = False

# --- Output -------------------------------------------------------------------
SAVE_NEW_CSV = False  # write df to DATASET_OUTPUT_PATH
DROP_COLUMNS = False  # when saving, first drop "mutation_code" and "AlphaFoldDB"




In [2]:
# A fresh start reads the raw input CSV; otherwise resume from the CSV that a
# previous run of this notebook saved.
df = pd.read_csv(DATASET_INPUT_PATH if START_FRESH else DATASET_OUTPUT_PATH)

# df.head(2)

In [3]:
if CLEAN_DF:
    print(len(df))
    # Columns that must be present on every kept row; ddG is only mandatory
    # when ONLY_DDG is set.
    required_columns = (["ddG"] if ONLY_DDG else []) + [
        "wild_aa",
        "mutation_position",
        "mutated_aa",
        "sequence",
        "alphafold_path",
        "relaxed_wild_3D_path",
        "relaxed_mutated_3D_path",
    ]
    # Drop every row that has a missing value in any required column.
    df = df.dropna(subset=required_columns)
    print(len(df))
    # print(df.isna().sum().to_dict())

# df

6587
6587


In [4]:
if CONVERT_MUTATION_TO_INT:
    # Look each amino-acid value up in aa_char2int and store the result in a
    # sibling "*_int" column (an unknown value raises, as with direct indexing).
    int_column_for = {"wild_aa": "wild_aa_int", "mutated_aa": "mutated_aa_int"}
    for aa_column, int_column in int_column_for.items():
        df[int_column] = df[aa_column].apply(lambda aa: aa_char2int[aa])


In [5]:
# Structure features requested from add_structure_infos:
#   compute_sasa=True, compute_depth=True -> residue depth, sasa and c_alpha depth
#                                            computed from the alphafold pdb file
#   compute_dssp=True                     -> residue dssp infos (rsa etc.)
if ADD_STRUCTURE_INFOS:
    df = add_structure_infos(
        df,
        compute_sasa=True,
        compute_depth=True,
        compute_dssp=True,
        compute_bfactor=True,
    )

# Observed speed per option:
#   sasa    2 s/it
#   depth   20+ s/it
#   dssp    1.15 it/s
#   bfactor 1.3 it/s
#   all     22.5 s/it

  7%|▋         | 20/288 [03:01<40:32,  9.08s/it] 


KeyboardInterrupt: 

In [None]:
# Optional step: run add_protein_analysis on df (toggled by ADD_PROTEIN_ANALYSIS).
df = add_protein_analysis(df) if ADD_PROTEIN_ANALYSIS else df


In [None]:
# Optional step: run add_demask_predictions on df (toggled by ADD_DEMASK_PREDICTIONS).
df = add_demask_predictions(df) if ADD_DEMASK_PREDICTIONS else df
    

In [None]:
# Optional step: run add_esm_features on df without reusing saved embeddings
# (toggled by ADD_ESM_FEATURES).
# TODO: extract embeddings from all available proteins, not just the ones with ddg
df = add_esm_features(df, use_saved_embeddings=False) if ADD_ESM_FEATURES else df

In [None]:
# Report how many missing values each column still has.
print(df.isna().sum().to_dict())

if SAVE_NEW_CSV:
    if DROP_COLUMNS:
        # errors="ignore": not every dataset version carries both columns, and a
        # missing one must not abort the save with a KeyError.
        # Rebinding instead of inplace=True keeps the step safe to re-run.
        df = df.drop(columns=["mutation_code", "AlphaFoldDB"], errors="ignore")
    # if ONLY_DDG:
    #     df.drop(columns=["dTm", "Tm"], inplace=True)

    # features.json is indexed by key, each key holding a list of column names;
    # flatten those lists into one list, keeping the file's order.
    columns_by_group = open_json("./data/features.json")
    ordered_columns = [
        column for group in columns_by_group.values() for column in group
    ]

    # for col in ordered_columns:
    #     if col not in df.columns.to_list():
    #         df[col] = ""

    # df = df[ordered_columns]

    df.to_csv(DATASET_OUTPUT_PATH, index=False)
    # NOTE(review): presumably the first and last 4 entries are non-feature
    # columns - confirm against features.json.
    features_columns = ordered_columns[4:-4]
    print(df.isna().sum())
    # A bare `df.head()` inside an `if` body is never displayed by IPython
    # (only a trailing top-level expression is), so print the preview instead.
    print(df.head())


{'uniprot': 0, 'wild_aa': 0, 'mutated_chain': 0, 'mutation_position': 0, 'mutated_aa': 0, 'pH': 0, 'sequence': 0, 'length': 0, 'chain_start': 0, 'chain_end': 0, 'AlphaFoldDB': 569, 'Tm': 5627, 'ddG': 0, 'dTm': 6243, 'dataset_source': 0, 'infos_found': 0, 'alphafold_path': 0, 'relaxed_wild_3D_path': 0, 'relaxed_mutated_3D_path': 0, 'esm_pca_pool_0': 27, 'esm_pca_pool_1': 27, 'esm_pca_pool_2': 27, 'esm_pca_pool_3': 27, 'esm_pca_pool_4': 27, 'esm_pca_pool_5': 27, 'esm_pca_pool_6': 27, 'esm_pca_pool_7': 27, 'esm_pca_pool_8': 27, 'esm_pca_pool_9': 27, 'esm_pca_pool_10': 27, 'esm_pca_pool_11': 27, 'esm_pca_pool_12': 27, 'esm_pca_pool_13': 27, 'esm_pca_pool_14': 27, 'esm_pca_pool_15': 27, 'esm_pca_pool_16': 27, 'esm_pca_pool_17': 27, 'esm_pca_pool_18': 27, 'esm_pca_pool_19': 27, 'esm_pca_pool_20': 27, 'esm_pca_pool_21': 27, 'esm_pca_pool_22': 27, 'esm_pca_pool_23': 27, 'esm_pca_pool_24': 27, 'esm_pca_pool_25': 27, 'esm_pca_pool_26': 27, 'esm_pca_pool_27': 27, 'esm_pca_pool_28': 27, 'esm_pca_p