In [7]:
import pandas as pd
from ssbio.protein.structure.properties import freesasa

from utils.features_biopython import add_structure_infos, add_protein_analysis, add_demask_predictions
from utils.file_utils import open_json
from utils.infos_translation import aa_char2int

# --- Optional feature stages (each flag gates one cell further down) ---
ADD_STRUCTURE_INFOS = False      # run add_structure_infos(...)
ADD_PROTEIN_ANALYSIS = False     # run add_protein_analysis(...)
ADD_DEMASK_PREDICTIONS = False   # run add_demask_predictions(...)
CONVERT_MUTATION_TO_INT = True   # add wild_aa_int / mutated_aa_int via aa_char2int

# --- Row / column filtering ---
CLEAN_DF = False       # drop rows without alphafold_path (and without ddG if KEEP_ONLY_DDG)
KEEP_ONLY_DDG = True   # also drops the dTm / Tm columns when saving
DROP_COLUMNS = False   # drop mutation_code / AlphaFoldDB when saving

# --- Input / output ---
START_FRESH = False    # True: read the infos-only CSV; False: resume from the features CSV
SAVE_NEW_CSV = True    # write the resulting frame to DATASET_OUTPUT_PATH

DATASET_DIR = "./data/main_dataset_creation/outputs/all/"
DATASET_INPUT_PATH = f"{DATASET_DIR}dataset_only_infos.csv"
DATASET_OUTPUT_PATH = f"{DATASET_DIR}dataset_with_features.csv"


In [8]:
# Start from the infos-only CSV when START_FRESH is set; otherwise resume from
# the previously written features CSV.
source_csv = DATASET_INPUT_PATH if START_FRESH else DATASET_OUTPUT_PATH
df = pd.read_csv(source_csv)

# df.head(2)

In [9]:
if CLEAN_DF:
    # Row count before filtering, for comparison with the count printed below.
    print(len(df))
    if KEEP_ONLY_DDG:
        # Keep only rows that carry a ddG value.
        has_ddg = df.ddG.notna()
        df = df[has_ddg]
    # Keep only rows that carry an alphafold_path.
    has_alphafold_path = df.alphafold_path.notna()
    df = df[has_alphafold_path]
    print(len(df))

# df

In [15]:
if CONVERT_MUTATION_TO_INT:
    # Encode both residue columns through the aa_char2int lookup; a residue
    # missing from the lookup raises instead of silently producing NaN.
    for aa_column in ("wild_aa", "mutated_aa"):
        df[f"{aa_column}_int"] = df[aa_column].apply(lambda aa: aa_char2int[aa])


In [None]:
# Structure-derived features, as described by the original notes:
#   compute_sasa / compute_depth -> residue depth, SASA and C-alpha depth
#                                   computed from the AlphaFold PDB file
#   compute_dssp                 -> per-residue DSSP infos (rsa etc.)
#   compute_bfactor              -> presumably per-residue B-factor; confirm
#                                   against utils.features_biopython
if ADD_STRUCTURE_INFOS:
    df = add_structure_infos(
        df,
        compute_sasa=True,
        compute_depth=True,
        compute_dssp=True,
        compute_bfactor=True,
    )
    # A bare expression nested inside an `if` is not auto-rendered by the
    # notebook (only a top-level last expression is), so show it explicitly.
    # `display` is provided by the IPython kernel without an import.
    display(df.head(2))

In [None]:
# Optional stage: pass the frame through add_protein_analysis, or leave it as is.
df = add_protein_analysis(df) if ADD_PROTEIN_ANALYSIS else df

In [None]:
# Optional stage: pass the frame through add_demask_predictions, or leave it as is.
df = add_demask_predictions(df) if ADD_DEMASK_PREDICTIONS else df

In [None]:
# Preview the first rows of the frame after the optional stages above.
df.head(n=5)

In [16]:
# Canonical column order. features.json is read as a mapping whose values are
# lists of column names; the groups are concatenated in file order.
# This is loaded outside the SAVE_NEW_CSV guard because the following cell
# slices `ordered_columns` even when nothing is saved.
feature_groups = open_json("./data/features.json")
ordered_columns = [
    column for group in feature_groups.values() for column in group
]

if SAVE_NEW_CSV:
    # errors="ignore" keeps the cell re-runnable: when the frame was reloaded
    # from a previously written output CSV these columns may already be gone.
    # Non-inplace drops avoid mutating a frame that may be a filtered slice.
    if DROP_COLUMNS:
        df = df.drop(columns=["mutation_code", "AlphaFoldDB"], errors="ignore")
    if KEEP_ONLY_DDG:
        df = df.drop(columns=["dTm", "Tm"], errors="ignore")

    # Reorder to the canonical layout; columns not computed yet are added as
    # NaN. NaN is written to CSV as an empty field, exactly like the former ""
    # placeholder, but stays visible to isna() checks in memory.
    df = df.reindex(columns=ordered_columns)

    df.to_csv(DATASET_OUTPUT_PATH, index=False)


In [None]:
# Skip the first 4 and last 4 entries of the canonical column order
# (presumably non-feature columns; confirm against features.json).
features_columns = ordered_columns[4:-4]

# Total number of missing cells across the selected feature columns.
missing_per_column = df[features_columns].isna().sum()
missing_per_column.sum()


In [None]:
# # test (disabled): median b_factor per residue of one AlphaFold PDB, read with biopandas #
# from biopandas.pdb import PandasPdb

# alphafold_path = "./data/main_dataset/3D_structures/alphafold/P00509.pdb"
# pdb_df = PandasPdb().read_pdb(alphafold_path)
# atom_df = pdb_df.df['ATOM']
# b_factor = atom_df.groupby("residue_number")[
#     "b_factor"].apply(lambda x: x.median())
# b_factor.to_list()

In [None]:
# test (disabled): parse one AlphaFold PDB with Bio.PDB and run get_dssp_data on it
# from Bio.PDB.PDBParser import PDBParser
# from utils.features_biopython import get_dssp_data

# alphafold_path = "./data/main_dataset/3D_structures/alphafold/P00651.pdb"
# pdb_parser = PDBParser()
# structure = pdb_parser.get_structure("", alphafold_path)
# dssp = get_dssp_data(alphafold_path, structure)
# dssp
