In [1]:
import pandas as pd
from ssbio.protein.structure.properties import freesasa

from utils.features_biopython import add_structure_infos, add_protein_analysis, add_demask_predictions

# Pipeline switches: each flag gates one optional step further down the notebook.

# Run add_demask_predictions(df).
ADD_DEMASK_PREDICTIONS = True
# Run add_protein_analysis(df).
ADD_PROTEIN_ANALYSIS = False
# Run add_structure_infos(df, ...) with sasa, depth, dssp and b-factor enabled.
ADD_STRUCTURE_INFOS = False
# Split `mutation_code` into wild_aa / mutation_position / mutated_aa.
DECONSTRUCT_MUTATION_CODE = False
# Drop rows whose ddG or alphafold_path is missing.
CLEAN_DF = False
# Drop a fixed list of columns just before the CSV is written.
DROP_COLUMNS = False
# Intended to gate the CSV export.
# NOTE(review): the export cell tests `SAVE_NEW_CSV or True`, so this flag
# currently has no effect and the CSV is always written.
SAVE_NEW_CSV = False

In [2]:
# Load the working dataset. The export cell near the end of the notebook
# writes back to this same path, so a full run overwrites the input file.
# Alternative input location, kept for reference:
# df = pd.read_csv("./data/main_dataset/main.csv")
df = pd.read_csv("./data/main_dataset.csv")

# df.head(2)

In [3]:
# Optional step: split every `mutation_code` string into its three parts.
if DECONSTRUCT_MUTATION_CODE:
    # The code is sliced positionally: first character -> wild-type residue,
    # last character -> mutated residue, everything in between -> position.
    mutation_codes = df["mutation_code"].str
    df = df.assign(
        wild_aa=mutation_codes[0],
        mutated_aa=mutation_codes[-1],
        # The middle slice is text; cast it so the column holds integers
        # instead of strings. A non-numeric position raises here rather than
        # slipping through silently.
        mutation_position=mutation_codes[1:-1].astype(int),
    )


# Optional step: keep only rows that have both a ddG value and an
# alphafold_path.
if CLEAN_DF:
    df = df.dropna(subset=["ddG", "alphafold_path"])

df

Unnamed: 0,uniprot,PDB_wild,ddG,wild_aa,mutation_position,mutated_aa,mutated_chain,length,chain_start,chain_end,...,NH->O_2_energy,delta_isoelectric_point,NH->O_1_energy,O->NH_1_relidx,delta_molar_extinction_2,blosum90,dTm,dataset_source,alphafold_path,sequence
0,P06654,1EM7|2GB1|1PGA,-0.14,M,0,A,A,448.0,34.0,417.0,...,0.0,-0.000171,0.0,2.0,0,-2.0,,FireProtDB,./data/main_dataset/3D_structures/alphafold/P0...,MEKEKKVKYFLRKSAFGLASVSAAFLVGSTVFAVDSPIEDTPIIRN...
1,P06654,1EM7|2GB1|1PGA,-0.38,M,0,D,A,448.0,34.0,417.0,...,0.0,0.026771,0.0,2.0,0,-4.0,,FireProtDB,./data/main_dataset/3D_structures/alphafold/P0...,MEKEKKVKYFLRKSAFGLASVSAAFLVGSTVFAVDSPIEDTPIIRN...
2,P06654,1EM7|2GB1|1PGA,-0.64,M,0,E,A,448.0,34.0,417.0,...,0.0,0.021656,0.0,2.0,0,-3.0,,FireProtDB,./data/main_dataset/3D_structures/alphafold/P0...,MEKEKKVKYFLRKSAFGLASVSAAFLVGSTVFAVDSPIEDTPIIRN...
3,P06654,1EM7|2GB1|1PGA,-1.14,M,0,F,A,448.0,34.0,417.0,...,0.0,-0.000171,0.0,2.0,0,-1.0,,FireProtDB,./data/main_dataset/3D_structures/alphafold/P0...,MEKEKKVKYFLRKSAFGLASVSAAFLVGSTVFAVDSPIEDTPIIRN...
4,P06654,1EM7|2GB1|1PGA,-0.30,M,0,G,A,448.0,34.0,417.0,...,0.0,-0.000171,0.0,2.0,0,-4.0,,FireProtDB,./data/main_dataset/3D_structures/alphafold/P0...,MEKEKKVKYFLRKSAFGLASVSAAFLVGSTVFAVDSPIEDTPIIRN...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5482,P00651,1RTB,-3.20,F,73,L,A,130.0,27.0,130.0,...,-0.0,0.000000,-0.1,2.0,0,0.0,,prothermdb,./data/main_dataset/3D_structures/alphafold/P0...,MMYSKLLTLTTLLLPTALALPSLVERACDYTCGSNCYSSSDVSTAQ...
5483,P00651,1RTB,-4.54,F,73,V,A,130.0,27.0,130.0,...,-0.0,0.000000,-0.1,2.0,0,-2.0,,prothermdb,./data/main_dataset/3D_structures/alphafold/P0...,MMYSKLLTLTTLLLPTALALPSLVERACDYTCGSNCYSSSDVSTAQ...
5484,P00651,1RTB,-6.36,F,73,A,A,130.0,27.0,130.0,...,-0.0,0.000000,-0.1,2.0,0,-3.0,,prothermdb,./data/main_dataset/3D_structures/alphafold/P0...,MMYSKLLTLTTLLLPTALALPSLVERACDYTCGSNCYSSSDVSTAQ...
5485,Q9EYL5,3BCI,-1.06,E,121,Q,A,199.0,26.0,199.0,...,-0.2,-0.068079,-0.2,4.0,0,2.0,,prothermdb,./data/main_dataset/3D_structures/alphafold/Q9...,MTKKLLTLFIVSMLILTACGKKESATTSSKNGKPLVVVYGDYKCPY...


In [4]:
# Optional step: structural features, per the switches passed below.
#   compute_sasa / compute_depth -> residue depth, SASA and C-alpha depth
#                                   computed from the AlphaFold PDB file
#   compute_dssp                 -> per-residue DSSP information (RSA etc.)
#   compute_bfactor              -> NOTE(review): presumably a per-residue
#                                   b-factor; confirm in utils.features_biopython
if ADD_STRUCTURE_INFOS:
    df = add_structure_infos(
        df,
        compute_sasa=True,
        compute_depth=True,
        compute_dssp=True,
        compute_bfactor=True,
    )
    # A bare expression nested inside an `if` is not echoed by the notebook
    # (only a top-level trailing expression is), so ask for the preview
    # explicitly. `display` is injected into the namespace by the IPython kernel.
    display(df.head(2))

In [5]:
# Optional step: replace df with the frame returned by add_protein_analysis
# (imported from utils.features_biopython; the columns it adds are not
# visible from this notebook).
if ADD_PROTEIN_ANALYSIS:
    df = add_protein_analysis(df)

In [6]:
# Optional step: replace df with the frame returned by add_demask_predictions
# (imported from utils.features_biopython). In the recorded run the preview
# that follows shows demask_score, demask_entropy, demask_log2f_var and
# demask_matrix columns.
if ADD_DEMASK_PREDICTIONS:
    df = add_demask_predictions(df)

100%|██████████| 257/257 [00:53<00:00,  4.84it/s]


In [7]:
# Preview the first rows after the optional enrichment steps above.
df.head()

Unnamed: 0,uniprot,PDB_wild,ddG,wild_aa,mutation_position,mutated_aa,mutated_chain,length,chain_start,chain_end,...,delta_molar_extinction_2,blosum90,dTm,dataset_source,alphafold_path,sequence,demask_score,demask_entropy,demask_log2f_var,demask_matrix
0,P06654,1EM7|2GB1|1PGA,-0.14,M,0,A,A,448.0,34.0,417.0,...,0,-2.0,,FireProtDB,./data/main_dataset/3D_structures/alphafold/P0...,MEKEKKVKYFLRKSAFGLASVSAAFLVGSTVFAVDSPIEDTPIIRN...,-0.3793,0.0015,-17.8728,-0.2641
1,P06654,1EM7|2GB1|1PGA,-0.38,M,0,D,A,448.0,34.0,417.0,...,0,-4.0,,FireProtDB,./data/main_dataset/3D_structures/alphafold/P0...,MEKEKKVKYFLRKSAFGLASVSAAFLVGSTVFAVDSPIEDTPIIRN...,-0.4907,0.0015,-17.8728,-0.41
2,P06654,1EM7|2GB1|1PGA,-0.64,M,0,E,A,448.0,34.0,417.0,...,0,-3.0,,FireProtDB,./data/main_dataset/3D_structures/alphafold/P0...,MEKEKKVKYFLRKSAFGLASVSAAFLVGSTVFAVDSPIEDTPIIRN...,-0.4809,0.0015,-17.8728,-0.3972
3,P06654,1EM7|2GB1|1PGA,-1.14,M,0,F,A,448.0,34.0,417.0,...,0,-1.0,,FireProtDB,./data/main_dataset/3D_structures/alphafold/P0...,MEKEKKVKYFLRKSAFGLASVSAAFLVGSTVFAVDSPIEDTPIIRN...,-0.3843,0.0015,-17.8728,-0.2707
4,P06654,1EM7|2GB1|1PGA,-0.3,M,0,G,A,448.0,34.0,417.0,...,0,-4.0,,FireProtDB,./data/main_dataset/3D_structures/alphafold/P0...,MEKEKKVKYFLRKSAFGLASVSAAFLVGSTVFAVDSPIEDTPIIRN...,-0.4602,0.0015,-17.8728,-0.37


In [8]:
# Reorder the columns and write the dataset back to disk.
# NOTE(review): `or True` makes this condition always true, so SAVE_NEW_CSV is
# ignored and every run overwrites ./data/main_dataset.csv (the same file the
# notebook loads at the top). Left as-is to preserve current behaviour; remove
# the override and set SAVE_NEW_CSV = True once that is confirmed as the intent.
if SAVE_NEW_CSV or True:
    if DROP_COLUMNS:
        df = df.drop(
            columns=["mutation_code", "countByFeatureType",
                     "Tm", "AlphaFoldDB", "Texp"]
        )

    # Columns pinned to the front and to the back of the exported file.
    # Selecting them below raises KeyError if any is missing from df.
    FIRST_COLUMNS = ["uniprot", "PDB_wild", "ddG", "wild_aa", "mutation_position",
                     "mutated_aa", "mutated_chain", "length", "chain_start", "chain_end",
                     "pH", "sasa", "residue_depth", "c_alpha_depth", "molWeight"]
    LAST_COLUMNS = ["dataset_source", "alphafold_path", "sequence"]

    # Every other column goes in between, in its existing order. Building this
    # list through set arithmetic would make the column order differ from run
    # to run (string hashing is randomised per process).
    pinned_columns = set(FIRST_COLUMNS) | set(LAST_COLUMNS)
    middle_columns = [column for column in df.columns
                      if column not in pinned_columns]

    df = df[FIRST_COLUMNS + middle_columns + LAST_COLUMNS]

    df.to_csv("./data/main_dataset.csv", index=False)

In [9]:
# # test #
# from biopandas.pdb import PandasPdb

# alphafold_path = "./data/main_dataset/3D_structures/alphafold/P00509.pdb"
# pdb_df = PandasPdb().read_pdb(alphafold_path)
# atom_df = pdb_df.df['ATOM']
# b_factor = atom_df.groupby("residue_number")[
#     "b_factor"].apply(lambda x: x.median())
# b_factor.to_list()

In [10]:
# test
# from Bio.PDB.PDBParser import PDBParser
# from utils.features_biopython import get_dssp_data

# alphafold_path = "./data/main_dataset/3D_structures/alphafold/P00651.pdb"
# pdb_parser = PDBParser()
# structure = pdb_parser.get_structure("", alphafold_path)
# dssp = get_dssp_data(alphafold_path, structure)
# dssp


In [12]:
# Record the current column names of df in ./data/features.json.
# write_json is a project helper from utils.file_utils (not shown here);
# presumably it serialises the given list as JSON -- confirm in that module.
# NOTE(review): this import sits in the last cell rather than with the other
# imports at the top, and the execution count jumps from In [10] to In [12];
# re-check the notebook with Restart Kernel -> Run All.
from utils.file_utils import write_json
write_json("./data/features.json", df.columns.to_list())