In [1]:
import pandas as pd
from ssbio.protein.structure.properties import freesasa

from utils.features_biopython import add_structure_infos, add_protein_analysis, add_demask_predictions
from utils.file_utils import open_json

# --- Pipeline switches, listed in the order the cells below consult them ---

# Pre-processing of the loaded frame.
DECONSTRUCT_MUTATION_CODE = False  # split "mutation_code" into wild_aa / mutation_position / mutated_aa
CLEAN_DF = False                   # drop rows lacking ddG or alphafold_path

# Feature-enrichment steps (helpers from utils.features_biopython).
ADD_STRUCTURE_INFOS = False        # call add_structure_infos
ADD_PROTEIN_ANALYSIS = True        # call add_protein_analysis
ADD_DEMASK_PREDICTIONS = False     # call add_demask_predictions

# Export step.
DROP_COLUMNS = False               # drop a fixed list of columns before export
SAVE_NEW_CSV = False               # intended to gate writing the dataset CSV

In [2]:
# Alternative dataset path (disabled):
# df = pd.read_csv("./data/main_dataset/main.csv")
# Load the working dataset into `df`; the cells below read and rebind `df`.
# NOTE: the final save cell writes to this same path, so a saved run replaces
# the file that is loaded here.
df = pd.read_csv("./data/main_dataset.csv")

# Quick look at the first rows (disabled):
# df.head(2)

In [3]:
if DECONSTRUCT_MUTATION_CODE:
    # Split each "mutation_code" string into its three parts:
    #   first character      -> wild_aa
    #   last character       -> mutated_aa
    #   everything between   -> mutation_position
    # Vectorised `.str` access replaces the former row-by-row `apply`.
    mutation_codes = df["mutation_code"].str
    df = df.assign(
        wild_aa=mutation_codes[0],
        mutated_aa=mutation_codes[-1],
        # The middle slice is text; convert it so the column is numeric
        # (it used to end up holding strings such as "73").
        mutation_position=pd.to_numeric(mutation_codes[1:-1]),
    )

if CLEAN_DF:
    # Keep only rows that have both a ddG value and an alphafold_path.
    df = df.dropna(subset=["ddG", "alphafold_path"])

# Show the (possibly transformed) frame as the cell output.
df

Unnamed: 0,uniprot,PDB_wild,dataset_source,alphafold_path,wild_aa,mutation_position,mutated_aa,mutated_chain,length,chain_start,...,NH->O_1_relidx,O->NH_1_energy,O->NH_1_relidx,O->NH_2_energy,O->NH_2_relidx,NH->O_2_energy,NH->O_2_relidx,ddG,dTm,Tm
0,P06654,1EM7|2GB1|1PGA,FireProtDB,./data/main_dataset/3D_structures/alphafold/P0...,M,0,A,A,448.0,34.0,...,0.0,-0.2,2.0,0.0,0.0,0.0,0.0,-0.14,,
1,P06654,1EM7|2GB1|1PGA,FireProtDB,./data/main_dataset/3D_structures/alphafold/P0...,M,0,D,A,448.0,34.0,...,0.0,-0.2,2.0,0.0,0.0,0.0,0.0,-0.38,,
2,P06654,1EM7|2GB1|1PGA,FireProtDB,./data/main_dataset/3D_structures/alphafold/P0...,M,0,E,A,448.0,34.0,...,0.0,-0.2,2.0,0.0,0.0,0.0,0.0,-0.64,,
3,P06654,1EM7|2GB1|1PGA,FireProtDB,./data/main_dataset/3D_structures/alphafold/P0...,M,0,F,A,448.0,34.0,...,0.0,-0.2,2.0,0.0,0.0,0.0,0.0,-1.14,,
4,P06654,1EM7|2GB1|1PGA,FireProtDB,./data/main_dataset/3D_structures/alphafold/P0...,M,0,G,A,448.0,34.0,...,0.0,-0.2,2.0,0.0,0.0,0.0,0.0,-0.30,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5482,P00651,1RTB,prothermdb,./data/main_dataset/3D_structures/alphafold/P0...,F,73,L,A,130.0,27.0,...,1.0,-1.4,2.0,-0.2,-1.0,-0.0,53.0,-3.20,,
5483,P00651,1RTB,prothermdb,./data/main_dataset/3D_structures/alphafold/P0...,F,73,V,A,130.0,27.0,...,1.0,-1.4,2.0,-0.2,-1.0,-0.0,53.0,-4.54,,
5484,P00651,1RTB,prothermdb,./data/main_dataset/3D_structures/alphafold/P0...,F,73,A,A,130.0,27.0,...,1.0,-1.4,2.0,-0.2,-1.0,-0.0,53.0,-6.36,,
5485,Q9EYL5,3BCI,prothermdb,./data/main_dataset/3D_structures/alphafold/Q9...,E,121,Q,A,199.0,26.0,...,2.0,-2.2,4.0,-0.2,-1.0,-0.2,3.0,-1.06,,


In [4]:
# Structure-derived features, computed only when ADD_STRUCTURE_INFOS is set:
#   compute_sasa / compute_depth -> residue depth, SASA and C-alpha depth from
#                                   the AlphaFold PDB file
#   compute_dssp                 -> per-residue DSSP information (RSA etc.)
#   compute_bfactor              -> also requested here
if ADD_STRUCTURE_INFOS:
    df = add_structure_infos(
        df,
        compute_sasa=True,
        compute_depth=True,
        compute_dssp=True,
        compute_bfactor=True,
    )
    # A bare expression nested inside `if` is not echoed by the notebook
    # (only the last top-level expression of a cell is), so display explicitly.
    display(df.head(2))

In [5]:
# Gated by ADD_PROTEIN_ANALYSIS: pass `df` through add_protein_analysis
# (imported from utils.features_biopython) and rebind `df` to its result.
if ADD_PROTEIN_ANALYSIS:
    df = add_protein_analysis(df)

In [6]:
# Gated by ADD_DEMASK_PREDICTIONS: pass `df` through add_demask_predictions
# (imported from utils.features_biopython) and rebind `df` to its result.
if ADD_DEMASK_PREDICTIONS:
    df = add_demask_predictions(df)

In [7]:
# Preview the first rows of `df` after the optional feature steps above.
df.head()

Unnamed: 0,uniprot,PDB_wild,dataset_source,alphafold_path,wild_aa,mutation_position,mutated_aa,mutated_chain,length,chain_start,...,NH->O_1_relidx,O->NH_1_energy,O->NH_1_relidx,O->NH_2_energy,O->NH_2_relidx,NH->O_2_energy,NH->O_2_relidx,ddG,dTm,Tm
0,P06654,1EM7|2GB1|1PGA,FireProtDB,./data/main_dataset/3D_structures/alphafold/P0...,M,0,A,A,448.0,34.0,...,0.0,-0.2,2.0,0.0,0.0,0.0,0.0,-0.14,,
1,P06654,1EM7|2GB1|1PGA,FireProtDB,./data/main_dataset/3D_structures/alphafold/P0...,M,0,D,A,448.0,34.0,...,0.0,-0.2,2.0,0.0,0.0,0.0,0.0,-0.38,,
2,P06654,1EM7|2GB1|1PGA,FireProtDB,./data/main_dataset/3D_structures/alphafold/P0...,M,0,E,A,448.0,34.0,...,0.0,-0.2,2.0,0.0,0.0,0.0,0.0,-0.64,,
3,P06654,1EM7|2GB1|1PGA,FireProtDB,./data/main_dataset/3D_structures/alphafold/P0...,M,0,F,A,448.0,34.0,...,0.0,-0.2,2.0,0.0,0.0,0.0,0.0,-1.14,,
4,P06654,1EM7|2GB1|1PGA,FireProtDB,./data/main_dataset/3D_structures/alphafold/P0...,M,0,G,A,448.0,34.0,...,0.0,-0.2,2.0,0.0,0.0,0.0,0.0,-0.3,,


In [8]:
# Optionally remove a fixed set of columns. errors="ignore" keeps the cell
# re-runnable when some of them are already absent from the frame.
if DROP_COLUMNS:
    df = df.drop(
        columns=["mutation_code", "countByFeatureType",
                 "Tm", "AlphaFoldDB", "Texp"],
        errors="ignore",
    )

# features.json is read as a mapping whose values are lists of column names;
# concatenate those lists, in file order, into the final column layout.
feature_groups = open_json("./data/features.json")
ordered_columns = [col for key in feature_groups for col in feature_groups[key]]

# Select and order the columns. Columns named in features.json but missing
# from df are created as NaN (rather than ""), so NaN checks see them and the
# in-memory frame matches what reading the CSV back would give.
df = df.reindex(columns=ordered_columns)

# Only write when explicitly requested. The target is the same file the
# notebook loads, so an unconditional write would silently replace the input.
if SAVE_NEW_CSV:
    df.to_csv("./data/main_dataset.csv", index=False)


In [12]:
# Feature columns = ordered_columns without its first four and last four entries.
# NOTE(review): presumably the leading entries are identifier columns and the
# trailing ones are targets — confirm against data/features.json.
features_columns = ordered_columns[4:-4]
# Total number of NaN cells across the feature columns.
df[features_columns].isna().sum().sum()


0

In [10]:
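# Scratch test (disabled): median b_factor per residue_number from one AlphaFold PDB file, read with biopandas.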
# from biopandas.pdb import PandasPdb

# alphafold_path = "./data/main_dataset/3D_structures/alphafold/P00509.pdb"
# pdb_df = PandasPdb().read_pdb(alphafold_path)
# atom_df = pdb_df.df['ATOM']
# b_factor = atom_df.groupby("residue_number")[
#     "b_factor"].apply(lambda x: x.median())
# b_factor.to_list()

In [11]:
# Scratch test (disabled): parse one AlphaFold PDB file with Bio.PDB and run get_dssp_data on it.
# from Bio.PDB.PDBParser import PDBParser
# from utils.features_biopython import get_dssp_data

# alphafold_path = "./data/main_dataset/3D_structures/alphafold/P00651.pdb"
# pdb_parser = PDBParser()
# structure = pdb_parser.get_structure("", alphafold_path)
# dssp = get_dssp_data(alphafold_path, structure)
# dssp
