In [1]:
import pandas as pd
from ssbio.protein.structure.properties import freesasa

from utils.features_biopython import add_structure_infos, add_protein_analysis, add_demask_predictions
from utils.file_utils import open_json
from utils.infos_translation import aa_char2int

# --- Input selection -------------------------------------------------------
# True  -> start from the raw dataset (DATASET_INPUT_PATH)
# False -> resume from the previously written features file (DATASET_OUTPUT_PATH)
START_FRESH = False

# --- Row / column filtering ------------------------------------------------
CLEAN_DF = False        # drop rows lacking an alphafold_path (and ddG, see below)
KEEP_ONLY_DDG = True    # require ddG when cleaning; drop dTm / Tm before saving
DROP_COLUMNS = False    # drop mutation_code / AlphaFoldDB before saving

# --- Feature steps ---------------------------------------------------------
CONVERT_MUTATION_TO_INT = False
ADD_STRUCTURE_INFOS = False
ADD_PROTEIN_ANALYSIS = True
ADD_DEMASK_PREDICTIONS = True

# --- Output ----------------------------------------------------------------
SAVE_NEW_CSV = True

# --- Paths -----------------------------------------------------------------
NAME = "all_v2"
DATASET_DIR = f"./data/main_dataset_creation/outputs/{NAME}/"
DATASET_INPUT_PATH = f"{DATASET_DIR}dataset_with_alphafold_paths.csv"
DATASET_OUTPUT_PATH = f"{DATASET_DIR}dataset_with_features.csv"


In [2]:
# Read the raw dataset on a fresh start; otherwise resume from the features
# file written by an earlier run of this notebook.
df = pd.read_csv(DATASET_INPUT_PATH if START_FRESH else DATASET_OUTPUT_PATH)

# df.head(2)

In [3]:
if CLEAN_DF:
    # Row count before filtering.
    print(len(df))
    # Every kept row needs an alphafold_path; a ddG value is additionally
    # required when KEEP_ONLY_DDG is set. dropna removes a row as soon as
    # any of the listed columns is missing.
    df = df.dropna(
        subset=(["ddG"] if KEEP_ONLY_DDG else []) + ["alphafold_path"]
    )
    # Row count after filtering.
    print(len(df))

# df

In [4]:
if CONVERT_MUTATION_TO_INT:
    # Add an "<column>_int" companion for both amino-acid columns by looking
    # each value up in aa_char2int (a value without an entry raises).
    for aa_column in ("wild_aa", "mutated_aa"):
        df[f"{aa_column}_int"] = df[aa_column].apply(lambda aa: aa_char2int[aa])


In [5]:
# Add per-residue structural features computed from the AlphaFold PDB file:
#   compute_sasa=True, compute_depth=True -> residue depth, SASA and C-alpha depth
#   compute_dssp=True                     -> DSSP residue infos (rsa etc.)
#   compute_bfactor=True                  -> presumably per-residue B-factor;
#                                            confirm in utils.features_biopython
if ADD_STRUCTURE_INFOS:
    df = add_structure_infos(df, compute_sasa=True,
                             compute_depth=True, compute_dssp=True, compute_bfactor=True)
    # A bare expression nested inside an `if` body is not the cell's last
    # top-level expression, so the notebook never renders it. Show the preview
    # explicitly instead (`display` is provided by the IPython kernel).
    display(df.head(2))

In [6]:
# Checkpoint: persist the frame as it stands (written regardless of
# SAVE_NEW_CSV) before the remaining feature steps run.
df.to_csv(path_or_buf=DATASET_OUTPUT_PATH, index=False)


In [7]:
# Optionally enrich the frame via add_protein_analysis; the frame is left
# untouched when the flag is off.
# NOTE(review): which columns the helper adds is not visible here — confirm in
# utils.features_biopython.
df = add_protein_analysis(df) if ADD_PROTEIN_ANALYSIS else df

In [8]:
# Optionally enrich the frame via add_demask_predictions; the frame is left
# untouched when the flag is off.
# NOTE(review): the recorded run logged "error: prediction contains more than
# one element" for several rows, and those rows print demask_* values of 0.0 —
# verify how the helper treats such rows.
df = add_demask_predictions(df) if ADD_DEMASK_PREDICTIONS else df

  6%|▌         | 19/308 [00:09<02:09,  2.23it/s]

error: prediction contains more than one element
row:  AlphaFoldDB           NaN
NH->O_1_energy        0.0
NH->O_1_relidx        0.0
NH->O_2_energy        0.0
NH->O_2_relidx        0.0
                     ... 
delta_charge_at_pH    0.0
demask_score          0.0
demask_entropy        0.0
demask_log2f_var      0.0
demask_matrix         0.0
Name: 1295, Length: 65, dtype: object
error: prediction contains more than one element
row:  AlphaFoldDB           NaN
NH->O_1_energy       -0.2
NH->O_1_relidx        2.0
NH->O_2_energy       -0.2
NH->O_2_relidx       -3.0
                     ... 
delta_charge_at_pH    0.0
demask_score          0.0
demask_entropy        0.0
demask_log2f_var      0.0
demask_matrix         0.0
Name: 1296, Length: 65, dtype: object
error: prediction contains more than one element
row:  AlphaFoldDB           NaN
NH->O_1_energy       -0.5
NH->O_1_relidx       -2.0
NH->O_2_energy       -0.0
NH->O_2_relidx        2.0
                     ... 
delta_charge_at_pH    0.0
demas

 92%|█████████▏| 283/308 [01:48<00:08,  2.82it/s]

error: prediction contains more than one element
row:  AlphaFoldDB             P42771
NH->O_1_energy            -2.3
NH->O_1_relidx            -4.0
NH->O_2_energy            -0.2
NH->O_2_relidx             2.0
                        ...   
delta_charge_at_pH    0.999965
demask_score               0.0
demask_entropy             0.0
demask_log2f_var           0.0
demask_matrix              0.0
Name: 6402, Length: 65, dtype: object
error: prediction contains more than one element
row:  AlphaFoldDB           P42771
NH->O_1_energy          -0.2
NH->O_1_relidx          -2.0
NH->O_2_energy          -0.2
NH->O_2_relidx           1.0
                       ...  
delta_charge_at_pH       0.0
demask_score             0.0
demask_entropy           0.0
demask_log2f_var         0.0
demask_matrix            0.0
Name: 6403, Length: 65, dtype: object
error: prediction contains more than one element
row:  AlphaFoldDB             P42771
NH->O_1_energy            -1.9
NH->O_1_relidx            -4.0
NH->O_

 95%|█████████▍| 292/308 [01:51<00:05,  2.81it/s]

error: prediction contains more than one element
row:  AlphaFoldDB           P23176
NH->O_1_energy          -2.8
NH->O_1_relidx          -4.0
NH->O_2_energy          -0.2
NH->O_2_relidx           1.0
                       ...  
delta_charge_at_pH       0.0
demask_score             0.0
demask_entropy           0.0
demask_log2f_var         0.0
demask_matrix            0.0
Name: 7901, Length: 65, dtype: object
error: prediction contains more than one element
row:  AlphaFoldDB           P23176
NH->O_1_energy          -2.5
NH->O_1_relidx         -18.0
NH->O_2_energy          -0.0
NH->O_2_relidx           2.0
                       ...  
delta_charge_at_pH       0.0
demask_score             0.0
demask_entropy           0.0
demask_log2f_var         0.0
demask_matrix            0.0
Name: 7904, Length: 65, dtype: object


 95%|█████████▌| 293/308 [01:51<00:05,  2.79it/s]

error: prediction contains more than one element
row:  AlphaFoldDB           P01593
NH->O_1_energy          -3.0
NH->O_1_relidx         -15.0
NH->O_2_energy          -0.5
NH->O_2_relidx          -2.0
                       ...  
delta_charge_at_pH       0.0
demask_score             0.0
demask_entropy           0.0
demask_log2f_var         0.0
demask_matrix            0.0
Name: 7906, Length: 65, dtype: object
error: prediction contains more than one element
row:  AlphaFoldDB           P01593
NH->O_1_energy          -1.3
NH->O_1_relidx          -4.0
NH->O_2_energy          -0.6
NH->O_2_relidx          -3.0
                       ...  
delta_charge_at_pH       0.0
demask_score             0.0
demask_entropy           0.0
demask_log2f_var         0.0
demask_matrix            0.0
Name: 7907, Length: 65, dtype: object
error: prediction contains more than one element
row:  AlphaFoldDB           P01593
NH->O_1_energy          -2.0
NH->O_1_relidx          15.0
NH->O_2_energy          -0.3
NH->O

 97%|█████████▋| 299/308 [01:54<00:03,  2.79it/s]

error: prediction contains more than one element
row:  AlphaFoldDB             P10912
NH->O_1_energy            -0.0
NH->O_1_relidx             2.0
NH->O_2_energy            -0.0
NH->O_2_relidx             1.0
                        ...   
delta_charge_at_pH   -0.999718
demask_score               0.0
demask_entropy             0.0
demask_log2f_var           0.0
demask_matrix              0.0
Name: 8109, Length: 65, dtype: object
error: prediction contains more than one element
row:  AlphaFoldDB            P10912
NH->O_1_energy           -0.0
NH->O_1_relidx            2.0
NH->O_2_energy           -0.0
NH->O_2_relidx            1.0
                       ...   
delta_charge_at_pH    0.00017
demask_score              0.0
demask_entropy            0.0
demask_log2f_var          0.0
demask_matrix             0.0
Name: 8110, Length: 65, dtype: object
error: prediction contains more than one element
row:  AlphaFoldDB             P10912
NH->O_1_energy            -0.0
NH->O_1_relidx            

100%|██████████| 308/308 [01:57<00:00,  2.62it/s]


In [9]:
# Preview the first five rows after the feature steps.
df.head(5)

Unnamed: 0,AlphaFoldDB,NH->O_1_energy,NH->O_1_relidx,NH->O_2_energy,NH->O_2_relidx,O->NH_1_energy,O->NH_1_relidx,O->NH_2_energy,O->NH_2_relidx,Phi,...,blosum62,blosum80,blosum90,delta_instability_index,delta_flexibility,delta_charge_at_pH,demask_score,demask_entropy,demask_log2f_var,demask_matrix
0,P06654,0.0,0.0,0.0,0.0,-0.2,2.0,0.0,0.0,360.0,...,-1.0,-1.0,-2.0,0.0,-0.001524,-0.16508,-0.3793,0.0015,-17.8728,-0.2641
1,P06654,0.0,0.0,0.0,0.0,-0.2,2.0,0.0,0.0,360.0,...,-3.0,-4.0,-4.0,0.0,-0.005524,0.84712,-0.4907,0.0015,-17.8728,-0.41
2,P06654,0.0,0.0,0.0,0.0,-0.2,2.0,0.0,0.0,360.0,...,-2.0,-2.0,-3.0,-0.727679,-0.006762,0.810264,-0.4809,0.0015,-17.8728,-0.3972
3,P06654,0.0,0.0,0.0,0.0,-0.2,2.0,0.0,0.0,360.0,...,0.0,0.0,-1.0,0.0,0.001762,-0.149344,-0.3843,0.0015,-17.8728,-0.2707
4,P06654,0.0,0.0,0.0,0.0,-0.2,2.0,0.0,0.0,360.0,...,-3.0,-4.0,-4.0,0.168304,-0.003762,-0.149344,-0.4602,0.0015,-17.8728,-0.37


In [10]:
# Canonical column order: features.json maps group names to lists of column
# names; flatten the groups in file order. This is loaded outside the
# SAVE_NEW_CSV guard because the following sanity-check cell reads
# `ordered_columns` unconditionally.
column_groups = open_json("./data/features.json")
ordered_columns = [
    col for group in column_groups for col in column_groups[group]
]

if SAVE_NEW_CSV:
    # errors="ignore" keeps the cell re-runnable: when resuming from an
    # already-processed output file these columns may no longer exist.
    if DROP_COLUMNS:
        df = df.drop(columns=["mutation_code", "AlphaFoldDB"], errors="ignore")
    if KEEP_ONLY_DDG:
        df = df.drop(columns=["dTm", "Tm"], errors="ignore")

    # Create any expected-but-missing column as an empty-string placeholder so
    # the selection below cannot fail.
    # NOTE(review): "" is not NaN, so such placeholder columns are not counted
    # by an isna() check — confirm this is intended.
    for col in ordered_columns:
        if col not in df.columns:
            df[col] = ""

    # Keep only the expected columns, in canonical order, and write the result.
    df = df[ordered_columns]

    df.to_csv(DATASET_OUTPUT_PATH, index=False)


In [11]:
# Sanity check: total number of missing values across the feature columns.
# NOTE(review): the slice skips the first four and last four entries of
# ordered_columns — presumably non-feature columns; confirm against
# ./data/features.json.
features_columns = ordered_columns[4:-4]
df.loc[:, features_columns].isnull().sum().sum()


0

In [12]:
# # test #
# from biopandas.pdb import PandasPdb

# alphafold_path = "./data/main_dataset/3D_structures/alphafold/P00509.pdb"
# pdb_df = PandasPdb().read_pdb(alphafold_path)
# atom_df = pdb_df.df['ATOM']
# b_factor = atom_df.groupby("residue_number")[
#     "b_factor"].apply(lambda x: x.median())
# b_factor.to_list()

In [13]:
# test
# from Bio.PDB.PDBParser import PDBParser
# from utils.features_biopython import get_dssp_data

# alphafold_path = "./data/main_dataset/3D_structures/alphafold/P00651.pdb"
# pdb_parser = PDBParser()
# structure = pdb_parser.get_structure("", alphafold_path)
# dssp = get_dssp_data(alphafold_path, structure)
# dssp
