In [1]:
import difflib
import pandas as pd
from ssbio.protein.structure.properties import freesasa

from utils.features_biopython import add_structure_infos, add_protein_analysis, add_demask_predictions
from utils.file_utils import open_json

COMPUTE_MUTATION_CODE = True
ADD_DEMASK_PREDICTIONS = True
ADD_PROTEIN_ANALYSIS = True
ADD_STRUCTURE_INFOS = True
CLEAN_DF = False
DROP_COLUMNS = False
SAVE_NEW_CSV = True
START_FRESH = True

In [2]:
# seq_id 32559 is the wildtype sequence (just the chain though)
sequence_only_chain = "VPVNPEPDATSVENVALKTGSGDSQSDPIKADLEVKGQSALPFDVDCWAILCKGAPNVLQRVNEKTKNSNRDRSGANKGPFKDPQKWGIKALPPKNPSWSAQDFKSPEEYAFASSLQGGTNAILAPVNLASQNSQGGVLNGFYSANKVAQFDPSKPQQTKGTWFQITKFTGAAGPYCKALGSNDKSVCDKNKNIAGDWGFDPAKWAYQYDEKNNKFNYVGK"
wildtype_sequence = "MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDSQSDPIKADLEVKGQSALPFDVDCWAILCKGAPNVLQRVNEKTKNSNRDRSGANKGPFKDPQKWGIKALPPKNPSWSAQDFKSPEEYAFASSLQGGTNAILAPVNLASQNSQGGVLNGFYSANKVAQFDPSKPQQTKGTWFQITKFTGAAGPYCKALGSNDKSVCDKNKNIAGDWGFDPAKWAYQYDEKNNKFNYVGK"
chain_start = 22
chain_end = 243
wildtype_sequence[chain_start:chain_end]==sequence_only_chain

True

In [3]:
if START_FRESH:
    df = pd.read_csv("./data/Kaggle/test.csv")
    df["uniprot"] = "AF70"
    df["PDB_wild"] = "AF70"
    df["alphafold_path"] = "./data/main_dataset/3D_structures/alphafold/AF70.pdb"
    df["dataset_source"] = "Novozymes"
    df["sequence"] = wildtype_sequence
    df["chain_start"] = 22
    df["chain_end"] = 243
    df["mutated_chain"] = "A"
    df["length"] = len(wildtype_sequence)
    df.drop(columns=["data_source"], inplace=True)
else:
    df = pd.read_csv("./data/processed_test.csv")    
df.head(2)

Unnamed: 0,seq_id,protein_sequence,pH,uniprot,PDB_wild,alphafold_path,dataset_source,sequence,chain_start,chain_end,mutated_chain,length
0,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset/3D_structures/alphafold/AF...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243
1,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset/3D_structures/alphafold/AF...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243


In [4]:
protein_sequence = "VPVNPEPDATSVENVALKTGSGDSQSDPIKADLEVKGQSALPFDVDCWAILCKGAPNVLQRVNEKTKNSNRDRSGANKGPFKDPQKWGIKALPPKNPSWSAQDFKSPEEYAFASSLQGGTNAILAPVNLASQNSQGGVLNGFYSANKVAQFDPSKPQQTKGTWFQITKFTGAAGPYCKALGSNDKSVCDKNKNIAGDWGFDPAKWAYQYDEKNNKFNYVG"
row = {"protein_sequence": protein_sequence}

sequence = wildtype_sequence[:chain_start]+row["protein_sequence"]

if len(wildtype_sequence)==len(sequence):
    for i, c in enumerate(wildtype_sequence):
        if c != sequence[i]:
            row["mutation_position"] = i
            row["wild_aa"] = wildtype_sequence[i]
            row["mutated_aa"] = sequence[i]
            break
if len(wildtype_sequence) > len(sequence):
    for i, c in enumerate(sequence):
        if c != wildtype_sequence[i]:
            row["mutation_position"] = i
            row["wild_aa"] = wildtype_sequence[i]
            row["mutated_aa"] = "-"
            break
    else:
        row["mutation_position"] = i+1
        row["wild_aa"] = wildtype_sequence[i+1]
        row["mutated_aa"] = "-"

if len(wildtype_sequence) < len(sequence):
    print(f"len wt < len s: this should not happen ! see {row['seq_id']}")



print(row)


{'protein_sequence': 'VPVNPEPDATSVENVALKTGSGDSQSDPIKADLEVKGQSALPFDVDCWAILCKGAPNVLQRVNEKTKNSNRDRSGANKGPFKDPQKWGIKALPPKNPSWSAQDFKSPEEYAFASSLQGGTNAILAPVNLASQNSQGGVLNGFYSANKVAQFDPSKPQQTKGTWFQITKFTGAAGPYCKALGSNDKSVCDKNKNIAGDWGFDPAKWAYQYDEKNNKFNYVG', 'mutation_position': 242, 'wild_aa': 'K', 'mutated_aa': '-'}


In [5]:
if COMPUTE_MUTATION_CODE:
    df["wild_aa"] = ""
    df["mutated_aa"] = ""
    df["mutation_position"] = 0

    def construct_mutation_code(row, wildtype_sequence):
        # we use the fact that we know the mutations are all single mutations
        # and that there are some deletions
        sequence = wildtype_sequence[:chain_start]+row["protein_sequence"]
        if len(wildtype_sequence) == len(sequence):
            for i, c in enumerate(wildtype_sequence):
                if c != sequence[i]:
                    row["mutation_position"] = i
                    row["wild_aa"] = wildtype_sequence[i]
                    row["mutated_aa"] = sequence[i]
                    break
        if len(wildtype_sequence) > len(sequence):
            for i, c in enumerate(sequence):
                if c != wildtype_sequence[i]:
                    row["mutation_position"] = i
                    row["wild_aa"] = wildtype_sequence[i]
                    row["mutated_aa"] = "-"
                    break
            else:
                row["mutation_position"] = i+1
                row["wild_aa"] = wildtype_sequence[i+1]
                row["mutated_aa"] = "-"

        if len(wildtype_sequence) < len(sequence):
            print(f"len wt < len s: this should not happen ! see {row['seq_id']}")

        return row

    df = df.apply(lambda row: construct_mutation_code(
        row, wildtype_sequence), axis=1)


df

Unnamed: 0,seq_id,protein_sequence,pH,uniprot,PDB_wild,alphafold_path,dataset_source,sequence,chain_start,chain_end,mutated_chain,length,wild_aa,mutated_aa,mutation_position
0,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset/3D_structures/alphafold/AF...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243,L,E,38
1,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset/3D_structures/alphafold/AF...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243,L,K,38
2,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,AF70,AF70,./data/main_dataset/3D_structures/alphafold/AF...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243,L,-,38
3,31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset/3D_structures/alphafold/AF...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243,K,C,39
4,31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset/3D_structures/alphafold/AF...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243,K,F,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2408,33798,VPVNPEPDATSVENVILKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset/3D_structures/alphafold/AF...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243,A,I,37
2409,33799,VPVNPEPDATSVENVLLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset/3D_structures/alphafold/AF...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243,A,L,37
2410,33800,VPVNPEPDATSVENVNLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset/3D_structures/alphafold/AF...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243,A,N,37
2411,33801,VPVNPEPDATSVENVPLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset/3D_structures/alphafold/AF...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243,A,P,37


In [6]:
# add residue depth, sasa and c_alpha depth computed from alphafold pdb file => compute_sasa = True, compute_depth = True
# add residue dssp infos (rsa etc.) => compute_dssp = True
if ADD_STRUCTURE_INFOS:
    df = add_structure_infos(df, compute_sasa=True,
                             compute_depth=True, compute_dssp=True, compute_bfactor=True)
    df.head(2)

100%|██████████| 1/1 [00:17<00:00, 17.88s/it]


In [7]:
"seq_id" in df.columns.to_list()


Unnamed: 0,NH->O_1_energy,NH->O_1_relidx,NH->O_2_energy,NH->O_2_relidx,O->NH_1_energy,O->NH_1_relidx,O->NH_2_energy,O->NH_2_relidx,PDB_wild,Phi,...,mutated_chain,mutation_position,pH,protein_sequence,residue_depth,sasa,seq_id,sequence,uniprot,wild_aa
0,-0.1,11.0,-0.0,13.0,-0.9,2.0,-0.2,-1.0,AF70,-54.4,...,A,38,8,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,1.575880,136.087626,31390,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,AF70,L
1,-0.1,11.0,-0.0,13.0,-0.9,2.0,-0.2,-1.0,AF70,-54.4,...,A,38,8,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,1.575880,136.087626,31391,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,AF70,L
2,-0.1,11.0,-0.0,13.0,-0.9,2.0,-0.2,-1.0,AF70,-54.4,...,A,38,8,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,1.575880,136.087626,31392,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,AF70,L
3,-1.1,-2.0,-0.3,11.0,-1.6,11.0,-0.0,2.0,AF70,-92.6,...,A,39,8,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,1.635789,118.020739,31393,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,AF70,K
4,-1.1,-2.0,-0.3,11.0,-1.6,11.0,-0.0,2.0,AF70,-92.6,...,A,39,8,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,1.635789,118.020739,31394,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,AF70,K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2408,-1.8,-2.0,-0.2,1.0,-1.1,2.0,-0.2,13.0,AF70,41.1,...,A,37,8,VPVNPEPDATSVENVILKTGSGDSQSDPIKADLEVKGQSALPFDVD...,1.483619,79.639585,33798,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,AF70,A
2409,-1.8,-2.0,-0.2,1.0,-1.1,2.0,-0.2,13.0,AF70,41.1,...,A,37,8,VPVNPEPDATSVENVLLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,1.483619,79.639585,33799,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,AF70,A
2410,-1.8,-2.0,-0.2,1.0,-1.1,2.0,-0.2,13.0,AF70,41.1,...,A,37,8,VPVNPEPDATSVENVNLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,1.483619,79.639585,33800,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,AF70,A
2411,-1.8,-2.0,-0.2,1.0,-1.1,2.0,-0.2,13.0,AF70,41.1,...,A,37,8,VPVNPEPDATSVENVPLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,1.483619,79.639585,33801,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,AF70,A


In [8]:
if ADD_PROTEIN_ANALYSIS:
    df = add_protein_analysis(df)

"seq_id" in df.columns.to_list()


In [9]:
if ADD_DEMASK_PREDICTIONS:
    df = add_demask_predictions(df)

"seq_id" in df.columns.to_list()


  0%|          | 0/1 [00:00<?, ?it/s]

error: prediction contains more than one element
row:  NH->O_1_energy             0.0
NH->O_1_relidx             0.0
NH->O_2_energy             0.0
NH->O_2_relidx             0.0
O->NH_1_energy            -0.1
                        ...   
delta_charge_at_pH   -0.149344
demask_score               0.0
demask_entropy             0.0
demask_log2f_var           0.0
demask_matrix              0.0
Name: 1169, Length: 62, dtype: object


100%|██████████| 1/1 [00:02<00:00,  2.42s/it]


In [16]:
"seq_id" in df.columns.to_list()

['uniprot',
 'PDB_wild',
 'dataset_source',
 'alphafold_path',
 'wild_aa',
 'mutation_position',
 'mutated_aa',
 'mutated_chain',
 'length',
 'chain_start',
 'chain_end',
 'pH',
 'sequence',
 'blosum62',
 'blosum80',
 'blosum90',
 'demask_entropy',
 'demask_log2f_var',
 'demask_matrix',
 'demask_score',
 'aromaticity',
 'charge_at_pH',
 'flexibility',
 'gravy',
 'helix_fraction',
 'isoelectric_point',
 'instability_index',
 'molWeight',
 'molar_extinction_1',
 'molar_extinction_2',
 'sheet_fraction',
 'turn_fraction',
 'delta_aromaticity',
 'delta_charge_at_pH',
 'delta_flexibility',
 'delta_gravy',
 'delta_helix_fraction',
 'delta_isoelectric_point',
 'delta_instability_index',
 'delta_molecular_weight',
 'delta_molar_extinction_1',
 'delta_molar_extinction_2',
 'delta_sheet_fraction',
 'delta_turn_fraction',
 'bfactor',
 'sasa',
 'residue_depth',
 'c_alpha_depth',
 'Secondary structure',
 'Relative ASA',
 'Psi',
 'Phi',
 'NH->O_1_energy',
 'NH->O_1_relidx',
 'O->NH_1_energy',
 'O->NH

In [15]:
if SAVE_NEW_CSV or True:
    if DROP_COLUMNS:
        df.drop(columns=["mutation_code", "countByFeatureType",
                "Tm", "AlphaFoldDB", "Texp"], inplace=True)
    
    ordered_columns = open_json("./data/features.json")
    ordered_columns = sum([ordered_columns[k] for k in ordered_columns], [])
    ordered_columns = ordered_columns[:-4]

    for col in ordered_columns:
        if col not in df.columns.to_list():
            df[col] = ""

    df = df[["seq_id"]+ordered_columns]

    df.to_csv("./data/processed_test.csv", index=False)


KeyError: "['seq_id'] not in index"

In [12]:
features_columns = ordered_columns[4:]
df[features_columns].isna().sum()


wild_aa                      0
mutation_position            0
mutated_aa                   0
mutated_chain                0
length                       0
chain_start                  0
chain_end                    0
pH                           0
sequence                     0
blosum62                     0
blosum80                     0
blosum90                     0
demask_entropy               0
demask_log2f_var             0
demask_matrix                0
demask_score                 0
aromaticity                  0
charge_at_pH                 0
flexibility                  0
gravy                        0
helix_fraction               0
isoelectric_point            0
instability_index            0
molWeight                    0
molar_extinction_1           0
molar_extinction_2           0
sheet_fraction               0
turn_fraction                0
delta_aromaticity            0
delta_charge_at_pH           0
delta_flexibility            0
delta_gravy                  0
delta_he

In [13]:
# # test #
# from biopandas.pdb import PandasPdb

# alphafold_path = "./data/main_dataset/3D_structures/alphafold/P00509.pdb"
# pdb_df = PandasPdb().read_pdb(alphafold_path)
# atom_df = pdb_df.df['ATOM']
# b_factor = atom_df.groupby("residue_number")[
#     "b_factor"].apply(lambda x: x.median())
# b_factor.to_list()

In [14]:
# test
# from Bio.PDB.PDBParser import PDBParser
# from utils.features_biopython import get_dssp_data

# alphafold_path = "./data/main_dataset/3D_structures/alphafold/P00651.pdb"
# pdb_parser = PDBParser()
# structure = pdb_parser.get_structure("", alphafold_path)
# dssp = get_dssp_data(alphafold_path, structure)
# dssp
