In [5]:
import pandas as pd
import numpy as np
import glob
from pprint import pprint

from utils.features_biopython import add_structure_infos, add_protein_analysis
from utils.demask_features import add_demask_predictions
from utils.esm_features import submission_compute_pca, submission_add_esm_features
from utils.file_utils import open_json
from utils.infos_translation import aa_char2int


# Pipeline switches. Each flag enables the notebook cell guarded by `if <FLAG>:`;
# cells whose flag is False are skipped, so `df` must already carry their columns.
UPDATE_PATHS = False  # rebuild the alphafold / relaxed wild / relaxed mutated 3D path columns
ADD_ESM_FEATURES = False  # run submission_compute_pca + submission_add_esm_features
COMPUTE_MUTATION_CODE = False  # derive wild_aa / mutated_aa / mutation_position per row
ADD_DEMASK_PREDICTIONS = False  # run add_demask_predictions
ADD_PROTEIN_ANALYSIS = False  # run add_protein_analysis
ADD_STRUCTURE_INFOS = False  # run add_structure_infos (sasa, depth, dssp, bfactor)
CONVERT_MUTATION_TO_INT = False  # add the *_int encodings of residues and chain id
DROP_COLUMNS = False  # NOTE(review): not referenced by any cell visible in this notebook section
SAVE_NEW_CSV = False  # write df to OUTPUT_DIR + "test_with_esm_features.csv"
START_FRESH = False  # True: rebuild df from ./data/Kaggle/test.csv; False: load ./data/processed_test.csv
CHECK_DATA = True  # reload the NaN-filled submission CSV and print columns that still contain NaN
FILL_NAN = True  # fill NaN values with each column's 0.25 quantile and save the result

# Directory that holds the feature CSV files read and written below.
OUTPUT_DIR = "./data/main_dataset_creation/outputs/all_v3/"
# Same file that the SAVE_NEW_CSV cell writes.
# NOTE(review): INPUT_DATASET is not read by any cell visible in this notebook section.
INPUT_DATASET = OUTPUT_DIR+"test_with_esm_features.csv"


In [2]:
# seq_id 32559 is the wildtype sequence (just the chain though)
# `sequence_only_chain` is that chain; `wildtype_sequence` is the full-length
# sequence, i.e. the chain preceded by chain_start extra residues.
sequence_only_chain = "VPVNPEPDATSVENVALKTGSGDSQSDPIKADLEVKGQSALPFDVDCWAILCKGAPNVLQRVNEKTKNSNRDRSGANKGPFKDPQKWGIKALPPKNPSWSAQDFKSPEEYAFASSLQGGTNAILAPVNLASQNSQGGVLNGFYSANKVAQFDPSKPQQTKGTWFQITKFTGAAGPYCKALGSNDKSVCDKNKNIAGDWGFDPAKWAYQYDEKNNKFNYVGK"
wildtype_sequence = "MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDSQSDPIKADLEVKGQSALPFDVDCWAILCKGAPNVLQRVNEKTKNSNRDRSGANKGPFKDPQKWGIKALPPKNPSWSAQDFKSPEEYAFASSLQGGTNAILAPVNLASQNSQGGVLNGFYSANKVAQFDPSKPQQTKGTWFQITKFTGAAGPYCKALGSNDKSVCDKNKNIAGDWGFDPAKWAYQYDEKNNKFNYVGK"
# 0-based slice bounds of the chain inside wildtype_sequence (chain_end is exclusive).
chain_start = 22
chain_end = 243
# Sanity check displayed as the cell output: True when the slice reproduces the chain exactly.
wildtype_sequence[chain_start:chain_end]==sequence_only_chain

True

In [3]:
if START_FRESH:
    # Rebuild the working frame from the raw Kaggle test set and attach the
    # constant metadata shared by every row (all rows are variants of AF70).
    df = pd.read_csv("./data/Kaggle/test.csv")
    df["uniprot"] = "AF70"
    df["PDB_wild"] = "AF70"
    df["alphafold_path"] = "./data/main_dataset/3D_structures/alphafold/AF70.pdb"
    df["dataset_source"] = "Novozymes"
    df["sequence"] = wildtype_sequence
    # Reuse the chain bounds defined with the wildtype sequence instead of
    # repeating the literals, so the two cannot drift apart.
    df["chain_start"] = chain_start
    df["chain_end"] = chain_end
    df["mutated_chain"] = "A"
    df["length"] = len(wildtype_sequence)
    # The raw file's data_source column is replaced by dataset_source above.
    df = df.drop(columns=["data_source"])
else:
    # Resume from the previously processed test set.
    df = pd.read_csv("./data/processed_test.csv")
df.head(2)

Unnamed: 0,seq_id,protein_sequence,pH,uniprot,PDB_wild,alphafold_path,dataset_source,sequence,chain_start,chain_end,mutated_chain,length
0,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset/3D_structures/alphafold/AF...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243
1,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset/3D_structures/alphafold/AF...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243


In [4]:
if COMPUTE_MUTATION_CODE:
    # Placeholder columns, filled in row by row by construct_mutation_code.
    df["wild_aa"] = ""
    df["mutated_aa"] = ""
    df["mutation_position"] = 0

    def construct_mutation_code(row, wildtype_sequence):
        """Fill wild_aa / mutated_aa / mutation_position for one row.

        The row's protein_sequence is prefixed with the first `chain_start`
        residues of `wildtype_sequence` (notebook global), so positions are
        0-based indices into the full wildtype sequence. Only single
        substitutions and single deletions are expected.

        Args:
            row: a dataframe row with "protein_sequence" and "seq_id".
            wildtype_sequence: the full-length wildtype sequence.

        Returns:
            The same row with the three mutation columns set. The row is
            returned untouched when its sequence is longer than the wildtype.
        """
        sequence = wildtype_sequence[:chain_start]+row["protein_sequence"]
        n_wild, n_seq = len(wildtype_sequence), len(sequence)

        if n_wild < n_seq:
            # Insertions are not expected in this dataset: report and leave the row as is.
            print(f"len wt < len s: this should not happen ! see {row['seq_id']}")
            return row

        # Index of the first residue that differs over the common prefix, if any.
        first_diff = next(
            (idx for idx, (wild, seq) in enumerate(zip(wildtype_sequence, sequence))
             if wild != seq),
            None,
        )

        if n_wild == n_seq:
            if first_diff is None:
                # The row is the wildtype itself: no mutation,
                # so we put arbitrary values for now...
                position, wild_aa, mutated_aa = 100, "A", "A"
            else:
                # Single substitution.
                position = first_diff
                wild_aa = wildtype_sequence[first_diff]
                mutated_aa = sequence[first_diff]
        else:
            # Single deletion; no difference over the common prefix means
            # the last amino acid of the wildtype was deleted.
            position = n_seq if first_diff is None else first_diff
            wild_aa = wildtype_sequence[position]
            mutated_aa = "-"

        row["mutation_position"] = position
        row["wild_aa"] = wild_aa
        row["mutated_aa"] = mutated_aa
        return row

    df = df.apply(construct_mutation_code, axis=1, args=(wildtype_sequence,))


df

Unnamed: 0,seq_id,protein_sequence,pH,uniprot,PDB_wild,alphafold_path,dataset_source,sequence,chain_start,chain_end,mutated_chain,length,wild_aa,mutated_aa,mutation_position
0,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset/3D_structures/alphafold/AF...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243,L,E,38
1,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset/3D_structures/alphafold/AF...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243,L,K,38
2,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,AF70,AF70,./data/main_dataset/3D_structures/alphafold/AF...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243,L,-,38
3,31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset/3D_structures/alphafold/AF...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243,K,C,39
4,31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset/3D_structures/alphafold/AF...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243,K,F,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2408,33798,VPVNPEPDATSVENVILKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset/3D_structures/alphafold/AF...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243,A,I,37
2409,33799,VPVNPEPDATSVENVLLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset/3D_structures/alphafold/AF...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243,A,L,37
2410,33800,VPVNPEPDATSVENVNLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset/3D_structures/alphafold/AF...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243,A,N,37
2411,33801,VPVNPEPDATSVENVPLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset/3D_structures/alphafold/AF...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243,A,P,37


In [5]:
if UPDATE_PATHS:
    # we now add the path to each record of the dataframe

    df["alphafold_path"] = "./data/main_dataset_creation/3D_structures/alphafold/AF70.pdb"
    df["relaxed_wild_3D_path"] = "./compute_mutated_structures/relaxed_pdb/AF70_relaxed/AF70_relaxed.pdb"
    df["relaxed_mutated_3D_path"] = ""

    all_relaxed_paths = glob.glob(
        "./compute_mutated_structures/relaxed_pdb/AF70_relaxed/AF70_relaxed_*_relaxed.pdb")
    # Set copy for constant-time membership tests: the lookup runs once per row.
    relaxed_path_lookup = set(all_relaxed_paths)

    def find_mutation_3D_path(row):
        """Set relaxed_mutated_3D_path for one row.

        The expected file name is built from the wild residue, the 1-based
        mutation position (mutation_position + 1) and the mutated residue.
        The column is set to that path when the file was found by the glob
        above, and to NaN otherwise.
        """
        name = "AF70"
        w_aa, m_aa = row["wild_aa"], row["mutated_aa"]
        pos = int(row["mutation_position"])+1
        path = (f"./compute_mutated_structures/relaxed_pdb/{name}_relaxed/" +
                f"{name}_relaxed_{w_aa}{pos}{m_aa}_relaxed.pdb")

        row["relaxed_mutated_3D_path"] = path if path in relaxed_path_lookup else np.nan
        return row

    df = df.apply(find_mutation_3D_path, axis=1)

# Number of rows without a relaxed mutated structure. Evaluates to None (no
# output) instead of raising when the cell above was skipped and the column
# does not exist in the loaded frame.
df["relaxed_mutated_3D_path"].isna().sum() if "relaxed_mutated_3D_path" in df.columns else None


77

In [6]:
if UPDATE_PATHS:
    # we do not have predicted relaxed 3D structure for deletion
    # but we have the alphafold 3D structure of the deletion sequences directly
    # so we put the relaxed alphafold 3D structures predicted for deletion
    all_deletion_relaxed_paths = glob.glob(
        "./compute_mutated_structures/relaxed_pdb/AF70_alphafold/*_relaxed.pdb")
    # Set copy for constant-time membership tests: the lookup runs once per row.
    deletion_path_lookup = set(all_deletion_relaxed_paths)

    def find_deletion_mutation_path(row):
        """Set relaxed_mutated_3D_path for deletion rows (mutated_aa == "-").

        Non-deletion rows are returned unchanged. For a deletion, the expected
        file name uses the wild residue and its 1-based position within the
        chain; the column is set to that path when the file was found by the
        glob above, and to NaN otherwise.
        """
        if row["mutated_aa"] != "-":
            return row

        # mutation_position is a 0-based index into the full wildtype sequence:
        # +1 makes it 1-based, -chain_start makes it relative to the chain.
        pos = int(row["mutation_position"]) + 1 - chain_start
        path = ("./compute_mutated_structures/relaxed_pdb/AF70_alphafold/" +
                f"{row['wild_aa']}{pos}__unrelaxed_rank_1_model_3_relaxed.pdb")

        row["relaxed_mutated_3D_path"] = path if path in deletion_path_lookup else np.nan
        return row


    df = df.apply(find_deletion_mutation_path, axis=1)

# Preview of the deletion rows.
deletion = df[df.mutated_aa.eq('-')]
deletion.head()

Unnamed: 0,seq_id,protein_sequence,pH,uniprot,PDB_wild,alphafold_path,dataset_source,sequence,chain_start,chain_end,mutated_chain,length,wild_aa,mutated_aa,mutation_position,relaxed_wild_3D_path,relaxed_mutated_3D_path
2,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,AF70,AF70,./data/main_dataset_creation/3D_structures/alp...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243,L,-,38,./compute_mutated_structures/relaxed_pdb/AF70_...,./compute_mutated_structures/relaxed_pdb/AF70_...
13,31403,VPVNPEPDATSVENVALKGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,AF70,AF70,./data/main_dataset_creation/3D_structures/alp...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243,T,-,40,./compute_mutated_structures/relaxed_pdb/AF70_...,./compute_mutated_structures/relaxed_pdb/AF70_...
56,31446,VPVNPEPDATSVENVALKTGSGDQSDPIKADLEVKGQSALPFDVDC...,8,AF70,AF70,./data/main_dataset_creation/3D_structures/alp...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243,S,-,45,./compute_mutated_structures/relaxed_pdb/AF70_...,./compute_mutated_structures/relaxed_pdb/AF70_...
70,31460,VPVNPEPDATSVENVALKTGSGDSQDPIKADLEVKGQSALPFDVDC...,8,AF70,AF70,./data/main_dataset_creation/3D_structures/alp...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243,S,-,47,./compute_mutated_structures/relaxed_pdb/AF70_...,./compute_mutated_structures/relaxed_pdb/AF70_...
84,31474,VPVNPEPDATSVENVALKTGSGDSQSDIKADLEVKGQSALPFDVDC...,8,AF70,AF70,./data/main_dataset_creation/3D_structures/alp...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,A,243,P,-,49,./compute_mutated_structures/relaxed_pdb/AF70_...,./compute_mutated_structures/relaxed_pdb/AF70_...


In [7]:
# Total number of missing values over every column of df (0 means no NaN is left).
print(df.isna().sum().sum())


0


In [8]:
if CONVERT_MUTATION_TO_INT:
    # Encode the wild and mutated residues through the aa_char2int lookup
    # (a KeyError here means a residue code is missing from the table).
    df["wild_aa_int"] = df["wild_aa"].apply(lambda x: aa_char2int[x])
    df["mutated_aa_int"] = df["mutated_aa"].apply(lambda x: aa_char2int[x])
    # fillna returns a new Series: it must be assigned back, otherwise the
    # missing chain ids stay NaN and ord() below raises a TypeError.
    df["mutated_chain"] = df["mutated_chain"].fillna("A")
    # Chains recorded as "unsigned" are treated as chain "A".
    df["mutated_chain"] = df["mutated_chain"].apply(
        lambda x: "A" if x == "unsigned" else x)
    # Encode the chain id as the code point of its character.
    df["mutated_chain_int"] = df["mutated_chain"].apply(ord)


In [None]:
if ADD_ESM_FEATURES:
    # TODO: extract embeddings from all available proteins, not just the ones with ddg
    # Load the main dataset that sits next to the other outputs in OUTPUT_DIR.
    main_df = pd.read_csv(OUTPUT_DIR+"dataset_with_esm_features.csv")
    # NOTE(review): presumably computes a PCA context from main_df and df so both
    # share one projection; confirm in utils.esm_features.
    context = submission_compute_pca(main_df, df)
    # df is replaced by the frame returned by the helper.
    df = submission_add_esm_features(df, context)


## From this point on, the following steps should be run via the multiprocessing script

In [10]:
# add residue depth, sasa and c_alpha depth computed from alphafold pdb file => compute_sasa = True, compute_depth = True
# add residue dssp infos (rsa etc.) => compute_dssp = True
if ADD_STRUCTURE_INFOS:
    df = add_structure_infos(df, compute_sasa=True,
                             compute_depth=True, compute_dssp=True, compute_bfactor=True)
# Kept at top level: only the last top-level expression of a cell is rendered,
# so a preview nested inside the `if` body is never displayed.
df.head(2)

In [11]:
if ADD_PROTEIN_ANALYSIS:
    # df is replaced by the frame returned by the helper.
    # NOTE(review): the added columns are defined in utils.features_biopython; not visible here.
    df = add_protein_analysis(df)


In [12]:
if ADD_DEMASK_PREDICTIONS:
    # df is replaced by the frame returned by the helper.
    # NOTE(review): presumably adds the direct_/indirect_demask_* columns; confirm in utils.demask_features.
    df = add_demask_predictions(df)

In [13]:
if SAVE_NEW_CSV:
    # Persist the current frame without the index column (same path as INPUT_DATASET).
    df.to_csv(OUTPUT_DIR+"test_with_esm_features.csv", index=False)


## Data verification

In [10]:
if CHECK_DATA:
    # Reload the NaN-filled submission file and list every column that still
    # contains missing values, together with its NaN count.
    df = pd.read_csv(OUTPUT_DIR+"submission_all_features_filled_nan.csv")
    pprint(df.isna().sum().loc[lambda counts: counts > 0].to_dict())


{'direct_demask_entropy': 78,
 'direct_demask_log2f_var': 78,
 'direct_demask_matrix': 78,
 'direct_demask_score': 78,
 'esm_mutation_entropy': 89,
 'esm_mutation_probability': 89,
 'indirect_demask_entropy': 78,
 'indirect_demask_log2f_var': 78,
 'indirect_demask_matrix': 78,
 'indirect_demask_score': 78,
 'mutated_relaxed_NH->O_1_energy': 4,
 'mutated_relaxed_NH->O_1_relidx': 4,
 'mutated_relaxed_NH->O_2_energy': 4,
 'mutated_relaxed_NH->O_2_relidx': 4,
 'mutated_relaxed_O->NH_1_energy': 4,
 'mutated_relaxed_O->NH_1_relidx': 4,
 'mutated_relaxed_O->NH_2_energy': 4,
 'mutated_relaxed_O->NH_2_relidx': 4,
 'mutated_relaxed_Phi': 4,
 'mutated_relaxed_Psi': 4,
 'mutated_relaxed_Relative_ASA': 4,
 'mutated_relaxed_Secondary_structure': 4,
 'mutated_relaxed_bfactor': 4,
 'mutated_relaxed_c_alpha_depth': 4,
 'mutated_relaxed_residue_depth': 4,
 'mutated_relaxed_sasa': 4,
 'mutation_NH->O_1_energy': 4,
 'mutation_NH->O_1_relidx': 4,
 'mutation_NH->O_2_energy': 4,
 'mutation_NH->O_2_relidx': 4

In [9]:
if FILL_NAN:
    # Load the full feature table and fill every column that contains NaN
    # with that column's first quartile (0.25 quantile).
    df = pd.read_csv(OUTPUT_DIR+"submission_with_all_features.csv")

    columns_with_nan = {k: v for k,
                        v in df.isna().sum().to_dict().items() if v > 0}
    for col, n in columns_with_nan.items():
        quantile = df[col].quantile(0.25)
        # Assign the filled column back: calling fillna(inplace=True) on the
        # df[col] selection is chained assignment and does not reliably update
        # df (it never does under pandas Copy-on-Write).
        df[col] = df[col].fillna(quantile)
        print(f"filling the {n} nan values from {col} with {quantile}")

    df.to_csv(OUTPUT_DIR+"submission_all_features_filled_nan.csv", index=False)


filling the 89 nan values from esm_mutation_probability with 0.0009774446371011
filling the 89 nan values from esm_mutation_entropy with 0.4755664765834808
filling the 4 nan values from mutated_relaxed_Secondary_structure with 1.0
filling the 4 nan values from mutated_relaxed_Relative_ASA with 0.0774647887323943
filling the 4 nan values from mutated_relaxed_Phi with -96.3
filling the 4 nan values from mutated_relaxed_Psi with -37.1
filling the 4 nan values from mutated_relaxed_NH->O_1_relidx with -4.0
filling the 4 nan values from mutated_relaxed_NH->O_1_energy with -2.2
filling the 4 nan values from mutated_relaxed_O->NH_1_relidx with -1.0
filling the 4 nan values from mutated_relaxed_O->NH_1_energy with -2.3
filling the 4 nan values from mutated_relaxed_NH->O_2_relidx with -3.0
filling the 4 nan values from mutated_relaxed_NH->O_2_energy with -0.3
filling the 4 nan values from mutated_relaxed_O->NH_2_relidx with -2.0
filling the 4 nan values from mutated_relaxed_O->NH_2_energy with -