In [6]:
import pandas as pd
import numpy as np
import glob
from pprint import pprint

from utils.features_biopython import add_structure_infos, add_protein_analysis
from utils.demask_features import add_demask_predictions
from utils.esm_features import submission_compute_pca, submission_add_esm_features
from utils.file_utils import open_json
from utils.infos_translation import aa_char2int


# --- Pipeline switches -------------------------------------------------------
# Each flag gates one cell below; listed here in the order the cells appear.
START_FRESH = False              # rebuild df from the raw Kaggle test.csv instead of INPUT_DATASET
COMPUTE_MUTATION_CODE = False    # derive wild_aa / mutated_aa / mutation_position
UPDATE_PATHS = False             # (re)attach the 3D structure paths
CONVERT_MUTATION_TO_INT = False  # integer-encode amino acids and chain id
ADD_ESM_FEATURES = True
ADD_STRUCTURE_INFOS = False
ADD_PROTEIN_ANALYSIS = False
ADD_DEMASK_PREDICTIONS = False
DROP_COLUMNS = False             # NOTE(review): not referenced by any visible cell
SAVE_NEW_CSV = True
CHECK_DATA = True
FILL_NAN = True

# --- Files ---------------------------------------------------------------------
INPUT_DATASET = "./data/main_dataset_creation/outputs/all_v3/submission_with_voxel_from_kaggle_filled_nan.csv"
OUTPUT_DIR = "./data/main_dataset_creation/outputs/merged/"
OUTPUT_NAME = "submission_dataset_5_12.csv"
TRAIN_DATASET_WITH_ESM = "dataset_with_esm_rosetta.csv"
 

In [2]:
# seq_id 32559 is the wildtype sequence (just the chain though)
sequence_only_chain = "VPVNPEPDATSVENVALKTGSGDSQSDPIKADLEVKGQSALPFDVDCWAILCKGAPNVLQRVNEKTKNSNRDRSGANKGPFKDPQKWGIKALPPKNPSWSAQDFKSPEEYAFASSLQGGTNAILAPVNLASQNSQGGVLNGFYSANKVAQFDPSKPQQTKGTWFQITKFTGAAGPYCKALGSNDKSVCDKNKNIAGDWGFDPAKWAYQYDEKNNKFNYVGK"
# Full-length sequence: the chain above preceded by a 22-residue prefix.
wildtype_sequence = "MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDSQSDPIKADLEVKGQSALPFDVDCWAILCKGAPNVLQRVNEKTKNSNRDRSGANKGPFKDPQKWGIKALPPKNPSWSAQDFKSPEEYAFASSLQGGTNAILAPVNLASQNSQGGVLNGFYSANKVAQFDPSKPQQTKGTWFQITKFTGAAGPYCKALGSNDKSVCDKNKNIAGDWGFDPAKWAYQYDEKNNKFNYVGK"
# 0-based slice bounds of the chain inside the full-length sequence.
chain_start, chain_end = 22, 243
# Sanity check: slicing the full sequence must give back the chain exactly.
extracted_chain = wildtype_sequence[chain_start:chain_end]
print(extracted_chain == sequence_only_chain)


True


In [3]:
# Either reload the previously enriched CSV, or rebuild the frame from the raw
# Kaggle test set and attach the constant per-row metadata of the single
# wildtype protein (AF70).
if not START_FRESH:
    df = pd.read_csv(INPUT_DATASET)
else:
    df = (
        pd.read_csv("./data/Kaggle/test.csv")
        .assign(
            uniprot="AF70",
            PDB_wild="AF70",
            alphafold_path="./data/main_dataset/3D_structures/alphafold/AF70.pdb",
            dataset_source="Novozymes",
            sequence=wildtype_sequence,
            chain_start=22,
            chain_end=243,
            mutated_chain="A",
            length=len(wildtype_sequence),
        )
        # "dataset_source" (set above) is kept; the raw "data_source" column is dropped.
        .drop(columns=["data_source"])
    )
df.head(2)

Unnamed: 0,seq_id,protein_sequence,pH,uniprot,PDB_wild,alphafold_path,dataset_source,sequence,chain_start,chain_end,...,mutation_p_aa_pp.1,mutation_pro_close.1,mutation_rama_prepro.1,mutation_ref.1,mutation_total_score.1,mutation_yhh_planarity.1,direct_voxel_path,reversed_voxel_path,kaggle_voxel_path,operation
0,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset_creation/3D_structures/alp...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,...,-5.080053,-0.340337,-8.156276,-4.386,-42.312223,-0.003405,../compute_mutated_structures/splitted_voxel_f...,./splitted_voxel_features/AF70_L39E_reversed,../compute_mutated_structures/splitted_voxel_f...,mutation
1,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset_creation/3D_structures/alp...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,...,-5.090853,-0.323684,-8.357216,-2.37605,-42.290404,-0.001081,../compute_mutated_structures/splitted_voxel_f...,./splitted_voxel_features/AF70_L39K_reversed,../compute_mutated_structures/splitted_voxel_f...,mutation


In [4]:
if COMPUTE_MUTATION_CODE:
    # Start every row with placeholder values; they are overwritten per row below.
    df = df.assign(wild_aa="", mutated_aa="", mutation_position=0)

    def construct_mutation_code(row, wildtype_sequence):
        """Fill in mutation_position / wild_aa / mutated_aa for one row.

        The row's "protein_sequence" (chain only) is prefixed with the first
        `chain_start` residues of `wildtype_sequence` and then compared with it.
        This relies on every row being a single substitution or a deletion.

        - same length: the first differing index is recorded as a substitution;
          if nothing differs (the wildtype itself) arbitrary placeholder values
          (100, "A", "A") are stored.
        - shorter: the first differing index is recorded as a deletion
          (mutated_aa == "-"); if nothing differs, the last residue was deleted.
        - longer: unexpected, a message is printed and the row is left as is.

        Positions are 0-based indices into `wildtype_sequence`.
        Returns the (modified) row.
        """
        sequence = wildtype_sequence[:chain_start] + row["protein_sequence"]
        wt_len, seq_len = len(wildtype_sequence), len(sequence)

        if wt_len < seq_len:
            print(f"len wt < len s: this should not happen ! see {row['seq_id']}")
            return row

        # zip stops at the shorter sequence, which is exactly the range the
        # comparison needs in both remaining cases.
        first_diff = next(
            (idx for idx, (wt_aa, seq_aa) in enumerate(zip(wildtype_sequence, sequence))
             if wt_aa != seq_aa),
            None,
        )

        if wt_len == seq_len:
            if first_diff is None:
                # special case of wildtype_sequence == sequence:
                # we put arbitrary values for now...
                position, wild, mutated = 100, "A", "A"
            else:
                position = first_diff
                wild, mutated = wildtype_sequence[first_diff], sequence[first_diff]
        else:
            # deletion; no mismatch found means the last aa was deleted
            position = seq_len if first_diff is None else first_diff
            wild, mutated = wildtype_sequence[position], "-"

        row["mutation_position"] = position
        row["wild_aa"] = wild
        row["mutated_aa"] = mutated
        return row

    df = df.apply(construct_mutation_code, axis=1, args=(wildtype_sequence,))


df

Unnamed: 0,seq_id,protein_sequence,pH,uniprot,PDB_wild,alphafold_path,dataset_source,sequence,chain_start,chain_end,...,mutation_p_aa_pp.1,mutation_pro_close.1,mutation_rama_prepro.1,mutation_ref.1,mutation_total_score.1,mutation_yhh_planarity.1,direct_voxel_path,reversed_voxel_path,kaggle_voxel_path,operation
0,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset_creation/3D_structures/alp...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,...,-5.080053,-0.340337,-8.156276,-4.38600,-42.312223,-0.003405,../compute_mutated_structures/splitted_voxel_f...,./splitted_voxel_features/AF70_L39E_reversed,../compute_mutated_structures/splitted_voxel_f...,mutation
1,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset_creation/3D_structures/alp...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,...,-5.090853,-0.323684,-8.357216,-2.37605,-42.290404,-0.001081,../compute_mutated_structures/splitted_voxel_f...,./splitted_voxel_features/AF70_L39K_reversed,../compute_mutated_structures/splitted_voxel_f...,mutation
2,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,AF70,AF70,./data/main_dataset_creation/3D_structures/alp...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,...,-8.018257,1.611177,-38.489130,-23.13191,-81.246759,0.045207,../compute_mutated_structures/splitted_voxel_f...,./splitted_voxel_features/deletion_L17__reversed,../compute_mutated_structures/splitted_voxel_f...,deletion
3,31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset_creation/3D_structures/alp...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,...,-4.163646,-0.303249,-7.019241,3.96937,-35.632674,-0.005618,../compute_mutated_structures/splitted_voxel_f...,./splitted_voxel_features/AF70_K40C_reversed,../compute_mutated_structures/splitted_voxel_f...,mutation
4,31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset_creation/3D_structures/alp...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,...,-4.359797,-0.325178,-7.248838,1.93287,-36.957136,-0.007015,../compute_mutated_structures/splitted_voxel_f...,./splitted_voxel_features/AF70_K40F_reversed,../compute_mutated_structures/splitted_voxel_f...,mutation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2408,33798,VPVNPEPDATSVENVILKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset_creation/3D_structures/alp...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,...,-3.663072,-0.289461,-7.236210,0.97906,-36.554850,-0.003494,../compute_mutated_structures/splitted_voxel_f...,./splitted_voxel_features/AF70_A38I_reversed,../compute_mutated_structures/splitted_voxel_f...,mutation
2409,33799,VPVNPEPDATSVENVLLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset_creation/3D_structures/alp...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,...,-4.564676,-0.288388,-7.335446,0.33679,-39.272182,-0.002911,../compute_mutated_structures/splitted_voxel_f...,./splitted_voxel_features/AF70_A38L_reversed,../compute_mutated_structures/splitted_voxel_f...,mutation
2410,33800,VPVNPEPDATSVENVNLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset_creation/3D_structures/alp...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,...,-4.775504,-0.300240,-7.563481,-2.66494,-39.545189,-0.004809,../compute_mutated_structures/splitted_voxel_f...,./splitted_voxel_features/AF70_A38N_reversed,../compute_mutated_structures/splitted_voxel_f...,mutation
2411,33801,VPVNPEPDATSVENVPLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,AF70,AF70,./data/main_dataset_creation/3D_structures/alp...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,...,-5.586132,0.743034,-5.645281,-2.96789,-39.234884,-0.011716,../compute_mutated_structures/splitted_voxel_f...,./splitted_voxel_features/AF70_A38P_reversed,../compute_mutated_structures/splitted_voxel_f...,mutation


In [5]:
if UPDATE_PATHS:
    # we now add the path to each record of the dataframe

    df["alphafold_path"] = "./data/main_dataset_creation/3D_structures/alphafold/AF70.pdb"
    df["relaxed_wild_3D_path"] = "./compute_mutated_structures/relaxed_pdb/AF70_relaxed/AF70_relaxed.pdb"
    df["relaxed_mutated_3D_path"] = ""

    all_relaxed_paths = glob.glob(
        "./compute_mutated_structures/relaxed_pdb/AF70_relaxed/AF70_relaxed_*_relaxed.pdb")
    # The lookup below runs once per row: use a set so each membership test is
    # O(1) instead of a linear scan over every file found on disk.
    relaxed_path_lookup = set(all_relaxed_paths)

    def find_mutation_3D_path(row):
        """Set row["relaxed_mutated_3D_path"] to the relaxed PDB of the row's mutation.

        The expected file name is built from wild_aa, the 1-based mutation
        position (mutation_position + 1) and mutated_aa. If no such file was
        found by the glob above, the value is set to NaN.
        Returns the (modified) row.
        """
        name = "AF70"
        w_aa, m_aa = row["wild_aa"], row["mutated_aa"]
        pos = int(row["mutation_position"])+1
        path = (f"./compute_mutated_structures/relaxed_pdb/{name}_relaxed/" +
                f"{name}_relaxed_{w_aa}{pos}{m_aa}_relaxed.pdb")

        row["relaxed_mutated_3D_path"] = path if path in relaxed_path_lookup else np.nan
        return row

    df = df.apply(find_mutation_3D_path, axis=1)

# Number of rows for which no relaxed mutated structure path is recorded.
df.relaxed_mutated_3D_path.isna().sum()


0

In [6]:
if UPDATE_PATHS:
    # we do not have predicted relaxed 3D structure for deletion
    # but we have the alphafold 3D structure of the deletion sequences directly
    # so we put the relaxed alphafold 3D structures predicted for deletion
    all_deletion_relaxed_paths = glob.glob(
        "./compute_mutated_structures/relaxed_pdb/AF70_alphafold/*_relaxed.pdb")
    # Set for O(1) membership tests (the lookup runs once per deletion row).
    deletion_path_lookup = set(all_deletion_relaxed_paths)

    def find_deletion_mutation_path(row):
        """Set row["relaxed_mutated_3D_path"] for deletion rows (mutated_aa == "-").

        Rows that are not deletions are returned untouched. For deletions the
        file name uses the 1-based position relative to the chain, i.e. the
        full-sequence position minus `chain_start`. If the file was not found
        by the glob above, the value is set to NaN.
        Returns the (possibly modified) row.
        """
        if row["mutated_aa"] != "-":
            return row

        w_aa = row["wild_aa"]
        # 0-based full-sequence index -> 1-based index within the chain
        pos = int(row["mutation_position"]) + 1 - chain_start
        path = ("./compute_mutated_structures/relaxed_pdb/AF70_alphafold/" +
                f"{w_aa}{pos}__unrelaxed_rank_1_model_3_relaxed.pdb")

        row["relaxed_mutated_3D_path"] = path if path in deletion_path_lookup else np.nan
        return row


    df = df.apply(find_deletion_mutation_path, axis=1)

deletion = df[df.mutated_aa.eq('-')]
deletion.head()

Unnamed: 0,seq_id,protein_sequence,pH,uniprot,PDB_wild,alphafold_path,dataset_source,sequence,chain_start,chain_end,...,mutation_p_aa_pp.1,mutation_pro_close.1,mutation_rama_prepro.1,mutation_ref.1,mutation_total_score.1,mutation_yhh_planarity.1,direct_voxel_path,reversed_voxel_path,kaggle_voxel_path,operation
2,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,AF70,AF70,./data/main_dataset_creation/3D_structures/alp...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,...,-8.018257,1.611177,-38.48913,-23.13191,-81.246759,0.045207,../compute_mutated_structures/splitted_voxel_f...,./splitted_voxel_features/deletion_L17__reversed,../compute_mutated_structures/splitted_voxel_f...,deletion
13,31403,VPVNPEPDATSVENVALKGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,AF70,AF70,./data/main_dataset_creation/3D_structures/alp...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,...,-9.546029,-0.87075,-37.1671,-22.62219,-69.549287,-0.078684,../compute_mutated_structures/splitted_voxel_f...,./splitted_voxel_features/deletion_T19__reversed,../compute_mutated_structures/splitted_voxel_f...,deletion
56,31446,VPVNPEPDATSVENVALKTGSGDQSDPIKADLEVKGQSALPFDVDC...,8,AF70,AF70,./data/main_dataset_creation/3D_structures/alp...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,...,-6.417109,-0.411457,-33.178529,-21.18075,-67.332818,-0.034803,../compute_mutated_structures/splitted_voxel_f...,./splitted_voxel_features/deletion_S24__reversed,../compute_mutated_structures/splitted_voxel_f...,deletion
70,31460,VPVNPEPDATSVENVALKTGSGDSQDPIKADLEVKGQSALPFDVDC...,8,AF70,AF70,./data/main_dataset_creation/3D_structures/alp...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,...,-2.980167,15.329733,-31.710145,-21.18075,36.382038,-0.013991,../compute_mutated_structures/splitted_voxel_f...,./splitted_voxel_features/deletion_S26__reversed,../compute_mutated_structures/splitted_voxel_f...,deletion
84,31474,VPVNPEPDATSVENVALKTGSGDSQSDIKADLEVKGQSALPFDVDC...,8,AF70,AF70,./data/main_dataset_creation/3D_structures/alp...,Novozymes,MQLTKSLLVFALYMFGTQHVLAVPVNPEPDATSVENVALKTGSGDS...,22,243,...,4.812052,2.634262,-14.980988,-19.82723,26.97018,0.091833,../compute_mutated_structures/splitted_voxel_f...,./splitted_voxel_features/deletion_P28__reversed,../compute_mutated_structures/splitted_voxel_f...,deletion


In [7]:
# Total number of missing (NaN) cells over all rows and columns of df.
print(df.isna().sum().sum())


4


In [8]:
if CONVERT_MUTATION_TO_INT:
    # Integer-encode the wild and mutated amino acids with the aa_char2int table.
    df["wild_aa_int"] = df["wild_aa"].apply(lambda x: aa_char2int[x])
    df["mutated_aa_int"] = df["mutated_aa"].apply(lambda x: aa_char2int[x])
    # fillna returns a new Series, so the result must be assigned back;
    # otherwise NaN chains survive and ord() below raises a TypeError.
    df["mutated_chain"] = df["mutated_chain"].fillna("A")
    # Rows whose chain is the placeholder "unsigned" fall back to chain "A".
    df["mutated_chain"] = df["mutated_chain"].apply(
        lambda x: "A" if x == "unsigned" else x)
    # ord() expects a single-character chain id.
    df["mutated_chain_int"] = df["mutated_chain"].apply(lambda x: ord(x))


In [9]:
# Optional ESM step: submission_compute_pca is given the training dataset and
# the submission frame, and its result is handed to submission_add_esm_features.
if ADD_ESM_FEATURES:
    train_csv_path = OUTPUT_DIR + TRAIN_DATASET_WITH_ESM
    main_df = pd.read_csv(train_csv_path)
    context = submission_compute_pca(main_df, df, only_ddg=False)
    df = submission_add_esm_features(df, context)


loaded model
cuda
Extracting embeddings from proteins...


432it [01:04,  6.68it/s]


cuda
Extracting embeddings from proteins...


1it [00:00,  8.13it/s]


cuda


## From this point on, the following steps should be run via the multiprocessing script

In [10]:
# add residue depth, sasa and c_alpha depth computed from alphafold pdb file => compute_sasa = True, compute_depth = True
# add residue dssp infos (rsa etc.) => compute_dssp = True
if ADD_STRUCTURE_INFOS:
    df = add_structure_infos(df, compute_sasa=True,
                             compute_depth=True, compute_dssp=True, compute_bfactor=True)
# Kept at cell top level: a bare expression nested inside the `if` is never
# rendered by the notebook, so the preview was silently lost.
df.head(2)

In [11]:
# Optional step gated by ADD_PROTEIN_ANALYSIS: pass df through
# add_protein_analysis (imported from utils.features_biopython) and keep the result.
if ADD_PROTEIN_ANALYSIS:
    df = add_protein_analysis(df)


In [12]:
# Optional step gated by ADD_DEMASK_PREDICTIONS: pass df through
# add_demask_predictions (imported from utils.demask_features) and keep the result.
if ADD_DEMASK_PREDICTIONS:
    df = add_demask_predictions(df)

In [13]:
# Persist the current frame (without the index) as OUTPUT_DIR + OUTPUT_NAME.
if SAVE_NEW_CSV:
    output_csv_path = OUTPUT_DIR + OUTPUT_NAME
    df.to_csv(output_csv_path, index=False)


## Data verification

In [10]:
# Reload the NaN-filled CSV and list the columns that still contain NaN.
# NOTE(review): "submission_6_12.csv" is written by the FILL_NAN cell, which
# sits below this one; on a fresh run that cell has to be executed first.
if CHECK_DATA:
    df = pd.read_csv(OUTPUT_DIR+"submission_6_12.csv")
    nan_per_column = df.isna().sum()
    pprint(nan_per_column[nan_per_column > 0].to_dict())


{'reversed_voxel_path': 4}


In [9]:
# Fill the remaining NaN of every non-path column with that column's first
# quartile, then save the result under a new file name.
if FILL_NAN:
    # Read back the file written by the SAVE_NEW_CSV cell (same constants,
    # so the two cannot drift apart).
    df = pd.read_csv(OUTPUT_DIR+OUTPUT_NAME)

    columns_with_nan = {k: v for k,
                        v in df.isna().sum().to_dict().items() if v > 0}
    for col, n in columns_with_nan.items():
        # Path columns hold strings; a missing path is left as NaN.
        if "path" in col:
            continue
        quantile = df[col].quantile(0.25)
        # Assign the result back instead of df[col].fillna(..., inplace=True):
        # the chained in-place form acts on a temporary and is a no-op under
        # pandas Copy-on-Write.
        df[col] = df[col].fillna(quantile)
        print(f"filling the {n} nan values from {col} with {quantile}")

    df.to_csv(OUTPUT_DIR+"submission_6_12.csv", index=False)


filling the 89 nan values from esm_mutation_probability.1 with 0.00097744430240705
filling the 89 nan values from esm_mutation_entropy.1 with 0.4755664765834808
