In [2]:
import pandas as pd
import tqdm
import os
import numpy as np
from utils.file_utils import write_json

# Directory receiving the merged dataset CSV and its JSON summary.
OUTPUT_DIR = "./data/main_dataset_creation/outputs/merged_source_id/"

# Switches. MERGE_SLAYKOVSKIY is checked by the guard cell in front of the
# Slaykovskiy section; RECOMPUTE_NEWLY_FOUND is not referenced in the cells visible here.
MERGE_SLAYKOVSKIY = False
RECOMPUTE_NEWLY_FOUND = True

# Input tables: one with ddG measurements, one with dTm measurements.
DF_DDG_INPUT = "./data/main_dataset_creation/outputs/all_ddG_source_id/dataset_only_infos.csv"
DF_DTM_INPUT = "./data/main_dataset_creation/outputs/all_dTm_source_id/dataset_only_infos.csv"

# Columns that together identify one mutation record when removing duplicates.
SUBSET_DUPLICATES = [
    "wild_aa",
    "mutation_position",
    "mutated_aa",
    "pH",
    "sequence",
]


In [2]:
df_ddG = pd.read_csv(DF_DDG_INPUT)
df_dTm = pd.read_csv(DF_DTM_INPUT)

# Number of ddG rows without a dTm value before the transfer.
print(df_ddG.dTm.isna().sum())


# Copy dTm and Tm from every dTm record onto the ddG rows describing the same
# mutation (same values in all SUBSET_DUPLICATES columns).
for _, row_dtm in tqdm.tqdm(df_dTm.iterrows(), total=len(df_dTm)):
    # Build the row mask once and reuse it for both columns.
    same_mutation = pd.Series(True, index=df_ddG.index, dtype=bool)
    for key in SUBSET_DUPLICATES:
        key_value = row_dtm[key]
        if pd.isna(key_value):
            # `==` is never true for NaN, but drop_duplicates (used on the
            # concatenated frame afterwards) treats missing keys as equal and
            # would keep the ddG row while discarding this dTm row. Match
            # missing-with-missing here so its dTm/Tm are not lost.
            same_mutation &= df_ddG[key].isna()
        else:
            same_mutation &= df_ddG[key] == key_value
    # add dTm
    df_ddG.loc[same_mutation, ["dTm"]] = row_dtm["dTm"]
    # add Tm
    df_ddG.loc[same_mutation, ["Tm"]] = row_dtm["Tm"]


# Number of ddG rows still without a dTm value after the transfer.
print(df_ddG.dTm.isna().sum())


8198


1785it [00:07, 244.12it/s]

In [None]:
# Stack the ddG rows on top of the dTm rows, then keep only the first row for
# each combination of the SUBSET_DUPLICATES columns.
main_df = pd.concat((df_ddG, df_dTm), ignore_index=True).drop_duplicates(
    subset=SUBSET_DUPLICATES)

# Presence masks reused by the counters below.
has_ddG = main_df.ddG.notna()
has_dTm = main_df.dTm.notna()
has_Tm = main_df.Tm.notna()
rows_without_pH = main_df[main_df.pH.isna()]

# Summary statistics written next to the CSV.
dataset_infos = {
    "total_len": len(main_df),
    "dataset_source_repartition": main_df.dataset_source.value_counts().to_dict(),
    "unique_uniprot": len(main_df.uniprot.unique()),
    "both_ddG_dTm": int((has_ddG & has_dTm).sum()),
    "ddG": int(has_ddG.sum()),
    "dTm": int(has_dTm.sum()),
    "Tm": int(has_Tm.sum()),
    "nan_repartition": main_df.isna().sum().to_dict(),
    "no_pH_repartition": rows_without_pH.dataset_source.value_counts().to_dict(),
}

os.makedirs(OUTPUT_DIR, exist_ok=True)
main_df.to_csv(OUTPUT_DIR+"dataset_only_infos.csv", index=False)
write_json(OUTPUT_DIR+"dataset_infos.json", dataset_infos)


### Slaykovskiy_14k_dataset

In [4]:
# Stop here unless the Slaykovskiy merge was requested in the configuration.
# An explicit raise (instead of `assert False`) carries a message and still
# fires when Python runs with assertions disabled (-O); the exception type is
# unchanged.
if not MERGE_SLAYKOVSKIY:
    raise AssertionError(
        "MERGE_SLAYKOVSKIY is False: the Slaykovskiy_14k_dataset cells are skipped on purpose")

AssertionError: 

In [3]:
# Slaykovskiy 14k dataset; later cells read its source_id, PDB_chain and
# pdb_position columns.
df = pd.read_csv(
    filepath_or_buffer="./data/Slaykovskiy_14k_dataset/14k_dataset_source_id.csv")
# Merged dataset that does not carry source ids yet.
main_df = pd.read_csv(
    filepath_or_buffer="./data/main_dataset_creation/outputs/merged/dataset_all_no_source_id.csv")
# Merged dataset with source ids, used as the lookup table for source_id.
source_id_df = pd.read_csv(
    filepath_or_buffer="./data/main_dataset_creation/outputs/merged_source_id/dataset_only_infos.csv")


In [4]:
# 1st we add source id to main_df
def add_source_id(row, source_id_df):
    """Return the source id of the unique ``source_id_df`` record matching ``row``.

    Two records match when they agree on wild_aa, mutation_position,
    mutated_aa, pH and sequence. A missing value (NaN) in one of these fields
    only matches a missing value in the same field; a plain ``==`` would never
    match such rows, so records without e.g. a pH could never get a source id.

    Args:
        row: record to look up; must expose the five key fields and ``uniprot``
            (``uniprot`` is only used in the printed messages).
        source_id_df: DataFrame with the five key columns and ``source_id``.

    Returns:
        The ``source_id`` of the single matching record, or ``""`` when no
        record or more than one record matches (a message is printed then).
    """
    key_columns = ("wild_aa", "mutation_position", "mutated_aa", "pH", "sequence")

    is_match = pd.Series(True, index=source_id_df.index, dtype=bool)
    for column in key_columns:
        wanted = row[column]
        if pd.isna(wanted):
            # NaN != NaN, so missing keys have to be compared explicitly.
            is_match &= source_id_df[column].isna()
        else:
            is_match &= source_id_df[column] == wanted

    candidates = source_id_df.loc[is_match, "source_id"]
    if len(candidates) == 1:
        return candidates.iloc[0]

    if len(candidates) == 0:
        print(f"no data was found in source_id_df that corresponds to {row.uniprot}")
    else:
        print(
            f"too much data was found in source_id_df that corresponds to {row.uniprot}")
    return ""


# Row-wise lookup of each main_df record in source_id_df.
main_df["source_id"] = main_df.apply(add_source_id, axis=1, args=(source_id_df,))


no data was found in source_id_df that corresponds to C3YEM4


In [5]:
# 2nd we remove the data rows for which we do not have any source id
print(len(main_df))
# .copy() gives main_df its own data: the columns assigned to it afterwards
# (PDB_chain, pdb_position, ...) would otherwise be written onto a slice of the
# unfiltered frame and raise SettingWithCopyWarning.
main_df = main_df[main_df.source_id.ne("")].copy()
print(len(main_df))


9980
9979


In [6]:
# 3rd we add PDB_chain and pdb_position to main_df, based on Slaykovskiy_14k_dataset

# Failure records gathered while looking rows up, one list per failure kind.
errors = {kind: [] for kind in ("no_data_found", "too_much_data_found")}

# Defaults kept by the rows for which the lookup does not succeed.
main_df["PDB_chain"] = ""
main_df["pdb_position"] = np.nan


def add_PDB_infos(row, df, errors):
    """Fill ``PDB_chain`` and ``pdb_position`` of ``row`` from its ``df`` entry.

    An entry of ``df`` corresponds to ``row`` when its ``source_id`` text
    contains the row's source id wrapped in single quotes (``'<id>'``).
    NOTE(review): this presumably means the column stores the string form of a
    list of ids -- confirm against the CSV.

    Args:
        row: record exposing ``source_id`` and ``uniprot``.
        df: DataFrame with ``source_id``, ``PDB_chain`` and ``pdb_position``.
        errors: dict of lists collecting failures; ``{"source_id", "uniprot"}``
            records are appended under ``"no_data_found"`` or
            ``"too_much_data_found"``.

    Returns:
        ``row`` unchanged when zero or several entries correspond to it,
        otherwise a copy of ``row`` with both PDB fields taken from the single
        corresponding entry.
    """
    quoted_id = f"'{row.source_id}'"
    # regex=False: the id is literal text, so characters such as "." or "+"
    # must not act as regex operators. na=False: entries without a source_id
    # count as non-matching instead of producing an NA mask that cannot be
    # used for indexing.
    subdf = df.loc[df.source_id.str.contains(quoted_id, regex=False, na=False)]

    if len(subdf) == 0:
        errors["no_data_found"].append(
            {"source_id": row.source_id, "uniprot": row.uniprot})
    elif len(subdf) != 1:
        print(
            f"too much data was found in Slaykovskiy_14k_dataset that corresponds to {row.uniprot}")
        # Also record the ambiguous lookup so it can be counted afterwards,
        # like the missing ones.
        errors.setdefault("too_much_data_found", []).append(
            {"source_id": row.source_id, "uniprot": row.uniprot})
    else:
        # Work on a copy: mutating the object handed over by DataFrame.apply
        # is not supported by pandas.
        row = row.copy()
        row["PDB_chain"] = subdf["PDB_chain"].iloc[0]
        row["pdb_position"] = subdf["pdb_position"].iloc[0]

    return row

# Row-wise lookup in the Slaykovskiy table; failures are collected in `errors`.
main_df = main_df.apply(add_PDB_infos, axis=1, args=(df, errors))

In [7]:
# 4th we add the pdb_chain_voxel_path

def add_voxel_path(row, voxel_dir="./compute_mutated_structures/splitted_pdb_chain_voxel_features"):
    """Return the path of the voxel-feature file of ``row`` if it exists on disk.

    The file name is ``{PDB_chain}_{wild_aa}{pdb_position}{mutated_aa}.npy``,
    with ``pdb_position`` converted to an int, looked up inside ``voxel_dir``.

    Args:
        row: record exposing ``pdb_position``, ``PDB_chain``, ``wild_aa``,
            ``mutated_aa`` and ``source_id`` (the latter is only used in the
            printed messages).
        voxel_dir: directory holding the ``.npy`` files; defaults to the
            location used by this notebook.

    Returns:
        The path (``voxel_dir`` and file name joined with ``/``) when the file
        exists, otherwise ``""`` after printing an error line.
    """
    try:
        pos = int(row.pdb_position)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures (None, NaN, non-numeric text, infinity) are
        # handled; a bare `except:` would also swallow KeyboardInterrupt and
        # hide unrelated errors.
        print(
            f"error for {row.source_id}: cannot make an int out of pdb_position")
        return ""

    file_name = f"{row.PDB_chain}_{row.wild_aa}{pos}{row.mutated_aa}.npy"
    local_pdb_chain_voxel_path = '/'.join([voxel_dir, file_name])
    if os.path.exists(local_pdb_chain_voxel_path):
        return local_pdb_chain_voxel_path

    print(f"error for {row.source_id}: {local_pdb_chain_voxel_path}")
    return ""


# One voxel-file path (or "") per row, stored as a new column.
voxel_paths = main_df.apply(add_voxel_path, axis=1)
main_df["pdb_chain_voxel_path"] = voxel_paths


error for fireprotdb_569: ./compute_mutated_structures/splitted_pdb_chain_voxel_features/AF-P06654-F1A_M1K.npy
error for fireprotdb_606: ./compute_mutated_structures/splitted_pdb_chain_voxel_features/AF-P06654-F1A_K230G.npy
error for fireprotdb_630: ./compute_mutated_structures/splitted_pdb_chain_voxel_features/AF-P06654-F1A_L233K.npy
error for fireprotdb_653: ./compute_mutated_structures/splitted_pdb_chain_voxel_features/AF-P06654-F1A_T237D.npy
error for fireprotdb_675: ./compute_mutated_structures/splitted_pdb_chain_voxel_features/AF-P06654-F1A_K239A.npy
error for fireprotdb_709: ./compute_mutated_structures/splitted_pdb_chain_voxel_features/AF-P06654-F1A_T243I.npy
error for fireprotdb_747: ./compute_mutated_structures/splitted_pdb_chain_voxel_features/AF-P06654-F1A_V247Q.npy
error for fireprotdb_782: ./compute_mutated_structures/splitted_pdb_chain_voxel_features/AF-P06654-F1A_V255I.npy
error for fireprotdb_834: ./compute_mutated_structures/splitted_pdb_chain_voxel_features/AF-P06654

In [8]:
# Printed in order: number of rows, failed Slaykovskiy lookups, rows without
# pdb_position, rows without PDB_chain, rows without a voxel file path.
summary_counts = (
    len(main_df),
    len(errors["no_data_found"]),
    main_df["pdb_position"].isna().sum(),
    main_df["PDB_chain"].eq("").sum(),
    main_df["pdb_chain_voxel_path"].eq("").sum(),
)
for count in summary_counts:
    print(count)


9979
1388
1388
1388
2716


In [23]:
# Write the enriched dataset, without the index column.
main_df.to_csv(
    path_or_buf="./data/main_dataset_creation/outputs/merged/dataset_source_id_6_12__14_07.csv",
    index=False,
)
