# Dataset creation

![flowchart](./resources/dataset_creation_flowchart.drawio.png)


In [1]:
import os
import pandas as pd
import numpy as np

from utils.file_utils import open_json, write_json
from utils.dataset_creation import *
from utils.dataset_mapping import *


In [2]:
FRESH_START = True
UPDATE_MAPPING = True


In [3]:
COLUMNS = ["pdbs", "uniprot", "wild_aa", "mutation_position",
           "mutated_aa", "pH",
           "sequence", "length", "chain_start", "chain_end",
           "AlphaFoldDB", "Tm", "ddG", "dTm",
           "dataset_source", "infos_found"]

SUBSET_DUPLICATES = ["uniprot", "wild_aa", "mutation_position",
                     "mutated_aa", "pH", "sequence"]

NAME = "all"
DIR = "./data/main_dataset_creation"
OUTPUT_DIR = DIR+'/outputs/'+NAME

LOCAL_UNIPROT_INFOS_PATH = DIR+"/uniprot_infos.json"
PDB_UNIPROT_MAPPING_PATH = DIR+"/mapping/pdb_uniprot_mapping.json"
LINKED_UNIPROT_MAPPING_PATH = DIR+"/mapping/linked_uniprot_mapping.json"
SEQUENCE_UNIPROT_MAPPING_PATH = DIR + \
    "/mapping/sequence_uniprot_mapping.json"
PDB_NO_UNIPROT_PATH = DIR+"/mapping/pdb_no_uniprot.json"
SEQUENCE_NO_UNIPROT_PATH = DIR+"/mapping/sequence_no_uniprot.json"

DATASET_OUTPUT_PATH_RAW = OUTPUT_DIR+f"/dataset_raw.csv"
DATASET_OUTPUT_PATH_ONLY_INFOS = OUTPUT_DIR+f"/dataset_only_infos.csv"


# Infos for dataset creation


In [4]:
local_uniprot_infos = open_json(LOCAL_UNIPROT_INFOS_PATH)
dataset_config = open_json(DIR+"/dataset_config.json")

print(f"loaded {len(local_uniprot_infos)} uniprot infos from local storage")


loaded 578 uniprot infos from local storage


In [5]:
# prepare output dir
if not os.path.exists(OUTPUT_DIR):
    print(f"creating {OUTPUT_DIR} folder")
    os.mkdir(OUTPUT_DIR)


### Loop through all the required dataset


In [6]:
if not FRESH_START:
    main_df = pd.read_csv(DATASET_OUTPUT_PATH_RAW)
else:
    main_df = pd.DataFrame()
    main_df = add_missing_column(main_df, COLUMNS)

    for dataset_source in dataset_config["dataset_to_process"]:
        errors = {
            "no_sequence_in_data": 0,
            "not_in_local": 0,
            "wrong_position": 0,
            "no_uniprot": 0,
            "no_pdb": 0,
            "no_sequence": 0,
        }

        individual_config = dataset_config[dataset_source]
        # load csv
        if dataset_source == "thermomutdb":
            df = pd.read_json(individual_config["data_path"])
            df = df[df.mut_count.eq(0)]
            df[df.uniprot.eq('-')] = np.nan
            print(len(df))
        else:
            df = pd.read_csv(individual_config["data_path"])
        # rename columns
        df.rename(columns=individual_config["renaming_dict"],
                inplace=True)
        # add missing columns
        df = add_missing_column(df, COLUMNS)
        # split mutation code if needed
        if individual_config["need_mutation_code_split"]:
            df = df.apply(apply_split_mutation_code, axis=1)
        # remove nan mutation_code
        df = df[~df["mutation_position"].isna()]
        # keep only COLUMNS
        df = df[COLUMNS]
        # drop duplicates
        df.drop_duplicates(inplace=True)
        # add dataset_source
        df["dataset_source"] = dataset_source
        # index start at 0
        df["mutation_position"] = df["mutation_position"].apply(lambda x: x-1)
        # apply target corrections
        df["ddG"] = df["ddG"].apply(
            lambda x: x*individual_config["corrections"]["ddG"])
        df["dTm"] = df["dTm"].apply(
            lambda x: x*individual_config["corrections"]["dTm"])
        # better to initialize infos_found at 0 than nan
        df["infos_found"] = 0
        
        # check number of rows without uniprot
        # check validity of uniprot, and add the infos for those
        df = df.apply(lambda row: apply_valid_uniprot(
            row, local_uniprot_infos, dataset_config, errors), axis=1)
        
        
        print(f"processed {dataset_source}:")
        print(f"{errors=}\n")

        main_df = pd.concat([main_df, df], ignore_index=True)
        main_df.drop_duplicates(SUBSET_DUPLICATES, inplace=True)

    # save
    write_json(LOCAL_UNIPROT_INFOS_PATH, local_uniprot_infos)
    main_df.to_csv(DATASET_OUTPUT_PATH_RAW, index=False)


processed fireprotdb_curated:
errors={'no_sequence_in_data': 0, 'not_in_local': 0, 'wrong_position': 222, 'no_uniprot': 0, 'no_pdb': 0, 'no_sequence': 0}

processed fireprotdb_not_curated:
errors={'no_sequence_in_data': 0, 'not_in_local': 0, 'wrong_position': 5, 'no_uniprot': 0, 'no_pdb': 0, 'no_sequence': 0}

11201
processed thermomutdb:
errors={'no_sequence_in_data': 0, 'not_in_local': 0, 'wrong_position': 2651, 'no_uniprot': 0, 'no_pdb': 0, 'no_sequence': 0}

processed O2567_new:
errors={'no_sequence_in_data': 0, 'not_in_local': 0, 'wrong_position': 0, 'no_uniprot': 0, 'no_pdb': 0, 'no_sequence': 0}

processed prothermdb:
errors={'no_sequence_in_data': 0, 'not_in_local': 0, 'wrong_position': 1451, 'no_uniprot': 0, 'no_pdb': 0, 'no_sequence': 0}

processed jinyuan_sun_train:
errors={'no_sequence_in_data': 0, 'not_in_local': 0, 'wrong_position': 0, 'no_uniprot': 0, 'no_pdb': 0, 'no_sequence': 0}

processed jinyuan_sun_test:
errors={'no_sequence_in_data': 0, 'not_in_local': 0, 'wrong_p

### update mapping and try to add infos


In [7]:
if not UPDATE_MAPPING:
    # don't go beyond here with Run All
    assert False


In [8]:
# merge pdb_uniprot_mapping
# pdb_uniprot_mapping = open_json(PDB_UNIPROT_MAPPING_PATH)
# pdb_uniprot_mapping2 = open_json(
#     "./data/main_dataset_creation/mapping/pdb_uniprot_mapping_2.json")

# l = len(pdb_uniprot_mapping)
# for pdb, mapped_uniprot in pdb_uniprot_mapping2.items():
#     if pdb not in pdb_uniprot_mapping:
#         pdb_uniprot_mapping[pdb] = mapped_uniprot

# print(f"added {len(pdb_uniprot_mapping)-l} new pdb mapping via merge")


In [9]:
# update pdb to uniprot mapping
update_pdb_uniprot_mapping(LOCAL_UNIPROT_INFOS_PATH,
                           PDB_UNIPROT_MAPPING_PATH,
                           LINKED_UNIPROT_MAPPING_PATH)

pdb_uniprot_mapping = open_json(PDB_UNIPROT_MAPPING_PATH)
linked_uniprot_mapping = open_json(LINKED_UNIPROT_MAPPING_PATH)
pdb_without_uniprot = open_json(PDB_NO_UNIPROT_PATH)


added 0 entries to pdb_uniprot_mapping


In [10]:
# add infos based on pdb not uniprot
df = pd.read_csv(DATASET_OUTPUT_PATH_RAW)

with_infos = df.infos_found.sum()
df = df.apply(lambda row: apply_infos_from_pdb(row, local_uniprot_infos, pdb_uniprot_mapping,
                                               linked_uniprot_mapping, dataset_config,
                                               pdb_without_uniprot, errors),
              axis=1)
print(
    f"added {df.infos_found.sum()-with_infos} new infos thanks to uniprot_from_pdb")
df.to_csv(DATASET_OUTPUT_PATH_RAW, index=False)


added 2433.0 new infos thanks to uniprot_from_pdb


In [11]:
# update sequence to uniprot mapping
update_sequence_uniprot_mapping(LOCAL_UNIPROT_INFOS_PATH,
                                SEQUENCE_UNIPROT_MAPPING_PATH,
                                LINKED_UNIPROT_MAPPING_PATH)

sequence_uniprot_mapping = open_json(SEQUENCE_UNIPROT_MAPPING_PATH)
sequence_without_uniprot = open_json(SEQUENCE_NO_UNIPROT_PATH)


added 0 entries to sequence_uniprot_mapping


In [12]:
# add infos based on sequence not pdb or uniprot

df = pd.read_csv(DATASET_OUTPUT_PATH_RAW)

with_infos = df.infos_found.sum()
df = df.apply(lambda row: apply_infos_from_sequence(row, local_uniprot_infos, sequence_uniprot_mapping,
                                                    linked_uniprot_mapping, dataset_config,
                                                    sequence_without_uniprot, errors),
              axis=1)
print(
    f"added {df.infos_found.sum()-with_infos} new infos thanks to uniprot_from_sequence")

df.to_csv(DATASET_OUTPUT_PATH_RAW, index=False)


added 0.0 new infos thanks to uniprot_from_sequence


In [13]:
# make sure mapping and other data is saved
write_json(LOCAL_UNIPROT_INFOS_PATH, local_uniprot_infos)
write_json(PDB_NO_UNIPROT_PATH, pdb_without_uniprot)
write_json(SEQUENCE_NO_UNIPROT_PATH, sequence_without_uniprot)


## Final filtering

In [16]:
main_df = pd.read_csv(DATASET_OUTPUT_PATH_RAW)
dataset_config = open_json(DIR+"/dataset_config.json")
# remove pdbs
main_df = main_df[COLUMNS[1:]]
print(main_df.infos_found.eq(0.0).sum())

# remove record without uniprot infos
print(len(main_df))
main_df = main_df.loc[main_df.infos_found == 1]
print(len(main_df))
# fill na pH if in config, otherwise remove those
if dataset_config["general_config"]["fill_na_pH"]:
    mean_pH = main_df.pH.mean()
    print(mean_pH)
    main_df.pH.fillna(mean_pH, inplace=True)
else:
    print(len(main_df))
    main_df = main_df.loc[~[main_df.pH.isna()]]
    print(len(main_df))

dataset_infos = {
    "total_len": len(main_df),
    "dataset_processed": dataset_config["dataset_to_process"],
    "general_config": dataset_config["general_config"],
    "dataset_source_repartition": main_df.dataset_source.value_counts().to_dict(),
    "unique_uniprot": len(main_df.uniprot.unique()),
    "ddG": (len(main_df)-main_df.ddG.isna().sum()),
    "dTm": (len(main_df)-main_df.dTm.isna().sum()),
    "Tm": (len(main_df)-main_df.Tm.isna().sum()),
    "nan_repartition": main_df.isna().sum().to_dict(),
    "no_pH_repartition": main_df[main_df.pH.isna()].dataset_source.value_counts().to_dict(),
}

main_df.to_csv(DATASET_OUTPUT_PATH_ONLY_INFOS, index=False)
write_json(OUTPUT_DIR+"/dataset_config.json", dataset_config)
write_json(OUTPUT_DIR+"/dataset_infos.json", dataset_infos)


7589
23277
15688
6.504306897326444


In [15]:
print(dataset_infos)

{'total_len': 15688, 'dataset_processed': ['fireprotdb_curated', 'fireprotdb_not_curated', 'thermomutdb', 'O2567_new', 'prothermdb', 'jinyuan_sun_train', 'jinyuan_sun_test', 'datasetDDG_train', 'datasetDDG_test'], 'general_config': {'fill_na_pH': True}, 'dataset_source_repartition': {'fireprotdb_curated': 6329, 'thermomutdb': 3316, 'fireprotdb_not_curated': 2614, 'O2567_new': 1359, 'prothermdb': 1087, 'datasetDDG_train': 519, 'jinyuan_sun_train': 405, 'datasetDDG_test': 53, 'jinyuan_sun_test': 6}, 'unique_uniprot': 505, 'ddG': 10540, 'dTm': 4888, 'Tm': 4467, 'nan_repartition': {'uniprot': 0, 'wild_aa': 0, 'mutation_position': 0, 'mutated_aa': 0, 'pH': 0, 'sequence': 0, 'length': 0, 'chain_start': 0, 'chain_end': 0, 'AlphaFoldDB': 2051, 'Tm': 11221, 'ddG': 5148, 'dTm': 10800, 'dataset_source': 0, 'infos_found': 0}, 'no_pH_repartition': {}}
