# Dataset creation

![flowchart](./resources/dataset_creation_flowchart.drawio.png)


In [1]:
import pandas as pd
import numpy as np

from utils.file_utils import open_json, write_json
from utils.dataset_creation import *
from utils.dataset_mapping import *


In [2]:
FRESH_START = True
UPDATE_MAPPING = True


In [3]:
COLUMNS = ["pdbs", "uniprot", "wild_aa", "mutation_position",
           "mutated_aa", "pH",
           "sequence", "length", "chain_start", "chain_end",
           "AlphaFoldDB", "Tm", "ddG", "dTm",
           "dataset_source", "infos_found"]

SUBSET_DUPLICATES = ["uniprot", "wild_aa", "mutation_position",
                     "mutated_aa", "pH", "sequence"]

OUPUT_DIR = "./data/main_dataset_creation"
LOCAL_UNIPROT_INFOS_PATH = OUPUT_DIR+"/uniprot_infos.json"
PDB_UNIPROT_MAPPING_PATH = OUPUT_DIR+"/mapping/pdb_uniprot_mapping.json"
LINKED_UNIPROT_MAPPING_PATH = OUPUT_DIR+"/mapping/linked_uniprot_mapping.json"
SEQUENCE_UNIPROT_MAPPING_PATH = OUPUT_DIR + \
    "/mapping/sequence_uniprot_mapping.json"
PDB_NO_UNIPROT_PATH = OUPUT_DIR+"/mapping/pdb_no_uniprot.json"
SEQUENCE_NO_UNIPROT_PATH = OUPUT_DIR+"/mapping/sequence_no_uniprot.json"

DATASET_OUTPUT_PATH_RAW = OUPUT_DIR+"/dataset_raw.csv"
DATASET_OUTPUT_PATH_ONLY_INFOS = OUPUT_DIR+"/dataset_only_infos.csv"


# Infos for dataset creation


In [4]:
local_uniprot_infos = open_json(LOCAL_UNIPROT_INFOS_PATH)
dataset_config = open_json(OUPUT_DIR+"/dataset_config.json")

print(f"loaded {len(local_uniprot_infos)} uniprot infos from local storage")


loaded 549 uniprot infos from local storage


In [5]:
df = pd.read_csv("./data/ProThermDB/processed_prothermdb.csv")
df.mutation_code.isna().sum()


0

##### FireProtDB


In [6]:
if FRESH_START:
    main_df = pd.DataFrame()
    main_df = add_missing_column(main_df, COLUMNS)
else:
    main_df = pd.read_csv(DATASET_OUTPUT_PATH_RAW)

for dataset_source in dataset_config["dataset_to_process"]:
    errors = {
        "no_sequence_in_data": 0,
        "not_in_local": 0,
        "wrong_position": 0,
        "no_uniprot": 0,
        "no_pdb": 0,
        "no_sequence": 0,
    }

    individual_config = dataset_config[dataset_source]
    # load csv
    if dataset_source == "thermomutdb":
        df = pd.read_json(individual_config["data_path"])
        df = df[df.mut_count.eq(0)]
        df[df.uniprot.eq('-')] = np.nan
        print(len(df))
    else:
        df = pd.read_csv(individual_config["data_path"])
    # rename columns
    df.rename(columns=individual_config["renaming_dict"],
              inplace=True)
    # add missing columns
    df = add_missing_column(df, COLUMNS)
    # split mutation code if needed
    if individual_config["need_mutation_code_split"]:
        df = df.apply(apply_split_mutation_code, axis=1)
    # remove nan mutation_code
    df = df[~df["mutation_position"].isna()]
    # keep only COLUMNS
    df = df[COLUMNS]
    # drop duplicates
    df.drop_duplicates(inplace=True)
    # add dataset_source
    df["dataset_source"] = dataset_source
    # index start at 0
    df["mutation_position"] = df["mutation_position"].apply(lambda x: x-1)
    # apply target corrections
    df["ddG"] = df["ddG"].apply(
        lambda x: x*individual_config["corrections"]["ddG"])
    df["dTm"] = df["dTm"].apply(
        lambda x: x*individual_config["corrections"]["dTm"])
    # better to initialize infos_found at 0 than nan
    df["infos_found"] = 0
    
    # check number of rows without uniprot
    # check validity of uniprot, and add the infos for those
    df = df.apply(lambda row: apply_valid_uniprot(
        row, local_uniprot_infos, dataset_config, errors), axis=1)
    
    
    print(f"processed {dataset_source}:")
    print(f"{errors=}\n")

    main_df = pd.concat([main_df, df], ignore_index=True)
    main_df.drop_duplicates(SUBSET_DUPLICATES, inplace=True)

# save
write_json(LOCAL_UNIPROT_INFOS_PATH, local_uniprot_infos)
main_df.to_csv(DATASET_OUTPUT_PATH_RAW, index=False)


processed fireprotdb:
errors={'no_sequence_in_data': 0, 'not_in_local': 0, 'wrong_position': 222, 'no_uniprot': 0, 'no_pdb': 0, 'no_sequence': 0}

11201
processed thermomutdb:
errors={'no_sequence_in_data': 0, 'not_in_local': 0, 'wrong_position': 2651, 'no_uniprot': 0, 'no_pdb': 0, 'no_sequence': 0}

processed O2567_new:
errors={'no_sequence_in_data': 0, 'not_in_local': 0, 'wrong_position': 0, 'no_uniprot': 0, 'no_pdb': 0, 'no_sequence': 0}

processed prothermdb:
errors={'no_sequence_in_data': 0, 'not_in_local': 0, 'wrong_position': 1451, 'no_uniprot': 0, 'no_pdb': 0, 'no_sequence': 0}

processed jinyuan_sun_train:
errors={'no_sequence_in_data': 0, 'not_in_local': 0, 'wrong_position': 0, 'no_uniprot': 0, 'no_pdb': 0, 'no_sequence': 0}

processed jinyuan_sun_test:
errors={'no_sequence_in_data': 0, 'not_in_local': 0, 'wrong_position': 0, 'no_uniprot': 0, 'no_pdb': 0, 'no_sequence': 0}

processed datasetDDG_train:
errors={'no_sequence_in_data': 0, 'not_in_local': 0, 'wrong_position': 1527

In [7]:
# total number of elements before any filtering:
# fireprotdb          34448
# thermomutdb         13633
# prothermdb           8740
# O2567_new            2568
# datasetDDG_train     5444
# datasetDDG_test       276
# jinyuan_sun_train    4048
# jinyuan_sun_test      168
print(df.dataset_source.value_counts())


datasetDDG_test    276
Name: dataset_source, dtype: int64


### update mapping and try to add infos


In [8]:
if not UPDATE_MAPPING:
    # don't go beyond here with Run All
    assert False


In [9]:
# update pdb to uniprot mapping
update_pdb_uniprot_mapping(LOCAL_UNIPROT_INFOS_PATH,
                           PDB_UNIPROT_MAPPING_PATH,
                           LINKED_UNIPROT_MAPPING_PATH)

pdb_uniprot_mapping = open_json(PDB_UNIPROT_MAPPING_PATH)
linked_uniprot_mapping = open_json(LINKED_UNIPROT_MAPPING_PATH)
pdb_without_uniprot = open_json(PDB_NO_UNIPROT_PATH)


added 0 entries to pdb_uniprot_mapping


In [10]:
# add infos based on pdb not uniprot
df = pd.read_csv(DATASET_OUTPUT_PATH_RAW)

with_infos = df.infos_found.sum()
df = df.apply(lambda row: apply_infos_from_pdb(row, local_uniprot_infos, pdb_uniprot_mapping,
                                               linked_uniprot_mapping, dataset_config,
                                               pdb_without_uniprot, errors),
              axis=1)
print(
    f"added {df.infos_found.sum()-with_infos} new infos thanks to uniprot_from_pdb")
df.to_csv(DATASET_OUTPUT_PATH_RAW, index=False)


added 2362.0 new infos thanks to uniprot_from_pdb


In [11]:
# update sequence to uniprot mapping
update_sequence_uniprot_mapping(LOCAL_UNIPROT_INFOS_PATH,
                                SEQUENCE_UNIPROT_MAPPING_PATH,
                                LINKED_UNIPROT_MAPPING_PATH)

sequence_uniprot_mapping = open_json(SEQUENCE_UNIPROT_MAPPING_PATH)
sequence_without_uniprot = open_json(SEQUENCE_NO_UNIPROT_PATH)


added 0 entries to sequence_uniprot_mapping


In [12]:
# add infos based on sequence not pdb or uniprot

df = pd.read_csv(DATASET_OUTPUT_PATH_RAW)

with_infos = df.infos_found.sum()
df = df.apply(lambda row: apply_infos_from_sequence(row, local_uniprot_infos, sequence_uniprot_mapping,
                                                    linked_uniprot_mapping, dataset_config,
                                                    sequence_without_uniprot, errors),
              axis=1)
print(
    f"added {df.infos_found.sum()-with_infos} new infos thanks to uniprot_from_sequence")

df.to_csv(DATASET_OUTPUT_PATH_RAW, index=False)


added 0.0 new infos thanks to uniprot_from_sequence


In [13]:
# make sure mapping and other data is saved
write_json(LOCAL_UNIPROT_INFOS_PATH, local_uniprot_infos)
write_json(PDB_NO_UNIPROT_PATH, pdb_without_uniprot)
write_json(SEQUENCE_NO_UNIPROT_PATH, sequence_without_uniprot)


## Final filtering

In [14]:
main_df = pd.read_csv(DATASET_OUTPUT_PATH_RAW)
# remove pdbs
main_df = main_df[COLUMNS[1:]]
# remove record without uniprot infos
main_df = main_df[~main_df.infos_found.eq("0.0")]
print(len(main_df))
main_df.to_csv(DATASET_OUTPUT_PATH_ONLY_INFOS)


20922


## Quick check of the data

In [15]:
"I"
s="MMSFVSLLLVGILFHATQAEQLTKCEVFQKLKDLKDYGGVSLPEWVCTAFHTSGYDTQAIVQNNDSTEYGLFQINNKIWCKDDQNPHSRNICNISCDKFLDDDLTDDIVCAKKILDKVGINYWLAHKALCSEKLDQWLCEKL"
s[107]

'I'

In [16]:
main_df.pH.isna().sum()


4147

In [17]:
len(main_df.uniprot.unique())


548

In [18]:
print(len(main_df)-main_df.ddG.isna().sum())
print(len(main_df)-main_df.dTm.isna().sum())
print(len(main_df)-main_df.Tm.isna().sum())


14882
6066
4047


In [19]:
# total number of elements:
# fireprotdb          34448
# thermomutdb         13633
# prothermdb           8740
# O2567_new            2568
# datasetDDG_train     5444
# datasetDDG_test       276
# jinyuan_sun_train    4048
# jinyuan_sun_test      168
print(main_df.dataset_source.value_counts())


fireprotdb           6724
thermomutdb          5655
jinyuan_sun_train    3473
O2567_new            2522
prothermdb           1491
datasetDDG_train      864
jinyuan_sun_test      127
datasetDDG_test        66
Name: dataset_source, dtype: int64
