# Individual Datasets Creation
See [./data/sources.csv](./data/sources.csv) for the sources from which each dataset was downloaded.
Simply put, a few datasets exist that contain information about the impact of mutations on protein stability.
The three main large datasets available as of Oct. 2022 are:
- FireProtDB
- ThermoMutDB
- ProThermDB

All of them incorporate data from the same older DB (ProTherm), plus additional data added by the developers of each DB.
Other datasets, cited in papers from the literature, also exist.
We can expect a lot of redundancy between the datasets.


The goal of this notebook is to create a coherent dataset from each DB, so that they can then be compared and merged.
The merging itself is done in another Notebook: [datasets_merging](./datasets_merging.ipynb).

In [1]:
import pandas as pd
import numpy as np
import pypdb

In [2]:
# Column layout shared by every dataset written by this notebook.
# Candidate extra column, currently not part of the list: "mutation_sequence_code"
COLUMNS = [
    "PDB_wild",
    "uniprot",
    "mutated_chain",
    "mutation_code",
    "pH",
    "Texp",
    "Tm",
    "ddG",
    "dTm",
]

# Set to False to export every column available in a source DB instead of only COLUMNS.
SAVE_ONLY_COLUMNS = True

def add_missing_column(df):
    """Add every column of COLUMNS that ``df`` lacks, filled with NaN.

    The frame is modified in place and is also returned.
    """
    absent = [col for col in COLUMNS if col not in df.columns.to_list()]
    for col in absent:
        df[col] = np.nan

    return df

def save_df(df, name):
    """Write ``df`` to ./data/main_dataset/<name>.csv without the index.

    When SAVE_ONLY_COLUMNS is set, only the columns listed in COLUMNS are written.
    """
    exported = df[COLUMNS] if SAVE_ONLY_COLUMNS else df
    exported.to_csv(f"./data/main_dataset/{name}.csv", index=False)

In [3]:
def seq_to_pdb(seq):
    """Look up a PDB id for a protein sequence through the RCSB search API.

    Runs a pypdb sequence query returning polymer entities and picks the first
    hit whose score is exactly 1.0 and whose identifier suffix (the part after
    the last '_') is "1".

    Args:
        seq: protein sequence to search for.

    Returns:
        The id part of the first matching identifier, or "" when no hit
        matches or when the search returned nothing at all.
    """
    q = pypdb.Query(seq,
        query_type="sequence",
        return_type="polymer_entity")

    response = q.search()
    # pypdb hands back None when the retrieval fails or yields no content;
    # indexing it directly would raise a TypeError.
    if not response:
        return ""

    for result in response.get("result_set", []):
        # identifiers are "<id>_<suffix>"; split on the last '_' only
        result_id, _, suffix = result["identifier"].rpartition('_')
        if result["score"] == 1.0 and suffix == "1":
            return result_id

    return ""

In [4]:
##### Ssym+ #####

# Only the experimental columns are kept (prediction and index columns are left out)
columns_experimental = [
    "Protein", "Mut_pdb", "DDG_dir", "DDG_inv", "DDG", "Ph", "T", "Mut_Seq",
    "Protein_inv", "Mut_pdb_inv", "Mut_Seq_inv",
]

# Load, restrict to the experimental columns, and switch to the shared column names
ssym_dir_df = pd.read_csv("./data/Ssym+/Ssym+_experimental.csv")[columns_experimental].rename(
    columns={
        "Mut_pdb": "mutation_code",
        "Mut_Seq": "mutation_sequence_code",
        "Ph": "pH",
        "DDG_dir": "ddG",
        "DDG_inv": "ddG_inv",
    }
)
ssym_dir_df = add_missing_column(ssym_dir_df)


def process_temperature(row):
    """Store ``T`` shifted by 273.15 in either ``Texp`` or ``Tm``.

    A ``T`` of exactly 25 is written to ``Texp``; any other value goes to ``Tm``.
    """
    target = "Texp" if row["T"] == 25 else "Tm"
    row[target] = row["T"]+273.15
    return row

def process_prot_name(row):
    """Split ``Protein`` into ``PDB_wild`` (all but the last character, upper-cased)
    and ``mutated_chain`` (the last character)."""
    protein = row["Protein"]
    pdb_id, chain = protein[:-1], protein[-1]
    row["PDB_wild"] = pdb_id.upper()
    row["mutated_chain"] = chain
    return row

# Fill Texp/Tm and PDB_wild/mutated_chain row by row, then discard the raw source columns
ssym_dir_df = (
    ssym_dir_df
    .apply(process_temperature, axis=1)
    .apply(process_prot_name, axis=1)
    .drop(columns=["T", "Protein"])
)

def create_reverse_row(row):
    """Build the reverse-mutation counterpart of one Ssym+ row.

    For each name in COLUMNS the value is taken from, in order of preference:
    the "<name>_inv" column, a known reverse column with a different name
    (``Mut_pdb_inv`` for ``mutation_code``), or the direct column itself.
    ``PDB_wild`` and ``mutated_chain`` are derived from ``Protein_inv``.

    The input row is not modified.

    Args:
        row: one row of the direct Ssym+ frame, still holding the "*_inv" columns.

    Returns:
        A pd.Series of the reverse values, ordered like COLUMNS (positional index).
    """
    protein_inv = row["Protein_inv"]
    # values derived from Protein_inv instead of being copied from a column
    derived = {
        "PDB_wild": protein_inv[:-1].upper(),
        "mutated_chain": protein_inv[-1],
    }
    # reverse columns whose name does not follow the "<name>_inv" pattern;
    # without this the reverse row would reuse the direct mutation code
    inverse_aliases = {"mutation_code": "Mut_pdb_inv"}

    reverse_row = []
    for name in COLUMNS:
        if name in derived:
            reverse_row.append(derived[name])
            continue

        alias = inverse_aliases.get(name)
        if name+"_inv" in row:
            k = name+"_inv"
        elif alias is not None and alias in row:
            k = alias
        else:
            k = name
        reverse_row.append(row[k])

    return pd.Series(reverse_row)


# Every direct row gets a companion row describing the reverse mutation
ssym_reverse_df = ssym_dir_df.apply(create_reverse_row, axis=1)
ssym_reverse_df.columns = COLUMNS

# Keep only the shared columns of the direct rows before stacking both halves
ssym_dir_df = ssym_dir_df.loc[:, COLUMNS]
ssym_df = pd.concat((ssym_dir_df, ssym_reverse_df), ignore_index=True)

save_df(ssym_df, "ssym")

In [5]:
##### ThermoMutDB #####

# Bibliographic columns that are not carried over
columns_to_rm = ["year", "reference", "PMID"]

# Load the JSON dump, drop those columns, and switch to the shared column names
thermomut_df = (
    pd.read_json("./data/ThermoMutDB/thermomutdb.json")
    .drop(columns=columns_to_rm)
    .rename(columns={"ph": "pH", "temperature": "Texp", "ddg": "ddG", "dtm": "dTm"})
)

def process_code(row):
    """Normalise a comma-separated ``mutation_code`` to space-separated form.

    Example: '"R15H,E20S"' becomes 'R15H E20S'. Double quotes are removed and
    whitespace around each code is trimmed, so 'R15H, E20S' also becomes
    'R15H E20S'. Codes without a comma are left untouched.

    Args:
        row: a row holding a ``mutation_code`` entry.

    Returns:
        The same row, with ``mutation_code`` rewritten when it contained a comma.
    """
    code = row["mutation_code"]
    # missing codes arrive as NaN/None; `',' in code` would raise on them
    if not isinstance(code, str) or ',' not in code:
        return row

    parts = (s.replace('"', '').strip() for s in code.split(','))
    # skip empty pieces (e.g. from a trailing comma) so no stray spaces remain
    row["mutation_code"] = ' '.join(p for p in parts if p)
    return row

thermomut_df = add_missing_column(thermomut_df)
# COLUMNS first, then every other column in its existing order.
# (A set difference would yield an arbitrary order that can change between runs.)
thermomut_df = thermomut_df[COLUMNS + [c for c in thermomut_df.columns if c not in COLUMNS]]
thermomut_df = thermomut_df.apply(process_code, axis=1)
save_df(thermomut_df, "thermomut")


In [6]:
##### O2567 #####

# Load the csv and switch to the shared column names
o2567_df = pd.read_csv("./data/O2567_new/O2567_new.csv").rename(
    columns={
        "PDB code": "PDB_wild",
        "Chain": "mutated_chain",
        "Experimental ddG": "ddG",
        "dtm": "dTm",
        "Temperature": "Texp",
        "Method": "method",
        "RSA": "rsa",
    }
)

o2567_df = add_missing_column(o2567_df)

def process_o2567(row):
    """Fill ``mutation_code`` and convert ``Texp`` to a float in Kelvin.

    ``mutation_code`` is ``Wild`` + ``Residue number`` + ``Mutated``.
    A ``Texp`` string ending in "K" (e.g. "298 K", "298K") is taken as already
    in Kelvin; any other value is converted to float and shifted by 273.15
    (same unit as ThermoMutDB).

    Args:
        row: a row holding ``Wild``, ``Residue number``, ``Mutated`` and ``Texp``.

    Returns:
        The same row with ``mutation_code`` and ``Texp`` updated.

    Raises:
        ValueError: if ``Texp`` cannot be parsed as a number.
    """
    # convert to mutation code directly
    row["mutation_code"] = row["Wild"]+str(row["Residue number"])+row["Mutated"]

    t = row["Texp"]
    kelvin_text = t.strip() if isinstance(t, str) else None
    if kelvin_text is not None and kelvin_text.endswith("K"):
        # drop only the unit; float() tolerates the whitespace left before it
        row["Texp"] = float(kelvin_text[:-1])
    else:
        row["Texp"] = float(t) + 273.15
    return row



# Convert every row, keep only the shared columns, and export
o2567_df = o2567_df.apply(process_o2567, axis=1).loc[:, COLUMNS]
save_df(o2567_df, "o2567")

In [7]:
##### Jinyuan Sun's dataset (PhD student in bio) #####

# expected header of test.csv:  pdb,wildtype,pdb_resseq,seq_index,mutation,wt_seq,mut_seq,ddG
v2_test_df = pd.read_csv("./data/jinyuan_sun/v2/test.csv")
# expected header of train.csv: pdb,wildtype,pdb_resseq,seq_index,mutation,wt_seq,mut_seq,ddG,group
v2_train_df = pd.read_csv("./data/jinyuan_sun/v2/train.csv").drop(columns=["group"])

# Stack both splits and switch to the shared column names
jinyuan_sun_df = pd.concat([v2_test_df, v2_train_df], ignore_index=True).rename(
    columns={"pdb": "PDB_wild", "wt_seq": "wildtype_seq", "mut_seq": "mutated_seq"}
)

def process_jinyuan_sun(row):
    """Build both mutation codes of a row: wild type + position + mutation.

    ``mutation_code`` uses ``pdb_resseq`` as the position and
    ``mutation_sequence_code`` uses ``seq_index``.
    """
    wild, mutant = row["wildtype"], row["mutation"]
    row["mutation_code"] = wild+str(row["pdb_resseq"])+mutant
    row["mutation_sequence_code"] = wild+str(row["seq_index"])+mutant
    return row

# Build the mutation codes, then discard the columns they were assembled from
jinyuan_sun_df = (
    jinyuan_sun_df
    .apply(process_jinyuan_sun, axis=1)
    .drop(columns=["wildtype", "pdb_resseq", "seq_index", "mutation"])
)

jinyuan_sun_df = add_missing_column(jinyuan_sun_df)

save_df(jinyuan_sun_df, "jinyuan_sun")


In [9]:
##### FireProtDB #####

# csv obtained by searching all values with "has_ddg"
# NOTE(review): the file name also mentions dTm and is_curated -- confirm the exact export query
fire_df = pd.read_csv("./data/FireProtDB/fireprotdb_has_ddg_or_dtm_is_curated.csv")

# Keep only curated data
fire_df = fire_df[fire_df["is_curated"]]

# Drop duplicate rows
# THIS REMOVES 14k+ entries !!!!
duplicate_subset = ["pdb_id", "dTm", "ddG", "chain", "wild_type", "position", "mutation", "sequence"]
fire_df = fire_df.drop_duplicates(duplicate_subset)

# Drop rows with no valid pdb files ~10-15
fire_df = fire_df[~pd.isna(fire_df.pdb_id)]
# Remove repeated pdb ids inside the pdb_id column (keep '|' for now).
# dict.fromkeys keeps the first-occurrence order; going through a set would
# give an arbitrary id order that can change between runs.
fire_df["pdb_id"] = fire_df["pdb_id"].apply(lambda x: "|".join(dict.fromkeys(x.split("|"))))

# Remove columns without useful information
fire_df = fire_df.drop(columns=['is_curated', 'is_essential', 'is_back_to_consensus', 'method_details', 
'technique_details', 'notes', 'publication_doi', 'publication_pubmed'])

# Switch to the shared column names
fire_df.rename(columns={"pdb_id": "PDB_wild", "chain": "mutated_chain", "tm": "Tm", "uniprot_id": "uniprot"}, inplace=True)

fire_df = add_missing_column(fire_df)

# count_wrong_seq_to_pdb = 0
# no_seq_to_pdb = 0
# wrong_seq_to_pdb_ids = []
# count = 0

def process_fire(row):
    """Compose ``mutation_code`` as wild type + position + mutation for one row."""
    wild, position, mutant = row["wild_type"], row["position"], row["mutation"]
    row["mutation_code"] = wild+str(position)+mutant

    # Disabled experiment, kept for reference: compare the PDB id derived from the
    # sequence (seq_to_pdb) against PDB_wild on the first 100 rows.
    # global count_wrong_seq_to_pdb
    # global wrong_seq_to_pdb_ids
    # global no_seq_to_pdb
    # global count

    # if count < 100:
    #     count += 1
    #     # convert to mutation code directly
    #     computed_pdb = seq_to_pdb(row["sequence"])
    #     if computed_pdb == "":
    #         no_seq_to_pdb += 1
    #     elif (computed_pdb not in row["PDB_wild"]):
    #         count_wrong_seq_to_pdb += 1
    #         wrong_seq_to_pdb_ids.append(row["experiment_id"])
    return row

fire_df = fire_df.apply(process_fire, axis=1)

# print(f"{count_wrong_seq_to_pdb=}")
# print(f"{no_seq_to_pdb=}")
# from utils.file_utils import write_json
# write_json("wrong_seq_to_pdb_ids.json", wrong_seq_to_pdb_ids)

# the pieces of mutation_code are no longer needed
fire_df.drop(columns=["wild_type", "position", "mutation"], axis=1, inplace=True)

# COLUMNS first, then every other column in its existing order.
# (A set difference would yield an arbitrary order that can change between runs.)
fire_df = fire_df[COLUMNS + [c for c in fire_df.columns if c not in COLUMNS]]
save_df(fire_df, "fireprotdb_ddg_dtm_curated")

In [10]:
##### Kaggle (competition dataset) #####

kaggle_df = pd.read_csv("./data/Kaggle/updated_train.csv")
# Switch to the shared column names
kaggle_df.rename(columns={"tm": "Tm", "protein_sequence": "mutated_seq"}, inplace=True)

kaggle_df = add_missing_column(kaggle_df)
# COLUMNS first, then every other column in its existing order.
# (A set difference would yield an arbitrary order that can change between runs.)
kaggle_df = kaggle_df[COLUMNS + [c for c in kaggle_df.columns if c not in COLUMNS]]

save_df(kaggle_df, "kaggle")

In [11]:
##### DeepDDG dataset #####
# The train split carries a cross-validation fold column that the test split lacks
train_df = pd.read_csv("./data/DeepDDG_train_dataset/datasetDDG_train.csv").drop(
    columns="Fold ID in 10-fold cross validation"
)
test_df = pd.read_csv("./data/DeepDDG_train_dataset/datasetDDG_test.csv")

# Stack both splits and switch to the shared column names
deepddg_df = pd.concat([train_df, test_df], ignore_index=True).rename(
    columns={
        "PDB ID with modifications to be made": "PDB_wild",
        "Uniprot ID": "uniprot",
        "Mutation": "mutation_code",
        "T": "Texp",
        "ΔΔG (kcal/mol) positive is stable": "ddG",
        "Source": "source",
        "Protein name": "protein",
        "PubMed ID": "PMID",
    }
)

deepddg_df = add_missing_column(deepddg_df)

# remove '_' from mutation code
def process_deepddg(row):
    """Strip every '_' from ``mutation_code`` and shift ``Texp`` by 273.15."""
    cleaned_code = "".join(row["mutation_code"].split("_"))
    row["mutation_code"] = cleaned_code
    row["Texp"] = row["Texp"] + 273.15
    return row

deepddg_df = deepddg_df.apply(process_deepddg, axis=1)

# COLUMNS first, then every other column in its existing order.
# (A set difference would yield an arbitrary order that can change between runs.)
deepddg_df = deepddg_df[COLUMNS + [c for c in deepddg_df.columns if c not in COLUMNS]]
save_df(deepddg_df, "deepddg")


In [12]:
##### ProThermDB #####
# The ddG and dTm exports are loaded separately, then stacked into one frame
all_ddg_df = pd.read_csv("./data/ProThermDB/prothermdb_all_ddg.tsv", sep="\t")
all_dtm_df = pd.read_csv("./data/ProThermDB/prothermdb_all_dTm.tsv", sep="\t")

# Columns that keep their original name:
# ['LENGTH', '∆G_H2O_(kcal/mol)', '∆∆G_H2O_(kcal/mol)', '∆H_(kcal/mol)', '∆HvH_(kcal/mol)', 'm_(kcal/mol/M)', 'Cm_(M)', '∆Cp_(kcal/mol)', 'STATE', 'REVERSIBILITY']

prothermdb_df = (
    pd.concat([all_ddg_df, all_dtm_df], ignore_index=True)
    # removes duplicate: ~400 rows
    .drop_duplicates()
    .rename(
        columns={
            "PROTEIN": "protein",
            "UniProt_ID": "uniprot",
            "PubMed_ID": "PMID",
            "SOURCE": "source",
            'MEASURE': "measure",
            'METHOD': "method",
            'T_(C)': "Texp",
            'Tm_(C)': "Tm",
            '∆Tm_(C)': "dTm",
            '∆G_(kcal/mol)': "dG",
            '∆∆G_(kcal/mol)': "ddG",
        }
    )
)

prothermdb_df = add_missing_column(prothermdb_df)

def _prothermdb_number(value):
    """Parse the number in front of an optional "(...)" suffix; NaN when nothing is there."""
    if value is None:
        return np.nan
    text = str(value).split('(')[0].strip()
    return float(text) if text else np.nan


def process_prothermdb(row):
    """Clean one ProThermDB row into the shared column conventions.

    Steps:
      * '-' placeholders are replaced by "".
      * ``PDB_wild`` is upper-cased.
      * ``Texp`` and ``Tm`` are parsed (values such as "23(1.2)" or ">96") and
        shifted by 273.15; ``ddG`` and ``dTm`` are parsed to float. Empty or
        missing values become NaN. A value of 0 is kept as a number.
      * ``PDB_Chain_Mutation`` is split into ``mutated_chain`` (first character
        only) and a space-separated ``mutation_code``.
      * ``mutation_sequence_code`` is ``MUTATION`` without its "(...)" suffix.
      * spaces are removed from ``uniprot``.

    Non-string cells (e.g. NaN) are tolerated instead of raising.

    Args:
        row: one row of the ProThermDB frame after renaming.

    Returns:
        The same row, updated in place.
    """
    # '-' is used as the "no value" marker in this export
    for k in row.keys():
        if row[k]=='-':
            row[k]=""

    # coherent PDB_wild: all caps
    if isinstance(row["PDB_wild"], str):
        row["PDB_wild"] = row["PDB_wild"].upper()

    # lower bounds such as ">96" are reduced to their number
    tm_raw = row["Tm"]
    if isinstance(tm_raw, str) and '>' in tm_raw:
        tm_raw = tm_raw.split(">")[1]

    # Temperatures: parse, then shift by 273.15. NaN propagates through the
    # addition, so no truthiness test is needed (one would also discard 0).
    row["Texp"] = _prothermdb_number(row["Texp"]) + 273.15
    row["Tm"] = _prothermdb_number(tm_raw) + 273.15

    # make sure that ddG and dTm are floats as well
    row["ddG"] = _prothermdb_number(row["ddG"])
    row["dTm"] = _prothermdb_number(row["dTm"])

    # handling the difference in mutation code, including multiple mutations
    chains = []
    codes = []
    chain_mutation = row["PDB_Chain_Mutation"]
    if isinstance(chain_mutation, str):
        for entry in chain_mutation.split(' '):
            parts = entry.split(":")
            if len(parts) != 2:
                continue
            pdb_wild, mut = parts
            # normally we have 1csp_A => A
            if '_' in pdb_wild:
                chains.append(pdb_wild.split('_')[-1])

            # sometimes we have A_M1R => A & M1R
            if '_' in mut:
                mut_parts = mut.split('_')
                chains.append(mut_parts[0])
                codes.append(mut_parts[1])
            else:
                codes.append(mut)

    # we only need first char of mutated_chain
    row["mutated_chain"] = "".join(chains)[:1]
    row["mutation_code"] = " ".join(codes).strip()

    # M1R E3K K65I E66K(Based on UniProt and PDB) => M1R E3K K65I E66K
    mutation = row["MUTATION"]
    row["mutation_sequence_code"] = mutation.split('(')[0].strip() if isinstance(mutation, str) else ""

    # remove spaces in some uniprot id:
    if isinstance(row["uniprot"], str):
        row["uniprot"] = row["uniprot"].replace(" ", "")

    return row

prothermdb_df = prothermdb_df.apply(process_prothermdb, axis=1)

# Bookkeeping columns and the raw mutation columns are not exported
prothermdb_df.drop(columns=['NO','KEY_WORDS', 'REFERENCE', 'AUTHOR', 'REMARKS', 'RELATED_ENTRIES', "MUTATION", "PDB_Chain_Mutation"],
                    inplace=True)

# COLUMNS first, then every other column in its existing order.
# (A set difference would yield an arbitrary order that can change between runs.)
prothermdb_df = prothermdb_df[COLUMNS + [c for c in prothermdb_df.columns if c not in COLUMNS]]
save_df(prothermdb_df, "prothermdb")

In [None]:
# TODO:
# - check for multiple mutations : how to handle ?
# - careful sometimes '-' instead of NaN/empty