In [12]:
import glob
import os
import pandas as pd
import numpy as np

In [13]:
# Shared output schema: every per-source CSV written by this notebook starts
# with these columns, in this order.
COLUMNS = [
    "PDB_wild",
    "mutated_chain",
    "mutation_code",
    "mutation_sequence_code",
    "pH",
    "Texp",
    "Tm",
    "ddG",
]

In [14]:
# Load Ssym+ and keep only the experimental columns (this discards the
# prediction and index columns of the CSV).
columns_experimental = [
    "Protein", "Mut_pdb", "DDG_dir", "DDG_inv", "DDG", "Ph", "T",
    "Mut_Seq", "Protein_inv", "Mut_pdb_inv", "Mut_Seq_inv",
]
ssym_dir_df = pd.read_csv("./data/Ssym+/Ssym+_experimental.csv")[columns_experimental]

# Add the shared-schema columns that are filled in row by row further down
# (they start out as NaN), then rename the direct-mutation columns to the
# shared-schema names. The *_inv columns keep their original Ssym+ names.
ssym_dir_df = (
    ssym_dir_df
    .assign(Texp=np.nan, Tm=np.nan, PDB_wild=np.nan, mutated_chain=np.nan)
    .rename(columns={
        "Ph": "pH",
        "Mut_pdb": "mutation_code",
        "Mut_Seq": "mutation_sequence_code",
        "DDG": "ddG",
    })
)



def process_temperature(row):
    """Store the row's ``T`` value, shifted by 273.15, in ``Texp`` or ``Tm``.

    A ``T`` of exactly 25 is written to ``Texp``; any other value (including
    NaN) is written to ``Tm``. The 273.15 offset is presumably a Celsius to
    Kelvin conversion -- confirm against the Ssym+ documentation.

    Args:
        row: mapping-like record (e.g. a DataFrame row) with a ``T`` entry.

    Returns:
        The same ``row`` object, modified in place.
    """
    shifted = row["T"] + 273.15
    destination = "Texp" if row["T"] == 25 else "Tm"
    row[destination] = shifted
    return row

def process_prot_name(row):
    """Split the ``Protein`` string into ``PDB_wild`` and ``mutated_chain``.

    The last character of ``Protein`` becomes ``mutated_chain``; everything
    before it, upper-cased, becomes ``PDB_wild``.

    Args:
        row: mapping-like record with a string ``Protein`` entry.

    Returns:
        The same ``row`` object, modified in place.
    """
    protein_tag = row["Protein"]
    pdb_part = protein_tag[:-1]
    row["PDB_wild"] = pdb_part.upper()
    chain_part = protein_tag[-1]
    row["mutated_chain"] = chain_part
    return row

# Fill Texp/Tm and PDB_wild/mutated_chain row by row, then drop the two raw
# columns those values were derived from.
ssym_dir_df = (
    ssym_dir_df
    .apply(process_temperature, axis=1)
    .apply(process_prot_name, axis=1)
    .drop(columns=["T", "Protein"])
)

def create_reverse_row(row, columns=None):
    """Build the reverse-mutation record for one Ssym+ row.

    For each output column the value is taken from the row's inverse column
    when one exists, otherwise from the direct column. The previous
    implementation only looked for ``<schema name>_inv`` labels; the Ssym+
    inverse columns are named ``Mut_pdb_inv``, ``Mut_Seq_inv``, ``DDG_inv``
    and ``Protein_inv``, so nothing ever matched and every "reverse" row was
    a plain copy of the direct row.

    Args:
        row: mapping-like record (DataFrame row) holding both the direct
            shared-schema columns and the Ssym+ ``*_inv`` columns.
        columns: output column names, in order. Defaults to the notebook's
            ``COLUMNS`` list.

    Returns:
        pd.Series: the reverse-row values, positionally indexed in the order
        of ``columns``.
    """
    if columns is None:
        columns = COLUMNS

    # Shared-schema name -> Ssym+ column holding the reverse-mutation value.
    # NOTE(review): assumes DDG_inv uses the same sign convention as the
    # direct "DDG" column that feeds ddG -- confirm against the Ssym+ docs.
    inverse_source = {
        "mutation_code": "Mut_pdb_inv",
        "mutation_sequence_code": "Mut_Seq_inv",
        "ddG": "DDG_inv",
    }

    # NOTE(review): assumes Protein_inv has the same "<pdb id><chain>" layout
    # as Protein (see process_prot_name) -- confirm.
    protein_inv = row["Protein_inv"] if "Protein_inv" in row else None
    has_protein_inv = isinstance(protein_inv, str) and len(protein_inv) > 1

    reverse_row = []
    for name in columns:
        if name == "PDB_wild" and has_protein_inv:
            reverse_row.append(protein_inv[:-1].upper())
        elif name == "mutated_chain" and has_protein_inv:
            reverse_row.append(protein_inv[-1])
        else:
            # "<name>_inv" is still honoured first, so frames that already
            # carry renamed inverse columns keep working.
            candidates = (name + "_inv", inverse_source.get(name))
            key = next((c for c in candidates if c is not None and c in row), name)
            reverse_row.append(row[key])
    return pd.Series(reverse_row)


# Derive one reverse-mutation row per direct row. This has to run while
# ssym_dir_df still carries the *_inv columns, i.e. before it is narrowed.
ssym_reverse_df = ssym_dir_df.apply(create_reverse_row, axis=1)
ssym_reverse_df.columns = COLUMNS

# Narrow the direct rows to the shared schema.
ssym_dir_df = ssym_dir_df.loc[:, COLUMNS]

# Direct rows first, reverse rows after, with a fresh 0..n-1 index.
ssym_df = pd.concat((ssym_dir_df, ssym_reverse_df), ignore_index=True)

ssym_df.to_csv("./data/main_dataset/ssym.csv", index=False)

In [15]:
# Load the raw ThermoMutDB JSON export.
thermomut_df = pd.read_json("./data/ThermoMutDB/thermomutdb.json")

# Bibliographic columns are dropped before export.
columns_to_rm = ["year", "reference", "PMID"]
thermomut_df = thermomut_df.drop(columns=columns_to_rm)

# Align the source column names with the shared-schema naming.
thermomut_df = thermomut_df.rename(columns={"temperature": "Texp", "ph": "pH", "dtm": "dTm", "ddg": "ddG"})
thermomut_cols = thermomut_df.columns.to_list()

# Any shared-schema column the source lacks is added as an all-NaN column.
for name in COLUMNS:
    if name not in thermomut_cols:
        thermomut_df[name] = np.nan

# Shared-schema columns first, then the remaining source columns in their
# original order. The previous set difference made the order of the extra
# columns depend on string-hash randomization, so the CSV layout could change
# from one kernel run to the next.
thermomut_extra_cols = [col for col in dict.fromkeys(thermomut_cols) if col not in COLUMNS]
thermomut_df = thermomut_df[COLUMNS + thermomut_extra_cols]
thermomut_df.to_csv("./data/main_dataset/thermomutdb.csv", index=False)

In [16]:
# Load O2567 and rename its columns to the shared-schema / lower-case names.
o2567_df = pd.read_csv("./data/O2567_new/O2567_new.csv").rename(
    columns={
        "PDB code": "PDB_wild",
        "Chain": "mutated_chain",
        "dtm": "dTm",
        "Experimental ddG": "ddG",
        "Temperature": "Texp",
        "Method": "method",
        "RSA": "rsa",
    }
)

# Any shared-schema column the source lacks is added as an all-NaN column.
for name in (col for col in COLUMNS if col not in o2567_df.columns):
    o2567_df[name] = np.nan


def process_o2567(row):
    """Fill ``mutation_code`` and normalise ``Texp`` for one O2567 row.

    ``mutation_code`` is the concatenation of ``Wild``, ``Residue number``
    and ``Mutated``. ``Texp`` is handled in one of two ways:

    * a ``str`` containing ``" K"`` has its last two characters removed and
      is parsed as a float (already in Kelvin);
    * anything else is parsed as a float and shifted by 273.15 (conversion
      to Kelvin).

    Args:
        row: mapping-like record with ``Wild``, ``Residue number``,
            ``Mutated`` and ``Texp`` entries.

    Returns:
        The same ``row`` object, modified in place.
    """
    wild, number, mutated = row["Wild"], row["Residue number"], row["Mutated"]
    row["mutation_code"] = wild + str(number) + mutated

    raw_temp = row["Texp"]
    already_kelvin = type(raw_temp) is str and " K" in raw_temp
    row["Texp"] = float(raw_temp[:-2]) if already_kelvin else float(raw_temp) + 273.15
    return row



# Build mutation_code and convert Texp row by row, keep only the shared
# schema, and export.
o2567_df = o2567_df.apply(process_o2567, axis=1).loc[:, COLUMNS]
o2567_df.to_csv("./data/main_dataset/o2567.csv", index=False)

In [18]:
"pdb,wildtype,pdb_resseq,seq_index,mutation,wt_seq,mut_seq,ddG"
v2_test_df = pd.read_csv("./data/jinyuan_sun/v2/test.csv")
# train.csv columns: pdb,wildtype,pdb_resseq,seq_index,mutation,wt_seq,mut_seq,ddG,group
v2_train_df = pd.read_csv("./data/jinyuan_sun/v2/train.csv")

# "group" exists only in the train split; drop it so both splits share the
# same columns before they are stacked.
v2_train_df = v2_train_df.drop(columns=["group"])

# Test rows first, then train rows, with a fresh 0..n-1 index.
jinyuan_sun_df = pd.concat([v2_test_df, v2_train_df], ignore_index=True)
jinyuan_sun_df = jinyuan_sun_df.rename(
    columns={"pdb": "PDB_wild", "wt_seq": "wildtype_seq", "mut_seq": "mutated_seq"}
)

def process_jinyuan_sun(row):
    """Fill ``mutation_code`` and ``mutation_sequence_code`` for one row.

    Both codes are ``wildtype`` + position + ``mutation``; ``mutation_code``
    uses ``pdb_resseq`` as the position and ``mutation_sequence_code`` uses
    ``seq_index``.

    Args:
        row: mapping-like record with ``wildtype``, ``pdb_resseq``,
            ``seq_index`` and ``mutation`` entries.

    Returns:
        The same ``row`` object, modified in place.
    """
    wild, mutant = row["wildtype"], row["mutation"]
    position_sources = (
        ("mutation_code", "pdb_resseq"),
        ("mutation_sequence_code", "seq_index"),
    )
    for target, position_key in position_sources:
        row[target] = wild + str(row[position_key]) + mutant
    return row

# Build both mutation codes row by row, then drop the raw pieces they were
# assembled from.
jinyuan_sun_df = (
    jinyuan_sun_df
    .apply(process_jinyuan_sun, axis=1)
    .drop(columns=["wildtype", "pdb_resseq", "seq_index", "mutation"])
)

# Any shared-schema column the source lacks is added as an all-NaN column.
for name in (col for col in COLUMNS if col not in jinyuan_sun_df.columns):
    jinyuan_sun_df[name] = np.nan

# Keep only the shared schema and export.
jinyuan_sun_df = jinyuan_sun_df.loc[:, COLUMNS]
jinyuan_sun_df.to_csv("./data/main_dataset/jinyuan_sun.csv", index=False)

In [41]:
# CSV obtained by searching all values with "has_ddg".
fire_df = pd.read_csv("./data/FireProtDB/fireprotdb_has_ddg_is_curated.csv")

# Keep only curated data.
fire_df = fire_df[fire_df["is_curated"]]

# Drop rows that are identical on the measurement/mutation columns.
# NOTE (original author): this removes about 14k entries.
duplicate_subset = ["pdb_id", "dTm", "ddG", "chain", "wild_type", "position", "mutation", "sequence"]
fire_df = fire_df.drop_duplicates(duplicate_subset)

# Collapse repeated ids inside a '|'-separated pdb_id (the '|' is kept for
# now). dict.fromkeys keeps the first occurrence of each id in its original
# position; the previous set() round-trip made the order of the joined ids
# depend on string-hash randomization, so it could differ between runs.
# Non-string values (e.g. NaN for a missing id) are passed through unchanged
# instead of raising TypeError.
fire_df["pdb_id"] = fire_df["pdb_id"].apply(
    lambda x: "|".join(dict.fromkeys(x.split("|"))) if isinstance(x, str) else x
)

# Remove columns without useful information.
fire_df = fire_df.drop(columns=['is_curated', 'is_essential', 'is_back_to_consensus', 'method_details',
                                'technique_details', 'notes', 'publication_doi', 'publication_pubmed'])

# Align the source column names with the shared-schema naming.
fire_df = fire_df.rename(columns={"pdb_id": "PDB_wild", "chain": "mutated_chain", "tm": "Tm"})

# Any shared-schema column the source lacks is added as an all-NaN column.
for name in COLUMNS:
    if name not in fire_df.columns.to_list():
        fire_df[name] = np.nan

def process_fire(row):
    """Fill ``mutation_code`` for one FireProtDB row.

    The code is the concatenation of ``wild_type``, ``position`` (converted
    with ``str``) and ``mutation``.

    Args:
        row: mapping-like record with ``wild_type``, ``position`` and
            ``mutation`` entries.

    Returns:
        The same ``row`` object, modified in place.
    """
    residue, site, substitution = row["wild_type"], row["position"], row["mutation"]
    row["mutation_code"] = residue + str(site) + substitution
    return row

# Build mutation_code row by row, then drop the raw pieces it was assembled
# from.
fire_df = fire_df.apply(process_fire, axis=1)
fire_df = fire_df.drop(columns=["wild_type", "position", "mutation"])

# Shared-schema columns first, then the remaining source columns in their
# current order. The previous set difference made the order of the extra
# columns depend on string-hash randomization, so the CSV layout could change
# from one kernel run to the next.
fire_extra_cols = [col for col in dict.fromkeys(fire_df.columns) if col not in COLUMNS]
fire_df = fire_df[COLUMNS + fire_extra_cols]
fire_df.to_csv("./data/main_dataset/firedb.csv", index=False)
