# Creating a cross id db for pdb uniprot and basic infos abt the protein
see [./data/sources.csv](./data/sources.csv) for the different sources where we were able to download each datasets

This notebook reuse the code in [individual_datasets_creation](./individual_datasets_creation.ipynb) NB.
The goal is to have a helping DB for merging, duplicate search and already known sequence search

In [28]:
import pandas as pd
import numpy as np
import json
import pypdb
import urllib.request

In [29]:
COLUMNS = ["uniprot", "PDB_wild", "sequence", "length", "molWeight", "countByFeatureType", "chain_start", "chain_end"]
# "mutation_sequence_code"
# turn to False if you want to save all available info in the db
SAVE_ONLY_COLUMNS = True

main_df = pd.DataFrame(columns=COLUMNS)

def add_missing_column(df):
    for name in COLUMNS:
        if name not in df.columns.to_list():
            df[name] = np.nan
    
    return df

def save_df(df, name):
    if SAVE_ONLY_COLUMNS:
        df = df[COLUMNS]
    df.to_csv(f"./data/main_dataset/{name}.csv", index=False)

def add_infos(main_df, df):
    if SAVE_ONLY_COLUMNS:
        df = df[COLUMNS]
    main_df = pd.concat([main_df, df], ignore_index=True)
    main_df = main_df.drop_duplicates()
    main_df.reset_index(inplace=True)
    main_df.drop(columns="index", inplace=True)
    return main_df

In [30]:
##### Ssym+ #####

# load Ssym+ and remove predictions and index columns
columns_experimental = ["Protein","Mut_pdb","DDG_dir","DDG_inv","DDG","Ph","T","Mut_Seq","Protein_inv","Mut_pdb_inv","Mut_Seq_inv"]
ssym_dir_df = pd.read_csv("./data/Ssym+/Ssym+_experimental.csv")[columns_experimental]


ssym_dir_df = add_missing_column(ssym_dir_df)

def process_prot_name(row):
    name = row["Protein"]
    row["PDB_wild"]=name[:-1].upper()
    row["mutated_chain"]=name[-1]
    return row

ssym_dir_df = ssym_dir_df.apply(process_prot_name, axis=1)

main_df = add_infos(main_df, ssym_dir_df)
print(len(main_df))

19


In [31]:
##### ThermoMutDB #####

thermomut_df = pd.read_json("./data/ThermoMutDB/thermomutdb.json")

thermomut_df = add_missing_column(thermomut_df)
main_df = add_infos(main_df, thermomut_df)
print(len(main_df))

686


In [32]:
##### O2567 #####

o2567_df = pd.read_csv("./data/O2567_new/O2567_new.csv")
o2567_df = o2567_df.rename(columns={"PDB code": "PDB_wild"})

o2567_df = add_missing_column(o2567_df)
main_df = add_infos(main_df, o2567_df)
print(len(main_df))

786


In [33]:
##### Jynyuan Sun's dataset (Phd. student in bio) #####

"pdb,wildtype,pdb_resseq,seq_index,mutation,wt_seq,mut_seq,ddG"
v2_test_df = pd.read_csv("./data/jinyuan_sun/v2/test.csv")
"pdb,wildtype,pdb_resseq,seq_index,mutation,wt_seq,mut_seq,ddG,group"
v2_train_df = pd.read_csv("./data/jinyuan_sun/v2/train.csv")

v2_train_df.drop(columns=["group"], axis=1, inplace=True)

jinyuan_sun_df = pd.concat([v2_test_df, v2_train_df], ignore_index=True)
jinyuan_sun_df.rename(columns={"pdb": "PDB_wild", "wt_seq": "sequence"})


jinyuan_sun_df = add_missing_column(jinyuan_sun_df)
main_df = add_infos(main_df, jinyuan_sun_df)
print(len(main_df))

787


In [34]:
##### FireProtDB #####

# csv obtained by searching all values with "has_ddg"
fire_df = pd.read_csv("./data/FireProtDB/fireprotdb_has_ddg_or_dtm_is_curated.csv")

# Keep only curated data
fire_df = fire_df[fire_df["is_curated"]]

# Drop duplicate rows
# THIS REMOVE 14k+ entries !!!!
duplicate_subset = ["pdb_id", "dTm", "ddG", "chain", "wild_type", "position", "mutation", "sequence"]
fire_df = fire_df.drop_duplicates(duplicate_subset)

# Drop rows with no valid pdb files ~10-15
fire_df = fire_df[~pd.isna(fire_df.pdb_id)]
# Remove weird duplicate pdb ids in the pdb_id column (keep '|' for now)
fire_df["pdb_id"] = fire_df["pdb_id"].apply(lambda x: x if "|" not in x else "|".join(list(set(x.split("|")))))

# fire_df.columns.to_list()
fire_df.rename(columns={"pdb_id": "PDB_wild", "uniprot_id": "uniprot"}, inplace=True)

fire_df = add_missing_column(fire_df)
main_df = add_infos(main_df, fire_df)
print(len(main_df))

980


In [35]:
##### DeepDDG dataset #####
train_df = pd.read_csv("./data/DeepDDG_train_dataset/datasetDDG_train.csv")
test_df = pd.read_csv("./data/DeepDDG_train_dataset/datasetDDG_test.csv")

train_df.drop(columns="Fold ID in 10-fold cross validation", axis=1, inplace= True)
deepddg_df = pd.concat([train_df, test_df], ignore_index=True)
deepddg_df.rename(columns={"PDB ID with modifications to be made": "PDB_wild", "Uniprot ID": "uniprot"}, 
                            inplace= True)

deepddg_df = add_missing_column(deepddg_df)
main_df = add_infos(main_df, fire_df)
print(len(main_df))

980


In [36]:
##### ProThermDB #####
all_ddg_df = pd.read_csv("./data/ProThermDB/prothermdb_all_ddg.tsv", sep="\t")
all_dtm_df = pd.read_csv("./data/ProThermDB/prothermdb_all_dTm.tsv", sep="\t")
all_tm_df = pd.read_csv("./data/ProThermDB/prothermdb_all_Tm.tsv", sep="\t")

prothermdb_df = pd.concat([all_ddg_df, all_dtm_df, all_tm_df], ignore_index=True)

# removes duplicate: ~400 rows
prothermdb_df = prothermdb_df.drop_duplicates()

prothermdb_df.rename(columns={"UniProt_ID": "uniprot"}, 
                    inplace= True)

prothermdb_df = add_missing_column(prothermdb_df)

def process_prothermdb(row):
    for k in row.keys():
        if row[k]=='-':
            row[k]=""
    
    # coherent PDB_WILD: all caps
    row["PDB_wild"] = row["PDB_wild"].upper()
    row["uniprot"] = row["uniprot"].replace(" ", "")
    return row

prothermdb_df = prothermdb_df.apply(process_prothermdb, axis=1)
main_df = add_infos(main_df, fire_df)
print(len(main_df))

980


In [38]:
save_df(main_df, "tmp_pdb_uniprot_db")