# Datasets Merging
see [./data/sources.csv](./data/sources.csv) for the different sources where we were able to download each datasets

Simply put there exist a few dataset containing information about the impact of mutations on protein stability.
The main 3 big dataset that exist as of Oct. 2022 are:
- FireProtDB
- ThermoMutDB
- ProThermDB

Which all incorporates data from the same old DB (ProTherm) and additionnal data added by each DB devs.
There exists also other datasets, that were cited in papers from the litterature.
We can expect a lot of redundancies between each datasets.


The goal of this Notebook is to compare the different datasets created in a [former notebook](./individual_datasets_creation.ipynb) and merge them all together.

In [1]:
import os
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm

from utils.pdb_uniprot import get_uniprot_infos, convert_columns, add_uniprot_infos, get_uniprot_infos, correct_mutation_code, apply_correct_mutation_code

COLUMNS = ["PDB_wild", "uniprot", "mutated_chain", "mutation_code", "pH", "Texp", "Tm", "ddG", "dTm"]

Texp_range = [20+273.15, 30+273.15]
# , "mutation_sequence_code", 
# we don't put "PDB_WILD" in merge_cols because it's not necessarily unique
merge_cols = ["uniprot","mutated_chain","mutation_code","pH"]
target_cols = ["Tm", "ddG", "dTm"]
uniprot_infos_cols = ['sequence', 'length', 'molWeight', 'countByFeatureType', 'chain_start', 'chain_end', 'AlphaFoldDB']

RECOMPUTE_FIREDB = True


In [2]:
if RECOMPUTE_FIREDB:
    # we will start with the fireprotdb datasets, which contains only data with either ddG or dTm
    main_df = pd.read_csv("./data/main_dataset/fireprotdb_ddg_dtm_curated.csv")
    # for now we want to compare, so we only take the COLUMNS, 
    # as they are the same accross all file from ./data/main_dataset

    main_df = main_df[COLUMNS]
    main_df = convert_columns(main_df)
    main_df["dataset_source"] = "FireProtDB"
    l = len(main_df)

    #### Cleaning dataset ####
    # rm duplicates
    main_df.drop_duplicates()
    print(f"rm {l-len(main_df)} duplicates")
    # add uniprot infos
    main_df = add_uniprot_infos(main_df)
    main_df.to_csv("./data/main_dataset/main.csv", index=False)
else:
    main_df=pd.read_csv("./data/main_dataset/main.csv")

main_df.head()

rm 0 duplicates


Unnamed: 0,PDB_wild,uniprot,mutated_chain,mutation_code,pH,Texp,Tm,ddG,dTm,dataset_source,sequence,length,molWeight,countByFeatureType,chain_start,chain_end,AlphaFoldDB
0,1CQW,P59336,A,V245L,,,52.5,,2.1,FireProtDB,MSEIGTGFPFDPHYVEVLGERMHYVDVGPRDGTPVLFLHGNPTSSY...,294.0,33331.0,"{'Chain': 1, 'Domain': 1, 'Active site': 3, 'B...",1.0,294.0,P59336
1,1CQW,P59336,A,L95V,,,50.0,,-0.4,FireProtDB,MSEIGTGFPFDPHYVEVLGERMHYVDVGPRDGTPVLFLHGNPTSSY...,294.0,33331.0,"{'Chain': 1, 'Domain': 1, 'Active site': 3, 'B...",1.0,294.0,P59336
2,1CQW,P59336,A,C176F,,,55.6,,5.2,FireProtDB,MSEIGTGFPFDPHYVEVLGERMHYVDVGPRDGTPVLFLHGNPTSSY...,294.0,33331.0,"{'Chain': 1, 'Domain': 1, 'Active site': 3, 'B...",1.0,294.0,P59336
3,1CQW,P59336,A,G171Q,,,53.5,,3.1,FireProtDB,MSEIGTGFPFDPHYVEVLGERMHYVDVGPRDGTPVLFLHGNPTSSY...,294.0,33331.0,"{'Chain': 1, 'Domain': 1, 'Active site': 3, 'B...",1.0,294.0,P59336
4,1CQW,P59336,A,T148L,,,51.5,,1.1,FireProtDB,MSEIGTGFPFDPHYVEVLGERMHYVDVGPRDGTPVLFLHGNPTSSY...,294.0,33331.0,"{'Chain': 1, 'Domain': 1, 'Active site': 3, 'B...",1.0,294.0,P59336


In [3]:
def clean_df(df: pd.DataFrame):
    df = convert_columns(df, with_infos=True)
    # Drop rows where we don't have ddG or dTm
    df = df[~(pd.isna(df.ddG) & pd.isna(df.dTm))]
    # fill NaN cells
    df.chain_start.fillna(0, inplace=True)
    df.sequence.fillna("", inplace=True)
    df.chain_end.fillna(-1, inplace=True)
    # correct the mutation code when needed, rm wrong ones 
    df = apply_correct_mutation_code(df)
    df = df.loc[~df.mutation_code.eq("")]
    # drop duplicates
    df.drop_duplicates(inplace=True)
    
    # - (bonus) see if our uniprot_infos works correctly
    # TODO
    
    return df

In [4]:
#### Check FireProt ####
if RECOMPUTE_FIREDB:
    l = len(main_df)
    main_df = clean_df(main_df)
    print(f"cleaned {l-len(main_df)} rows")
    main_df.to_csv("./data/main_dataset/main.csv", index=False)

main_df.head()


cleaned 371 rows


Unnamed: 0,PDB_wild,uniprot,mutated_chain,mutation_code,pH,Texp,Tm,ddG,dTm,dataset_source,sequence,length,molWeight,countByFeatureType,chain_start,chain_end,AlphaFoldDB
0,1CQW,P59336,A,V244L,,,52.5,,2.1,FireProtDB,MSEIGTGFPFDPHYVEVLGERMHYVDVGPRDGTPVLFLHGNPTSSY...,294.0,33331.0,"{'Chain': 1, 'Domain': 1, 'Active site': 3, 'B...",1.0,294.0,P59336
1,1CQW,P59336,A,L96V,,,50.0,,-0.4,FireProtDB,MSEIGTGFPFDPHYVEVLGERMHYVDVGPRDGTPVLFLHGNPTSSY...,294.0,33331.0,"{'Chain': 1, 'Domain': 1, 'Active site': 3, 'B...",1.0,294.0,P59336
2,1CQW,P59336,A,C175F,,,55.6,,5.2,FireProtDB,MSEIGTGFPFDPHYVEVLGERMHYVDVGPRDGTPVLFLHGNPTSSY...,294.0,33331.0,"{'Chain': 1, 'Domain': 1, 'Active site': 3, 'B...",1.0,294.0,P59336
3,1CQW,P59336,A,G170Q,,,53.5,,3.1,FireProtDB,MSEIGTGFPFDPHYVEVLGERMHYVDVGPRDGTPVLFLHGNPTSSY...,294.0,33331.0,"{'Chain': 1, 'Domain': 1, 'Active site': 3, 'B...",1.0,294.0,P59336
4,1CQW,P59336,A,T147L,,,51.5,,1.1,FireProtDB,MSEIGTGFPFDPHYVEVLGERMHYVDVGPRDGTPVLFLHGNPTSSY...,294.0,33331.0,"{'Chain': 1, 'Domain': 1, 'Active site': 3, 'B...",1.0,294.0,P59336


In [5]:
# add infos and clean Thermomut #

path="./data/main_dataset/thermomut.csv"
dataset_src = path.split('/')[-1][:-4]
print(dataset_src)
df = pd.read_csv(path)

df = df[COLUMNS]
if "thermomut" in path:
    # thermomutdb use an inversed sign for ddg
    df['ddG'] = df['ddG'].apply(lambda x: x*-1)

# remove multiple mutations
df = df.loc[~df.mutation_code.str.contains(" ")]
# add uniprot infos
df = add_uniprot_infos(df)
l = len(df)
df = clean_df(df)
print(f"cleaned {l-len(df)} rows")

thermomut
error occured, mutation code cannot be parsed, mutation_code: L67AL
error occured, mutation code cannot be parsed, mutation_code: L66LA
error occured, mutation code cannot be parsed, mutation_code: T27TG
error occured, mutation code cannot be parsed, mutation_code: AVIG
error occured, mutation code cannot be parsed, mutation_code: AIIG
cleaned 3946 rows


In [6]:
# l1, l2 = len(df), len(main_df)
# # Drop rows that are already in main_df by merging and keeping only 'left' (df) values
# unique_val_df = (pd.merge(df,main_df, indicator=True, how='left', on=merge_cols)
#                     .query('_merge=="left_only"')
#                     .drop('_merge', axis=1))
# # drops duplicated columns
# unique_val_df = unique_val_df.loc[:,~unique_val_df.columns.duplicated()]
# # adds the dataset source
# unique_val_df["dataset_source"] = dataset_src
# print(f"{l1-len(unique_val_df)}/{l1} rows drop from {dataset_src} as redundancies with the main_df")
# main_df = pd.concat([main_df, unique_val_df], ignore_index=True)
# print(f"added {len(main_df)-l2} from {dataset_src}")

# adds the dataset source
df["dataset_source"] = dataset_src
l1 = len(main_df)
main_df = pd.concat([main_df, df], ignore_index=True)
l2 = len(main_df)
main_df.drop_duplicates(merge_cols, keep='first', inplace=True)

print(f"added {len(main_df)-l1} from {dataset_src}\n removed {l2-l1} redundancies")

added 317 from thermomut
 removed 7255 redundancies


In [7]:
# add infos and clean ProThermDB #

path="./data/main_dataset/prothermdb.csv"
dataset_src = path.split('/')[-1][:-4]
print(dataset_src)
df = pd.read_csv(path)

df = df[COLUMNS]
# remove empty mutation code:
df = df.loc[~df.mutation_code.eq("")]
df = df.loc[~df.mutation_code.isna()]
# remove multiple mutations
df = df.loc[~df.mutation_code.str.contains(" ")]
# add uniprot infos
df = add_uniprot_infos(df)
l = len(df)
df = clean_df(df)
print(f"cleaned {l-len(df)} rows")

prothermdb
error occured, mutation code cannot be parsed, mutation_code: D23K,Y103M,E114Q,S169T,Q180A,S187D,A193V,V208L,D210V,K217Q
cleaned 2177 rows


In [8]:
df.to_csv("test.csv", index=False)

In [9]:
# Drop rows that are already in main_df by merging and keeping only 'left' (df) values
# unique_val_df = (pd.merge(df,main_df, indicator=True, how='left', on=merge_cols)
#                     .query('_merge=="left_only"')
#                     .drop('_merge', axis=1))
# # drops duplicated columns
# unique_val_df = unique_val_df.loc[:,~unique_val_df.columns.duplicated()]

# adds the dataset source
df["dataset_source"] = dataset_src

l1 = len(main_df)
main_df = pd.concat([main_df, df], ignore_index=True)
l2 = len(main_df)
main_df.drop_duplicates(merge_cols, keep='first', inplace=True)
print(f"added {len(main_df)-l1} from {dataset_src}\n removed {l2-l1} redundancies")

added 237 from prothermdb
 removed 4983 redundancies


In [10]:
main_df.to_csv(f"./data/main_dataset/main.csv", index=False)
print(len(main_df))

9988


In [11]:
main_df = main_df.drop_duplicates(merge_cols)
print(len(main_df))
main_df.to_csv(f"./data/main_dataset/main.csv", index=False)

9988
