# Dataset creation
![flowchart](./resources/dataset_creation_flowchart.drawio.png)

In [1]:
import json
import pandas as pd
import numpy as np
import urllib.request
from utils.file_utils import open_json, write_json
from utils.pdb_uniprot import correct_mutation_position


In [2]:
COLUMNS = ["pdbs", "uniprot", "wild_aa", "mutation_position",
           "mutated_aa", "pH",
           "sequence", "length", "chain_start", "chain_end",
           "AlphaFoldDB", "Tm", "ddG", "dTm"]

LOCAL_UNIPROT_INFOS_PATH = "./data/main_dataset_creation/uniprot_infos.json"

In [3]:
def get_uniprot_infos_online(uniprot: str):
    """
    get uniprot infos from the web via a simple http request to their restAPI
    if the entry is wrong, or the data in uniprotDB is missing infos we return {}
    """
    try:
        with urllib.request.urlopen(f"https://rest.uniprot.org/uniprotkb/{uniprot}.json") as url:
            data = json.load(url)
    except Exception as e:
        print(f"exception raised for {uniprot}: {e}")
        return {}

    # some uniprot entries exists, but without the needed data
    if "sequence" not in data:
        return {}

    sequence = data.get("sequence", {}).get("value")
    features = data.get("features", [])
    chain_location = next(
        (x for x in features if x["type"] == "Chain"), {}).get("location", {})

    databases = data.get("uniProtKBCrossReferences", [])
    pdb_ids = " ".join([x["id"]
                        for x in databases if (x["database"] == "PDB")])

    return {
        "pdbs": pdb_ids,
        "sequence": sequence,
        "length": len(sequence),
        "chain_start": chain_location.get("start", {}).get("value", 0),
        "chain_end": chain_location.get("end", {}).get("value", 0),
        "AlphaFoldDB": " ".join([x["id"] for x in databases if (x["database"] == "AlphaFoldDB")]),
    }


def valid_uniprot(uniprot: str, local_uniprot_infos: dict, wild_aa: str,
                  mutation_position: int, dataset_config: dict, errors: dict):
    data = local_uniprot_infos.get(uniprot, {})
    """
    get uniprot data from either local storage or from web
    then check whether the info is coherent with the mutation 
    > sequence[position] = wild_aa
    applies offset on position according to the individual dataset config
    returns:
    - data: infos abt the requested uniprot (can be {} if none was found)
    - updated local_uniprot_infos
    - updated mutation_position (can be np.nan if none was found/incoherent)
    - 
    """

    if data == {}:
        errors["not_in_local"] += 1
        data = get_uniprot_infos_online(uniprot)
        local_uniprot_infos[uniprot] = data
        # validate the data
        if "sequence" not in data:
            errors["no_sequence_in_data"] += 1
            return {}, local_uniprot_infos, np.nan, errors
        # index start at 0 => chain start & end -1
        # this means that if one of those is -1 it means there were no data in uniprot DB
        data["chain_start"] -= 1
        data["chain_end"] -= 1

    mutation_position = correct_mutation_position(wild_aa, mutation_position,
                                                  data.get("sequence", ""),
                                                  data.get(
                                                      "chain_start", 0) if dataset_config["positions"]["add_chain_start"] else 0,
                                                  dataset_config["positions"]["position_offset"])
    if mutation_position is None:
        errors["wrong_position"] += 1
        return {}, local_uniprot_infos, np.nan, errors

    return data, local_uniprot_infos, mutation_position, errors


def apply_valid_uniprot(row, local_uniprot_infos: dict, dataset_config: dict, errors: dict):
    """helper function to apply valid_uniprot on a df"""

    # we get the valid infos
    data, local_uniprot_infos, mutation_position, errors = valid_uniprot(row["uniprot"], local_uniprot_infos,
                                                                         row["wild_aa"], row["mutation_position"],
                                                                         dataset_config, errors)
    # we update with the updated position (taking chain start and offset into account)
    row["mutation_position"] = mutation_position
    # we add each data values in the row
    for k, v in data.items():
        row[k] = v

    # NB: if infos are not valid

    return row


In [4]:
def add_missing_column(df):
    for name in COLUMNS:
        if name not in df.columns.to_list():
            df[name] = np.nan

    return df


def save_df(df, name):
    df.to_csv(f"./data/main_dataset/{name}.csv", index=False)


##### FireProtDB


In [5]:
local_uniprot_infos = open_json(LOCAL_UNIPROT_INFOS_PATH)
print(f"loaded {len(local_uniprot_infos)} uniprot infos from local storage")
errors = {
            "no_sequence_in_data": 0, 
            "not_in_local": 0, 
            "wrong_position": 0,
            "no_uniprot": 0
        }
dataset_config = open_json("./individual_dataset_config.json")["fireprotdb"]

# load csv
df = pd.read_csv(
    "./data/FireProtDB/fireprotdb_has_ddg_or_dtm_is_curated.csv")
# rename columns
df.rename(columns={"pdb_id": "pdbs",
                   "uniprot_id": "uniprot",
                   "wild_type": "wild_aa",
                   "position": "mutation_position",
                   "mutation": "mutated_aa",
                   "tm": "Tm"},
          inplace=True)
# add missing columns
df = add_missing_column(df)
# keep only COLUMNS
df = df[COLUMNS]
# drop duplicates
df.drop_duplicates(inplace=True)
# index start at 0
df["mutation_position"] = df["mutation_position"].apply(lambda x: x-1)
# check number of rows without uniprot
print(f"found {df.uniprot.isna().sum()} nan uniprot")
# check validity of uniprot, and add the infos for those
df = df.apply(lambda row: apply_valid_uniprot(
    row, local_uniprot_infos, dataset_config, errors), axis=1)
errors


loaded 5 uniprot infos from local storage
found 174 nan uniprot
exception raised for nan: HTTP Error 400: Bad Request
exception raised for nan: HTTP Error 400: Bad Request
exception raised for nan: HTTP Error 400: Bad Request
exception raised for nan: HTTP Error 400: Bad Request
exception raised for nan: HTTP Error 400: Bad Request
exception raised for nan: HTTP Error 400: Bad Request
exception raised for nan: HTTP Error 400: Bad Request
exception raised for nan: HTTP Error 400: Bad Request
exception raised for nan: HTTP Error 400: Bad Request
exception raised for nan: HTTP Error 400: Bad Request
exception raised for nan: HTTP Error 400: Bad Request
exception raised for nan: HTTP Error 400: Bad Request
exception raised for nan: HTTP Error 400: Bad Request
exception raised for nan: HTTP Error 400: Bad Request
exception raised for nan: HTTP Error 400: Bad Request
exception raised for nan: HTTP Error 400: Bad Request
exception raised for nan: HTTP Error 400: Bad Request
exception raised f

{'no_sequence_in_data': 385,
 'not_in_local': 571,
 'wrong_position': 11,
 'no_uniprot': 0}

In [6]:
write_json(LOCAL_UNIPROT_INFOS_PATH, local_uniprot_infos)


In [7]:
df

Unnamed: 0,pdbs,uniprot,wild_aa,mutation_position,mutated_aa,pH,sequence,length,chain_start,chain_end,AlphaFoldDB,Tm,ddG,dTm
0,1BN6 1BN7 1CQW,P59336,V,244.0,L,,MSEIGTGFPFDPHYVEVLGERMHYVDVGPRDGTPVLFLHGNPTSSY...,294.0,0.0,293.0,P59336,52.5,,2.1
1,1BN6 1BN7 1CQW,P59336,L,94.0,V,,MSEIGTGFPFDPHYVEVLGERMHYVDVGPRDGTPVLFLHGNPTSSY...,294.0,0.0,293.0,P59336,50.0,,-0.4
2,1BN6 1BN7 1CQW,P59336,C,175.0,F,,MSEIGTGFPFDPHYVEVLGERMHYVDVGPRDGTPVLFLHGNPTSSY...,294.0,0.0,293.0,P59336,55.6,,5.2
3,1BN6 1BN7 1CQW,P59336,G,170.0,Q,,MSEIGTGFPFDPHYVEVLGERMHYVDVGPRDGTPVLFLHGNPTSSY...,294.0,0.0,293.0,P59336,53.5,,3.1
4,1BN6 1BN7 1CQW,P59336,T,147.0,L,,MSEIGTGFPFDPHYVEVLGERMHYVDVGPRDGTPVLFLHGNPTSSY...,294.0,0.0,293.0,P59336,51.5,,1.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34408,1A2T 1A2U 1A3T 1A3U 1A3V 1AEX 1ENA 1ENC 1EQV 1...,P00644,L,206.0,E,9.0,MLVMTEYLLSAGICMAIVSILLIGMAISNVSKGQYAKRFFFFATSC...,231.0,63.0,230.0,P00644,,9.1,
34409,1A2T 1A2U 1A3T 1A3U 1A3V 1AEX 1ENA 1ENC 1EQV 1...,P00644,A,213.0,E,4.9,MLVMTEYLLSAGICMAIVSILLIGMAISNVSKGQYAKRFFFFATSC...,231.0,63.0,230.0,P00644,,5.8,
34410,1A2T 1A2U 1A3T 1A3U 1A3V 1AEX 1ENA 1ENC 1EQV 1...,P00644,A,213.0,E,6.0,MLVMTEYLLSAGICMAIVSILLIGMAISNVSKGQYAKRFFFFATSC...,231.0,63.0,230.0,P00644,,6.8,
34411,1A2T 1A2U 1A3T 1A3U 1A3V 1AEX 1ENA 1ENC 1EQV 1...,P00644,A,213.0,E,7.9,MLVMTEYLLSAGICMAIVSILLIGMAISNVSKGQYAKRFFFFATSC...,231.0,63.0,230.0,P00644,,8.2,
