# Dataset creation
![flowchart](./resources/dataset_creation_flowchart.drawio.png)

In [1]:
import json
import pandas as pd
import numpy as np
import urllib.request
from utils.file_utils import open_json, write_json
from utils.pdb_uniprot import correct_mutation_position


In [None]:
# Names (and order) of the columns kept in the final dataframe: columns absent
# from a source are created by add_missing_column, and the frame is then
# restricted to exactly this list.
COLUMNS = [
    "pdbs",
    "uniprot",
    "wild_aa",
    "mutation_position",
    "mutated_aa",
    "pH",
    "sequence",
    "length",
    "chain_start",
    "chain_end",
    "AlphaFoldDB",
    "Tm",
    "ddG",
    "dTm",
]


In [None]:
def get_uniprot_infos_online(uniprot: str):
    """Fetch the infos of one UniProt entry from the UniProt REST API.

    The entry is downloaded as JSON from
    ``https://rest.uniprot.org/uniprotkb/<uniprot>.json`` and reduced to the
    few fields used by the dataset.

    Args:
        uniprot: accession inserted verbatim into the request URL.

    Returns:
        A dict with the keys ``"pdbs"``, ``"sequence"``, ``"length"``,
        ``"chain_start"``, ``"chain_end"`` and ``"AlphaFoldDB"``.
        ``"pdbs"`` and ``"AlphaFoldDB"`` are space-separated id strings
        (empty when there is no cross-reference); ``"chain_start"`` and
        ``"chain_end"`` are ``None`` when the entry has no "Chain" feature.
        An empty dict is returned when the request fails or when the entry
        carries no sequence.
    """
    try:
        with urllib.request.urlopen(f"https://rest.uniprot.org/uniprotkb/{uniprot}.json") as url:
            data = json.load(url)
    except Exception as e:
        # Best effort: a failed lookup must not abort the processing of a
        # whole dataset, so the error is reported and the entry is skipped.
        print(f"exception raised for {uniprot}: {e}")
        return {}

    sequence = (data.get("sequence") or {}).get("value")
    if not sequence:
        # Without a sequence there is nothing to validate a mutation against
        # (and len(None) would raise), so behave like a failed lookup.
        print(f"no sequence found for {uniprot}")
        return {}

    features = data.get("features") or []
    # Only the first "Chain" feature is used.
    chain_location = next(
        (x for x in features if x.get("type") == "Chain"), {}).get("location", {})

    databases = data.get("uniProtKBCrossReferences") or []

    def ids_of(database: str) -> str:
        # Space-separated ids of every cross-reference to `database`.
        return " ".join(x["id"] for x in databases
                        if x.get("database") == database and "id" in x)

    return {
        "pdbs": ids_of("PDB"),
        "sequence": sequence,
        "length": len(sequence),
        "chain_start": chain_location.get("start", {}).get("value"),
        "chain_end": chain_location.get("end", {}).get("value"),
        "AlphaFoldDB": ids_of("AlphaFoldDB"),
    }


def valid_uniprot(uniprot: str, local_uniprot_infos: dict, wild_aa: str,
                  mutation_position: int, dataset_config: dict):
    """Return the infos of `uniprot` if the mutation is consistent with them.

    The infos are read from `local_uniprot_infos` and, when the cache has no
    (or an empty) entry, fetched with get_uniprot_infos_online and stored in
    the cache. The mutation position is then passed through
    correct_mutation_position.

    Args:
        uniprot: accession used as cache key and for the online lookup.
        local_uniprot_infos: cache mapping accession -> infos dict; updated
            in place when an online lookup is made.
        wild_aa: wild-type amino acid, forwarded to
            correct_mutation_position.
        mutation_position: position as given by the source dataset.
        dataset_config: must provide ``["positions"]["add_chain_start"]`` and
            ``["positions"]["position_offset"]``.

    Returns:
        ``(data, local_uniprot_infos)``. `data` is a new dict holding the
        cached infos plus the corrected ``"mutation_position"``, or an empty
        dict when no sequence is available or when
        correct_mutation_position returns ``None``.
    """
    data = local_uniprot_infos.get(uniprot, {})

    if not data:
        data = get_uniprot_infos_online(uniprot)
        local_uniprot_infos[uniprot] = data

    # Validate before doing any work: without a sequence the position cannot
    # be checked.
    if not data.get("sequence"):
        return {}, local_uniprot_infos

    positions_config = dataset_config["positions"]
    # "chain_start" can be present but None (entry without a Chain feature),
    # in which case .get's default would not apply; fall back to 0 explicitly.
    chain_start = (data.get("chain_start") or 0) \
        if positions_config["add_chain_start"] else 0

    mutation_position = correct_mutation_position(wild_aa, mutation_position,
                                                  data["sequence"],
                                                  chain_start,
                                                  positions_config["position_offset"])
    if mutation_position is None:
        return {}, local_uniprot_infos

    # Build a new dict instead of writing into `data`: `data` is the object
    # stored in the cache, and a row-specific position must not leak into it.
    return {**data, "mutation_position": mutation_position}, local_uniprot_infos


def apply_valid_uniprot(row, local_uniprot_infos, dataset_config):
    """Copy the validated UniProt infos of a row's protein into that row.

    `row` must expose the keys "uniprot", "wild_aa" and "mutation_position";
    they are handed to valid_uniprot together with the cache and the dataset
    configuration. The row is modified in place and returned.
    """
    validated, _ = valid_uniprot(
        row["uniprot"],
        local_uniprot_infos,
        row["wild_aa"],
        row["mutation_position"],
        dataset_config,
    )

    # NB: valid_uniprot returns an empty dict when the infos are not valid,
    # so in that case nothing is copied and the row comes back unchanged.
    for key in validated:
        row[key] = validated[key]

    return row

# df = pd.read_csv("./data/main_dataset_creation/uniprot_infos.csv")
# df = df.fillna("")
# local_uniprot_infos = dict(zip(df["uniprot"],df.to_dict("records")))
# dataset_config = open_json("./individual_dataset_config.json")["fireprotdb"]
# valid_uniprot("P07170", local_uniprot_infos, "M", 1, dataset_config)


In [None]:
def add_missing_column(df):
    """Add to `df` every column of COLUMNS it lacks, filled with NaN.

    The dataframe is modified in place and also returned.
    """
    absent = [column for column in COLUMNS
              if column not in df.columns.to_list()]
    for column in absent:
        df[column] = np.nan
    return df


def save_df(df, name):
    """Write `df` to ./data/main_dataset/<name>.csv without the index."""
    destination = f"./data/main_dataset/{name}.csv"
    df.to_csv(destination, index=False)


In [7]:
##### FireProtDB #####

# load csv
df = pd.read_csv(
    "./data/FireProtDB/fireprotdb_has_ddg_or_dtm_is_curated.csv")
# rename the source columns to the names used in COLUMNS
df = df.rename(columns={"pdb_id": "pdbs",
                        "uniprot_id": "uniprot",
                        "wild_type": "wild_aa",
                        "position": "mutation_position",
                        "mutation": "mutated_aa",
                        "tm": "Tm"})
# add missing ones
df = add_missing_column(df)
# keep only COLUMNS and drop duplicates; reassigning (instead of calling
# drop_duplicates(inplace=True) on the column subset) avoids mutating a frame
# that pandas may flag as a copy of a slice (SettingWithCopyWarning)
df = df[COLUMNS].drop_duplicates()


MSTKKKPLTQEQLEDARRLKAIYEKKKNELGLSQESVADKMGMGQSGVGALFNGINALNAYNAALLAKILKVSVEEFSPSIAREIYEMYEAVSMQPSLRSEYEYPVFSHVQAGMFSPELRTFTKGDAERWVSTTKKASDSAFWLEVEGNSMTAPTGSKPSFPDGMLILVDPEQAVEPGDFCIARLGGDEFTFKKLIRDSGQVFLQPLNPQYPMIPCNESCSVVGKVIASQWPEETFG


{'sequence': 'MSTKKKPLTQEQLEDARRLKAIYEKKKNELGLSQESVADKMGMGQSGVGALFNGINALNAYNAALLAKILKVSVEEFSPSIAREIYEMYEAVSMQPSLRSEYEYPVFSHVQAGMFSPELRTFTKGDAERWVSTTKKASDSAFWLEVEGNSMTAPTGSKPSFPDGMLILVDPEQAVEPGDFCIARLGGDEFTFKKLIRDSGQVFLQPLNPQYPMIPCNESCSVVGKVIASQWPEETFG',
 'mutation_position': 0,
 'length': 237,
 'chain_start': 2,
 'chain_end': 237,
 'AlphaFoldDB': '',
 'pdbs': '1F39 1KCA 1LLI 1LMB 1LRP 1RIO 3BDN 3KZ3 3WOA 5ZCA 7JVT'}