### Notebook Purpose

**Loading data from the GoFCards dataset and transforming it. Specifically, log-normalizing the P-score column and inserting columns relating to mechanistic predictions in the form of a 4D feature vector.**

In [1]:
import pandas as pd
import numpy as np

In [9]:
# Section 1 -> Read the GoFCards Excel export into a DataFrame.
# The path is relative to the notebook's working directory.
df = pd.read_excel(io="gofcards_data_download.xlsx")

# Preview the first rows to check that the file loaded.
df.head()

Unnamed: 0,Order numbe,genesymbol,transcript,chr,hg19start,hg19end,ref,alt,Function,Pathways proteins involved,Disorder involved,PMID,Animal model,Cell model,Pscore
0,1,TP53,TP53.cAug10,17,7569552,7569552,G,A,Mutations in the p53 gene in cancer cells not ...,P53EGFR (Epidermal Growth Factor Receptor) sig...,Gastric cancer,2950500,N,N,2.523719
1,2,ZAP70,NM_001079,2,98350044,98350044,A,T,Enhanced the ability of ZAP70 to reconstitute ...,The antigen receptormediated signal transducti...,_,8943331,N,Y,3.0
2,3,RET,NM_020975,10,43613840,43613840,G,C,_,The activation of the Ras pathway,Familial medullary thyroid carcinoma (FMTC),9242375,N,Y,3.0
3,4,RET,NM_020975,10,43614996,43614996,G,C,_,The activation of the Ras pathway,Familial medullary thyroid carcinoma (FMTC),9242375,N,Y,3.0
4,5,CASR,NM_000388,3,122003119,122003119,T,G,The phenotype can vary from mild to life threa...,_,Sporadic hypoparathyroidism,9253358,N,Y,3.0


In [13]:
# Section 2 -> Log Normalization of P-score. 

def log_normalize_pscore(pscore, max_pscore):
    """Normalize a P-score on a log scale relative to a maximum P-score.

    Computes ``log2(1 + pscore) / log2(1 + max_pscore)``. A score equal to
    ``max_pscore`` maps to 1.0 and positive scores below it fall in (0, 1).
    A ``pscore`` larger than ``max_pscore`` is not clipped and yields a
    value greater than 1.

    Parameters
    ----------
    pscore : scalar
        Raw P-score of a single record.
    max_pscore : scalar
        Reference maximum used as the normalization ceiling.

    Returns
    -------
    float
        The normalized score, or 0.0 when ``pscore`` is missing or
        non-positive, or when ``max_pscore`` is missing or non-positive.
    """
    # Missing or non-positive scores are mapped to zero.
    if pd.isna(pscore) or pscore <= 0:
        return 0.0
    # log2(1 + max_pscore) is zero at max_pscore == 0 and negative or
    # undefined below that; return zero rather than inf/nan/negative values.
    if pd.isna(max_pscore) or max_pscore <= 0:
        return 0.0
    return np.log2(1 + pscore) / np.log2(1 + max_pscore)

In [17]:
# Section 3 -> Apply DataFrame transformations.

# Largest raw P-score in the loaded data; serves as the normalization ceiling.
max_pscore = df["Pscore"].max()

# Constant mechanism flags: every row gets LoF=0, GoF=1, DN=0.
df["LoF"] = 0
df["GoF"] = 1
df["DN"] = 0

# Element-wise log normalization; max_pscore is forwarded as the second argument.
df["Pscore_Normalized"] = df["Pscore"].apply(log_normalize_pscore, args=(max_pscore,))

# Pack the three flags and the normalized score into one list per row.
df["Mechanism_Vector_4D"] = df.apply(
    lambda row: [row[column] for column in ("LoF", "GoF", "DN", "Pscore_Normalized")],
    axis=1,
)

In [21]:
# Section 4 -> Preview the transformed data frame.
# Show only the identifying columns, the raw and normalized P-score,
# the mechanism flags and the combined vector, for the first 20 rows.
df.loc[
    :,
    [
        "genesymbol",
        "Disorder involved",
        "Pscore",
        "Pscore_Normalized",
        "LoF",
        "GoF",
        "DN",
        "Mechanism_Vector_4D",
    ],
].head(20)


Unnamed: 0,genesymbol,Disorder involved,Pscore,Pscore_Normalized,LoF,GoF,DN,Mechanism_Vector_4D
0,TP53,Gastric cancer,2.523719,0.373808,0,1,0,"[0, 1, 0, 0.37380848690658947]"
1,ZAP70,_,3.0,0.411434,0,1,0,"[0, 1, 0, 0.41143439101965634]"
2,RET,Familial medullary thyroid carcinoma (FMTC),3.0,0.411434,0,1,0,"[0, 1, 0, 0.41143439101965634]"
3,RET,Familial medullary thyroid carcinoma (FMTC),3.0,0.411434,0,1,0,"[0, 1, 0, 0.41143439101965634]"
4,CASR,Sporadic hypoparathyroidism,3.0,0.411434,0,1,0,"[0, 1, 0, 0.41143439101965634]"
5,CASR,Sporadic hypoparathyroidism,3.0,0.411434,0,1,0,"[0, 1, 0, 0.41143439101965634]"
6,TSHR,Multinodular goiter,3.0,0.411434,0,1,0,"[0, 1, 0, 0.41143439101965634]"
7,TSHR,Multinodular goiter,3.785579,0.464652,0,1,0,"[0, 1, 0, 0.4646520595242455]"
8,TSHR,Multinodular goiter,3.0,0.411434,0,1,0,"[0, 1, 0, 0.41143439101965634]"
9,TP53,LiFraumeni syndrome (LFS),28.061603,1.0,0,1,0,"[0, 1, 0, 1.0]"
