### Notebook Purpose

**Transform the gene–disease data exported from the G2P database by adding mechanism-encoding columns: binary flags for loss of function (LoF), gain of function (GoF), and dominant negative (DN), plus a numeric confidence score derived from the G2P confidence category.**

In [29]:
import pandas as pd
import numpy as np

In [33]:
# Section 1 -> Loading data

# G2P export read from the working directory; the file name carries a
# 2025-03-08 date stamp.
g2p_eye_csv = "G2P_Eye_2025-03-08.csv"
df = pd.read_csv(g2p_eye_csv)

# Show the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,g2p id,gene symbol,gene mim,hgnc id,previous gene symbols,disease name,disease mim,disease MONDO,allelic requirement,cross cutting modifier,...,variant consequence,variant types,molecular mechanism,molecular mechanism categorisation,molecular mechanism evidence,phenotypes,publications,panel,comments,date of last review
0,G2P00001,HMX1,142992.0,5017,H6; NKX5-3,HMX1-related oculoauricular syndrome,612109,,biallelic_autosomal,,...,absent gene product,,loss of function,inferred,,HP:0000647; HP:0004328; HP:0000639; HP:0000482...,18423520; 25574057; 29140751,DD; Eye,,2019-09-26 16:23:46+00:00
1,G2P00008,YAP1,606608.0,16262,YAP-1; YAP65,"YAP1-related coloboma, ocular, with or without...",120433,,monoallelic_autosomal,,...,absent gene product,,loss of function,inferred,,HP:0000407; HP:0003829; HP:0002006; HP:0001249...,24462371; 27267789,DD; Eye,,2017-05-12 10:48:00+00:00
2,G2P00009,BBS9,607968.0,30000,B1; PTHB1,BBS9-related Bardet-Biedl syndrome,615986,,biallelic_autosomal,,...,absent gene product,,loss of function,inferred,,HP:0007707; HP:0001773; HP:0001162; HP:0000750...,16380913; 22353939,DD; Eye,,2025-01-21 22:09:55+00:00
3,G2P00010,TRIM32,602290.0,16380,BBS11; HT2A; LGMD2H; TATIP,TRIM32-related Bardet-Biedl syndrome,615988,,biallelic_autosomal,,...,uncertain,,undetermined,inferred,,HP:0007707; HP:0001773; HP:0001162; HP:0000750...,16606853,DD; Eye,,2025-01-21 22:12:35+00:00
4,G2P00011,PAX2,167409.0,8616,PAX-2,PAX2-related papillorenal syndrome,120330,,monoallelic_autosomal,,...,absent gene product,,loss of function,inferred,,HP:0000480; HP:0000588; HP:0002171; HP:0001144...,9760197; 3377002; 11461952; 2644560; 7795640; ...,DD; Eye,,2015-07-22 16:14:08+00:00


In [35]:
# Section 2 -> Key/Column mapping.

# (LoF, GoF, DN) indicator flags keyed by the lowercase mechanism label.
mechanism_map = dict(
    [
        ("loss of function", (1, 0, 0)),
        ("gain of function", (0, 1, 0)),
        ("dominant negative", (0, 0, 1)),
        ("undetermined", (0, 0, 0)),  # Default: No specific mechanism
    ]
)

# Ordinal score per lowercase confidence label: labels are listed from the
# highest score (5) down to the lowest (0).
confidence_map = {
    label: 5 - rank
    for rank, label in enumerate(
        ("definitive", "strong", "moderate", "limited", "disputed", "refuted")
    )
}

# Simplified inheritance label for each "allelic requirement" term.
# Lookups are case-sensitive (keys keep the upper-case X / Y).
allelic_requirement_map = {
    "biallelic_autosomal": "AR",
    "monoallelic_autosomal": "AD",
    "monoallelic_X": "X-linked",
    # G2P's allelic-requirement vocabulary distinguishes hemizygous and
    # heterozygous X-linked inheritance; both collapse to the same label.
    # NOTE(review): confirm against df["allelic requirement"].unique().
    "monoallelic_X_hemizygous": "X-linked",
    "monoallelic_X_heterozygous": "X-linked",
    "monoallelic_Y_hemizygous": "Y-linked",
    "mitochondrial": "mtDNA"
}

In [45]:
# Section 3 -> Helper functions.

def parse_mechanism_cat(mech_str):
    """Translate a molecular-mechanism label into (LoF, GoF, DN) flags.

    The label is looked up in ``mechanism_map`` after trimming surrounding
    whitespace and lower-casing. Missing values and labels absent from the
    map both yield ``(0, 0, 0)``.
    """
    no_mechanism = (0, 0, 0)
    if pd.isna(mech_str):
        return no_mechanism
    label = mech_str.strip().lower()
    return mechanism_map.get(label, no_mechanism)

def parse_confidence(conf_str):
    """Translate a confidence label into its numeric score.

    The label is looked up in ``confidence_map`` after trimming surrounding
    whitespace and lower-casing. Missing values and unrecognised labels
    both score 0.
    """
    if not pd.isna(conf_str):
        label = conf_str.strip().lower()
        return confidence_map.get(label, 0)
    return 0

def parse_allelic_requirement(ar_str):
    """Translate an allelic-requirement term into its simplified label.

    The term is trimmed of surrounding whitespace and looked up
    (case-sensitively) in ``allelic_requirement_map``.

    Returns:
        The mapped label; the trimmed input itself when the term is not in
        the map; or an empty string for missing values.
    """
    if pd.isna(ar_str):
        return ""
    requirement = ar_str.strip()
    # Fall back to the trimmed term so unmapped values are as clean as
    # mapped ones instead of keeping stray whitespace.
    return allelic_requirement_map.get(requirement, requirement)

def log_normalize(series):
    """Scale a series by ``log1p`` relative to its maximum.

    Each value becomes ``log1p(value) / log1p(max)``, so the maximum maps
    to 1. When the maximum is 0 the input series is returned unchanged
    (the same object), which avoids dividing by ``log1p(0) == 0``.
    """
    peak = series.max()
    if peak == 0:
        return series
    return np.log1p(series) / np.log1p(peak)

In [39]:
# Section 4 -> DF transformation.

# One (LoF, GoF, DN) tuple per row, stacked into an (n_rows, 3) integer array.
# reshape(-1, 3) keeps the three-column shape even when df has no rows, where
# unpacking zip(*...) would raise "not enough values to unpack".
mechanism_flags = np.array(
    df["molecular mechanism"].apply(parse_mechanism_cat).tolist(),
    dtype=np.int64,
).reshape(-1, 3)
df["LoF"], df["GoF"], df["DN"] = mechanism_flags.T

if "confidence" in df.columns:
    df["ConfidenceScore"] = df["confidence"].apply(parse_confidence)
else:
    # NOTE: 0 is also the score confidence_map assigns to "refuted", so a
    # missing "confidence" column is indistinguishable from refuted entries.
    df["ConfidenceScore"] = 0

# Pack the four numeric columns into one array per row. Slicing the 2-D array
# row by row avoids a Python-level df.apply(axis=1) and also works on an
# empty frame.
vector_columns = ["LoF", "GoF", "DN", "ConfidenceScore"]
df["Mechanism_Vector_4D"] = pd.Series(
    list(df[vector_columns].to_numpy()),
    index=df.index,
    dtype=object,
)

if "allelic requirement" in df.columns:
    df["AllelicReq_Simplified"] = df["allelic requirement"].apply(parse_allelic_requirement)

In [47]:
# Section 5 -> Result and export.

# Source columns shown next to the derived encoding columns for a visual check.
preview_columns = [
    "gene symbol",
    "disease name",
    "molecular mechanism",
    "LoF", "GoF", "DN",
    "ConfidenceScore",
    "Mechanism_Vector_4D",
]
df.loc[:, preview_columns].head(20)

# Export is currently disabled; uncomment to write the annotated table.
# df.to_csv("G2P_annotated.csv", index=False)

Unnamed: 0,gene symbol,disease name,molecular mechanism,LoF,GoF,DN,ConfidenceScore,Mechanism_Vector_4D
0,HMX1,HMX1-related oculoauricular syndrome,loss of function,1,0,0,5,"[1, 0, 0, 5]"
1,YAP1,"YAP1-related coloboma, ocular, with or without...",loss of function,1,0,0,4,"[1, 0, 0, 4]"
2,BBS9,BBS9-related Bardet-Biedl syndrome,loss of function,1,0,0,5,"[1, 0, 0, 5]"
3,TRIM32,TRIM32-related Bardet-Biedl syndrome,undetermined,0,0,0,2,"[0, 0, 0, 2]"
4,PAX2,PAX2-related papillorenal syndrome,loss of function,1,0,0,5,"[1, 0, 0, 5]"
5,FGFR3,FGFR3-related lacrimo-auriculo-dento-digital s...,undetermined,0,0,0,5,"[0, 0, 0, 5]"
6,FLNB,FLNB-related spondylocarpotarsal synostosis sy...,loss of function,1,0,0,5,"[1, 0, 0, 5]"
7,AGPS,AGPS-related rhizomelic chondrodysplasia punctata,undetermined,0,0,0,5,"[0, 0, 0, 5]"
8,RPGRIP1,RPGRIP1-related Leber congenital amaurosis,loss of function,1,0,0,5,"[1, 0, 0, 5]"
9,LAMA1,LAMA1-related cerebellar dysplasia with cysts ...,loss of function,1,0,0,5,"[1, 0, 0, 5]"
