# Building a boolean network from a knowledge graph

The knowledge graph used here is SIGNOR (the SIGnaling Network Open Resource).

Given a list of genes for which we want to construct a BN:

1. Find the Steiner subgraph for the given list of genes.
2. For each node in the subgraph, find all of its input nodes (i.e. all edges leading into that node).
3. For each such node, there is an activating relation if the edge is "up-regulates", and a repressing relation if the edge is "down-regulates".
4. Combine all of the edges with a "joiner function" - AND (`&`), OR (`|`), inhibitor (repressor) wins, majority, plurality, etc.

Or, given an existing model (with some number of genes), a list of nodes, and a knowledge graph, how do we incorporate the new nodes into the model?


In [1]:
import BNMPy

In [2]:
# Build a Boolean network for three query genes, using the default joiner and no score cutoff.
# `bn_string` holds the rules as text; `relations` holds the underlying edges.
bn_string, relations = BNMPy.load_signor_network(['TNFAIP3', 'CCND1', 'MACF1'])

number of genes found: 3
[7128, 595, 23499]


In [3]:
# One rule per node (`target = expression`); the trailing "# Scores:" comment lists each
# incoming edge as `<regulator>_<effect>:<score>`.
print(bn_string)

CCND1 = (! GSK3B) # Scores: GSK3B_inhibit:0.783
GSK3B = (GSK3B) & (! MACF1) # Scores: GSK3B_activate:0.2; MACF1_inhibit:0.436
MACF1 = (! GSK3B) # Scores: GSK3B_inhibit:0.436
TNFAIP3 = TNFAIP3
TRAF6 = (! GSK3B) & (TRAF6) & (! TNFAIP3) # Scores: GSK3B_inhibit:0.48; TRAF6_activate:0.2; TNFAIP3_inhibit:0.701


In [4]:
# Same query with score_cutoff=0.5: the edge set is filtered by score first
# (see the "Applied score cutoff" line in the output), which changes the connecting nodes.
bn_string, relations = BNMPy.load_signor_network(['TNFAIP3', 'CCND1', 'MACF1'], score_cutoff = 0.5)
print(bn_string)

Applied score cutoff 0.5, filtered to 19342/40940 edges
number of genes found: 3
[7128, 595, 23499]
CCND1 = (CTNNB1) # Scores: CTNNB1_activate:0.8
CTNNB1 = (! SRC) # Scores: SRC_inhibit:0.76
SRC = SRC
TNFAIP3 = TNFAIP3
TRAF6 = (SRC) & (! TNFAIP3) # Scores: SRC_activate:0.566; TNFAIP3_inhibit:0.701


In [6]:
# Same query with the 'majority' joiner.
# NOTE(review): in the saved output, nodes whose only input is an inhibitor get the constant
# rule `0` (e.g. CCND1, MACF1) — confirm this is the intended majority semantics.
bn_string, relations = BNMPy.load_signor_network(['TNFAIP3', 'CCND1', 'MACF1'], joiner='majority')
print(bn_string)

number of genes found: 3
[7128, 595, 23499]
CCND1 = 0 # Scores: GSK3B_inhibit:0.783
GSK3B = (GSK3B & !MACF1) # Scores: GSK3B_activate:0.2; MACF1_inhibit:0.436
MACF1 = 0 # Scores: GSK3B_inhibit:0.436
TNFAIP3 = TNFAIP3
TRAF6 = (TRAF6 & !GSK3B & !TNFAIP3) # Scores: GSK3B_inhibit:0.48; TRAF6_activate:0.2; TNFAIP3_inhibit:0.701


In [None]:
# Six-gene query with all inputs of a node combined by AND (joiner='&').
# NOTE(review): the saved output annotates scores as `inhibit_MYC:0.765`, whereas the other
# cells show `# Scores: MYC_inhibit:0.765` — the output looks stale; re-run to refresh.
bn_string, relations = BNMPy.load_signor_network(['KRAS', 'GNAS', 'TP53', 'SMAD4', 'CDKN2A', 'RNF43'], joiner='&')
print(bn_string)

number of genes found: 6
[3845, 2778, 7157, 4089, 1029, 54894]
CDKN2A = (! MYC) # inhibit_MYC:0.765
GNAS = (! MDM2) # inhibit_MDM2:0.395
GSK3B = (GSK3B) # activate_GSK3B:0.2
KRAS = (SRC) # activate_SRC:0.656
MDM2 = (TP53) # activate_TP53:0.968
MYC = (! GSK3B) & (! SMAD4) # inhibit_GSK3B:0.719; inhibit_SMAD4:0.638
RNF43 = RNF43
SMAD4 = (! GSK3B) # inhibit_GSK3B:0.397
SRC = (GNAS) & (GSK3B) & (SRC) # activate_GNAS:0.506; activate_GSK3B:0.383; activate_SRC:0.2
TP53 = (! MDM2) & (GSK3B) & (! SRC) & (! RNF43) # inhibit_MDM2:0.968; activate_GSK3B:0.727; inhibit_SRC:0.524; inhibit_RNF43:0.452


In [None]:
# AND joiner again, this time keeping only edges that pass score_cutoff=0.5.
# NOTE(review): the saved output lacks the "Applied score cutoff" line printed by the other
# cutoff cells and uses the older score-annotation format — re-run to refresh.
bn_string, relations = BNMPy.load_signor_network(['KRAS', 'GNAS', 'TP53', 'SMAD4', 'CDKN2A', 'RNF43'], joiner='&', score_cutoff = 0.5)
print(bn_string)

number of genes found: 6
[3845, 2778, 7157, 4089, 1029, 54894]
CDKN2A = (! MYC) # inhibit_MYC:0.765
GNAS = GNAS
KRAS = (SRC) # activate_SRC:0.656
MAPK1 = MAPK1
MYC = (MAPK1) & (! SMAD4) # activate_MAPK1:0.733; inhibit_SMAD4:0.638
SMAD4 = (MAPK1) # activate_MAPK1:0.511
SRC = (GNAS) # activate_GNAS:0.506
TP53 = (! SRC) & (MAPK1) # inhibit_SRC:0.524; activate_MAPK1:0.777


In [17]:
# Same six-gene query with all inputs of a node combined by OR (joiner='|').
bn_string, relations = BNMPy.load_signor_network(['KRAS', 'GNAS', 'TP53', 'SMAD4', 'CDKN2A', 'RNF43'], joiner='|')
print(bn_string)

number of genes found: 6
[3845, 2778, 7157, 4089, 1029, 54894]
CDKN2A = (! MYC) # Scores: MYC_inhibit:0.765
GNAS = (! MDM2) # Scores: MDM2_inhibit:0.395
GSK3B = (GSK3B) # Scores: GSK3B_activate:0.2
KRAS = (SRC) # Scores: SRC_activate:0.656
MDM2 = (TP53) # Scores: TP53_activate:0.968
MYC = (! GSK3B) | (! SMAD4) # Scores: GSK3B_inhibit:0.719; SMAD4_inhibit:0.638
RNF43 = RNF43
SMAD4 = (! GSK3B) # Scores: GSK3B_inhibit:0.397
SRC = (GNAS) | (GSK3B) | (SRC) # Scores: GNAS_activate:0.506; GSK3B_activate:0.383; SRC_activate:0.2
TP53 = (! MDM2) | (GSK3B) | (! SRC) | (! RNF43) # Scores: MDM2_inhibit:0.968; GSK3B_activate:0.727; SRC_inhibit:0.524; RNF43_inhibit:0.452


In [18]:
# 'inhibitor_wins' joiner: in the saved output the activators of a node are OR-ed together
# and the result is AND-ed with the negation of every inhibitor (see SRC and TP53).
bn_string, relations = BNMPy.load_signor_network(['KRAS', 'GNAS', 'TP53', 'SMAD4', 'CDKN2A', 'RNF43'],
                                                            joiner='inhibitor_wins')
print(bn_string)

number of genes found: 6
[3845, 2778, 7157, 4089, 1029, 54894]
CDKN2A = !MYC # Scores: MYC_inhibit:0.765
GNAS = !MDM2 # Scores: MDM2_inhibit:0.395
GSK3B = GSK3B # Scores: GSK3B_activate:0.2
KRAS = SRC # Scores: SRC_activate:0.656
MDM2 = TP53 # Scores: TP53_activate:0.968
MYC = (!GSK3B & !SMAD4) # Scores: GSK3B_inhibit:0.719; SMAD4_inhibit:0.638
RNF43 = RNF43
SMAD4 = !GSK3B # Scores: GSK3B_inhibit:0.397
SRC = (GNAS | GSK3B | SRC) # Scores: GNAS_activate:0.506; GSK3B_activate:0.383; SRC_activate:0.2
TP53 = (!MDM2 & !SRC & !RNF43) & GSK3B # Scores: MDM2_inhibit:0.968; GSK3B_activate:0.727; SRC_inhibit:0.524; RNF43_inhibit:0.452


In [19]:
# Edge list behind the rules above: each tuple reads (regulator, target, effect, score),
# e.g. ('MDM2', 'GNAS', 'inhibit', 0.395) corresponds to the rule `GNAS = !MDM2`.
relations

[('MDM2', 'GNAS', 'inhibit', 0.395),
 ('TP53', 'MDM2', 'activate', 0.968),
 ('GSK3B', 'GSK3B', 'activate', 0.2),
 ('GNAS', 'SRC', 'activate', 0.506),
 ('GSK3B', 'SRC', 'activate', 0.383),
 ('SRC', 'SRC', 'activate', 0.2),
 ('MDM2', 'TP53', 'inhibit', 0.968),
 ('GSK3B', 'TP53', 'activate', 0.727),
 ('SRC', 'TP53', 'inhibit', 0.524),
 ('RNF43', 'TP53', 'inhibit', 0.452),
 ('GSK3B', 'MYC', 'inhibit', 0.719),
 ('SMAD4', 'MYC', 'inhibit', 0.638),
 ('MYC', 'CDKN2A', 'inhibit', 0.765),
 ('SRC', 'KRAS', 'activate', 0.656),
 ('GSK3B', 'SMAD4', 'inhibit', 0.397)]

In [20]:
# Same six-gene query with the 'majority' joiner; multi-input rules are written out as an
# explicit OR of input combinations in the saved output.
bn_string, relations = BNMPy.load_signor_network(['KRAS', 'GNAS', 'TP53', 'SMAD4', 'CDKN2A', 'RNF43'],
                                                            joiner='majority')
print(bn_string)

number of genes found: 6
[3845, 2778, 7157, 4089, 1029, 54894]
CDKN2A = 0 # Scores: MYC_inhibit:0.765
GNAS = 0 # Scores: MDM2_inhibit:0.395
GSK3B = (GSK3B) # Scores: GSK3B_activate:0.2
KRAS = (SRC) # Scores: SRC_activate:0.656
MDM2 = (TP53) # Scores: TP53_activate:0.968
MYC = 0 # Scores: GSK3B_inhibit:0.719; SMAD4_inhibit:0.638
RNF43 = RNF43
SMAD4 = 0 # Scores: GSK3B_inhibit:0.397
SRC = (GNAS & !GSK3B & !SRC) | (!GNAS & GSK3B & !SRC) | (GNAS & GSK3B & !SRC) | (!GNAS & !GSK3B & SRC) | (GNAS & !GSK3B & SRC) | (!GNAS & GSK3B & SRC) | (GNAS & GSK3B & SRC) # Scores: GNAS_activate:0.506; GSK3B_activate:0.383; SRC_activate:0.2
TP53 = (GSK3B & !MDM2 & !SRC & !RNF43) # Scores: MDM2_inhibit:0.968; GSK3B_activate:0.727; SRC_inhibit:0.524; RNF43_inhibit:0.452


In [21]:
# Same six-gene query with the 'plurality' joiner.
# NOTE(review): in the saved output, single-activator nodes become tautologies such as
# `KRAS = (!SRC) | (SRC)` (always on) — confirm this is the intended plurality semantics.
bn_string, relations = BNMPy.load_signor_network(['KRAS', 'GNAS', 'TP53', 'SMAD4', 'CDKN2A', 'RNF43'],
                                                            joiner='plurality')
print(bn_string)

number of genes found: 6
[3845, 2778, 7157, 4089, 1029, 54894]
CDKN2A = (!MYC) # Scores: MYC_inhibit:0.765
GNAS = (!MDM2) # Scores: MDM2_inhibit:0.395
GSK3B = (!GSK3B) | (GSK3B) # Scores: GSK3B_activate:0.2
KRAS = (!SRC) | (SRC) # Scores: SRC_activate:0.656
MDM2 = (!TP53) | (TP53) # Scores: TP53_activate:0.968
MYC = (!GSK3B & !SMAD4) # Scores: GSK3B_inhibit:0.719; SMAD4_inhibit:0.638
RNF43 = RNF43
SMAD4 = (!GSK3B) # Scores: GSK3B_inhibit:0.397
SRC = (!GNAS & !GSK3B & !SRC) | (GNAS & !GSK3B & !SRC) | (!GNAS & GSK3B & !SRC) | (GNAS & GSK3B & !SRC) | (!GNAS & !GSK3B & SRC) | (GNAS & !GSK3B & SRC) | (!GNAS & GSK3B & SRC) | (GNAS & GSK3B & SRC) # Scores: GNAS_activate:0.506; GSK3B_activate:0.383; SRC_activate:0.2
TP53 = (!GSK3B & !MDM2 & !SRC & !RNF43) | (GSK3B & !MDM2 & !SRC & !RNF43) | (GSK3B & MDM2 & !SRC & !RNF43) | (GSK3B & !MDM2 & SRC & !RNF43) | (GSK3B & !MDM2 & !SRC & RNF43) # Scores: MDM2_inhibit:0.968; GSK3B_activate:0.727; SRC_inhibit:0.524; RNF43_inhibit:0.452


## Combining an existing model with a knowledge-graph (KG) network

In [None]:
# Load the Vundavilli 2020 network: the existing model that the KG-derived network is merged into below
file = './input_files/Vundavilli2020_standardized.txt'
network = BNMPy.load_network_from_file(file)
# The keys of nodeDict are the gene names; note this is a dict view, not a list
genes = network.nodeDict.keys()
print(f"number of genes: {len(genes)}")
genes

No initial state provided, using a random initial state
Network loaded successfully. There are 38 genes in the network.
number of genes: 38


dict_keys(['EGF', 'HBEGF', 'IGF1', 'NRG1', 'PTEN', 'STK11', 'EGFR', 'ERBB4', 'IGF1R', 'ERBB2', 'JAK1', 'STAT3', 'IRS1', 'GRB2', 'KRAS', 'MAP3K1', 'RAF1', 'MAP2K4', 'MAP2K1', 'PIK3CA', 'MAPK8', 'MAPK3', 'PIP3', 'PDPK1', 'AKT1', 'PRKAA1', 'GSK3B', 'TSC1', 'RHEB', 'MTOR', 'RPS6KB1', 'BAD', 'CCND1', 'BCL2', 'ELK1', 'FOS', 'ELK4', 'SP1'])

In [None]:
# Build a SIGNOR-derived network over the model's genes (inhibitor-wins joiner, score cutoff 0.5).
# The output reports how many of the requested genes were found in the knowledge graph.
bn_string, relations = BNMPy.load_signor_network(genes, joiner='inhibitor_wins', score_cutoff=0.5)
print(bn_string)

Applied score cutoff 0.5, filtered to 19342/40940 edges
number of genes found: 37
[1950, 1839, 3479, 3084, 5728, 6794, 1956, 2066, 3480, 2064, 3716, 6774, 3667, 2885, 3845, 4214, 5894, 6416, 5604, 5290, 5599, 5595, 5170, 207, 5562, 2932, 7248, 6008, 2475, 6198, 572, 595, 596, 2002, 2353, 2005, 5669]
AKT1 = !PTEN & (PDPK1 | MTOR | PIK3CA) # Scores: PDPK1_activate:0.749; MTOR_activate:0.929; PTEN_inhibit:0.634; PIK3CA_activate:0.816
BAD = (!MAPK8 & !AKT1 & !RAF1) # Scores: MAPK8_inhibit:0.686; AKT1_inhibit:0.823; RAF1_inhibit:0.66
BCL2 = (!MAPK8 & !BAD) & MAPK3 # Scores: MAPK8_inhibit:0.581; BAD_inhibit:0.801; MAPK3_activate:0.559
CCND1 = !GSK3B & STAT3 # Scores: GSK3B_inhibit:0.783; STAT3_activate:0.787
EGF = EGF
EGFR = !MAPK3 & (ERBB2 | EGF | HBEGF) # Scores: ERBB2_activate:0.614; MAPK3_inhibit:0.556; EGF_activate:0.949; HBEGF_activate:0.767
ELK1 = (MAPK8 | MAPK3) # Scores: MAPK8_activate:0.512; MAPK3_activate:0.6
ERBB2 = (EGFR | NRG1 | EGF) # Scores: EGFR_activate:0.614; NRG1_activate

In [11]:
# Parse the SIGNOR-derived rules into a network object so it can be merged with the model
KG = BNMPy.load_network_from_string(bn_string)
# Merge the networks using inhibitor wins
# NOTE(review): in the saved report, BAD's Model 1 rule `! ( AKT1 | RPS6KB1 )` contributes a
# positive `RPS6KB1` term to the merged rule — the negation looks lost; verify merge_networks.
merged_network_string = BNMPy.merge_networks([network, KG], method="Inhibitor Wins", descriptive=True)

No initial state provided, using a random initial state
Network loaded successfully. There are 34 genes in the network.
Merging Method: Inhibitor Wins
Total Genes in Merged Network: 38
Number of Genes in Each Individual Model:
  Model 1: 38 genes
  Model 2: 34 genes
Overlapping Genes: 34
Overlapping Genes List: AKT1, BAD, BCL2, CCND1, EGF, EGFR, ELK1, ERBB2, ERBB4, FOS, GRB2, GSK3B, HBEGF, IGF1, IGF1R, IRS1, JAK1, KRAS, MAP2K1, MAP2K4, MAP3K1, MAPK3, MAPK8, MTOR, NRG1, PDPK1, PIK3CA, PRKAA1, PTEN, RAF1, RPS6KB1, STAT3, STK11, TSC1

Gene: AKT1
  Model 1 Function: PIP3
  Model 2 Function: !PTEN & ( MTOR | PDPK1 | PIK3CA )
  Merged Function: !PTEN & ( MTOR | PDPK1 | PIK3CA | PIP3 )

Gene: BAD
  Model 1 Function: ! ( AKT1 | RPS6KB1 )
  Model 2 Function: !AKT1 & !MAPK8 & !RAF1
  Merged Function: !AKT1 & !MAPK8 & !RAF1 & RPS6KB1

Gene: BCL2
  Model 1 Function: !BAD & STAT3
  Model 2 Function: !BAD & MAPK3 & !MAPK8
  Merged Function: !BAD & !MAPK8 & ( MAPK3 | STAT3 )

Gene: CCND1
  Model 1 Fu

In [13]:
# Show the merged rules, one `gene = expression` line per gene
print(merged_network_string)

AKT1 = !PTEN & ( MTOR | PDPK1 | PIK3CA | PIP3 )
BAD = !AKT1 & !MAPK8 & !RAF1 & RPS6KB1
BCL2 = !BAD & !MAPK8 & ( MAPK3 | STAT3 )
CCND1 = !GSK3B & STAT3
EGF = EGF
EGFR = !MAPK3 & ( EGF | ERBB2 | HBEGF )
ELK1 = MAPK3 | MAPK8 | RPS6KB1
ELK4 = MAPK3 & RPS6KB1
ERBB2 = EGF | EGFR | NRG1
ERBB4 = !MAPK3 & ( EGF | ERBB2 | HBEGF | NRG1 )
FOS = MAPK3 | MAPK8 | RPS6KB1
GRB2 = EGFR | ERBB2 | ERBB4 | IGF1R | IRS1
GSK3B = !AKT1
HBEGF = HBEGF
IGF1 = IGF1
IGF1R = IGF1
IRS1 = !MAPK3 & !MAPK8 & !MTOR & !PIK3CA & !PTEN & !RPS6KB1 & ( IGF1R | JAK1 )
JAK1 = EGFR
KRAS = GRB2 | KRAS
MAP2K1 = MAP3K1 | RAF1
MAP2K4 = !AKT1 & MAP3K1
MAP3K1 = KRAS | MAP3K1
MAPK3 = MAP2K1
MAPK8 = MAP2K4
MTOR = !RPS6KB1 & !TSC1 & ( PIK3CA | RHEB )
NRG1 = NRG1
PDPK1 = PIP3 & !RPS6KB1
PIK3CA = !PTEN & ( ERBB2 | ERBB4 | IRS1 | KRAS | STAT3 )
PIP3 = PIK3CA | !PTEN
PRKAA1 = STK11
PTEN = PTEN & !STK11
RAF1 = !AKT1 & !MAPK3 & ( KRAS | MAP2K1 )
RHEB = !TSC1
RPS6KB1 = !PTEN & ( MAPK3 | MTOR | PDPK1 )
SP1 = MAPK3
STAT3 = EGFR | JAK1 | MAPK3 | 

In [None]:
# Merge the networks using PBN
# Here, a probability of 0.9 is used for rules from the original network
# Each output line reads `gene = rule, probability`.
# NOTE(review): in the saved output some genes carry a single rule with probability 0.9
# (e.g. EGF) while ELK4 gets 1.0 — check whether per-gene probabilities should sum to 1.
merged_network_string = BNMPy.merge_networks([network, KG], method="PBN", prob=0.9)
print(merged_network_string)

AKT1 = !PTEN & ( MTOR | PDPK1 | PIK3CA ), 0.1
AKT1 = PIP3, 0.9
BAD = ! ( AKT1 | RPS6KB1 ), 0.9
BAD = !AKT1 & !MAPK8 & !RAF1, 0.1
BCL2 = !BAD & MAPK3 & !MAPK8, 0.1
BCL2 = !BAD & STAT3, 0.9
CCND1 = !GSK3B & STAT3, 0.1
CCND1 = !GSK3B, 0.9
EGF = EGF, 0.9
EGFR = !MAPK3 & ( EGF | ERBB2 | HBEGF ), 0.1
EGFR = EGF, 0.9
ELK1 = MAPK3 & RPS6KB1, 0.9
ELK1 = MAPK3 | MAPK8, 0.1
ELK4 = MAPK3 & RPS6KB1, 1.0
ERBB2 = EGF | EGFR | NRG1, 0.1
ERBB2 = NRG1, 0.9
ERBB4 = !MAPK3 & ( ERBB2 | HBEGF | NRG1 ), 0.1
ERBB4 = EGF | HBEGF, 0.9
FOS = MAPK3, 0.1
FOS = MAPK8 & RPS6KB1, 0.9
GRB2 = EGFR | ERBB2 | ERBB4 | IGF1R, 0.9
GRB2 = ERBB2 | ERBB4 | IRS1, 0.1
GSK3B = !AKT1, 0.9
HBEGF = HBEGF, 0.9
IGF1 = IGF1, 0.9
IGF1R = IGF1, 0.9
IRS1 = !MAPK3 & !MAPK8 & !MTOR & !PIK3CA & !PTEN & !RPS6KB1 & ( IGF1R | JAK1 ), 0.1
IRS1 = IGF1R, 0.9
JAK1 = EGFR, 0.9
KRAS = GRB2, 0.9
KRAS = KRAS, 0.1
MAP2K1 = MAP3K1 | RAF1, 0.1
MAP2K1 = RAF1, 0.9
MAP2K4 = !AKT1 & MAP3K1, 0.1
MAP2K4 = MAP3K1, 0.9
MAP3K1 = KRAS, 0.9
MAP3K1 = MAP3K1, 0.1
MAPK

## Visualization

In [16]:
# Parse the merged PBN text, then write an interactive rendering of it to an HTML file
pbn = BNMPy.load_pbn_from_string(merged_network_string)
BNMPy.vis_network(pbn, output_html="Vundavilli2020_extendedPBN.html", interactive=True)

No initial state provided, using a random initial state
PBN loaded successfully. There are 38 genes in the network.
Network visualization saved to Vundavilli2020_extendedPBN.html


# Calculate phenotype score using KG

In [1]:
import pandas as pd
# Tab-separated table of paths from genes (QueryNode) to phenotypes (EndNode)
file_path = 'KG_files/significant_paths_to_phenotypes.txt'
df = pd.read_csv(file_path, sep='\t')
# Report how many distinct phenotypes and genes the table covers
print(f'There are {df["EndNode"].nunique()} phenotypes')
print(f'There are {df["QueryNode"].nunique()} genes')

There are 201 phenotypes
There are 4905 genes


In [35]:
# List every phenotype name available in the table
print(df["EndNode"].unique())

['ACROSOME_ASSEMBLY' 'ACTIN_CYTOSKELETON_REORGANIZATION'
 'ACTION_POTENTIAL_' 'ADIPOGENESIS' 'ALTERNATIVE_SPLICING_REGULATION'
 'AMYLOID_FIBRIL_FORMATION' 'ANGIOGENESIS' 'APOPTOSIS' 'ARDS'
 'AUTOPHAGOSOME_FORMATION' 'AUTOPHAGY' 'AXONAL_GROWTH_CONE_FORMATION'
 'B_CELL_MATURATION' 'B-LYMPHOCYTE_DIFF' 'BASOPHIL_DIFF'
 'BONE_MINERALIZATION' 'BROWN_ADIPOGENESIS' 'CARTILAGE_DEVELOPMENT'
 'CELL_ADHESION' 'CELL_CYCLE_BLOCK' 'CELL_CYCLE_EXIT'
 'CELL_CYCLE_PROGRESS_' 'CELL_DEATH' 'CELL_GROWTH' 'CELL_KILLING'
 'CELL_MIGRATION' 'CELL_POLARITY' 'CELL_SHAPE' 'CENTROMERE_ASSEMBLY'
 'CENTROSOME_SEPARATION' 'CEREBRAL_CORTEX_DEVELOPMENT'
 'CHAPERONE-MEDIATED_AUTOPHAGY' 'CHAPERONE-MEDIATED_PROTEIN_FOLDING'
 'CHEMOATTRACTION_OF_AXON' 'CHEMOREPULSION_OF_AXON' 'CHEMOTAXIS'
 'CHROMATINE_CONDENSATION' 'CHROMOSOME_SEGREGATION' 'CILIUM_ASSEMBLY'
 'CILIUM_MOVEMENT' 'CITRIC_ACID_CYCLE'
 'CLEARANCE_OF_FOREIGN_INTRACELLULAR_DNA' 'COLLOID' 'CYTOKINE_PRODUCTION'
 'CYTOSKELETON_ORGANIZATION' 'CYTOTOXIC_T-LYMPHOCYTE_AC

In [None]:
def proxpath(genes, phenotypes = ['APOPTOSIS', 'DIFFERENTIATION', 'PROLIFERATION'], file_path = 'KG_files/significant_paths_to_phenotypes.txt'):
    """Return the gene-to-phenotype paths whose query gene is the listed gene nearest the phenotype.

    The tab-separated file at ``file_path`` is read and reduced as follows:

    1. keep rows whose ``QueryNode`` is in ``genes`` and whose ``EndNode`` is in ``phenotypes``;
    2. walk each ``Path_String`` from the phenotype end and take the first node that names
       one of ``genes``;
    3. keep a row only when that node is the row's own ``QueryNode`` (no other listed gene
       sits between it and the phenotype);
    4. sort by ``EndNode`` then ``QueryNode`` and drop rows whose ``Final_Effect`` is 0.

    A gene matches a path node only as a whole name: it must not be directly preceded or
    followed by a letter, digit, ``_`` or ``-``. ``MYC`` therefore no longer matches ``MYCN``,
    while ``CDK4`` still matches the complex node ``CYCLIND/CDK4``.

    Parameters
    ----------
    genes : iterable of str
        Gene names to query. Materialised into a list, so dict views and one-shot
        iterables are accepted. When several genes match the same path node, the one
        listed first wins.
    phenotypes : list of str
        Phenotype names (``EndNode`` values) to keep. The default list is never mutated.
    file_path : str
        Path to the tab-separated paths table. It must provide the columns ``QueryNode``,
        ``EndNode``, ``Path_String`` and ``Final_Effect``.

    Returns
    -------
    pandas.DataFrame
        The retained rows with the file's original columns. Empty (rather than an error)
        when nothing matches the query.
    """
    import re  # local import: keeps the cell runnable without touching the notebook's import cell

    genes = list(genes)

    # Load the ProxPath file
    df = pd.read_csv(file_path, sep='\t')

    # One compiled pattern per gene; the look-arounds reject matches that are only a
    # fragment of a longer node name.
    name_char = r'[A-Za-z0-9_-]'
    gene_patterns = [
        (gene, re.compile(rf'(?<!{name_char}){re.escape(gene)}(?!{name_char})'))
        for gene in genes
    ]

    def closest_gene(path_string):
        """Return the listed gene nearest the end of the path, or None if there is none."""
        if not isinstance(path_string, str):
            # Missing path (e.g. NaN): nothing to inspect
            return None
        # Nodes are separated by '--'; scan from the phenotype end backwards
        for component in reversed(path_string.split('--')):
            for gene, pattern in gene_patterns:
                if pattern.search(component):
                    return gene
        return None

    # Filter rows for the given genes and phenotypes
    filtered_df = df[df['QueryNode'].isin(genes) & df['EndNode'].isin(phenotypes)].copy()

    # Find the closest gene to the phenotype. A plain list is used instead of
    # DataFrame.apply(axis=1), which returns a DataFrame for an empty selection and
    # makes the column assignment fail.
    filtered_df['Closest_Gene'] = [closest_gene(path) for path in filtered_df['Path_String']]

    # Keep rows where QueryNode is itself the closest gene to the phenotype
    pheno_df = filtered_df[filtered_df['QueryNode'] == filtered_df['Closest_Gene']]
    pheno_df = pheno_df.drop(columns=['Closest_Gene'])

    # Sort for a stable, readable display
    pheno_df = pheno_df.sort_values(by=['EndNode', 'QueryNode'])

    # Remove rows where Final_Effect is 0
    pheno_df = pheno_df[pheno_df['Final_Effect'] != 0]

    return pheno_df

# Example: retained paths from a six-gene query to the PROLIFERATION phenotype
pheno_df = proxpath(genes = ['KRAS', 'GNAS', 'TP53', 'SMAD4', 'CDKN2A', 'RNF43'], phenotypes = ['PROLIFERATION'])
pheno_df

Unnamed: 0,EndPathways,QueryNode,EndNode,Path_String,relations_path,Path_Score,Path_Length,Final_Effect,Effect,n,mean,sd,zscore
53815,PROLIFERATION,CDKN2A,PROLIFERATION,CDKN2A--|CDK4--[]CYCLIND/CDK4--|RB1--|PROLIFER...,SIGNOR-44554;SIGNOR-32301;SIGNOR-250762;SIGNOR...,0.577,4,-1,down-regulates,107507,1.681607,0.506006,-2.182991
53913,PROLIFERATION,CDKN2A,PROLIFERATION,CDKN2A--|PROLIFERATION,SIGNOR-259406,0.3,1,-1,down-regulates,107507,1.681607,0.506006,-2.730415
54033,PROLIFERATION,CDKN2A,PROLIFERATION,CDKN2A--|CYCLIND/CDK4--|RB1--|PROLIFERATION,SIGNOR-245459;SIGNOR-250762;SIGNOR-262533,0.633,3,-1,down-regulates,107507,1.681607,0.506006,-2.07232
54152,PROLIFERATION,CDKN2A,PROLIFERATION,CDKN2A--|CDK6--|RB1--|PROLIFERATION,SIGNOR-44557;SIGNOR-135189;SIGNOR-262533,0.672,3,-1,down-regulates,107507,1.681607,0.506006,-1.995246
54306,PROLIFERATION,CDKN2A,PROLIFERATION,CDKN2A--|CDK4--|RB1--|PROLIFERATION,SIGNOR-44554;SIGNOR-200483;SIGNOR-262533,0.464,3,-1,down-regulates,107507,1.681607,0.506006,-2.406308
54518,PROLIFERATION,CDKN2A,PROLIFERATION,CDKN2A--|CDK6--|CDKN1A--|CYCLINE/CDK2-->PROLIF...,SIGNOR-44557;SIGNOR-144832;SIGNOR-245462;SIGNO...,0.651,4,-1,down-regulates,107507,1.681607,0.506006,-2.036748
54124,PROLIFERATION,KRAS,PROLIFERATION,KRAS-->PIK3CA--[]PI3K-->PROLIFERATION,SIGNOR-175204;SIGNOR-255299;SIGNOR-255577,0.45,3,1,up-regulates,107507,1.681607,0.506006,-2.433976
54127,PROLIFERATION,KRAS,PROLIFERATION,KRAS-->PIK3CA-->AKT-->PROLIFERATION,SIGNOR-175204;SIGNOR-244429;SIGNOR-254353,0.6,3,1,up-regulates,107507,1.681607,0.506006,-2.137537
54568,PROLIFERATION,KRAS,PROLIFERATION,KRAS-->PIK3CA--[]PI3K-->AKT-->PROLIFERATION,SIGNOR-175204;SIGNOR-255299;SIGNOR-254950;SIGN...,0.681,4,1,up-regulates,107507,1.681607,0.506006,-1.97746
54569,PROLIFERATION,KRAS,PROLIFERATION,KRAS-->PIK3CA--[]PI3K-->AKT1-->PROLIFERATION,SIGNOR-175204;SIGNOR-255299;SIGNOR-255106;SIGN...,0.681,4,1,up-regulates,107507,1.681607,0.506006,-1.97746


In [30]:
# A gene may reach the same phenotype through paths of opposite sign, so count the
# distinct Final_Effect values of every gene-phenotype pair once and report any pair
# that has more than one.
effect_variety = pheno_df.groupby(['QueryNode', 'EndNode'])['Final_Effect'].nunique()
for gene in pheno_df['QueryNode'].unique():
    for phenotype in pheno_df['EndNode'].unique():
        # Pairs that never occur together have no entry and count as 0
        if effect_variety.get((gene, phenotype), 0) > 1:
            print(f"{gene} has dual effects on {phenotype}")

# Show, for each gene-phenotype pair, only the row with the lowest Path_Score
best_rows = pheno_df.groupby(['QueryNode', 'EndNode'])['Path_Score'].idxmin()
pheno_df.loc[best_rows]

Unnamed: 0,EndPathways,QueryNode,EndNode,Path_String,relations_path,Path_Score,Path_Length,Final_Effect,Effect,n,mean,sd,zscore
53913,PROLIFERATION,CDKN2A,PROLIFERATION,CDKN2A--|PROLIFERATION,SIGNOR-259406,0.3,1,-1,down-regulates,107507,1.681607,0.506006,-2.730415
54124,PROLIFERATION,KRAS,PROLIFERATION,KRAS-->PIK3CA--[]PI3K-->PROLIFERATION,SIGNOR-175204;SIGNOR-255299;SIGNOR-255577,0.45,3,1,up-regulates,107507,1.681607,0.506006,-2.433976
53529,PROLIFERATION,TP53,PROLIFERATION,TP53--|PROLIFERATION,SIGNOR-255669,0.3,1,-1,down-regulates,107507,1.681607,0.506006,-2.730415


In [37]:
# Example: retained paths from a second six-gene query to the APOPTOSIS phenotype
pheno_df = proxpath(genes = ['TP53', 'KRAS', 'MYC', 'SMAD4', 'BCL2', 'BAX'], phenotypes = ['APOPTOSIS'])
pheno_df

Unnamed: 0,EndPathways,QueryNode,EndNode,Path_String,relations_path,Path_Score,Path_Length,Final_Effect,Effect,n,mean,sd,zscore
2903,APOPTOSIS,BAX,APOPTOSIS,BAX-->APOPTOSIS,SIGNOR-261494,0.3,1,1,up-regulates,75493,1.737625,0.526944,-2.728233
2913,APOPTOSIS,BCL2,APOPTOSIS,BCL2--|APOPTOSIS,SIGNOR-249611,0.3,1,-1,down-regulates,75493,1.737625,0.526944,-2.728233
3437,APOPTOSIS,BCL2,APOPTOSIS,BCL2--|BAK1-->APOPTOSIS,SIGNOR-152980;SIGNOR-261493,0.589,2,-1,down-regulates,75493,1.737625,0.526944,-2.179787
2948,APOPTOSIS,KRAS,APOPTOSIS,KRAS-->PIK3CA-->AKT--|FOXO-->APOPTOSIS,SIGNOR-175204;SIGNOR-244429;SIGNOR-252824;SIGN...,0.698,4,-1,down-regulates,75493,1.737625,0.526944,-1.972934
3343,APOPTOSIS,KRAS,APOPTOSIS,KRAS-->PIK3CA--[]PI3K-->AKT--|APOPTOSIS,SIGNOR-175204;SIGNOR-255299;SIGNOR-254950;SIGN...,0.681,4,-1,down-regulates,75493,1.737625,0.526944,-2.005196
3539,APOPTOSIS,KRAS,APOPTOSIS,KRAS-->PIK3CA-->AKT--|APOPTOSIS,SIGNOR-175204;SIGNOR-244429;SIGNOR-260215,0.6,3,-1,down-regulates,75493,1.737625,0.526944,-2.158912
2894,APOPTOSIS,TP53,APOPTOSIS,TP53-->APOPTOSIS,SIGNOR-255678,0.3,1,1,up-regulates,75493,1.737625,0.526944,-2.728233
3195,APOPTOSIS,TP53,APOPTOSIS,TP53-->BAK1-->APOPTOSIS,SIGNOR-124122;SIGNOR-261493,0.615,2,1,up-regulates,75493,1.737625,0.526944,-2.130446


In [None]:
def pheno_scores(simulation_results, pheno):
    """Accumulate signed gene activities into per-phenotype score rows.

    For each of APOPTOSIS, DIFFERENTIATION and PROLIFERATION, every gene linked to that
    phenotype in ``pheno`` and present in ``simulation_results.index`` contributes
    ``simulation_results.loc[gene] * Final_Effect`` to the entry labelled with the
    phenotype. When a gene has several rows for one phenotype, only the row with the
    lowest ``Path_Score`` is used.

    Parameters
    ----------
    simulation_results : pandas.Series or pandas.DataFrame
        Simulated values indexed by gene name (assumed from the ``.index`` / ``.loc``
        usage — TODO confirm against the caller). **Modified in place.**
    pheno : pandas.DataFrame
        Paths table with the columns ``QueryNode``, ``EndNode``, ``Path_Score`` and
        ``Final_Effect``, e.g. the result of ``proxpath``.

    Returns
    -------
    The same ``simulation_results`` object, with the phenotype entries updated. A
    phenotype entry that does not exist yet is created (starting from 0) the first time
    a gene contributes to it; phenotypes without any contribution are left untouched.
    """
    # Keep only the rows with the lowest Path_Score for each gene-phenotype pair
    pheno_unique = pheno.loc[pheno.groupby(['QueryNode', 'EndNode'])['Path_Score'].idxmin()]

    phenotypes = ['APOPTOSIS', 'DIFFERENTIATION', 'PROLIFERATION']
    for phenotype in phenotypes:
        # Rows of the paths table that end at the current phenotype
        filtered_pheno = pheno_unique[pheno_unique['EndNode'] == phenotype]

        for gene, effect in zip(filtered_pheno['QueryNode'], filtered_pheno['Final_Effect']):
            # Genes that were not simulated cannot contribute
            if gene not in simulation_results.index:
                continue

            # Create the accumulator on first use; reading a missing label with
            # `.loc[...] +=` would otherwise raise KeyError.
            if phenotype not in simulation_results.index:
                simulation_results.loc[phenotype] = 0

            # Add the gene's simulated value, signed by its effect (1 or -1)
            simulation_results.loc[phenotype] += simulation_results.loc[gene] * effect

    return simulation_results

In [3]:
# Phenotype helpers shipped with BNMPy
# NOTE(review): presumably these supersede the notebook-local proxpath / pheno_scores
# prototypes defined above — confirm, and remove the prototypes if so.
from BNMPy.phenotype_score import phenotype_scores, get_phenotypes
# Prints the phenotype and gene counts plus the list of available phenotype names
get_phenotypes()

There are 201 phenotypes
There are 4905 genes
Available phenotypes: ['ACROSOME_ASSEMBLY' 'ACTIN_CYTOSKELETON_REORGANIZATION'
 'ACTION_POTENTIAL_' 'ADIPOGENESIS' 'ALTERNATIVE_SPLICING_REGULATION'
 'AMYLOID_FIBRIL_FORMATION' 'ANGIOGENESIS' 'APOPTOSIS' 'ARDS'
 'AUTOPHAGOSOME_FORMATION' 'AUTOPHAGY' 'AXONAL_GROWTH_CONE_FORMATION'
 'B_CELL_MATURATION' 'B-LYMPHOCYTE_DIFF' 'BASOPHIL_DIFF'
 'BONE_MINERALIZATION' 'BROWN_ADIPOGENESIS' 'CARTILAGE_DEVELOPMENT'
 'CELL_ADHESION' 'CELL_CYCLE_BLOCK' 'CELL_CYCLE_EXIT'
 'CELL_CYCLE_PROGRESS_' 'CELL_DEATH' 'CELL_GROWTH' 'CELL_KILLING'
 'CELL_MIGRATION' 'CELL_POLARITY' 'CELL_SHAPE' 'CENTROMERE_ASSEMBLY'
 'CENTROSOME_SEPARATION' 'CEREBRAL_CORTEX_DEVELOPMENT'
 'CHAPERONE-MEDIATED_AUTOPHAGY' 'CHAPERONE-MEDIATED_PROTEIN_FOLDING'
 'CHEMOATTRACTION_OF_AXON' 'CHEMOREPULSION_OF_AXON' 'CHEMOTAXIS'
 'CHROMATINE_CONDENSATION' 'CHROMOSOME_SEGREGATION' 'CILIUM_ASSEMBLY'
 'CILIUM_MOVEMENT' 'CITRIC_ACID_CYCLE'
 'CLEARANCE_OF_FOREIGN_INTRACELLULAR_DNA' 'COLLOID' 'CYTOKINE

In [None]:
# Called with simulation_results=None; the saved output is a dict that maps each phenotype
# to a signed sum of the query genes (e.g. 'PROLIFERATION': 'KRAS + MYC -  TP53').
phenotype_scores(
    genes = ['TP53', 'KRAS', 'MYC', 'SMAD4', 'BCL2', 'BAX'], 
    phenotypes = ['APOPTOSIS', 'DIFFERENTIATION', 'PROLIFERATION'], 
    file_path = 'KG_files/significant_paths_to_phenotypes.txt', 
    simulation_results=None
    )

Path found for 3 phenotypes: ['APOPTOSIS' 'PROLIFERATION' 'DIFFERENTIATION']


{'APOPTOSIS': 'BAX -  BCL2 -  KRAS + TP53',
 'PROLIFERATION': 'KRAS + MYC -  TP53',
 'DIFFERENTIATION': 'MYC + TP53'}

In [None]:
# NOTE(review): this cell repeats the previous call verbatim and has no saved output —
# candidate for removal.
phenotype_scores(
    genes = ['TP53', 'KRAS', 'MYC', 'SMAD4', 'BCL2', 'BAX'], 
    phenotypes = ['APOPTOSIS', 'DIFFERENTIATION', 'PROLIFERATION'], 
    file_path = 'KG_files/significant_paths_to_phenotypes.txt', 
    simulation_results=None
    )

# Test

In [None]:
import pandas as pd
# Raw SIGNOR edge table (tab-separated); printing the columns shows the available fields
signor_file = 'KG_files/SIGNOR_2025_08_14.tsv'
graph_table = pd.read_csv(signor_file, index_col=None, sep='\t')
print(graph_table.columns)

Index(['subject_id', 'object_id', 'subject_id_prefix', 'object_id_prefix',
       'subject_name', 'object_name', 'predicate', 'Primary_Knowledge_Source',
       'Knowledge_Source', 'publications', 'subject_category',
       'object_category', 'score'],
      dtype='object')


In [4]:
# Distribution of the edge scores; the median (~0.47 in the saved output) indicates that a
# cutoff of 0.5 keeps roughly half of the scored edges.
graph_table['score'].describe()

count    40935.000000
mean         0.501978
std          0.235996
min          0.100000
25%          0.278000
50%          0.468000
75%          0.727000
max          1.000000
Name: score, dtype: float64