In [62]:
## Load necessary packages
import os
import pandas as pd
import glob
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

## load the TCT related packages
from TCT import node_normalizer
from TCT import name_resolver
from TCT import translator_metakg
from TCT import translator_kpinfo
from TCT import translator_query
from TCT import TCT

## Release identifiers for this build: the version tag (MM_DD_YYYY) that is
## embedded in the output file names, and the deployment date (YYYY-MM-DD)
version_number, deployment_date = "08_10_2025", "2025-08-10"

In [63]:
## Run the companion notebook so its Biolink lookup tables are defined in this kernel.
## Per its printed summary it provides `category_map` (keyed by subject/object category)
## and `predicate_map` (keyed by (subject category, object category, predicate)).
%run ./Biolink_category_and_predication_dictionary.ipynb

Date of last update:  2025-07-08
Order is to always process Node/category map first, since the Edeg/predicate map depends on biolink-complainat node values
-----------------------------------------------------------------------------------------------------------------------------
Dictionary: category_map, Key template: Subject_category or Object_category
------------------------------------------------------------------------------------------
Dictionary: predicate_map, Key template: (Subject_category, Object_category, Predicate)


In [64]:
# print(category_map)

## Load files and convert them into separate node & edge files
* Check the structure of every imported file

In [65]:
## Root folder holding the raw and parsed Pharmacogenetic Associations files.
## Set the PHARMACOGENOMICS_KG_FILES environment variable to point at your own copy;
## when it is unset, the original author's local folder is used.
kg_files_root = os.environ.get(
    'PHARMACOGENOMICS_KG_FILES',
    '/Users/Weiqi0/ISB_working/Ilya_lab/Translator/Pharmagenomics_KG/files',
)

## Folder with the raw csv input. The final '' keeps the trailing path separator the
## original value had, so code that appends a file name directly to this string keeps working.
raw_files_path = os.path.join(kg_files_root, 'Pharmacogenetic_Associations_table', '')

## Output paths for the node & edge files after formatting (tagged with the version number)
download_path_node_file = os.path.join(
    kg_files_root, 'parsed', f'Pharmacogenetic_Associations_table_parsed_node_{version_number}.tsv'
)
download_path_edge_file = os.path.join(
    kg_files_root, 'parsed', f'Pharmacogenetic_Associations_table_parsed_edge_{version_number}.tsv'
)

In [66]:
## List the csv files found in the raw input folder.
## os.listdir returns entries in arbitrary order, so the names are sorted to keep
## the printed listing stable between runs and machines.
for file_name in sorted(os.listdir(raw_files_path)):
    if file_name.endswith('.csv'):
        print(file_name)

Pharmacogenetic_associations_table.csv


In [67]:
## Read the Pharmacogenetic associations csv.
## os.path.join is used instead of string concatenation so the read still works
## when raw_files_path is edited and no longer ends with a path separator.
source_df = pd.read_csv(os.path.join(raw_files_path, 'Pharmacogenetic_associations_table.csv'))

## Drop rows with a missing value in 'Drug' or 'Gene'
source_df = source_df.dropna(subset=['Drug', 'Gene'])

source_df.head(10)

Unnamed: 0,Drug,Gene,Affected_subgroups,Description_of_Gene_Drug_Interaction,Type
0,Abacavir,HLA-B,*57:01 allele positive,Results in higher adverse reaction risk (hyper...,Data Support Therapeutic Management Recommenda...
1,Abrocitinib,CYP2C19,poor metabolizers,Results in higher systemic concentrations and ...,Data Support Therapeutic Management Recommenda...
2,Amifampridine,NAT2,poor metabolizers,Results in higher systemic concentrations and ...,Data Support Therapeutic Management Recommenda...
3,Amifampridine Phosphate,NAT2,poor metabolizers,Results in higher systemic concentrations. Use...,Data Support Therapeutic Management Recommenda...
4,Amphetamine,CYP2D6,poor metabolizers,May affect systemic concentrations and adverse...,Data Support Therapeutic Management Recommenda...
5,Aripiprazole,CYP2D6,poor metabolizers,Results in higher systemic concentrations and ...,Data Support Therapeutic Management Recommenda...
6,Aripiprazole Lauroxil,CYP2D6,poor metabolizers,Results in higher systemic concentrations. Dos...,Data Support Therapeutic Management Recommenda...
7,Atomoxetine,CYP2D6,poor metabolizers,Results in higher systemic concentrations and ...,Data Support Therapeutic Management Recommenda...
8,Azathioprine,TPMT and/or NUDT15,intermediate or poor metabolizers,Alters systemic active metabolite concentratio...,Data Support Therapeutic Management Recommenda...
9,Belinostat,UGT1A1,*28/*28 (poor metabolizers),May result in higher systemic concentrations a...,Data Support Therapeutic Management Recommenda...


In [68]:
## Collect the distinct values of the Description_of_Gene_Drug_Interaction column
## (printing is left disabled)
unique_description_values = pd.unique(source_df['Description_of_Gene_Drug_Interaction'])
# print(unique_description_values)

In [69]:
## Collect and show the distinct values of the Drug column
unique_drug_values = pd.unique(source_df['Drug'])
print(unique_drug_values)

['Abacavir' 'Abrocitinib' 'Amifampridine' 'Amifampridine Phosphate'
 'Amphetamine' 'Aripiprazole' 'Aripiprazole Lauroxil' 'Atomoxetine'
 'Azathioprine' 'Belinostat' 'Belzutifan' 'Brexpiprazole' 'Brivaracetam'
 'Capecitabine' 'Carbamazepine' 'Celecoxib' 'Citalopram' 'Clobazam'
 'Clopidogrel' 'Clozapine' 'Codeine' 'Deutetrabenazine' 'Dronabinol'
 'Eliglustat' 'Erdafitinib' 'Flibanserin' 'Flurbiprofen' 'Fluorouracil'
 'Fosphenytoin' 'Gefitinib' 'Iloperidone' 'Irinotecan' 'Lofexidine'
 'Meclizine' 'Meloxicam' 'Metoclopramide' 'Mercaptopurine' 'Mivacurium'
 'Nateglinide' 'Oliceridine' 'Pantoprazole' 'Phenytoin' 'Pimozide'
 'Piroxicam' 'Pitolisant' 'Propafenone' 'Sacituzumab Govitecan-hziy'
 'Siponimod' 'Succinylcholine' 'Tacrolimus' 'Tetrabenazine' 'Thioguanine'
 'Thioridazine' 'Tramadol' 'Valbenazine' 'Venlafaxine' 'Vortioxetine'
 'Warfarin' 'Allopurinol' 'Carvedilol' 'Cevimeline' 'Efavirenz'
 'Isoniazid' 'Lapatinib' 'Mavacamten' 'Nilotinib' 'Oxcarbazepine'
 'Pazopanib' 'Perphenazine' 'Pro

In [70]:
## Collect and show the distinct values of the Gene column
unique_gene_values = pd.unique(source_df['Gene'])
print(unique_gene_values)

['HLA-B' 'CYP2C19' 'NAT2' 'CYP2D6' 'TPMT and/or NUDT15' 'UGT1A1'
 'CYP2C19 and/or UGT2B17' 'DPYD' 'CYP2C9' 'BCHE' 'CYP3A5' 'CYP4F2'
 'VKORC1' 'HLA-A' 'CYP2B6' 'Nonspecific (NAT)' 'HLA-DRB1' 'HLA-DQA1'
 'SLCO1B1']


In [71]:
## Tag every row with constant Biolink subject/object categories and a predicate,
## then rename 'Drug' -> 'subject_name' and 'Gene' -> 'object_name'.
## NOTE(review): 'biolink:DrugToGeneAssociation' looks like a Biolink association
## class name rather than a predicate -- confirm it is the value predicate_map expects.
source_df = (
    source_df
    .assign(
        subject_category='biolink:Drug',
        object_category='biolink:Gene',
        predicate='biolink:DrugToGeneAssociation',
    )
    .rename(columns={'Drug': 'subject_name', 'Gene': 'object_name'})
)

In [72]:
## Preview the first five rows (the default row count of head)
source_df.head()

Unnamed: 0,subject_name,object_name,Affected_subgroups,Description_of_Gene_Drug_Interaction,Type,subject_category,object_category,predicate
0,Abacavir,HLA-B,*57:01 allele positive,Results in higher adverse reaction risk (hyper...,Data Support Therapeutic Management Recommenda...,biolink:Drug,biolink:Gene,biolink:DrugToGeneAssociation
1,Abrocitinib,CYP2C19,poor metabolizers,Results in higher systemic concentrations and ...,Data Support Therapeutic Management Recommenda...,biolink:Drug,biolink:Gene,biolink:DrugToGeneAssociation
2,Amifampridine,NAT2,poor metabolizers,Results in higher systemic concentrations and ...,Data Support Therapeutic Management Recommenda...,biolink:Drug,biolink:Gene,biolink:DrugToGeneAssociation
3,Amifampridine Phosphate,NAT2,poor metabolizers,Results in higher systemic concentrations. Use...,Data Support Therapeutic Management Recommenda...,biolink:Drug,biolink:Gene,biolink:DrugToGeneAssociation
4,Amphetamine,CYP2D6,poor metabolizers,May affect systemic concentrations and adverse...,Data Support Therapeutic Management Recommenda...,biolink:Drug,biolink:Gene,biolink:DrugToGeneAssociation


## Execute name resolver to try to find all corresponding identifiers in Translator
* use name_resolver.lookup() function
* use name_resolver.batch_lookup() function for batch mapping

In [73]:
## test case - Name resolver red flag...
## 'Nonspecific (NAT)' is a value from the Gene column, yet the recorded lookup result
## is a CHEBI small molecule ('5MCA-NAT') rather than a gene.
name = 'Nonspecific (NAT)'
input_node_info = name_resolver.lookup(name)
print(input_node_info)

TranslatorNode(curie='CHEBI:93457', label='5MCA-NAT', types=['biolink:SmallMolecule', 'biolink:MolecularEntity', 'biolink:ChemicalEntity', 'biolink:PhysicalEssence', 'biolink:ChemicalOrDrugOrTreatment', 'biolink:ChemicalEntityOrGeneOrGeneProduct', 'biolink:ChemicalEntityOrProteinOrPolypeptide', 'biolink:NamedThing', 'biolink:Entity', 'biolink:PhysicalEssenceOrOccurrent'], synonyms=None, curie_synonyms=None)


In [77]:
## test case - another red flag?
## 'CYP2C19 and/or UGT2B17' is a combined label from the Gene column; the recorded
## lookup result is a MONDO disease rather than a gene.
name = 'CYP2C19 and/or UGT2B17'
input_node_info = name_resolver.lookup(name)
print(input_node_info)

TranslatorNode(curie='MONDO:0015912', label='macrothrombocytopenia and granulocyte inclusions with or without nephritis or sensorineural hearing loss', types=['biolink:Disease', 'biolink:DiseaseOrPhenotypicFeature', 'biolink:BiologicalEntity', 'biolink:ThingWithTaxon', 'biolink:NamedThing', 'biolink:Entity'], synonyms=None, curie_synonyms=None)


In [79]:
## test case - correct
## A plain gene symbol: the recorded lookup result is NCBIGene:590 with label 'BCHE'
## and type 'biolink:Gene'.
name = 'BCHE'
input_node_info = name_resolver.lookup(name)
print(input_node_info)

TranslatorNode(curie='NCBIGene:590', label='BCHE', types=['biolink:Gene', 'biolink:GeneOrGeneProduct', 'biolink:GenomicEntity', 'biolink:ChemicalEntityOrGeneOrGeneProduct', 'biolink:PhysicalEssence', 'biolink:OntologyClass', 'biolink:BiologicalEntity', 'biolink:ThingWithTaxon', 'biolink:NamedThing', 'biolink:Entity', 'biolink:PhysicalEssenceOrOccurrent', 'biolink:MacromolecularMachineMixin'], synonyms=None, curie_synonyms=None)


In [75]:
## Print only the CURIE of the most recent lookup result.
## NOTE(review): the recorded output (CHEBI:93457) matches the 'Nonspecific (NAT)'
## lookup, so this cell was executed out of order; run top to bottom it follows the
## 'BCHE' lookup instead.
print(input_node_info.curie)

CHEBI:93457


In [78]:
## Resolve every drug name in 'subject_name' to a CURIE with name_resolver.batch_lookup
## and store the result in a new 'subject' column.
## NOTE(review): the lookups are not restricted to drug/chemical types; given the
## mis-resolved single-name test cases earlier in this notebook, spot-check the CURIEs.

# Distinct names only, in first-seen order, so a name repeated across rows is looked up once
names = source_df['subject_name'].drop_duplicates().tolist()

# Break into batches of 25
batch_size = 25
batches = [names[start:start + batch_size] for start in range(0, len(names), batch_size)]

# Run batch lookups and collect {name: CURIE or None}
results = {}
for batch in batches:
    lookup_results = name_resolver.batch_lookup(batch)  # Expected to return a dict: {name: result or None}
    # The comprehension keeps its loop variables local, so the notebook-level `name` is untouched
    results.update({
        looked_up_name: (node.curie if node else None)
        for looked_up_name, node in lookup_results.items()
    })

# Map the resolved CURIEs back to the DataFrame; names absent from `results` become NaN
source_df['subject'] = source_df['subject_name'].map(results)