In [83]:
## Load necessary packages
import os
import pandas as pd
import glob
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

## Define the version number
version_number = "07_08_2025"
deployment_date = "2025-07-08"

## Load files and convert them into separate node & edge files
* check all imported file structure

In [84]:
## Notice!! Please change the file path of following codes into your own
raw_files_path = '/Users/Weiqi0/ISB_working/Ilya_lab/Translator/Pharmagenomics_KG/files/pharmGKB/files/'

## Define the output path for node & edge files after formatting
download_path_node_file = f'/Users/Weiqi0/ISB_working/Ilya_lab/Translator/Pharmagenomics_KG/files/parsed/PharmGKB_parsed_node_{version_number}.tsv'
download_path_edge_file = f'/Users/Weiqi0/ISB_working/Ilya_lab/Translator/Pharmagenomics_KG/files/parsed/PharmGKB_parsed_edge_{version_number}.tsv'

In [85]:
## Check all node files being read
## Read all BigGIM node csv file in group 1

for f in os.listdir(raw_files_path):
    if f.endswith('.tsv'):
        print(f)

relationships.tsv
phenotypes.tsv
genes.tsv
chemicals.tsv
drugs.tsv
variants.tsv


In [86]:
## Read each individual csv files
edges_df = pd.read_csv(raw_files_path + 'relationships.tsv', sep = '\t')

## count rows
print(len(edges_df))

62016


In [87]:
edges_df.columns

Index(['Entity1_id', 'Entity1_name', 'Entity1_type', 'Entity2_id',
       'Entity2_name', 'Entity2_type', 'Evidence', 'Association', 'PK', 'PD',
       'PMIDs'],
      dtype='object')

In [88]:
## check unique relation
unique_relation_values = edges_df['Association'].unique()
print("All possible relation are here: " , unique_relation_values)

All possible relation are here:  ['associated' 'ambiguous' 'not associated']


In [89]:
## filter to only include the associated relation
edges_filtered_df = edges_df[edges_df['Association'] == 'associated']
## count rows
print(len(edges_filtered_df))

52050


In [90]:
## Then only keep the needed columns
edges_filtered_df2 = edges_filtered_df[['Entity1_id', 'Entity1_name', 'Entity1_type', 'Entity2_id',
       'Entity2_name', 'Entity2_type', 'Evidence', 'Association','PMIDs']]

edges_filtered_df2.head()

Unnamed: 0,Entity1_id,Entity1_name,Entity1_type,Entity2_id,Entity2_name,Entity2_type,Evidence,Association,PMIDs
0,PA142672624,ANKFN1,Gene,PA447288,Essential hypertension,Disease,ClinicalAnnotation,associated,25695618
1,PA142672624,ANKFN1,Gene,PA449899,hydrochlorothiazide,Chemical,ClinicalAnnotation,associated,25695618
3,PA31744,NQO1,Gene,PA128406956,fluorouracil,Chemical,ClinicalAnnotation,associated,18511948;25545243
4,PA31744,NQO1,Gene,PA131285527,oxaliplatin,Chemical,"ClinicalAnnotation,Literature,MultilinkAnnotation",associated,24924344;25545243
5,PA31744,NQO1,Gene,PA151958383,Gastrointestinal Stromal Tumors,Disease,ClinicalAnnotation,associated,30237583


In [91]:
## Check the knowledge_source column again
## Count occurrences of each unique value in 'knowledge_source'
counts = edges_filtered_df2['Entity1_type'].value_counts()

print(counts)

Entity1_type
Chemical     14804
Gene         12296
Variant      10873
Disease       9471
Haplotype     4606
Name: count, dtype: int64


In [92]:
## Check the knowledge_source column again
## Count occurrences of each unique value in 'knowledge_source'
counts = edges_filtered_df2['Entity2_type'].value_counts()

print(counts)

Entity2_type
Chemical     14804
Gene         12296
Variant      10873
Disease       9471
Haplotype     4606
Name: count, dtype: int64


In [93]:
## Read in corresponding node files
## Read each individual tsv files
genes_df = pd.read_csv(raw_files_path + 'genes.tsv', sep = '\t')

## select only needed columns and separate into node & edge files
genes_node_subject_df = genes_df[['PharmGKB Accession Id', 'NCBI Gene ID']]

chemicals_df = pd.read_csv(raw_files_path + 'chemicals.tsv', sep = '\t')

## select only needed columns and separate into node & edge files
chemicals_node_subject_df = chemicals_df[['PharmGKB Accession Id', 'PubChem Compound Identifiers']]
chemicals_node_subject_df['PubChem Compound Identifiers'] = pd.to_numeric(chemicals_node_subject_df['PubChem Compound Identifiers'], errors='coerce').astype('Int64')

drugs_df = pd.read_csv(raw_files_path + 'drugs.tsv', sep = '\t')

## select only needed columns and separate into node & edge files
drugs_node_subject_df = drugs_df[['PharmGKB Accession Id', 'PubChem Compound Identifiers']]
drugs_node_subject_df['PubChem Compound Identifiers'] = pd.to_numeric(drugs_node_subject_df['PubChem Compound Identifiers'], errors='coerce').astype('Int64')

phenotypes_df = pd.read_csv(raw_files_path + 'phenotypes.tsv', sep = '\t')

## select only needed columns and separate into node & edge files
phenotypes_node_subject_df = phenotypes_df[['PharmGKB Accession Id', 'External Vocabulary']]

variants_df = pd.read_csv(raw_files_path + 'variants.tsv', sep = '\t')

## select only needed columns and separate into node & edge files
variants_node_subject_df = variants_df[['Variant ID', 'Gene IDs']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chemicals_node_subject_df['PubChem Compound Identifiers'] = pd.to_numeric(chemicals_node_subject_df['PubChem Compound Identifiers'], errors='coerce').astype('Int64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drugs_node_subject_df['PubChem Compound Identifiers'] = pd.to_numeric(drugs_node_subject_df['PubChem Compound Identifiers'], errors='coerce').astype('Int64')


In [94]:
genes_node_subject_df.head()

Unnamed: 0,PharmGKB Accession Id,NCBI Gene ID
0,PA24356,1
1,PA165392995,503538
2,PA162375098,29974
3,PA24357,2
4,PA142670460,144568


## Dealing with gene df

In [95]:
## Create a biolink_id column for genes df
## format NCBIGene:XX
genes_node_subject_df['biolink_id'] = 'NCBIGene:' + genes_node_subject_df['NCBI Gene ID'].astype(str)


## rename those columns into desired format
genes_node_subject_df.rename(columns={'PharmGKB Accession Id': 'PharmGKB_id'}, inplace=True)

genes_final_df = genes_node_subject_df[['PharmGKB_id', 'biolink_id']]
# genes_final_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genes_node_subject_df['biolink_id'] = 'NCBIGene:' + genes_node_subject_df['NCBI Gene ID'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genes_node_subject_df.rename(columns={'PharmGKB Accession Id': 'PharmGKB_id'}, inplace=True)


In [96]:
chemicals_node_subject_df.head()

Unnamed: 0,PharmGKB Accession Id,PubChem Compound Identifiers
0,PA166250381,59272813
1,PA166250405,117693284
2,PA166250403,59272796
3,PA166178620,10
4,PA166131395,101987375


## Dealing with chemicals df

In [97]:
## Create a biolink_id column for genes df
## format NCBIGene:XX
chemicals_node_subject_df['biolink_id'] = 'PUBCHEM.COMPOUND:' + chemicals_node_subject_df['PubChem Compound Identifiers'].astype(str)


## rename those columns into desired format
chemicals_node_subject_df.rename(columns={'PharmGKB Accession Id': 'PharmGKB_id'}, inplace=True)

chemicals_final_df = chemicals_node_subject_df[['PharmGKB_id', 'biolink_id']]
chemicals_final_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chemicals_node_subject_df['biolink_id'] = 'PUBCHEM.COMPOUND:' + chemicals_node_subject_df['PubChem Compound Identifiers'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chemicals_node_subject_df.rename(columns={'PharmGKB Accession Id': 'PharmGKB_id'}, inplace=True)


Unnamed: 0,PharmGKB_id,biolink_id
0,PA166250381,PUBCHEM.COMPOUND:59272813
1,PA166250405,PUBCHEM.COMPOUND:117693284
2,PA166250403,PUBCHEM.COMPOUND:59272796
3,PA166178620,PUBCHEM.COMPOUND:10
4,PA166131395,PUBCHEM.COMPOUND:101987375


In [98]:
phenotypes_node_subject_df.head()

Unnamed: 0,PharmGKB Accession Id,External Vocabulary
0,PA446220,"MeSH:D015746(Abdominal Pain), SnoMedCT:2072050..."
1,PA443224,"""MeSH:D000014(Abnormalities, Drug-Induced)"", ""..."
2,PA444117,"MeSH:D005124(Eye Abnormalities), SnoMedCT:1941..."
3,PA443227,"""MeSH:D000022(Abortion, Spontaneous)"", ""SnoMed..."
4,PA443235,"MeSH:D000038(Abscess), SnoMedCT:128477000(Absc..."


## Dealing with drugs df

In [99]:
## Create a biolink_id column for genes df
## format NCBIGene:XX
drugs_node_subject_df['biolink_id'] = 'PUBCHEM.COMPOUND:' + drugs_node_subject_df['PubChem Compound Identifiers'].astype(str)


## rename those columns into desired format
drugs_node_subject_df.rename(columns={'PharmGKB Accession Id': 'PharmGKB_id'}, inplace=True)

drugs_final_df = drugs_node_subject_df[['PharmGKB_id', 'biolink_id']]
drugs_final_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drugs_node_subject_df['biolink_id'] = 'PUBCHEM.COMPOUND:' + drugs_node_subject_df['PubChem Compound Identifiers'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drugs_node_subject_df.rename(columns={'PharmGKB Accession Id': 'PharmGKB_id'}, inplace=True)


Unnamed: 0,PharmGKB_id,biolink_id
0,PA166238901,PUBCHEM.COMPOUND:23671798
1,PA166238881,PUBCHEM.COMPOUND:9547222
2,PA166238883,PUBCHEM.COMPOUND:23671797
3,PA166238882,PUBCHEM.COMPOUND:23671799
4,PA166238921,PUBCHEM.COMPOUND:71316159


## Dealing with phenotypes df
* https://github.com/TranslatorSRI/NodeNormalization/blob/master/documentation/NodeNormalization.ipynb
* all following concepts are acceptable
<!-- * "UMLS": "225822",
      "SNOMEDCT": "152614",
      "MEDDRA": "23228",
      "NCIT": "39158",
      "MONDO": "44526",
      "ORPHANET": "18282",
      "MESH": "21066",
      "HP": "3478",
      "DOID": "19624",
      "OMIM": "28954",
      "EFO": "3820",
      "ICD10": "24",
      "ICD9": "12",
      "MP": "4",
      "medgen": "4" -->

In [100]:
# Define regex pattern for the prefixes
pattern = r'(MeSH:[^\(]*|UMLS:[^\(]*|SnoMedCT:[^\(]*|MONDO:[^\(]*|HP:[^\(]*|MedDRA:[^\(]*)(?=\()'

# Extract all matches (as lists)
phenotypes_node_subject_df['matched_codes'] = phenotypes_node_subject_df['External Vocabulary'].str.findall(pattern)

phenotypes_node_subject_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenotypes_node_subject_df['matched_codes'] = phenotypes_node_subject_df['External Vocabulary'].str.findall(pattern)


Unnamed: 0,PharmGKB Accession Id,External Vocabulary,matched_codes
0,PA446220,"MeSH:D015746(Abdominal Pain), SnoMedCT:2072050...","[MeSH:D015746, SnoMedCT:207205003, SnoMedCT:21..."
1,PA443224,"""MeSH:D000014(Abnormalities, Drug-Induced)"", ""...","[MeSH:D000014, UMLS:C0000771, MONDO:MONDO:0016..."
2,PA444117,"MeSH:D005124(Eye Abnormalities), SnoMedCT:1941...","[MeSH:D005124, SnoMedCT:19416009, SnoMedCT:204..."
3,PA443227,"""MeSH:D000022(Abortion, Spontaneous)"", ""SnoMed...","[MeSH:D000022, SnoMedCT:17369002, SnoMedCT:198..."
4,PA443235,"MeSH:D000038(Abscess), SnoMedCT:128477000(Absc...","[MeSH:D000038, SnoMedCT:128477000, SnoMedCT:20..."


In [101]:
pd.set_option('display.max_rows', None)

# Define ordered prefix list (priority)
# Define priority list
priority_order = ['MeSH', 'UMLS', 'MONDO', 'HP', 'MedDRA', 'SnoMedCT']

# Function to select first matching code by prefix priority
def select_first_by_priority(code_list):
    if not isinstance(code_list, list):
        return None
    for prefix in priority_order:
        for code in code_list:
            if isinstance(code, str) and code.startswith(prefix + ':'):
                return code
    return None

# Apply function to new column
phenotypes_node_subject_df['first_code'] = phenotypes_node_subject_df['matched_codes'].apply(select_first_by_priority)

# print(phenotypes_node_subject_df[['PharmGKB Accession Id', 'first_code']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenotypes_node_subject_df['first_code'] = phenotypes_node_subject_df['matched_codes'].apply(select_first_by_priority)


In [102]:
import re

# Function to remove repeated prefix
def remove_duplicate_prefix(code):
    if isinstance(code, str):
        return re.sub(r'^([A-Z]+):\1:', r'\1:', code)
    return code

# Apply to 'first_code' column
phenotypes_node_subject_df['first_code'] = phenotypes_node_subject_df['first_code'].apply(remove_duplicate_prefix)

# print(phenotypes_node_subject_df[['PharmGKB Accession Id', 'first_code']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenotypes_node_subject_df['first_code'] = phenotypes_node_subject_df['first_code'].apply(remove_duplicate_prefix)


In [103]:
## rename those columns into desired format
phenotypes_node_subject_df.rename(columns={'PharmGKB Accession Id': 'PharmGKB_id', 'first_code': 'biolink_id'}, inplace=True)

phenotypes_final_df = phenotypes_node_subject_df[['PharmGKB_id', 'biolink_id']]

## exclude rows with Null values in the biolink_id column
phenotypes_final_df = phenotypes_final_df.dropna(subset=['biolink_id'])

phenotypes_final_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenotypes_node_subject_df.rename(columns={'PharmGKB Accession Id': 'PharmGKB_id', 'first_code': 'biolink_id'}, inplace=True)


Unnamed: 0,PharmGKB_id,biolink_id
0,PA446220,MeSH:D015746
1,PA443224,MeSH:D000014
2,PA444117,MeSH:D005124
3,PA443227,MeSH:D000022
4,PA443235,MeSH:D000038


In [104]:
variants_node_subject_df.head()

Unnamed: 0,Variant ID,Gene IDs
0,PA166156302,PA395
1,PA166156746,PA142671652
2,PA166195421,PA361
3,PA166177121,
4,PA166156636,


## Dealing with variants df 
* not implemented for now

In [105]:
## Left join on both columns
# variants_merged_df = pd.merge(variants_node_subject_df, df2, how='left', on=['PharmGKB_id', 'Gene IDs'])

## Concatenate those parsed dataframes
* genes_final_df
* drugs_final_df
* chemicals_final_df
* phenotypes_final_df

In [106]:
## vertical concatenation
all_node_df = pd.concat([genes_final_df, drugs_final_df, chemicals_final_df, phenotypes_final_df])
print(len(all_node_df))

all_node_df = all_node_df.drop_duplicates()
print(len(all_node_df))

all_node_df.head()

35533
31762


Unnamed: 0,PharmGKB_id,biolink_id
0,PA24356,NCBIGene:1
1,PA165392995,NCBIGene:503538
2,PA162375098,NCBIGene:29974
3,PA24357,NCBIGene:2
4,PA142670460,NCBIGene:144568


In [107]:
edges_filtered_df2.head()

Unnamed: 0,Entity1_id,Entity1_name,Entity1_type,Entity2_id,Entity2_name,Entity2_type,Evidence,Association,PMIDs
0,PA142672624,ANKFN1,Gene,PA447288,Essential hypertension,Disease,ClinicalAnnotation,associated,25695618
1,PA142672624,ANKFN1,Gene,PA449899,hydrochlorothiazide,Chemical,ClinicalAnnotation,associated,25695618
3,PA31744,NQO1,Gene,PA128406956,fluorouracil,Chemical,ClinicalAnnotation,associated,18511948;25545243
4,PA31744,NQO1,Gene,PA131285527,oxaliplatin,Chemical,"ClinicalAnnotation,Literature,MultilinkAnnotation",associated,24924344;25545243
5,PA31744,NQO1,Gene,PA151958383,Gastrointestinal Stromal Tumors,Disease,ClinicalAnnotation,associated,30237583


In [108]:
edges_filtered_df2.columns

Index(['Entity1_id', 'Entity1_name', 'Entity1_type', 'Entity2_id',
       'Entity2_name', 'Entity2_type', 'Evidence', 'Association', 'PMIDs'],
      dtype='object')

In [109]:
## Then left join with the edge df to obtain subject id and object ids

## Left join on two columns with different names
edges_subject_id_df = pd.merge(
    edges_filtered_df2, all_node_df, how='left',
    left_on=['Entity1_id'],
    right_on = ['PharmGKB_id'],
)

print(len(edges_subject_id_df))

## exclude rows with Null values in the PharmGKB_id column
edges_subject_id_df = edges_subject_id_df.dropna(subset=['PharmGKB_id'])

print(len(edges_subject_id_df))

52050
36401


In [110]:
## rename the added biolink_id to be subject
## rename those columns into desired format
edges_subject_id_df.rename(columns={'biolink_id': 'subject', 'Entity1_name': "subject_name"}, inplace=True)

## drop not needed columns
edges_subject_id_df = edges_subject_id_df.drop(columns=['Entity1_id', 'PharmGKB_id'])

edges_subject_id_df.head()

Unnamed: 0,subject_name,Entity1_type,Entity2_id,Entity2_name,Entity2_type,Evidence,Association,PMIDs,subject
0,ANKFN1,Gene,PA447288,Essential hypertension,Disease,ClinicalAnnotation,associated,25695618,NCBIGene:162282
1,ANKFN1,Gene,PA449899,hydrochlorothiazide,Chemical,ClinicalAnnotation,associated,25695618,NCBIGene:162282
2,NQO1,Gene,PA128406956,fluorouracil,Chemical,ClinicalAnnotation,associated,18511948;25545243,NCBIGene:1728
3,NQO1,Gene,PA131285527,oxaliplatin,Chemical,"ClinicalAnnotation,Literature,MultilinkAnnotation",associated,24924344;25545243,NCBIGene:1728
4,NQO1,Gene,PA151958383,Gastrointestinal Stromal Tumors,Disease,ClinicalAnnotation,associated,30237583,NCBIGene:1728


In [111]:
## Then left join with the edge df to obtain subject id and object ids

## Left join on two columns with different names
edges_object_id_df = pd.merge(
    edges_subject_id_df, all_node_df, how='left',
    left_on=['Entity2_id'],
    right_on = ['PharmGKB_id'],
)

print(len(edges_object_id_df))

## exclude rows with Null values in the PharmGKB_id column
edges_object_id_df = edges_object_id_df.dropna(subset=['PharmGKB_id'])

print(len(edges_object_id_df))

36401
20958


In [112]:
## rename the added biolink_id to be subject
## rename those columns into desired format
edges_object_id_df.rename(columns={'biolink_id': 'object', 'Entity2_name': "object_name", 'PMIDs':'publications'}, inplace=True)

## drop not needed columns
edges_object_id_df = edges_object_id_df.drop(columns=['Entity2_id', 'PharmGKB_id', 'Evidence'])

edges_object_id_df.head()

Unnamed: 0,subject_name,Entity1_type,object_name,Entity2_type,Association,publications,subject,object
0,ANKFN1,Gene,Essential hypertension,Disease,associated,25695618,NCBIGene:162282,MeSH:D000075222
1,ANKFN1,Gene,hydrochlorothiazide,Chemical,associated,25695618,NCBIGene:162282,PUBCHEM.COMPOUND:3639
2,NQO1,Gene,fluorouracil,Chemical,associated,18511948;25545243,NCBIGene:1728,PUBCHEM.COMPOUND:3385
3,NQO1,Gene,oxaliplatin,Chemical,associated,24924344;25545243,NCBIGene:1728,PUBCHEM.COMPOUND:6857599
4,NQO1,Gene,Gastrointestinal Stromal Tumors,Disease,associated,30237583,NCBIGene:1728,MeSH:D046152


In [113]:
## Load the Biolink category and predicate dictionary for mapping subject, object, and predicate types
%run ./Biolink_category_and_predication_dictionary.ipynb

Date of last update:  2025-07-08
Order is to always process Node/category map first, since the Edeg/predicate map depends on biolink-complainat node values
-----------------------------------------------------------------------------------------------------------------------------
Dictionary: category_map, Key template: Subject_category or Object_category
------------------------------------------------------------------------------------------
Dictionary: predicate_map, Key template: (Subject_category, Object_category, Predicate)


In [114]:
## deep copy a df
edges_deep_copy_df = edges_object_id_df.copy(deep=True)

## deprecated
# edges_deep_copy_df['subject_category'] = 'biolink:' + edges_deep_copy_df['Entity1_type'].astype(str)
# edges_deep_copy_df['object_category'] = 'biolink:' + edges_deep_copy_df['Entity2_type'].astype(str)

## assign biolink node category using dictionary directly
edges_deep_copy_df['subject_category'] = (
    edges_deep_copy_df['Entity1_type'].map(category_map)
)

edges_deep_copy_df['object_category'] = (
    edges_deep_copy_df['Entity2_type'].map(category_map)
)

In [115]:
## Group by predicate
grouped = edges_deep_copy_df.groupby('Association')

## For each predicate, output unique (subject_category, object_category) pairs
for predicate, group in grouped:
    print(f"\nPredicate: {predicate}")
    pairs = group[['subject_category', 'object_category']].drop_duplicates()
    for _, row in pairs.iterrows():
        print(f"  ({row['subject_category']}, {row['object_category']})")


Predicate: associated
  (biolink:Gene, biolink:Disease)
  (biolink:Gene, biolink:ChemicalEntity)
  (biolink:ChemicalEntity, biolink:Gene)
  (biolink:ChemicalEntity, biolink:ChemicalEntity)
  (biolink:Disease, biolink:Gene)
  (biolink:Gene, biolink:Gene)
  (biolink:Disease, biolink:Disease)


In [116]:

## depracted
# ## create predicates column
# list_of_display_relations = ['associated']

# relation_predicate_map = {
#     'associated': 'biolink:associated_with',
# }

# ## Initialize new column with default values (original id as string)
# edges_deep_copy_df['predicate'] = edges_deep_copy_df['Association'].astype(str)

# ## Mask
# mask = edges_deep_copy_df['Association'].isin(list_of_display_relations)
# edges_deep_copy_df.loc[mask, 'predicate'] = (
#     edges_deep_copy_df.loc[mask, 'Association'].map(relation_predicate_map)
# )

edges_deep_copy_df['predicate'] = edges_deep_copy_df.apply(
    lambda row: predicate_map.get((row['subject_category'], row['object_category'], row['Association'])),
    axis=1
)

## drop not needed columns
edges_deep_copy_df = edges_deep_copy_df.drop(columns=['Entity1_type', 'Entity2_type', "Association"])

## add a new knowledge_souce column and set value to be "PrimeKG"
edges_deep_copy_df['knowledge_source'] = 'PharmGKB'
## add a new knowledge_level column and set value to be 'knowledge_assertion'
edges_deep_copy_df['knowledge_level'] = 'knowledge_assertion'
## add a new agent_type column and set value to be 'manual_agent'
edges_deep_copy_df['agent_type'] = 'automated_agent'

edges_deep_copy_df['deploy_date'] = deployment_date

## create a context_qualifier column and fill na
## if all of them are empty then fill na
edges_deep_copy_df['context_qualifier'] = np.nan

edges_deep_copy_df.head(5)

Unnamed: 0,subject_name,object_name,publications,subject,object,subject_category,object_category,predicate,knowledge_source,knowledge_level,agent_type,deploy_date,context_qualifier
0,ANKFN1,Essential hypertension,25695618,NCBIGene:162282,MeSH:D000075222,biolink:Gene,biolink:Disease,biolink:associated_with,PharmGKB,knowledge_assertion,automated_agent,2025-07-08,
1,ANKFN1,hydrochlorothiazide,25695618,NCBIGene:162282,PUBCHEM.COMPOUND:3639,biolink:Gene,biolink:ChemicalEntity,biolink:associated_with,PharmGKB,knowledge_assertion,automated_agent,2025-07-08,
2,NQO1,fluorouracil,18511948;25545243,NCBIGene:1728,PUBCHEM.COMPOUND:3385,biolink:Gene,biolink:ChemicalEntity,biolink:associated_with,PharmGKB,knowledge_assertion,automated_agent,2025-07-08,
3,NQO1,oxaliplatin,24924344;25545243,NCBIGene:1728,PUBCHEM.COMPOUND:6857599,biolink:Gene,biolink:ChemicalEntity,biolink:associated_with,PharmGKB,knowledge_assertion,automated_agent,2025-07-08,
4,NQO1,Gastrointestinal Stromal Tumors,30237583,NCBIGene:1728,MeSH:D046152,biolink:Gene,biolink:Disease,biolink:associated_with,PharmGKB,knowledge_assertion,automated_agent,2025-07-08,


In [117]:
import uuid
import pandas as pd

## generate uuid from column combination
def generate_uuid_from_columns(df, column_list, namespace=uuid.NAMESPACE_DNS):
    """
    Generates UUIDs based on the values in a specified column of a Pandas DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame.
        column_list (list): List of all names of columns to use for UUID generation.
        namespace (uuid.UUID): A UUID namespace (default is uuid.NAMESPACE_DNS).

    Returns:
        pd.Series: A Pandas Series containing the generated UUIDs.
    """
    return df[column_list].apply(lambda x: uuid.uuid5(namespace, str(x)).hex)

def generate_uuid(row):
    """
    Generates a UUID based on the combined values of multiple columns.
    """
    combined_string = ''.join(row.astype(str))
    return uuid.uuid5(uuid.NAMESPACE_DNS, combined_string)

In [118]:
## copy to a final df
edge_df = edges_deep_copy_df.copy(deep = True)

### Add resources_id column, checking whether edge is already
column_list = ['subject', 'predicate', 'object', 'context_qualifier', 'deploy_date']
# Apply the function to each row to generate UUIDs
edge_df['id'] = edge_df[column_list].apply(generate_uuid, axis=1)

# edge_df['id'] = generate_uuid_from_columns(edge_df, column_list)
edge_df.head(5)

Unnamed: 0,subject_name,object_name,publications,subject,object,subject_category,object_category,predicate,knowledge_source,knowledge_level,agent_type,deploy_date,context_qualifier,id
0,ANKFN1,Essential hypertension,25695618,NCBIGene:162282,MeSH:D000075222,biolink:Gene,biolink:Disease,biolink:associated_with,PharmGKB,knowledge_assertion,automated_agent,2025-07-08,,3f4622cd-cc81-5c60-9fce-4d25214b9b86
1,ANKFN1,hydrochlorothiazide,25695618,NCBIGene:162282,PUBCHEM.COMPOUND:3639,biolink:Gene,biolink:ChemicalEntity,biolink:associated_with,PharmGKB,knowledge_assertion,automated_agent,2025-07-08,,2ebd74fd-1cb6-519c-9051-4e0e5880a57d
2,NQO1,fluorouracil,18511948;25545243,NCBIGene:1728,PUBCHEM.COMPOUND:3385,biolink:Gene,biolink:ChemicalEntity,biolink:associated_with,PharmGKB,knowledge_assertion,automated_agent,2025-07-08,,3e9e9eee-e33e-5903-b6c0-fd6f960ab809
3,NQO1,oxaliplatin,24924344;25545243,NCBIGene:1728,PUBCHEM.COMPOUND:6857599,biolink:Gene,biolink:ChemicalEntity,biolink:associated_with,PharmGKB,knowledge_assertion,automated_agent,2025-07-08,,5dfcff5b-dea4-53f7-b9ff-1db8d23309df
4,NQO1,Gastrointestinal Stromal Tumors,30237583,NCBIGene:1728,MeSH:D046152,biolink:Gene,biolink:Disease,biolink:associated_with,PharmGKB,knowledge_assertion,automated_agent,2025-07-08,,6149843b-5b1d-562b-b2e9-82391a552db3


In [119]:
## Drop rows where 'name' is NaN, None, or empty string
edge_df = edge_df[~edge_df['predicate'].isna() & (edge_df['predicate'].str.strip() != '')]

## throw away those rows which either subject_category, object_category, or predicate is not start with "biolink:" prefix
## since they cannot be biolink-compliant converted
## Keep only rows where all three columns start with 'biolink:'
edge_df = edge_df[
    edge_df['subject_category'].str.startswith('biolink:') &
    edge_df['object_category'].str.startswith('biolink:') &
    edge_df['predicate'].str.startswith('biolink:')
]

### Now create the corresponding node file
* only need three columns: id, name, category

In [120]:
node_subject_df = edge_df[['subject', 'subject_name', 'subject_category']]
node_object_df = edge_df[['object', 'object_name', 'object_category']]

## rename those columns into desired format
node_subject_df.rename(columns={'subject': 'id', 'subject_name': 'name', 'subject_category': 'category'}, inplace=True)
node_object_df.rename(columns={'object': 'id', 'object_name': 'name', 'object_category': 'category'}, inplace=True)

concat_node_df = pd.concat([node_subject_df, node_object_df]).drop_duplicates(keep='first')

concat_node_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  node_subject_df.rename(columns={'subject': 'id', 'subject_name': 'name', 'subject_category': 'category'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  node_object_df.rename(columns={'object': 'id', 'object_name': 'name', 'object_category': 'category'}, inplace=True)


Unnamed: 0,id,name,category
0,NCBIGene:162282,ANKFN1,biolink:Gene
2,NCBIGene:1728,NQO1,biolink:Gene
21,NCBIGene:6813,STXBP2,biolink:Gene
22,PUBCHEM.COMPOUND:33,chloroacetaldehyde,biolink:ChemicalEntity
23,NCBIGene:8000,PSCA,biolink:Gene


In [121]:
## drop not needed columns
edge_final_df = edge_df.drop(columns=['subject_name', 'object_name'])

edge_final_df.head(5)

Unnamed: 0,publications,subject,object,subject_category,object_category,predicate,knowledge_source,knowledge_level,agent_type,deploy_date,context_qualifier,id
0,25695618,NCBIGene:162282,MeSH:D000075222,biolink:Gene,biolink:Disease,biolink:associated_with,PharmGKB,knowledge_assertion,automated_agent,2025-07-08,,3f4622cd-cc81-5c60-9fce-4d25214b9b86
1,25695618,NCBIGene:162282,PUBCHEM.COMPOUND:3639,biolink:Gene,biolink:ChemicalEntity,biolink:associated_with,PharmGKB,knowledge_assertion,automated_agent,2025-07-08,,2ebd74fd-1cb6-519c-9051-4e0e5880a57d
2,18511948;25545243,NCBIGene:1728,PUBCHEM.COMPOUND:3385,biolink:Gene,biolink:ChemicalEntity,biolink:associated_with,PharmGKB,knowledge_assertion,automated_agent,2025-07-08,,3e9e9eee-e33e-5903-b6c0-fd6f960ab809
3,24924344;25545243,NCBIGene:1728,PUBCHEM.COMPOUND:6857599,biolink:Gene,biolink:ChemicalEntity,biolink:associated_with,PharmGKB,knowledge_assertion,automated_agent,2025-07-08,,5dfcff5b-dea4-53f7-b9ff-1db8d23309df
4,30237583,NCBIGene:1728,MeSH:D046152,biolink:Gene,biolink:Disease,biolink:associated_with,PharmGKB,knowledge_assertion,automated_agent,2025-07-08,,6149843b-5b1d-562b-b2e9-82391a552db3


In [122]:
## Create a graph from the DataFrame
graph = nx.from_pandas_edgelist(edge_final_df, 'subject', 'object', edge_attr='predicate')

## Print graph information
print('Number of nodes', len(set(graph.nodes)))
print('Number of edges', len(set(graph.edges)))
print('Average degree', sum(dict(graph.degree).values()) / len(graph.nodes))

Number of nodes 3217
Number of edges 9919
Average degree 6.16661485856388


## Now download the concatenated node & edge files

In [123]:
## download both node and edge files
## Download the result df
concat_node_df.to_csv(download_path_node_file, sep ='\t', index=False)
edge_final_df.to_csv(download_path_edge_file, sep ='\t', index=False)

In [124]:
print("The formatted node file will be saved in this path: ", download_path_node_file)
print("The formatted edge file will be saved in this path: ", download_path_edge_file)

The formatted node file will be saved in this path:  /Users/Weiqi0/ISB_working/Ilya_lab/Translator/Pharmagenomics_KG/files/parsed/PharmGKB_parsed_node_07_08_2025.tsv
The formatted edge file will be saved in this path:  /Users/Weiqi0/ISB_working/Ilya_lab/Translator/Pharmagenomics_KG/files/parsed/PharmGKB_parsed_edge_07_08_2025.tsv


## following codes are used for quality control and sanity check
* check and confirm all subject & object types are correctly formatted

In [125]:
## check all unique predicate values
counts = edge_final_df['subject_category'].value_counts()

print(counts)

subject_category
biolink:Gene              11702
biolink:ChemicalEntity     5907
biolink:Disease            3349
Name: count, dtype: int64


In [126]:
## check all unique predicate values
counts = edge_final_df['object_category'].value_counts()

print(counts)

object_category
biolink:Gene              11702
biolink:ChemicalEntity     5907
biolink:Disease            3349
Name: count, dtype: int64


In [127]:
## check all unique predicate values
counts = edge_final_df['predicate'].value_counts()

print(counts)

predicate
biolink:associated_with    20958
Name: count, dtype: int64
