In [1]:
import pandas as pd
import json
import os

# Configuration: values shared by every cell below.
# DRUG_NAME becomes the 'source' node of every edge the process_* functions build.
DRUG_NAME = "Pralsetinib"
# PubChem compound ID; the input file names used in later cells embed this same number.
CID = "129073603"
# Input/output directory. Later cells build paths by plain string concatenation
# (DATA_FILEPATH + "<file name>"), so the trailing slash is required.
DATA_FILEPATH = 'data/'

# Confirm the setup cell ran and show which drug is configured.
print("Libraries loaded. Project configured for:", DRUG_NAME)

Libraries loaded. Project configured for: Pralsetinib


In [2]:
def process_bioactivity(filename):
    """Build drug -> target 'inhibits' edges from a bioactivity CSV export.

    The CSV must provide the columns 'Target_Name', 'Activity_Type',
    'Activity_Value' and 'BioAssay_AID'. 'Activity_Value' is assumed to be in
    micromolar (uM) and is converted to nanomolar (nM); that unit is an
    assumption about the export, not something this function can verify.

    Values that cannot be parsed as numbers (blank cells, qualifiers such as
    '>10') are discarded. Rows whose 'Target_Name' is missing are kept and
    therefore carry a missing 'target'.

    Args:
        filename: Path of the bioactivity CSV file.

    Returns:
        DataFrame with the columns source, relation, target, value, unit,
        metadata, source_file and target_type, one row per usable measurement.
        A column-less empty DataFrame if the file does not exist.

    Raises:
        KeyError: If one of the required columns is absent.
    """
    print(f"Processing Bioactivity: {filename}...")
    try:
        df = pd.read_csv(filename)
    except FileNotFoundError:
        print("File not found. Skipping.")
        return pd.DataFrame()

    # 1. Select relevant columns based on inspection
    df = df[['Target_Name', 'Activity_Type', 'Activity_Value', 'BioAssay_AID']].copy()

    # 2. Normalize to nM (Nanomolar)
    # Assumption: Raw values are uM (e.g., 0.0003 uM = 0.3 nM)
    # Coerce first: if the column was read as text, `text * 1000` would repeat
    # each string 1000 times instead of scaling a number.
    df['value_nM'] = pd.to_numeric(df['Activity_Value'], errors='coerce') * 1000

    # 3. Filter for valid data (drops blank and unparseable values)
    df = df.dropna(subset=['value_nM'])

    # 4. Create Edge List
    edges = pd.DataFrame({
        'source': DRUG_NAME,
        'relation': 'inhibits',  # Defaulting to 'inhibits' for bioactivity
        'target': df['Target_Name'],
        'value': df['value_nM'],
        'unit': 'nM',
        'metadata': df['Activity_Type'] + " (AID: " + df['BioAssay_AID'].astype(str) + ")",
        'source_file': 'Bioactivity',
        'target_type': 'Protein'
    })

    print(f"  -> Extracted {len(edges)} bioactivity records.")
    return edges

# Run: the file name is derived from CID so the configuration cell stays the
# single place where the compound is chosen.
df_bio = process_bioactivity(DATA_FILEPATH + f"pubchem_cid_{CID}_bioactivity.csv")
df_bio.head()

Processing Bioactivity: data/pubchem_cid_129073603_bioactivity.csv...
  -> Extracted 56 bioactivity records.


Unnamed: 0,source,relation,target,value,unit,metadata,source_file,target_type
0,Pralsetinib,inhibits,RET - ret proto-oncogene (human),0.3,nM,IC50 (AID: 1934929),Bioactivity,Protein
1,Pralsetinib,inhibits,RET - ret proto-oncogene (human),0.3,nM,IC50 (AID: 1895756),Bioactivity,Protein
2,Pralsetinib,inhibits,RET - ret proto-oncogene (human),0.4,nM,IC50 (AID: 1934930),Bioactivity,Protein
3,Pralsetinib,inhibits,RET - ret proto-oncogene (human),0.4,nM,IC50 (AID: 1934928),Bioactivity,Protein
4,Pralsetinib,inhibits,,0.4,nM,IC50 (AID: 1895757),Bioactivity,Protein


In [3]:
def process_targets(filename):
    """Turn a consolidated compound-target CSV into drug -> gene edges.

    Reads the columns 'Gene', 'Action' and 'Source_Target'. The action text is
    lower-cased and translated into a relation verb; a missing or unrecognised
    action becomes the generic relation 'targets'.

    Args:
        filename: Path of the consolidated target CSV file.

    Returns:
        DataFrame with the columns source, relation, target, value, unit,
        metadata, source_file and target_type, one row per input row.
        A column-less empty DataFrame if the file does not exist.
    """
    print(f"Processing Targets: {filename}...")
    try:
        targets_raw = pd.read_csv(filename)
    except FileNotFoundError:
        print("File not found. Skipping.")
        return pd.DataFrame()

    # Lower-cased action word -> relation verb used in the graph.
    verb_groups = (
        ('inhibits', ('inhibitor', 'inhibition', 'antagonist')),
        ('activates', ('agonist', 'activator')),
    )
    relation_for = {}
    for verb, action_words in verb_groups:
        for word in action_words:
            relation_for[word] = verb

    lowered_action = targets_raw['Action'].fillna('targets').astype(str).str.lower()
    relation = lowered_action.map(relation_for).fillna('targets')

    edge_columns = {
        'source': DRUG_NAME,
        'relation': relation,
        'target': targets_raw['Gene'],  # gene symbol doubles as the node ID
        'value': 'Active',
        'unit': 'N/A',
        'metadata': targets_raw['Source_Target'],
        'source_file': 'ConsolidatedTargets',
        'target_type': 'Gene/Protein',
    }
    target_edges = pd.DataFrame(edge_columns)

    print(f"  -> Extracted {len(target_edges)} target records.")
    return target_edges

# Run: the file name is derived from CID so the configuration cell stays the
# single place where the compound is chosen.
df_targets = process_targets(DATA_FILEPATH + f"pubchem_cid_{CID}_consolidatedcompoundtarget.csv")
df_targets.head()

Processing Targets: data/pubchem_cid_129073603_consolidatedcompoundtarget.csv...
  -> Extracted 17 target records.


Unnamed: 0,source,relation,target,value,unit,metadata,source_file,target_type
0,Pralsetinib,inhibits,RET,Active,,Proto-oncogene c-Ret (RET),ConsolidatedTargets,Gene/Protein
1,Pralsetinib,targets,RET,Active,,RET,ConsolidatedTargets,Gene/Protein
2,Pralsetinib,inhibits,PTK2B,Active,,PTK2B,ConsolidatedTargets,Gene/Protein
3,Pralsetinib,inhibits,RET,Active,,ret proto-oncogene,ConsolidatedTargets,Gene/Protein
4,Pralsetinib,targets,EML4,Active,,EML4,ConsolidatedTargets,Gene/Protein


In [4]:
def _disease_edges(table, disease_col, phase_col, source_label, metadata_col=None):
    """Explode a pipe-delimited disease column into one 'treats' edge per disease.

    Args:
        table: Raw DataFrame read from one source file.
        disease_col: Column holding disease names separated by '|'.
        phase_col: Column whose value is copied into the edge 'value'.
        source_label: Text stored in 'source_file'; also used as 'metadata'
            when metadata_col is None.
        metadata_col: Optional column copied into 'metadata'.

    Returns:
        DataFrame of edges; blank disease names are dropped.
    """
    wanted = [disease_col, phase_col]
    if metadata_col is not None:
        wanted.append(metadata_col)
    rows = table[wanted].copy()

    # astype(str) keeps the .str accessor usable even if the column was read
    # as a non-text dtype; missing cells are blanked first and dropped below.
    rows[disease_col] = rows[disease_col].fillna('').astype(str).str.split('|')
    rows = rows.explode(disease_col)
    rows[disease_col] = rows[disease_col].str.strip()
    rows = rows[rows[disease_col] != '']

    return pd.DataFrame({
        'source': DRUG_NAME,
        'relation': 'treats',
        'target': rows[disease_col],
        'value': rows[phase_col],
        'unit': 'Phase',
        'metadata': rows[metadata_col] if metadata_col is not None else source_label,
        'source_file': source_label,
        'target_type': 'Disease'
    })


def process_clinical_and_indications(file_clinical, file_opentargets):
    """Build drug -> disease 'treats' edges from two optional CSV sources.

    The clinical-trials file must provide 'Conditions', 'Phase' and 'CTID';
    the OpenTargets file must provide 'Indication' and 'Max_Phase'. Disease
    cells may list several names separated by '|'; each name becomes its own
    edge. A file that does not exist is reported and skipped.

    Args:
        file_clinical: Path of the clinical-trials CSV file.
        file_opentargets: Path of the OpenTargets drug-indication CSV file.

    Returns:
        DataFrame with the columns source, relation, target, value, unit,
        metadata, source_file and target_type; clinical-trial edges come
        first. A column-less empty DataFrame if neither file exists.

    Raises:
        KeyError: If an existing file lacks one of its required columns.
    """
    # (path, disease column, phase column, source label, metadata column)
    sources = (
        (file_clinical, 'Conditions', 'Phase', 'ClinicalTrials', 'CTID'),
        (file_opentargets, 'Indication', 'Max_Phase', 'OpenTargets', None),
    )

    edges_list = []
    for path, disease_col, phase_col, source_label, metadata_col in sources:
        if not os.path.exists(path):
            # Say so explicitly; the other loaders in this notebook do the same.
            print(f"File not found: {path}. Skipping.")
            continue
        print(f"Processing: {path}")
        edges_list.append(
            _disease_edges(pd.read_csv(path), disease_col, phase_col, source_label, metadata_col)
        )

    if not edges_list:
        return pd.DataFrame()

    combined = pd.concat(edges_list, ignore_index=True)
    print(f"  -> Extracted {len(combined)} clinical records.")
    return combined

# Run: both file names are derived from CID so the configuration cell stays
# the single place where the compound is chosen.
df_clinical = process_clinical_and_indications(
    DATA_FILEPATH + f"pubchem_cid_{CID}_clinicaltrials.csv",
    DATA_FILEPATH + f"pubchem_cid_{CID}_opentargetsdrugindication.csv"
)
df_clinical.head()

Processing: data/pubchem_cid_129073603_clinicaltrials.csv
Processing: data/pubchem_cid_129073603_opentargetsdrugindication.csv
  -> Extracted 79 clinical records.


Unnamed: 0,source,relation,target,value,unit,metadata,source_file,target_type
0,Pralsetinib,treats,Locally Advanced Thyroid Gland Carcinoma,Phase 2,Phase,NCT06482086,ClinicalTrials,Disease
1,Pralsetinib,treats,Solid Tumor,,Phase,NCT05525858,ClinicalTrials,Disease
2,Pralsetinib,treats,Advanced Solid Tumor,,Phase,NCT05525858,ClinicalTrials,Disease
3,Pralsetinib,treats,Metastatic Cancer,,Phase,NCT05525858,ClinicalTrials,Disease
4,Pralsetinib,treats,Advanced Unresectable or Metastatic Solid Mali...,Phase 2,Phase,NCT04632992,ClinicalTrials,Disease


In [5]:
def _extract_neighbor_name(item):
    """Return the neighbour name carried by one link item, or None if absent.

    Lookup order: Evidence.ChemicalNeighbor, then
    Evidence.ChemicalGeneSymbolNeighbor, then a top-level 'NeighborName'.
    The first evidence key that is present decides the result, even when its
    name is empty.
    """
    if not isinstance(item, dict):
        return None

    evidence = item.get('Evidence')
    if not isinstance(evidence, dict):
        # A null or malformed 'Evidence' must not abort the whole file.
        evidence = {}

    for key in ('ChemicalNeighbor', 'ChemicalGeneSymbolNeighbor'):
        if key in evidence:
            neighbor = evidence[key]
            name = neighbor.get('NeighborName') if isinstance(neighbor, dict) else None
            return name or None

    # Fallback: direct property if the structure varies.
    return item.get('NeighborName') or None


def process_json_cooc(filename, relation_label, target_type):
    """Build co-occurrence edges from a neighbour JSON export.

    Expected layout: root -> 'LinkDataSet' -> 'LinkData' -> list of items
    ('LinkDataSet' may also be the list itself). Items that are not objects
    or that carry no non-empty neighbour name are skipped individually.

    Args:
        filename: Path of the JSON file; also stored in 'source_file'.
        relation_label: Relation assigned to every edge.
        target_type: Node type assigned to every target.

    Returns:
        DataFrame with the columns source, relation, target, value, unit,
        metadata, source_file and target_type. A column-less empty DataFrame
        if the file is missing, unreadable, not valid JSON, or yields no names.
    """
    print(f"Processing JSON: {filename}...")
    try:
        # JSON interchange files are UTF-8; do not depend on the platform default.
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print("  File not found. Skipping.")
        return pd.DataFrame()
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"  Error parsing JSON: {e}")
        return pd.DataFrame()

    # 1. Navigate to the actual list of data
    link_dataset = data.get('LinkDataSet') if isinstance(data, dict) else None
    if isinstance(link_dataset, list):
        source_list = link_dataset
    elif isinstance(link_dataset, dict):
        source_list = link_dataset.get('LinkData')
    else:
        source_list = None
    if not isinstance(source_list, list):
        source_list = []

    # 2. Collect every usable neighbour name
    extracted_data = [name for name in map(_extract_neighbor_name, source_list) if name]

    # 3. Create DataFrame
    if not extracted_data:
        print("  No data extracted. JSON structure might not match expected paths.")
        return pd.DataFrame()

    edges = pd.DataFrame({
        'source': DRUG_NAME,
        'relation': relation_label,
        'target': extracted_data,
        'value': 'Co-occurrence',
        'unit': 'Text Mining',
        'metadata': 'PubChem Neighbor',
        'source_file': filename,
        'target_type': target_type
    })
    print(f"  -> Extracted {len(edges)} records.")
    return edges

# Run for both literature co-occurrence exports. The file names are derived
# from CID so the configuration cell stays the single place where the compound
# is chosen.
df_lit = process_json_cooc(
    DATA_FILEPATH + f"Chemical_Co-Occurrences-in-Literature_CID_{CID}.json",
    "co_occurs_with_chemical",
    "Chemical"
)
df_gene = process_json_cooc(
    DATA_FILEPATH + f"Chemical_Gene-Co-Occurrences-in-Literature_CID_{CID}.json",
    "co_occurs_with_gene",
    "Gene"
)

Processing JSON: data/Chemical_Co-Occurrences-in-Literature_CID_129073603.json...
  -> Extracted 100 records.
Processing JSON: data/Chemical_Gene-Co-Occurrences-in-Literature_CID_129073603.json...
  -> Extracted 100 records.


In [6]:
# Preview the first chemical co-occurrence edges.
df_lit.head()

Unnamed: 0,source,relation,target,value,unit,metadata,source_file,target_type
0,Pralsetinib,co_occurs_with_chemical,Selpercatinib,Co-occurrence,Text Mining,PubChem Neighbor,data/Chemical_Co-Occurrences-in-Literature_CID...,Chemical
1,Pralsetinib,co_occurs_with_chemical,Cabozantinib,Co-occurrence,Text Mining,PubChem Neighbor,data/Chemical_Co-Occurrences-in-Literature_CID...,Chemical
2,Pralsetinib,co_occurs_with_chemical,Vandetanib,Co-occurrence,Text Mining,PubChem Neighbor,data/Chemical_Co-Occurrences-in-Literature_CID...,Chemical
3,Pralsetinib,co_occurs_with_chemical,L-Tyrosine,Co-occurrence,Text Mining,PubChem Neighbor,data/Chemical_Co-Occurrences-in-Literature_CID...,Chemical
4,Pralsetinib,co_occurs_with_chemical,2-Amino-4-chloropyridine,Co-occurrence,Text Mining,PubChem Neighbor,data/Chemical_Co-Occurrences-in-Literature_CID...,Chemical


In [7]:
# Preview the first gene co-occurrence edges.
df_gene.head()

Unnamed: 0,source,relation,target,value,unit,metadata,source_file,target_type
0,Pralsetinib,co_occurs_with_gene,ret proto-oncogene,Co-occurrence,Text Mining,PubChem Neighbor,data/Chemical_Gene-Co-Occurrences-in-Literatur...,Gene
1,Pralsetinib,co_occurs_with_gene,tyrosine kinase non receptor 1,Co-occurrence,Text Mining,PubChem Neighbor,data/Chemical_Gene-Co-Occurrences-in-Literatur...,Gene
2,Pralsetinib,co_occurs_with_gene,kinesin family member 5b,Co-occurrence,Text Mining,PubChem Neighbor,data/Chemical_Gene-Co-Occurrences-in-Literatur...,Gene
3,Pralsetinib,co_occurs_with_gene,coiled-coil domain containing 6,Co-occurrence,Text Mining,PubChem Neighbor,data/Chemical_Gene-Co-Occurrences-in-Literatur...,Gene
4,Pralsetinib,co_occurs_with_gene,zinc finger mynd-type containing 10,Co-occurrence,Text Mining,PubChem Neighbor,data/Chemical_Gene-Co-Occurrences-in-Literatur...,Gene


In [8]:
def _text_or_blank(value):
    """Return value as text, mapping None/NaN to an empty string."""
    if value is None or pd.isna(value):
        return ''
    return str(value)


def _format_pmid(value):
    """Render a PMID cell as text: 'N/A' when missing, no trailing '.0' for floats."""
    if value is None or pd.isna(value):
        return 'N/A'
    # A column containing blanks is read as float, turning 12345 into 12345.0.
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def process_adverse_events(filename):
    """Text-mine adverse-event mentions from a literature CSV.

    Each row's 'Title' and 'Abstract' (either may be absent or empty) are
    joined, lower-cased and searched for a fixed keyword list. Matching is a
    plain substring test, so a keyword embedded in a longer word also counts.
    Every standardized term is reported at most once per article, even when
    several of its synonyms occur.

    Args:
        filename: Path of the literature CSV file.

    Returns:
        DataFrame with the columns source, relation, target, value, unit,
        metadata, source_file and target_type, one row per (article, term)
        mention; 'metadata' holds 'PMID: <id>' or 'PMID: N/A'.
        A column-less empty DataFrame if the file does not exist or nothing
        matched.
    """
    print(f"Processing Adverse Events in: {filename}...")
    try:
        df = pd.read_csv(filename)
    except FileNotFoundError:
        print("  File not found. Skipping.")
        return pd.DataFrame()

    # 1. Define the "Dirty" Keywords (Common Kinase Side Effects)
    # keyword as it appears in text -> standardized node name
    ae_keywords = {
        'hypertension': 'Hypertension',
        'high blood pressure': 'Hypertension',
        'neutropenia': 'Neutropenia',
        'anemia': 'Anemia',
        'pneumonitis': 'Pneumonitis',
        'hepatotoxicity': 'Hepatotoxicity',
        'fatigue': 'Fatigue',
        'diarrhea': 'Diarrhea',
        'constipation': 'Constipation'
    }

    found_events = []

    # 2. Scan Titles and Abstracts
    for _, row in df.iterrows():
        text = (_text_or_blank(row.get('Title')) + " " + _text_or_blank(row.get('Abstract'))).lower()
        pmid_label = f"PMID: {_format_pmid(row.get('PMID'))}"

        terms_seen = set()  # standardized terms already emitted for this article
        for keyword, standardized_term in ae_keywords.items():
            if keyword not in text or standardized_term in terms_seen:
                continue
            terms_seen.add(standardized_term)
            found_events.append({
                'source': DRUG_NAME,
                'relation': 'associated_with',
                'target': standardized_term,
                'value': 'Text Mining',
                'unit': 'Mention',
                'metadata': pmid_label,
                'source_file': 'Literature_Abstracts',
                'target_type': 'Adverse Event'
            })

    if not found_events:
        print("  No adverse events found with current keywords.")
        return pd.DataFrame()

    edges = pd.DataFrame(found_events)
    print(f"  -> Extracted {len(edges)} adverse event mentions.")
    return edges

# Run: the file name is derived from CID so the configuration cell stays the
# single place where the compound is chosen.
df_ae = process_adverse_events(DATA_FILEPATH + f"pubchem_cid_{CID}_literature.csv")

Processing Adverse Events in: data/pubchem_cid_129073603_literature.csv...
  -> Extracted 54 adverse event mentions.


In [9]:
# 1. Gather all dataframes
# The edge tables are looked up by name so this cell still runs when an
# earlier cell was skipped and its variable was never defined. Empty frames
# (e.g. from a missing input file) are left out as well.
# NOTE(review): this relies on locals() exposing the notebook's top-level
# variables; it would stop finding them if this code were moved into a function.
possible_dfs = ['df_bio', 'df_targets', 'df_clinical', 'df_lit', 'df_gene', 'df_ae']

dataframes_to_merge = []
for df_name in possible_dfs:
    if df_name in locals() and not locals()[df_name].empty:
        dataframes_to_merge.append(locals()[df_name])

if not dataframes_to_merge:
    print("No data available to merge.")
else:
    # 2. Concatenate into one edge table with a fresh 0..n-1 index
    kg_edges = pd.concat(dataframes_to_merge, ignore_index=True)

    # 3. Clean Data
    # Drop edges whose target is missing or an empty string, then remove rows
    # that are identical in every column.
    kg_edges = kg_edges.dropna(subset=['target'])
    kg_edges = kg_edges[kg_edges['target'] != '']
    kg_edges = kg_edges.drop_duplicates()

    # 4. Create Node List
    # Every edge source is a node of type 'Drug'.
    nodes_src = kg_edges[['source']].rename(columns={'source': 'id'})
    nodes_src['type'] = 'Drug'

    if 'target_type' in kg_edges.columns:
        # Preferred path: the edge builders already labelled each target.
        nodes_tgt = kg_edges[['target', 'target_type']].rename(
            columns={'target': 'id', 'target_type': 'type'}
        )
    else:
        # Fallback when no edge table carries 'target_type': guess the node
        # type from the wording of the relation.
        nodes_tgt = kg_edges[['target', 'relation']].rename(columns={'target': 'id'})

        def assign_type(rel):
            """Map a relation label to a node type; the first matching rule wins."""
            rel = str(rel).lower()
            if 'inhibits' in rel or 'inhibitor' in rel or 'inhibition' in rel:
                return 'Protein'
            if 'targets' in rel:
                return 'Gene/Protein'
            if 'treats' in rel:
                return 'Disease'
            if 'gene' in rel:
                return 'Gene'
            if 'chemical' in rel:
                return 'Chemical'
            if 'side_effect' in rel or 'adverse' in rel or 'associated_with' in rel:
                return 'Adverse Event'
            return 'Entity'

        nodes_tgt['type'] = nodes_tgt['relation'].apply(assign_type)
        nodes_tgt = nodes_tgt.drop(columns=['relation'])

    # One row per node id. drop_duplicates keeps the first occurrence, so an id
    # that appears with several types keeps the type it was first seen with
    # (source nodes are listed first).
    kg_nodes = pd.concat([nodes_src, nodes_tgt]).drop_duplicates(subset=['id'])

    # 5. Save Final Files
    print("--- Summary ---")
    print(f"Total Edges: {len(kg_edges)}")
    print(f"Total Nodes: {len(kg_nodes)}")

    # The target type now lives in the node list, so it is removed from the
    # edge table before the edge file is written.
    if 'target_type' in kg_edges.columns:
        kg_edges = kg_edges.drop(columns=['target_type'])

    kg_edges.to_csv(DATA_FILEPATH + "kg_edges_v2.csv", index=False)
    kg_nodes.to_csv(DATA_FILEPATH + "kg_nodes_v2.csv", index=False)

    print("DONE! Files 'kg_edges_v2.csv' and 'kg_nodes_v2.csv' are ready.")

--- Summary ---
Total Edges: 375
Total Nodes: 285
DONE! Files 'kg_edges_v2.csv' and 'kg_nodes_v2.csv' are ready.
