In [1]:
from google.colab import drive
import os
import xml.etree.ElementTree as ET
import pandas as pd

In [2]:
# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Configuration
DRIVE_FOLDER = "/content/drive/My Drive/MsDSAI/January 2025/NLP/Project/Data/"
DRUGBANK_PATH = os.path.join(DRIVE_FOLDER, "drugbank.xml")

Parse the XML File in Python

In [4]:
import xml.etree.ElementTree as ET

tree = ET.parse(DRUGBANK_PATH)
root = tree.getroot()

Explore XML Structure

In [5]:
# Print root tag and immediate children
print("Root tag:", root.tag)

Root tag: {http://www.drugbank.ca}drugbank


In [6]:
# Check the first few elements
for i, child in enumerate(root):
    print(f"\nDrug {i+1}")
    for elem in child:
        print(f"  {elem.tag}: {elem.text}")
    if i == 2:  # Just preview first 3 drugs
        break


Drug 1
  {http://www.drugbank.ca}drugbank-id: DB00001
  {http://www.drugbank.ca}drugbank-id: BTD00024
  {http://www.drugbank.ca}drugbank-id: BIOD00024
  {http://www.drugbank.ca}name: Lepirudin
  {http://www.drugbank.ca}description: Lepirudin is a recombinant hirudin formed by 65 amino acids that acts as a highly specific and direct thrombin inhibitor.[L41539,L41569] Natural hirudin is an endogenous anticoagulant found in _Hirudo medicinalis_ leeches.[L41539] Lepirudin is produced in yeast cells and is identical to natural hirudin except for the absence of sulfate on the tyrosine residue at position 63 and the substitution of leucine for isoleucine at position 1 (N-terminal end).[A246609] 

Lepirudin is used as an anticoagulant in patients with heparin-induced thrombocytopenia (HIT), an immune reaction associated with a high risk of thromboembolic complications.[A3, L41539] HIT is caused by the expression of immunoglobulin G (IgG) antibodies that bind to the complex formed by heparin

Strip Namespace Helper

In [7]:
def strip_ns(tag):
    return tag.split("}")[-1]

In [8]:
for drug in root.findall("./{http://www.drugbank.ca}drug")[:1]:
    for elem in drug:
        print(strip_ns(elem.tag), elem.text)

drugbank-id DB00001
drugbank-id BTD00024
drugbank-id BIOD00024
name Lepirudin
description Lepirudin is a recombinant hirudin formed by 65 amino acids that acts as a highly specific and direct thrombin inhibitor.[L41539,L41569] Natural hirudin is an endogenous anticoagulant found in _Hirudo medicinalis_ leeches.[L41539] Lepirudin is produced in yeast cells and is identical to natural hirudin except for the absence of sulfate on the tyrosine residue at position 63 and the substitution of leucine for isoleucine at position 1 (N-terminal end).[A246609] 

Lepirudin is used as an anticoagulant in patients with heparin-induced thrombocytopenia (HIT), an immune reaction associated with a high risk of thromboembolic complications.[A3, L41539] HIT is caused by the expression of immunoglobulin G (IgG) antibodies that bind to the complex formed by heparin and platelet factor 4. This activates endothelial cells and platelets and enhances the formation of thrombi.[A246609] Bayer ceased the product

Extract Relevant Fields for RAG

In [16]:
drug_data = []
ns = "{http://www.drugbank.ca}"

# Extracting drug information
for drug in root.findall(f"./{ns}drug"):
    data = {}

    # Extract each field, ensuring that missing fields are handled
    for elem in drug:
        tag = strip_ns(elem.tag)  # Remove namespace
        data[tag] = elem.text if elem.text else ''  # Default to empty string if the field is missing

    # Handle any additional fields like drugbank-ids (since it's a list)
    data["drugbank_ids"] = [d.text for d in drug.findall(f"{ns}drugbank-id")]

    # Append the drug data to the list
    drug_data.append(data)

Convert to dataframe and save

In [17]:
# Convert the list of drugs to a DataFrame
df = pd.DataFrame(drug_data)

In [18]:
# Get the list of all columns (tags)
all_columns = set()
for row in drug_data:
    all_columns.update(row.keys())

# Ensure that all rows have the same columns, filling in missing columns with empty strings
all_columns = sorted(all_columns)  # Sorting columns to maintain a consistent order

In [19]:
# Update all rows to ensure consistency with column names
for row in drug_data:
    for col in all_columns:
        if col not in row:
            row[col] = ''  # Fill missing fields with empty string

In [20]:
# Create or overwrite the CSV file
csv_path = os.path.join(DRIVE_FOLDER, "drugbank_clean_v2.csv")
json_path = os.path.join(DRIVE_FOLDER, "drugbank_clean_v2.json")

# Save as CSV
df.to_csv(csv_path, index=False, columns=all_columns)

# Save as JSON
df.to_json(json_path, orient="records", indent=2)

print(f"CSV file saved to {csv_path}")
print(f"JSON file saved to {json_path}")

CSV file saved to /content/drive/My Drive/MsDSAI/January 2025/NLP/Project/Data/drugbank_clean_v2.csv
JSON file saved to /content/drive/My Drive/MsDSAI/January 2025/NLP/Project/Data/drugbank_clean_v2.json


In [21]:
# Save the first 10 rows as a separate CSV file
first_10_rows_csv_path = os.path.join(DRIVE_FOLDER, "drugbank_first_10_rows.csv")

# Slice the DataFrame to get the first 10 rows
df_first_10 = df.head(10)

# Save the first 10 rows to CSV
df_first_10.to_csv(first_10_rows_csv_path, index=False, columns=all_columns)

print(f"First 10 rows saved to {first_10_rows_csv_path}")

First 10 rows saved to /content/drive/My Drive/MsDSAI/January 2025/NLP/Project/Data/drugbank_first_10_rows.csv


EDA

In [63]:
# Select relevant columns
df_cleaned = df[['name', 'description', 'indication', 'mechanism-of-action', 'toxicity', 'state']]

In [64]:
df_cleaned.isnull().sum()

Unnamed: 0,0
name,0
description,0
indication,0
mechanism-of-action,0
toxicity,0
state,8050


In [65]:
df_cleaned.head()

Unnamed: 0,name,description,indication,mechanism-of-action,toxicity,state
0,Lepirudin,Lepirudin is a recombinant hirudin formed by 6...,Lepirudin is indicated for anticoagulation in ...,Lepirudin is a direct thrombin inhibitor used ...,The acute toxicity of intravenous lepirudin wa...,solid
1,Cetuximab,Cetuximab is a recombinant chimeric human/mous...,Cetuximab indicated for the treatment of local...,The epidermal growth factor receptor (EGFR) is...,The intravenous LD50 is > 300 mg/kg in mice an...,liquid
2,Dornase alfa,Dornase alfa is a biosynthetic form of human d...,Used as adjunct therapy in the treatment of cy...,Dornase alfa is a biosynthetic form of human D...,Adverse reactions occur at a frequency of < 1/...,liquid
3,Denileukin diftitox,Denileukin diftitox is an IL2-receptor-directe...,Denileukin diftitox was previously indicated f...,Denileukin diftitox is a fusion protein compos...,There is limited information regarding the acu...,liquid
4,Etanercept,Dimeric fusion protein consisting of the extra...,Etanercept is indicated for the treatment of m...,There are two distinct receptors for TNF (TNFR...,,liquid


In [66]:
# Simplify 'mechanism-of-action' for readability
def simplify_mechanism(mechanism):
    if pd.isna(mechanism):
        return 'Mechanism information not available.'
    # Example of simplification: turn complex terms into user-friendly sentences
    return f"This medicine works by {mechanism.split('.')[0].lower()}."

In [67]:
# Simplify 'toxicity' for readability
def simplify_toxicity(toxicity):
    if pd.isna(toxicity):
        return 'No toxicity information available.'
    return toxicity.split('.')[0]  # Simplify to just a few key symptoms if necessary

In [68]:
# Apply simplifications using .loc to avoid the SettingWithCopyWarning
df_cleaned.loc[:, 'simplified_mechanism'] = df_cleaned['mechanism-of-action'].apply(simplify_mechanism)
df_cleaned.loc[:, 'simplified_toxicity'] = df_cleaned['toxicity'].apply(simplify_toxicity)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.loc[:, 'simplified_mechanism'] = df_cleaned['mechanism-of-action'].apply(simplify_mechanism)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.loc[:, 'simplified_toxicity'] = df_cleaned['toxicity'].apply(simplify_toxicity)


In [69]:
# Remove original 'mechanism-of-action' and 'toxicity' columns as we've simplified them
df_cleaned = df_cleaned.drop(columns=['mechanism-of-action', 'toxicity'])

In [70]:
# Replace missing data with appropriate placeholders for clarity
# df_cleaned['description'] = df_cleaned['description'].fillna('Description not available.')
# df_cleaned['indication'] = df_cleaned['indication'].fillna('Indication not available.')
df_cleaned['state'] = df_cleaned['state'].fillna('State not available.')

In [71]:
df_cleaned.isnull().sum()

Unnamed: 0,0
name,0
description,0
indication,0
state,0
simplified_mechanism,0
simplified_toxicity,0


In [72]:
# Display the first few rows to ensure it's cleaned correctly
df_cleaned.head()

Unnamed: 0,name,description,indication,state,simplified_mechanism,simplified_toxicity
0,Lepirudin,Lepirudin is a recombinant hirudin formed by 6...,Lepirudin is indicated for anticoagulation in ...,solid,This medicine works by lepirudin is a direct t...,The acute toxicity of intravenous lepirudin wa...
1,Cetuximab,Cetuximab is a recombinant chimeric human/mous...,Cetuximab indicated for the treatment of local...,liquid,This medicine works by the epidermal growth fa...,The intravenous LD50 is > 300 mg/kg in mice an...
2,Dornase alfa,Dornase alfa is a biosynthetic form of human d...,Used as adjunct therapy in the treatment of cy...,liquid,This medicine works by dornase alfa is a biosy...,Adverse reactions occur at a frequency of < 1/...
3,Denileukin diftitox,Denileukin diftitox is an IL2-receptor-directe...,Denileukin diftitox was previously indicated f...,liquid,This medicine works by denileukin diftitox is ...,There is limited information regarding the acu...
4,Etanercept,Dimeric fusion protein consisting of the extra...,Etanercept is indicated for the treatment of m...,liquid,This medicine works by there are two distinct ...,


In [73]:
import re
from bs4 import BeautifulSoup
import html  # For handling HTML entities

# Function to clean HTML tags and make it more human-readable
def clean_html(text):
    if text is not None:
        # Remove HTML tags using BeautifulSoup
        clean_text = BeautifulSoup(text, "html.parser").get_text()
        # Decode HTML entities like &lt;, &gt;, &amp;, etc.
        clean_text = html.unescape(clean_text)
        # Additional formatting if necessary, e.g., replace <sub> and <sup> with plain text
        clean_text = re.sub(r'<sub>(.*?)</sub>', r'\1 (subscript)', clean_text)
        clean_text = re.sub(r'<sup>(.*?)</sup>', r'\1 (superscript)', clean_text)
        return clean_text.strip()
    return ""

In [74]:
# Apply the clean_html function to all relevant columns
columns_to_clean = ['name', 'description', 'indication', 'simplified_mechanism', 'simplified_toxicity']
for col in columns_to_clean:
    df_cleaned[col] = df_cleaned[col].apply(clean_html)

In [75]:
# Display the first few rows to ensure it's cleaned correctly
df_cleaned.head()

Unnamed: 0,name,description,indication,state,simplified_mechanism,simplified_toxicity
0,Lepirudin,Lepirudin is a recombinant hirudin formed by 6...,Lepirudin is indicated for anticoagulation in ...,solid,This medicine works by lepirudin is a direct t...,The acute toxicity of intravenous lepirudin wa...
1,Cetuximab,Cetuximab is a recombinant chimeric human/mous...,Cetuximab indicated for the treatment of local...,liquid,This medicine works by the epidermal growth fa...,The intravenous LD50 is > 300 mg/kg in mice an...
2,Dornase alfa,Dornase alfa is a biosynthetic form of human d...,Used as adjunct therapy in the treatment of cy...,liquid,This medicine works by dornase alfa is a biosy...,Adverse reactions occur at a frequency of < 1/...
3,Denileukin diftitox,Denileukin diftitox is an IL2-receptor-directe...,Denileukin diftitox was previously indicated f...,liquid,This medicine works by denileukin diftitox is ...,There is limited information regarding the acu...
4,Etanercept,Dimeric fusion protein consisting of the extra...,Etanercept is indicated for the treatment of m...,liquid,This medicine works by there are two distinct ...,


In [76]:
# Save the cleaned dataframe to a new CSV file
cleaned_file_path = os.path.join(DRIVE_FOLDER, "cleaned_drugbank_data.csv")
df_cleaned.to_csv(cleaned_file_path, index=False)