In [None]:
from google.colab import drive
import os
import xml.etree.ElementTree as ET
import pandas as pd

In [None]:
# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Configuration
DRIVE_FOLDER = "/content/drive/My Drive/MsDSAI/January 2025/NLP/Project/Data/"
DRUGBANK_PATH = os.path.join(DRIVE_FOLDER, "drugbank.xml")

Parse the XML File in Python

In [None]:
import xml.etree.ElementTree as ET

tree = ET.parse(DRUGBANK_PATH)
root = tree.getroot()

Explore XML Structure

In [None]:
# Print root tag and immediate children
print("Root tag:", root.tag)

Root tag: {http://www.drugbank.ca}drugbank


In [None]:
# Check the first few elements
for i, child in enumerate(root):
    print(f"\nDrug {i+1}")
    for elem in child:
        print(f"  {elem.tag}: {elem.text}")
    if i == 2:  # Just preview first 3 drugs
        break


Drug 1
  {http://www.drugbank.ca}drugbank-id: DB00001
  {http://www.drugbank.ca}drugbank-id: BTD00024
  {http://www.drugbank.ca}drugbank-id: BIOD00024
  {http://www.drugbank.ca}name: Lepirudin
  {http://www.drugbank.ca}description: Lepirudin is a recombinant hirudin formed by 65 amino acids that acts as a highly specific and direct thrombin inhibitor.[L41539,L41569] Natural hirudin is an endogenous anticoagulant found in _Hirudo medicinalis_ leeches.[L41539] Lepirudin is produced in yeast cells and is identical to natural hirudin except for the absence of sulfate on the tyrosine residue at position 63 and the substitution of leucine for isoleucine at position 1 (N-terminal end).[A246609] 

Lepirudin is used as an anticoagulant in patients with heparin-induced thrombocytopenia (HIT), an immune reaction associated with a high risk of thromboembolic complications.[A3, L41539] HIT is caused by the expression of immunoglobulin G (IgG) antibodies that bind to the complex formed by heparin

Strip Namespace Helper

In [None]:
def strip_ns(tag):
    return tag.split("}")[-1]

In [None]:
for drug in root.findall("./{http://www.drugbank.ca}drug")[:1]:
    for elem in drug:
        print(strip_ns(elem.tag), elem.text)

drugbank-id DB00001
drugbank-id BTD00024
drugbank-id BIOD00024
name Lepirudin
description Lepirudin is a recombinant hirudin formed by 65 amino acids that acts as a highly specific and direct thrombin inhibitor.[L41539,L41569] Natural hirudin is an endogenous anticoagulant found in _Hirudo medicinalis_ leeches.[L41539] Lepirudin is produced in yeast cells and is identical to natural hirudin except for the absence of sulfate on the tyrosine residue at position 63 and the substitution of leucine for isoleucine at position 1 (N-terminal end).[A246609] 

Lepirudin is used as an anticoagulant in patients with heparin-induced thrombocytopenia (HIT), an immune reaction associated with a high risk of thromboembolic complications.[A3, L41539] HIT is caused by the expression of immunoglobulin G (IgG) antibodies that bind to the complex formed by heparin and platelet factor 4. This activates endothelial cells and platelets and enhances the formation of thrombi.[A246609] Bayer ceased the product

Extract Relevant Fields for RAG

In [None]:
drug_data = []
ns = "{http://www.drugbank.ca}"

# Extracting drug information
for drug in root.findall(f"./{ns}drug"):
    data = {}

    # Extract each field, ensuring that missing fields are handled
    for elem in drug:
        tag = strip_ns(elem.tag)  # Remove namespace
        data[tag] = elem.text if elem.text else ''  # Default to empty string if the field is missing

    # Handle any additional fields like drugbank-ids (since it's a list)
    data["drugbank_ids"] = [d.text for d in drug.findall(f"{ns}drugbank-id")]

    # Append the drug data to the list
    drug_data.append(data)

Convert to dataframe and save

In [None]:
# Convert the list of drugs to a DataFrame
df = pd.DataFrame(drug_data)

In [None]:
# Get the list of all columns (tags)
all_columns = set()
for row in drug_data:
    all_columns.update(row.keys())

# Ensure that all rows have the same columns, filling in missing columns with empty strings
all_columns = sorted(all_columns)  # Sorting columns to maintain a consistent order

In [None]:
# Update all rows to ensure consistency with column names
for row in drug_data:
    for col in all_columns:
        if col not in row:
            row[col] = ''  # Fill missing fields with empty string

In [None]:
# Create or overwrite the CSV file
csv_path = os.path.join(DRIVE_FOLDER, "drugbank_clean_v2.csv")
json_path = os.path.join(DRIVE_FOLDER, "drugbank_clean_v2.json")

# Save as CSV
df.to_csv(csv_path, index=False, columns=all_columns)

# Save as JSON
df.to_json(json_path, orient="records", indent=2)

print(f"CSV file saved to {csv_path}")
print(f"JSON file saved to {json_path}")

CSV file saved to /content/drive/My Drive/MsDSAI/January 2025/NLP/Project/Data/drugbank_clean_v2.csv
JSON file saved to /content/drive/My Drive/MsDSAI/January 2025/NLP/Project/Data/drugbank_clean_v2.json


In [None]:
# Save the first 10 rows as a separate CSV file
first_10_rows_csv_path = os.path.join(DRIVE_FOLDER, "drugbank_first_10_rows.csv")

# Slice the DataFrame to get the first 10 rows
df_first_10 = df.head(10)

# Save the first 10 rows to CSV
df_first_10.to_csv(first_10_rows_csv_path, index=False, columns=all_columns)

print(f"First 10 rows saved to {first_10_rows_csv_path}")

First 10 rows saved to /content/drive/My Drive/MsDSAI/January 2025/NLP/Project/Data/drugbank_first_10_rows.csv


EDA

In [None]:
# Select relevant columns
df_cleaned = df[['name', 'description', 'indication', 'mechanism-of-action', 'toxicity', 'state']]

In [None]:
df_cleaned.isnull().sum()

Unnamed: 0,0
name,0
description,0
indication,0
mechanism-of-action,0
toxicity,0
state,8050


In [None]:
df_cleaned.head()

Unnamed: 0,name,description,indication,mechanism-of-action,toxicity,state
0,Lepirudin,Lepirudin is a recombinant hirudin formed by 6...,Lepirudin is indicated for anticoagulation in ...,Lepirudin is a direct thrombin inhibitor used ...,The acute toxicity of intravenous lepirudin wa...,solid
1,Cetuximab,Cetuximab is a recombinant chimeric human/mous...,Cetuximab indicated for the treatment of local...,The epidermal growth factor receptor (EGFR) is...,The intravenous LD50 is > 300 mg/kg in mice an...,liquid
2,Dornase alfa,Dornase alfa is a biosynthetic form of human d...,Used as adjunct therapy in the treatment of cy...,Dornase alfa is a biosynthetic form of human D...,Adverse reactions occur at a frequency of < 1/...,liquid
3,Denileukin diftitox,Denileukin diftitox is an IL2-receptor-directe...,Denileukin diftitox was previously indicated f...,Denileukin diftitox is a fusion protein compos...,There is limited information regarding the acu...,liquid
4,Etanercept,Dimeric fusion protein consisting of the extra...,Etanercept is indicated for the treatment of m...,There are two distinct receptors for TNF (TNFR...,,liquid


In [None]:
# Simplify 'mechanism-of-action' for readability
def simplify_mechanism(mechanism):
    if pd.isna(mechanism):
        return 'Mechanism information not available.'
    # Example of simplification: turn complex terms into user-friendly sentences
    return f"This medicine works by {mechanism.split('.')[0].lower()}."

In [None]:
# Simplify 'toxicity' for readability
def simplify_toxicity(toxicity):
    if pd.isna(toxicity):
        return 'No toxicity information available.'
    return toxicity.split('.')[0]  # Simplify to just a few key symptoms if necessary

In [None]:
# Apply simplifications using .loc to avoid the SettingWithCopyWarning
df_cleaned.loc[:, 'simplified_mechanism'] = df_cleaned['mechanism-of-action'].apply(simplify_mechanism)
df_cleaned.loc[:, 'simplified_toxicity'] = df_cleaned['toxicity'].apply(simplify_toxicity)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.loc[:, 'simplified_mechanism'] = df_cleaned['mechanism-of-action'].apply(simplify_mechanism)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.loc[:, 'simplified_toxicity'] = df_cleaned['toxicity'].apply(simplify_toxicity)


In [None]:
# Remove original 'mechanism-of-action' and 'toxicity' columns as we've simplified them
df_cleaned = df_cleaned.drop(columns=['mechanism-of-action', 'toxicity'])

In [None]:
# Replace missing data with appropriate placeholders for clarity
# df_cleaned['description'] = df_cleaned['description'].fillna('Description not available.')
# df_cleaned['indication'] = df_cleaned['indication'].fillna('Indication not available.')
df_cleaned['state'] = df_cleaned['state'].fillna('State not available.')

In [None]:
df_cleaned.isnull().sum()

Unnamed: 0,0
name,0
description,0
indication,0
state,0
simplified_mechanism,0
simplified_toxicity,0


In [None]:
# Display the first few rows to ensure it's cleaned correctly
df_cleaned.head()

Unnamed: 0,name,description,indication,state,simplified_mechanism,simplified_toxicity
0,Lepirudin,Lepirudin is a recombinant hirudin formed by 6...,Lepirudin is indicated for anticoagulation in ...,solid,This medicine works by lepirudin is a direct t...,The acute toxicity of intravenous lepirudin wa...
1,Cetuximab,Cetuximab is a recombinant chimeric human/mous...,Cetuximab indicated for the treatment of local...,liquid,This medicine works by the epidermal growth fa...,The intravenous LD50 is > 300 mg/kg in mice an...
2,Dornase alfa,Dornase alfa is a biosynthetic form of human d...,Used as adjunct therapy in the treatment of cy...,liquid,This medicine works by dornase alfa is a biosy...,Adverse reactions occur at a frequency of < 1/...
3,Denileukin diftitox,Denileukin diftitox is an IL2-receptor-directe...,Denileukin diftitox was previously indicated f...,liquid,This medicine works by denileukin diftitox is ...,There is limited information regarding the acu...
4,Etanercept,Dimeric fusion protein consisting of the extra...,Etanercept is indicated for the treatment of m...,liquid,This medicine works by there are two distinct ...,


In [None]:
import re
from bs4 import BeautifulSoup
import html  # For handling HTML entities

# Function to clean HTML tags and make it more human-readable
def clean_html(text):
    if text is not None:
        # Remove HTML tags using BeautifulSoup
        clean_text = BeautifulSoup(text, "html.parser").get_text()
        # Decode HTML entities like &lt;, &gt;, &amp;, etc.
        clean_text = html.unescape(clean_text)
        # Additional formatting if necessary, e.g., replace <sub> and <sup> with plain text
        clean_text = re.sub(r'<sub>(.*?)</sub>', r'\1 (subscript)', clean_text)
        clean_text = re.sub(r'<sup>(.*?)</sup>', r'\1 (superscript)', clean_text)
        return clean_text.strip()
    return ""

In [None]:
# Apply the clean_html function to all relevant columns
columns_to_clean = ['name', 'description', 'indication', 'simplified_mechanism', 'simplified_toxicity']
for col in columns_to_clean:
    df_cleaned[col] = df_cleaned[col].apply(clean_html)

In [None]:
# Display the first few rows to ensure it's cleaned correctly
df_cleaned.head()

Unnamed: 0,name,description,indication,state,simplified_mechanism,simplified_toxicity
0,Lepirudin,Lepirudin is a recombinant hirudin formed by 6...,Lepirudin is indicated for anticoagulation in ...,solid,This medicine works by lepirudin is a direct t...,The acute toxicity of intravenous lepirudin wa...
1,Cetuximab,Cetuximab is a recombinant chimeric human/mous...,Cetuximab indicated for the treatment of local...,liquid,This medicine works by the epidermal growth fa...,The intravenous LD50 is > 300 mg/kg in mice an...
2,Dornase alfa,Dornase alfa is a biosynthetic form of human d...,Used as adjunct therapy in the treatment of cy...,liquid,This medicine works by dornase alfa is a biosy...,Adverse reactions occur at a frequency of < 1/...
3,Denileukin diftitox,Denileukin diftitox is an IL2-receptor-directe...,Denileukin diftitox was previously indicated f...,liquid,This medicine works by denileukin diftitox is ...,There is limited information regarding the acu...
4,Etanercept,Dimeric fusion protein consisting of the extra...,Etanercept is indicated for the treatment of m...,liquid,This medicine works by there are two distinct ...,


In [None]:
# Save the cleaned dataframe to a new CSV file
cleaned_file_path = os.path.join(DRIVE_FOLDER, "cleaned_drugbank_data.csv")
df_cleaned.to_csv(cleaned_file_path, index=False)

In [1]:
!pip install faiss-cpu sentence-transformers pandas numpy

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_6

In [2]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd

In [3]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os

DRIVE_FOLDER = "/content/drive/My Drive/MsDSAI/January 2025/NLP/Project/Data/"
# Load the updated CSV file
df_loaded = pd.read_csv(os.path.join(DRIVE_FOLDER, "finalized_drugbank_data.csv"))

In [5]:
# Use a SentenceTransformer model to generate embeddings for drug names or descriptions
embedder = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
# Assume we're generating embeddings based on the 'name' column or 'description' column
# embeddings = embedder.encode(df_loaded['name'].tolist())
embeddings = embedder.encode(df_loaded['name'].tolist(), normalize_embeddings=True)

In [10]:
# Save the embeddings as an NPY file
embedding_path = os.path.join(DRIVE_FOLDER, "drug_name_embeddings_v3.npy")
np.save(embedding_path, embeddings)

In [12]:
# Create a FAISS index from the embeddings
dimension = embeddings.shape[1]  # The number of features in the embedding (e.g., 384 for 'MiniLM')
# index = faiss.IndexFlatL2(dimension)  # L2 distance for similarity search
index = faiss.IndexFlatIP(dimension)  # IP = Inner Product = Cosine similarity
index.add(embeddings)

In [13]:
# Save the FAISS index to disk
faiss_path = os.path.join(DRIVE_FOLDER, "drug_name_index_v3.faiss")
faiss.write_index(index, faiss_path)

print("FAISS index and NPY embeddings files have been regenerated.")

FAISS index and NPY embeddings files have been regenerated.


In [14]:
# Combine the columns for each drug
drug_info_combined = [
    f"{desc} {ind} {mechanism} {tox}"
    for desc, ind, mechanism, tox in zip(
        df_loaded["description"].fillna(""),
        df_loaded["indication"].fillna(""),
        df_loaded["simplified_mechanism"].fillna(""),
        df_loaded["simplified_toxicity"].fillna("")
    )
]

# Check the first few combined entries
drug_info_combined[:3]

['Lepirudin is a recombinant hirudin formed by 65 amino acids that acts as a highly specific and direct thrombin inhibitor.[L41539,L41569] Natural hirudin is an endogenous anticoagulant found in _Hirudo medicinalis_ leeches.[L41539] Lepirudin is produced in yeast cells and is identical to natural hirudin except for the absence of sulfate on the tyrosine residue at position 63 and the substitution of leucine for isoleucine at position 1 (N-terminal end).[A246609] \r\n\r\nLepirudin is used as an anticoagulant in patients with heparin-induced thrombocytopenia (HIT), an immune reaction associated with a high risk of thromboembolic complications.[A3, L41539] HIT is caused by the expression of immunoglobulin G (IgG) antibodies that bind to the complex formed by heparin and platelet factor 4. This activates endothelial cells and platelets and enhances the formation of thrombi.[A246609] Bayer ceased the production of lepirudin (Refludan) effective May 31, 2012.[L41574] Lepirudin is indicated f

In [16]:
# Generate embeddings for drug information (combined description, indication, etc.)
drug_info_embeddings = embedder.encode(drug_info_combined, normalize_embeddings=True)

# Check the shape of the embeddings
print(drug_info_embeddings.shape)

(17441, 384)


In [17]:
import faiss
import numpy as np

# Create a FAISS index for the drug information
# drug_info_index = faiss.IndexFlatL2(drug_info_embeddings.shape[1])
drug_info_index = faiss.IndexFlatIP(drug_info_embeddings.shape[1])  # IP = Inner Product = Cosine similarity

# Add the embeddings to the index
drug_info_index.add(np.array(drug_info_embeddings))

# Verify the index size
print(f"Number of items in FAISS index: {drug_info_index.ntotal}")

Number of items in FAISS index: 17441


In [18]:
# Save the FAISS index to disk
drug_info_faiss_path = os.path.join(DRIVE_FOLDER, "drug_info_index_v3.faiss")
faiss.write_index(drug_info_index, drug_info_faiss_path)

In [19]:
# Save the embeddings for drug information
drug_info_embedding_path = os.path.join(DRIVE_FOLDER, "drug_info_embeddings_v3.npy")
np.save(drug_info_embedding_path, drug_info_embeddings)

In [20]:
# Example search for a drug (using part of the drug information)
query = "Calpol 500 mg"

# Encode the query (already normalized by embedder)
query_embedding = embedder.encode([query])

# Search the FAISS index
D, I = drug_info_index.search(np.array(query_embedding), k=1)

# Get best match index and similarity score
best_idx = I[0][0]
best_similarity = D[0][0]  # Cosine similarity score

# Map to name
best_match = df_loaded.iloc[best_idx]['name']

# Convert similarity to match percentage
match_percentage = best_similarity * 100

# Display result
print(f"Best match for query '{query}': {best_match} (Confidence: {match_percentage:.2f}%)")

Best match for query 'Calpol 500 mg': Calpol 500 mg (Confidence: 72.51%)
