In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

# Parse the XML file
tree = ET.parse("extracted_drugs_fixed.xml")
root = tree.getroot()

data = []
columns = set(["drugbank-id", "name"])  # Initial columns

def parse_properties(properties, prefix):
    prop_dict = {}
    prop_counts = {}  # Track counts for duplicate properties

    for prop in properties.findall("property"):
        kind = prop.find("kind").text.strip().lower().replace(" ", "_")
        value = prop.find("value").text.strip()

        # Manage duplicate property names by appending a counter
        if kind in prop_counts:
            prop_counts[kind] += 1
            column_name = f"{prefix}_{kind}_{prop_counts[kind]}"
        else:
            prop_counts[kind] = 1
            column_name = f"{prefix}_{kind}_1"

        prop_dict[column_name] = value
        columns.add(column_name)  # Dynamically add new columns

    return prop_dict

for drug in root.findall("drug"):
    drug_data = {
        "drugbank-id": drug.find("drugbank-id").text.strip(),
        "name": drug.find("name").text.strip()
    }

    # Parse calculated properties
    calc_props = drug.find("calculated-properties")
    if calc_props:
        drug_data.update(parse_properties(calc_props, "c"))

    # Parse experimental properties
    exp_props = drug.find("experimental-properties")
    if exp_props:
        drug_data.update(parse_properties(exp_props, "ex"))

    data.append(drug_data)

# Convert to DataFrame
df = pd.DataFrame(data)

# Ensure all columns are present
df = df.reindex(columns=sorted(columns))

# Save to CSV
df.to_csv("output1.csv", index=False)


  if calc_props:
  if exp_props:


## preprocessing

In [2]:
import pandas as pd

df = pd.read_csv("class_added.csv")
df.describe()

Unnamed: 0,c_h_bond_acceptor_count,c_h_bond_donor_count,c_logp_1,c_logp_2,c_logs,c_molecular_weight,c_monoisotopic_weight,c_number_of_rings,c_physiological_charge,c_pka_(strongest_acidic),c_polar_surface_area_(psa)_1,c_rotatable_bond_count,c_rule_of_five,ex_caco2_permeability,ex_logp_1,ex_logs_1
count,1536.0,1524.0,1766.0,1658.0,1686.0,1701.0,1682.0,1466.0,1481.0,1185.0,1591.0,1549.0,1445.0,32.0,1023.0,113.0
mean,4.355469,1.950787,1.980594,1.81891,-3.285219,352.578052,352.462733,2.542292,0.054018,9.596282,80.948818,5.096191,0.776471,4.207813,2.170498,-2.405575
std,3.45807,2.392716,2.328808,2.834552,1.848222,246.503998,247.410541,1.887606,0.95425,5.438773,71.945887,5.218152,0.416754,36.708671,4.953074,1.861004
min,0.0,0.0,-18.0,-17.0,-9.4,4.0026,4.002603,0.0,-4.0,-9.0,0.0,0.0,0.0,-6.88,-9.609,-6.51
25%,2.0,1.0,0.41,0.22,-4.6,212.2439,211.347023,1.0,0.0,4.43,39.225,2.0,1.0,-6.1525,0.6,-3.68
50%,4.0,1.0,2.1,2.03,-3.5,310.329,309.907951,2.0,0.0,9.96,66.48,4.0,1.0,-4.84,2.07,-2.63
75%,5.0,2.0,3.5375,3.6675,-2.0,415.5656,414.265439,4.0,1.0,13.85,101.44,7.0,1.0,-3.239999,3.5,-1.23
max,33.0,20.0,9.94,17.16,1.08,4113.641,4111.115377,15.0,5.0,19.99,706.71,44.0,1.0,200.0,99.0,3.83


In [4]:
df.head()

Unnamed: 0,drugbank-id,name,c_iupac_name,c_bioavailability,c_ghose_filter,c_h_bond_acceptor_count,c_h_bond_donor_count,c_logp_1,c_logp_2,c_logs,...,c_smiles,c_traditional_iupac_name,c_water_solubility,ex_boiling_point,ex_caco2_permeability,ex_logp_1,ex_logs_1,ex_melting_point,ex_pka,BCS Class
0,DB00007,Leuprolide,(2S)-1-[(2S)-5-carbamimidamido-2-[(2S)-2-[(2R)...,,,,,1.04,-2.4,-4.6,...,Unknown,leuprorelin,3.38e-02 g/l,,,,,150-155,9.6,Class IV
1,DB00014,Goserelin,(2S)-1-[(2S)-2-[(2S)-2-[(2R)-3-(tert-butoxy)-2...,0.0,0.0,18.0,17.0,0.3,-5.1,-4.6,...,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,tetrahydrofolic acid,2.83e-02 g/l,,,-2.0,,,,Class IV
2,DB00027,Gramicidin D,(2R)-N-[(1S)-1-{[(1R)-1-{[(1S)-1-{[(1R)-1-{[(1...,,,,,4.38,5.96,-5.7,...,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...,(2R)-N-[(1S)-1-{[(1R)-1-{[(1S)-1-{[(1R)-1-{[(1...,3.90e-03 g/l,,,,,229 °C,,Class II
3,DB00067,Vasopressin,2-({1-[19-amino-13-benzyl-10-(2-carbamoylethyl...,0.0,0.0,16.0,14.0,-1.4,-7.2,-3.9,...,NCCCCC(NC(=O)C1CCCN1C(=O)C1CSSCC(N)C(=O)NC(CC2...,"(2S)-2-{[(2S)-1-[(4R,7S,10S,13S,16S,19R)-19-am...",1.24e-01 g/l,,,,,,,Class III
4,DB00091,Cyclosporine,"(3S,6S,9S,12R,15S,18S,21S,24S,30S,33S)-30-ethy...",0.0,0.0,12.0,5.0,3.64,,,...,CC[C@@H]1NC(=O)[C@H]([C@H](O)[C@H](C)C\C=C\C)N...,"(3S,6S,9S,12R,15S,18S,21S,24S,30S,33S)-30-ethy...",,838.63,-6.05,1.4,,148-151 °C,13.32±0.70,


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1768 entries, 0 to 1767
Data columns (total 33 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   drugbank-id                   1768 non-null   object 
 1   name                          1768 non-null   object 
 2   c_iupac_name                  1726 non-null   object 
 3   c_bioavailability             1452 non-null   object 
 4   c_ghose_filter                1436 non-null   object 
 5   c_h_bond_acceptor_count       1536 non-null   float64
 6   c_h_bond_donor_count          1524 non-null   float64
 7   c_logp_1                      1766 non-null   float64
 8   c_logp_2                      1658 non-null   float64
 9   c_logs                        1686 non-null   float64
 10  c_mddr-like_rule              1425 non-null   object 
 11  c_molecular_formula           1657 non-null   object 
 12  c_molecular_weight            1701 non-null   float64
 13  c_m

## adding class

In [11]:
# Function to classify BCS based on given boundaries
def classify_bcs(logS, permeability, alogP):
    if pd.isnull(logS) or pd.isnull(permeability) or pd.isnull(alogP):
        return None  # Keep BCS Class as null if any value is missing
    if logS > -4.0 and permeability > 0.5 and alogP > 2.3:
        return "Class I"
    elif logS <= -4.0 and permeability > 0.5 and alogP > 2.3:
        return "Class II"
    elif logS > -4.0 and permeability <= 0.5 and alogP <= 2.3:
        return "Class III"
    else:
        return "Class IV"


In [53]:
df['c_logp_2']=pd.to_numeric(df["c_logp_2"], errors='coerce')
df['c_logp_1']=pd.to_numeric(df["c_logp_1"], errors='coerce')

In [57]:
df["BCS Class"] = df.apply(
    lambda row: classify_bcs(row["c_logs_1"], row["c_logp_2"], row["c_logp_1"]), axis=1
)

In [59]:
df.head()

Unnamed: 0,drugbank-id,name,c_iupac_name_1,c_bioavailability_1,c_ghose_filter_1,c_h_bond_acceptor_count_1,c_h_bond_donor_count_1,c_logp_1,c_logp_2,c_logs_1,...,c_smiles_1,c_traditional_iupac_name_1,c_water_solubility_1,ex_boiling_point_1,ex_caco2_permeability_1,ex_logp_1,ex_logs_1,ex_melting_point_1,ex_pka_1,BCS Class
0,DB00007,Leuprolide,(2S)-1-[(2S)-5-carbamimidamido-2-[(2S)-2-[(2R)...,,,,,1.04,-2.4,-4.6,...,Unknown,leuprorelin,3.38e-02 g/l,,,,,150-155,9.6,Class IV
1,DB00014,Goserelin,(2S)-1-[(2S)-2-[(2S)-2-[(2R)-3-(tert-butoxy)-2...,0.0,0.0,18.0,17.0,0.3,-5.1,-4.6,...,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,tetrahydrofolic acid,2.83e-02 g/l,,,-2.0,,,,Class IV
2,DB00027,Gramicidin D,(2R)-N-[(1S)-1-{[(1R)-1-{[(1S)-1-{[(1R)-1-{[(1...,,,,,4.38,5.96,-5.7,...,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...,(2R)-N-[(1S)-1-{[(1R)-1-{[(1S)-1-{[(1R)-1-{[(1...,3.90e-03 g/l,,,,,229 °C,,Class II
3,DB00067,Vasopressin,2-({1-[19-amino-13-benzyl-10-(2-carbamoylethyl...,0.0,0.0,16.0,14.0,-1.4,-7.2,-3.9,...,NCCCCC(NC(=O)C1CCCN1C(=O)C1CSSCC(N)C(=O)NC(CC2...,"(2S)-2-{[(2S)-1-[(4R,7S,10S,13S,16S,19R)-19-am...",1.24e-01 g/l,,,,,,,Class III
4,DB00091,Cyclosporine,"(3S,6S,9S,12R,15S,18S,21S,24S,30S,33S)-30-ethy...",0.0,0.0,12.0,5.0,3.64,,,...,CC[C@@H]1NC(=O)[C@H]([C@H](O)[C@H](C)C\C=C\C)N...,"(3S,6S,9S,12R,15S,18S,21S,24S,30S,33S)-30-ethy...",,838.63,-6.05,1.4,,148-151 °C,13.32±0.70,


In [8]:
df["BCS Class"].value_counts()

BCS Class
Class II     544
Class IV     484
Class III    398
Class I      232
Name: count, dtype: int64

In [10]:
null_counts = df.isnull().sum()
print(null_counts)

drugbank-id                        0
name                               0
c_iupac_name                      42
c_bioavailability                316
c_ghose_filter                   332
c_h_bond_acceptor_count          232
c_h_bond_donor_count             244
c_logp_1                           2
c_logp_2                         110
c_logs                            82
c_mddr-like_rule                 343
c_molecular_formula              111
c_molecular_weight                67
c_monoisotopic_weight             86
c_number_of_rings                302
c_physiological_charge           287
c_pka_(strongest_acidic)         583
c_pka_(strongest_basic)          400
c_polar_surface_area_(psa)_1     177
c_polarizability                 203
c_refractivity                   191
c_rotatable_bond_count           219
c_rule_of_five                   323
c_smiles                         100
c_traditional_iupac_name          60
c_water_solubility                93
ex_boiling_point                1340
e

In [14]:
'''
Missing values of hydrogen bond acceptors and donor count, molecular weight, psa, rotatable bond count
these can be calculated!
'''
from rdkit import Chem
from rdkit.Chem import Descriptors

def calculate_properties(smiles):  
    if isinstance(smiles, str):  
        mol = Chem.MolFromSmiles(smiles)  
        if mol is not None:  
            h_bond_acceptors = Descriptors.NumHAcceptors(mol)  
            h_bond_donors = Descriptors.NumHDonors(mol)  
            molecular_weight = Descriptors.MolWt(mol)  
            polar_surface_area = Descriptors.TPSA(mol)  
            rotatable_bond_count = Descriptors.NumRotatableBonds(mol)  # Counts rotatable bonds  
            return (h_bond_acceptors, h_bond_donors, molecular_weight, polar_surface_area, rotatable_bond_count)  
    return None, None, None, None, None

 
for index, row in df.iterrows():  
    smiles = row['c_smiles']  
    props = calculate_properties(smiles)
    
    if pd.isnull(row['c_h_bond_acceptor_count']):  
        df.at[index, 'c_h_bond_acceptor_count'] = props[0]  
        
    if pd.isnull(row['c_h_bond_donor_count']):  
        df.at[index, 'c_h_bond_donor_count'] = props[1]  
    
    if pd.isnull(row['c_molecular_weight']):  
        df.at[index, 'c_molecular_weight'] = props[2]  
    
    if pd.isnull(row['c_polar_surface_area_(psa)_1']):  
        df.at[index, 'c_polar_surface_area_(psa)_1'] = props[3] 
    
    if pd.isnull(row['c_rotatable_bond_count']):  
        df.at[index, 'c_rotatable_bond_count'] = props[4]  

[10:20:45] SMILES Parse Error: syntax error while parsing: Unknown
[10:20:45] SMILES Parse Error: Failed parsing SMILES 'Unknown' for input: 'Unknown'
[10:20:46] SMILES Parse Error: syntax error while parsing: Unknown
[10:20:46] SMILES Parse Error: Failed parsing SMILES 'Unknown' for input: 'Unknown'
[10:20:47] SMILES Parse Error: syntax error while parsing: Unknown
[10:20:47] SMILES Parse Error: Failed parsing SMILES 'Unknown' for input: 'Unknown'
[10:20:47] SMILES Parse Error: syntax error while parsing: Unknown
[10:20:47] SMILES Parse Error: Failed parsing SMILES 'Unknown' for input: 'Unknown'


In [16]:
df.isnull().sum()

drugbank-id                        0
name                               0
c_iupac_name                      42
c_bioavailability                316
c_ghose_filter                   332
c_h_bond_acceptor_count          101
c_h_bond_donor_count             101
c_logp_1                           2
c_logp_2                         110
c_logs                            82
c_mddr-like_rule                 343
c_molecular_formula              111
c_molecular_weight                67
c_monoisotopic_weight             86
c_number_of_rings                302
c_physiological_charge           287
c_pka_(strongest_acidic)         583
c_pka_(strongest_basic)          400
c_polar_surface_area_(psa)_1     101
c_polarizability                 203
c_refractivity                   191
c_rotatable_bond_count           101
c_rule_of_five                   323
c_smiles                         100
c_traditional_iupac_name          60
c_water_solubility                93
ex_boiling_point                1340
e

In [18]:
df_cleaned = df.dropna(subset=['c_smiles'])

In [20]:
df_cleaned.isnull().sum()

drugbank-id                        0
name                               0
c_iupac_name                       0
c_bioavailability                219
c_ghose_filter                   235
c_h_bond_acceptor_count            4
c_h_bond_donor_count               4
c_logp_1                           0
c_logp_2                          76
c_logs                            72
c_mddr-like_rule                 246
c_molecular_formula               15
c_molecular_weight                 0
c_monoisotopic_weight              0
c_number_of_rings                205
c_physiological_charge           190
c_pka_(strongest_acidic)         485
c_pka_(strongest_basic)          302
c_polar_surface_area_(psa)_1       4
c_polarizability                 106
c_refractivity                    94
c_rotatable_bond_count             4
c_rule_of_five                   226
c_smiles                           0
c_traditional_iupac_name           0
c_water_solubility                72
ex_boiling_point                1272
e

In [22]:
'''
LogS computation for the missing values.
'''

def compute_solubility(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        AlogP = Descriptors.MolLogP(mol)
        MW = Descriptors.MolWt(mol)
        HBA = Descriptors.NumHAcceptors(mol)
        HBD = Descriptors.NumHDonors(mol)
        RotBonds = Descriptors.NumRotatableBonds(mol)

        # ESOL Model
        logS = 0.16 - (0.63 * AlogP) - (0.0062 * MW) + (0.066 * HBA) + (0.066 * HBD) - (0.74 * RotBonds)
        
        return logS
    return None

for index, row in df_cleaned.iterrows():  
    if pd.isnull(row['c_logs']):  
        smiles = row['c_smiles']  
        log_value = compute_solubility(smiles)  
        if log_value is not None:  
            df_cleaned.at[index, 'c_logs'] = log_value  


In [24]:
df_cleaned.isnull().sum()

drugbank-id                        0
name                               0
c_iupac_name                       0
c_bioavailability                219
c_ghose_filter                   235
c_h_bond_acceptor_count            4
c_h_bond_donor_count               4
c_logp_1                           0
c_logp_2                          76
c_logs                             0
c_mddr-like_rule                 246
c_molecular_formula               15
c_molecular_weight                 0
c_monoisotopic_weight              0
c_number_of_rings                205
c_physiological_charge           190
c_pka_(strongest_acidic)         485
c_pka_(strongest_basic)          302
c_polar_surface_area_(psa)_1       4
c_polarizability                 106
c_refractivity                    94
c_rotatable_bond_count             4
c_rule_of_five                   226
c_smiles                           0
c_traditional_iupac_name           0
c_water_solubility                72
ex_boiling_point                1272
e

In [36]:
'''number of rings: missing value computation
'''
def calculate_number_of_rings(smiles):  
    if smiles:  
        mol = Chem.MolFromSmiles(smiles)  
        if mol:  
            ring_info = mol.GetRingInfo()  
            return ring_info.NumRings()  # Correct way to get the number of rings  
    return None

df_cleaned['c_number_of_rings'] = df_cleaned.apply(  
    lambda row: calculate_number_of_rings(row['c_smiles']) if pd.isnull(row['c_number_of_rings']) else row['c_number_of_rings'],  
    axis=1  
)

[10:30:21] SMILES Parse Error: syntax error while parsing: Unknown
[10:30:21] SMILES Parse Error: Failed parsing SMILES 'Unknown' for input: 'Unknown'
[10:30:21] SMILES Parse Error: syntax error while parsing: Unknown
[10:30:21] SMILES Parse Error: Failed parsing SMILES 'Unknown' for input: 'Unknown'
[10:30:21] SMILES Parse Error: syntax error while parsing: Unknown
[10:30:21] SMILES Parse Error: Failed parsing SMILES 'Unknown' for input: 'Unknown'
[10:30:21] SMILES Parse Error: syntax error while parsing: Unknown
[10:30:21] SMILES Parse Error: Failed parsing SMILES 'Unknown' for input: 'Unknown'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['c_number_of_rings'] = df_cleaned.apply(


In [58]:
df_cleaned.to_csv('partial.csv',index=False)

In [1]:
import pandas as pd
import numpy as np

In [3]:
df_cleaned = pd.read_csv('partial.csv')
df_cleaned.head()

Unnamed: 0,drugbank-id,name,c_iupac_name,c_bioavailability,c_ghose_filter,c_h_bond_acceptor_count,c_h_bond_donor_count,c_logp_1,c_logp_2,c_logs,...,c_smiles,c_traditional_iupac_name,c_water_solubility,ex_boiling_point,ex_caco2_permeability,ex_logp_1,ex_logs_1,ex_melting_point,ex_pka,BCS Class
0,DB00014,Goserelin,(2S)-1-[(2S)-2-[(2S)-2-[(2R)-3-(tert-butoxy)-2...,0.0,0.0,18,17,0.3,-5.1,-4.6,...,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,tetrahydrofolic acid,2.83e-02 g/l,,,-2.0,,,,Class IV
1,DB00027,Gramicidin D,(2R)-N-[(1S)-1-{[(1R)-1-{[(1S)-1-{[(1R)-1-{[(1...,,,16,20,4.38,5.96,-5.7,...,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...,(2R)-N-[(1S)-1-{[(1R)-1-{[(1S)-1-{[(1R)-1-{[(1...,3.90e-03 g/l,,,,,229 °C,,Class II
2,DB00067,Vasopressin,2-({1-[19-amino-13-benzyl-10-(2-carbamoylethyl...,0.0,0.0,16,14,-1.4,-7.2,-3.9,...,NCCCCC(NC(=O)C1CCCN1C(=O)C1CSSCC(N)C(=O)NC(CC2...,"(2S)-2-{[(2S)-1-[(4R,7S,10S,13S,16S,19R)-19-am...",1.24e-01 g/l,,,,,,,Class III
3,DB00091,Cyclosporine,"(3S,6S,9S,12R,15S,18S,21S,24S,30S,33S)-30-ethy...",0.0,0.0,12,5,3.64,,-19.333807,...,CC[C@@H]1NC(=O)[C@H]([C@H](O)[C@H](C)C\C=C\C)N...,"(3S,6S,9S,12R,15S,18S,21S,24S,30S,33S)-30-ethy...",,838.63,-6.05,1.4,,148-151 °C,13.32±0.70,
4,DB00115,Cyanocobalamin,"cyano[(1R,2R,3R,4R,6Z,8S,11Z,13S,14S,16Z,18S,1...",,,18,9,2.66,-3.2,-4.8,...,C[C@H](CNC(=O)CC[C@]1(C)[C@@H](CC(N)=O)[C@H]2N...,racemiccalcium pantothenate,2.02e-02 g/l,> 300,,1.897,-4.5,> 300,"1.84, 8.77",Class IV


In [5]:
df_cleaned.isnull().sum()


drugbank-id                        0
name                               0
c_iupac_name                       0
c_bioavailability                216
c_ghose_filter                   232
c_h_bond_acceptor_count            0
c_h_bond_donor_count               0
c_logp_1                           0
c_logp_2                          76
c_logs                             0
c_mddr-like_rule                 243
c_molecular_formula               14
c_molecular_weight                 0
c_monoisotopic_weight              0
c_number_of_rings                  0
c_physiological_charge           186
c_pka_(strongest_acidic)         481
c_pka_(strongest_basic)          299
c_polar_surface_area_(psa)_1       0
c_polarizability                 104
c_refractivity                    91
c_rotatable_bond_count             0
c_rule_of_five                   222
c_smiles                           0
c_traditional_iupac_name           0
c_water_solubility                72
ex_boiling_point                1269
e

In [23]:
df_cleaned.loc[df_cleaned['BCS Class'].isnull(), 'BCS Class'] = df_cleaned.loc[df_cleaned['BCS Class'].isnull()].apply(  
    lambda row: classify_bcs(row["c_logs"], row["c_logp_2"], row["c_logp_1"]), axis=1  
)

In [7]:
#dropping the columns with more null values
df_cleaned = df_cleaned.drop(columns = ['ex_boiling_point','ex_caco2_permeability','ex_logs_1','ex_logp_1','ex_melting_point','ex_pka'])


In [27]:
df_cleaned = df_cleaned.drop(columns = ['c_pka_(strongest_acidic)', 'c_pka_(strongest_basic)'])

In [29]:
df_cleaned.isnull().sum()

drugbank-id                       0
name                              0
c_iupac_name                      0
c_bioavailability               216
c_ghose_filter                  232
c_h_bond_acceptor_count           0
c_h_bond_donor_count              0
c_logp_1                          0
c_logp_2                         76
c_logs                            0
c_mddr-like_rule                243
c_molecular_formula              14
c_molecular_weight                0
c_monoisotopic_weight             0
c_number_of_rings                 0
c_physiological_charge          186
c_polar_surface_area_(psa)_1      0
c_polarizability                104
c_refractivity                   91
c_rotatable_bond_count            0
c_rule_of_five                  222
c_smiles                          0
c_traditional_iupac_name          0
c_water_solubility               72
BCS Class                        76
dtype: int64

In [31]:
df_cleaned.to_csv('preprocessed.csv',index=False)