# **MACHINE LEARNING INSIGHTS FOR PREDICTING ORAL DRUG PROPERTIES**
##### LUCIANA OLIVEIRA & MARÍA URIBURU GRAY
###### 12/12/2024

## Dataset from Kaggle

###  Wikipedia Molecules Properties Dataset

    https://www.kaggle.com/datasets/thedevastator/wikipedia-molecules-properties-dataset

## 1. Data Cleaning

In [1]:
# Standard Libraries
import numpy as np
import pandas as pd


In [2]:
# Load the raw molecule-properties table from its CSV file into `df`
raw_csv_path = '../Molecular-properties-prediction-for-drug-discovery/dataset_molecules/properties.csv'
df = pd.read_csv(raw_csv_path)

In [3]:
# Normalise column labels: lowercase, with spaces replaced by underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Give two columns clearer names (the second one loses its apostrophe)
df = df.rename(columns={
    'molecule': 'smiles_molecule',
    "lipinski's_rule_of_five": 'lipinskis_rule_of_five',
})

In [4]:
# Count missing values per column -> 0 null values at this stage
df.isna().sum()

index                                     0
row_id                                    0
smiles_molecule                           0
molecule_name                             0
mannhold_logp                             0
atomic_polarizabilities                   0
aromatic_atoms_count                      0
aromatic_bonds_count                      0
element_count                             0
bond_polarizabilities                     0
bond_count                                0
eccentric_connectivity_index              0
fragment_complexity                       0
vabc_volume_descriptor                    0
hydrogen_bond_acceptors                   0
hydrogen_bond_donors                      0
largest_chain                             0
largest_pi_chain                          0
petitjean_number                          0
rotatable_bonds_count                     0
lipinskis_rule_of_five                    0
topological_polar_surface_area            0
vertex_adjacency_information_mag

In [5]:
# Convert these columns to a numeric dtype.
# errors='coerce' turns any value that cannot be parsed into NaN instead of raising.
numeric_columns = [
    'atomic_polarizabilities',
    'bond_polarizabilities',
    'vabc_volume_descriptor',
    'topological_polar_surface_area',
    'molecular_weight',
    'molar_mass',
]
for column_name in numeric_columns:
    df[column_name] = pd.to_numeric(df[column_name], errors='coerce')

In [6]:
# Null values after changing column types.
# A bare expression is only rendered when it is the LAST statement of a cell,
# so the per-column counts are printed explicitly; otherwise they are discarded.
print(df.isnull().sum())

# Drop unnecessary columns
df = df.drop(columns=['row_id', 'molecule_name', 'vabc_volume_descriptor'])

# Drop rows that have a null in any of the numeric-coerced columns that are kept
df = df.dropna(subset=['molar_mass', 'atomic_polarizabilities',
                       'bond_polarizabilities', 'molecular_weight',
                       'topological_polar_surface_area'])

In [7]:
# Checking duplicated rows and unique values per column.
# Only the last expression of a cell is displayed, so the duplicate count is
# printed explicitly; otherwise it would be computed and silently discarded.
# NOTE(review): duplicated() compares whole rows, including the `index` column;
# if `index` is unique per row this count is always 0 - consider checking
# duplicates on `smiles_molecule` instead.
print('Duplicated rows:', df.duplicated().sum())
df.nunique()

index                                     15125
smiles_molecule                           15056
mannhold_logp                               114
atomic_polarizabilities                   10571
aromatic_atoms_count                         60
aromatic_bonds_count                         63
element_count                               273
bond_polarizabilities                      7076
bond_count                                  187
eccentric_connectivity_index               1991
fragment_complexity                        5555
hydrogen_bond_acceptors                      86
hydrogen_bond_donors                         58
largest_chain                                66
largest_pi_chain                             56
petitjean_number                             94
rotatable_bonds_count                       114
lipinskis_rule_of_five                        6
topological_polar_surface_area             4352
vertex_adjacency_information_magnitude      187
molecular_weight                        

In [8]:
# Target column layout for the cleaned table.
# Columns not listed here are dropped from `df`; a listed name that is missing
# from `df` raises a KeyError.
new_order = [
    'index',
    'smiles_molecule',
    'molecular_formula',
    'molecular_weight',
    'molar_mass',
    'lipinskis_rule_of_five',
    'xlogp',
    'mannhold_logp',
    'sp3_character',
    'fragment_complexity',
    'hydrogen_bond_acceptors',
    'hydrogen_bond_donors',
    'topological_polar_surface_area',
    'rotatable_bonds_count',
    'rotatable_bonds_count_(non_terminal)',
    'largest_chain',
    'largest_pi_chain',
    'bond_count',
    'aromatic_atoms_count',
    'aromatic_bonds_count',
    'bond_polarizabilities',
    'atomic_polarizabilities',
    'eccentric_connectivity_index',
    'zagreb_index',
    'petitjean_number',
    'vertex_adjacency_information_magnitude',
    'formal_charge',
    'formal_charge_(pos)',
    'formal_charge_(neg)',
    'heavy_atoms_count',
    'element_count',
]
df = df.loc[:, new_order]

In [9]:
# Save a cleaned copy for the EDA analysis (the DataFrame's row index is not written)
cleaned_csv_path = '../Molecular-properties-prediction-for-drug-discovery/dataset_molecules/cleaned_data_properties.csv'
df.to_csv(cleaned_csv_path, index=False)