# **MACHINE LEARNING INSIGHTS FOR PREDICTING ORAL DRUG PROPERTIES**
##### LUCIANA OLIVEIRA & MARÍA URIBURU GRAY
###### 12/12/2024

## Dataset from Kaggle

###  Wikipedia Molecules Properties Dataset

    https://www.kaggle.com/datasets/thedevastator/wikipedia-molecules-properties-dataset

## 1. Data Cleaning

In [None]:
# Python libraries

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from rdkit import Chem
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.stats import chi2
from rdkit import Chem


In [None]:
# Load the raw properties table from the CSV file into a DataFrame
raw_csv_path = '../../dataset_molecules/properties.csv'
df = pd.read_csv(raw_csv_path)

In [None]:
# Normalise column names: spaces become underscores and everything is lower-cased,
# then two columns get clearer / apostrophe-free names.
normalised_names = df.columns.str.replace(' ', '_').str.lower()
df.columns = normalised_names
df = df.rename(
    columns={
        'molecule': 'smiles_molecule',
        "lipinski's_rule_of_five": 'lipinskis_rule_of_five',
    }
)

In [None]:
# Missing values per column (the original run noted 0 null values at this stage)
df.isna().sum()

In [None]:
# Convert the descriptor columns below to a numeric dtype.
# errors='coerce' replaces any value that cannot be parsed with NaN instead of raising.
numeric_columns = [
    'atomic_polarizabilities',
    'bond_polarizabilities',
    'vabc_volume_descriptor',
    'topological_polar_surface_area',
    'molecular_weight',
    'molar_mass',
]
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

In [None]:
# Null values after changing column types.
# A notebook cell only renders its last expression, so the counts are printed
# explicitly; otherwise this check would be computed and silently discarded.
print(df.isnull().sum())

# Drop unnecessary columns. 'vabc_volume_descriptor' is removed as a whole column,
# so it is not part of the row filter below.
df = df.drop(columns=['row_id', 'molecule_name', 'son_iguales', 'vabc_volume_descriptor'])

# Drop every row that has a NaN in any of these numeric columns.
df = df.dropna(subset=['molar_mass', 'atomic_polarizabilities',
                       'bond_polarizabilities', 'molecular_weight',
                       'topological_polar_surface_area'])

In [None]:
# Checking duplicated and unique values.
# Only the last expression of a cell is rendered, so the duplicate-row count is
# printed explicitly; the per-column unique counts remain the cell's output.
print(f"Duplicated rows: {df.duplicated().sum()}")
df.nunique()

In [None]:
# Reorder the columns; any column not listed here is left out of the frame.
# (Grouping comments below are inferred from the column names only.)
new_order = [
    # identifiers
    'index',
    'smiles_molecule',
    'molecular_formula',
    # mass
    'molecular_weight',
    'molar_mass',
    # drug-likeness / logP
    'lipinskis_rule_of_five',
    'xlogp',
    'mannhold_logp',
    'sp3_character',
    'fragment_complexity',
    # hydrogen bonding and polar surface
    'hydrogen_bond_acceptors',
    'hydrogen_bond_donors',
    'topological_polar_surface_area',
    # bonds, chains and aromaticity
    'rotatable_bonds_count',
    'rotatable_bonds_count_(non_terminal)',
    'largest_chain',
    'largest_pi_chain',
    'bond_count',
    'aromatic_atoms_count',
    'aromatic_bonds_count',
    # polarizabilities
    'bond_polarizabilities',
    'atomic_polarizabilities',
    # graph / topological indices
    'eccentric_connectivity_index',
    'zagreb_index',
    'petitjean_number',
    'vertex_adjacency_information_magnitude',
    # charges
    'formal_charge',
    'formal_charge_(pos)',
    'formal_charge_(neg)',
    # atom counts
    'heavy_atoms_count',
    'element_count',
]
df = df[new_order]

In [None]:
# Save the cleaned copy for the EDA analysis (the DataFrame index is not written out)
cleaned_csv_path = '../../dataset_molecules/cleaned_data_properties.csv'
df.to_csv(cleaned_csv_path, index=False)