# **MACHINE LEARNING INSIGHTS FOR PREDICTING ORAL DRUG PROPERTIES**
##### LUCIANA OLIVEIRA & MARÍA URIBURU GRAY
###### 12/12/2024

## Dataset from Kaggle

### Wikipedia Molecules Properties Dataset

    https://www.kaggle.com/datasets/thedevastator/wikipedia-molecules-properties-dataset

## 2. Exploratory Data Analysis

In [None]:
# Import Python libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from rdkit import Chem
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.stats import chi2

In [None]:
# Load the cleaned molecule-properties table into a DataFrame.
# The path is relative, so it resolves against the current working directory.
dataset_path = '../Final_project/dataset_molecules/cleaned_data_properties.csv'
df = pd.read_csv(dataset_path)

In [None]:
# Spearman correlation heatmap of the numeric columns.
# The 'index' column is dropped before the correlations are computed.
numeric_df = df.select_dtypes(include=[np.number]).drop(columns=['index'])
correlation_matrix = numeric_df.corr(method='spearman')

# Boolean mask covering the upper triangle (diagonal included); the heatmap
# hides masked cells, so only the lower triangle is drawn.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

plt.figure(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=False,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=0.5,
)
plt.title("Correlation map", fontsize=16)
plt.show()

In [None]:
# Flag the molecules that meet Lipinski's criteria: molecular weight <= 500,
# at most 5 hydrogen-bond donors, at most 10 hydrogen-bond acceptors and
# xlogp <= 5. All four conditions must hold for a row to be flagged True.
# NOTE(review): with NumPy-backed columns a missing value compares as False,
# so such rows are counted as not meeting the criteria — confirm this is the
# intended treatment.
lipinski_filter = (
    (df["molecular_weight"] <= 500) &
    (df["hydrogen_bond_donors"] <= 5) &
    (df["hydrogen_bond_acceptors"] <= 10) &
    (df["xlogp"] <= 5)
)
df["Cumple_Lipinski"] = lipinski_filter
print(df["Cumple_Lipinski"].value_counts())

# Compliance counts for the pie chart.
# value_counts() orders its result by frequency, not by value, so the order is
# pinned to [True, False] here to keep every wedge paired with its label and
# colour. fill_value=0 keeps both entries present even when one class is
# absent from the data.
lipinski_counts = (
    df["Cumple_Lipinski"].value_counts().reindex([True, False], fill_value=0)
)
# Chart labels, in the same order as lipinski_counts (True -> "Yes").
labels = ["Yes", "No"]
# Custom colours, in the same order as the labels.
colors = ["#1f77b4", "#ff7f0e"]
# Draw the pie chart.
plt.figure(figsize=(6, 6))
plt.pie(lipinski_counts, labels=labels, autopct="%1.1f%%", startangle=90, colors=colors, textprops={'fontsize': 16})
plt.title("Meets Lipinski's criteria", fontsize=18)
# Save before show(): once the figure has been shown it may already be
# closed, in which case savefig() would write an empty image.
plt.savefig("lipinski_pie_chart.png", transparent=True, bbox_inches='tight')
plt.show()