In [28]:
import pandas as pd
import sklearn 


In [64]:
# Load the mouse intraperitoneal LD50 dataset from the working directory.
df = pd.read_csv(filepath_or_buffer="Acute Toxicity_mouse_intraperitoneal_LD50.csv")

In [65]:
# Preview the first five rows (5 is the default for head()).
df.head(5)

Unnamed: 0,TAID,Pubchem CID,IUPAC Name,SMILES,Canonical SMILES,InChIKey,mouse_intraperitoneal_LD50
0,TOX-145,785,"benzene-1,4-diol",Oc1ccc(O)cc1,Oc1ccc(O)cc1,QIGBRXMKCJKVMJ-UHFFFAOYSA-N,3.041835
1,TOX-245,5453,tris(aziridin-1-yl)-sulfanylidene-lambda5-phos...,S=P(N1CC1)(N1CC1)N1CC1,S=P(N1CC1)(N1CC1)N1CC1,FOCVUCIESVLUNU-UHFFFAOYSA-N,4.235584
2,TOX-1273,727,"1,2,3,4,5,6-hexachlorocyclohexane",ClC1C(Cl)C(Cl)C(Cl)C(Cl)C1Cl,ClC1C(Cl)C(Cl)C(Cl)C(Cl)C1Cl,JLYXXMFPNIAWKQ-UHFFFAOYSA-N,3.366732
3,TOX-1279,4091,"3-(diaminomethylidene)-1,1-dimethylguanidine",CN(C)C(=N)N=C(N)N,CN(C)C(=N)N=C(N)N,XZWYZXLIPXDOLR-UHFFFAOYSA-N,2.641604
4,TOX-1282,10364,2-methyl-5-propan-2-ylphenol,Cc1ccc(C(C)C)cc1O,Cc1ccc(C(C)C)cc1O,RECUKUPTGUEGMW-UHFFFAOYSA-N,3.311627


In [66]:
# Identifier / name columns that are removed before modelling.
unused_columns = ['TAID', 'Pubchem CID', 'IUPAC Name', 'InChIKey']
df = df.drop(labels=unused_columns, axis=1)
# Alternative approach: select only 'Canonical SMILES' and
# 'mouse_intraperitoneal_LD50' instead of dropping columns.


In [67]:
# Report the number of missing values per column, then discard every
# row that has at least one missing value.
missing_per_column = df.isna().sum()
print(missing_per_column)
df = df.dropna(axis=0)


SMILES                        0
Canonical SMILES              0
mouse_intraperitoneal_LD50    0
dtype: int64


In [68]:
from rdkit import Chem

# Parse each canonical SMILES string into an RDKit molecule object and
# keep only the rows whose 'Mol' entry is not null.
df['Mol'] = [Chem.MolFromSmiles(smiles) for smiles in df['Canonical SMILES']]
parsed_ok = df['Mol'].notna()
df = df.loc[parsed_ok]


In [69]:
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors

# Names of every descriptor listed in RDKit's Descriptors._descList
# (each entry is indexed with [0] to get its name).
descriptor_names = [desc[0] for desc in Descriptors._descList]
calc = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

def compute_all_descriptors(mol):
    """Return the descriptor values computed by ``calc`` for one molecule.

    Parameters
    ----------
    mol : the molecule object stored in the 'Mol' column.

    Returns
    -------
    The result of ``calc.CalcDescriptors(mol)``; it is expanded below into
    one DataFrame column per entry of ``descriptor_names``.
    """
    return calc.CalcDescriptors(mol)

desc_values = df['Mol'].apply(compute_all_descriptors)

# Reuse df's index for the descriptor table. df keeps its original row labels
# after dropna() / the Mol filter, so a fresh 0..n-1 index here would make the
# index-aligned concat below pair descriptors with the wrong LD50 values (or
# introduce NaN rows) whenever any row has been removed.
desc_df = pd.DataFrame(desc_values.tolist(), index=df.index, columns=descriptor_names)

# Add the target LD50 column
data = pd.concat([desc_df, df['mouse_intraperitoneal_LD50']], axis=1)


In [70]:
# numpy is required for np.inf / np.nan below and is not imported by any
# earlier cell, so a fresh-kernel run would otherwise fail with NameError.
import numpy as np

# Remove NaN or infinite columns: infinities are first turned into NaN so that
# dropna(axis=1) discards every column containing at least one such value.
desc_df = desc_df.replace([np.inf, -np.inf], np.nan).dropna(axis=1)

# Drop constant features (columns with a single distinct value)
desc_df = desc_df.loc[:, desc_df.nunique() > 1]


In [75]:
# Feature matrix: every descriptor column cast to 64-bit float.
X = desc_df.astype('float64')
# Target vector: LD50 values parsed as numbers; unparseable entries become NaN.
ld50_column = df['mouse_intraperitoneal_LD50']
y = pd.to_numeric(ld50_column, errors='coerce')
print(X.dtypes)  # Should all be float64



MaxAbsEStateIndex    float64
MaxEStateIndex       float64
MinAbsEStateIndex    float64
MinEStateIndex       float64
qed                  float64
                      ...   
fr_thiazole          float64
fr_thiocyan          float64
fr_thiophene         float64
fr_unbrch_alkane     float64
fr_urea              float64
Length: 194, dtype: object


In [100]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [101]:
# Overall value range across all training descriptors.
overall_max = X_train.max().max()
overall_min = X_train.min().min()
print("Max value in X_train:", overall_max)
print("Min value in X_train:", overall_min)

# Identify which columns have extreme values (beyond +/-1e38).
# NOTE(review): the 1e38 cutoff is presumably tied to the float32 range used
# by the downstream model - confirm.
above_limit = (X_train > 1e38).any()
below_limit = (X_train < -1e38).any()
extreme_cols = X_train.columns[above_limit | below_limit]
print("Columns with extreme values:", list(extreme_cols))


Max value in X_train: 1.291717546675529e+41
Min value in X_train: -81.74940919808004
Columns with extreme values: ['Ipc']


In [102]:
# 'Ipc' was reported by the extreme-value check above as exceeding 1e38, so it
# is removed from both splits. errors='ignore' keeps this cell idempotent:
# re-running it after 'Ipc' is already gone no longer raises KeyError.
X_train = X_train.drop(columns=['Ipc'], errors='ignore')
X_test = X_test.drop(columns=['Ipc'], errors='ignore')  # if using a test set


In [103]:
# Re-run the range report after the column removal.
print("Max value in X_train:", X_train.max(axis=0).max())
print("Min value in X_train:", X_train.min(axis=0).min())

# Identify which columns have extreme values: a column is flagged when any of
# its entries has an absolute value above 1e38.
has_extreme = (X_train.abs() > 1e38).any()
extreme_cols = X_train.columns[has_extreme]
print("Columns with extreme values:", list(extreme_cols))

Max value in X_train: 9507.96
Min value in X_train: -81.74940919808004
Columns with extreme values: []


In [104]:

# Baseline random forest (100 trees, fixed seed) trained on the training
# descriptors; its feature importances are used for feature selection later.
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X=X_train, y=y_train)


RandomForestRegressor(random_state=42)

In [106]:
# Rank descriptors by the fitted forest's importance scores, highest first,
# and keep the 20 best.
importance_scores = model.feature_importances_
important_features = (
    pd.Series(importance_scores, index=X_train.columns)
    .sort_values(ascending=False)
)
top_features = important_features.iloc[:20]
print(top_features)


Chi3n                0.053515
SMR_VSA6             0.051401
Chi2n                0.039048
MolLogP              0.029600
SlogP_VSA2           0.026409
PEOE_VSA1            0.023682
SMR_VSA5             0.019087
VSA_EState2          0.017802
fr_quatN             0.017760
BalabanJ             0.016126
fr_alkyl_halide      0.015988
PEOE_VSA11           0.014377
VSA_EState4          0.014093
MinEStateIndex       0.013266
PEOE_VSA9            0.013085
VSA_EState9          0.013041
MinAbsEStateIndex    0.012282
AvgIpc               0.012269
VSA_EState8          0.012253
PEOE_VSA7            0.012169
dtype: float64


In [107]:
# Subset your cleaned or imputed descriptor dataset using the top 20 features
X_top20 = X_train[top_features.index]
X_test_top20 = X_test[top_features.index]


In [109]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Refit a random forest with the same hyperparameters, this time using only
# the top-20 descriptors (fit() returns the fitted estimator itself).
model_top20 = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_top20, y_train)

# Predictions for the held-out test rows
y_pred = model_top20.predict(X_test_top20)


In [110]:
# Evaluation metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R² Score: {r2:.3f}")


Mean Squared Error (MSE): 0.24
R² Score: 0.501


In [112]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from xgboost import XGBRegressor

# Linear baseline for comparison: ridge regression on the same top-20
# descriptors, scored with R² on the test split.
ridge = Ridge(alpha=1.0).fit(X_top20, y_train)
ridge_r2 = ridge.score(X_test_top20, y_test)
print("R² (Ridge):", ridge_r2)


R² (Ridge): 0.16447419788435336


In [None]:
import matplotlib.pyplot as plt

# Scatter of true vs. predicted LD50 for the test split, with a dashed red
# diagonal marking perfect agreement.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, alpha=0.7)
ax.set_xlabel("Actual LD50")
ax.set_ylabel("Predicted LD50")
ax.set_title("Actual vs Predicted LD50 using Top 20 Descriptors")

lowest, highest = y_test.min(), y_test.max()
ax.plot([lowest, highest], [lowest, highest], 'r--')
ax.grid(True)
plt.show()
