In [1]:
from rdkit.Chem import AllChem
from rdkit import Chem
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import joblib

In [2]:
# Read the training table from its CSV file, then preview the first rows.
data = pd.read_csv(filepath_or_buffer='../data/train_data.csv')
data.head(n=5)

Unnamed: 0,Filename,HOMO,LUMO,SMILES,SAS,LogP,TPSA,scaffold
0,a10,-5.170166,-2.288479,C#Cc1[nH]ccc1-c1csc2c1C(=O)c1ccsc1-2,3.623665,3.9974,32.86,O=C1c2ccsc2-c2scc(-c3cc[nH]c3)c21
1,a100,-5.292617,-2.476237,O=C1c2ccsc2-c2scc(-c3c4ccsc4cc4ccsc34)c21,3.258773,7.1174,17.07,O=C1c2ccsc2-c2scc(-c3c4ccsc4cc4ccsc34)c21
2,a1000,-5.306223,-2.400045,C=C1C=C(c2c(C)sc3c2C(=O)c2ccsc2-3)c2cscc21,3.674717,5.84942,17.07,C=C1C=C(c2csc3c2C(=O)c2ccsc2-3)c2cscc21
3,a10000,-5.99195,-2.91162,O=C1c2ccsc2-c2scc(-c3sc(C(F)(F)F)c4cc([N+](=O)...,3.731936,6.8912,60.21,O=C1c2ccsc2-c2scc(-c3scc4ccsc34)c21
4,a10002,-5.319829,-2.440863,Cc1cc2c(s1)-c1scc(-c3cc(-c4ccsc4)ccc3[N+](=O)[...,3.016199,6.63312,60.21,O=C1c2ccsc2-c2scc(-c3cccc(-c4ccsc4)c3)c21


In [23]:
# Compute Morgan fingerprint from SMILES
def compute_fingerprint(smiles, n_bits=2048):
    """Turn a SMILES string into a fixed-length Morgan fingerprint.

    Parameters
    ----------
    smiles : str
        SMILES representation of one molecule. Any non-string value
        (for example ``None`` or the float NaN that pandas uses for an
        empty CSV cell) is treated as an unparsable molecule.
    n_bits : int, default 2048
        Length of the returned bit list.

    Returns
    -------
    list
        ``n_bits`` entries, one per fingerprint bit (radius-2 Morgan
        fingerprint). When no molecule can be built from ``smiles`` the
        list is all zeros, so every input row still yields a feature
        vector of the same length.
    """
    # Only hand real strings to the SMILES parser; anything else falls
    # through to the same zero-vector fallback as an invalid SMILES.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return [0] * n_bits
    # NOTE(review): newer RDKit releases reportedly deprecate this call in
    # favour of rdFingerprintGenerator - confirm bit-for-bit parity before
    # migrating, since saved models depend on these exact features.
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=n_bits)
    return list(fp)

# Featurise every molecule: one fingerprint per SMILES entry, then expand
# the per-molecule lists into a table with one column per bit.
fps = data['SMILES'].map(compute_fingerprint)
fps_df = pd.DataFrame(list(fps))

# Target is LogP
target = data.loc[:, 'LogP']

# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    fps_df,
    target,
    random_state=42,
    test_size=0.2,
)
# Disabled Random Forest baseline: the code below is wrapped in a bare
# triple-quoted string, so none of it executes. Because that string is the
# last expression in the cell, it is echoed back as the cell's output.
'''
# Train a Random Forest regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate
y_pred = rf.predict(X_valid)
mse = mean_squared_error(y_valid, y_pred)
print(f"Test MSE: {mse:.4f}")
'''




'\n# Train a Random Forest regressor\nrf = RandomForestRegressor(n_estimators=100, random_state=42)\nrf.fit(X_train, y_train)\n\n# Evaluate\ny_pred = rf.predict(X_valid)\nmse = mean_squared_error(y_valid, y_pred)\nprint(f"Test MSE: {mse:.4f}")\n'

In [14]:
from sklearn.neural_network import MLPRegressor

# Fit a multilayer-perceptron regressor on the fingerprint features:
# one hidden layer of 100 units, at most 300 iterations, fixed seed.
mlp = MLPRegressor(
    hidden_layer_sizes=(100,),
    max_iter=300,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out split and report its mean squared error.
y_pred = mlp.predict(X_valid)
mse = mean_squared_error(y_true=y_valid, y_pred=y_pred)
print(f"Test MSE: {mse:.4f}")




Test MSE: 0.2810


In [25]:
# RGboost
from sklearn.ensemble import GradientBoostingRegressor

# Train a Gradient Boosting regressor (RGboost) on the fingerprint features.
rgboost = GradientBoostingRegressor(random_state=42)
rgboost.fit(X_train, y_train)

# Evaluate on validation set
y_pred_rgboost = rgboost.predict(X_valid)
mse_rgboost = mean_squared_error(y_valid, y_pred_rgboost)
print(f"RGboost Test MSE: {mse_rgboost:.4f}")

# Save model to file. This must run after the fit above: `rgboost` does not
# exist on a fresh kernel until the training lines have executed, so saving
# first would fail with a NameError under Restart & Run All.
joblib.dump(rgboost, 'gb_model.pkl')

RGboost Test MSE: 1.3277
