In [7]:
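# ORR Catalyst Performance Modeling Notebook
# NOTE(review): the code below models the `exp` column of Lipophilicity.csv, not ORR catalyst data — confirm the title.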

# --- LIBRARIES ---
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# --- DATA ---
# Load the dataset from the working directory; the steps below read its
# "smiles" and "exp" columns.
df = pd.read_csv("Lipophilicity.csv")

# Parse every SMILES string into an RDKit molecule. Rows whose SMILES could
# not be parsed end up with a null "mol" and are dropped, so features and
# targets are later built from the same set of rows.
df["mol"] = df["smiles"].apply(Chem.MolFromSmiles)
df = df[df["mol"].notnull()]

smiles_list = df['smiles']  # SMILES strings of the rows that parsed
y = df['exp'].values        # regression target as a NumPy array

# --- MORGAN FINGERPRINT & MACCS KEYS ---
def morgan_fp(mol, radius=2, nBits=1024):
    """Return the Morgan fingerprint of ``mol`` as a fixed-length bit vector.

    Args:
        mol: RDKit molecule to fingerprint.
        radius: Morgan radius forwarded to RDKit (default 2).
        nBits: Length of the requested bit vector (default 1024).

    Returns:
        The object returned by ``AllChem.GetMorganFingerprintAsBitVect``.
    """
    # nBits is passed by keyword so it cannot be confused with any other
    # positional option of the RDKit call.
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
def maccs_fp(mol):
    """Compute the MACCS-keys fingerprint for an RDKit molecule.

    Args:
        mol: RDKit molecule to fingerprint.

    Returns:
        Whatever ``MACCSkeys.GenMACCSKeys`` returns for ``mol``.
    """
    keys = MACCSkeys.GenMACCSKeys(mol)
    return keys
def _bits_to_row(fp):
    """Decode a fingerprint's '0'/'1' bit string into a uint8 array of 0s and 1s."""
    ascii_codes = np.frombuffer(fp.ToBitString().encode('utf-8'), 'u1')
    return ascii_codes - 48  # ord('0') == 48, so '0' -> 0 and '1' -> 1


# One fingerprint object per molecule, in the row order of df.
morgan_features = list(map(morgan_fp, df["mol"]))
maccs_features = list(map(maccs_fp, df["mol"]))

# Stack the decoded fingerprints into one feature matrix per fingerprint type.
X_morgan = np.array(list(map(_bits_to_row, morgan_features)))
X_maccs = np.array(list(map(_bits_to_row, maccs_features)))

# --- TRAIN/TEST SPLIT ---
# Split both feature matrices and the target in ONE call so the Morgan rows,
# MACCS rows and targets share the same train/test partition by construction,
# instead of relying on two separate calls agreeing through a repeated seed.
# This also avoids assigning to `_`, which IPython uses for the last output.
(
    X_train_morgan, X_test_morgan,
    X_train_maccs, X_test_maccs,
    y_train, y_test,
) = train_test_split(
    X_morgan, X_maccs, y, test_size=0.2, random_state=42
)


# --- SCALE ---
# Standardise the regression target. The scaler is fitted on the training
# targets only; the test targets are transformed with those same statistics.
scaler = StandardScaler()

# The scaler is fed column vectors, so reshape to (n, 1) and flatten afterwards.
y_train_column = y_train.reshape(-1, 1)
y_test_column = y_test.reshape(-1, 1)

y_train_scaled = scaler.fit_transform(y_train_column).ravel()
y_test_scaled = scaler.transform(y_test_column).ravel()

# --- MLPREGRESSOR ---
# Both networks use the same architecture, iteration cap and seed, so the
# only difference between them is the fingerprint they are trained on.
mlp_settings = dict(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)

mlp_morgan = MLPRegressor(**mlp_settings)
mlp_morgan.fit(X_train_morgan, y_train_scaled)

mlp_maccs = MLPRegressor(**mlp_settings)
mlp_maccs.fit(X_train_maccs, y_train_scaled)

# --- EVALUATION ---
def _to_original_units(pred_scaled):
    """Undo the target scaling on a 1-D array of scaled predictions."""
    return scaler.inverse_transform(pred_scaled.reshape(-1, 1)).ravel()


# Morgan model: predict in scaled space, map back to the original target
# units, then score against the unscaled test targets.
y_pred_morgan_scaled = mlp_morgan.predict(X_test_morgan)
y_pred_morgan = _to_original_units(y_pred_morgan_scaled)
rmse_morgan = np.sqrt(mean_squared_error(y_test, y_pred_morgan))

# MACCS model: identical steps on the MACCS feature matrix.
y_pred_maccs_scaled = mlp_maccs.predict(X_test_maccs)
y_pred_maccs = _to_original_units(y_pred_maccs_scaled)
rmse_maccs = np.sqrt(mean_squared_error(y_test, y_pred_maccs))

print(f"RMSE (Morgan Fingerprints): {rmse_morgan:.3f}")
print(f"RMSE (MACCS Keys):          {rmse_maccs:.3f}")


RMSE (Morgan Fingerprints): 0.813
RMSE (MACCS Keys):          0.947
