# granite.materials.smi-SSED - INFERENCE (Regression)

In [None]:
# Install the extra packages for this notebook into the kernel's environment.
# xgboost provides the XGBRegressor used in the prediction section; seaborn is
# not referenced by any of the visible cells.
# NOTE(review): versions are unpinned, so results may drift between installs —
# consider pinning them.
%pip install seaborn xgboost

In [1]:
import sys

# Make the `../inference` directory importable; the `smi_ssed` package used by
# the next cell is imported from there. The path is relative, so the notebook
# has to be run from its own folder. The membership check keeps the cell
# idempotent: re-running it no longer appends duplicate entries to sys.path.
if '../inference' not in sys.path:
    sys.path.append('../inference')

In [14]:
# materials.smi-ssed: loader for the pretrained model, imported from the
# `smi_ssed` package
from smi_ssed.load import load_smi_ssed

# Data
import torch
import pandas as pd
import numpy as np

# Chemistry
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import Descriptors
# Global display toggle from RDKit's PandasTools.
# NOTE(review): presumably lets molecule images render in every DataFrame
# shown in the notebook, not just one frame — confirm against the RDKit docs.
PandasTools.RenderImagesInAllDataFrames(True)

In [3]:
# function to canonicalize SMILES
def normalize_smiles(smi, canonical=True, isomeric=False):
    """Return the RDKit-normalized form of a SMILES string, or None on failure.

    Args:
        smi: SMILES string to normalize.
        canonical: Forwarded to ``Chem.MolToSmiles`` as ``canonical``.
        isomeric: Forwarded to ``Chem.MolToSmiles`` as ``isomericSmiles``.

    Returns:
        The SMILES string written back by ``Chem.MolToSmiles``, or ``None``
        when the input cannot be parsed or converted.
    """
    try:
        mol = Chem.MolFromSmiles(smi)
        # A failed parse yields no molecule; report that as None directly
        # instead of relying on MolToSmiles to raise on it.
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, canonical=canonical, isomericSmiles=isomeric)
    except Exception:
        # Best-effort: non-string input (e.g. NaN) or an RDKit failure maps to
        # None. Unlike a bare `except:`, this lets KeyboardInterrupt and
        # SystemExit propagate so a long `.apply` can still be interrupted.
        return None

## Import smi-ssed

In [4]:
# Load the smi-ssed model from the checkpoint `smi-ssed_130.pt` under
# `../inference/smi_ssed` (relative to the notebook's working directory).
# Per the cell output, loading reports the random seed, the vocabulary size,
# and that the model is in inference mode.
model_smi_ssed = load_smi_ssed(
    folder='../inference/smi_ssed',
    ckpt_filename='smi-ssed_130.pt'
)



Random Seed: 12345
Vocab size: 2393
[INFERENCE MODE - smi-ssed]


## Lipophilicity Dataset

### Experiments - Data Load

In [5]:
# Read the Lipophilicity train and test splits from CSV; the paths are
# relative to the notebook's working directory. Later cells use the `smiles`
# column as model input and the `y` column as the regression target.
df_train = pd.read_csv("../finetune/moleculenet/lipophilicity/train.csv")
df_test = pd.read_csv("../finetune/moleculenet/lipophilicity/test.csv")

### SMILES canonicalization

In [6]:
# Add the normalized SMILES as a new column on the training frame, then keep
# only the rows that have no missing value in any column.
train_canonical = df_train['smiles'].apply(normalize_smiles)
df_train['norm_smiles'] = train_canonical
df_train_normalized = df_train.dropna(axis=0, how='any')
print(df_train_normalized.shape)
df_train_normalized.head(5)

(3360, 3)


Unnamed: 0,smiles,y,norm_smiles
0,Nc1ncnc2c1c(COc3cccc(Cl)c3)nn2C4CCOCC4,0.814313,Nc1ncnc2c1c(COc1cccc(Cl)c1)nn2C1CCOCC1
1,COc1cc(cc2cnc(Nc3ccc(cc3)[C@@H](C)NC(=O)C)nc12...,0.446346,COc1cc(-c2ccncc2)cc2cnc(Nc3ccc(C(C)NC(C)=O)cc3...
2,CC(=O)Nc1ccc2ccn(c3cc(Nc4ccn(C)n4)n5ncc(C#N)c5...,1.148828,CC(=O)Nc1ccc2ccn(-c3cc(Nc4ccn(C)n4)n4ncc(C#N)c...
3,Oc1ccc(CCNCCS(=O)(=O)CCCOCCSc2ccccc2)c3sc(O)nc13,0.404532,O=S(=O)(CCCOCCSc1ccccc1)CCNCCc1ccc(O)c2nc(O)sc12
4,Clc1ccc2C(=O)C3=C(Nc2c1)C(=O)NN(Cc4cc5ccccc5s4...,-0.164144,O=c1[nH]n(Cc2cc3ccccc3s2)c(=O)c2c(=O)c3ccc(Cl)...


In [7]:
# Add the normalized SMILES as a new column on the test frame, then keep only
# the rows that have no missing value in any column.
test_canonical = df_test['smiles'].apply(normalize_smiles)
df_test['norm_smiles'] = test_canonical
df_test_normalized = df_test.dropna(axis=0, how='any')
print(df_test_normalized.shape)
df_test_normalized.head(5)

(420, 3)


Unnamed: 0,smiles,y,norm_smiles
0,N(c1ccccc1)c2ccnc3ccccc23,0.488161,c1ccc(Nc2ccnc3ccccc23)cc1
1,Clc1ccc2Oc3ccccc3N=C(N4CCNCC4)c2c1,0.070017,Clc1ccc2c(c1)C(N1CCNCC1)=Nc1ccccc1O2
2,NC1(CCC1)c2ccc(cc2)c3ncc4cccnc4c3c5ccccc5,-0.41503,NC1(c2ccc(-c3ncc4cccnc4c3-c3ccccc3)cc2)CCC1
3,OC[C@H](O)CN1C(=O)[C@@H](Cc2ccccc12)NC(=O)c3cc...,0.897942,O=C(NC1Cc2ccccc2N(CC(O)CO)C1=O)c1cc2cc(Cl)sc2[...
4,NS(=O)(=O)c1nc2ccccc2s1,-0.707731,NS(=O)(=O)c1nc2ccccc2s1


### Embeddings extraction

### smi-ssed embeddings extraction

In [8]:
# Encode the normalized training SMILES with gradient tracking disabled.
# The displayed result has one row per molecule and columns 0..767.
train_smiles = df_train_normalized['norm_smiles']
with torch.no_grad():
    df_embeddings_train = model_smi_ssed.encode(train_smiles)
df_embeddings_train.head(5)

  return bound(*args, **kwds)
100%|████████████████████████████████████████████████████████████████████████████████| 33/33 [00:05<00:00,  5.81it/s]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.080265,0.034904,0.435877,0.027711,0.081429,-0.290111,0.376825,0.226007,0.199301,-0.44446,...,0.279197,-0.368108,0.147418,0.442802,-0.173928,0.073809,-0.016911,-0.261915,-0.128148,0.332847
1,-0.187147,0.029456,0.553535,-0.002413,-0.009272,-0.23451,0.40182,0.295807,0.151423,-0.617558,...,0.327141,-0.351487,0.088823,0.398084,-0.199068,0.272885,-0.031163,-0.239988,0.018098,0.253334
2,-0.18254,0.040501,0.54589,-0.047389,0.10852,-0.327701,0.411576,0.226442,0.145194,-0.583229,...,0.260817,-0.336495,0.209357,0.346143,-0.206949,0.307284,-0.092287,-0.267434,0.085571,0.444158
3,-0.206591,0.008695,0.227075,0.055368,0.014546,-0.354243,0.280954,0.227574,0.243702,-0.452353,...,0.56042,-0.212152,0.056963,0.332285,-0.229991,-0.012139,0.138825,-0.198661,-0.038498,0.227769
4,-0.417161,-0.067443,0.443746,-0.064202,-0.202944,-0.32905,0.264605,0.519117,0.267605,-0.6083,...,0.340117,-0.26546,0.22317,0.324105,-0.227017,0.342013,0.015764,-0.128657,0.210682,0.391563


In [9]:
# Encode the normalized test SMILES with gradient tracking disabled.
# The displayed result has one row per molecule and columns 0..767.
test_smiles = df_test_normalized['norm_smiles']
with torch.no_grad():
    df_embeddings_test = model_smi_ssed.encode(test_smiles)
df_embeddings_test.head(5)

  return bound(*args, **kwds)
100%|██████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  5.35it/s]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.320537,0.212095,0.55451,0.075209,-0.150729,-0.347719,0.400961,0.666648,0.255496,-0.591539,...,0.267851,-0.376912,0.038312,0.38066,-0.104633,0.513495,-0.086275,-0.135534,0.093191,0.44429
1,-0.128747,0.044518,0.423002,-0.021135,-0.040059,-0.378097,0.384599,0.22165,0.225655,-0.372175,...,0.286428,-0.338783,0.181483,0.428925,-0.164278,0.341305,-0.059423,-0.172557,-0.039394,0.113462
2,-0.179157,0.02985,0.501458,-0.091178,0.008984,-0.419579,0.489816,0.388657,0.194892,-0.647254,...,0.250965,-0.222426,0.174361,0.45786,-0.155439,0.446646,-0.068179,-0.210553,0.13773,0.352219
3,-0.179847,0.027219,0.39248,0.023168,0.02757,-0.335774,0.258762,0.218043,0.267106,-0.438869,...,0.479541,-0.258187,0.124191,0.391419,-0.259501,0.237198,0.07298,-0.148524,0.066293,0.275586
4,-0.312562,-0.04261,0.409878,0.052937,-0.16694,-0.322114,0.301279,0.423582,0.292005,-0.520209,...,0.23687,-0.351482,0.077355,0.25251,-0.122125,0.270552,-0.05091,-0.192912,0.02742,0.503057


## Experiments - Lipophilicity prediction using smi-ssed latent spaces

### XGBoost prediction using the whole Latent Space

In [10]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [11]:
# Fit an XGBoost regressor on the full training embedding to predict `y`.
xgb_params = {'n_estimators': 2000, 'learning_rate': 0.05, 'max_depth': 4}
xgb_predict = XGBRegressor(**xgb_params)
xgb_predict.fit(df_embeddings_train, df_train_normalized['y'])

In [12]:
# Get XGBoost predictions for the test-set embeddings; `y_pred` is compared
# against the test labels when the RMSE is computed.
y_pred = xgb_predict.predict(df_embeddings_test)

In [15]:
# Root-mean-squared error between the test labels and the XGBoost predictions.
mse = mean_squared_error(df_test_normalized["y"], y_pred)
rmse = np.sqrt(mse)
print(f"RMSE Score: {rmse:.4f}")

RMSE Score: 0.7275
