# materials.smi-TED - INFERENCE (Regression)

In [None]:
# Install the extra third-party packages this notebook needs.
# `%pip` (rather than `!pip`) targets the environment of the running kernel.
# xgboost supplies the XGBRegressor used for the downstream regression.
# NOTE(review): seaborn is not referenced in the cells visible here - confirm it is still needed.
# NOTE(review): versions are unpinned, so a later re-run may resolve different releases.
%pip install seaborn xgboost

In [1]:
import sys

# Make the sibling `../inference` directory importable; the `GMsT_light`
# package imported below presumably lives there.
# The membership check keeps this cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
# NOTE: the path is relative, so it resolves against the kernel's working directory.
if '../inference' not in sys.path:
    sys.path.append('../inference')

In [2]:
# granite.materials.smi-TED (GMsT)
from GMsT_light.load import load_GMsT

# Data
import torch
import pandas as pd
import numpy as np

# Chemistry
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import Descriptors
# Switch on RDKit's image rendering for pandas DataFrames in this notebook.
# NOTE(review): per the RDKit PandasTools docs this is a global display setting that
# affects every DataFrame rendered afterwards - confirm for the installed RDKit version.
PandasTools.RenderImagesInAllDataFrames(True)

In [3]:
def normalize_smiles(smi, canonical=True, isomeric=False):
    """Return a normalized SMILES string for ``smi``, or ``None`` on failure.

    Parameters
    ----------
    smi : str
        Input SMILES string. Non-text values (e.g. ``None`` or the float
        ``NaN`` pandas uses for missing cells) yield ``None``.
    canonical : bool, default True
        Forwarded to ``Chem.MolToSmiles(canonical=...)``.
    isomeric : bool, default False
        Forwarded to ``Chem.MolToSmiles(isomericSmiles=...)``.

    Returns
    -------
    str or None
        The SMILES written back by RDKit, or ``None`` when the input is not
        text, cannot be parsed, or RDKit raises while processing it.
    """
    # Missing or non-text input can never be a SMILES; reject it up front
    # instead of relying on RDKit to raise.
    if not isinstance(smi, (str, bytes)):
        return None

    try:
        mol = Chem.MolFromSmiles(smi)
        # RDKit reports an unparsable SMILES by returning None, not by raising.
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, canonical=canonical, isomericSmiles=isomeric)
    except Exception:
        # Best-effort by design: any RDKit error maps to None so the caller can
        # drop the row. `Exception` (not a bare `except`) lets KeyboardInterrupt
        # and SystemExit propagate, so a long `.apply` can still be interrupted.
        return None

### Import GMsT

In [4]:
# Load the GMsT-Light checkpoint file `GMsT-Light_40.pt` from the local inference folder.
# The loader prints its own progress/configuration lines while it runs.
model_GMsT = load_GMsT(folder='../inference/GMsT_light', ckpt_filename='GMsT-Light_40.pt')

Random Seed: 12345
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Vocab size: 2393
[INFERENCE MODE - GMsT-Light]


## Lipophilicity Dataset

### Experiments - Data Load

In [5]:
# Read the Lipophilicity train and test splits from their CSV files.
# Paths are relative to the notebook's working directory.
df_train = pd.read_csv(filepath_or_buffer="../finetune/moleculenet/lipophilicity/train.csv")
df_test = pd.read_csv(filepath_or_buffer="../finetune/moleculenet/lipophilicity/test.csv")

### SMILES canonicalization

In [6]:
# Add a `norm_smiles` column holding the normalized form of every training SMILES;
# entries that normalize_smiles could not process come back as None.
df_train['norm_smiles'] = df_train['smiles'].apply(normalize_smiles)

# Keep only complete rows. Note this drops a row if ANY column is missing,
# not just `norm_smiles`.
df_train_normalized = df_train.dropna(axis=0, how='any')

# Report how many rows survived, then preview the first few.
print(df_train_normalized.shape)
df_train_normalized.head(n=5)

(3360, 3)


Unnamed: 0,smiles,y,norm_smiles
0,Nc1ncnc2c1c(COc3cccc(Cl)c3)nn2C4CCOCC4,0.814313,Nc1ncnc2c1c(COc1cccc(Cl)c1)nn2C1CCOCC1
1,COc1cc(cc2cnc(Nc3ccc(cc3)[C@@H](C)NC(=O)C)nc12...,0.446346,COc1cc(-c2ccncc2)cc2cnc(Nc3ccc(C(C)NC(C)=O)cc3...
2,CC(=O)Nc1ccc2ccn(c3cc(Nc4ccn(C)n4)n5ncc(C#N)c5...,1.148828,CC(=O)Nc1ccc2ccn(-c3cc(Nc4ccn(C)n4)n4ncc(C#N)c...
3,Oc1ccc(CCNCCS(=O)(=O)CCCOCCSc2ccccc2)c3sc(O)nc13,0.404532,O=S(=O)(CCCOCCSc1ccccc1)CCNCCc1ccc(O)c2nc(O)sc12
4,Clc1ccc2C(=O)C3=C(Nc2c1)C(=O)NN(Cc4cc5ccccc5s4...,-0.164144,O=c1[nH]n(Cc2cc3ccccc3s2)c(=O)c2c(=O)c3ccc(Cl)...


In [7]:
# Add a `norm_smiles` column holding the normalized form of every test SMILES;
# entries that normalize_smiles could not process come back as None.
df_test['norm_smiles'] = df_test['smiles'].apply(normalize_smiles)

# Keep only complete rows. Note this drops a row if ANY column is missing,
# not just `norm_smiles`.
df_test_normalized = df_test.dropna(axis=0, how='any')

# Report how many rows survived, then preview the first few.
print(df_test_normalized.shape)
df_test_normalized.head(n=5)

(420, 3)


Unnamed: 0,smiles,y,norm_smiles
0,N(c1ccccc1)c2ccnc3ccccc23,0.488161,c1ccc(Nc2ccnc3ccccc23)cc1
1,Clc1ccc2Oc3ccccc3N=C(N4CCNCC4)c2c1,0.070017,Clc1ccc2c(c1)C(N1CCNCC1)=Nc1ccccc1O2
2,NC1(CCC1)c2ccc(cc2)c3ncc4cccnc4c3c5ccccc5,-0.41503,NC1(c2ccc(-c3ncc4cccnc4c3-c3ccccc3)cc2)CCC1
3,OC[C@H](O)CN1C(=O)[C@@H](Cc2ccccc12)NC(=O)c3cc...,0.897942,O=C(NC1Cc2ccccc2N(CC(O)CO)C1=O)c1cc2cc(Cl)sc2[...
4,NS(=O)(=O)c1nc2ccccc2s1,-0.707731,NS(=O)(=O)c1nc2ccccc2s1


### Embeddings extraction 

#### GMsT embeddings extraction

In [8]:
# Encode the normalized training SMILES with the GMsT model.
# Gradient tracking is disabled because this is inference only.
with torch.no_grad():
    df_embeddings_train = model_GMsT.encode(df_train_normalized.loc[:, 'norm_smiles'])

# Preview the first rows of the embedding table.
df_embeddings_train.head(n=5)

100%|████████████████████████████████████████████████████████████████████████████████| 105/105 [03:13<00:00,  1.84s/it]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.367655,-0.504885,0.040487,0.385318,0.564915,-0.684494,1.160395,0.071209,0.799424,0.181321,...,-1.379997,-0.167225,0.104887,0.239569,-0.744388,0.590427,-0.808955,0.792579,0.550896,-0.176828
1,0.455315,-0.485555,0.062207,0.387993,0.5676,-0.713294,1.144275,-0.057043,0.753026,0.11218,...,-1.332148,-0.096662,0.221948,0.327924,-0.739362,0.659809,-0.775714,0.745849,0.56633,-0.111949
2,0.442306,-0.484732,0.084944,0.384783,0.564756,-0.704134,1.159498,0.021167,0.846541,0.118467,...,-1.324174,-0.110408,0.207824,0.281663,-0.780811,0.693486,-0.832625,0.76309,0.532461,-0.196704
3,0.527965,-0.519151,0.091633,0.353524,0.421796,-0.72421,1.093744,0.148567,0.804038,0.19463,...,-1.358411,-0.111491,0.151696,0.186734,-0.601867,0.64159,-0.747429,0.794229,0.640765,-0.239655
4,0.46443,-0.511091,0.038785,0.346215,0.49292,-0.61939,1.048157,0.095908,0.738605,0.11927,...,-1.223927,-0.109859,0.151281,0.244844,-0.686609,0.759331,-0.756339,0.766428,0.610451,-0.197342


In [9]:
# Encode the normalized test SMILES with the GMsT model.
# Gradient tracking is disabled because this is inference only.
with torch.no_grad():
    df_embeddings_test = model_GMsT.encode(df_test_normalized.loc[:, 'norm_smiles'])

# Preview the first rows of the embedding table.
df_embeddings_test.head(n=5)

100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:29<00:00,  2.31s/it]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.392257,-0.504843,0.056789,0.356299,0.475909,-0.648891,1.157854,-0.022919,0.703236,0.19202,...,-1.208708,-0.094439,0.128844,0.403992,-0.78278,0.541906,-0.707279,0.901036,0.629459,-0.020635
1,0.387423,-0.48114,0.049675,0.353058,0.601173,-0.6461,1.142397,0.06009,0.763807,0.110331,...,-1.248281,-0.139795,0.075586,0.202242,-0.729793,0.705911,-0.771753,0.843174,0.618851,-0.213577
2,0.390986,-0.510057,0.070656,0.380697,0.601485,-0.595825,1.182192,0.011089,0.688095,0.056449,...,-1.294592,-0.164844,0.194436,0.240738,-0.773446,0.608632,-0.74719,0.791914,0.611875,-0.125455
3,0.423929,-0.557328,0.083809,0.328704,0.399579,-0.622808,1.079941,0.097607,0.724022,0.135981,...,-1.412061,-0.106538,0.153314,0.209961,-0.699682,0.648063,-0.716247,0.75798,0.615959,-0.258683
4,0.335578,-0.559588,0.119438,0.36414,0.375473,-0.639831,1.144702,0.077508,0.791757,0.164204,...,-1.279038,-0.186734,0.106965,0.254949,-0.651691,0.594168,-0.680428,0.88748,0.651587,-0.144993


### Experiments - Lipophilicity prediction using GMsT latent spaces

#### XGBoost prediction using the whole Latent Space

In [10]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [11]:
# Gradient-boosted regressor trained on the full GMsT embedding of each molecule.
xgb_predict = XGBRegressor(
    n_estimators=2000,   # number of boosting rounds
    learning_rate=0.05,  # shrinkage applied to each round
    max_depth=4,         # depth limit of each tree
)
# Features are the training embeddings; the target is the `y` column of the training frame.
xgb_predict.fit(X=df_embeddings_train, y=df_train_normalized['y'])

In [12]:
# Run the fitted XGBoost model on the test-set embeddings to get predicted `y` values.
y_pred = xgb_predict.predict(X=df_embeddings_test)

In [13]:
# Root-mean-squared error between the held-out targets and the model predictions.
rmse = np.sqrt(
    mean_squared_error(y_true=df_test_normalized["y"], y_pred=y_pred)
)
print(f"RMSE Score: {rmse:.4f}")

RMSE Score: 0.6570
