# granite.materials.smi-SSED - INFERENCE (Classification)

In [None]:
# Install extra packages for notebook
# NOTE(review): versions are unpinned, so a fresh run may pull different releases
# than the ones that produced the recorded outputs.
# xgboost is imported further down for the classifier; seaborn is not referenced in
# any cell visible in this notebook - confirm it is still needed.
%pip install seaborn xgboost

In [1]:
import sys

# Make the sibling `inference` directory importable so that `smi_ssed` can be
# imported below. The path is relative, so it is resolved against the kernel's
# current working directory.
# The membership check keeps the cell idempotent: re-running it does not pile
# up duplicate entries in sys.path.
if '../inference' not in sys.path:
    sys.path.append('../inference')

In [2]:
# materials.smi-ssed
# (presumably resolved through the '../inference' entry added to sys.path in the previous cell)
from smi_ssed.load import load_smi_ssed

# Data
import torch
import pandas as pd

# Chemistry
from rdkit import Chem
from rdkit.Chem import PandasTools
# NOTE(review): Descriptors is not referenced in any cell visible in this notebook.
from rdkit.Chem import Descriptors
# Global display setting: going by the function name, this turns on molecule-image
# rendering for every DataFrame shown in the notebook - see the RDKit PandasTools docs.
PandasTools.RenderImagesInAllDataFrames(True)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# function to canonicalize SMILES
def normalize_smiles(smi, canonical=True, isomeric=False):
    """Return the SMILES string RDKit writes back for ``smi``, or None on failure.

    Parameters
    ----------
    smi : str
        Input SMILES string.
    canonical : bool, default True
        Forwarded to ``Chem.MolToSmiles`` as ``canonical``.
    isomeric : bool, default False
        Forwarded to ``Chem.MolToSmiles`` as ``isomericSmiles``.

    Returns
    -------
    str or None
        The normalized SMILES, or ``None`` when the input cannot be parsed
        or written back (this function never raises an ``Exception``).
    """
    try:
        mol = Chem.MolFromSmiles(smi)
        # RDKit reports an unparsable / unsanitizable SMILES by returning None
        # instead of raising, so handle that case explicitly rather than
        # relying on MolToSmiles(None) blowing up.
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, canonical=canonical, isomericSmiles=isomeric)
    except Exception:
        # Best-effort contract: any ordinary error (e.g. a non-string cell)
        # maps to None. Catching Exception instead of using a bare `except:`
        # lets KeyboardInterrupt / SystemExit propagate so a long .apply()
        # can still be interrupted.
        return None

## Import smi-ssed

In [4]:
# Build the smi-ssed model from the checkpoint file stored in the local inference folder.
# The loader prints its own log (seed, vocab size, mode) as the cell output.
model_smi_ssed = load_smi_ssed(folder='../inference/smi_ssed', ckpt_filename='smi-ssed_130.pt')



Random Seed: 12345
Vocab size: 2393
[INFERENCE MODE - smi-ssed]


## BBBP Dataset

### Experiments - Data Load

In [5]:
# Read the BBBP train and test splits (train first, then test) into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../finetune/moleculenet/bbbp/train.csv",
        "../finetune/moleculenet/bbbp/test.csv",
    )
)

### SMILES canonicalization

In [6]:
# Canonicalize every training SMILES; entries that fail come back as None.
train_canonical_smiles = df_train['smiles'].apply(normalize_smiles)
df_train['norm_smiles'] = train_canonical_smiles

# NOTE(review): with these (default) settings dropna removes a row if ANY column
# holds a missing value, not only 'norm_smiles' - confirm that is intended.
df_train_normalized = df_train.dropna(axis=0, how='any')
print(df_train_normalized.shape)
df_train_normalized.head(5)

[14:36:37] Explicit valence for atom # 1 N, 4, is greater than permitted
[14:36:37] Explicit valence for atom # 6 N, 4, is greater than permitted
[14:36:37] Explicit valence for atom # 6 N, 4, is greater than permitted
[14:36:37] Explicit valence for atom # 11 N, 4, is greater than permitted
[14:36:37] Explicit valence for atom # 5 N, 4, is greater than permitted


(1634, 5)




Unnamed: 0,num,name,p_np,smiles,norm_smiles
0,1,Propanolol,1,[Cl].CC(C)NCC(O)COc1cccc2ccccc12,CC(C)NCC(O)COc1cccc2ccccc12.[Cl]
1,2,Terbutylchlorambucil,1,C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl,CC(C)(C)OC(=O)CCCc1ccc(N(CCCl)CCCl)cc1
2,3,40730,1,c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO...,CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23
3,4,24,1,C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C,CC(=O)NCCCOc1cccc(CN2CCCCC2)c1
4,6,cefoperazone,1,CCN1CCN(C(=O)N[C@@H](C(=O)N[C@H]2[C@H]3SCC(=C(...,CCN1CCN(C(=O)NC(C(=O)NC2C(=O)N3C(C(=O)O)=C(CSc...


In [7]:
# Canonicalize every test SMILES; entries that fail come back as None.
test_canonical_smiles = df_test['smiles'].apply(normalize_smiles)
df_test['norm_smiles'] = test_canonical_smiles

# NOTE(review): with these (default) settings dropna removes a row if ANY column
# holds a missing value, not only 'norm_smiles' - confirm that is intended.
df_test_normalized = df_test.dropna(axis=0, how='any')
print(df_test_normalized.shape)
df_test_normalized.head(5)

[14:36:37] Explicit valence for atom # 12 N, 4, is greater than permitted
[14:36:37] Explicit valence for atom # 5 N, 4, is greater than permitted


(192, 5)




Unnamed: 0,num,name,p_np,smiles,norm_smiles
0,13,18,1,C(Cl)Cl,ClCCl
1,23,SKF-93619,0,c1cc2c(cc(CC3=CNC(=NC3=O)NCCSCc3oc(cc3)CN(C)C)...,CN(C)Cc1ccc(CSCCNc2nc(=O)c(Cc3ccc4ccccc4c3)c[n...
2,36,etomidate,1,CCOC(=O)c1cncn1C(C)c2ccccc2,CCOC(=O)c1cncn1C(C)c1ccccc1
3,37,11a,0,CN(C)c1cc(C2=NC(N)=NN2)ccn1,CN(C)c1cc(-c2nc(N)n[nH]2)ccn1
4,79,compound 45,1,N1(Cc2cc(OCCCNc3oc4ccccc4n3)ccc2)CCCCC1,c1cc(CN2CCCCC2)cc(OCCCNc2nc3ccccc3o2)c1


### Embeddings extraction

### smi-ssed embeddings extraction

In [8]:
train_smiles = df_train_normalized['norm_smiles']

# Inference only: no autograd bookkeeping is needed while encoding.
with torch.no_grad():
    df_embeddings_train = model_smi_ssed.encode(train_smiles)

# Preview the first rows of the embedding table.
df_embeddings_train.head(5)

  return bound(*args, **kwds)
100%|████████████████████████████████████████████████████████████████████████████████| 16/16 [00:03<00:00,  4.71it/s]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.20566,-0.111345,0.441314,-0.071542,0.155176,-0.378108,0.360412,0.282598,0.208329,-0.471892,...,0.429639,-0.152912,0.111276,0.535827,-0.096528,0.203259,-0.104669,-0.200333,-0.068437,0.17471
1,-0.017181,-0.065592,0.368753,0.038683,0.224666,-0.336187,0.35354,0.059201,0.010582,-0.242395,...,0.549646,-0.225171,0.105946,0.508142,-0.01344,0.005127,-0.036332,-0.273413,0.007376,0.1282
2,-0.134377,-0.01019,0.470901,0.080473,0.072051,-0.268193,0.234757,0.088561,0.004254,-0.435175,...,0.361545,-0.234929,0.248257,0.219218,-0.14996,0.279441,0.09542,-0.268197,-0.041442,0.243305
3,-0.034948,-0.062695,0.413172,-0.004619,0.280795,-0.3052,0.378292,0.018933,0.069061,-0.24041,...,0.54224,-0.337464,0.014177,0.540148,-0.039547,0.031763,0.027257,-0.215158,-0.070677,0.059504
4,-0.088577,0.053799,0.425407,0.103547,-0.041102,-0.254561,0.114115,-0.02409,-0.009089,-0.416894,...,0.495099,-0.346302,-0.042401,0.247797,-0.148452,0.062143,0.16757,-0.324576,-0.080183,0.196019


In [9]:
test_smiles = df_test_normalized['norm_smiles']

# Inference only: no autograd bookkeeping is needed while encoding.
with torch.no_grad():
    df_embeddings_test = model_smi_ssed.encode(test_smiles)

# Preview the first rows of the embedding table.
df_embeddings_test.head(5)

  return bound(*args, **kwds)
100%|██████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.89it/s]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.171925,-0.131785,0.343773,-0.284037,0.026675,-0.087566,0.53501,0.26767,0.359609,0.442704,...,-0.416212,-0.376148,0.194237,0.326203,0.112305,0.539833,-0.297891,-0.261284,0.071261,0.203141
1,-0.160587,0.077772,0.462698,-0.023961,0.095067,-0.367896,0.311824,0.239797,0.20978,-0.518445,...,0.386745,-0.185277,0.107852,0.452809,-0.17415,0.323596,0.014795,-0.204783,0.12612,0.323408
2,-0.13389,0.053946,0.464414,-0.030522,0.097208,-0.340851,0.409256,0.351503,0.246436,-0.514199,...,0.435457,-0.308675,0.078139,0.563646,-0.083576,0.058036,-0.092288,-0.272108,-0.110046,0.28861
3,-0.17172,0.079559,0.527576,0.043041,0.013062,-0.268483,0.420246,0.203715,0.188033,-0.515932,...,0.23406,-0.26517,0.071282,0.391368,-0.211012,0.334866,-0.146813,-0.205227,0.056464,0.386205
4,0.079989,0.241748,0.393966,0.115002,0.102797,-0.372177,0.431272,0.251328,0.119341,-0.435023,...,0.336146,-0.251939,0.023058,0.400258,-0.221673,0.347532,0.044417,-0.222342,0.042905,0.106123


## Experiments - BBBP prediction using smi-ssed latent spaces

### XGBoost prediction using the whole Latent Space

In [10]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [11]:
# Gradient-boosted classifier trained on the full embedding table,
# with the 'p_np' column as the label.
xgb_predict = XGBClassifier(
    n_estimators=2000,
    learning_rate=0.04,
    max_depth=8,
)
xgb_predict.fit(X=df_embeddings_train, y=df_train_normalized['p_np'])

In [12]:
# Class-probability matrix for the test embeddings; keep only column 1
# (presumably the probability of label 1 - the labels shown above are 0/1).
test_class_probabilities = xgb_predict.predict_proba(df_embeddings_test)
y_prob = test_class_probabilities[:, 1]

In [13]:
# Score the predicted probabilities against the test labels.
roc_auc = roc_auc_score(
    y_true=df_test_normalized["p_np"],
    y_score=y_prob,
)
print(f"ROC-AUC Score: {roc_auc:.4f}")

ROC-AUC Score: 0.9152
