In [24]:
import polaris as po
import pandas as pd

In [25]:
from packaging.version import Version
assert Version(po.__version__) >= Version("0.11.6"), "test.as_dataframe does not work in earlier versions of Polaris, please upgrade"

In [26]:
%%capture
# https://polarishub.io/benchmarks/polaris/adme-fang-rppb-1
benchmark = po.load_benchmark("polaris/adme-fang-RPPB-1")

In [27]:
train, test = benchmark.get_train_test_split()
test_df: pd.DataFrame = test.as_dataframe()
train_df: pd.DataFrame = train.as_dataframe()

In [28]:
from mordred import Calculator, descriptors
from rdkit.Chem import MolFromSmiles

In [29]:
calc = Calculator(descriptors, ignore_3D=True)

In [30]:
train_features = calc.pandas(map(MolFromSmiles, train_df["smiles"]), nmols=len(train_df)).fill_missing()
test_features = calc.pandas(map(MolFromSmiles, test_df["smiles"]), nmols=len(test_df)).fill_missing()

100%|██████████| 111/111 [00:02<00:00, 48.82it/s]
  t[t.applymap(is_missing)] = value
100%|██████████| 24/24 [00:00<00:00, 39.73it/s]
  t[t.applymap(is_missing)] = value


In [31]:
train_features

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,19.564049,15.515402,0,0,31.193902,2.460029,4.822181,31.193902,1.247756,4.152028,...,10.200067,74.294388,338.163043,7.194958,1667,40,134.0,158.0,9.451389,5.361111
1,15.653808,13.530766,0,0,26.599236,2.394797,4.789594,26.599236,1.266630,3.950621,...,9.882519,54.636160,290.137890,7.439433,946,33,104.0,121.0,8.027778,4.916667
2,22.494166,17.802912,0,1,37.841697,2.469719,4.850692,37.841697,1.351489,4.291692,...,10.311117,79.272896,372.175025,7.595409,2088,44,154.0,185.0,6.500000,6.083333
3,12.784941,10.806108,0,0,21.020790,2.493634,4.786867,21.020790,1.313799,3.736113,...,9.768069,63.439705,251.007471,10.040299,390,28,88.0,106.0,4.666667,3.444444
4,24.307097,17.264108,0,1,39.555540,2.472253,4.857191,39.555540,1.318518,4.376647,...,10.412081,85.616131,425.152161,8.021739,2831,46,168.0,199.0,8.312500,6.277778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,19.706179,15.948322,0,0,33.628374,2.472681,4.945361,33.628374,1.293399,4.179699,...,10.234840,61.081799,369.088019,8.787810,1644,45,134.0,160.0,9.000000,6.055556
107,20.838915,17.520490,0,0,33.703458,2.553484,5.026050,33.703458,1.248276,4.221641,...,10.312911,76.668244,370.180504,7.403610,1792,46,142.0,170.0,10.472222,5.972222
108,18.022418,14.619949,0,0,28.973289,2.497482,4.833825,28.973289,1.259708,4.070020,...,10.076222,71.785695,345.070261,8.847955,1278,36,122.0,144.0,8.250000,4.972222
109,17.157991,14.060054,0,1,29.114369,2.360026,4.706457,29.114369,1.323380,4.022362,...,9.889845,69.501924,304.189926,6.612824,1150,30,114.0,131.0,6.256944,4.847222


In [32]:
import lightning
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint
import torch
import numpy as np

In [33]:
from nepare.nn import NeuralPairwiseRegressor as NPR
from nepare.data import PairwiseAugmentedDataset, PairwiseAnchoredDataset, PairwiseInferenceDataset
from nepare.inference import predict

In [34]:
X = torch.tensor(train_features.to_numpy(dtype=np.float32), dtype=torch.float32)
y = torch.tensor(train_df["LOG_RPPB"].to_numpy(dtype=np.float32), dtype=torch.float32)[:, None]  # keep it 2d!
X_test = torch.tensor(test_features.to_numpy(dtype=np.float32), dtype=torch.float32)

In [35]:
val_idx = 20  # use 20/110 for validation

In [36]:
from fastprop.data import standard_scale, inverse_standard_scale

In [37]:
X[val_idx:], means, vars = standard_scale(X[val_idx:])
X[:val_idx] = standard_scale(X[:val_idx], means, vars)
X_test = standard_scale(X_test, means, vars)
# sorta-Winsorization
X.clamp_(-3, 3)
X_test.clamp_(-3, 3)

tensor([[-1.4458, -1.5701, -0.2735,  ..., -1.6418, -0.9506, -1.2769],
        [ 0.4938,  0.4699, -0.2735,  ...,  0.4593,  0.1676,  0.5969],
        [ 0.2475,  0.2941, -0.2735,  ...,  0.3074,  0.4769,  0.0381],
        ...,
        [ 0.1190,  0.5346, -0.2735,  ...,  0.0796,  1.2110,  0.1696],
        [ 1.5723,  1.6106, -0.2735,  ...,  1.7502,  1.6189,  1.5284],
        [-1.7866, -1.7675, -0.2735,  ..., -1.4646, -1.7085, -1.9673]])

In [38]:
# do the same for targets
# y, target_means, target_vars = standard_scale(y)

In [39]:
training_dataset = PairwiseAugmentedDataset(X[val_idx:], y[val_idx:], how='full')
validation_dataset = PairwiseAnchoredDataset(X[val_idx:], y[val_idx:], X[:val_idx], y[:val_idx], how='full')
predict_dataset = PairwiseInferenceDataset(X[val_idx:], y[val_idx:], X_test, how='full')
train_loader = torch.utils.data.DataLoader(training_dataset, batch_size=64, shuffle=True)
val_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=64)
predict_loader = torch.utils.data.DataLoader(predict_dataset, batch_size=64)

In [40]:
npr = NPR(X.shape[1], 100, 2)
early_stopping = EarlyStopping(monitor="validation/loss")
model_checkpoint = ModelCheckpoint(monitor="validation/loss")

In [41]:
trainer = lightning.Trainer(max_epochs=50, log_every_n_steps=1, callbacks=[early_stopping, model_checkpoint])
trainer.fit(npr, train_loader, val_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type       | Params | Mode 
--------------------------------------------
0 | fnn  | Sequential | 332 K  | train
--------------------------------------------
332 K     Trainable params
0         Non-trainable params
332 K     Total params
1.332     Total estimated model params size (MB)
6         Modules in train mode
0         Modules in eval mode


Epoch 0:   2%|▏         | 2/130 [00:00<00:01, 115.78it/s, v_num=20, training/loss=1.170]

Epoch 8: 100%|██████████| 130/130 [00:01<00:00, 93.68it/s, v_num=20, training/loss=0.00855, validation/loss=0.400] 


In [42]:
npr = NPR.load_from_checkpoint(model_checkpoint.best_model_path)  # reload best model based on early stopping

In [43]:
y_pred, y_stdev = predict(npr, predict_loader, how="all")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting:   0%|          | 0/69 [00:00<?, ?it/s]

Predicting DataLoader 0: 100%|██████████| 69/69 [00:00<00:00, 318.95it/s]


In [44]:
# y_pred = inverse_standard_scale(torch.tensor(y_pred), target_means, target_vars)

In [45]:
# Evaluate your predictions
results = benchmark.evaluate(y_pred)

# # Submit your results
# results.upload_to_hub(owner="jacksonburns")

In [46]:
results

test_set,target_label,scores
test,LOG_RPPB,r20.7200728351016012mean_squared_error0.24871024317350518explained_var0.7210664599413337spearmanr0.8347826086956521mean_absolute_error0.38838022769908864pearsonr0.8551127426735149
r2,0.7200728351016012,
mean_squared_error,0.24871024317350518,
explained_var,0.7210664599413337,
spearmanr,0.8347826086956521,
mean_absolute_error,0.38838022769908864,
pearsonr,0.8551127426735149,
name,,
description,,
tags,,

test_set,target_label,scores
test,LOG_RPPB,r20.7200728351016012mean_squared_error0.24871024317350518explained_var0.7210664599413337spearmanr0.8347826086956521mean_absolute_error0.38838022769908864pearsonr0.8551127426735149
r2,0.7200728351016012,
mean_squared_error,0.24871024317350518,
explained_var,0.7210664599413337,
spearmanr,0.8347826086956521,
mean_absolute_error,0.38838022769908864,
pearsonr,0.8551127426735149,

0,1
r2,0.7200728351016012
mean_squared_error,0.2487102431735051
explained_var,0.7210664599413337
spearmanr,0.8347826086956521
mean_absolute_error,0.3883802276990886
pearsonr,0.8551127426735149


In [47]:
results.upload_to_hub(owner="jacksonburns")

PolarisHubError: [1mThe request to the Polaris Hub has failed.[0m
----------------------
Error reported was:
{
  "issues": [
    {
      "code": "invalid_type",
      "expected": "string",
      "message": "Required",
      "path": [
        "name"
      ],
      "received": "undefined"
    }
  ]
}