# Install packages (Colab)

In [None]:
# Install pinned torch, torchvision and torchaudio builds from the PyTorch cu126 wheel index.
!pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/cu126

In [2]:
import torch
# Show the installed version string; the same string is interpolated into the PyG wheel URL in the next install cell.
print(torch.__version__)

2.8.0+cu126


In [None]:
# Build the PyG wheel-index URL from the installed torch version string and pass it to
# pip's -f (find-links) option; `$pyg_url` is expanded by IPython before the shell runs.
pyg_url = f"https://data.pyg.org/whl/torch-{torch.__version__}.html"
# NOTE(review): none of the packages installed below are version-pinned — consider pinning for reproducibility.
!pip install pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f $pyg_url
!pip install rdkit

In [None]:
# chembl_structure_pipeline is used for SMILES standardization and pandarallel for the
# parallel DataFrame apply in the standardization cells.
# NOTE(review): versions are not pinned.
!pip install chembl_structure_pipeline
!pip install pandarallel

# Download code, trained models, and challenge test data

In [None]:
# Clone the challenge repository and make it the working directory; later cells import
# `data.*` / `src.*` and use paths relative to it.
# NOTE(review): intended to run once per session — %cd changes the kernel's working
# directory, so a re-run would clone again relative to the new location.
!git clone https://github.com/LongHung-Pham/EUOS25_challenge.git
%cd EUOS25_challenge

In [None]:
# Download the archive of trained models and unpack it under ./models
# (the prediction cell reads checkpoints from models/finetuned_models_submission/<target>/).
!wget https://huggingface.co/datasets/longhung25/EUOS25_challenge/resolve/main/finetuned_models_submission.zip
!unzip finetuned_models_submission.zip -d models

In [None]:
# Download the challenge test data (CSV) into the current directory.
# NOTE(review): --no-check-certificate turns off TLS certificate verification for this download.
!wget https://ochem.eu/documents/euos25_challenge_test.csv --no-check-certificate

In [12]:
import chembl_structure_pipeline
from rdkit import Chem

from rdkit import RDLogger
# Raise the RDKit logger level to CRITICAL to silence lower-severity RDKit messages.
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

from pandarallel import pandarallel
# Initialise pandarallel (with a progress bar) before `parallel_apply` is used in the
# standardization cell.
pandarallel.initialize(progress_bar=True)

def standardize_smi(smi):
    """Return the standardized parent SMILES for ``smi``.

    The SMILES is parsed with RDKit, standardized with
    ``chembl_structure_pipeline.standardize_mol``, reduced to its parent
    structure with ``chembl_structure_pipeline.get_parent_mol`` and written
    back out as a SMILES string.

    Args:
        smi: Input SMILES string.

    Returns:
        SMILES string of the standardized parent molecule.

    Raises:
        ValueError: If RDKit cannot parse ``smi`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None rather than
        # raising; fail here with the offending input instead of letting the
        # None reach the standardizer and surface as an unrelated error.
        raise ValueError(f"Could not parse SMILES: {smi!r}")

    standard_record = chembl_structure_pipeline.standardize_mol(mol)      # Normalizer and Uncharger
    # get_parent_mol returns a (molecule, exclude flag) pair; the flag is not used here.
    standard_parent, _ = chembl_structure_pipeline.get_parent_mol(standard_record)
    return Chem.MolToSmiles(standard_parent)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


[15:44:26] Initializing Normalizer


In [None]:
import pandas as pd
import numpy as np

# Read the challenge test set, standardize every SMILES in parallel, and write
# only the standardized column to a CSV for the graph-building step.
test = pd.read_csv('euos25_challenge_test.csv')
test['std_smiles'] = test['SMILES'].parallel_apply(standardize_smi)
test[['std_smiles']].to_csv('TEST_smiles.csv', index=False)

In [None]:
from data.datasets import PredictionDataset
from data.data_processing import load_drug_smile
from torch_geometric.loader import DataLoader

# Load the standardized SMILES (column 0 of the CSV) through the project's helper.
xd, smile_graph = load_drug_smile('TEST_smiles.csv', smile_col_index=0)

test_dataset = PredictionDataset(
    root='pytorch_data',
    dataset='TEST_prediction',
    xd=xd,
    smile_graph=smile_graph,
)

# shuffle=False keeps batches in dataset order; predictions are later attached
# to the test frame by position.
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

# Ensemble predict

In [18]:
from src.model import GNN_net

# Names of the pretraining multitask heads. They are only listed so the model
# can be constructed with these heads when checkpoints are loaded.
tasks = [
    'y_hl_PC',
    'y_hl_QMUG',
    'y_pol_QMUG',
    'y_ent_QMUG',
    'y_dipol_QMUG',
    'y_disp_QMUG',
    'y_hl_QM40',
    'y_spa_QM40',
    'y_pol_QM40',
    'y_dipol_QM40',
]

def ensemble_predict(models_paths, test_loader, device='cuda'):
    """Average sigmoid probabilities over several fine-tuned checkpoints.

    For each path a new ``GNN_net`` is built with one output per name in the
    module-level ``tasks`` list plus a ``'y_binary_class'`` head, the state
    dict at that path is loaded, the model is switched to the
    ``'y_binary_class'`` fine-tuning mode, and it is run over ``test_loader``
    in eval mode without gradients.

    Args:
        models_paths: Iterable of checkpoint (state-dict) file paths.
        test_loader: Loader yielding batches with a ``.to(device)`` method. It
            is iterated once per checkpoint and must yield samples in the same
            order each time, because results are averaged by position.
        device: Device the checkpoints are loaded onto and inference runs on.

    Returns:
        NumPy array with the element-wise mean, over all checkpoints, of the
        per-sample sigmoid outputs.
    """
    all_fold_probs = []

    for path in models_paths:
        # Load model architecture
        task_heads = {k: 1 for k in tasks}
        task_heads['y_binary_class'] = 1
        model = GNN_net(num_gnn_layers = 4, graph_pooling = 'attention', JK = 'concat',
                        h_dim = 512, ffn_dim = 256,
                        task_heads = task_heads)
        # map_location lets a checkpoint saved on one device (e.g. a GPU) be
        # loaded when `device` is different (e.g. 'cpu').
        model.load_state_dict(torch.load(path, map_location=device))
        model.set_fine_tuning_mode('y_binary_class')
        model.to(device)
        model.eval()

        model_probs = []
        with torch.no_grad():
            for data in test_loader:
                data = data.to(device)
                logits = model(data)
                # squeeze() turns a single-sample batch into a 0-d tensor whose
                # tolist() is a bare float; atleast_1d keeps the result a
                # sequence so extend() works for every batch size.
                probs = torch.atleast_1d(torch.sigmoid(logits.squeeze()))
                model_probs.extend(probs.cpu().numpy().tolist())

        all_fold_probs.append(model_probs)

    # Average the probabilities across all loaded models
    final_probs = np.mean(all_fold_probs, axis=0)
    return final_probs

In [19]:
# Ensemble-averaged predictions, keyed by target name.
y_pred = {}

for tgt in ['Transmittance_340', 'Transmittance_450', 'Fluorescence_340', 'Fluorescence_480']:
    print(f'Making predictions for: {tgt}')

    # Ten checkpoints (indices 0-9) are expected under each target's directory.
    # The list is built unconditionally: the previous membership test repeated
    # the loop's own target list and was therefore always true.
    models_list = [f'models/finetuned_models_submission/{tgt}/ensemble_model_{i}.pt' for i in range(10)]

    final_predictions = ensemble_predict(models_list, test_loader)
    y_pred[tgt] = final_predictions
    print(y_pred[tgt])

Making predictions for: Transmittance_340


  self.pool = GlobalAttention(gate_nn = Linear((self.num_gnn_layers) * h_dim + self.node_dim, 1))


[1.12635204e-01 4.25234343e-03 9.69525743e-01 ... 7.72773210e-04
 4.07091965e-03 1.70617237e-02]
Making predictions for: Transmittance_450
[0.05373566 0.12837399 0.09027864 ... 0.00361587 0.00463057 0.06645496]
Making predictions for: Fluorescence_340
[0.3783814  0.18003964 0.9894641  ... 0.17635971 0.94959626 0.0739204 ]
Making predictions for: Fluorescence_480
[0.01472478 0.00098091 0.04820189 ... 0.09120818 0.01644282 0.09577888]


In [21]:
import pandas as pd

# Re-read the original test file so `test` holds only the columns stored in the CSV
# (the frame built in the standardization cell also carries `std_smiles`).
test = pd.read_csv('euos25_challenge_test.csv')
print(test.shape)
# Last expression: rendered as the cell's output.
test.head()

(29420, 2)


Unnamed: 0,ID,SMILES
0,2,COC1=C(OC)C=C2C(=O)N(CCC(=O)NCC3=CC=CC(F)=C3)C...
1,3,O=C(C1CCCCC11OCCO1)N1CCOC(C1)C1=CC=CC=C1
2,5,CC[C@@H](C)[C@@H](CO)NC(=O)CC1=C(C)C2=CC=C(O)C...
3,8,O=C(CCC1NC(=O)N(C1=O)C1=CC=CC2=CC=CC=C12)NCCC1...
4,14,CC1=NN(CCNC(=O)CCC2NC(=O)N(C2=O)C2=CC=CC=C2)C(...


In [22]:
# Submission column name -> prediction array of the corresponding target.
conversion_name = {
    'Transmittance(340)': y_pred['Transmittance_340'],
    'Transmittance(450)': y_pred['Transmittance_450'],
    'Fluorescence(340/450)': y_pred['Fluorescence_340'],
    'Fluorescence(>480)': y_pred['Fluorescence_480'],
}

# Add each prediction array to the test frame as a new column.
for column, values in conversion_name.items():
    test[column] = values

test.head()

Unnamed: 0,ID,SMILES,Transmittance(340),Transmittance(450),Fluorescence(340/450),Fluorescence(>480)
0,2,COC1=C(OC)C=C2C(=O)N(CCC(=O)NCC3=CC=CC(F)=C3)C...,0.112635,0.053736,0.378381,0.014725
1,3,O=C(C1CCCCC11OCCO1)N1CCOC(C1)C1=CC=CC=C1,0.004252,0.128374,0.18004,0.000981
2,5,CC[C@@H](C)[C@@H](CO)NC(=O)CC1=C(C)C2=CC=C(O)C...,0.969526,0.090279,0.989464,0.048202
3,8,O=C(CCC1NC(=O)N(C1=O)C1=CC=CC2=CC=CC=C12)NCCC1...,0.157718,0.176484,0.126915,0.069267
4,14,CC1=NN(CCNC(=O)CCC2NC(=O)N(C2=O)C2=CC=CC=C2)C(...,0.031769,0.137583,0.056516,0.057944


In [23]:
# Write the submission file: the four prediction columns only, in this order,
# without the DataFrame index.
submission_columns = [
    'Transmittance(340)',
    'Transmittance(450)',
    'Fluorescence(340/450)',
    'Fluorescence(>480)',
]
test[submission_columns].to_csv("submission.csv", index=False)