In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import tqdm
import pandas as pd 
from rdkit import Chem

from coati.models.regression.fc_resnet import FCResNet
from coati.models.io.coati import load_e3gnn_smiles_clip_e2e
from coati.generative.coati_purifications import embed_smiles_batch

class PropertyPredictorResNet(FCResNet):
    """FCResNet subclass used as the property-prediction head.

    Adds no behavior of its own: every constructor argument is forwarded
    unchanged to ``FCResNet.__init__``. Its only role is to fix the default
    hyperparameters declared in the signature below.
    """

    def __init__(
        self,
        input_dim,
        features=256,
        depth=5,
        spectral_normalization=True,
        coeff=0.95,
        n_power_iterations=1,
        dropout_rate=0.01,
        num_outputs=1,
        activation="relu",
    ):
        """Forward all hyperparameters to the FCResNet constructor.

        The meaning of each argument is defined by ``FCResNet``; see that
        class for details. NOTE(review): ``input_dim`` presumably has to match
        the width of the encoding vectors fed to the model -- confirm against
        the training code.
        """
        # Collect the arguments once so the hand-off to the base class is a
        # single, easy-to-audit call.
        resnet_kwargs = {
            "input_dim": input_dim,
            "features": features,
            "depth": depth,
            "spectral_normalization": spectral_normalization,
            "coeff": coeff,
            "n_power_iterations": n_power_iterations,
            "dropout_rate": dropout_rate,
            "num_outputs": num_outputs,
            "activation": activation,
        }
        super().__init__(**resnet_kwargs)

class MoleculePropertyDataset(Dataset):
    """Map-style dataset pairing each molecule encoding with its property label.

    Indexing returns a dict with the keys ``'encoding'`` and ``'property'``.
    """

    def __init__(self, encodings, properties):
        """
        Args:
            encodings (torch.Tensor): Encoding vectors of the molecules.
            properties (torch.Tensor): Corresponding chemical property labels.

        Raises:
            ValueError: If ``encodings`` and ``properties`` differ in length.
        """
        # __len__ is driven by `encodings` alone, so a mismatch would otherwise
        # surface late (IndexError mid-epoch) or not at all (extra labels
        # silently ignored, possibly misaligned with the encodings).
        if len(encodings) != len(properties):
            raise ValueError(
                "encodings and properties must have the same length, got "
                f"{len(encodings)} and {len(properties)}"
            )
        self.encodings = encodings
        self.properties = properties

    def __len__(self):
        """Return the number of samples (one per encoding)."""
        return len(self.encodings)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as ``{'encoding': ..., 'property': ...}``."""
        return {
            'encoding': self.encodings[idx],
            'property': self.properties[idx]
        }

# --- Configuration: everything a reader might want to tune ---------------
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
TRAIN_CSV_PATH = './paddle_prediction/data/train.csv'
COATI_CHECKPOINT = "./models/grande_closed.pkl"
TRAIN_FRACTION = 0.85  # remaining samples form the validation split
SPLIT_SEED = 42        # fixes the train/val partition across kernel restarts

# --- Load SMILES strings and their labels --------------------------------
smiles_data = pd.read_csv(TRAIN_CSV_PATH)
smiles = smiles_data['smiles'].tolist()
labels = torch.tensor(smiles_data['label'].values, dtype=torch.float32)

# --- Load the frozen COATI encoder and its tokenizer ---------------------
# `doc_url` points at a local checkpoint file here, so nothing is downloaded.
encoder, tokenizer = load_e3gnn_smiles_clip_e2e(
    freeze=True,
    device=DEVICE,
    # model parameters to load.
    doc_url=COATI_CHECKPOINT,
)

# --- Embed every molecule -------------------------------------------------
encodings = embed_smiles_batch(smiles, encoder, tokenizer)
# Keep the embeddings on the CPU so they live alongside the CPU `labels`
# tensor inside the dataset.
encodings = encodings.cpu()

# --- Build the dataset and a reproducible train/validation split ---------
dataset = MoleculePropertyDataset(encodings, labels)
train_size = int(TRAIN_FRACTION * len(dataset))
val_size = len(dataset) - train_size
# Without an explicit generator the split depends on the global RNG state and
# changes on every run, so validation metrics are not comparable between runs.
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

Loading model from ./models/grande_closed.pkl
Loading tokenizer may_closedparen from ./models/grande_closed.pkl
number of parameters: 12.64M
number of parameters Total: 2.44M xformer: 17.92M Total: 20.36M 
vocab_name not found in tokenizer_vocabs, trying to load from file
Freezing encoder
20561664 params frozen!


In [3]:
# Materialize the training split as a plain list of sample dicts.
trnset = [sample for sample in train_dataset]
# Sanity check: shape of the first encoding after inserting a new axis at dim 1.
torch.unsqueeze(trnset[0]['encoding'], 1).shape


torch.Size([256, 1])