In [1]:
# pip install ipykernel ipywidgets torch pandas datasets transformers[torch] tensorboard astartes
# Then clone https://github.com/NVIDIA/apex.git and run `python setup.py install` from inside the checkout.

In [1]:
from pathlib import Path

In [2]:
# Identifier of the pretrained checkpoint that the tokenizer and model are loaded from below.
_base = "DeepChem/ChemBERTa-77M-MLM"
# CSV holding the fine-tuning data (relative path, so it resolves against the current
# working directory). Later cells read its "source", "solute_smiles", "solvent_smiles",
# "temperature" and "logS" columns.
_tuning_data = Path("../../data/krasnov/bigsoldb_chemprop_nonaq.csv")

In [3]:
from astartes import train_test_split
import pandas as pd
from datasets import Dataset

In [4]:
# Load the fine-tuning table.
df = pd.read_csv(_tuning_data)

# Split on the unique values of "source" rather than on individual rows, so every
# row sharing a source ends up in exactly one of the two partitions.
studies_train, studies_val = train_test_split(pd.unique(df["source"]), random_state=1, train_size=0.90, test_size=0.10)

# Row membership as boolean masks. Selecting with the masks avoids feeding index
# *labels* to the positional `.iloc`, which is only correct while the frame still
# carries the default 0..n-1 index.
in_train = df["source"].isin(studies_train)
in_val = df["source"].isin(studies_val)

# Kept for inspection / backward compatibility: index labels of each partition.
train_indexes = df.index[in_train].tolist()
val_indexes = df.index[in_val].tolist()

train_dataset = Dataset.from_pandas(df.loc[in_train])
val_dataset = Dataset.from_pandas(df.loc[in_val])

In [5]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer

In [6]:
def tokenize_function(examples, tokenizer):
    """Build one text per row and tokenize the whole batch.

    Args:
        examples: Mapping with "solute_smiles", "solvent_smiles" and
            "temperature" entries, each an iterable of per-row values.
        tokenizer: Callable invoked as
            ``tokenizer(texts, padding="max_length", truncation=True)``.

    Returns:
        Whatever ``tokenizer`` returns for the list of assembled texts.
    """
    rows = zip(examples["solute_smiles"], examples["solvent_smiles"], examples["temperature"])
    texts = []
    for solute, solvent, temperature in rows:
        # Text layout: "<solute> <solvent> <temperature>", single-space separated.
        texts.append(solute + ' ' + solvent + ' ' + str(temperature))
    return tokenizer(texts, padding="max_length", truncation=True)

def preprocess_labels(example):
    """Attach the regression target to a single row.

    Reads ``example["logS"]``, converts it to ``float`` and stores it under
    the "labels" key of the same mapping (the row is modified in place).

    Returns:
        The same ``example`` object, now carrying "labels".
    """
    target = float(example["logS"])
    example["labels"] = target
    return example


In [7]:
# Both objects come from the same `_base` checkpoint. `num_labels=1` asks for a
# single-output head; the load message below reports that the classifier weights
# are newly initialized, i.e. they still have to be trained.
model = AutoModelForSequenceClassification.from_pretrained(
    _base,
    num_labels=1,
)
tokenizer = AutoTokenizer.from_pretrained(_base)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MLM and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def _tokenize_batch(batch):
    """Tokenize one batch of rows with the notebook-level `tokenizer`."""
    return tokenize_function(batch, tokenizer)


# First pass tokenizes in batches; second pass adds the float "labels" column row by row.
train_dataset = train_dataset.map(_tokenize_batch, batched=True).map(preprocess_labels)
val_dataset = val_dataset.map(_tokenize_batch, batched=True).map(preprocess_labels)

Map:   0%|          | 0/37190 [00:00<?, ? examples/s]

Map:   0%|          | 0/37190 [00:00<?, ? examples/s]

Map:   0%|          | 0/4534 [00:00<?, ? examples/s]

Map:   0%|          | 0/4534 [00:00<?, ? examples/s]

In [9]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # checkpoints are written here
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated and rejected by newer
    # transformers releases.
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_dir="./logs",  # TensorBoard event files
    logging_steps=10,
    save_total_limit=2,  # keep at most two checkpoints on disk
    report_to=["tensorboard"],
)



In [10]:
class CustomTrainer(Trainer):
    """Trainer whose loss is the mean squared error between logits and labels."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the MSE regression loss for one batch.

        Args:
            model: Model called as ``model(**inputs)``; its output must expose ``.logits``.
            inputs: Batch mapping containing a "labels" entry. The entry is
                popped, so ``inputs`` is modified and the model is called
                without labels.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just ``loss``.
            num_items_in_batch: Accepted for signature compatibility; not used.

        Returns:
            The scalar loss tensor, or ``(loss, outputs)`` if ``return_outputs``.

        Raises:
            RuntimeError: If the labels cannot be reshaped to the logits' shape.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # Drop the trailing size-1 output dimension: (batch, 1) -> (batch,).
        logits = outputs.logits.squeeze(-1)
        # Align the targets with the predictions. Without this, labels shaped
        # (batch, 1) would silently broadcast against (batch,) logits into a
        # (batch, batch) error matrix, and a dtype mismatch would fail in the loss.
        labels = labels.to(dtype=logits.dtype).reshape(logits.shape)
        loss = torch.nn.functional.mse_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [11]:
# Wire the model, both dataset splits and the tokenizer into the MSE-based trainer.
trainer = CustomTrainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Train the model with the datasets and `training_args` given to `trainer` above.
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# Write the final model and the tokenizer files into the same directory.
_model_dir = "./solubility_llm"
trainer.save_model(_model_dir)
tokenizer.save_pretrained(_model_dir)

In [None]:
# Function to make predictions
def predict(smiles_1, smiles_2, temperature):
    """Return the model's scalar output for one (smiles_1, smiles_2, temperature) input.

    The text is assembled as "<smiles_1> <smiles_2> <temperature>", the same
    layout `tokenize_function` uses for (solute, solvent, temperature).

    Args:
        smiles_1: First SMILES string.
        smiles_2: Second SMILES string.
        temperature: Value converted with ``str()`` and appended to the text.

    Returns:
        The single logit as a Python float.
    """
    model.eval()
    inputs = tokenizer(smiles_1 + " " + smiles_2 + " " + str(temperature), return_tensors="pt", padding="max_length", truncation=True)
    # The tokenizer returns CPU tensors, but the model may have been moved to an
    # accelerator during training; put the inputs on the model's device so the
    # forward pass does not fail with a device mismatch.
    inputs = inputs.to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.logits.item()

# Example usage: arguments are passed as (smiles_1, smiles_2, temperature); the
# printed value is the float returned by `predict`.
print(predict("CCO", "CCCC", 25.0))