In [1]:
import pandas as pd
import numpy as np
import os
import random

from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

In [2]:
import os

print(os.getcwd())

/home/wjstjrals417/Dacon/New_drug_development/Seokmin/src/ChemBERTa


In [3]:
from os.path import join
train_path = join('..', '..', '..', 'data', 'total_data', 'train.csv')
test_path = join('..', '..', '..', 'data', 'total_data', 'test.csv')
submission_path = join('..', '..', '..', 'data', 'total_data', 'sample_submission.csv')

## Model Load

In [4]:
model_name = "seyonec/ChemBERTa-zinc-base-v1"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=1)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Load data
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [6]:
# Tokenize SMILES strings with dynamic padding
def tokenize_smiles(smiles):
    return tokenizer(smiles, padding=True, truncation=True, max_length=128, return_tensors='pt')

In [7]:
# Apply tokenization and store as a list
train_tokens = [tokenize_smiles(smiles) for smiles in train_df['Smiles']]
test_tokens = [tokenize_smiles(smiles) for smiles in test_df['Smiles']]

In [8]:
# No scaling applied to IC50 values
y = train_df['IC50_nM'].values

In [9]:
# Split into training and validation sets
train_tokens, val_tokens, y_train, y_val = train_test_split(train_tokens, y, test_size=0.2, random_state=42)

In [10]:
class SMILESDataset(Dataset):
    def __init__(self, tokens, targets=None):
        self.tokens = tokens
        self.targets = targets
    
    def __len__(self):
        return len(self.tokens)
    
    def __getitem__(self, idx):
        item = {key: val.squeeze(0) for key, val in self.tokens[idx].items()}
        if self.targets is not None:
            item['labels'] = torch.tensor(self.targets[idx], dtype=torch.float)
        return item

In [11]:
train_dataset = SMILESDataset(train_tokens, y_train)
val_dataset = SMILESDataset(val_tokens, y_val)
test_dataset = SMILESDataset(test_tokens)

In [12]:
# Use DataCollatorWithPadding to handle dynamic padding
data_collator = DataCollatorWithPadding(tokenizer)

In [13]:
# Prepare training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=500,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)


In [14]:
# Fine-tune the model
trainer.train()

# Predict on the test set
predictions = trainer.predict(test_dataset)

# Save the predictions to a CSV file
output_df = pd.DataFrame({'ID': test_df['ID'], 'IC50_nM': predictions.predictions.squeeze()})
output_df.to_csv('./predictions_chembert_v1.csv', index=False) # 0.6297



Epoch,Training Loss,Validation Loss
1,No log,4674883.5
2,No log,4671197.5
3,No log,4669167.5
4,No log,4667687.5
5,No log,4666441.5
6,8020812.800000,4665314.0
7,8020812.800000,4664257.0
8,8020812.800000,4663246.0
9,8020812.800000,4662261.5
10,8020812.800000,4661299.0


