In [1]:
import pandas as pd
import numpy as np
import os
import random
import torch
import torch.nn as nn

from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
CFG = {
    'NBITS':2048,
    'SEED':42,
}

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(CFG['SEED']) # Seed 고정

In [4]:
# SMILES 데이터를 분자 지문으로 변환
def smiles_to_fingerprint(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=CFG['NBITS'])
        return np.array(fp)
    else:
        return np.zeros((CFG['NBITS'],))

In [5]:
# 학습 ChEMBL 데이터 로드
chembl_data = pd.read_csv('train.csv')  # 예시 파일 이름
print(f'Number of examples is: {len(chembl_data)}')
chembl_data.head()

Number of examples is: 1952


Unnamed: 0,Molecule ChEMBL ID,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Assay ChEMBL ID,Target ChEMBL ID,Target Name,Target Organism,Target Type,Document ChEMBL ID,IC50_nM,pIC50,Smiles
0,CHEMBL4443947,IC50,'=',0.022,nM,10.66,CHEMBL4361896,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4359855,0.022,10.66,CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@@H](NC(=O)CC...
1,CHEMBL4556091,IC50,'=',0.026,nM,10.59,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.026,10.59,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
2,CHEMBL4566431,IC50,'=',0.078,nM,10.11,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.078,10.11,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
3,CHEMBL4545898,IC50,'=',0.081,nM,10.09,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.081,10.09,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
4,CHEMBL4448950,IC50,'=',0.099,nM,10.0,CHEMBL4361896,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4359855,0.099,10.0,COc1cc2c(OC[C@@H]3CCC(=O)N3)ncc(C#CCCCCCCCCCCC...


In [6]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Load the pre-trained model and tokenizer
model_name = "DeepChem/ChemBERTa-77M-MLM"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Determine the maximum sequence length
max_length = tokenizer.model_max_length
print(max_length)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MLM and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


512




In [7]:
def tokenize(string):
    """
    Tokenize and encode a string using the provided tokenizer.
    
    Parameters:
        string (str): Input string to be tokenized.
    
    Returns:
        Tuple of input_ids and attention_mask.
    """
    encodings = tokenizer.encode_plus(
        string,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_attention_mask=True
    )
    input_ids = encodings["input_ids"]
    attention_mask = encodings["attention_mask"]
    return input_ids, attention_mask

# Tokenize the 'CANONICAL_SMILES' column and create new columns 'input_ids' and 'attention_mask'
tqdm.pandas()
chembl_data[["input_ids", "attention_mask"]] = chembl_data["Smiles"].progress_apply(lambda x: tokenize(x)).apply(pd.Series)

100%|██████████| 1952/1952 [00:00<00:00, 3311.42it/s]


In [8]:
# Split the dataset into train, validation, and test sets
train_df, val_df = train_test_split(chembl_data, test_size=0.2, random_state=21)
print(f"There are {len(train_df)} molecules in Train df.")
print(f"There are {len(val_df)} molecules in Val df.")

There are 1561 molecules in Train df.
There are 391 molecules in Val df.
There are 113 molecules in Test df.


In [9]:
# Function to convert data to PyTorch tensors
def get_tensor_data(data):
    """
    Convert data to PyTorch tensors.
    
    Parameters:
        data (DataFrame): Input data containing 'input_ids', 'attention_mask', and 'pIC50' columns.
    
    Returns:
        TensorDataset containing input_ids, attention_mask, and labels tensors.
    """
    input_ids_tensor = torch.tensor(data["input_ids"].tolist(), dtype=torch.int32)
    attention_mask_tensor = torch.tensor(data["attention_mask"].tolist(), dtype=torch.int32)
    labels_tensor = torch.tensor(data["pIC50"].tolist(), dtype=torch.float32)
    return TensorDataset(input_ids_tensor, attention_mask_tensor, labels_tensor)

# Create datasets and data loaders
train_dataset = get_tensor_data(train_df)
val_dataset = get_tensor_data(val_df)

batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [10]:
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler

# Loss criterion and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)  # Decrease LR by a factor of 0.5 every 10 epochs
device = torch.device("cuda")
model.to(device)

epochs = 80
torch.manual_seed(12345)

for epoch in tqdm(range(epochs)):
    # Training loop
    model.train()
    total_train_loss = 0
    for batch in train_loader:
        optimizer.zero_grad(set_to_none=True)
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        output_dict = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        predictions = output_dict.logits.squeeze(dim=1)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
    avg_train_loss = total_train_loss / len(train_loader)

    # Validation loop
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            output_dict = model(input_ids, attention_mask=attention_mask, labels=labels)
            predictions = output_dict.logits.squeeze(dim=1)
            loss = criterion(predictions, labels)
            total_val_loss += loss.item()
    avg_val_loss = total_val_loss / len(val_loader)
    
    print(f"Epoch {epoch + 1}: Train Loss {avg_train_loss:.4f}, Val Loss {avg_val_loss:.4f}")
    
    # Step the scheduler
    scheduler.step()


  1%|          | 1/100 [00:02<04:40,  2.83s/it]

Epoch 1: Train Loss 21.0704, Val Loss 1.9090


  2%|▏         | 2/100 [00:05<04:19,  2.64s/it]

Epoch 2: Train Loss 2.3840, Val Loss 1.6161


  3%|▎         | 3/100 [00:07<04:10,  2.58s/it]

Epoch 3: Train Loss 1.2791, Val Loss 1.2671


  4%|▍         | 4/100 [00:10<04:05,  2.55s/it]

Epoch 4: Train Loss 1.2556, Val Loss 1.2675


  5%|▌         | 5/100 [00:12<04:01,  2.54s/it]

Epoch 5: Train Loss 1.2044, Val Loss 1.2755


  6%|▌         | 6/100 [00:15<03:57,  2.53s/it]

Epoch 6: Train Loss 1.1656, Val Loss 1.0235


  7%|▋         | 7/100 [00:17<03:54,  2.52s/it]

Epoch 7: Train Loss 0.8625, Val Loss 0.7221


  8%|▊         | 8/100 [00:20<03:51,  2.52s/it]

Epoch 8: Train Loss 0.6854, Val Loss 0.6310


  9%|▉         | 9/100 [00:22<03:48,  2.51s/it]

Epoch 9: Train Loss 0.6123, Val Loss 0.6918


 10%|█         | 10/100 [00:25<03:46,  2.51s/it]

Epoch 10: Train Loss 0.6430, Val Loss 0.6616


 11%|█         | 11/100 [00:27<03:43,  2.51s/it]

Epoch 11: Train Loss 0.5784, Val Loss 0.5990


 12%|█▏        | 12/100 [00:30<03:40,  2.51s/it]

Epoch 12: Train Loss 0.5167, Val Loss 0.5571


 13%|█▎        | 13/100 [00:32<03:38,  2.51s/it]

Epoch 13: Train Loss 0.4513, Val Loss 0.5773


 14%|█▍        | 14/100 [00:35<03:39,  2.55s/it]

Epoch 14: Train Loss 0.4452, Val Loss 0.5173


 15%|█▌        | 15/100 [00:38<03:35,  2.54s/it]

Epoch 15: Train Loss 0.4260, Val Loss 0.6378


 16%|█▌        | 16/100 [00:40<03:32,  2.53s/it]

Epoch 16: Train Loss 0.4351, Val Loss 0.5396


 17%|█▋        | 17/100 [00:43<03:29,  2.52s/it]

Epoch 17: Train Loss 0.4147, Val Loss 0.5032


 18%|█▊        | 18/100 [00:45<03:26,  2.52s/it]

Epoch 18: Train Loss 0.4010, Val Loss 0.5012


 19%|█▉        | 19/100 [00:48<03:23,  2.51s/it]

Epoch 19: Train Loss 0.3748, Val Loss 0.5165


 20%|██        | 20/100 [00:50<03:20,  2.51s/it]

Epoch 20: Train Loss 0.3725, Val Loss 0.4608


 21%|██        | 21/100 [00:53<03:18,  2.51s/it]

Epoch 21: Train Loss 0.3566, Val Loss 0.4603


 22%|██▏       | 22/100 [00:55<03:15,  2.51s/it]

Epoch 22: Train Loss 0.3595, Val Loss 0.4992


 23%|██▎       | 23/100 [00:58<03:13,  2.51s/it]

Epoch 23: Train Loss 0.3194, Val Loss 0.4593


 24%|██▍       | 24/100 [01:00<03:10,  2.51s/it]

Epoch 24: Train Loss 0.3112, Val Loss 0.4604


 25%|██▌       | 25/100 [01:03<03:07,  2.51s/it]

Epoch 25: Train Loss 0.3170, Val Loss 0.4778


 26%|██▌       | 26/100 [01:05<03:05,  2.51s/it]

Epoch 26: Train Loss 0.3163, Val Loss 0.4794


 27%|██▋       | 27/100 [01:08<03:02,  2.51s/it]

Epoch 27: Train Loss 0.3115, Val Loss 0.4898


 28%|██▊       | 28/100 [01:10<03:00,  2.51s/it]

Epoch 28: Train Loss 0.3072, Val Loss 0.4830


 29%|██▉       | 29/100 [01:13<02:57,  2.51s/it]

Epoch 29: Train Loss 0.3137, Val Loss 0.4497


 30%|███       | 30/100 [01:15<02:55,  2.51s/it]

Epoch 30: Train Loss 0.3044, Val Loss 0.4757


 31%|███       | 31/100 [01:18<02:52,  2.51s/it]

Epoch 31: Train Loss 0.2919, Val Loss 0.4711


 32%|███▏      | 32/100 [01:20<02:50,  2.51s/it]

Epoch 32: Train Loss 0.3024, Val Loss 0.4373


 33%|███▎      | 33/100 [01:23<02:47,  2.51s/it]

Epoch 33: Train Loss 0.2941, Val Loss 0.4591


 34%|███▍      | 34/100 [01:25<02:45,  2.51s/it]

Epoch 34: Train Loss 0.2865, Val Loss 0.5071


 35%|███▌      | 35/100 [01:28<02:42,  2.51s/it]

Epoch 35: Train Loss 0.2911, Val Loss 0.4512


 36%|███▌      | 36/100 [01:30<02:40,  2.51s/it]

Epoch 36: Train Loss 0.2843, Val Loss 0.4535


 37%|███▋      | 37/100 [01:33<02:37,  2.51s/it]

Epoch 37: Train Loss 0.2691, Val Loss 0.5011


 38%|███▊      | 38/100 [01:35<02:35,  2.51s/it]

Epoch 38: Train Loss 0.2728, Val Loss 0.4996


 39%|███▉      | 39/100 [01:38<02:32,  2.51s/it]

Epoch 39: Train Loss 0.2881, Val Loss 0.4497


 40%|████      | 40/100 [01:40<02:30,  2.50s/it]

Epoch 40: Train Loss 0.2901, Val Loss 0.4728


 41%|████      | 41/100 [01:43<02:27,  2.51s/it]

Epoch 41: Train Loss 0.2702, Val Loss 0.4706


 42%|████▏     | 42/100 [01:45<02:25,  2.51s/it]

Epoch 42: Train Loss 0.2890, Val Loss 0.4624


 43%|████▎     | 43/100 [01:48<02:22,  2.51s/it]

Epoch 43: Train Loss 0.2790, Val Loss 0.4658


 44%|████▍     | 44/100 [01:50<02:20,  2.51s/it]

Epoch 44: Train Loss 0.2729, Val Loss 0.4561


 45%|████▌     | 45/100 [01:53<02:17,  2.51s/it]

Epoch 45: Train Loss 0.2667, Val Loss 0.4614


 46%|████▌     | 46/100 [01:55<02:15,  2.51s/it]

Epoch 46: Train Loss 0.2560, Val Loss 0.4833


 47%|████▋     | 47/100 [01:58<02:12,  2.51s/it]

Epoch 47: Train Loss 0.2589, Val Loss 0.4731


 48%|████▊     | 48/100 [02:00<02:10,  2.51s/it]

Epoch 48: Train Loss 0.2508, Val Loss 0.4723


 49%|████▉     | 49/100 [02:03<02:07,  2.50s/it]

Epoch 49: Train Loss 0.2649, Val Loss 0.4665


 50%|█████     | 50/100 [02:05<02:05,  2.50s/it]

Epoch 50: Train Loss 0.2529, Val Loss 0.4829


 51%|█████     | 51/100 [02:08<02:02,  2.50s/it]

Epoch 51: Train Loss 0.2749, Val Loss 0.4588


 52%|█████▏    | 52/100 [02:10<02:00,  2.50s/it]

Epoch 52: Train Loss 0.2569, Val Loss 0.4728


 53%|█████▎    | 53/100 [02:13<01:57,  2.50s/it]

Epoch 53: Train Loss 0.2664, Val Loss 0.4774


 54%|█████▍    | 54/100 [02:15<01:55,  2.50s/it]

Epoch 54: Train Loss 0.2608, Val Loss 0.4708


 55%|█████▌    | 55/100 [02:18<01:52,  2.50s/it]

Epoch 55: Train Loss 0.2634, Val Loss 0.4665


 56%|█████▌    | 56/100 [02:20<01:50,  2.51s/it]

Epoch 56: Train Loss 0.2794, Val Loss 0.4601


 57%|█████▋    | 57/100 [02:23<01:47,  2.51s/it]

Epoch 57: Train Loss 0.2745, Val Loss 0.4673


 58%|█████▊    | 58/100 [02:25<01:45,  2.51s/it]

Epoch 58: Train Loss 0.2700, Val Loss 0.4730


 59%|█████▉    | 59/100 [02:28<01:42,  2.51s/it]

Epoch 59: Train Loss 0.2591, Val Loss 0.4726


 60%|██████    | 60/100 [02:30<01:40,  2.51s/it]

Epoch 60: Train Loss 0.2634, Val Loss 0.4644


 61%|██████    | 61/100 [02:33<01:37,  2.51s/it]

Epoch 61: Train Loss 0.2636, Val Loss 0.4728


 62%|██████▏   | 62/100 [02:35<01:35,  2.51s/it]

Epoch 62: Train Loss 0.2578, Val Loss 0.4822


 63%|██████▎   | 63/100 [02:38<01:32,  2.51s/it]

Epoch 63: Train Loss 0.2512, Val Loss 0.4770


 64%|██████▍   | 64/100 [02:40<01:30,  2.51s/it]

Epoch 64: Train Loss 0.2761, Val Loss 0.4826


 65%|██████▌   | 65/100 [02:43<01:27,  2.50s/it]

Epoch 65: Train Loss 0.2518, Val Loss 0.4782


 66%|██████▌   | 66/100 [02:45<01:25,  2.50s/it]

Epoch 66: Train Loss 0.2643, Val Loss 0.4673


 67%|██████▋   | 67/100 [02:48<01:22,  2.50s/it]

Epoch 67: Train Loss 0.2565, Val Loss 0.4676


 68%|██████▊   | 68/100 [02:50<01:20,  2.51s/it]

Epoch 68: Train Loss 0.2521, Val Loss 0.4661


 69%|██████▉   | 69/100 [02:53<01:17,  2.50s/it]

Epoch 69: Train Loss 0.2761, Val Loss 0.4694


 70%|███████   | 70/100 [02:55<01:15,  2.50s/it]

Epoch 70: Train Loss 0.2595, Val Loss 0.4808


 71%|███████   | 71/100 [02:58<01:12,  2.51s/it]

Epoch 71: Train Loss 0.2502, Val Loss 0.4784


 72%|███████▏  | 72/100 [03:00<01:10,  2.50s/it]

Epoch 72: Train Loss 0.2628, Val Loss 0.4747


 73%|███████▎  | 73/100 [03:03<01:07,  2.50s/it]

Epoch 73: Train Loss 0.2731, Val Loss 0.4788


 74%|███████▍  | 74/100 [03:05<01:05,  2.50s/it]

Epoch 74: Train Loss 0.2404, Val Loss 0.4855


 75%|███████▌  | 75/100 [03:08<01:02,  2.50s/it]

Epoch 75: Train Loss 0.2490, Val Loss 0.4829


 76%|███████▌  | 76/100 [03:10<01:00,  2.50s/it]

Epoch 76: Train Loss 0.2691, Val Loss 0.4796


 77%|███████▋  | 77/100 [03:13<00:57,  2.50s/it]

Epoch 77: Train Loss 0.2663, Val Loss 0.4804


 78%|███████▊  | 78/100 [03:15<00:55,  2.50s/it]

Epoch 78: Train Loss 0.2369, Val Loss 0.4730


 79%|███████▉  | 79/100 [03:18<00:52,  2.51s/it]

Epoch 79: Train Loss 0.2626, Val Loss 0.4703


 80%|████████  | 80/100 [03:20<00:50,  2.51s/it]

Epoch 80: Train Loss 0.2497, Val Loss 0.4715


 81%|████████  | 81/100 [03:23<00:47,  2.50s/it]

Epoch 81: Train Loss 0.2591, Val Loss 0.4730


 82%|████████▏ | 82/100 [03:25<00:45,  2.50s/it]

Epoch 82: Train Loss 0.2427, Val Loss 0.4727


 83%|████████▎ | 83/100 [03:28<00:42,  2.50s/it]

Epoch 83: Train Loss 0.2533, Val Loss 0.4730


 84%|████████▍ | 84/100 [03:30<00:40,  2.50s/it]

Epoch 84: Train Loss 0.2447, Val Loss 0.4740


 85%|████████▌ | 85/100 [03:33<00:37,  2.50s/it]

Epoch 85: Train Loss 0.2539, Val Loss 0.4723


 86%|████████▌ | 86/100 [03:35<00:35,  2.50s/it]

Epoch 86: Train Loss 0.2551, Val Loss 0.4720


 87%|████████▋ | 87/100 [03:38<00:32,  2.50s/it]

Epoch 87: Train Loss 0.2687, Val Loss 0.4706


 88%|████████▊ | 88/100 [03:40<00:30,  2.50s/it]

Epoch 88: Train Loss 0.2553, Val Loss 0.4706


 89%|████████▉ | 89/100 [03:43<00:27,  2.50s/it]

Epoch 89: Train Loss 0.2707, Val Loss 0.4720


 90%|█████████ | 90/100 [03:45<00:25,  2.50s/it]

Epoch 90: Train Loss 0.2534, Val Loss 0.4741


 91%|█████████ | 91/100 [03:48<00:22,  2.50s/it]

Epoch 91: Train Loss 0.2510, Val Loss 0.4767


 92%|█████████▏| 92/100 [03:50<00:20,  2.50s/it]

Epoch 92: Train Loss 0.2587, Val Loss 0.4796


 93%|█████████▎| 93/100 [03:53<00:17,  2.50s/it]

Epoch 93: Train Loss 0.2476, Val Loss 0.4798


 94%|█████████▍| 94/100 [03:55<00:15,  2.50s/it]

Epoch 94: Train Loss 0.2422, Val Loss 0.4795


 95%|█████████▌| 95/100 [03:58<00:12,  2.50s/it]

Epoch 95: Train Loss 0.2435, Val Loss 0.4787


 96%|█████████▌| 96/100 [04:00<00:10,  2.50s/it]

Epoch 96: Train Loss 0.2506, Val Loss 0.4769


 97%|█████████▋| 97/100 [04:03<00:07,  2.50s/it]

Epoch 97: Train Loss 0.2547, Val Loss 0.4754


 98%|█████████▊| 98/100 [04:05<00:05,  2.50s/it]

Epoch 98: Train Loss 0.2657, Val Loss 0.4751


 99%|█████████▉| 99/100 [04:08<00:02,  2.50s/it]

Epoch 99: Train Loss 0.2597, Val Loss 0.4753


100%|██████████| 100/100 [04:10<00:00,  2.51s/it]

Epoch 100: Train Loss 0.2551, Val Loss 0.4750





In [16]:
def pIC50_to_IC50(pic50_values):
    """Convert pIC50 values to IC50 (nM)."""
    return 10 ** (9 - pic50_values)

In [17]:
# Load the test data
test_df = pd.read_csv('test.csv')

# Tokenize the 'Smiles' column in the test dataset
tqdm.pandas()
test_df[["input_ids", "attention_mask"]] = test_df["Smiles"].progress_apply(lambda x: tokenize(x)).apply(pd.Series)

100%|██████████| 113/113 [00:00<00:00, 1627.76it/s]


In [18]:
# Function to convert data to PyTorch tensors for the test set
def get_test_tensor_data(data):
    input_ids_tensor = torch.tensor(data["input_ids"].tolist(), dtype=torch.int32)
    attention_mask_tensor = torch.tensor(data["attention_mask"].tolist(), dtype=torch.int32)
    return TensorDataset(input_ids_tensor, attention_mask_tensor)

# Create test dataset and DataLoader
test_dataset = get_test_tensor_data(test_df)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [19]:
# Testing loop
model.eval()  # Set the model to evaluation mode
test_predictions = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        output_dict = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = output_dict.logits.squeeze(dim=1)
        test_predictions.extend(predictions.tolist())

test_ic50_predictions = pIC50_to_IC50(np.array(test_predictions))

In [20]:
# Save the predictions to a submission file
test_df["IC50_nM"] = test_ic50_predictions
submission_df = test_df[["ID", "IC50_nM"]]
submission_df.to_csv("submission.csv", index=False)