In [1]:
import pandas as pd
import numpy as np
import os
import random

from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

### 데이터 로드

In [None]:
# 1. Load the train and test datasets from the local `datasets/` directory
train = pd.read_csv('datasets/train.csv')
test = pd.read_csv('datasets/test.csv')

### EDA

In [None]:
# Structural overview of both datasets: shapes, column info, and missing values.
print("Train 데이터 크기:", train.shape)
print("Test 데이터 크기:", test.shape)
print("\nTrain 데이터 정보:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
train.info()
print("\nTest 데이터 정보:")
test.info()

# Per-column count of missing values, train first and then test
print("\n결측치 확인:")
print(train.isnull().sum())
print(test.isnull().sum())

In [None]:
# Collect the names of the numeric (int / float) columns of the training data
numeric_train = train.select_dtypes(include=['int', 'float'])
num_col = numeric_train.columns.tolist()
num_col

In [None]:
# Histogram grid: one panel per numeric training column.
n = len(num_col)
cols = 2
# True ceiling division, so the grid has enough rows for all n panels for any
# value of `cols` (the former `(n+1)//cols` was only correct for cols == 2).
# At least one row is kept so the figure never has zero height.
rows = max(1, -(-n // cols))

plt.figure(figsize=(cols*6, rows*4))

for i, col in enumerate(num_col):
    # subplot indices are 1-based
    plt.subplot(rows, cols, i+1)
    plt.hist(train[col], bins=50, color='skyblue', edgecolor='black')
    plt.title(f"histogram of {col}")
    plt.xlabel(col)
    plt.ylabel('count')

plt.tight_layout()
plt.show()

In [None]:
# Boxplot grid: one panel per numeric training column.
# Reuses the `rows` / `cols` grid layout computed for the histogram grid.
plt.figure(figsize=(cols*6, rows*4))

for i, col in enumerate(num_col):
    # subplot indices are 1-based
    plt.subplot(rows, cols, i+1)
    # Drop missing values first: NaNs make the quartile statistics undefined
    # and would leave the box empty.
    plt.boxplot(train[col].dropna())
    plt.title(f"boxplot of {col}")
    plt.xlabel(col)
    # The vertical axis of a boxplot shows the data values, not counts
    plt.ylabel('value')

plt.tight_layout()
plt.show()

In [None]:
# Scatter plot of the IC50_nM column against the pIC50 column of the training data
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(data=train, x='IC50_nM', y='pIC50', alpha=0.5, ax=ax)
ax.set_title('IC50_nM vs pIC50')
plt.show()

In [None]:
# Notebook-wide settings
CFG = dict(
    NBITS=2048,  # number of bits in each Morgan fingerprint vector
    SEED=42,     # seed passed to seed_everything() and to the random forest
)

def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.

    Parameters
    ----------
    seed : int
        Seed value applied to every source.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
seed_everything(CFG['SEED']) # fix the random seeds using the configured SEED

# Convert a SMILES string into a molecular fingerprint
def smiles_to_fingerprint(smiles):
    """Encode a SMILES string as a Morgan fingerprint bit array.

    Parameters
    ----------
    smiles : str
        SMILES representation of a molecule. Non-string values (for example
        a NaN coming from a missing cell) are treated like an unparseable
        SMILES.

    Returns
    -------
    numpy.ndarray
        1-D integer array of length ``CFG['NBITS']`` holding the bits of the
        radius-2 Morgan fingerprint. An all-zero array of the same length and
        dtype is returned when no molecule could be built from the input.
    """
    n_bits = CFG['NBITS']
    # Only strings are handed to RDKit; anything else falls through to the
    # same all-zero fallback that is used for unparseable SMILES.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        # Same dtype as the regular path, so np.stack() over a column that
        # contains fallbacks does not silently upcast every row to float.
        return np.zeros((n_bits,), dtype=int)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=n_bits)
    return np.array(fp, dtype=int)


In [None]:
# Build the training matrix: fingerprints as features, pIC50 as the target.
# `train` (loaded from datasets/train.csv) is the labelled frame defined in this
# notebook; `chembl_data` is never defined here and raised a NameError.
# NOTE(review): assumes train.csv has both 'Smiles' and 'pIC50' columns — confirm.
# .copy() makes this an independent frame so that adding a column below does
# not trigger pandas' SettingWithCopyWarning.
train = train[['Smiles', 'pIC50']].copy()
train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)

# Stack the per-row fingerprint arrays into one 2-D feature matrix
train_x = np.stack(train['Fingerprint'].values)
train_y = train['pIC50'].values

# Split into training and validation sets (30% held out), seeded from CFG so
# the whole notebook shares a single seed value
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.3, random_state=CFG['SEED'])

In [None]:
# Encode every test-set SMILES string as a fingerprint and store it on the frame
test_fingerprints = test['Smiles'].apply(smiles_to_fingerprint)
test['Fingerprint'] = test_fingerprints
test

In [None]:
# Train a random forest regressor on the training split.
# fit() returns the fitted estimator itself, so construction and fitting chain.
model = RandomForestRegressor(random_state=CFG['SEED']).fit(train_x, train_y)

def pIC50_to_IC50(pic50_values):
    """Convert pIC50 values to IC50 (nM).

    Computes ``10 ** (9 - pIC50)``, i.e. the inverse of
    ``pIC50 = 9 - log10(IC50_nM)``. Works element-wise on NumPy arrays and
    also accepts plain numbers.
    """
    exponent = 9 - pic50_values
    return 10 ** exponent

# Evaluate the trained model on the validation split
val_y_pred = model.predict(val_x)

# The error is measured on the IC50 (nM) scale, so both the targets and the
# predictions are converted back from pIC50 before scoring
val_ic50_true = pIC50_to_IC50(val_y)
val_ic50_pred = pIC50_to_IC50(val_y_pred)
mse = mean_squared_error(val_ic50_true, val_ic50_pred)
rmse = np.sqrt(mse)

print(f'RMSE: {rmse}')



# Predict pIC50 for the test set and write the submission file.
test_x = np.stack(test['Fingerprint'].values)

test_y_pred = model.predict(test_x)

submit = pd.read_csv('./sample_submission.csv')
# The submission column is IC50 in nM, so convert the predicted pIC50 values back
submit['IC50_nM'] = pIC50_to_IC50(test_y_pred)

submit.to_csv('./baseline_submit.csv', index=False)

# Preview of the submission: only the last expression of a cell is rendered,
# so this has to come after the file is written rather than mid-cell.
submit.head()