In [1]:
import random
import os

import numpy as np
import pandas as pd

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import missingno

# device 설정
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print (device)

seed = 42 # seed 값 설정
random.seed(seed) # 파이썬 난수 생성기
os.environ['PYTHONHASHSEED'] = str(seed) # 해시 시크릿값 고정
np.random.seed(seed) # 넘파이 난수 생성기

torch.manual_seed(seed) # 파이토치 CPU 난수 생성기
torch.backends.cudnn.deterministic = True # 확정적 연산 사용 설정
torch.backends.cudnn.benchmark = False   # 벤치마크 기능 사용 해제
torch.backends.cudnn.enabled = False        # cudnn 기능 사용 해제

if device == 'cuda':
    torch.cuda.manual_seed(seed) # 파이토치 GPU 난수 생성기
    torch.cuda.manual_seed_all(seed) # 파이토치 멀티 GPU 난수 생성기

cpu


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AIDrug_Competition/data/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AIDrug_Competition/data/test.csv')

In [4]:
!pip install rdkit-pypi

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


In [5]:
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors

def calculate_metabolic_stability_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    logP = Descriptors.MolLogP(mol)
    # 화합물의 친유성을 측정한 것으로 지질 또는 비극성 환경에서의 용해도를 나타냅니다. 생물학적 막을 통과하는 화합물의 능력을 반영합니다.
    apka = Descriptors.MolWt(mol)
    # 화합물의 산 해리 상수의 추정치로 다양한 pH 조건에서 이온화 거동에 대한 정보를 제공합니다.
    num_rotatable_bonds = Descriptors.NumRotatableBonds(mol)
    # 화합물에서 회전 가능한 결합의 수입니다. 이것은 화합물의 유연성과 효소 또는 다른 분자와의 잠재적인 상호 작용에 대한 통찰력을 제공할 수 있습니다.
    num_heteroatoms = Descriptors.NumHeteroatoms(mol)
    # 분자 내 헤테로원자(탄소 및 수소 이외의 원자) 수. 이는 화합물의 반응성과 대사 안정성에 영향을 줄 수 있습니다.
    num_hydrogen_acceptors = Descriptors.NumHAcceptors(mol)
    # 분자 내 수소 결합 수용체의 수. 이들은 결합 및 반응성에 영향을 미치는 다른 분자의 수소 결합 기증자와 상호 작용할 수 있는 사이트입니다.
    num_hydrogen_donors = Descriptors.NumHDonors(mol)
    # 분자 내 수소 결합 기증자의 수입니다. 이들은 수소 결합 상호작용에서 수소 원자를 제공할 수 있는 사이트입니다.
    # morgan_fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    morgan_fingerprint = AllChem.GetHashedMorganFingerprint(mol, 6, nBits=4096)
    # 분자 하위 구조의 이진 벡터 표현입니다. 이 열에는 화합물과 효소의 상호 작용 및 대사 안정성에 영향을 줄 수 있는 구조적 특징을 포착하는 이진 지문이 포함되어 있습니다.
    morgan_array = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(morgan_fingerprint, morgan_array)

    return logP, apka, num_rotatable_bonds, num_heteroatoms, num_hydrogen_acceptors, num_hydrogen_donors, morgan_array

train[[
    'logP', 'apka', 'num_rotatable_bonds', 'num_heteroatoms',
    'num_hydrogen_acceptors', 'num_hydrogen_donors', 'morgan_fingerprint'
]] = train['SMILES'].apply(calculate_metabolic_stability_descriptors).apply(pd.Series)

test[[
    'logP', 'apka', 'num_rotatable_bonds', 'num_heteroatoms',
    'num_hydrogen_acceptors', 'num_hydrogen_donors', 'morgan_fingerprint'
]] = test['SMILES'].apply(calculate_metabolic_stability_descriptors).apply(pd.Series)

train


Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,logP,apka,num_rotatable_bonds,num_heteroatoms,num_hydrogen_acceptors,num_hydrogen_donors,morgan_fingerprint
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.010,50.680,3.259,400.495,5,2,8,3.259,117.37,3.87744,400.504,8,8,6,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.270,50.590,2.169,301.407,2,1,2,2.172,73.47,3.35474,301.415,2,5,4,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45,1.20450,297.366,3,7,7,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.710,2.000,4.771,494.652,6,0,5,3.475,92.60,3.89356,494.665,5,9,7,0,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.270,99.990,2.335,268.310,3,0,1,2.337,42.43,2.81772,268.316,1,4,3,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3493,TRAIN_3493,Cn1nc(CNC(=O)Cn2nc(C(F)(F)F)c3c2CCC3)c(Cl)c1Cl,1.556,3.079,3.409,396.195,3,1,5,3.409,64.74,2.74730,396.200,4,11,5,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3494,TRAIN_3494,CCn1[nH]cc/c1=N\C(=O)c1nn(-c2ccccc2)c(=O)c2ccc...,35.560,47.630,1.912,359.381,4,1,3,1.844,77.37,2.27630,359.389,3,7,5,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3495,TRAIN_3495,CCOC(=O)CCCc1nc2cc(N)ccc2n1C,56.150,1.790,1.941,261.320,3,1,6,2.124,70.14,2.04130,261.325,5,5,5,1,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3496,TRAIN_3496,Nc1cc(C(=O)OCCC2CCOC2=O)cnc1Cl,0.030,2.770,0.989,284.696,5,1,5,0.989,91.51,1.42720,284.699,4,7,6,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


In [6]:
train['AlogP'].fillna(train['AlogP'].median(), inplace=True)
test['AlogP'].fillna(test['AlogP'].median(), inplace=True)

In [7]:
class NewFeatureDataset(Dataset):
    def __init__(self, data, target_col=None, transform=None, is_test=False):
        self.is_test = is_test
        self.transform = transform
        self.is_test = is_test

        if not self.is_test:
            self.data = data.drop(['id', 'SMILES', 'morgan_fingerprint','MLM', 'HLM'], axis=1)
        else: # test
            self.data = data.drop(['id', 'SMILES', 'morgan_fingerprint'], axis=1)


        if self.transform is not None and not self.is_test:  # 훈련 데이터에만 fit_transform 적용
            self.data = self.transform.fit_transform(self.data)
        elif self.transform is not None and self.is_test:  # 테스트 데이터에는 transform만 적용
            self.data = self.transform.transform(self.data)

        if target_col is not None and not self.is_test:
            self.target = data[target_col]

    def __getitem__(self, index):
        features = self.data[index]

        if hasattr(self, 'target'):
            target = self.target[index]
            return torch.tensor(features).to(device).float(), torch.tensor(target).to(device).float().unsqueeze(dim=-1)
        else:
            return torch.tensor(features).to(device).float()

    def __len__(self):
        return len(self.data)


In [8]:
transform = StandardScaler()
transform.fit(train.drop(['id','SMILES','morgan_fingerprint', 'MLM', 'HLM'], axis=1))

train_MLM = NewFeatureDataset(train, target_col='MLM', transform=transform, is_test=False)
train_HLM = NewFeatureDataset(train, target_col='HLM', transform=transform, is_test=False)

input_size = train_MLM.data.shape[1]
input_size

13

In [9]:
train_HLM.data.shape

(3498, 13)

In [10]:
# train,valid split
train_MLM_dataset, valid_MLM_dataset = train_test_split(train_MLM, test_size=0.2, random_state=42)
train_HLM_dataset, valid_HLM_dataset = train_test_split(train_HLM, test_size=0.2, random_state=42)

In [11]:
torch.tensor(train_MLM.data[1]).shape, torch.tensor(train['MLM'][1]).float().unsqueeze(dim=-1).shape

(torch.Size([13]), torch.Size([1]))

In [12]:
# Hyperparameter
CFG = {'BATCH_SIZE': 256,
       'EPOCHS': 5000,
       'INPUT_SIZE': input_size,
       'HIDDEN_SIZE': 256,
       'OUTPUT_SIZE': 1,
       'DROPOUT_RATE': 0.5,
       'LEARNING_RATE': 0.0001}

In [13]:
train_MLM_loader = DataLoader(dataset=train_MLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=True)

valid_MLM_loader = DataLoader(dataset=valid_MLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=False)


train_HLM_loader = DataLoader(dataset=train_HLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=True)

valid_HLM_loader = DataLoader(dataset=valid_HLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=False)

In [14]:
X_train, y_train = next(iter(train_MLM_loader))
print (X_train.shape, y_train.shape)

torch.Size([256, 13]) torch.Size([256, 1])


In [15]:
class NewFeatureModel(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_rate, out_size):
        super(NewFeatureModel, self).__init__()

        self.fc_layers = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.LeakyReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(),
            nn.Dropout(dropout_rate),
        )

        self.fc_out = nn.Linear(64, out_size)

    def forward(self, x):
        out = self.fc_layers(x)
        out = self.fc_out(out)
        return out


In [16]:
featuresModel_MLM = NewFeatureModel(CFG['INPUT_SIZE'],CFG['HIDDEN_SIZE'],CFG['DROPOUT_RATE'],CFG['OUTPUT_SIZE'])
featuresModel_HLM = NewFeatureModel(CFG['INPUT_SIZE'],CFG['HIDDEN_SIZE'],CFG['DROPOUT_RATE'],CFG['OUTPUT_SIZE'])

In [17]:
import torch.nn as nn

class RMSELoss(nn.Module):
    def __init__(self):
        super(RMSELoss, self).__init__()
        self.mse = nn.MSELoss()  # 기존의 MSELoss 함수 사용

    def forward(self, output, target):
        mse_loss = self.mse(output, target)  # 기존의 MSELoss를 계산
        rmse_loss = torch.sqrt(mse_loss)  # MSE에 제곱근 씌워 RMSE 계산
        return rmse_loss

criterion = RMSELoss()
optimizer_MLM = torch.optim.Adam(featuresModel_MLM.parameters(), lr=CFG['LEARNING_RATE'])
optimizer_HLM = torch.optim.Adam(featuresModel_HLM.parameters(), lr=CFG['LEARNING_RATE'])

In [18]:
def train(train_loader, valid_loader, model, criterion, optimizer, epochs, patience=100):
    best_valid_loss = float('inf')
    no_improvement_count = 0

    for epoch in range(epochs):
        model.train()  # 모델을 훈련 모드로 설정
        running_loss = 0
        for inputs, targets in train_loader:
            optimizer.zero_grad()

            output = model(inputs)
            loss = criterion(output, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        model.eval()  # 모델을 검증 모드로 설정
        valid_loss = 0
        with torch.no_grad():
          for inputs, targets in valid_loader:
            output = model(inputs)
            loss = criterion(output, targets)
            valid_loss += loss.item()

        avg_train_loss = running_loss / len(train_loader)
        avg_valid_loss = valid_loss / len(valid_loader)
        print(f'Epoch: {epoch}/{epochs}, Train Loss: {avg_train_loss}, Valid Loss: {avg_valid_loss}')

        if avg_valid_loss < best_valid_loss:
          best_valid_loss = avg_valid_loss
          no_improvement_count = 0
          best_model_state = model.state_dict()
        else:
          no_improvement_count += 1
          if no_improvement_count >= patience:
            print(f'얼리 스토핑: {patience} 에포크 동안 검증 손실이 향상되지 않음. 에포크 {epoch}에서 훈련 중단.')
            break

    # 최적의 모델 상태 불러오기
    model.load_state_dict(best_model_state)
    return model

In [19]:
print("Training Start: MLM")
featuresModel_MLM = train(train_MLM_loader, valid_MLM_loader, featuresModel_MLM, criterion, optimizer_MLM, epochs=CFG['EPOCHS'])

print("Training Start: HLM")
featuresModel_HLM = train(train_HLM_loader, valid_HLM_loader, featuresModel_HLM, criterion, optimizer_HLM, epochs=CFG['EPOCHS'])

Training Start: MLM
Epoch: 0/5000, Train Loss: 51.27079876986417, Valid Loss: 52.43033981323242
Epoch: 1/5000, Train Loss: 51.27351205999201, Valid Loss: 52.38189951578776
Epoch: 2/5000, Train Loss: 51.25491298328746, Valid Loss: 52.326367696126304
Epoch: 3/5000, Train Loss: 51.15074400468306, Valid Loss: 52.27757263183594
Epoch: 4/5000, Train Loss: 51.121328527277164, Valid Loss: 52.23943201700846
Epoch: 5/5000, Train Loss: 51.1218920621005, Valid Loss: 52.20205307006836
Epoch: 6/5000, Train Loss: 51.04326421564276, Valid Loss: 52.16639200846354
Epoch: 7/5000, Train Loss: 51.00367494062944, Valid Loss: 52.13565190633138
Epoch: 8/5000, Train Loss: 50.95009994506836, Valid Loss: 52.104974110921226
Epoch: 9/5000, Train Loss: 50.92027213356712, Valid Loss: 52.06761678059896
Epoch: 10/5000, Train Loss: 50.84490897438743, Valid Loss: 52.02779006958008
Epoch: 11/5000, Train Loss: 50.78615292635831, Valid Loss: 51.98272196451823
Epoch: 12/5000, Train Loss: 50.77085460316051, Valid Loss: 51.94

In [20]:
torch.save(featuresModel_MLM.state_dict(), '/content/drive/MyDrive/Colab Notebooks/AIDrug_Competition/models/OnlyNewFeature_DNN_Model_MLM_2.pth')  # 모델 객체의 state_dict 저장
torch.save(featuresModel_HLM.state_dict(), '/content/drive/MyDrive/Colab Notebooks/AIDrug_Competition/models/OnlyNewFeature_DNN_Model_HLM_2.pth')

In [21]:
featuresModel_MLM.load_state_dict(torch.load('/content/drive/MyDrive/Colab Notebooks/AIDrug_Competition/models/OnlyNewFeature_DNN_Model_MLM_2.pth'))
featuresModel_HLM.load_state_dict(torch.load('/content/drive/MyDrive/Colab Notebooks/AIDrug_Competition/models/OnlyNewFeature_DNN_Model_HLM_2.pth'))

<All keys matched successfully>

In [22]:
test_MLM = NewFeatureDataset(test, target_col=None, transform=transform, is_test=True)
test_HLM = NewFeatureDataset(test, target_col=None, transform=transform, is_test=True)

test_MLM_loader = DataLoader(dataset=test_MLM,
                             batch_size=CFG['BATCH_SIZE'],
                             shuffle=False)

test_HLM_loader = DataLoader(dataset=test_HLM,
                             batch_size=CFG['BATCH_SIZE'],
                             shuffle=False)

In [23]:
def inference(test_loader, model):
    model.eval()
    preds = []

    with torch.no_grad():
        for inputs in test_loader:
            output = model(inputs)
            preds.extend(output.cpu().numpy().flatten().tolist())

    return preds

In [24]:
newFeature_predictions_MLM = inference(test_MLM_loader, featuresModel_MLM)
newFeature_predictions_HLM = inference(test_HLM_loader, featuresModel_HLM)
# newFeature 예측값들

In [25]:
### 여기서부턴 Morgan_Model

In [26]:
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AIDrug_Competition/data/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AIDrug_Competition/data/test.csv')

In [27]:
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors

def calculate_metabolic_stability_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    logP = Descriptors.MolLogP(mol)
    num_rotatable_bonds = Descriptors.NumRotatableBonds(mol)
    num_heteroatoms = Descriptors.NumHeteroatoms(mol)
    num_hydrogen_acceptors = Descriptors.NumHAcceptors(mol)
    num_hydrogen_donors = Descriptors.NumHDonors(mol)
    morgan_fingerprint = AllChem.GetHashedMorganFingerprint(mol, 6, nBits=4096)
    morgan_array = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(morgan_fingerprint, morgan_array)
    aromatic_rings = mol.GetRingInfo().NumRings()
    tpsa = Descriptors.TPSA(mol)

    return logP, num_rotatable_bonds, num_heteroatoms, num_hydrogen_acceptors, num_hydrogen_donors, morgan_array, aromatic_rings, tpsa

train[[
    'logP', 'num_rotatable_bonds', 'num_heteroatoms',
    'num_hydrogen_acceptors', 'num_hydrogen_donors', 'morgan_fingerprint', 'aromatic_rings',
    'tpsa'
]] = train['SMILES'].apply(calculate_metabolic_stability_descriptors).apply(pd.Series)

test[[
    'logP', 'num_rotatable_bonds', 'num_heteroatoms',
    'num_hydrogen_acceptors', 'num_hydrogen_donors', 'morgan_fingerprint', 'aromatic_rings',
    'tpsa'
]] = test['SMILES'].apply(calculate_metabolic_stability_descriptors).apply(pd.Series)

train


Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,logP,num_rotatable_bonds,num_heteroatoms,num_hydrogen_acceptors,num_hydrogen_donors,morgan_fingerprint,aromatic_rings,tpsa
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.010,50.680,3.259,400.495,5,2,8,3.259,117.37,3.87744,8,8,6,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,89.13
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.270,50.590,2.169,301.407,2,1,2,2.172,73.47,3.35474,2,5,4,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,45.23
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45,1.20450,3,7,7,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,62.45
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.710,2.000,4.771,494.652,6,0,5,3.475,92.60,3.89356,5,9,7,0,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5,84.22
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.270,99.990,2.335,268.310,3,0,1,2.337,42.43,2.81772,1,4,3,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,42.43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3493,TRAIN_3493,Cn1nc(CNC(=O)Cn2nc(C(F)(F)F)c3c2CCC3)c(Cl)c1Cl,1.556,3.079,3.409,396.195,3,1,5,3.409,64.74,2.74730,4,11,5,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,64.74
3494,TRAIN_3494,CCn1[nH]cc/c1=N\C(=O)c1nn(-c2ccccc2)c(=O)c2ccc...,35.560,47.630,1.912,359.381,4,1,3,1.844,77.37,2.27630,3,7,5,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4,85.04
3495,TRAIN_3495,CCOC(=O)CCCc1nc2cc(N)ccc2n1C,56.150,1.790,1.941,261.320,3,1,6,2.124,70.14,2.04130,5,5,5,1,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,70.14
3496,TRAIN_3496,Nc1cc(C(=O)OCCC2CCOC2=O)cnc1Cl,0.030,2.770,0.989,284.696,5,1,5,0.989,91.51,1.42720,4,7,6,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",2,91.51


In [28]:
train['AlogP'].fillna(train['AlogP'].median(), inplace=True)
test['AlogP'].fillna(test['AlogP'].median(), inplace=True)

In [29]:
class MorganDataset(Dataset):
    def __init__(self, data, target_col=None, transform=None, is_test=False):
        self.is_test = is_test
        self.transform = transform
        self.is_test = is_test

        if not self.is_test:
            self.data = self.transform.fit_transform(np.stack(data['morgan_fingerprint']))
        else: # test
            self.data = self.transform.transform(np.stack(data['morgan_fingerprint']))

        if target_col is not None and not self.is_test:
            self.target = data[target_col]

    def __getitem__(self, index):
        features = self.data[index]

        if hasattr(self, 'target'):
            target = self.target[index]
            return torch.tensor(features).to(device).float(), torch.tensor(target).to(device).float().unsqueeze(dim=-1)
        else:
            return torch.tensor(features).to(device).float()

    def __len__(self):
        return len(self.data)


In [30]:
transform = VarianceThreshold(threshold=0.05)

train_MLM = MorganDataset(train, target_col='MLM', transform=transform, is_test=False)
train_HLM = MorganDataset(train, target_col='HLM', transform=transform, is_test=False)

input_size = train_MLM.data.shape[1]
input_size

251

In [31]:
train_HLM.data.shape

(3498, 251)

In [32]:
# train,valid split
train_MLM_dataset, valid_MLM_dataset = train_test_split(train_MLM, test_size=0.2, random_state=42)
train_HLM_dataset, valid_HLM_dataset = train_test_split(train_HLM, test_size=0.2, random_state=42)

In [33]:
torch.tensor(train_MLM.data[1]).shape, torch.tensor(train['MLM'][1]).float().unsqueeze(dim=-1).shape

(torch.Size([251]), torch.Size([1]))

In [34]:
# Hyperparameter
CFG = {'BATCH_SIZE': 256, # 200 과적합일 시에 낮추기
       'EPOCHS': 10000,
       'INPUT_SIZE': input_size,
       'HIDDEN_SIZE': 1024,
       'OUTPUT_SIZE': 1,
       'DROPOUT_RATE': 0.8, # 0.8 과적합일 시에 높이기
       'LEARNING_RATE': 1e-1}

In [35]:
train_MLM_loader = DataLoader(dataset=train_MLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=True)

valid_MLM_loader = DataLoader(dataset=valid_MLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=False)


train_HLM_loader = DataLoader(dataset=train_HLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=True)

valid_HLM_loader = DataLoader(dataset=valid_HLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=False)

In [36]:
X_train, y_train = next(iter(train_MLM_loader))
print (X_train.shape, y_train.shape)

torch.Size([256, 251]) torch.Size([256, 1])


In [37]:
class MorganModel(nn.Module): # 이게 제출한 전 코드임!!!!!
    def __init__(self, input_size, hidden_size, dropout_rate, out_size):
        super(MorganModel, self).__init__()

        self.fc_layers = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.LeakyReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.LeakyReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.LeakyReLU(),
            nn.Dropout(dropout_rate),
        )

        self.fc_out = nn.Linear(hidden_size, out_size)

    def forward(self, x):
        out = self.fc_layers(x)
        out = self.fc_out(out)
        return out

In [38]:
morganModel_MLM = MorganModel(CFG['INPUT_SIZE'],CFG['HIDDEN_SIZE'],CFG['DROPOUT_RATE'],CFG['OUTPUT_SIZE'])
morganModel_HLM = MorganModel(CFG['INPUT_SIZE'],CFG['HIDDEN_SIZE'],CFG['DROPOUT_RATE'],CFG['OUTPUT_SIZE'])

In [39]:
import torch.nn as nn

class RMSELoss(nn.Module):
    def __init__(self):
        super(RMSELoss, self).__init__()
        self.mse = nn.MSELoss()  # 기존의 MSELoss 함수 사용

    def forward(self, output, target):
        mse_loss = self.mse(output, target)  # 기존의 MSELoss를 계산
        rmse_loss = torch.sqrt(mse_loss)  # MSE에 제곱근 씌워 RMSE 계산
        return rmse_loss

criterion = RMSELoss()
optimizer_MLM = torch.optim.Adam(morganModel_MLM.parameters(), lr=CFG['LEARNING_RATE'])
optimizer_HLM = torch.optim.Adam(morganModel_HLM.parameters(), lr=CFG['LEARNING_RATE'])


In [40]:
def train(train_loader, valid_loader, model, criterion, optimizer, epochs, patience=100):
    best_valid_loss = float('inf')
    no_improvement_count = 0

    for epoch in range(epochs):
        model.train()  # 모델을 훈련 모드로 설정
        running_loss = 0
        for inputs, targets in train_loader:
            optimizer.zero_grad()

            output = model(inputs)
            loss = criterion(output, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        model.eval()  # 모델을 검증 모드로 설정
        valid_loss = 0
        with torch.no_grad():
          for inputs, targets in valid_loader:
            output = model(inputs)
            loss = criterion(output, targets)
            valid_loss += loss.item()

        avg_train_loss = running_loss / len(train_loader)
        avg_valid_loss = valid_loss / len(valid_loader)
        print(f'Epoch: {epoch}/{epochs}, Train Loss: {avg_train_loss}, Valid Loss: {avg_valid_loss}')

        if avg_valid_loss < best_valid_loss:
          best_valid_loss = avg_valid_loss
          no_improvement_count = 0
          best_model_state = model.state_dict()
        else:
          no_improvement_count += 1
          if no_improvement_count >= patience:
            print(f'얼리 스토핑: {patience} 에포크 동안 검증 손실이 향상되지 않음. 에포크 {epoch}에서 훈련 중단.')
            break

    # 최적의 모델 상태 불러오기
    model.load_state_dict(best_model_state)
    return model

In [41]:
print("Training Start: MLM")
morganModel_MLM = train(train_MLM_loader, valid_MLM_loader, morganModel_MLM, criterion, optimizer_MLM, epochs=CFG['EPOCHS'])

print("Training Start: HLM")
morganModel_HLM = train(train_HLM_loader, valid_HLM_loader, morganModel_HLM, criterion, optimizer_HLM, epochs=CFG['EPOCHS'])

Training Start: MLM
Epoch: 0/10000, Train Loss: 39.60606869784269, Valid Loss: 37.37712605794271
Epoch: 1/10000, Train Loss: 36.50003780018199, Valid Loss: 36.68722788492838
Epoch: 2/10000, Train Loss: 36.00373979048295, Valid Loss: 36.6932258605957
Epoch: 3/10000, Train Loss: 36.29834816672585, Valid Loss: 36.67222340901693
Epoch: 4/10000, Train Loss: 35.14621908014471, Valid Loss: 36.27057902018229
Epoch: 5/10000, Train Loss: 34.39154260808771, Valid Loss: 34.22482426961263
Epoch: 6/10000, Train Loss: 33.61454287442294, Valid Loss: 33.68545150756836
Epoch: 7/10000, Train Loss: 33.178049087524414, Valid Loss: 34.550366719563804
Epoch: 8/10000, Train Loss: 33.39829913052645, Valid Loss: 36.13474909464518
Epoch: 9/10000, Train Loss: 32.78080662814054, Valid Loss: 34.476731618245445
Epoch: 10/10000, Train Loss: 33.1479400287975, Valid Loss: 34.55092748006185
Epoch: 11/10000, Train Loss: 32.54441937533292, Valid Loss: 34.28586196899414
Epoch: 12/10000, Train Loss: 31.836274233731356, Vali

In [42]:
torch.save(morganModel_MLM.state_dict(), '/content/drive/MyDrive/Colab Notebooks/AIDrug_Competition/models/OnlyMorganFingerprint_DNN_Model_MLM_3.pth')  # 모델 객체의 state_dict 저장
torch.save(morganModel_HLM.state_dict(), '/content/drive/MyDrive/Colab Notebooks/AIDrug_Competition/models/OnlyMorganFingerprint_DNN_Model_HLM_3.pth')

In [None]:
morganModel_MLM.load_state_dict(torch.load('/content/drive/MyDrive/Colab Notebooks/AIDrug_Competition/models/OnlyMorganFingerprint_DNN_Model_MLM_3.pth'))
morganModel_HLM.load_state_dict(torch.load('/content/drive/MyDrive/Colab Notebooks/AIDrug_Competition/models/OnlyMorganFingerprint_DNN_Model_HLM_3.pth'))

In [43]:
test_MLM = MorganDataset(test, target_col=None, transform=transform, is_test=True)
test_HLM = MorganDataset(test, target_col=None, transform=transform, is_test=True)

test_MLM_loader = DataLoader(dataset=test_MLM,
                             batch_size=CFG['BATCH_SIZE'],
                             shuffle=False)

test_HLM_loader = DataLoader(dataset=test_HLM,
                             batch_size=CFG['BATCH_SIZE'],
                             shuffle=False)

In [44]:
def inference(test_loader, model):
    model.eval()
    preds = []

    with torch.no_grad():
        for inputs in test_loader:
            output = model(inputs)
            preds.extend(output.cpu().numpy().flatten().tolist())

    return preds

In [45]:
morgan_predictions_MLM = inference(test_MLM_loader, morganModel_MLM)
morgan_predictions_HLM = inference(test_HLM_loader, morganModel_HLM)

In [46]:
import numpy as np

morgan_predict_MLM = np.array(morgan_predictions_MLM)
newFeature_predict_MLM = np.array(newFeature_predictions_MLM)

morgan_predict_HLM = np.array(morgan_predictions_HLM)
newFeature_predict_HLM = np.array(newFeature_predictions_HLM)

ensemble_predictions_MLM = (morgan_predict_MLM + newFeature_predict_MLM) / 2
ensemble_predictions_HLM = (morgan_predict_HLM + newFeature_predict_HLM) / 2

print(ensemble_predictions_HLM)


[60.46485901 86.49609375 61.66860771 68.56609726 89.90302658 83.05833435
 34.22291851 61.19134903 27.76046944 43.76188087 37.40528107 83.35913849
 76.06226349 70.82374573 28.8131752  82.70818329 41.24373722 59.513237
 84.40455627 76.69549561 59.70306206 25.78975296 28.51766205 26.89700985
 39.10724068 63.7062149  58.40719414 85.40262985 69.63093758 24.79141331
 60.55277061 34.51484489 47.14051247 48.41561317 73.45012283 29.45366383
 50.29488564 81.99282074 36.84933472 27.42454624 36.21807671 88.9228096
 22.88251591 43.12194538 28.44503403 87.02971649 29.41240215 27.10623264
 74.64299774 60.62939644 83.29589081 29.78503895 37.05029488 22.7594223
 46.39592934 64.76865578 64.74313545 39.76068497 31.2953968  36.34324837
 44.07148552 73.46212769 49.8523407  79.96922302 68.48624039 51.85720825
 74.70523453 58.68249321 73.4086895  62.44529533 63.1654911  76.51783752
 63.22107506 83.98417664 36.63149071 78.38834381 68.07758904 59.97873306
 80.65073395 31.84738922 77.74015045 77.7095108  33.412

In [47]:
submission = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AIDrug_Competition/data/sample_submission.csv')

submission['MLM'] = ensemble_predictions_MLM
submission['HLM'] = ensemble_predictions_HLM
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,14.932632,60.464859
1,TEST_001,69.077065,86.496094
2,TEST_002,26.201039,61.668608
3,TEST_003,48.412098,68.566097
4,TEST_004,39.029470,89.903027
...,...,...,...
478,TEST_478,13.450228,30.333930
479,TEST_479,82.493900,93.934593
480,TEST_480,30.967842,77.119736
481,TEST_481,49.739725,74.777763


In [48]:
submission.to_csv('/content/drive/MyDrive/Colab Notebooks/AIDrug_Competition/submissions/NewFeature+Morgan_DNN_EnsembleModel_1_submission.csv', index=False)