In [1]:
import os
import pickle as pickle
import numpy as np
import pandas as pd

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

In [3]:
# Directory holding the pickled DataFrames (df_train.pk / df_valid.pk / df_test.pk)
# that the cells below read with pd.read_pickle.
TARGET_DIR = './data/target'

# 1. Config / Params

In [4]:
# Dropout probability passed to every nn.Dropout layer in Net.
dropout_rate = 0.2
# Width of Net's final Linear layer and the number of classes evaluated below.
num_classes = 50
# NOTE(review): shift_cls is not referenced anywhere in the visible notebook —
# confirm whether it is still needed.
shift_cls = 0

# Checkpoint read by torch.load in the validation section.
model_path = './checkpoints/ir-model-01.pt'
# Destination for the state_dict re-exported with torch.save further down.
model_state_dict_path = './checkpoints/ir-model_state_dict-01.pt'

In [5]:
# Minimum accuracy, in percent, that each of the four per-class accuracies
# (valid/test x true/false) must reach for a class to count as passing.
# NOTE(review): the name is misspelled ("threshould"); it is used in several
# cells, so rename every occurrence together if it is ever corrected.
pass_threshould = 80.0

# 2. Dataset & Dataloader

In [6]:
class IrDataset(Dataset):
    """Dataset wrapper around an in-memory array of spectra.

    Each item is a dict ``{'xs': tensor}`` where ``tensor`` is the float32
    version of ``spectra[idx]``. No labels are stored here.

    Args:
        spectra: Array indexed along its first axis; must expose ``.shape``
            and yield values accepted by ``torch.from_numpy`` when indexed.
        transform: Optional callable applied to the sample dict before it is
            returned.
    """

    def __init__(self, spectra, transform=None):
        self.spectra = spectra
        self.transform = transform

    def __len__(self):
        """Number of samples, i.e. the size of the first axis of ``spectra``."""
        return self.spectra.shape[0]

    def __getitem__(self, idx):
        """Return the sample dict for ``idx`` (a plain index or a tensor index)."""
        # Tensor indices are converted to plain Python values before indexing.
        index = idx.tolist() if torch.is_tensor(idx) else idx

        sample = {'xs': torch.from_numpy(self.spectra[index]).float()}

        if not self.transform:
            return sample
        return self.transform(sample)

In [7]:
# Load the validation split.
df = pd.read_pickle(TARGET_DIR + '/df_valid.pk')
row_count = len(df)

# Stack the per-row 'spectrum' arrays and reshape to (row_count, 1, -1),
# i.e. one channel per sample, as expected by the first Conv1d layer of Net.
target = np.hstack(df['spectrum'].values).squeeze().reshape(row_count, 1, -1)

valid_dataset = IrDataset(spectra=target)
# Columns 2..51 (50 columns) are used as the label matrix during evaluation.
valid_ys = df.iloc[:, 2:52].values

In [8]:
# Load the test split (same preparation as the validation split).
df = pd.read_pickle(TARGET_DIR + '/df_test.pk')
row_count = len(df)

# Stack the per-row 'spectrum' arrays and reshape to (row_count, 1, -1),
# i.e. one channel per sample, as expected by the first Conv1d layer of Net.
target = np.hstack(df['spectrum'].values).squeeze().reshape(row_count, 1, -1)

test_dataset = IrDataset(spectra=target)
# Columns 2..51 (50 columns) are used as the label matrix during evaluation.
test_ys = df.iloc[:, 2:52].values

In [9]:
# batch_size equals the dataset length, so each loader yields the whole split
# as a single batch; make_predict_and_bundle_answer rejects anything else.
# shuffle=False keeps sample order aligned with the valid_ys / test_ys rows.
valid_loader = DataLoader(valid_dataset, batch_size=len(valid_dataset), shuffle=False, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False, num_workers=0)

In [10]:
# Sanity check: the 50 label columns (positions 2..51) must carry the same
# names, in the same order, in the train, valid and test frames.
df_train = pd.read_pickle(TARGET_DIR + '/df_train.pk')
df_valid = pd.read_pickle(TARGET_DIR + '/df_valid.pk')
df_test = pd.read_pickle(TARGET_DIR + '/df_test.pk')

df_train_col_names = df_train.columns.values[2:52].tolist()
df_valid_col_names = df_valid.columns.values[2:52].tolist()
df_test_col_names = df_test.columns.values[2:52].tolist()

for idx in range(50):
    names_agree = df_train_col_names[idx] == df_valid_col_names[idx] == df_test_col_names[idx]
    if not names_agree:
        # Any printed index marks a column whose name differs between splits.
        print('!!!', idx)

In [11]:
df_test_col_names

['CO',
 'cOC',
 'COC(-,:C)=O',
 'cnc',
 'cCl',
 'cO',
 'CCl',
 'CC(-,:C)=O',
 'cBr',
 'c[N&+](=O)[O&-]',
 'cC(-,:C)=O',
 'C=CC',
 'cN',
 'cC(=O)OC',
 'COC',
 'CF',
 'CBr',
 'coc',
 'cF',
 'CC(=O)O',
 'c=O',
 'c[n&H1]c',
 'csc',
 'cC=O',
 'CNC',
 'CN',
 'CN(-,:C)C',
 'cC#N',
 'cn(-,:c)C',
 'cC(=O)O',
 'CC=C(-,:C)C',
 'CC#N',
 'cNC(-,:C)=O',
 'cNC',
 'C/C=C/C',
 'CC=CC',
 'C=C(-,:C)C',
 'C#CC',
 'cC(-,:c)=O',
 'cN(-,:C)C',
 'CC(=O)OC',
 'CC#CC',
 'cI',
 'CNC(-,:C)=O',
 'cC=Cc',
 'c-n(-,:c)c',
 'cnn(-,:c)C',
 'cnnc',
 'cP(-,:c)c',
 'CS']

# 3. Model

In [12]:
# Inference is pinned to the CPU; the commented-out line below is the
# CUDA-if-available alternative.
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

In [13]:
class Net(nn.Module):
    """1-D CNN producing ``num_classes`` independent sigmoid probabilities.

    Architecture:
        * three conv stages: Conv1d(kernel 11, padding 5) -> ReLU ->
          BatchNorm1d -> Dropout -> max-pool by 2, with 2, 4 and 8 channels;
        * flatten;
        * three dense stages: Linear -> BatchNorm1d -> ReLU -> Dropout with
          1000, 250 and 64 units;
        * a final Linear to ``num_classes`` followed by a sigmoid.

    The first Linear layer expects 3400 flattened features, so with 8 channels
    the length remaining after the three poolings must be 425.

    Reads the module-level ``dropout_rate`` and ``num_classes``.
    """

    def __init__(self):
        super().__init__()
        # Layers are registered in the same order and under the same attribute
        # names as before, so saved checkpoints stay compatible.
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=2, kernel_size=11, stride=1, padding=5)
        self.conv1_bn = nn.BatchNorm1d(2)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.conv2 = nn.Conv1d(in_channels=2, out_channels=4, kernel_size=11, stride=1, padding=5)
        self.conv2_bn = nn.BatchNorm1d(4)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.conv3 = nn.Conv1d(in_channels=4, out_channels=8, kernel_size=11, stride=1, padding=5)
        self.conv3_bn = nn.BatchNorm1d(8)
        self.dropout3 = nn.Dropout(dropout_rate)

        self.fc1 = nn.Linear(3400, 1000)
        self.fc1_bn = nn.BatchNorm1d(1000)
        self.dropout_fc1 = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(1000, 250)
        self.fc2_bn = nn.BatchNorm1d(250)
        self.dropout_fc2 = nn.Dropout(dropout_rate)
        self.fc3 = nn.Linear(250, 64)
        self.fc3_bn = nn.BatchNorm1d(64)
        self.dropout_fc3 = nn.Dropout(dropout_rate)
        self.fc4 = nn.Linear(64, num_classes)

    def forward(self, x):
        """Map a ``(batch, 1, length)`` input to ``(batch, num_classes)`` probabilities."""
        conv_stages = (
            (self.conv1, self.conv1_bn, self.dropout1),
            (self.conv2, self.conv2_bn, self.dropout2),
            (self.conv3, self.conv3_bn, self.dropout3),
        )
        # Convolutional stages: batch norm is applied AFTER the ReLU here.
        for conv, bn, drop in conv_stages:
            activated = bn(F.relu(conv(x)))
            x = F.max_pool1d(drop(activated), 2)

        x = torch.flatten(x, 1)

        dense_stages = (
            (self.fc1, self.fc1_bn, self.dropout_fc1),
            (self.fc2, self.fc2_bn, self.dropout_fc2),
            (self.fc3, self.fc3_bn, self.dropout_fc3),
        )
        # Dense stages: batch norm is applied BEFORE the ReLU here.
        for fc, bn, drop in dense_stages:
            x = drop(F.relu(bn(fc(x))))

        return torch.sigmoid(self.fc4(x))

In [14]:
# Freshly initialised (untrained) network placed on the selected device.
# NOTE(review): the evaluation cells below use `loaded_model` from the
# checkpoint instead; `model` is not referenced again in the visible notebook.
model = Net().to(device)

# Training-time objects, left disabled in this evaluation notebook.
# optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
# criterion = nn.BCELoss(reduction='none')

# scheduler = StepLR(optimizer, step_size=10, gamma=gamma)

# 4. Train / Test processes

In [15]:
# train_results = []

# def train_process(model, device, data_loader, criterion, optimizer, epoch):
#     model.train()
    
#     total_loss = 0.0
#     counter = 0.0
#     for batch_idx, data in enumerate(data_loader):
#         xs, ys, ws = data['xs'].to(device), data['ys'].to(device), data['ws'].to(device)
#         optimizer.zero_grad()
        
#         output = model(xs)
#         loss = criterion(output, ys)
#         loss = (loss * ws).mean()
        
#         total_loss += loss.item()
#         counter += 1.0
        
#         loss.backward()
        
#         optimizer.step()
    
#     train_results.append((epoch, total_loss / counter))

In [16]:
# NOTE(review): not referenced anywhere in the visible notebook — presumably a
# leftover from the disabled training/verification loop; confirm before removing.
min_verify_loss = 10000

def make_predict_and_bundle_answer(model, device, data_loader, ys):
    """Run ``model`` over a single-batch loader and pair the output with ``ys``.

    Args:
        model: Module to evaluate; it is switched to eval mode.
        device: Device the input batch is moved to before the forward pass.
        data_loader: Iterable of dicts with an ``'xs'`` tensor. It must yield
            exactly one batch, so the predictions line up with ``ys``.
        ys: Ground-truth labels for the whole split; returned unchanged.

    Returns:
        ``(predictions, ys)`` where ``predictions`` is the model output as a
        NumPy array, or ``False`` (after printing an error) when the loader
        did not yield exactly one batch.
    """
    model.eval()

    outputs = []
    with torch.no_grad():  # inference only, no autograd bookkeeping
        for data in data_loader:
            xs = data['xs'].to(device)
            outputs.append(model(xs).cpu().numpy())

    # Compare by value: `is not 1` tested object identity against an int
    # literal, which is implementation-dependent and a SyntaxWarning on
    # Python 3.8+.
    if len(outputs) != 1:
        print('ERROR: data_loader should be 1 batch!')
        return False
    return outputs[0], ys

# 5. Run

In [17]:
# %%time

# for epoch in range(epochs):
#     train_process(model, device, train_loader, criterion, optimizer, epoch)
#     verify_process(model, device, valid_loader, criterion, epoch)
#     if epoch % log_interval == 0:
#         print(
#             'Epoch: {:0>3d}; Train Loss: {:.5f}; Validation Loss: {:.5f}'.format(
#                 epoch, 
#                 train_results[-1][1], 
#                 verify_results[-1][1]
#             )
#         )
#     scheduler.step()

In [18]:
# train_loss = [a for e, a in train_results]
# train_eps = [e for e, a in train_results]
# valid_loss = [a for e, a in verify_results]
# valid_eps = [e for e, a in verify_results]
# plt.plot(train_eps, train_loss, 'b-')
# plt.plot(valid_eps, valid_loss, 'r.')
# plt.show()
# min(valid_loss)

# 6. Validation

In [19]:
# Remap any tensors stored in the checkpoint onto the CPU while loading.
map_location='cpu'

# The checkpoint holds a whole pickled model object (not just a state_dict),
# so the Net class defined above must be importable under the same name.
# NOTE(review): unpickling executes arbitrary code — only load checkpoints from
# a trusted source. Recent PyTorch releases default torch.load to
# weights_only=True, which rejects full-model pickles; confirm the installed
# version or load the state_dict export instead.
loaded_model = torch.load(model_path, map_location=map_location)
loaded_model.eval()

Net(
  (conv1): Conv1d(1, 2, kernel_size=(11,), stride=(1,), padding=(5,))
  (conv1_bn): BatchNorm1d(2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (conv2): Conv1d(2, 4, kernel_size=(11,), stride=(1,), padding=(5,))
  (conv2_bn): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout2): Dropout(p=0.2, inplace=False)
  (conv3): Conv1d(4, 8, kernel_size=(11,), stride=(1,), padding=(5,))
  (conv3_bn): BatchNorm1d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout3): Dropout(p=0.2, inplace=False)
  (fc1): Linear(in_features=3400, out_features=1000, bias=True)
  (fc1_bn): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout_fc1): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=1000, out_features=250, bias=True)
  (fc2_bn): BatchNorm1d(250, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout_fc

In [20]:
def calc_accuracy(cls_idx, yhs, ys):
    """Per-class accuracy on positive and on negative samples, in percent.

    A prediction counts as positive when its score is ``>= 0.5``.

    Args:
        cls_idx: Column index of the class to evaluate.
        yhs: Predicted scores, indexable as ``yhs[sample][cls_idx]``.
        ys: Ground-truth labels with the same layout; only labels equal to
            1 (positive) or 0 (negative) are counted, others are ignored.

    Returns:
        ``(t_acc, f_acc)``: percentage of positive samples predicted positive
        and percentage of negative samples predicted negative. If a class has
        no positive (or no negative) samples, that accuracy is undefined and
        is returned as ``nan`` instead of raising ``ZeroDivisionError``; a
        ``nan`` never satisfies a ``>= threshold`` check.
    """
    t_total, t_correct = 0, 0
    f_total, f_correct = 0, 0

    for idx in range(len(ys)):
        yh, y = yhs[idx][cls_idx], ys[idx][cls_idx]
        if y == 1:
            t_total += 1
            if yh >= 0.5:
                t_correct += 1
        elif y == 0:
            f_total += 1
            if yh < 0.5:
                f_correct += 1

    undefined = float('nan')
    t_acc = t_correct * 100.0 / t_total if t_total else undefined
    f_acc = f_correct * 100.0 / f_total if f_total else undefined
    return t_acc, f_acc


def print_result(cls_idx, valid_t_acc, valid_f_acc, test_t_acc, test_f_acc):
    """Print a one-line pass/fail summary for one class.

    Each accuracy is followed by ``O`` when it is at least ``pass_threshould``
    and ``X`` otherwise; the mark right after the class index is ``O`` only
    when all four accuracies pass. The line ends with the class name taken
    from ``df_test_col_names``.
    """
    def mark(passed):
        return 'O' if passed else 'X'

    accuracies = (valid_t_acc, valid_f_acc, test_t_acc, test_f_acc)
    passed = [acc >= pass_threshould for acc in accuracies]

    template = 'FG{:0>2d}({}) ValidTrueAcc = {:0.0f}%({}); ValidFalseAcc = {:.0f}%({}); TestTrueAcc = {:0.0f}%({}); TestFalseAcc = {:.0f}%({}); {}'

    # Interleave each accuracy with its own pass mark, in template order.
    fields = [cls_idx, mark(all(passed))]
    for acc, ok in zip(accuracies, passed):
        fields.append(acc)
        fields.append(mark(ok))
    fields.append(df_test_col_names[cls_idx])

    print(template.format(*fields))

In [21]:
# Predict once per split; each call returns (predictions, labels).
valid_yhs, valid_ys = make_predict_and_bundle_answer(loaded_model, device, valid_loader, valid_ys)
test_yhs, test_ys = make_predict_and_bundle_answer(loaded_model, device, test_loader, test_ys)

fail_fg_idxs = []          # class indices that miss the threshold somewhere
pass_count = 0             # number of classes passing on all four accuracies
dict_fg_acc_results = {}   # class name -> balanced test accuracy + output index

for cls_idx in range(num_classes):
    valid_t_acc, valid_f_acc = calc_accuracy(cls_idx, valid_yhs, valid_ys)
    test_t_acc, test_f_acc = calc_accuracy(cls_idx, test_yhs, test_ys)
    print_result(cls_idx, valid_t_acc, valid_f_acc, test_t_acc, test_f_acc)

    # A class passes only if every one of its four accuracies reaches the threshold.
    all_pass = all(
        acc >= pass_threshould
        for acc in (valid_t_acc, valid_f_acc, test_t_acc, test_f_acc)
    )
    if not all_pass:
        fail_fg_idxs.append(cls_idx)
        continue

    pass_count += 1
    # Balanced accuracy = mean of the positive-side and negative-side test accuracies.
    test_balanced_acc = (test_t_acc + test_f_acc) / 2.0
    dict_fg_acc_results[df_test_col_names[cls_idx]] = {
        'bacc': test_balanced_acc,
        'output_idx': cls_idx,
    }

summary_label = 'number of useful FGs in valid sets (t_acc and f_acc are more than {}%) = '.format(pass_threshould)
print(
    summary_label,
    pass_count,
)

FG00(O) ValidTrueAcc = 89%(O); ValidFalseAcc = 94%(O); TestTrueAcc = 96%(O); TestFalseAcc = 95%(O); CO
FG01(O) ValidTrueAcc = 86%(O); ValidFalseAcc = 87%(O); TestTrueAcc = 83%(O); TestFalseAcc = 87%(O); cOC
FG02(O) ValidTrueAcc = 97%(O); ValidFalseAcc = 93%(O); TestTrueAcc = 97%(O); TestFalseAcc = 93%(O); COC(-,:C)=O
FG03(X) ValidTrueAcc = 88%(O); ValidFalseAcc = 83%(O); TestTrueAcc = 89%(O); TestFalseAcc = 80%(X); cnc
FG04(X) ValidTrueAcc = 89%(O); ValidFalseAcc = 80%(O); TestTrueAcc = 91%(O); TestFalseAcc = 78%(X); cCl
FG05(O) ValidTrueAcc = 92%(O); ValidFalseAcc = 95%(O); TestTrueAcc = 88%(O); TestFalseAcc = 93%(O); cO
FG06(O) ValidTrueAcc = 89%(O); ValidFalseAcc = 90%(O); TestTrueAcc = 90%(O); TestFalseAcc = 90%(O); CCl
FG07(O) ValidTrueAcc = 84%(O); ValidFalseAcc = 91%(O); TestTrueAcc = 97%(O); TestFalseAcc = 90%(O); CC(-,:C)=O
FG08(X) ValidTrueAcc = 81%(O); ValidFalseAcc = 77%(X); TestTrueAcc = 91%(O); TestFalseAcc = 76%(X); cBr
FG09(O) ValidTrueAcc = 90%(O); ValidFalseAcc = 96%(

In [22]:
dict_fg_acc_results

{'CO': {'bacc': 95.50769437537801, 'output_idx': 0},
 'cOC': {'bacc': 85.05103380989905, 'output_idx': 1},
 'COC(-,:C)=O': {'bacc': 95.19251568245727, 'output_idx': 2},
 'cO': {'bacc': 90.88144881047847, 'output_idx': 5},
 'CCl': {'bacc': 89.92131616595137, 'output_idx': 6},
 'CC(-,:C)=O': {'bacc': 93.48700495049505, 'output_idx': 7},
 'c[N&+](=O)[O&-]': {'bacc': 94.65554948059048, 'output_idx': 9},
 'cC(-,:C)=O': {'bacc': 93.66684891561874, 'output_idx': 10},
 'C=CC': {'bacc': 90.92034968431278, 'output_idx': 11},
 'cN': {'bacc': 95.26069921639542, 'output_idx': 12},
 'cC(=O)OC': {'bacc': 91.36191947806678, 'output_idx': 13},
 'COC': {'bacc': 97.96348314606742, 'output_idx': 14},
 'CF': {'bacc': 94.31358721203574, 'output_idx': 15},
 'CBr': {'bacc': 83.41736694677871, 'output_idx': 16},
 'coc': {'bacc': 88.109243697479, 'output_idx': 17},
 'CC(=O)O': {'bacc': 91.8969298245614, 'output_idx': 19},
 'c=O': {'bacc': 91.83232849926674, 'output_idx': 20},
 'CNC': {'bacc': 94.22547149819877,

In [23]:
# Persist the per-class results of the passing classes next to the checkpoints.
fn_dict_fg_acc_results = './checkpoints/dict_fg_acc_results.pk'
with open(fn_dict_fg_acc_results, 'wb') as file:
    pickle.dump(dict_fg_acc_results, file)
    
# Read the file straight back so the next cells can display what was written.
# pickle.load is only applied to the file written just above; do not point it
# at untrusted files.
loaded_dict_fg_acc_results = None
with open(fn_dict_fg_acc_results, 'rb') as file:
    loaded_dict_fg_acc_results = pickle.load(file)

In [24]:
loaded_dict_fg_acc_results

{'CO': {'bacc': 95.50769437537801, 'output_idx': 0},
 'cOC': {'bacc': 85.05103380989905, 'output_idx': 1},
 'COC(-,:C)=O': {'bacc': 95.19251568245727, 'output_idx': 2},
 'cO': {'bacc': 90.88144881047847, 'output_idx': 5},
 'CCl': {'bacc': 89.92131616595137, 'output_idx': 6},
 'CC(-,:C)=O': {'bacc': 93.48700495049505, 'output_idx': 7},
 'c[N&+](=O)[O&-]': {'bacc': 94.65554948059048, 'output_idx': 9},
 'cC(-,:C)=O': {'bacc': 93.66684891561874, 'output_idx': 10},
 'C=CC': {'bacc': 90.92034968431278, 'output_idx': 11},
 'cN': {'bacc': 95.26069921639542, 'output_idx': 12},
 'cC(=O)OC': {'bacc': 91.36191947806678, 'output_idx': 13},
 'COC': {'bacc': 97.96348314606742, 'output_idx': 14},
 'CF': {'bacc': 94.31358721203574, 'output_idx': 15},
 'CBr': {'bacc': 83.41736694677871, 'output_idx': 16},
 'coc': {'bacc': 88.109243697479, 'output_idx': 17},
 'CC(=O)O': {'bacc': 91.8969298245614, 'output_idx': 19},
 'c=O': {'bacc': 91.83232849926674, 'output_idx': 20},
 'CNC': {'bacc': 94.22547149819877,

In [25]:
len(loaded_dict_fg_acc_results)

33

In [26]:
# Re-export the loaded model as a plain state_dict (parameters and buffers only),
# which can be restored with Net().load_state_dict(...) without unpickling the
# whole model object.
torch.save(loaded_model.state_dict(), model_state_dict_path)

# 7. Performance

In [27]:
fail_fg_idxs

[3, 4, 8, 18, 21, 22, 23, 25, 29, 31, 34, 37, 38, 42, 45, 47, 49]

In [28]:
len(fail_fg_idxs)

17