## PL Trainer の訓練をoptunaに対応させる

In [1]:
import os
import sys
from os.path import exists

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import pytorch_lightning as pl
from optuna.integration import PyTorchLightningPruningCallback

# Project-local modules live in ../script/, so extend the path before importing them.
sys.path.append('../script/')
import utils
import models

# Target device name (not referenced by the cells visible in this notebook).
DEVICE = "cuda"
# Maximum number of epochs per Optuna trial (read by `objective`).
EPOCHS = 2
# Root directory for per-trial checkpoints written by `objective`;
# same directory the final weights are saved to at the end of the notebook.
MODEL_DIR = "../weight"
# Fraction of validation batches evaluated per epoch inside `objective`
# (1.0 = evaluate on the full validation set).
PERCENT_VALID_EXAMPLES = 1.0



In [2]:
class MetricsCallback(pl.Callback):
    """Record the trainer's callback metrics after every validation run.

    ``metrics`` receives one entry per ``on_validation_end`` call, so
    ``metrics[-1]`` is the result of the most recent validation run.
    """

    def __init__(self):
        super().__init__()
        # One snapshot (dict of metric name -> value) per validation run.
        self.metrics = []

    def on_validation_end(self, trainer, pl_module):
        """Append a snapshot of ``trainer.callback_metrics`` to the history."""
        # Store a shallow copy rather than the trainer's own dict: if the
        # trainer updates that dict in place, appending the same object would
        # make every history entry show only the latest values.
        self.metrics.append(dict(trainer.callback_metrics))

## Add folds No. for CV

In [3]:
path_fold = "../input/folds/train_folds.csv"
if not exists(path_fold):
    # Imported lazily: only required when the fold file has to be generated.
    from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

    df = pd.read_csv("../input/lish-moa/train_targets_scored.csv")
    df.loc[:, "kfold"] = -1
    # Fixed seed so that regenerating the file yields the same shuffle.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    # Stratify on the label columns only; the "kfold" placeholder must not be
    # treated as an extra target.
    targets = df.drop(["sig_id", "kfold"], axis=1).values

    mskf = MultilabelStratifiedKFold(n_splits=5)
    for fold_, (_, va_) in enumerate(mskf.split(X=df, y=targets)):
        # Rows selected as validation in split `fold_` get that fold number.
        df.loc[va_, "kfold"] = fold_

    # Make sure the output directory exists before writing the fold file.
    os.makedirs(os.path.dirname(path_fold), exist_ok=True)
    df.to_csv(path_fold, index=False)

## Params for training function `run_training`

In [4]:
# Fold number held out as the validation split in the prototyping cells below.
fold = 0
save_model = True
# Arbitrary placeholder values, since Optuna is not used for this prototype run.
params = dict(
    num_layers=3,
    hidden_size=16,
    dropout=0.3,
    learning_rate=1e-3,
)

---
## Prototyping training process from HERE

In [5]:
# Fold assignments (one row per sig_id) written by the fold-creation cell.
folds = pd.read_csv("../input/folds/train_folds.csv")
# Raw training features, passed through the project's preprocessing helper.
df = utils.process_data(pd.read_csv("../input/lish-moa/train_features.csv"))

## Create aux target 
`nsc_labels` is the number of positive labels each sample has in the non-scored training targets, which are not available for the test set.

In [6]:
non_scored_df = pd.read_csv("../input/lish-moa/train_targets_nonscored.csv")
# Row-wise sum over every non-scored target column (everything except sig_id).
targets_non_scored = non_scored_df.drop(columns="sig_id").to_numpy().sum(axis=1)
# Keep only the id plus the new per-row count; all other columns are discarded.
non_scored_df = non_scored_df[["sig_id"]].assign(nsc_labels=targets_non_scored)
# Attach the auxiliary count to the fold table, keeping every row of `folds`.
folds = folds.merge(non_scored_df, how="left", on="sig_id")
folds.head()

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor,kfold,nsc_labels
0,id_09f3cdc66,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,id_83659b848,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,0
2,id_17e3b5c2d,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,1
3,id_29e34a9e1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,0
4,id_cabd98a36,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [7]:
# Feature columns: everything in the processed feature frame except the id.
features = df.columns.drop("sig_id")
# Target columns: everything in the fold table except the id and fold number
# (this includes the auxiliary "nsc_labels" column merged in above).
targets = folds.columns.drop(["sig_id", "kfold"])

In [8]:
# Attach targets and fold numbers to the feature rows, matching on sig_id.
df = pd.merge(df, folds, how="left", on="sig_id")

In [9]:
print(f'[Fold No.{fold:>3}]\n')
# Rows belonging to the held-out fold form the validation set; the rest train.
in_valid_fold = df.kfold == fold
valid_df = df[in_valid_fold].reset_index(drop=True)
train_df = df[~in_valid_fold].reset_index(drop=True)

[Fold No.  0]



In [10]:
# Feature matrices for the training and validation splits, as NumPy arrays.
x_tr, x_va = (split[features].to_numpy() for split in (train_df, valid_df))

In [11]:
# Target matrices for the training and validation splits, as NumPy arrays.
y_tr, y_va = (split[targets].to_numpy() for split in (train_df, valid_df))

In [12]:
# Collects the callback metrics after each validation run (read back below).
metrics_callback = MetricsCallback()
trainer = pl.Trainer(
    max_epochs=5,
    gpus=1,
    logger=False,
    weights_summary="full",
    callbacks=[metrics_callback],
)
# TODO: for Optuna, also pass
#   early_stop_callback=PyTorchLightningPruningCallback(trial, monitor="val_acc")

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]


In [13]:
# Network sized from the data: one input per feature column, one output per target column.
backbone = models.BaseLine2(
    num_features=x_tr.shape[1],
    num_targets=y_tr.shape[1],
    num_layers=params["num_layers"],
    hidden_size=params["hidden_size"],
    dropout=params["dropout"],
)
model = utils.LitMoA(hparams={}, model=backbone)

# Training rows are stacked first, followed by the validation rows.
# NOTE(review): presumably MoADataModule uses "train_size" to split them again - confirm in utils.
dm = utils.MoADataModule(
    hparams={"train_size": x_tr.shape[0]},
    data=np.vstack([x_tr, x_va]),
    targets=np.vstack([y_tr, y_va]),
)

In [14]:
# Run training with the trainer, model and data module built in the cells above.
trainer.fit(model, dm)

{'train_size': 19052}



   | Name          | Type              | Params
-----------------------------------------------------
0  | model         | BaseLine2         | 18 K  
1  | model.model   | Sequential        | 18 K  
2  | model.model.0 | Linear            | 14 K  
3  | model.model.1 | BatchNorm1d       | 32    
4  | model.model.2 | Dropout           | 0     
5  | model.model.3 | Linear            | 272   
6  | model.model.4 | BatchNorm1d       | 32    
7  | model.model.5 | Dropout           | 0     
8  | model.model.6 | Linear            | 272   
9  | model.model.7 | BatchNorm1d       | 32    
10 | model.model.8 | Dropout           | 0     
11 | model.model.9 | Linear            | 3 K   
12 | criterion     | BCEWithLogitsLoss | 0     


Epoch 0:  79%|███████████████████████████████▋        | 19/24 [00:05<00:01,  3.46it/s, loss=0.722, training_loss=0.707]
Validating: 0it [00:00, ?it/s]
Epoch 0: 100%|██████████████████████| 24/24 [00:10<00:00,  2.28it/s, loss=0.722, training_loss=0.707, valid_loss=0.694]
Epoch 1:  79%|███▏| 19/24 [00:05<00:01,  3.46it/s, loss=0.700, training_loss=0.691, valid_loss=0.694, train_loss=0.722]
Validating: 0it [00:00, ?it/s]
Epoch 1: 100%|████| 24/24 [00:10<00:00,  2.28it/s, loss=0.700, training_loss=0.691, valid_loss=0.675, train_loss=0.722]
Epoch 2:  79%|███▏| 19/24 [00:05<00:01,  3.40it/s, loss=0.681, training_loss=0.671, valid_loss=0.675, train_loss=0.699]
Validating: 0it [00:00, ?it/s]
Epoch 2: 100%|█████| 24/24 [00:10<00:00,  2.25it/s, loss=0.681, training_loss=0.671, valid_loss=0.66, train_loss=0.699]
Epoch 3:  79%|████▊ | 19/24 [00:05<00:01,  3.43it/s, loss=0.660, training_loss=0.646, valid_loss=0.66, train_loss=0.68]
Validating: 0it [00:00, ?it/s]
Epoch 3: 100%|█████| 24/24 [00:10<00

Saving latest checkpoint..


Epoch 4: 100%|█████| 24/24 [00:10<00:00,  2.25it/s, loss=0.637, training_loss=0.627, valid_loss=0.612, train_loss=0.66]


1

In [15]:
# "valid_loss" from the most recent entry recorded by MetricsCallback, as a Python float.
metrics_callback.metrics[-1]["valid_loss"].item()

0.6115400195121765

In [16]:
def objective(trial):
    """Optuna objective: train one model with trial-suggested hyperparameters.

    Mirrors the prototype cells above for a single fold: it builds
    ``models.BaseLine2`` wrapped in ``utils.LitMoA``, trains it on the
    module-level arrays ``x_tr``/``y_tr``/``x_va``/``y_va``, and reports the
    last recorded ``valid_loss``.

    Relies on the module-level names ``MODEL_DIR``, ``PERCENT_VALID_EXAMPLES``
    and ``EPOCHS`` being defined before it is called.

    Args:
        trial: the Optuna trial used to sample hyperparameters and to report
            intermediate values for pruning.

    Returns:
        float: ``valid_loss`` of the final validation run (lower is better, so
        the study should minimise it).
    """
    params = {
        "num_layers": trial.suggest_int("num_layers", 1, 7),
        "hidden_size": trial.suggest_int("hidden_size", 16, 2048),
        "dropout": trial.suggest_uniform("dropout", 0.1, 0.8),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 1e-3),
    }

    # The model logs "valid_loss" (there is no "val_acc"), so checkpointing,
    # pruning and the returned value all monitor that key.
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        os.path.join(MODEL_DIR, "trial_{}".format(trial.number), "{epoch}"), monitor="valid_loss"
    )
    metrics_callback = MetricsCallback()
    trainer = pl.Trainer(
        logger=False,
        limit_val_batches=PERCENT_VALID_EXAMPLES,
        checkpoint_callback=checkpoint_callback,
        max_epochs=EPOCHS,
        gpus=1 if torch.cuda.is_available() else None,
        callbacks=[metrics_callback],
        early_stop_callback=PyTorchLightningPruningCallback(trial, monitor="valid_loss"),
    )

    model = utils.LitMoA(
        # NOTE(review): the prototype cell passes hparams={}; confirm that
        # LitMoA actually reads "learning_rate" from hparams, otherwise the
        # sampled value has no effect.
        hparams={"learning_rate": params["learning_rate"]},
        model=models.BaseLine2(
            num_features=x_tr.shape[1],
            num_targets=y_tr.shape[1],
            num_layers=params["num_layers"],
            hidden_size=params["hidden_size"],
            dropout=params["dropout"],
        ),
    )
    # Fresh data module per trial, built the same way as in the prototype cell.
    datamodule = utils.MoADataModule(
        hparams={"train_size": x_tr.shape[0]},
        data=np.vstack([x_tr, x_va]),
        targets=np.vstack([y_tr, y_va]),
    )
    trainer.fit(model, datamodule)

    # TODO: average over all folds once a `run_training(fold, params, save_model)`
    # helper exists; for now a single fold is evaluated.
    return metrics_callback.metrics[-1]["valid_loss"].item()

In [17]:
# Test features go through the same preprocessing helper as the training features.
df_te = utils.process_data(pd.read_csv("../input/lish-moa/test_features.csv"))
x_te = df_te[features].to_numpy()
dataset_te = utils.TestMoaDataset(dataset=x_te)
# shuffle=False keeps batches in row order, which the prediction cell relies on.
loader_te = torch.utils.data.DataLoader(
    dataset=dataset_te,
    batch_size=1024,
    shuffle=False,
    num_workers=4,
)

In [18]:
# Inner network held by the Lightning wrapper; used directly for inference.
inference_model = model.model
# One row per test sample and 206 prediction columns, initialised to zero.
predictions = np.zeros(shape=(x_te.shape[0], 206))

In [19]:
inference_model.eval()
# Write each batch at a running offset instead of assuming a fixed batch size,
# so the loop stays correct if the loader's batch_size is changed.
offset = 0
# Inference only: skip building the autograd graph.
with torch.no_grad():
    for batch in loader_te:
        p = torch.sigmoid(inference_model(batch["x"])).cpu().numpy()
        # The last output column is dropped; only the remaining columns are submitted.
        predictions[offset : offset + len(p)] = p[:, :-1]
        offset += len(p)

In [20]:
test_features1 = pd.read_csv("../input/lish-moa/test_features.csv")
# Build the prediction columns in one step (named after folds.columns[1:-2])
# instead of adding integer zero columns one by one and overwriting them with
# floats afterwards.
s = pd.DataFrame(predictions, columns=folds.columns[1:-2])
# sig_id goes first, matching the original column order.
s.insert(0, "sig_id", test_features1["sig_id"].values)

In [21]:
# Ids of the test rows whose cp_type is "ctl_vehicle".
ctl_ids = test_features1.loc[test_features1["cp_type"] == "ctl_vehicle", "sig_id"]
# Force every prediction column to 0 for those rows.
is_ctl = s["sig_id"].isin(ctl_ids)
s.loc[is_ctl, folds.columns[1:-2]] = 0

In [22]:
# Write the submission table to disk without the DataFrame index.
s.to_csv("../submission/submission.csv", index=False)
# Save only the inner network's state_dict (model.model), not the Lightning wrapper.
torch.save(model.model.state_dict(), "../weight/model.pt")