## Exp-029 (UMLS BERT)

Exp-027からの変更点<br>
前処理

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!nvidia-smi

Mon Sep 13 03:00:25 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P0    38W / 250W |   7631MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install transformers pycld2
! pip install texthero



In [None]:
# All artifacts of this competition live under one Google Drive folder.
_competition_root = "/content/drive/MyDrive/07_Competition/signate-471/"

input_dir = _competition_root + "data/"              # train / test / sample submission CSVs
output_dir = _competition_root + "log/"              # training logs
submission_dir = _competition_root + "submission/"   # final submission CSVs
model_dir = _competition_root + "model_bin/"         # per-fold checkpoints
pred_dir = _competition_root + "pred/"               # OOF and test predictions

In [None]:
import os
import math
import random
import pandas as pd
import numpy as np
from glob import glob
import gc
gc.enable()

import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim as optim
from torch.optim.optimizer import Optimizer
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import Dataset, DataLoader, SequentialSampler, RandomSampler

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import fbeta_score

from transformers import BertConfig, RobertaConfig
from transformers import (get_cosine_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup)
from transformers import BertTokenizer, RobertaTokenizer
from transformers import BertModel, RobertaModel
from transformers import AutoConfig, BertConfig, RobertaConfig
from transformers import BertForSequenceClassification, RobertaForSequenceClassification
from torch import cuda
import time

from transformers import AdamW
from transformers import AutoTokenizer
from transformers import AutoModel, AutoModelForSequenceClassification
from transformers import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
from transformers import get_linear_schedule_with_warmup

from IPython.display import clear_output
from tqdm import tqdm, trange

import re
import nltk
import pycld2 as cld2
import texthero as hero
from scipy.optimize import minimize, minimize_scalar
import regex
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
class CFG:
    """Hyper-parameters and identifiers for this experiment."""

    # Experiment tag; appended to the log / model / prediction directories.
    exp = "exp29h"
    # Hugging Face model id loaded by the tokenizer, config and encoder.
    model_name = "GanjinZero/UMLSBert_ENG"

    # Seed for the RNGs and for the stratified fold split.
    seed = 71
    # Number of cross-validation folds.
    fold = 5

    # Tokenizer padding / truncation length.
    max_len = 280
    # Training epochs per fold.
    epochs = 1
    # Base learning rate (decayed layer by layer when building optimizer groups).
    lr = 2e-5
    train_batch_size = 16
    valid_batch_size = 32


CONFIG = CFG()

In [None]:
# Make sure the per-experiment sub-directory exists under every artifact root.
for _artifact_root in (model_dir, pred_dir, output_dir):
    os.makedirs(_artifact_root+CONFIG.exp+"/", exist_ok=True)

In [None]:
def set_random_seed(random_seed):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) with the same value.

    Also stores the seed in the ``PYTHONHASHSEED`` environment variable and
    switches cuDNN to deterministic mode.

    Args:
        random_seed: integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(random_seed)

    torch.backends.cudnn.deterministic = True

set_random_seed(CONFIG.seed)

In [None]:
# Always a torch.device (previously a plain 'cpu' string on machines without CUDA),
# so downstream code sees one consistent type.
DEVICE = torch.device('cuda' if cuda.is_available() else 'cpu')

In [None]:
def init_logger(log_file=output_dir + CONFIG.exp+ f"/{CONFIG.exp}_train.log"):
    """Return this module's logger, writing bare messages to the console and to ``log_file``.

    Calling this again (for example by re-running the notebook cell) replaces
    the handlers from the previous call instead of adding a second pair, so
    each record is emitted once per destination.

    Args:
        log_file: path of the log file; defaults to
            ``<output_dir><exp>/<exp>_train.log``.

    Returns:
        The configured ``logging.Logger`` at INFO level.
    """
    from logging import INFO, FileHandler, Formatter, StreamHandler, getLogger

    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # getLogger returns the same object for the same name, so handlers from an
    # earlier call are still attached; drop and close them first.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    message_only = Formatter("%(message)s")
    for handler in (StreamHandler(), FileHandler(filename=log_file)):
        handler.setFormatter(message_only)
        logger.addHandler(handler)
    return logger

LOGGER = init_logger()

In [None]:
def get_train_data(train):
    """Add a ``fold`` column with stratified cross-validation fold ids.

    Folds are stratified on ``judgement`` using ``CONFIG.fold`` splits and
    ``CONFIG.seed``. The frame is modified in place and also returned.

    Args:
        train: training frame containing a ``judgement`` column.

    Returns:
        The same frame with a ``uint8`` ``fold`` column.
    """
    splitter = StratifiedKFold(n_splits=CONFIG.fold, shuffle=True, random_state=CONFIG.seed)

    # split() yields *positional* row numbers. Collect them in a positional
    # array instead of writing through label-based .loc, which would hit the
    # wrong rows whenever the frame's index is not 0..n-1.
    fold_ids = np.zeros(len(train), dtype=np.uint8)
    for n, (_, val_index) in enumerate(splitter.split(train, train["judgement"])):
        fold_ids[val_index] = n
    train["fold"] = fold_ids

    return train

def get_test_data(test):
    """Return the test frame untouched (counterpart of ``get_train_data``; no folds needed)."""
    return test

In [None]:
class SRWSDataset(Dataset):
    """Dataset over the ``title_abst`` column, tokenized once at construction.

    Items are ``(input_ids, attention_mask)`` in inference mode and
    ``(input_ids, attention_mask, target)`` otherwise.
    """

    def __init__(self, df, inference_only=False):
        """Tokenize every text in ``df`` up front.

        Args:
            df: frame with a ``title_abst`` column and, unless
                ``inference_only`` is set, a ``judgement`` column.
            inference_only: False for labelled (train/valid) data, True for
                unlabelled test data.
        """
        bert_tokenizer = BertTokenizer.from_pretrained(CONFIG.model_name)

        self.df = df
        self.inference_only = inference_only
        self.text = self.df["title_abst"].tolist()

        if not self.inference_only:
            # Keep only the underlying label values.
            self.target = df["judgement"].values

        # Pad / truncate every text to CONFIG.max_len tokens.
        self.encoded = bert_tokenizer.batch_encode_plus(
            self.text,
            padding="max_length",
            max_length=CONFIG.max_len,
            truncation=True,
            return_attention_mask=True,
        )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        features = (
            torch.tensor(self.encoded["input_ids"][index]),
            torch.tensor(self.encoded["attention_mask"][index]),
        )
        if self.inference_only:
            return features

        # The label becomes a float tensor here.
        label = torch.tensor(self.target[index]).float()
        return features + (label,)


In [None]:
class SRWSBertModel(nn.Module):
    """BERT encoder with a single-logit head over the last four layers.

    The position-0 vectors of the last four hidden states are concatenated,
    projected to one value by a linear layer and passed through a sigmoid.
    """

    def __init__(self):
        super().__init__()

        self.config = BertConfig.from_pretrained(CONFIG.model_name)
        self.config.update({"output_hidden_states": True})
        # Dropout is switched off in the encoder.
        self.config.hidden_dropout_prob = 0
        self.config.attention_probs_dropout_prob = 0
        self.bert = BertModel.from_pretrained(CONFIG.model_name, config=self.config)
        self.regressor = nn.Linear(self.config.hidden_size*4, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return one probability per input row as a 1-D tensor.

        Args:
            input_ids: token id tensor passed to the encoder.
            attention_mask: attention mask tensor passed to the encoder.

        Returns:
            Tensor of shape ``(batch,)`` with values in (0, 1).
        """
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_four = [output["hidden_states"][-1*i][:, 0] for i in range(1, 4+1)]
        sequence_output = torch.cat(last_four, dim=1)
        bert_output = self.sigmoid(self.regressor(sequence_output))

        # Squeeze only the trailing size-1 dimension. A bare squeeze() would
        # also remove the batch dimension for a batch of one, giving a 0-d
        # tensor that neither BCELoss (shape mismatch with the target) nor
        # np.concatenate can handle.
        return bert_output.squeeze(-1)

In [None]:
class AverageMeter(object):
    """Tracks the latest value and the running weighted mean of a scalar."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, the total, the count and the mean."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the mean.

        Args:
            val: the new observation (e.g. a batch-mean loss).
            n: weight of the observation (e.g. the batch size).
        """
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration in seconds as ``"<minutes>m <seconds>s"`` (both shown as integers)."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return "%dm %ds" % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Describe elapsed time and the projected remaining time.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work done so far (must be non-zero).

    Returns:
        A string of the form ``"<elapsed> (remain <remaining>)"``.
    """
    elapsed = time.time() - since
    projected_total = elapsed / (percent)
    return "%s (remain %s)" % (asMinutes(elapsed), asMinutes(projected_total - elapsed))

In [None]:
# 学習
def train_fn(model, train_loader, optimizer, epoch, loss_function, scheduler=None):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: network called as ``model(input_ids, attention_mask)``.
        train_loader: yields ``(input_ids, attention_mask, target)`` batches.
        optimizer: optimizer stepped once per batch.
        epoch: zero-based epoch number, used only in the progress message.
        loss_function: callable ``loss_function(pred, target)``.
        scheduler: optional LR scheduler, stepped once per batch when given.

    Returns:
        The mean training loss over the epoch, weighted by batch size.
    """
    start = time.time()
    losses = AverageMeter()
    model.train()
    final_batch = len(train_loader) - 1

    for batch_num, batch in enumerate(train_loader):
        input_ids, attention_mask, target = (tensor.to(DEVICE) for tensor in batch)

        optimizer.zero_grad()
        pred = model(input_ids, attention_mask)

        # Loss, backward pass and parameter update.
        loss = loss_function(pred, target)
        losses.update(loss.item(), target.size(0))
        loss.backward()
        optimizer.step()

        if scheduler:
            scheduler.step()

        # Report every 100 batches and on the last batch.
        if batch_num % 100 == 0 or batch_num == final_batch:
            print(
                f"Epoch: [{epoch + 1}][{batch_num}/{len(train_loader)}] "
                f"Elapsed {timeSince(start, float(batch_num + 1) / len(train_loader)):s} "
                f"Loss: {losses.avg:.4f} "
            )

    return losses.avg

def valid_fn(valid_loader, model, loss_function):
    """Evaluate ``model`` on ``valid_loader`` without gradient tracking.

    Args:
        valid_loader: yields ``(input_ids, attention_mask, target)`` batches.
        model: network called as ``model(input_ids, attention_mask)``.
        loss_function: callable ``loss_function(pred, target)``.

    Returns:
        ``(avg_loss, predictions)``: the batch-size-weighted mean loss and a
        1-D NumPy array with the predictions of all batches in loader order.
    """
    start = time.time()
    losses = AverageMeter()

    model.eval()
    preds = []

    for batch_num, (input_ids, attention_mask, target) in enumerate(valid_loader):
        input_ids = input_ids.to(DEVICE)
        attention_mask = attention_mask.to(DEVICE)
        target = target.to(DEVICE)
        batch_size = target.size(0)

        with torch.no_grad():
            # Force a 1-D (batch,) prediction. A final batch holding a single
            # sample can otherwise come back as a 0-d tensor, which fails in
            # the loss (shape mismatch) and in np.concatenate below.
            pred = model(input_ids, attention_mask).reshape(-1)
            loss = loss_function(pred, target)

        losses.update(loss.item(), batch_size)

        # Collect the scores on the CPU.
        preds.append(pred.to("cpu").numpy())

        if batch_num % 100 == 0 or batch_num == (len(valid_loader) - 1):
            # Trailing spaces separate the three fields, matching train_fn's
            # progress message (they previously ran together).
            print(
                f"EVAL: [{batch_num}/{len(valid_loader)}] "
                f"Elapsed {timeSince(start, float(batch_num+1) / len(valid_loader)):s} "
                f"Loss: {losses.avg:.4f} "
            )
    predictions = np.concatenate(preds)

    return losses.avg, predictions

# 予測
def inference():
    """Predict the module-level ``test`` frame with every fold's best checkpoint.

    For each of the ``CONFIG.fold`` folds, a fresh ``SRWSBertModel`` is built,
    the saved ``"model"`` state dict is loaded from
    ``<model_dir><exp>/<model>_fold<k>_best.pth``, and the test set is scored.

    Returns:
        1-D NumPy array: the per-row mean of the fold predictions.
    """
    test_dataset = SRWSDataset(test,  inference_only=True)
    test_loader = DataLoader(
        test_dataset,
        batch_size=CONFIG.valid_batch_size,
        shuffle=False,
        num_workers=4,
        pin_memory=True
    )

    predictions = []
    for fold in range(CONFIG.fold):
        LOGGER.info(f"========== model: {CONFIG.model_name} fold: {fold} inference ==========")
        model = SRWSBertModel()
        model.to(DEVICE)
        # map_location lets a checkpoint written on GPU be restored on
        # whatever device this session actually has.
        checkpoint = torch.load(
            model_dir +CONFIG.exp + "/"+ f"{CONFIG.model_name.split('/')[1]}_fold{fold}_best.pth",
            map_location=DEVICE,
        )
        model.load_state_dict(checkpoint["model"])
        model.eval()

        preds = []
        for input_ids, attention_mask in tqdm(test_loader, total=len(test_loader)):
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            with torch.no_grad():
                # Keep predictions 1-D even for a final batch of one sample,
                # otherwise np.concatenate rejects the 0-d array.
                y_preds = model(input_ids, attention_mask).reshape(-1)
            preds.append(y_preds.to("cpu").numpy())
        predictions.append(np.concatenate(preds))

    # Average the folds row by row.
    return np.mean(predictions, axis=0)

In [None]:
# Threshold optimization (called from train_loop and get_result)
# https://signate.jp/competitions/471/discussions/tf-roberta-base-baseline-cv08949-lb08734

def opt_fbeta_threshold(y_true, y_pred):
    """Find the decision threshold that maximizes the F-beta score (beta=7).

    Powell's method is started at 0.02 and minimizes the negated score of
    ``y_pred >= threshold`` against ``y_true``.

    Args:
        y_true: ground-truth binary labels.
        y_pred: predicted scores, comparable element-wise with a scalar.

    Returns:
        The optimized threshold as a Python float.
    """
    def negative_fbeta(threshold):
        return -fbeta_score(y_true, y_pred >= threshold, beta=7)

    search = minimize(negative_fbeta, x0=np.array([0.02]), method='Powell')
    return search['x'].item()

In [None]:
def get_optimizer_grouped_parameters(model, base_lr=None, head_lr=1e-3,
                                     layer_decay=0.95, weight_decay=0.1):
    """Build optimizer parameter groups with layer-wise learning-rate decay.

    The first group holds every parameter whose name contains ``lstm``,
    ``cnn`` or ``regressor`` (the task head) at ``head_lr`` without weight
    decay. Then, walking from the last encoder layer down to the embeddings,
    each layer contributes two groups (weights with decay, biases/LayerNorm
    weights without) at a learning rate that is multiplied by ``layer_decay``
    at every step, starting from ``base_lr * layer_decay``.

    Args:
        model: module exposing ``model.bert.embeddings`` and
            ``model.bert.encoder.layer``.
        base_lr: learning rate before the first decay step; defaults to
            ``CONFIG.lr``.
        head_lr: learning rate of the head group.
        layer_decay: multiplicative decay applied per layer.
        weight_decay: weight decay for the decayed encoder groups.

    Returns:
        A list of parameter-group dicts usable by a torch optimizer.
    """
    model_type = 'bert'
    no_decay = ("bias", "LayerNorm.weight")
    head_markers = ('lstm', 'cnn', 'regressor')
    if base_lr is None:
        base_lr = CONFIG.lr

    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters()
                       if any(marker in n for marker in head_markers)],
            "weight_decay": 0.0,
            "lr": head_lr,
        },
    ]

    backbone = getattr(model, model_type)
    layers = [backbone.embeddings] + list(backbone.encoder.layer)

    lr = base_lr
    # Top layer first, so it keeps the highest learning rate.
    for layer in reversed(layers):
        lr *= layer_decay
        decayed, exempt = [], []
        for n, p in layer.named_parameters():
            (exempt if any(nd in n for nd in no_decay) else decayed).append(p)
        optimizer_grouped_parameters += [
            {"params": decayed, "weight_decay": weight_decay, "lr": lr},
            {"params": exempt, "weight_decay": 0.0, "lr": lr},
        ]
    return optimizer_grouped_parameters

In [None]:
# LOOP
def train_loop(train, fold):
  """Train and validate one cross-validation fold.

  Rows whose ``fold`` column differs from ``fold`` are used for training and
  the remaining rows for validation. After each epoch the F-beta (beta=7)
  threshold is optimized on the validation predictions, and the model plus
  those predictions are checkpointed whenever the score improves.

  Args:
      train: frame with ``fold``, ``judgement`` and ``title_abst`` columns.
      fold: id of the fold held out for validation.

  Returns:
      ``(valid_folds, best_borders)``: the validation rows with a ``preds``
      column taken from the best checkpoint, and the list of optimized
      thresholds (one per epoch).
  """
  LOGGER.info(f"========== fold: {fold} training ==========")

  # ====================================================
  # Data Loader
  # ====================================================
  trn_idx = train[train["fold"] != fold].index
  val_idx = train[train["fold"] == fold].index

  train_folds = train.loc[trn_idx].reset_index(drop=True)
  valid_folds = train.loc[val_idx].reset_index(drop=True)

  train_dataset = SRWSDataset(train_folds)
  valid_dataset = SRWSDataset(valid_folds)

  train_loader = DataLoader(
        train_dataset,
        batch_size=CONFIG.train_batch_size,
        shuffle=True,
        num_workers=4,
        pin_memory=True, # https://qiita.com/sugulu_Ogawa_ISID/items/62f5f7adee083d96a587
        drop_last=True,
  )
  valid_loader = DataLoader(
        valid_dataset,
        batch_size=CONFIG.valid_batch_size,
        shuffle=False,
        num_workers=4,
        pin_memory=True,
        drop_last=False,
  )

  # ====================================================
  # Model
  # ====================================================
  model = SRWSBertModel()
  model.to(DEVICE)

  # Parameter groups with layer-wise learning rates.
  optimizer_parameters = get_optimizer_grouped_parameters(model)

  #optimizer = AdamW(model.parameters(), lr=CONFIG.lr)
  optimizer = AdamW(optimizer_parameters, lr=CONFIG.lr, weight_decay=0.1)

  # Loss_function
  loss_function = nn.BCELoss()

  # ====================================================
  # LOOP
  # ====================================================

  best_score = -1
  # NOTE(review): best_loss is assigned but never read in this function.
  best_loss = np.inf
  best_borders=[]

  # Training epochs
  for epoch in range(CONFIG.epochs):
    start_time = time.time()

    # train (no scheduler is passed, so train_fn does not step one)
    avg_loss = train_fn(model, train_loader, optimizer, epoch, loss_function)

    # valid
    avg_val_loss, preds = valid_fn(valid_loader, model,loss_function)
    valid_labels = valid_folds["judgement"].values

    # Optimize the decision threshold on this epoch's validation predictions
    border_m = opt_fbeta_threshold(valid_labels, preds)
    best_borders.append(border_m)

    # score
    score = fbeta_score(valid_labels, np.where(preds < border_m, 0, 1), beta=7.0)

    elapsed = time.time() - start_time
    LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
    )
    LOGGER.info(f"Epoch {epoch+1} - Score: {score}")

    # Checkpoint the weights together with the validation predictions.
    if score > best_score:
      best_score = score
      LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} ")
      torch.save(
                {"model": model.state_dict(), "preds": preds}, model_dir +CONFIG.exp + "/"+ f"{CONFIG.model_name.split('/')[1]}_fold{fold}_best.pth"
      ) # changed for scibert
  # Reload the best checkpoint to recover the predictions of the best epoch.
  check_point = torch.load(model_dir +CONFIG.exp + "/"+ f"{CONFIG.model_name.split('/')[1]}_fold{fold}_best.pth")

  valid_folds["preds"] = check_point["preds"]

  return valid_folds,best_borders

In [None]:
def get_result(result_df):
    """Optimize the threshold on ``result_df`` and report the F-beta (beta=7) score.

    Prints the threshold and the score and logs the score.

    Args:
        result_df: frame with ``preds`` (scores) and ``judgement`` (labels).
    """
    labels = result_df["judgement"].values
    preds = result_df["preds"].values

    best_threshold = opt_fbeta_threshold(labels, preds)
    print("Best_Threshold：" + str(best_threshold))

    # Original author's note: scoring the raw continuous predictions raised
    # "ValueError: Classification metrics can't handle a mix of continuous and
    # binary targets", hence the explicit 0/1 conversion.
    hard_labels = np.where(preds < best_threshold, 0, 1)
    score = fbeta_score(labels, hard_labels, beta=7.0)
    print("Score：" + str(score))
    LOGGER.info(f"Score: {score:<.5f}")

# inference用に、best_thresholdを出力するようにする関数
def get_result_for_cv(result_df,best_border):
    """Score ``result_df`` with a threshold supplied by the caller.

    Unlike ``get_result`` the threshold is not re-optimized here.

    Args:
        result_df: frame with ``preds`` (scores) and ``judgement`` (labels).
        best_border: decision threshold; scores below it become 0, others 1.

    Returns:
        The F-beta (beta=7) score, which is also logged.
    """
    labels = result_df["judgement"].values
    preds = result_df["preds"].values
    print("Best_Threshold：" + str(best_border))

    # Scores are converted to hard 0/1 labels before being passed to fbeta_score.
    hard_labels = np.where(preds < best_border, 0, 1)
    score = fbeta_score(labels, hard_labels, beta=7.0)
    LOGGER.info(f"Score: {score:<.5f}")

    return score

def mean_best_border(*best_borders):
    """Average the given thresholds, print and log the result, and return it.

    Args:
        *best_borders: thresholds, or sequences of thresholds, to average
            with ``np.mean``.

    Returns:
        The mean threshold.
    """
    averaged = np.mean(best_borders)
    print("Best_Threshold：" + str(averaged))
    LOGGER.info(f"Best_Border: {averaged:<.8f}")
    return averaged

In [None]:
def clean_stopword(text):
  """Remove NLTK English stopwords from whitespace-separated ``text``.

  Matching is exact (case-sensitive) and the remaining words are re-joined
  with single spaces.

  Args:
      text: input string.

  Returns:
      The text without stopword tokens.
  """
  # A set gives constant-time membership tests; the list returned by NLTK was
  # previously scanned once for every word.
  stopwords = frozenset(nltk.corpus.stopwords.words('english'))
  return ' '.join(w for w in text.split() if w not in stopwords)

def clean_puncts(x):
  """Delete every punctuation mark, symbol and mojibake fragment listed below from ``x``.

  Args:
      x: input string.

  Returns:
      ``x`` with all listed tokens removed.
  """
  # Open question from the author: chemical formulas appear in the text, so
  # maybe '-' should not be stripped?

  puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
            '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…',
            '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─',
            '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '«',
            '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', '（', '）', '～',
            '➡', '％', '⇒', '▶', '「', '➄', '➆',  '➊', '➋', '➌', '➍', '⓪', '①', '②', '③', '④', '⑤', '⑰', '❶', '❷', '❸', '❹', '❺', '❻', '❼', '❽',  
            '＝', '※', '㈱', '､', '△', '℮', 'ⅼ', '‐', '｣', '┝', '↳', '◉', '／', '＋', '○',
            '【', '】', '✅', '☑', '➤', 'ﾞ', '↳', '〶', '☛', '｢', '⁺', '『', '≫',
            'Â©', '<sub>','Aﾎｲ', 'ﾎｲ', "ﾃｩ"
          ] 
  # Open question from the author: is this the only place where mojibake can
  # be handled?

  # Longest tokens first. Multi-character entries such as '<sub>' could never
  # match before, because '<' and '>' had already been stripped by the time
  # they were tried. sorted() is stable, so equal-length tokens keep their
  # listed order.
  for punct in sorted(puncts, key=len, reverse=True):
    x = x.replace(punct, '')
  return x

def _pre_preprocess(x):
  """Convert ``x`` to ``str`` and lower-case it."""
  return str(x).lower() 

def rm_num(x, use_num=True):
  """Delete every ASCII digit (0-9) and full-width digit (０-９) from ``x``.

  Args:
      x: input string.
      use_num: unused; kept so existing calls keep working.

  Returns:
      ``x`` without digit characters; all other characters, including the
      spaces around removed numbers, are left in place.
  """
  # One pass replaces the earlier sequence of four length-specific regex
  # substitutions and twenty str.replace calls, which together removed
  # exactly this set of characters.
  return re.sub('[0-9０-９]', '', x)

def convert_mojibake(text):
  """Re-encode ``text`` as Shift-JIS and decode the bytes as UTF-8.

  Bytes that are not valid UTF-8 are dropped while decoding. Characters that
  Shift-JIS cannot encode raise ``UnicodeEncodeError``.
  """
  return text.encode("shift-jis").decode("utf-8", errors="ignore")

def remove_double(text):
  """Collapse every run of consecutive spaces in ``text`` into a single space.

  Args:
      text: input string.

  Returns:
      The text with no two adjacent space characters.
  """
  # A single replace() halves runs only once ("a   b" -> "a  b"); repeat until
  # no double space is left. Such runs appear after digits are stripped out.
  while "  " in text:
    text = text.replace("  ", " ")
  return text

def preprocess_text(text):
  """Apply the active cleaning steps: strip digits, then replace double spaces.

  Lower-casing, stopword removal and punctuation removal are available as
  helpers (_pre_preprocess, clean_stopword, clean_puncts) but are not applied.
  """
  without_digits = rm_num(text)
  return remove_double(without_digits)

def split_copyright(text):
  """Return the part of ``text`` before the first ``Copyright``; unchanged if absent."""
  if "Copyright" not in text:
    return text
  head, _, _ = text.partition('Copyright')
  return head


In [None]:
# Limit displayed column width so long titles/abstracts are truncated in outputs.
pd.set_option("display.max_colwidth", 50)
# Competition files read from input_dir.
train = pd.read_csv(input_dir + "train.csv")
test = pd.read_csv(input_dir + "test.csv")
# The sample submission is read without a header row; name its two columns explicitly.
sub = pd.read_csv(input_dir + "sample_submit.csv", header=None)
sub.columns = ["id", "judgement"]

In [None]:
# Label corrections
# https://signate.jp/competitions/471/discussions/20210816152356-59

# Set judgement to 0 for these two ids.
train.loc[train["id"].isin([2488, 7708]), "judgement"] = 0

In [None]:
train = get_train_data(train)
train.head()

Unnamed: 0,id,title,abstract,judgement,fold
0,0,One-year age changes in MRI brain volumes in o...,Longitudinal studies indicate that declines in...,0,4
1,1,Supportive CSF biomarker evidence to enhance t...,The present study was undertaken to validate t...,0,3
2,2,Occurrence of basal ganglia germ cell tumors w...,Objective: To report a case series in which ba...,0,2
3,3,New developments in diagnosis and therapy of C...,The etiology and pathogenesis of idiopathic ch...,0,0
4,4,Prolonged shedding of SARS-CoV-2 in an elderly...,,0,1


In [None]:
#train["title"] = cleansing_hero_only_text(train,"title")
#train["abstract"] = cleansing_hero_only_text(train,"abstract")
#test["title"] = cleansing_hero_only_text(test,"title")
#test["abstract"] = cleansing_hero_only_text(test,"abstract")

In [None]:
# Build the model input from title and abstract.
# - A space is inserted between the two parts; without it the last word of the
#   title and the first word of the abstract were glued into one token.
# - Rows with a missing abstract fall back to the title alone.
# - fillna() is assigned back instead of using inplace=True on a selected
#   column, which pandas does not guarantee to write through to the frame.
for _frame in (train, test):
    _frame["title_abst"] = (_frame["title"] + " " + _frame["abstract"]).fillna(_frame["title"])
    # preprocess
    _frame["title_abst"] = _frame["title_abst"].apply(preprocess_text)
train

Unnamed: 0,id,title,abstract,judgement,fold,title_abst
0,0,One-year age changes in MRI brain volumes in o...,Longitudinal studies indicate that declines in...,0,4,One-year age changes in MRI brain volumes in o...
1,1,Supportive CSF biomarker evidence to enhance t...,The present study was undertaken to validate t...,0,3,Supportive CSF biomarker evidence to enhance t...
2,2,Occurrence of basal ganglia germ cell tumors w...,Objective: To report a case series in which ba...,0,2,Occurrence of basal ganglia germ cell tumors w...
3,3,New developments in diagnosis and therapy of C...,The etiology and pathogenesis of idiopathic ch...,0,0,New developments in diagnosis and therapy of C...
4,4,Prolonged shedding of SARS-CoV-2 in an elderly...,,0,1,Prolonged shedding of SARS-CoV- in an elderly ...
...,...,...,...,...,...,...
27140,27140,The amyloidogenic pathway of amyloid precursor...,Amyloid beta-protein (A beta) is the main cons...,0,1,The amyloidogenic pathway of amyloid precursor...
27141,27141,Technologic developments in radiotherapy and s...,We present a review of current technological p...,0,4,Technologic developments in radiotherapy and s...
27142,27142,Novel screening cascade identifies MKK4 as key...,Phosphorylation of Tau at serine 422 promotes ...,0,2,Novel screening cascade identifies MKK as key ...
27143,27143,Visualization of the gall bladder on F-18 FDOP...,The ability to label dihydroxyphenylalanine (D...,0,2,Visualization of the gall bladder on F- FDOPA ...


In [None]:
# Drop rows whose title has three words or fewer.
train["title_word_len"] = train["title"].str.split(" ").str.len()
# .copy() makes the filtered frame independent, so the column assignment below
# no longer triggers SettingWithCopyWarning.
train = train[train["title_word_len"]>3].copy()

# Drop rows whose title is detected as another language.
# ("un" is presumably pycld2's code for an undetermined language -- TODO confirm.)
train["title_lang"] = train["title"].fillna("").map(lambda x: cld2.detect(x)[2][0][1])
train = train[(train["title_lang"]=="en")|(train["title_lang"]=="un")].copy()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
train

Unnamed: 0,id,title,abstract,judgement,fold,title_abst,title_word_len,title_lang
0,0,One-year age changes in MRI brain volumes in o...,Longitudinal studies indicate that declines in...,0,4,One-year age changes in MRI brain volumes in o...,10,en
1,1,Supportive CSF biomarker evidence to enhance t...,The present study was undertaken to validate t...,0,3,Supportive CSF biomarker evidence to enhance t...,23,en
2,2,Occurrence of basal ganglia germ cell tumors w...,Objective: To report a case series in which ba...,0,2,Occurrence of basal ganglia germ cell tumors w...,10,en
3,3,New developments in diagnosis and therapy of C...,The etiology and pathogenesis of idiopathic ch...,0,0,New developments in diagnosis and therapy of C...,13,en
4,4,Prolonged shedding of SARS-CoV-2 in an elderly...,,0,1,Prolonged shedding of SARS-CoV- in an elderly ...,16,en
...,...,...,...,...,...,...,...,...
27140,27140,The amyloidogenic pathway of amyloid precursor...,Amyloid beta-protein (A beta) is the main cons...,0,1,The amyloidogenic pathway of amyloid precursor...,15,en
27141,27141,Technologic developments in radiotherapy and s...,We present a review of current technological p...,0,4,Technologic developments in radiotherapy and s...,7,en
27142,27142,Novel screening cascade identifies MKK4 as key...,Phosphorylation of Tau at serine 422 promotes ...,0,2,Novel screening cascade identifies MKK as key ...,13,en
27143,27143,Visualization of the gall bladder on F-18 FDOP...,The ability to label dihydroxyphenylalanine (D...,0,2,Visualization of the gall bladder on F- FDOPA ...,13,en


In [None]:
# Training
#border = len(train[train["judgement"] == 1]) / len(train["judgement"]) # 0.023245467689912133
#border = border * 0.6

mean_border_folds = []

# Collect the per-fold out-of-fold frames and concatenate once at the end,
# rather than growing a DataFrame (starting from an empty one) inside the loop.
_oof_frames = []
for fold in range(CONFIG.fold):
  _oof_df,best_borders = train_loop(train, fold)
  _oof_frames.append(_oof_df)
  LOGGER.info(f"========== fold: {fold} result ==========")
  best_border_fold = mean_best_border(best_borders)
  mean_border_folds.append(best_border_fold)
oof_df = pd.concat(_oof_frames)

# CV result: score all out-of-fold predictions with the mean of the fold thresholds.
LOGGER.info(f"========== CV ==========")
best_border = mean_best_border(mean_border_folds)
get_result_for_cv(oof_df,best_border)

# Save OOF result
oof_df.to_csv(pred_dir +CONFIG.exp + "/oof_df.csv", index=False)



Epoch: [1][0/1345] Elapsed 0m 0s (remain 15m 8s) Loss: 0.9872 
Epoch: [1][100/1345] Elapsed 0m 43s (remain 9m 1s) Loss: 0.1161 
Epoch: [1][200/1345] Elapsed 1m 27s (remain 8m 15s) Loss: 0.0940 
Epoch: [1][300/1345] Elapsed 2m 10s (remain 7m 32s) Loss: 0.0880 
Epoch: [1][400/1345] Elapsed 2m 53s (remain 6m 49s) Loss: 0.0817 
Epoch: [1][500/1345] Elapsed 3m 37s (remain 6m 6s) Loss: 0.0773 
Epoch: [1][600/1345] Elapsed 4m 20s (remain 5m 22s) Loss: 0.0742 
Epoch: [1][700/1345] Elapsed 5m 3s (remain 4m 39s) Loss: 0.0706 
Epoch: [1][800/1345] Elapsed 5m 47s (remain 3m 55s) Loss: 0.0686 
Epoch: [1][900/1345] Elapsed 6m 30s (remain 3m 12s) Loss: 0.0662 
Epoch: [1][1000/1345] Elapsed 7m 13s (remain 2m 28s) Loss: 0.0644 
Epoch: [1][1100/1345] Elapsed 7m 56s (remain 1m 45s) Loss: 0.0621 
Epoch: [1][1200/1345] Elapsed 8m 40s (remain 1m 2s) Loss: 0.0605 
Epoch: [1][1300/1345] Elapsed 9m 23s (remain 0m 19s) Loss: 0.0588 
Epoch: [1][1344/1345] Elapsed 9m 42s (remain 0m 0s) Loss: 0.0583 
EVAL: [0/169]

Epoch 1 - avg_train_loss: 0.0583  avg_val_loss: 0.0441  time: 630s
Epoch 1 - avg_train_loss: 0.0583  avg_val_loss: 0.0441  time: 630s
Epoch 1 - Score: 0.9201979585524281
Epoch 1 - Score: 0.9201979585524281
Epoch 1 - Save Best Score: 0.9202 
Epoch 1 - Save Best Score: 0.9202 
Best_Border: 0.05809437
Best_Border: 0.05809437


Best_Threshold：0.058094373033049984
Epoch: [1][0/1346] Elapsed 0m 0s (remain 15m 5s) Loss: 1.4011 
Epoch: [1][100/1346] Elapsed 0m 44s (remain 9m 3s) Loss: 0.1126 
Epoch: [1][200/1346] Elapsed 1m 27s (remain 8m 18s) Loss: 0.0916 
Epoch: [1][300/1346] Elapsed 2m 10s (remain 7m 34s) Loss: 0.0797 
Epoch: [1][400/1346] Elapsed 2m 54s (remain 6m 50s) Loss: 0.0750 
Epoch: [1][500/1346] Elapsed 3m 37s (remain 6m 6s) Loss: 0.0681 
Epoch: [1][600/1346] Elapsed 4m 20s (remain 5m 23s) Loss: 0.0687 
Epoch: [1][700/1346] Elapsed 5m 4s (remain 4m 39s) Loss: 0.0668 
Epoch: [1][800/1346] Elapsed 5m 47s (remain 3m 56s) Loss: 0.0659 
Epoch: [1][900/1346] Elapsed 6m 30s (remain 3m 12s) Loss: 0.0649 
Epoch: [1][1000/1346] Elapsed 7m 13s (remain 2m 29s) Loss: 0.0622 
Epoch: [1][1100/1346] Elapsed 7m 57s (remain 1m 46s) Loss: 0.0603 
Epoch: [1][1200/1346] Elapsed 8m 40s (remain 1m 2s) Loss: 0.0586 
Epoch: [1][1300/1346] Elapsed 9m 23s (remain 0m 19s) Loss: 0.0578 
Epoch: [1][1345/1346] Elapsed 9m 42s (remai

Epoch 1 - avg_train_loss: 0.0574  avg_val_loss: 0.0418  time: 630s
Epoch 1 - avg_train_loss: 0.0574  avg_val_loss: 0.0418  time: 630s
Epoch 1 - Score: 0.9235231262402687
Epoch 1 - Score: 0.9235231262402687
Epoch 1 - Save Best Score: 0.9235 
Epoch 1 - Save Best Score: 0.9235 


EVAL: [167/168]Elapsed 0m 47s (remain 0m 0s)Loss: 0.0418


Best_Border: 0.02863160
Best_Border: 0.02863160


Best_Threshold：0.02863159795171475
Epoch: [1][0/1345] Elapsed 0m 0s (remain 15m 13s) Loss: 0.5593 
Epoch: [1][100/1345] Elapsed 0m 44s (remain 9m 3s) Loss: 0.1078 
Epoch: [1][200/1345] Elapsed 1m 27s (remain 8m 17s) Loss: 0.0945 
Epoch: [1][300/1345] Elapsed 2m 10s (remain 7m 33s) Loss: 0.0788 
Epoch: [1][400/1345] Elapsed 2m 53s (remain 6m 49s) Loss: 0.0786 
Epoch: [1][500/1345] Elapsed 3m 37s (remain 6m 5s) Loss: 0.0766 
Epoch: [1][600/1345] Elapsed 4m 20s (remain 5m 22s) Loss: 0.0719 
Epoch: [1][700/1345] Elapsed 5m 3s (remain 4m 39s) Loss: 0.0681 
Epoch: [1][800/1345] Elapsed 5m 47s (remain 3m 55s) Loss: 0.0661 
Epoch: [1][900/1345] Elapsed 6m 30s (remain 3m 12s) Loss: 0.0637 
Epoch: [1][1000/1345] Elapsed 7m 13s (remain 2m 29s) Loss: 0.0620 
Epoch: [1][1100/1345] Elapsed 7m 57s (remain 1m 45s) Loss: 0.0602 
Epoch: [1][1200/1345] Elapsed 8m 40s (remain 1m 2s) Loss: 0.0580 
Epoch: [1][1300/1345] Elapsed 9m 23s (remain 0m 19s) Loss: 0.0576 
Epoch: [1][1344/1345] Elapsed 9m 42s (remai

Epoch 1 - avg_train_loss: 0.0574  avg_val_loss: 0.0464  time: 630s
Epoch 1 - avg_train_loss: 0.0574  avg_val_loss: 0.0464  time: 630s
Epoch 1 - Score: 0.9237875288683602
Epoch 1 - Score: 0.9237875288683602
Epoch 1 - Save Best Score: 0.9238 
Epoch 1 - Save Best Score: 0.9238 
Best_Border: 0.12049942
Best_Border: 0.12049942


Best_Threshold：0.12049942208174554
Epoch: [1][0/1346] Elapsed 0m 0s (remain 15m 19s) Loss: 0.6409 
Epoch: [1][100/1346] Elapsed 0m 43s (remain 9m 1s) Loss: 0.0738 
Epoch: [1][200/1346] Elapsed 1m 27s (remain 8m 16s) Loss: 0.0772 
Epoch: [1][300/1346] Elapsed 2m 10s (remain 7m 32s) Loss: 0.0735 
Epoch: [1][400/1346] Elapsed 2m 53s (remain 6m 49s) Loss: 0.0691 
Epoch: [1][500/1346] Elapsed 3m 37s (remain 6m 6s) Loss: 0.0699 
Epoch: [1][600/1346] Elapsed 4m 20s (remain 5m 22s) Loss: 0.0677 
Epoch: [1][700/1346] Elapsed 5m 3s (remain 4m 39s) Loss: 0.0657 
Epoch: [1][800/1346] Elapsed 5m 46s (remain 3m 55s) Loss: 0.0653 
Epoch: [1][900/1346] Elapsed 6m 30s (remain 3m 12s) Loss: 0.0646 
Epoch: [1][1000/1346] Elapsed 7m 13s (remain 2m 29s) Loss: 0.0623 
Epoch: [1][1100/1346] Elapsed 7m 56s (remain 1m 46s) Loss: 0.0610 
Epoch: [1][1200/1346] Elapsed 8m 39s (remain 1m 2s) Loss: 0.0586 
Epoch: [1][1300/1346] Elapsed 9m 23s (remain 0m 19s) Loss: 0.0585 
Epoch: [1][1345/1346] Elapsed 9m 42s (remai

Epoch 1 - avg_train_loss: 0.0580  avg_val_loss: 0.0490  time: 630s
Epoch 1 - avg_train_loss: 0.0580  avg_val_loss: 0.0490  time: 630s
Epoch 1 - Score: 0.9210327646081836
Epoch 1 - Score: 0.9210327646081836
Epoch 1 - Save Best Score: 0.9210 
Epoch 1 - Save Best Score: 0.9210 
Best_Border: 0.00147955
Best_Border: 0.00147955


Best_Threshold：0.0014795493218492403
Epoch: [1][0/1346] Elapsed 0m 0s (remain 16m 24s) Loss: 0.9640 
Epoch: [1][100/1346] Elapsed 0m 43s (remain 9m 2s) Loss: 0.1230 
Epoch: [1][200/1346] Elapsed 1m 27s (remain 8m 16s) Loss: 0.1015 
Epoch: [1][300/1346] Elapsed 2m 10s (remain 7m 32s) Loss: 0.0898 
Epoch: [1][400/1346] Elapsed 2m 53s (remain 6m 49s) Loss: 0.0795 
Epoch: [1][500/1346] Elapsed 3m 37s (remain 6m 6s) Loss: 0.0772 
Epoch: [1][600/1346] Elapsed 4m 20s (remain 5m 23s) Loss: 0.0737 
Epoch: [1][700/1346] Elapsed 5m 4s (remain 4m 39s) Loss: 0.0703 
Epoch: [1][800/1346] Elapsed 5m 47s (remain 3m 56s) Loss: 0.0686 
Epoch: [1][900/1346] Elapsed 6m 30s (remain 3m 13s) Loss: 0.0669 
Epoch: [1][1000/1346] Elapsed 7m 14s (remain 2m 29s) Loss: 0.0656 
Epoch: [1][1100/1346] Elapsed 7m 57s (remain 1m 46s) Loss: 0.0638 
Epoch: [1][1200/1346] Elapsed 8m 40s (remain 1m 2s) Loss: 0.0612 
Epoch: [1][1300/1346] Elapsed 9m 23s (remain 0m 19s) Loss: 0.0595 
Epoch: [1][1345/1346] Elapsed 9m 43s (rem

Epoch 1 - avg_train_loss: 0.0592  avg_val_loss: 0.0465  time: 631s
Epoch 1 - avg_train_loss: 0.0592  avg_val_loss: 0.0465  time: 631s
Epoch 1 - Score: 0.8694302126998914
Epoch 1 - Score: 0.8694302126998914
Epoch 1 - Save Best Score: 0.8694 
Epoch 1 - Save Best Score: 0.8694 
Best_Border: 0.01172707
Best_Border: 0.01172707
Best_Border: 0.04408640
Best_Border: 0.04408640
Score: 0.87004
Score: 0.87004


Best_Threshold：0.011727065799933973
Best_Threshold：0.044086401637658694
Best_Threshold：0.044086401637658694


In [None]:
best_border

0.044086401637658694

In [None]:
# Fold-averaged scores for the test set.
predictions = inference()

# Keep the raw scores for later stacking.
pred_df = pd.DataFrame({"id": test["id"], "judgement": predictions})
pred_df.to_csv(pred_dir +CONFIG.exp + "/pred_df.csv", index=False)

# Apply the CV threshold: scores below best_border become 0, all others 1.
predictions = np.where(predictions < best_border, 0, 1)

# submission (written without index or header)
sub["judgement"] = predictions
sub.to_csv(submission_dir +CONFIG.exp+ "_submission.csv", index=False, header=False)