In [1]:
# from pretrain.examples.extract_features import *
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForPreTraining
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
#from kumc_dataset import InputDataset
from pynvml.smi import nvidia_smi
from torch.optim import Adam
from tqdm import tqdm
from torch import nn
import numpy as np
import pickle, re
import torch
import json
import os
import csv

In [2]:

class ScheduledOptim():
    """Thin optimizer wrapper that recomputes the learning rate on every step.

    The rate written into each parameter group is
    ``d_model ** -0.5 * min(step ** -0.5, step * n_warmup_steps ** -1.5)``,
    i.e. it grows linearly in ``step`` until ``n_warmup_steps`` and then
    decays with the inverse square root of ``step``.
    """

    def __init__(self, optimizer, d_model, n_warmup_steps):
        """Wrap ``optimizer``; ``d_model`` fixes the base rate, ``n_warmup_steps`` the ramp length."""
        self.init_lr = np.power(d_model, -0.5)
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        self._optimizer = optimizer

    def step_and_update_lr(self):
        """Advance the schedule by one step, then run the wrapped optimizer's ``step()``."""
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        """Forward ``zero_grad()`` to the wrapped optimizer."""
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        """Return the schedule multiplier for the current step count."""
        step = self.n_current_steps
        decay_term = np.power(step, -0.5)
        warmup_term = np.power(self.n_warmup_steps, -1.5) * step
        return np.min([decay_term, warmup_term])

    def _update_learning_rate(self):
        """Increment the step counter and write the new rate into every parameter group."""
        # The counter is bumped first, so the scale is never evaluated at step 0.
        self.n_current_steps += 1
        new_lr = self.init_lr * self._get_lr_scale()

        for group in self._optimizer.param_groups:
            group['lr'] = new_lr

def to_np(t):
    """Return ``t`` as a NumPy array via ``cpu()``, ``detach()`` and ``numpy()``, in that order."""
    on_host = t.cpu()
    no_grad = on_host.detach()
    return no_grad.numpy()

In [3]:
# Release cached CUDA memory held by PyTorch's allocator before (re)building the model.
torch.cuda.empty_cache()

# HYPERPARAMETERS
n_gpu = torch.cuda.device_count()
# Prefer the second GPU (index 1) when more than one is visible; otherwise use
# GPU 0, and fall back to the CPU when CUDA is unavailable. Requesting "cuda:1"
# unconditionally is an invalid device ordinal on single-GPU machines.
if torch.cuda.is_available() and n_gpu > 0:
    device = torch.device("cuda:1" if n_gpu > 1 else "cuda:0")
else:
    device = torch.device("cpu")
batch_size = 3
seq_len = 10
epoch = 2
with_vocab = True      # selects which vocab/checkpoint pair is loaded in the model cell
gpu_write = False
savepath = "./models/"  # base output directory; the model cell appends a variant sub-directory

In [4]:
# Switch to the project directory; the relative paths used in later cells
# (vocab files, checkpoints, ./models/) are resolved against it.
# Set the KMBERT_DIR environment variable to run on another machine; when it is
# unset the original author's local path is used.
os.chdir(os.environ.get(
    "KMBERT_DIR",
    r"C:\Users\Samsung\Desktop\MyDream\Project\Medical_Legal\preview\corpus\KMBERT",
))

In [8]:
# MODELS & TOKENIZERS (KR-BERT BASELINE / KUMC BERT)
# `with_vocab` picks the vocab file / checkpoint pair and the matching output sub-directory.
if with_vocab:
    tokenizer = BertTokenizer(vocab_file="./kumc_vocab_bert.txt", do_lower_case=False)   # KUMC BERT TOKENIZER
    model = BertForPreTraining.from_pretrained("./model_ranked/")                        # KUMC BERT
    variant_dir = "with_vocab/"
else:
    tokenizer = BertTokenizer(vocab_file="./kr_bert_vocab.txt", do_lower_case=False)    # KUMC BERT TOKENIZER (without vocab)
    model = BertForPreTraining.from_pretrained("./kr_bert_model/")                      # KUMC BERT (without vocab)
    variant_dir = "without_vocab/"

# Append the variant sub-directory only if it is not already there, so that
# re-running this cell does not keep growing `savepath`
# (e.g. "./models/with_vocab/with_vocab/").
if not savepath.endswith(variant_dir):
    savepath += variant_dir

In [9]:
# Loading previous best info - DEPRECATED
# Reads <savepath>/kumc_bert.log and takes the first value recorded after the
# key 'total loss avg'. Falls back to a large sentinel when the log (or its
# directory) is missing or contains no such value.
prev_best = 999.999
_log_path = os.path.join(savepath, "kumc_bert.log")
if os.path.isfile(_log_path):
    # Context manager so the file handle is always closed.
    with open(_log_path) as _log_file:
        _log_text = _log_file.read()
    # A capture group returns only the number that follows the key; a lookahead
    # placed before the digits would not constrain what precedes them.
    _logged_losses = re.findall(r"'total loss avg': (\d+\.\d+)", _log_text)
    if _logged_losses:
        # First occurrence in the file — TODO confirm that is the entry meant as "best".
        prev_best = float(_logged_losses[0])


In [10]:
# Display the selected torch.device (the model is moved onto it in a later cell).
device

device(type='cpu')

In [11]:
# Build the loss first: targets equal to -1 are excluded from it (ignore_index=-1).
loss_fct = CrossEntropyLoss(ignore_index=-1)
loss_fct = loss_fct.to(device)
# Move the model onto the selected device as well.
model.to(device)

In [12]:
# Display the module tree of the loaded BertForPreTraining model.
model

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(16424, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )


In [13]:
# Display only the BertModel submodule (`bert`) of the pre-training model.
model.bert

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(16424, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin