In [None]:
import csv
from datetime import datetime
import json
import math
import os
import sys
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch import Tensor
from torch.nn import (TransformerDecoder, TransformerDecoderLayer,
                      TransformerEncoder, TransformerEncoderLayer)
from torch.utils.data import DataLoader
from tqdm import tqdm

In [143]:
try:
  from google.colab import drive
  IS_GOOGLE_COLAB = True
except ImportError:
  IS_GOOGLE_COLAB = False

if IS_GOOGLE_COLAB:
  mount_path = '/content/drive'
  base_folder = os.path.join(mount_path, "My Drive", "Data")
else:
  base_folder = '../../../../Data'

In [79]:
class Logger:

  @classmethod
  def connect_drive(cls, mount_path='/content/drive'):
    from google.colab import drive
    drive.mount(mount_path)

  def __init__(self, model_name, version, base_path=None, storage_handler='colab', max_retry=3, local_cache_period=10, client_id=None):
    """ Logging class to store training logs

    Args:
        model_name (str): It create a folder {base_path}/{model_name}/.
        verison (str): It create a file {base_path}/{model_name}/{model_name}_v{version}.csv.
        base_path (str, optional): Base path to store logs. If you use cloud storage, this is used as temporal folder. Defaults to None.
        storage_handler (str|BaseHandler, optional): It change storage service. 'colab' can be selected. Defaults to 'colab'.
        max_retry (int, optional): max count of retry when store logs via network. Defaults to 3.
        local_cache_period(int, optional): Valid for cloud storage only. period to chache logs until send it to the storage. Defaults to 10.
        client_id(str, optional): client_id to authenticate cloud service with OAuth2.0/OIDC. Defaults to None.
    """
    # define common veriables
    MOUNT_PATH = '/content/drive'
    self.__use_cloud_storage = False
    self.__init_storage = lambda : None
    self.__local_cache_period = local_cache_period
    self.model_name = model_name
    self.version = version
    self.max_retry = max_retry

    # define variables depends on env
    if storage_handler == 'colab':
      # this case we store logs on mounted path
      self.__init_colab()
      self.__init_storage = self.__init_colab
      if base_path is None:
        self.base_path = MOUNT_PATH
      else:
        base_pathes = [p for p in base_path.split('/') if len(p) > 0]
        self.base_path = os.path.join(MOUNT_PATH, 'My Drive', *base_pathes)
    elif type(storage_handler) is str:
      raise ValueError(f"{storage_handler} is not supported. Please create StorageHandler for the service.")
    elif storage_handler is not None:
      # this case we store logs on app folder of dropbox, using cloud_storage_handlder
      self.__cloud_handler = storage_handler
      if self.__cloud_handler.refresh_token is None:
        self.__cloud_handler.authenticate()
      self.__use_cloud_storage = True
      if base_path is None:
        self.base_path = './'
      else:
        self.base_path = base_path
    else:
      self.__cloud_handler = None
      if base_path is None:
        self.base_path = './'
      else:
        self.base_path = base_path
    model_log_folder = os.path.join(self.base_path, model_name)
    if not os.path.exists(model_log_folder):
        os.makedirs(model_log_folder)
    file_name = f"{model_name}_v{version}.csv"
    self.log_file_path = os.path.join(model_log_folder, file_name)
    self.__cache = []

  def __init_colab(self):
    from google.colab import drive
    drive.mount(MOUNT_PATH)

  def __store_files_to_cloud_storage(self, file_path):
    try:
      self.__cloud_handler.upload_training_results(self.model_name, [file_path])
    except Exception as e:
      print(f"failed to save logs to dropbox: {e}")

  def reset(self, model_name=None, file_name=None):
    if file_name is None:
      file_name = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
    if model_name is None:
      if file_name is None:
        raise ValueError("Either model_name or file_name should be specified")
      self.log_file_path = os.path.join(self.base_path, file_name)
    else:
      model_log_folder = os.path.join(self.base_path, model_name)
      if not os.path.exists(model_log_folder):
        os.makedirs(model_log_folder)
      self.log_file_path = os.path.join(model_log_folder, file_name)
    self.__cache = []

  def __cache_log(self, log_entry: list):
    self.__cache.append(log_entry)

  def __append_log(self, log_entry:list, retry_count=0):
      try:
          with open(self.log_file_path, 'a') as log_file:
            writer = csv.writer(log_file)
            if len(self.__cache) > 0:
              writer.writerows(self.__cache)
              self.__cache = []
            writer.writerow(log_entry)
      except Exception as e:
        if retry_count < self.max_retry:
          if retry_count == 0:
            print(e)
          self.__init_storage()
          self.__append_log(log_entry, retry_count+1)
        else:
          self.__cache.append(log_entry)

  def save_params(self, params:dict, model_name=None, model_version=None):
    data_folder = os.path.dirname(self.log_file_path)
    param_file_path = os.path.join(data_folder, f'{model_name}_v{model_version}_params.json')
    with open(param_file_path, mode="w") as fp:
      json.dump(params, fp)
    if self.__use_cloud_storage:
      self.__store_files_to_cloud_storage(param_file_path)

  def save_model(self, model, model_name=None, model_version=None):
    if model is not None:
      data_folder = os.path.dirname(self.log_file_path)
      param_file_path = os.path.join(data_folder, f'{model_name}_v{model_version}.torch')
      torch.save(model.state_dict(), param_file_path)
      if self.__use_cloud_storage:
        self.__store_files_to_cloud_storage(param_file_path)

  def save_checkpoint(self, model, optimizer, scheduler, model_name, model_version, **kwargs):
    if model is not None:
      data_folder = os.path.dirname(self.log_file_path)
      model_path = os.path.join(data_folder, f'{model_name}_v{model_version}.torch')
      torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        **kwargs
      }, model_path)
      if self.__use_cloud_storage:
        self.__store_files_to_cloud_storage(model_path)

  def save_logs(self):
    if len(self.__cache) > 0:
      with open(self.log_file_path, 'a') as log_file:
        if len(self.__cache) > 0:
          writer = csv.writer(log_file)
          writer.writerows(self.__cache)
    if self.__use_cloud_storage:
        self.__store_files_to_cloud_storage(self.log_file_path)

  def add_training_log(self, training_loss, validation_loss, log_entry:list=None):
    timestamp = datetime.now().isoformat()
    basic_entry = [timestamp, training_loss, validation_loss]
    if log_entry is not None:
      if type(log_entry) is list and len(log_entry) > 0:
        basic_entry.extend(log_entry)
    if len(self.__cache) < self.__local_cache_period:
      self.__cache_log(basic_entry)
    else:
      self.__append_log(basic_entry)
      if self.__use_cloud_storage:
        self.__store_files_to_cloud_storage(self.log_file_path)

  def get_min_losses(self, train_loss_column=1, val_loss_column=2):
    logs = None
    if os.path.exists(self.log_file_path) is False:
      file_name = os.path.dirname(self.log_file_path)
      destination_path = f'/{self.model_name}/{file_name}'
      if self.__cloud_handler is not None:
        response = self.__cloud_handler.download_file(destination_path, self.log_file_path)
        if response is not None:
          logs = pd.read_csv(self.log_file_path)
    else:
      logs = pd.read_csv(self.log_file_path)

    if logs is None:
      print("no log available")
      return np.inf, np.inf
    else:
      if type(train_loss_column) is int:
        train_loss = logs.iloc[:, train_loss_column]
      elif type(train_loss_column) is str:
        train_loss = logs[train_loss_column]
      min_train_loss = train_loss.min()

      if type(val_loss_column) is int:
        val_loss = logs.iloc[:, val_loss_column]
      elif type(val_loss_column) is str:
        val_loss = logs[val_loss_column]
      min_val_loss = val_loss.min()

      return min_train_loss, min_val_loss

In [121]:
def load_model(model_name, model_version, device, optimizer_class, scheduler_class, train=True, storage_handler=None, model_folder=None, lr=1e-3):
  if model_folder is None:
    model_folder = base_folder
  model_folder = os.path.join(model_folder, model_name)

  params_file_name = f'{model_folder}/{model_name}_v{model_version}_params.json'
  if os.path.exists(params_file_name) is False:
    if storage_handler is None:
      print(f"exsisting model params not found on {params_file_name}.")
      return None, None, None, None
    else:
      response = storage_handler.download_file(f"/{model_name}/{model_name}_v{model_version}_params.json", params_file_name)
      if response is None:
        print("exsisting model params not found.")
        return None, None, None, None
  with open(params_file_name) as fp:
      params = json.load(fp)
  # need to create create_model function for respective model
  model = create_model(**params, vocab_size=params["VOCAB_SIZE"]).to(device)
  optimizer = optimizer_class(model.parameters(), lr=lr)
  scheduler = scheduler_class(optimizer, 1.0)
  if train:
    model_path = f'{model_folder}/{model_name}_train_v{model_version}.torch'
  else:
    model_path = f'{model_folder}/{model_name}_v{model_version}.torch'
  if os.path.exists(model_path) is False:
    if storage_handler is None:
      print("exsisting model not found.")
      return None, None, None, None
    file_name = os.path.basename(model_path)
    response = storage_handler.download_file(f"/{model_name}/{file_name}", model_path)
    if response is None:
      print("exsisting model not found.")
      return None, None, None, None

  if torch.cuda.is_available():
    check_point = torch.load(model_path)
  else:
    check_point = torch.load(model_path, map_location=torch.device('cpu'))
  if "model_state_dict" in check_point:
    model.load_state_dict(check_point['model_state_dict'])
    optimizer.load_state_dict(check_point['optimizer_state_dict'])
    scheduler.load_state_dict(check_point['scheduler_state_dict'])
    return params, model, optimizer, scheduler
  else:
    if optimizer_class is not None:
      print("checkpoint is not available.")
    model.load_state_dict(check_point)
    return params, model, None, None

In [152]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000, dropout=0.05, batch_first=True):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0)/d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(-2)
        if batch_first:
            pe = pe.transpose(0, 1)
            self.forward = self.__fforward
        else:
            self.forward = self.__mforward
            
        self.register_buffer("pe", pe)

    def __mforward(self, src):
        src_pos = src.size(0)
        return self.dropout(src + self.pe[:src_pos, :])
    
    def __fforward(self, src):
        src_pos = src.size(1)
        return self.dropout(src + self.pe[:, :src_pos, :])

In [170]:
class Seq2SeqTransformer(nn.Module):

    def __init__(
        self, num_encoder_layers: int, num_decoder_layers: int, d_model: int, vocab_size: int,
        dim_feedforward:int = 512, dropout:float = 0.1, nhead:int = 8,
        batch_first=True,
    ):

        super(Seq2SeqTransformer, self).__init__()

        self.positional_encoding = PositionalEncoding(d_model, dropout=dropout, batch_first=batch_first)
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)
        encoder_layer = TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=batch_first
        )
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)

        decoder_layer = TransformerDecoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=batch_first
        )
        self.transformer_decoder = TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)
        self.output = nn.Linear(d_model, vocab_size)


    def forward(
        self, src: Tensor, tgt: Tensor,
        mask_src: Tensor=None, padding_mask_src: Tensor=None, padding_mask_tgt: Tensor=None,
        memory_key_padding_mask: Tensor=None
    ):
        mask_tgt = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)
        # src shape is (batch_size, seq_len), make it to (batch_size, seq_len, d_model)
        src = self.embedding(src)
        tgt = self.embedding(tgt)
        src = self.positional_encoding(src)
        tgt = self.positional_encoding(tgt)
        memory = self.transformer_encoder(src, mask_src, padding_mask_src)
        hidden = self.transformer_decoder(
            tgt, memory, mask_tgt, None,
            padding_mask_tgt, memory_key_padding_mask
        )
        outs = self.output(hidden)
        return outs

In [165]:
def create_model(num_encoder_layers, num_decoder_layers, d_model, vocab_size, dim_feedforward, dropout, nhead, batch_first, **kwargs):
    model = Seq2SeqTransformer(
        num_encoder_layers=num_encoder_layers,
        num_decoder_layers=num_decoder_layers,
        d_model=d_model,
        vocab_size=vocab_size,
        dim_feedforward=dim_feedforward,
        dropout=dropout, nhead=nhead,
        batch_first=batch_first
    )
    return model

In [191]:
def train(model, dataloader, optimizer, criterion, device, vocab_size):

    model = model.train()
    losses = 0

    length = 0.0
    for src, tgt in tqdm(dataloader):
        length+=1.0
        src = src.to(device)
        tgt = tgt.to(device)
        # assume batch_first = True, so shape is (batch_size, seq_len)
        input_tgt = tgt[:, :-1]
        output = model(
            src=src, tgt=input_tgt
        )

        optimizer.zero_grad()

        output_tgt = tgt[:, 1:]
        output = output.view(-1, vocab_size)
        output_tgt = output_tgt.reshape(-1)
        loss = criterion(output, output_tgt)
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / length

In [192]:
def evaluate(model, dataloader, criterion, device, vocab_size):

    model = model.eval()
    losses = 0
    length = 0.0
    for src, tgt in tqdm(dataloader):
        length+=1.0
        src = src.to(device)
        tgt = tgt.to(device)

        input_tgt = tgt[:, :-1]
        output = model(
            src=src, tgt=input_tgt
        )

        output_tgt = tgt[:, 1:]
        loss = criterion(output.view(-1, vocab_size), output_tgt.view(-1))
        losses += loss.item()

    return losses / length

In [189]:
def prediction(model, inputs, feature_size, prediction_length, device):

    preds = torch.zeros(1, inputs.size(1), feature_size, device=device)
    preds[0, :, :] = inputs[-1,:, :]
    while preds.size(0) <= prediction_length:
        out = model(src=inputs, tgt=preds)
        preds = torch.cat([preds, out[-1:]], dim=0)
    return preds

In [205]:
# from multiprocessing import Pool
import random
from collections.abc import Iterable

from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence


class NumberDataset(Dataset):
    version = 1

    token_map = {
        '0': 0,
        '1': 1,
        '2': 2,
        '3': 3,
        '4': 4,
        '5': 5,
        '6': 6,
        '7': 7,
        '8': 8,
        '9': 9,
        '10': 10,
        'NEG': 11,
        'POS': 12,
        '+': 13,
        '-': 14,
        '*': 15,
        '/': 16,
        '=': 17,
        'PAD': 18,
    }

    def __init__(
        self,
        num_of_terms=2,
        max_digits=3,
        data_length=10000,
        dtype=torch.float32,
        batch_first=False,
        device="cuda",
        seed=1017,
    ):
        self.seed(seed)
        self.dtype = dtype
        self.device = device
        self.batch_first = batch_first
        self.num_of_terms = num_of_terms
        self.max_digits = max_digits
        self.data_length = data_length

    def calculate_answer(self, terms, symbols):
        eval_str = str(terms[0])
        for i in range(len(symbols)):
            eval_str += f' {symbols[i]} {terms[i+1]}'
        try:
            answer = eval(eval_str)
        except ZeroDivisionError as e:
            print(f"Division by zero encountered. Regenerating problem: {eval_str}")
            raise e
        except Exception as e:
            print(f"Error evaluating expression: {eval_str}")
            raise e
        return answer

    def generate_random_math_problems(self):
        # generate random number from -1 * (10 ** digits) to 1 * (10 ** digits)
        terms = [random.randrange(-1 * (10 ** self.max_digits), 1 * (10 ** self.max_digits)) for term in range(self.num_of_terms)]
        symbols = [random.choice(['+', '-', '*', '/']) for _ in range(self.num_of_terms - 1)]
        for s in range(len(symbols)):
            if symbols[s] == '/' and terms[s+1] == 0:
                value = random.randrange(1, 1 * (10 ** self.max_digits))
                sign = random.choice([-1, 1])
                terms[s+1] = sign * value
        return terms, symbols
    
    def num_to_tokens(self, number: int):
        # convert number to list of tokens
        tokens = []
        if number < 0:
            tokens.append('NEG')
            number = -number
        else:
             tokens.append('POS')
        digits = list(str(number))
        real_digits = []
        minor_digits = []
        is_real = True
        for n, d in enumerate(digits):
            if d == ".":
                is_real = False
                continue
            if is_real:
                real_digits.append(d)
            else:
                minor_digits.append(d)
        for n, d in enumerate(real_digits[:-1]):
            tokens.append(d)
            tokens.extend(["*", "10"] * (len(real_digits) - n - 1))
            tokens.append("+")
        tokens.append(real_digits[-1])
        for n, d in enumerate(minor_digits):
            tokens.append("+")
            tokens.append(d)
            tokens.extend(["/", "10"] * (n + 1))
        return tokens

    def tokens_to_num(self, tokens: list):
        # convert list of tokens to number
        eval_str = ""
        if tokens[0] == "POS":
            sign = 1
        else:
            sign = -1
        for element in tokens[1:]:
            eval_str += f' {element}'
        answer = sign * eval(eval_str)
        return answer
    
    def generate_data(self):
        terms, symbols = self.generate_random_math_problems()
        input_tokens = []
        for n, term in enumerate(terms):
            term_tokens = self.num_to_tokens(term)
            term_tokens = [self.token_map[token] for token in term_tokens]
            input_tokens.extend(term_tokens)
            if n < len(symbols):
                input_tokens.append(self.token_map[symbols[n]])
        answer = self.calculate_answer(terms, symbols)
        output_tokens = self.num_to_tokens(answer)
        output_tokens = [self.token_map[token] for token in output_tokens]
        output_tokens = [self.token_map['='], * output_tokens]
        return input_tokens, output_tokens

    def __len__(self):
        return self.data_length

    def __getitem__(self, ndx):
        input_data = []
        output_data = []

        if isinstance(ndx, int):
            input_tokens, output_tokens = self.generate_data()
            return torch.tensor(input_tokens), torch.tensor(output_tokens)
        else:
            if isinstance(ndx, Iterable):
                for i in ndx:
                    input_tokens, output_tokens = self.generate_data()
                    input_data.append(input_tokens)
                    output_data.append(output_tokens)
            elif isinstance(ndx, slice):
                for _i in range(ndx.start, ndx.stop):
                    input_tokens, output_tokens = self.generate_data()
                    input_data.append(input_tokens)
                    output_data.append(output_tokens)
            # make data to tensor
            input_data = pad_sequence([torch.tensor(t) for t in input_data], batch_first=self.batch_first, padding_value=0)
            output_data = pad_sequence([torch.tensor(t) for t in output_data], batch_first=self.batch_first, padding_value=0)
        return input_data, output_data

    def seed(self, seed=None):
        """ """
        if seed is None:
            seed = 1017
        else:
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True
        torch.manual_seed(seed)
        random.seed(seed)
        np.random.seed(seed)
        self.seed_value = seed

    def seed_worker(worker_id):
        worker_seed = torch.initial_seed() % 2**32
        np.random.seed(worker_seed)
        random.seed(worker_seed)


In [208]:
ds = NumberDataset(device="cpu", seed=1017, num_of_terms=2, max_digits=3, data_length=10000)

In [209]:
from torch.nn.utils.rnn import pad_sequence

batch_first = True

def my_collate_fn(batch):
    inputs, outputs = zip(*batch)
    
    inputs_padded = pad_sequence(inputs, batch_first=batch_first, padding_value=0)
    outputs_padded = pad_sequence(outputs, batch_first=batch_first, padding_value=0)
    
    return inputs_padded, outputs_padded

### Training Number Transformer Model

In [212]:
from torch.utils.data import DataLoader

model_name = "math_basic_transformer_v1"
model_version = "1"
VACAB_SIZE = 19  # 0-10, NEG, POS, +, -, *, /, =, PAD
BATCH_SIZE = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

dataloader = DataLoader(
    ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=my_collate_fn
)

model_params, model, optimizer, scheduler = load_model(model_name, model_version, device, train=True, storage_handler=None,
                                 optimizer_class=torch.optim.Adam,
                                 scheduler_class=torch.optim.lr_scheduler.StepLR)

if model is None:
    print("Initialize a new model.")

    # Hyper parameters
    model_params = {
        "nhead": 4,
        "dim_feedforward": 10,
        "num_encoder_layers": 2,
        "num_decoder_layers": 2,
        "d_model": 8,
        "dropout": 0.1,
        "batch_first": batch_first
    }

    assert model_params is not None, "model_params should be specified."
    assert model_params["d_model"] % model_params["nhead"] == 0, "d_model must be divisible by nhead."

    model = create_model(
        vocab_size=VACAB_SIZE,
        **model_params
    ).to(device)
    params_num = 0
    # initialize model parameters
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
        if p.requires_grad:
            params_num += p.numel()
else:
    print("Loaded existing model.")

print(f"params: {params_num}")

Loaded existing model.
params: 2906


In [213]:
lr = 0.01

criterion = nn.CrossEntropyLoss(ignore_index=ds.token_map['PAD'])
if optimizer is None:
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma = 0.9)

In [214]:
logger = Logger(model_name, model_version, base_folder, storage_handler=None, local_cache_period=1)

params = {
          "batch_size": BATCH_SIZE,
          **model_params,
          "VOCAB_SIZE": VACAB_SIZE,
          "params_num": params_num,
          # format version
          "version": 2
}

logger.save_params(params, model_name, model_version)

print("training log will be saved on ", logger.log_file_path)

training log will be saved on  ../../../../Data\math_basic_transformer_v1\math_basic_transformer_v1_v1.csv


In [193]:
epoch = 500
best_train_loss, best_valid_loss = logger.get_min_losses()
best_model = None
best_train_model = None
patience = 3
counter = 0

for loop in range(1, epoch + 1):
    start_time = time.time()
    loss_train = train(
        model=model, dataloader=dataloader, optimizer=optimizer,
        criterion=criterion,
        device=device, vocab_size=VACAB_SIZE
    )

    elapsed_time = time.time() - start_time

    # loss_valid = evaluate(
    #     model=model, dataloader=dataloader, criterion=criterion,
    #     device=device
    # )
    loss_valid = 0.0

    elapsed_mins = math.floor(elapsed_time / 60)
    log = '[{}/{}] train loss: {:.10f}, valid loss: {:.10f}  [{}{:.0f}s] count: {}, {}'.format(
        loop, epoch,
        loss_train, loss_valid,
        str(int(elapsed_mins)) + 'm' if elapsed_mins > 0 else '',
        elapsed_time % 60,
        counter,
        '**' if best_valid_loss > loss_valid else ''
    )
    print(log)
    logger.add_training_log(loss_train, loss_valid, elapsed_time)

    if best_train_loss > loss_train:
        best_train_loss = loss_train
        best_train_model = model
        counter = 0
    else:
        counter += 1
        if counter == 1:
          logger.save_checkpoint(best_train_model, optimizer, scheduler, f'{model_name}_train', model_version)
        scheduler.step()
    if best_valid_loss > loss_valid:
        best_valid_loss = loss_valid
        best_model = model
        logger.save_checkpoint(best_model, optimizer, scheduler, model_name, model_version)

    if counter > patience:
        break
    
logger.save_checkpoint(best_train_model, optimizer, scheduler, f'{model_name}_train', model_version)
logger.save_checkpoint(best_model, optimizer, scheduler, model_name, model_version)

no log available


100%|██████████| 313/313 [01:36<00:00,  3.25it/s]


[1/500] train loss: 0.2828935831, valid loss: 0.0000000000  [1m36s] count: 0, **


100%|██████████| 313/313 [01:40<00:00,  3.10it/s]


[2/500] train loss: 0.1043742655, valid loss: 0.0000000000  [1m41s] count: 0, 


100%|██████████| 313/313 [01:43<00:00,  3.02it/s]


[3/500] train loss: 0.0999354033, valid loss: 0.0000000000  [1m44s] count: 0, 


100%|██████████| 313/313 [01:47<00:00,  2.90it/s]


[4/500] train loss: 0.0921883629, valid loss: 0.0000000000  [1m48s] count: 0, 


100%|██████████| 313/313 [01:52<00:00,  2.77it/s]


[5/500] train loss: 0.0912944428, valid loss: 0.0000000000  [1m53s] count: 0, 


100%|██████████| 313/313 [01:57<00:00,  2.65it/s]


[6/500] train loss: 0.0902929190, valid loss: 0.0000000000  [1m58s] count: 0, 


100%|██████████| 313/313 [02:01<00:00,  2.57it/s]


[7/500] train loss: 0.0867518416, valid loss: 0.0000000000  [2m2s] count: 0, 


100%|██████████| 313/313 [02:01<00:00,  2.58it/s]


[8/500] train loss: 0.0844118449, valid loss: 0.0000000000  [2m1s] count: 0, 


100%|██████████| 313/313 [02:00<00:00,  2.59it/s]


[9/500] train loss: 0.0818250393, valid loss: 0.0000000000  [2m1s] count: 0, 


100%|██████████| 313/313 [02:01<00:00,  2.58it/s]


[10/500] train loss: 0.0813190124, valid loss: 0.0000000000  [2m1s] count: 0, 


100%|██████████| 313/313 [02:02<00:00,  2.56it/s]


[11/500] train loss: 0.0811263973, valid loss: 0.0000000000  [2m3s] count: 0, 


100%|██████████| 313/313 [02:03<00:00,  2.54it/s]


[12/500] train loss: 0.0792114498, valid loss: 0.0000000000  [2m3s] count: 0, 


100%|██████████| 313/313 [02:01<00:00,  2.57it/s]


[13/500] train loss: 0.0786322073, valid loss: 0.0000000000  [2m2s] count: 0, 


100%|██████████| 313/313 [02:02<00:00,  2.56it/s]


[14/500] train loss: 0.0777754910, valid loss: 0.0000000000  [2m2s] count: 0, 


100%|██████████| 313/313 [02:03<00:00,  2.53it/s]


[15/500] train loss: 0.0770887446, valid loss: 0.0000000000  [2m4s] count: 0, 


100%|██████████| 313/313 [02:01<00:00,  2.58it/s]


[16/500] train loss: 0.0771903205, valid loss: 0.0000000000  [2m2s] count: 0, 


100%|██████████| 313/313 [02:03<00:00,  2.53it/s]


[17/500] train loss: 0.0743074931, valid loss: 0.0000000000  [2m4s] count: 1, 


100%|██████████| 313/313 [02:05<00:00,  2.50it/s]


[18/500] train loss: 0.0736443583, valid loss: 0.0000000000  [2m5s] count: 0, 


100%|██████████| 313/313 [02:04<00:00,  2.52it/s]


[19/500] train loss: 0.0737023068, valid loss: 0.0000000000  [2m4s] count: 0, 


100%|██████████| 313/313 [02:05<00:00,  2.50it/s]


[20/500] train loss: 0.0742383749, valid loss: 0.0000000000  [2m5s] count: 1, 


100%|██████████| 313/313 [02:03<00:00,  2.53it/s]


[21/500] train loss: 0.0722637291, valid loss: 0.0000000000  [2m4s] count: 2, 


100%|██████████| 313/313 [02:08<00:00,  2.43it/s]


[22/500] train loss: 0.0721011687, valid loss: 0.0000000000  [2m9s] count: 0, 


100%|██████████| 313/313 [02:06<00:00,  2.48it/s]


[23/500] train loss: 0.0723342764, valid loss: 0.0000000000  [2m6s] count: 0, 


100%|██████████| 313/313 [02:10<00:00,  2.41it/s]


[24/500] train loss: 0.0715054098, valid loss: 0.0000000000  [2m10s] count: 1, 


100%|██████████| 313/313 [02:08<00:00,  2.43it/s]


[25/500] train loss: 0.0714600778, valid loss: 0.0000000000  [2m9s] count: 0, 


100%|██████████| 313/313 [02:45<00:00,  1.89it/s]


[26/500] train loss: 0.0711281832, valid loss: 0.0000000000  [2m46s] count: 0, 


100%|██████████| 313/313 [02:07<00:00,  2.46it/s]


[27/500] train loss: 0.0712824575, valid loss: 0.0000000000  [2m7s] count: 0, 


100%|██████████| 313/313 [02:04<00:00,  2.51it/s]


[28/500] train loss: 0.0713565806, valid loss: 0.0000000000  [2m4s] count: 1, 


100%|██████████| 313/313 [02:03<00:00,  2.53it/s]


[29/500] train loss: 0.0701654573, valid loss: 0.0000000000  [2m4s] count: 2, 


100%|██████████| 313/313 [02:04<00:00,  2.52it/s]


[30/500] train loss: 0.0695716613, valid loss: 0.0000000000  [2m4s] count: 0, 


100%|██████████| 313/313 [02:00<00:00,  2.59it/s]


[31/500] train loss: 0.0708019206, valid loss: 0.0000000000  [2m1s] count: 0, 


100%|██████████| 313/313 [02:02<00:00,  2.56it/s]


[32/500] train loss: 0.0693206877, valid loss: 0.0000000000  [2m2s] count: 1, 


100%|██████████| 313/313 [02:04<00:00,  2.52it/s]


[33/500] train loss: 0.0690583726, valid loss: 0.0000000000  [2m4s] count: 0, 


100%|██████████| 313/313 [02:02<00:00,  2.57it/s]


[34/500] train loss: 0.0708977700, valid loss: 0.0000000000  [2m2s] count: 0, 


100%|██████████| 313/313 [02:03<00:00,  2.54it/s]


[35/500] train loss: 0.0691133455, valid loss: 0.0000000000  [2m3s] count: 1, 


100%|██████████| 313/313 [02:00<00:00,  2.60it/s]


[36/500] train loss: 0.0699960681, valid loss: 0.0000000000  [2m0s] count: 2, 


100%|██████████| 313/313 [02:03<00:00,  2.54it/s]

[37/500] train loss: 0.0693855999, valid loss: 0.0000000000  [2m3s] count: 3, 





In [196]:
input_tokens, output_tokens = ds[0]

In [203]:
model(input_tokens.unsqueeze(0), tgt=output_tokens[:1].unsqueeze(0))

tensor([[[-0.1394,  4.6278,  3.6682,  3.3525,  3.4011,  3.1810,  3.0235,
           2.9970,  2.8556,  2.7734, -2.7741, -6.3379, -5.7530, -2.7143,
          -5.0460, -2.7028, -8.8554, -5.9190]]], grad_fn=<ViewBackward0>)