In [1]:
#!unzip ./drive/MyDrive/single-person-pose-estimation/dataset/tfrecords.zip

Archive:  ./drive/MyDrive/single-person-pose-estimation/dataset/tfrecords.zip
   creating: dataset/tfrecords/
   creating: dataset/tfrecords/valid/
  inflating: dataset/tfrecords/valid/file_valid_01-2048.tfrec  
  inflating: dataset/tfrecords/valid/file_valid_03-208.tfrec  
  inflating: dataset/tfrecords/valid/file_valid_02-2048.tfrec  
  inflating: dataset/tfrecords/valid/file_valid_00-2048.tfrec  
   creating: dataset/tfrecords/train/
  inflating: dataset/tfrecords/train/file_train_00-2048.tfrec  
  inflating: dataset/tfrecords/train/file_train_26-2048.tfrec  
  inflating: dataset/tfrecords/train/file_train_12-2048.tfrec  
  inflating: dataset/tfrecords/train/file_train_47-2048.tfrec  
  inflating: dataset/tfrecords/train/file_train_05-2048.tfrec  
  inflating: dataset/tfrecords/train/file_train_35-2048.tfrec  
  inflating: dataset/tfrecords/train/file_train_13-2048.tfrec  
  inflating: dataset/tfrecords/train/file_train_69-2048.tfrec  
  inflating: dataset/tfrecords/train/file_train

In [2]:
import sys
sys.path.append('/content/drive/MyDrive/single-person-pose-estimation')

from datetime import date
import tensorflow as tf
import math
import glob
import os

In [3]:
from configs import default_config as cfg
from dataset_builder import DatasetBuilder
from model.hourglass import HourGlassNet

In [4]:
def make_checkpoint_callback(config, time_now):
  """Create a ModelCheckpoint callback that stores weights only.

  The target file is ``<config.CHECKPOINTS_PATH>/<time_now>/checkpoint``.
  With ``save_best_only`` and ``mode='min'`` on ``val_loss``, the file is
  rewritten only when the validation loss reaches a new minimum.
  """
  target = '/'.join((config.CHECKPOINTS_PATH, time_now, 'checkpoint'))
  options = dict(
    filepath = target,
    save_weights_only = True,
    monitor = 'val_loss',
    mode = 'min',  # lower val_loss is better
    save_best_only = True,
    verbose = True,
  )
  return tf.keras.callbacks.ModelCheckpoint(**options)

# 'Monday' is used as the sub-directory name under the checkpoints path.
ckpt_callback = make_checkpoint_callback(config = cfg, time_now = 'Monday')
# Callback for printing the LR at the beginning of each epoch.
class PrintLR(tf.keras.callbacks.Callback):
    """Print the optimizer's current learning rate when an epoch starts."""

    def on_epoch_begin(self, epoch, logs=None):
        """Print the learning rate for the epoch about to run.

        Args:
            epoch: zero-based epoch index supplied by Keras; printed one-based.
            logs: unused.
        """
        optimizer = self.model.optimizer
        # `learning_rate` is the canonical attribute; `lr` is only a legacy alias.
        lr = optimizer.learning_rate
        # When a schedule is attached the attribute can be the schedule object
        # itself, which has no .numpy(); evaluate it at the current step first.
        if isinstance(lr, tf.keras.optimizers.schedules.LearningRateSchedule):
            lr = lr(optimizer.iterations)
        print('\nLearning rate for epoch {} is {}'.format(epoch + 1, lr.numpy()))

# Callback list: the checkpoint writer followed by the learning-rate printer.
call_backs = [ckpt_callback, PrintLR()]

In [10]:
def make_checkpoint_callback(checkpoint_filepath):
  """Create a weights-only ModelCheckpoint callback writing to checkpoint_filepath.

  ``val_loss`` is monitored with ``mode='min'`` and ``save_best_only``, so the
  file is rewritten only when the validation loss reaches a new minimum.
  """
  checkpoint_options = {
    'filepath': checkpoint_filepath,
    'save_weights_only': True,
    'monitor': 'val_loss',
    'mode': 'min',  # lower val_loss is better
    'save_best_only': True,
    'verbose': True,
  }
  return tf.keras.callbacks.ModelCheckpoint(**checkpoint_options)

# Callback for printing the LR at the beginning of each epoch.
class PrintLR(tf.keras.callbacks.Callback):
    """Print the optimizer's current learning rate when an epoch starts."""

    def on_epoch_begin(self, epoch, logs=None):
        """Print the learning rate for the epoch about to run.

        Args:
            epoch: zero-based epoch index supplied by Keras; printed one-based.
            logs: unused.
        """
        optimizer = self.model.optimizer
        # `learning_rate` is the canonical attribute; `lr` is only a legacy alias.
        lr = optimizer.learning_rate
        # When a schedule is attached the attribute can be the schedule object
        # itself, which has no .numpy(); evaluate it at the current step first.
        if isinstance(lr, tf.keras.optimizers.schedules.LearningRateSchedule):
            lr = lr(optimizer.iterations)
        print('\nLearning rate for epoch {} is {}'.format(epoch + 1, lr.numpy()))

In [7]:
# Day-month-year stamp for the current date.
today = format(date.today(), "%d-%m-%Y")
print(today)

22-03-2022


In [67]:
class Trainer:
  def __init__(self, model, ds_builder, epochs, learning_rate, config):
    self.model = model
    self.ds_train, self.ds_valid = ds_builder.build_datasets()
    self.ds_train = self.ds_train.take(16)
    self.ds_valid = self.ds_valid.take(16)

    self.steps_per_epoch = math.ceil(ds_builder.num_train_examples // config.BATCH_SIZE)
    self.valid_steps = math.ceil(ds_builder.num_valid_examples // config.BATCH_SIZE)
    self.epochs = epochs 
    self.checkpoints_path = config.CHECKPOINTS_PATH
    
    self.learning_rate = learning_rate
    self.optimizer = tf.keras.optimizers.Adam(learning_rate = self.learning_rate)
    self.loss = tf.keras.losses.MeanSquaredError()
  
  def train(self):
    self.model.compile(optimizer = self.optimizer,
            loss = self.loss)

    today = date.today().strftime("%d-%m-%Y")
    path = self.checkpoints_path + f'/{today}-E{self.epochs}' + '.cpkt'
    ckpt_callback = make_checkpoint_callback(path)
    callbacks = [ckpt_callback, PrintLR()]

    print(f'''Start traing with:
    1. Current date {today}.
    2. Number of epochs {self.epochs}.
    3. Learning rate {self.learning_rate}.
    ''')
   
    self.model.fit(
      self.ds_train, 
      epochs = self.epochs,
      callbacks = callbacks,
      steps_per_epoch = 1,#self.steps_per_epoch,
      validation_data = self.ds_valid,
      validation_steps = 1#self.valid_steps,
    )

    print(f'''Finished training!!
    Temporary checkpoint is saved at {self.checkpoints_path}
    To save model call save_model() method
    ''')
  
  def resume_training(self):
    '''
    This shoudl be called on a newly created instance
    '''
    assert os.path.exists(self.checkpoints_path)

    cpkt_name, previous_epochs = self.get_epochs_from_name(self.checkpoints_path)
    self.epochs += previous_epochs

    self.model.compile(optimizer = self.optimizer,
            loss = self.loss)
    
    print(f'Loading weights from {self.checkpoints_path}')
    self.model.load_weights(self.checkpoints_path + '/' + cpkt_name)

    today = date.today().strftime("%d-%m-%Y")
    new_path = self.checkpoints_path + f'/{today}-E{self.epochs}' + '.cpkt'
    ckpt_callback = make_checkpoint_callback(new_path)
    callbacks = [ckpt_callback, PrintLR()]

    print(f'''Start traing with:
    1. Current date {today}.
    2. Resume training for {self.epochs - previous_epochs}, from epoch {previous_epochs} to epoch {self.epochs}.
    3. Learning rate {self.learning_rate}.
    ''')
   
    self.model.fit(
      self.ds_train, 
      epochs = self.epochs,
      callbacks = callbacks,
      steps_per_epoch = 1,#self.steps_per_epoch,
      validation_data = self.ds_valid,
      validation_steps = 1,#self.valid_steps,
      initial_epoch = previous_epochs
    )

    print(f'''Finished training!!
    Temporary checkpoint is saved at {new_path}
    To save model call save_model() method
    ''')
  
  def save_model(self, path):
    self.model.save(path)


      
  @staticmethod
  def get_epochs_from_name(path):
    name = glob.glob(path + '/*.cpkt.index')
    name.sort()

    last = name[-1] #last in the list 
    last = last.split('/')[-1] # get rid of slashes
    ckpt_name = last[:-6] # eliminate '.index'
    
    epochs = ckpt_name[:-5] # eliminate '.cpkt'
    epochs = epochs.split('-')[-1] # get E{epcoch}
    epochs = int(epochs[1:]) # get rid of 'E'
    
    return ckpt_name, epochs

In [6]:
# Build the hourglass wrapper, then create the Keras model from it.
# NOTE(review): the leading positional values are presumably keypoints / stacks /
# channels (the cell output reports 4 stacks) and the two tuples input / output
# shapes - confirm against HourGlassNet.
HG = HourGlassNet(
    17, 4, 256,
    (256, 256, 3),
    (64, 64, 3),
)
model = HG.create_hg_model()

Created HourGlassmodel:
    1. 4 stacks.
    2. 12080324 parameters. Call object.get_summary() for more detail.
    


In [7]:
# Dataset builder driven by the default project config (cfg).
ds_builder = DatasetBuilder(cfg)

In [68]:
# Trainer for 2 epochs per run with a learning rate of 0.01.
trainer = Trainer(
    model,
    ds_builder,
    epochs = 2,
    learning_rate = 0.01,
    config = cfg,
)

Train dataset with 149813 examples.
Valid dataset with 6352 examples.


In [17]:
# Compile the model and run a fresh training session.
trainer.train()

Start traing with:
    1. Current date 22-03-2022.
    2. Number of epochs 2.
    3. Learning rate 0.01.
    

Learning rate for epoch 1 is 0.009999999776482582
Epoch 1/2
Epoch 1: val_loss improved from inf to 1.62478, saving model to temp/checkpoints/22-03-2022-E2.cpkt

Learning rate for epoch 2 is 0.009999999776482582
Epoch 2/2
Epoch 2: val_loss improved from 1.62478 to 0.71620, saving model to temp/checkpoints/22-03-2022-E2.cpkt
Finished training!!
    Temporary checkpoint is saved at temp/checkpoints
    To save model call save_model() method
    


In [65]:
# Reload weights from a checkpoint in the checkpoints directory and keep training.
trainer.resume_training()

Loading weights from temp/checkpoints
Start traing with:
    1. Current date 22-03-2022.
    2. Training epochs from epoch 2 to epoch 4.
    3. Learning rate 0.01.
    

Learning rate for epoch 3 is 0.009999999776482582
Epoch 3/4
Epoch 3: val_loss improved from inf to 0.11985, saving model to temp/checkpoints/22-03-2022-E4.cpkt

Learning rate for epoch 4 is 0.009999999776482582
Epoch 4/4
Epoch 4: val_loss improved from 0.11985 to 0.00378, saving model to temp/checkpoints/22-03-2022-E4.cpkt
Finished training!!
    Temporary checkpoint is saved at temp/checkpoints/22-03-2022-E4.cpkt
    To save model call save_model() method
    


In [66]:
# Resume once more from the checkpoints directory.
trainer.resume_training()

Loading weights from temp/checkpoints
Start traing with:
    1. Current date 22-03-2022.
    2. Training epochs from epoch 4 to epoch 8.
    3. Learning rate 0.01.
    

Learning rate for epoch 5 is 0.009999999776482582
Epoch 5/8
Epoch 5: val_loss improved from inf to 0.00378, saving model to temp/checkpoints/22-03-2022-E8.cpkt

Learning rate for epoch 6 is 0.009999999776482582
Epoch 6/8
Epoch 6: val_loss did not improve from 0.00378

Learning rate for epoch 7 is 0.009999999776482582
Epoch 7/8
Epoch 7: val_loss did not improve from 0.00378

Learning rate for epoch 8 is 0.009999999776482582
Epoch 8/8
Epoch 8: val_loss did not improve from 0.00378
Finished training!!
    Temporary checkpoint is saved at temp/checkpoints/22-03-2022-E8.cpkt
    To save model call save_model() method
    


In [69]:
# Save the full model (Trainer.save_model calls model.save) into the "save" directory.
trainer.save_model("save")

INFO:tensorflow:Assets written to: save/assets


In [40]:
# Scratch check of the checkpoint-name parsing: list the checkpoint index files.
name = glob.glob('temp/checkpoints/*.cpkt.index')

In [41]:
# Sort the paths in place (plain string order) so the last entry can be picked.
name.sort()

In [42]:
# Keep only the text after the last '-' of the last sorted path.
name = name[-1].rsplit('-', 1)[-1]
name

'E2.cpkt.index'

In [69]:
# Drop everything from the first '.' onwards, leaving the 'E<epochs>' tag.
name = name.partition('.')[0]
name

'E2'

In [70]:
# Strip the leading 'E' so only the epoch digits remain.
name = name[1:]

In [71]:
# Display the parsed epoch count (still a string at this point).
name

'2'

In [86]:
# Clean-up: recursively delete the temp directory (the checkpoints above were written under temp/checkpoints).
!rm -r temp