In [1]:
!pip install datasets tokenizers yacs tensorboard torchmetrics



In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import  Dataset, DataLoader, random_split
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace


import torchmetrics
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter


import copy
import math
import numpy as np
from pathlib import Path
import os

# Configuration


In [3]:
from yacs.config import CfgNode as CF

# Notebook-wide defaults, grouped by concern. The nested dictionaries are
# turned into nested CfgNode objects by the CfgNode constructor, so values are
# still read with attribute access (e.g. ``__C.TRAINING.BATCH_SIZE``).
__C = CF({
    # Training
    "TRAINING": {
        "BATCH_SIZE": 8,
        "NUM_EPOCHS": 20,
        "LEARNING_RATE": 1e-4,
        "PRELOAD": "latest",
    },
    # DATA
    "DATA": {
        "DATASOURCE": "opus_books",
        "LANG_SRC": "en",
        "LANG_TGT": "it",
        # "{0}" is filled with the language code when the path is built.
        "TOKENIZER_FILE": "tokenizer_{0}.json",
    },
    # MODEL
    "MODEL": {
        "D_MODEL": 512,
        "SEQ_LEN": 350,
    },
    # EXPERIMENT
    "EXPERIMENT": {
        "NAME": "runs/experiences",
        "MODEL_FOLDER": "weights",
        # "MODEL_BASENAME": "tmodel_",
    },
})

def get_config():
    """Return a clone of the module-level default configuration ``__C``."""
    defaults_copy = __C.clone()
    return defaults_copy


# Working configuration object read by the cells below.
config = get_config()

Let's define some helper functions

In [4]:
def clones(layer, N):
    """Build an ``nn.ModuleList`` holding ``N`` independent deep copies of ``layer``.

    Args:
        layer: the module to replicate.
        N: number of copies to produce.

    Returns:
        nn.ModuleList of length ``N``; the copies do not share parameters.
    """
    replicas = nn.ModuleList()
    for _ in range(N):
        replicas.append(copy.deepcopy(layer))
    return replicas


def save_checkpoint(
        model,
        optimizer,
        epoch,
        config
):
    """Persist the training state to ``<config.EXPERIMENT.NAME>/transformer.model``.

    Args:
        model: module whose ``state_dict()`` is stored under the "model" key.
        optimizer: optimizer whose ``state_dict()`` is stored under "optimizer".
        epoch: value stored under the "epoch" key.
        config: configuration object exposing ``EXPERIMENT.NAME`` (output directory).
    """
    out_dir = config.EXPERIMENT.NAME

    # Create the output directory on first use.
    if not os.path.exists(out_dir):
        print(f"Creating directory {out_dir}")
        os.makedirs(out_dir)

    state = dict(
        model=model.state_dict(),
        optimizer=optimizer.state_dict(),
        epoch=epoch,
    )
    torch.save(state, os.path.join(out_dir, "transformer.model"))

def load_checkpoint(config, map_location=None):
    """Load the checkpoint written by ``save_checkpoint``.

    Args:
        config: configuration object exposing ``EXPERIMENT.NAME`` (checkpoint directory).
        map_location: forwarded to ``torch.load``; pass e.g. ``"cpu"`` or a
            ``torch.device`` to restore a checkpoint saved on a different
            device. ``None`` keeps the default ``torch.load`` behaviour.

    Returns:
        The deserialized checkpoint object.

    Raises:
        FileNotFoundError: if no checkpoint file exists at the expected path.
    """
    file_path = os.path.join(config.EXPERIMENT.NAME, "transformer.model")
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"No checkpoint found at {file_path}")

    # NOTE(review): torch.load unpickles data; only load checkpoints you trust.
    return torch.load(file_path, map_location=map_location)

# Encoder Decoder Architecture

In [5]:
class EncoderDecoder(nn.Module):
    """Container tying together an encoder, a decoder, both embeddings and a generator.

    ``forward`` returns the decoder output; the ``generator`` is stored on the
    module but is not applied here.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed,  generator):
        super().__init__()

        # Assigned left to right, so sub-modules register in the order
        # encoder, decoder, src_embed, tgt_embed, generator.
        self.encoder, self.decoder = encoder, decoder
        self.src_embed, self.tgt_embed = src_embed, tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode ``src`` and then decode ``tgt`` against the encoder output."""
        encoded = self.encode(src, src_mask)
        return self.decode(encoded, src_mask, tgt, tgt_mask)

    def encode(self, src, src_mask):
        """Embed ``src`` and pass it, with ``src_mask``, through the encoder."""
        embedded_src = self.src_embed(src)
        return self.encoder(embedded_src, src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        """Embed ``tgt`` and pass it through the decoder together with ``memory``."""
        embedded_tgt = self.tgt_embed(tgt)
        return self.decoder(embedded_tgt, memory, src_mask, tgt_mask)

In [6]:
class Generator(nn.Module):
    """Linear projection to ``vocab`` scores followed by a log-softmax."""

    def __init__(self, d_model, vocab):
        super().__init__()

        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        """Return log-probabilities over the last dimension of ``proj(x)``."""
        logits = self.proj(x)
        return F.log_softmax(logits, dim=-1)

In [7]:
class SublayerConnection(nn.Module):
    """Residual wrapper: ``x + dropout(sublayer(layer_norm(x)))``.

    The normalization is applied to the sublayer input (before the sublayer runs).
    """

    def __init__(self, size, dropout):
        super().__init__()

        self.norm = nn.LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Apply ``sublayer`` to the normalized input and add the result back to ``x``."""
        normed = self.norm(x)
        branch = self.dropout(sublayer(normed))
        return x + branch

## Encoder

In [8]:
class Encoder(nn.Module):
    """Stack of ``N`` copies of ``layer`` followed by a final LayerNorm."""

    def __init__(self, layer, N = 6):
        super().__init__()

        self.layers = clones(layer, N)
        # ``layer.size`` supplies the feature width for the closing normalization.
        self.norm = nn.LayerNorm(layer.size)

    def forward(self, x, src_mask):
        """Run ``x`` through every layer in order (each receives ``src_mask``), then normalize."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, src_mask)

        return self.norm(hidden)

In [9]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each inside a SublayerConnection."""

    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()

        self.self_attn = self_attn
        self.feed_forward = feed_forward
        # Index 0 wraps the attention step, index 1 wraps the feed-forward step.
        self.sublayers = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        """Apply masked self-attention and then the feed-forward network to ``x``."""

        def attend(normed):
            # Query, key and value are all the same (normalized) tensor.
            return self.self_attn(normed, normed, normed, mask)

        attended = self.sublayers[0](x, attend)
        return self.sublayers[1](attended, self.feed_forward)

## Decoder

In [10]:
class Decoder(nn.Module):
    """Stack of ``N`` copies of ``layer`` followed by a final LayerNorm."""

    def __init__(self, layer, N = 6):
        super().__init__()

        self.layers = clones(layer, N)
        # ``layer.size`` supplies the feature width for the closing normalization.
        self.norm = nn.LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Run ``x`` through every layer in order, passing ``memory`` and both masks, then normalize."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, memory, src_mask, tgt_mask)

        return self.norm(hidden)

In [11]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, attention over ``memory``, then feed-forward.

    Note the constructor takes ``src_attn`` before ``self_attn``.
    """

    def __init__(self, size, src_attn, self_attn, feed_forward, dropout):
        super().__init__()

        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        # One residual wrapper per step: self-attention, source attention, feed-forward.
        self.sublayers = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Apply the three sub-steps in order and return the result."""

        def attend_to_self(normed):
            return self.self_attn(normed, normed, normed, tgt_mask)

        def attend_to_memory(normed):
            # Queries come from the decoder stream, keys/values from ``memory``.
            return self.src_attn(normed, memory, memory, src_mask)

        hidden = self.sublayers[0](x, attend_to_self)
        hidden = self.sublayers[1](hidden, attend_to_memory)

        return self.sublayers[2](hidden, self.feed_forward)

## Attention Layer

In [12]:
def attention(query, key, value, mask = None, dropout = None):
    """Scaled dot-product attention.

    Args:
        query, key, value: tensors whose last two dimensions are multiplied as
            ``query @ key^T`` and ``weights @ value``.
        mask: optional tensor; positions where ``mask == 0`` get a score of
            ``-1e9`` before the softmax.
        dropout: optional callable applied to the attention weights.

    Returns:
        Tuple ``(weighted values, attention weights)``.
    """
    scale = math.sqrt(query.size(-1))
    scores = torch.matmul(query, key.transpose(-2, -1)) / scale

    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)

    weights = F.softmax(scores, dim=-1)
    if dropout is not None:
        weights = dropout(weights)

    context = torch.matmul(weights, value)
    return context, weights

In [13]:
class MultiHeadedAttention(nn.Module):
    """Multi-head attention built from four ``d_model x d_model`` linear layers.

    The first three layers project query, key and value; the last one projects
    the concatenated heads. The most recent attention weights are kept in
    ``self.attn``.
    """

    def __init__(self, heads, d_model, dropout = 0.1):
        super().__init__()

        # The model width must split evenly across the heads.
        assert d_model % heads == 0

        self.d_k = d_model // heads
        self.heads = heads

        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p = dropout)

    def forward(self, query, key, value, mask = None):
        """Project the inputs, attend per head, merge the heads and project once more."""
        if mask is not None and mask.dim() < 4:
            # Insert a head dimension so one mask is shared by every head,
            # e.g. (1, Seq_len, Seq_len) -> (1, 1, Seq_len, Seq_len).
            mask = mask.unsqueeze(1)

        batch = query.size(0)

        def split_heads(projection, tensor):
            # (batch, seq, d_model) -> (batch, heads, seq, d_k)
            projected = projection(tensor)
            return projected.view(batch, -1, self.heads, self.d_k).transpose(1, 2)

        q_heads = split_heads(self.linears[0], query)
        k_heads = split_heads(self.linears[1], key)
        v_heads = split_heads(self.linears[2], value)

        context, self.attn = attention(
            q_heads, k_heads, v_heads, mask = mask, dropout = self.dropout
        )
        # (batch, heads, seq, d_k) -> (batch, seq, heads * d_k)
        merged = context.transpose(1, 2).contiguous().view(
            batch, -1, self.heads * self.d_k
        )

        # Drop the per-head references before the final projection.
        del q_heads, k_heads, v_heads

        return self.linears[-1](merged)

## Position Wise FeedForward Layer

In [14]:
class PositionWiseFeedForward(nn.Module):
    """Two linear layers (``d_model -> d_ff -> d_model``) with ReLU and dropout in between."""

    def __init__(self, d_model, d_ff, dropout = 0.1):
        super().__init__()

        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``w_2(dropout(relu(w_1(x))))``."""
        hidden = F.relu(self.w_1(x))
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

## Embedding Layer

In [15]:
class Embeddings(nn.Module):
    """Embedding lookup whose output is multiplied by ``sqrt(d_model)``."""

    def __init__(self, d_model, vocab_size):
        super().__init__()

        self.lut = nn.Embedding(vocab_size, d_model)
        self.d_model = d_model

    def forward(self, x):
        """Look up the ids in ``x`` and scale the vectors by ``sqrt(d_model)``."""
        scale = math.sqrt(self.d_model)
        return self.lut(x) * scale

## Absolute Positional Encoding

In [16]:
class PositionalEncoder(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    Even feature indices hold sines and odd indices hold cosines of
    ``position * exp(-log(10000) * i / d_model)``. The table is precomputed for
    ``max_len`` positions and stored as the non-trainable buffer ``pe`` with
    shape ``(1, max_len, d_model)``.

    Args:
        d_model: feature width of the encodings (odd widths are supported).
        dropout: dropout probability applied to the sum.
        max_len: number of positions to precompute.
    """

    def __init__(self, d_model, dropout, max_len = 5000):
        super().__init__()

        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )
        # (max_len, ceil(d_model / 2)): one angle per position and frequency.
        angles = position * div_term

        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column,
        # so only the first d_model // 2 frequencies receive a cosine.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the encodings for the first ``x.size(1)`` positions and apply dropout.

        ``x`` is indexed as (batch, seq_len, d_model); ``seq_len`` must not
        exceed ``max_len``.
        """
        x = x + self.pe[:, :x.size(1)].requires_grad_(False)
        return self.dropout(x)

In [17]:
def make_model(
        src_vocab,
        tgt_vocab,
        N = 6,
        d_model = 512,
        d_ff = 2048,
        h = 8,
        dropout = 0.1
):
    """Assemble an ``EncoderDecoder`` Transformer from the building blocks above.

    Args:
        src_vocab: size of the source vocabulary (source embedding rows).
        tgt_vocab: size of the target vocabulary (target embedding rows and
            generator outputs).
        N: number of encoder layers and of decoder layers.
        d_model: model width.
        d_ff: hidden width of the position-wise feed-forward network.
        h: number of attention heads.
        dropout: dropout probability used by every sub-module, including the
            attention and feed-forward blocks.

    Returns:
        The model, with every parameter of more than one dimension
        re-initialised by ``nn.init.xavier_normal_``.
    """
    c = copy.deepcopy

    # Forward ``dropout`` explicitly so a non-default rate reaches these
    # blocks as well, rather than leaving them at their own defaults.
    attn = MultiHeadedAttention(h, d_model, dropout)
    ff = PositionWiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoder(d_model, dropout)

    # Each consumer gets its own deep copy so no parameters are shared.
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab),
    )

    # Only weight matrices are re-initialised; biases and LayerNorm
    # parameters (1-D) keep their defaults.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_normal_(p)

    return model

In [18]:
def causal_mask(size):
    """Return a boolean ``(1, size, size)`` mask that is True on and below the diagonal.

    Entry ``[0, i, j]`` is True exactly when ``j <= i``.
    """
    lower_triangle = torch.tril(torch.ones(1, size, size))
    return lower_triangle.bool()

In [19]:
# # test
# model = make_model(100, 200)

# src = torch.tensor([[1,2,3,4], [5,6,7,8]])
# tgt = torch.tensor([[11,22,33,44], [55,66,77,88]])

# src_mask = None
# tgt_mask = causal_mask(4)

# out = model(src, tgt, src_mask, tgt_mask)
# pred = model.generator(out)
# print(pred.shape)


# Now let's train the Transformer model on real data

In [20]:
class BilingualDataset(Dataset):
    """Turn translation pairs into fixed-length tensors for an encoder-decoder model.

    Each item of ``ds`` is read as ``item["translation"][lang]``. Every sample
    is padded to exactly ``seq_len`` tokens:

    * encoder input: ``[SOS] + source ids + [EOS] + padding``
    * decoder input: ``[SOS] + target ids + padding``
    * label:         ``target ids + [EOS] + padding``
    """

    def __init__(
            self,
            ds,
            tokenizer_src,
            tokenizer_tgt,
            src_lang,
            tgt_lang,
            seq_len
    ):
        super().__init__()

        self.seq_len = seq_len
        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # NOTE(review): the special-token ids come from the target tokenizer and
        # are reused for the encoder input too; this presumes both tokenizers
        # assign the same ids to [SOS]/[EOS]/[PAD] - confirm.
        self.sos_token_id = torch.tensor(
            [self.tokenizer_tgt.token_to_id("[SOS]")], dtype = torch.int64
        )
        self.eos_token_id = torch.tensor(
            [self.tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64
        )
        self.pad_token_id = torch.tensor(
            [self.tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64
        )

    def __len__(self):
        return len(self.ds)

    def _padding(self, count):
        """Return a 1-D int64 tensor of ``count`` padding ids (empty when ``count`` is 0)."""
        return torch.full((count,), self.pad_token_id.item(), dtype=torch.int64)

    def __getitem__(self, idx):
        """Build the tensors for sample ``idx``.

        Returns:
            dict with keys "encoder_input", "decoder_input", "label" (each of
            length ``seq_len``), "encoder_mask" (1, 1, seq_len), "decoder_mask"
            (1, seq_len, seq_len), and the raw "src_text" / "tgt_text".

        Raises:
            ValueError: if either sentence does not fit into ``seq_len`` once
                the special tokens are added.
        """
        # Raw source / target sentences.
        src_target_pair = self.ds[idx]
        src_text = src_target_pair["translation"][self.src_lang]
        tgt_text = src_target_pair["translation"][self.tgt_lang]

        # Token ids.
        enc_tokens = torch.tensor(
            self.tokenizer_src.encode(src_text).ids, dtype=torch.int64
        )
        dec_tokens = torch.tensor(
            self.tokenizer_tgt.encode(tgt_text).ids, dtype=torch.int64
        )

        # The encoder side carries both [SOS] and [EOS]; the decoder input and
        # the label each carry only one special token.
        enc_num_padding_tokens = self.seq_len - enc_tokens.numel() - 2
        dec_num_padding_tokens = self.seq_len - dec_tokens.numel() - 1

        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError(
                f"The Sentence is too long for the model. Max length is {self.seq_len - 2} tokens."
            )

        enc_padding = self._padding(enc_num_padding_tokens)
        dec_padding = self._padding(dec_num_padding_tokens)

        encoder_input = torch.cat(
            [self.sos_token_id, enc_tokens, self.eos_token_id, enc_padding], dim=0
        )
        decoder_input = torch.cat(
            [self.sos_token_id, dec_tokens, dec_padding], dim=0
        )
        # The label is the decoder input shifted left by one position.
        label = torch.cat(
            [dec_tokens, self.eos_token_id, dec_padding], dim=0
        )

        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "encoder_input": encoder_input,
            "decoder_input": decoder_input,
            # 1 for real tokens, 0 for padding: (1, 1, seq_len)
            "encoder_mask": (encoder_input != self.pad_token_id).unsqueeze(0).unsqueeze(0).int(),
            # Padding mask combined with the causal mask:
            # (1, seq_len) & (1, seq_len, seq_len)
            "decoder_mask": (decoder_input != self.pad_token_id).unsqueeze(0).int()
            & causal_mask(decoder_input.size(0)),
            "label": label,  # (seq_len)
            "src_text": src_text,
            "tgt_text": tgt_text,
        }

## Tokenization

In [21]:
def get_all_senteces(ds, lang):
    """Lazily yield ``example["translation"][lang]`` for every example in ``ds``."""
    yield from (row["translation"][lang] for row in ds)

In [22]:
def get_or_build_tokenizer(config, ds, lang):
    """Load the word-level tokenizer for ``lang`` from disk, or train and save it.

    The file path is ``config.DATA.TOKENIZER_FILE`` formatted with ``lang``.

    Args:
        config: configuration object exposing ``DATA.TOKENIZER_FILE``.
        ds: dataset whose sentences are used when a tokenizer has to be trained.
        lang: language code selecting both the sentences and the file name.

    Returns:
        The loaded or newly trained tokenizer.
    """
    tokenizer_path = Path(config.DATA.TOKENIZER_FILE.format(lang))

    # Fast path: reuse a tokenizer saved by an earlier run.
    if tokenizer_path.exists():
        print("Loaded tokenizer from file")
        return Tokenizer.from_file(str(tokenizer_path))

    # Otherwise train on whitespace-split words; words seen fewer than twice
    # are not added to the vocabulary.
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = WordLevelTrainer(
        special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
        min_frequency=2,
    )
    tokenizer.train_from_iterator(get_all_senteces(ds, lang), trainer)
    tokenizer.save(str(tokenizer_path))

    return tokenizer


# Get training and validation dataloader

In [23]:
def get_dataloader(config):
    """Load the parallel corpus and build tokenizers plus train/validation loaders.

    Args:
        config: configuration object; reads ``DATA.DATASOURCE``,
            ``DATA.LANG_SRC``, ``DATA.LANG_TGT``, ``MODEL.SEQ_LEN`` and
            ``TRAINING.BATCH_SIZE``.

    Returns:
        dict with the keys, in this order, "train_dataloader",
        "val_dataloader", "tokenizer_src", "tokenizer_tgt".
    """
    src_lang = config.DATA.LANG_SRC
    tgt_lang = config.DATA.LANG_TGT

    ds_raw = load_dataset(
        f"{config.DATA.DATASOURCE}",
        f"{src_lang}-{tgt_lang}",
        split="train",
    )

    tokenizer_src = get_or_build_tokenizer(config, ds_raw, src_lang)
    tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, tgt_lang)

    # 90% of the pairs go to training, the remainder to validation.
    n_total = len(ds_raw)
    n_train = int(0.9 * n_total)
    n_val = n_total - n_train
    train_ds_raw, val_ds_raw = random_split(ds_raw, [n_train, n_val])

    def as_bilingual(subset):
        # Both splits share the tokenizers, languages and sequence length.
        return BilingualDataset(
            subset,
            tokenizer_src,
            tokenizer_tgt,
            src_lang,
            tgt_lang,
            config.MODEL.SEQ_LEN,
        )

    train_ds = as_bilingual(train_ds_raw)
    val_ds = as_bilingual(val_ds_raw)

    # Report the longest tokenized sentence on each side of the full corpus.
    longest_src = 0
    longest_tgt = 0
    for item in ds_raw:
        pair = item["translation"]
        longest_src = max(longest_src, len(tokenizer_src.encode(pair[src_lang]).ids))
        longest_tgt = max(longest_tgt, len(tokenizer_tgt.encode(pair[tgt_lang]).ids))

    print(f"Max length of source sentence: {longest_src}")
    print(f"Max length of target sentence: {longest_tgt}")

    train_dataloader = DataLoader(
        train_ds, batch_size=config.TRAINING.BATCH_SIZE, shuffle=True
    )
    # Validation uses one sentence per batch and keeps the dataset order.
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=False)

    return {
        "train_dataloader": train_dataloader,
        "val_dataloader": val_dataloader,
        "tokenizer_src": tokenizer_src,
        "tokenizer_tgt": tokenizer_tgt,
    }


## Test Dataloader

# Define Decoding Methods

## Greedy Decoding

In [24]:
def greedy_decode(
        model,
        source,
        source_mask,
        tokenizer_tgt,
        device,
        max_len = 100,
):
    """Decode one source sentence by always taking the most likely next token.

    Args:
        model: object exposing ``encode``, ``decode`` and ``generator``.
        source: encoder input ids for a single sentence (batch size 1).
        source_mask: mask passed to the encoder and to the decoder's attention
            over the encoder output.
        tokenizer_tgt: target tokenizer; supplies the "[SOS]" and "[EOS]" ids.
        device: device on which the decoder input is built.
        max_len: maximum length of the produced sequence, "[SOS]" included.

    Returns:
        1-D tensor of token ids starting with "[SOS]"; it ends with "[EOS]"
        unless ``max_len`` was reached first.
    """
    sos_idx = tokenizer_tgt.token_to_id("[SOS]")
    eos_idx = tokenizer_tgt.token_to_id("[EOS]")

    # Pre-compute the encoder output and reuse it for every step
    memory = model.encode(source, source_mask)
    decoder_input = torch.full((1, 1), sos_idx, dtype=source.dtype, device=device)

    while decoder_input.size(1) < max_len:
        # Build mask for target
        decoder_mask = causal_mask(decoder_input.size(1)).type_as(source_mask).to(device)

        # Only the last position predicts the next token.
        out = model.decode(memory, source_mask, decoder_input, decoder_mask)
        log_probs = model.generator(out[:, -1])
        next_word = log_probs.argmax(dim=-1)

        decoder_input = torch.cat([
            decoder_input,
            next_word.view(1, 1).to(dtype=decoder_input.dtype, device=device),
        ], dim = 1)

        if next_word.item() == eos_idx:
            break

    return decoder_input.squeeze(0)

In [25]:
def beam_search_decode(
    model,
    beam_size,
    source,
    source_mask,
    tokenizer_tgt,
    device,
    max_len=100,
):
    """Decode one source sentence with beam search over summed log-probabilities.

    Args:
        model: object exposing ``encode``, ``decode`` and ``generator``
            (the generator output is treated as log-probabilities).
        beam_size: number of hypotheses kept after every step (must be >= 1).
        source: encoder input ids for a single sentence (batch size 1).
        source_mask: mask passed to the encoder and to the decoder's attention
            over the encoder output.
        tokenizer_tgt: target tokenizer; supplies the "[SOS]" and "[EOS]" ids.
        device: device on which the decoder input is built.
        max_len: decoding stops once any hypothesis reaches this length.

    Returns:
        1-D tensor with the token ids of the best-scoring hypothesis.

    Raises:
        ValueError: if ``beam_size`` is smaller than 1.
    """
    if beam_size < 1:
        raise ValueError("beam_size must be at least 1")

    sos_idx = tokenizer_tgt.token_to_id("[SOS]")
    eos_idx = tokenizer_tgt.token_to_id("[EOS]")

    memory = model.encode(source, source_mask)
    decoder_initial_input = torch.full(
        (1, 1), sos_idx, dtype=source.dtype, device=device
    )

    # Each entry is (sequence of shape (1, length), cumulative log-probability).
    candidates = [(decoder_initial_input, 0.0)]

    while True:
        if any(cand.size(1) >= max_len for cand, _ in candidates):
            break
        if all(cand[0, -1].item() == eos_idx for cand, _ in candidates):
            break

        new_candidates = []
        for candidate, score in candidates:
            if candidate[0, -1].item() == eos_idx:
                # Finished hypotheses stay in the beam unchanged so they can
                # still win against longer ones.
                new_candidates.append((candidate, score))
                continue

            candidate_mask = causal_mask(candidate.size(1)).type_as(source_mask).to(device)

            # Only the last position predicts the next token.
            out = model.decode(memory, source_mask, candidate, candidate_mask)
            log_probs = model.generator(out[:, -1])

            # Never ask for more entries than the vocabulary holds.
            k = min(beam_size, log_probs.size(-1))
            topk_log_prob, topk_idx = torch.topk(log_probs, k, dim=-1)

            for i in range(k):
                token = topk_idx[0, i].view(1, 1).to(
                    dtype=candidate.dtype, device=candidate.device
                )
                new_candidates.append((
                    torch.cat([candidate, token], dim=1),
                    score + topk_log_prob[0, i].item(),
                ))

        # Keep the ``beam_size`` best hypotheses.
        new_candidates.sort(key=lambda entry: entry[1], reverse=True)
        candidates = new_candidates[:beam_size]

    return candidates[0][0].squeeze(0)

# Next, let's define the training process

In [26]:
# Look each entry up by key so the assignment does not depend on the order in
# which get_dataloader happens to build its result dictionary.
data_bundle = get_dataloader(config)
train_dataloader = data_bundle["train_dataloader"]
val_dataloader = data_bundle["val_dataloader"]
tokenizer_src = data_bundle["tokenizer_src"]
tokenizer_tgt = data_bundle["tokenizer_tgt"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loaded tokenizer from file
Loaded tokenizer from file
Max length of source sentence: 309
Max length of target sentence: 274


In [27]:
# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

In [29]:
@torch.no_grad()
def valida_one_epoch(
        model,
        val_dataloader,
        loss_fn,
        device,
        toeknizer_tgt,
        max_len = 100,
        writer = None,
        global_step = 0
):
    """Greedy-decode every validation sentence, print it and log text metrics.

    Args:
        model: model to evaluate; switched to eval mode here.
        val_dataloader: loader that must yield batches of size 1.
        loss_fn: accepted for signature compatibility; not used.
        device: device the batch tensors are moved to.
        toeknizer_tgt: target tokenizer used for decoding ids back to text.
        max_len: maximum decoded length passed to ``greedy_decode``.
        writer: optional summary writer; metrics are computed only when given.
        global_step: step value attached to the logged scalars.
    """
    model.eval()

    source_texts, expected, predicted = [], [], []

    for batch in val_dataloader:
        encoder_input = batch["encoder_input"].to(device) # (b, seq_len)
        encoder_mask = batch["encoder_mask"].to(device) # (b, 1, 1, seq_len)

        assert encoder_input.size(0) == 1, "Batch size must be 1 for validation"

        decoded_ids = greedy_decode(
            model,
            encoder_input,
            encoder_mask,
            toeknizer_tgt,
            device,
            max_len,
        )

        source_text = batch["src_text"][0]
        target_text = batch["tgt_text"][0]
        model_out_text = toeknizer_tgt.decode(decoded_ids.detach().cpu().numpy())

        source_texts.append(source_text)
        expected.append(target_text)
        predicted.append(model_out_text)

        print(f"{f'SOURCE: ':>12}{source_text}")
        print(f"{f'TARGET: ':>12}{target_text}")
        print(f"{f'PREDICTED: ':>12}{model_out_text}")

    if writer is None:
        return

    # Character error rate, word error rate and BLEU, logged in that order.
    scalar_metrics = (
        ('validation cer', torchmetrics.CharErrorRate),
        ('validation wer', torchmetrics.WordErrorRate),
        ('validation BLEU', torchmetrics.BLEUScore),
    )
    for tag, metric_cls in scalar_metrics:
        metric = metric_cls()
        value = metric(predicted, expected)
        writer.add_scalar(tag, value, global_step)
        writer.flush()

In [30]:
def train_one_epoch(
        model,
        train_dataloader,
        optimizer,
        loss_fn,
        cur_epoch,
        tokenizer_tgt,
):
    """Run one optimisation pass over ``train_dataloader``.

    Tensors are moved to the notebook-level ``device``.

    Args:
        model: model exposing ``encode``, ``decode`` and ``generator``.
        train_dataloader: loader yielding dict batches with the keys
            'encoder_input', 'decoder_input', 'encoder_mask', 'decoder_mask'
            and 'label'.
        optimizer: optimizer stepped once per batch.
        loss_fn: loss called with (flattened predictions, flattened labels).
        cur_epoch: epoch index shown in the progress bar.
        tokenizer_tgt: target tokenizer; its vocabulary size is used to
            flatten the predictions.

    Returns:
        Number of batches processed.
    """
    model.train()

    progress = tqdm(train_dataloader, desc=f"Processing Epoch {cur_epoch:02d}")
    steps = 0
    for steps, batch in enumerate(progress, start=1):
        encoder_input = batch['encoder_input'].to(device) # (b, seq_len)
        decoder_input = batch['decoder_input'].to(device) # (B, seq_len)
        encoder_mask = batch['encoder_mask'].to(device) # (B, 1, 1, seq_len)
        decoder_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)

        # Encoder -> decoder -> projection onto the vocabulary.
        memory = model.encode(encoder_input, encoder_mask) # (B, seq_len, d_model)
        decoded = model.decode(memory, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
        predictions = model.generator(decoded) # (B, seq_len, vocab_size)

        # Flatten batch and sequence dimensions before comparing with the labels.
        label = batch['label'].to(device) # (B, seq_len)
        flat_predictions = predictions.view(-1, tokenizer_tgt.get_vocab_size())
        loss = loss_fn(flat_predictions, label.view(-1))
        progress.set_postfix({"loss": f"{loss.item():6.3f}"})

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return steps

In [31]:
def train_model(
    model,
    loss_fn,
    optimizer,
    train_dataloader,
    val_dataloader,
    config
):
    """Train for ``config.TRAINING.NUM_EPOCHS`` epochs, validating after each one.

    A checkpoint is written every fifth epoch (0, 5, 10, ...) and after the
    final epoch, so the last weights are always saved. The summary writer is
    closed when training ends, including when it ends with an exception.

    Reads the notebook-level ``tokenizer_tgt`` and ``device``.

    Args:
        model: model to train.
        loss_fn: loss passed to the training and validation helpers.
        optimizer: optimizer passed to the training helper and the checkpoint.
        train_dataloader: training batches.
        val_dataloader: validation batches.
        config: configuration object; reads ``EXPERIMENT.NAME`` and
            ``TRAINING.NUM_EPOCHS``.
    """
    writer = SummaryWriter(config.EXPERIMENT.NAME)
    num_epochs = config.TRAINING.NUM_EPOCHS
    global_step = 0

    try:
        for epoch in range(num_epochs):
            torch.cuda.empty_cache()

            global_step += train_one_epoch(
                model,
                train_dataloader,
                optimizer,
                loss_fn,
                epoch,
                tokenizer_tgt
            )

            valida_one_epoch(
                model,
                val_dataloader,
                loss_fn,
                device,
                tokenizer_tgt,
                writer = writer,
                global_step = global_step
            )

            is_last_epoch = epoch == num_epochs - 1
            if epoch % 5 == 0 or is_last_epoch:
                save_checkpoint(
                    model,
                    optimizer,
                    epoch,
                    config
                )
    finally:
        # Flush pending events and release the log file.
        writer.close()

In [None]:
model = make_model(
    tokenizer_src.get_vocab_size(),
    tokenizer_tgt.get_vocab_size(),
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = config.TRAINING.LEARNING_RATE, eps=1e-9)
# The labels are target-language ids, so the padding id to ignore is taken
# from the target tokenizer.
loss_fn = nn.CrossEntropyLoss(
    ignore_index=tokenizer_tgt.token_to_id("[PAD]"),
    label_smoothing=0.1
).to(device)

train_model(
    model,
    loss_fn ,
    optimizer,
    train_dataloader,
    val_dataloader,
    config
)

Processing Epoch 00:   0%|          | 3/3638 [01:27<29:14:55, 28.97s/it, loss=9.747]

# Let's do some testing!

In [None]:
# # Ensure TensorBoard extension is loaded
# %load_ext tensorboard
# # Start TensorBoard and point it to the log directory
# %tensorboard --logdir=runs