The below cell has the Super Attention Layer that has a single W_a matrix managed by the Multi Head Attention Module. Run it if you want to use that version of Super Attention (all other attention layers are the same)

In [6]:
import math
import torch
from torch import nn

class AttentionLayer(nn.Module):
    def __init__(self,
                 d_model : int,
                 d_q : int,
                 d_k : int,
                 d_v : int,
                 W_a : nn.Linear = None,
                 layer_type : str = 'SDPA',
                 idx : int = 0,
                 max_len : int = 32):
        super().__init__()
        self.d_model    = d_model
        self.d_q        = d_q
        self.d_k        = d_k
        self.d_v        = d_v
        self.layer_type = layer_type
        self.idx        = idx
        self.max_len    = max_len
        self.W_a        = W_a
        self._set_layer_type()

    def _set_layer_type(self):
        self.softmax = nn.Softmax(dim = 1)
        self.W_q     = nn.Linear(self.d_model,self.d_q)
        nn.init.xavier_uniform_(self.W_q.weight)
        nn.init.constant_(self.W_q.bias, 0)
        if self.layer_type == 'Optimised':
            self.W_k     = nn.Linear(self.d_model,self.d_k)
            nn.init.xavier_uniform_(self.W_k.weight)
            nn.init.constant_(self.W_k.bias, 0)
            self.forward = self._forward_optimised
        elif self.layer_type == 'Efficient':
            self.forward = self._forward_efficient
        elif self.layer_type == 'Super':
            self.forward = self._forward_super
        else:
            # Default to SDPA
            self.W_k     = nn.Linear(self.d_model,self.d_k)
            self.W_v     = nn.Linear(self.d_model,self.d_v)
            nn.init.xavier_uniform_(self.W_k.weight)
            nn.init.constant_(self.W_k.bias, 0)
            nn.init.xavier_uniform_(self.W_v.weight)
            nn.init.constant_(self.W_v.bias, 0)
            self.forward = self._forward_SDPA

    def _forward_SDPA(self, inp_q, inp_k, inp_v):
        Q     = self.W_q(inp_q)
        K     = self.W_k(inp_k)
        V     = self.W_v(inp_v)
        K_t   = K.permute(0,2,1)
        S     = self.softmax((Q @ K_t) / math.sqrt(self.d_q))
        H     = S @ V
        return H

    def _forward_optimised(self, inp_q : torch.Tensor, inp_k : torch.Tensor, inp_v : torch.Tensor):
        Q     = self.W_q(inp_q)
        K     = self.W_k(inp_k)
        K_t   = K.permute(0,2,1)
        S     = self.softmax((Q @ K_t) / math.sqrt(self.d_q))
        v_lo  = ((self.idx) * self.d_v)
        v_hi  = ((self.idx + 1) * self.d_v)
        V     = inp_v[:,:, v_lo : v_hi]
        H     = S @ V
        return H

    def _forward_efficient(self, inp_q : torch.Tensor, inp_k : torch.Tensor, inp_v : torch.Tensor):
        Q     = self.W_q(inp_q)
        lo    = ((self.idx) * self.d_k)
        hi    = ((self.idx + 1) * self.d_k)
        K_t   = inp_k[:, :, lo : hi].permute(0,2,1)
        S     = self.softmax((Q @ K_t) / math.sqrt(self.d_q))
        V     = inp_v[:,:, lo : hi]
        H     = S @ V
        return H

    def _forward_super(self, inp_q : torch.Tensor, inp_k : torch.Tensor, inp_v : torch.Tensor):
        Q     = self.W_q(inp_q)
        lo    = ((self.idx) * self.d_k)
        hi    = ((self.idx + 1) * self.d_k)
        K_t   = inp_k[:, :, lo : hi].permute(0,2,1)
        S     = self.softmax((Q @ K_t) / math.sqrt(self.d_q))
        V     = self.W_a(inp_v[:,:, lo : hi].permute(0,2,1)).permute(0,2,1)
        H     = S @ V
        return H

class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, d_model, d_k, d_v, max_len, layer_type):
        super().__init__()
        self.layers  = nn.Sequential()
        self.n_heads = n_heads
        self.d_model = d_model
        self.d_k     = d_k
        self.d_v     = d_v
        self.W_a     = None
        if layer_type == 'Super':
            self.W_a = nn.Linear(max_len,max_len)
            nn.init.xavier_uniform_(self.W_a.weight)
            nn.init.constant_(self.W_a.bias, 0)
        for i in range(n_heads):
            self.layers.add_module("Attention_Layer "+str(i),
                                   AttentionLayer(d_model,d_k,d_k,d_v,self.W_a,layer_type,i,max_len))
        self.W_o     = nn.Linear(n_heads * d_v, d_model)

    def forward(self, inp_q, inp_k, inp_v):
        for i, layer in enumerate(self.layers):
            if i == 0:
                H = layer(inp_q,inp_k,inp_v)
            else:
                h_i = layer(inp_q,inp_k,inp_v)
                h_cat = (H.clone(),h_i)
                H = torch.cat(h_cat,2)
        out = self.W_o(H)
        return out

The below cell has the Super Attention Layer that has a single W_a matrix FOR EACH Super Attention Layer, independent of the Multi Head Attention Module. Run it if you want to use that version of Super Attention (all other attention layers are the same)

In [2]:
import math
import torch
from torch import nn

class AttentionLayer(nn.Module):
    def __init__(self,
                 d_model : int,
                 d_q : int,
                 d_k : int,
                 d_v : int,
                 layer_type : str = 'SDPA',
                 idx : int = 0,
                 max_len : int = 32):
        super().__init__()
        self.d_model    = d_model
        self.d_q        = d_q
        self.d_k        = d_k
        self.d_v        = d_v
        self.layer_type = layer_type
        self.idx        = idx
        self.max_len    = max_len
        self._set_layer_type()


    def _set_layer_type(self):
        self.softmax = nn.Softmax(dim = 1)
        self.W_q     = nn.Linear(self.d_model,self.d_q)
        nn.init.xavier_uniform_(self.W_q.weight)
        nn.init.constant_(self.W_q.bias, 0)
        if self.layer_type == 'Optimised':
            self.W_k     = nn.Linear(self.d_model,self.d_k)
            nn.init.xavier_uniform_(self.W_k.weight)
            nn.init.constant_(self.W_k.bias, 0)
            self.forward = self._forward_optimised
        elif self.layer_type == 'Efficient':
            self.forward = self._forward_efficient
        elif self.layer_type == 'Super':
            self.forward = self._forward_super
            self.W_a     = nn.Linear(self.max_len,self.max_len)
            nn.init.xavier_uniform_(self.W_a.weight)
            nn.init.constant_(self.W_a.bias, 0)
        else:
            # Default to SDPA
            self.W_k     = nn.Linear(self.d_model,self.d_k)
            self.W_v     = nn.Linear(self.d_model,self.d_v)
            nn.init.xavier_uniform_(self.W_k.weight)
            nn.init.constant_(self.W_k.bias, 0)
            nn.init.xavier_uniform_(self.W_v.weight)
            nn.init.constant_(self.W_v.bias, 0)
            self.forward = self._forward_SDPA

    def _forward_SDPA(self, inp_q, inp_k, inp_v):
        Q     = self.W_q(inp_q)
        K     = self.W_k(inp_k)
        V     = self.W_v(inp_v)
        K_t   = K.permute(0,2,1)
        S     = self.softmax((Q @ K_t) / math.sqrt(self.d_q))
        H     = S @ V
        return H

    def _forward_optimised(self, inp_q : torch.Tensor, inp_k : torch.Tensor, inp_v : torch.Tensor):
        Q     = self.W_q(inp_q)
        K     = self.W_k(inp_k)
        K_t   = K.permute(0,2,1)
        S     = self.softmax((Q @ K_t) / math.sqrt(self.d_q))
        v_lo  = ((self.idx) * self.d_v)
        v_hi  = ((self.idx + 1) * self.d_v)
        V     = inp_v[:,:, v_lo : v_hi]
        H     = S @ V
        return H

    def _forward_efficient(self, inp_q : torch.Tensor, inp_k : torch.Tensor, inp_v : torch.Tensor):
        Q     = self.W_q(inp_q)
        lo    = ((self.idx) * self.d_k)
        hi    = ((self.idx + 1) * self.d_k)
        K_t   = inp_k[:, :, lo : hi].permute(0,2,1)
        S     = self.softmax((Q @ K_t) / math.sqrt(self.d_q))
        V     = inp_v[:,:, lo : hi]
        H     = S @ V
        return H

    def _forward_super(self, inp_q : torch.Tensor, inp_k : torch.Tensor, inp_v : torch.Tensor):
        Q     = self.W_q(inp_q)
        lo    = ((self.idx) * self.d_k)
        hi    = ((self.idx + 1) * self.d_k)
        K_t   = inp_k[:, :, lo : hi].permute(0,2,1)
        S     = self.softmax((Q @ K_t) / math.sqrt(self.d_q))
        V     = self.W_a(inp_v[:,:, lo : hi].permute(0,2,1)).permute(0,2,1)
        H     = S @ V
        return H


class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, d_model, d_k, d_v, max_len, layer_type):
        super().__init__()
        self.layers  = nn.Sequential()
        self.n_heads = n_heads
        self.d_model = d_model
        self.d_k     = d_k
        self.d_v     = d_v
        for i in range(n_heads):
            self.layers.add_module("Attention_Layer "+str(i),
                                   AttentionLayer(d_model,d_k,d_k,d_v,layer_type,i,max_len))
        self.W_o     = nn.Linear(n_heads * d_v, d_model)

    def forward(self, inp_q, inp_k, inp_v):
        for i, layer in enumerate(self.layers):
            if i == 0:
                H = layer(inp_q,inp_k,inp_v)
            else:
                h_i = layer(inp_q,inp_k,inp_v)
                h_cat = (H.clone(),h_i)
                H = torch.cat(h_cat,2)
        out = self.W_o(H)
        return out

The next cell initializes the Dataloader and runs the main script. Remember to download the IMDB_dataset.csv file from the github repo and either upload it to google colab or to have it in the same working directory locally.

In [None]:
"""
## "You Need to Pay Better Attention" Pytorch Transformer Example

## Paper Link: https://arxiv.org/abs/2403.01643

## Author: Nicholas Mesa-Cucalon (https://github.com/NMesaC)
"""
import re
import math
import torch
import time
import os

import pandas as pd

from torch import nn
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from typing import Dict, List, Tuple

# Set device since some classes need info
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(1019)

"""
## Dataloader
"""
class IMDBDataset(Dataset):
    def __init__(self, csv_path: str, vocab_size: int = 10000, max_length: int = 200):
        self.data = pd.read_csv(csv_path)
        self.max_length = max_length
        self.vocab_size = vocab_size

        self.preprocess_data()
        self.build_vocabulary()
        self.tokenize_reviews()

    def preprocess_data(self):
        self.data['review'] = self.data['review'].apply(self.clean_text)
        self.data['sentiment'] = self.data['sentiment'].map({"positive": 1, "negative": 0})

    def clean_text(self, text: str) -> str:
        text = text.lower()
        text = re.sub(r"<br\s*/?>", " ", text)
        text = re.sub(r"[^a-z0-9\s]", "", text)
        return text.strip()

    def build_vocabulary(self):
        word_freq = Counter()
        for review in self.data['review']:
            word_freq.update(review.split())

        special_tokens = ['<PAD>', '<SOS>']
        common_words = [word for word, _ in word_freq.most_common(self.vocab_size - len(special_tokens))]
        self.vocab = {word: idx for idx, word in enumerate(special_tokens + common_words)}

    def tokenize_reviews(self):
        self.tokenized_reviews = []
        for review in self.data['review']:
            tokens = [self.vocab['<SOS>']]
            tokens.extend([self.vocab.get(word, self.vocab['<PAD>']) for word in review.split()[:self.max_length-1]])
            self.tokenized_reviews.append(torch.tensor(tokens))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, int]:
        return self.tokenized_reviews[idx], self.data['sentiment'].iloc[idx]

def collate_imdb(batch: List[Tuple[torch.Tensor, int]]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    reviews, sentiments = zip(*batch)
    padded_reviews = pad_sequence(reviews, batch_first=True, padding_value=0)
    lengths = torch.tensor([len(review) for review in reviews])
    sentiments = torch.tensor(sentiments, dtype=torch.float32)
    return padded_reviews, lengths, sentiments

def get_dataloader(csv_path: str, vocab_size: int, max_length: int, batch_size: int, val_split : float) -> Tuple[DataLoader, DataLoader, Dict[str, int]]:
    dataset = IMDBDataset(csv_path, vocab_size, max_length)
    train_size = 40000
    val_size   = (int)(train_size * val_split)
    train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, val_size, len(dataset) - train_size - val_size])
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,  collate_fn=collate_imdb)
    val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False, collate_fn=collate_imdb)
    test_loader  = DataLoader(test_dataset,  batch_size=batch_size, shuffle=False, collate_fn=collate_imdb)
    return train_loader, val_loader, test_loader


"""
## Transformer Block Module
"""
class TransformerBlock(nn.Module):
    def __init__(self, d_model, num_heads, ff_dim, layer_type = 'SDPA', max_len = 32, dropout_rate=0.1):
        super().__init__()
        d_k, d_v = d_model // num_heads
        self.att = MultiHeadAttention(num_heads, d_model, d_k, d_v, max_len, layer_type)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, d_model)
        )
        self.layernorm1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layernorm2 = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)

    def forward(self, x):
        attn_output, _ = self.att(x, x, x)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(x + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

"""
## Embedding Layer
"""
class TokenAndPositionEmbedding(nn.Module):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.maxlen = maxlen
        self.token_emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.pos_emb = nn.Embedding(num_embeddings=maxlen, embedding_dim=embed_dim)

    def forward(self, x):
        positions = torch.arange(self.maxlen, device=x.device)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions.unsqueeze(0)

"""
## Transformer-Encoder-Only Arch
"""
class EncoderLayer(nn.Module):
    def __init__(self, n_heads, d_model, d_k, d_v, d_lin, max_len, layer_type, drop_p):
        super().__init__()
        self.multi      = MultiHeadAttention(n_heads,d_model,d_k,d_v,max_len,layer_type)
        self.ff         = nn.Sequential(
                            nn.Linear(d_model,d_lin),
                            nn.ReLU(),
                            nn.Linear(d_lin,d_model)
                         )
        self.norm_multi = nn.LayerNorm(d_model, eps=1e-6)
        self.norm_ff    = nn.LayerNorm(d_model, eps=1e-6)
        self.drop_multi = nn.Dropout(drop_p)
        self.drop_ff    = nn.Dropout(drop_p)
    def forward(self, inp):
        multi = self.multi(inp,inp,inp)
        multi = self.drop_multi(multi)
        z     = self.norm_multi(inp + multi)
        ff    = self.ff(z)
        ff    = self.drop_ff(ff)
        return self.norm_ff(z + ff)

class Encoder(nn.Module):
    def __init__(self, n_heads, d_model, d_k, d_v, d_lin, n_layers, vocab_size, max_len, layer_type, drop_p):
        super().__init__()
        self.encoder_layers  = nn.Sequential()
        self.embedding       = TokenAndPositionEmbedding(max_len,vocab_size,d_model)
        self.n_layers        = n_layers
        for i in range(n_layers):
            self.encoder_layers.add_module("Encoder_Layer"+str(i),EncoderLayer(n_heads,
                                                                               d_model,
                                                                               d_k,
                                                                               d_v,
                                                                               d_lin,
                                                                               max_len,
                                                                               layer_type,
                                                                               drop_p))
    def forward(self, inp):
        embed_i = self.embedding(inp)
        for layer in self.encoder_layers:
            embed_i     = layer(embed_i)
        return embed_i

class Transformer(nn.Module):
    def __init__(self,
                 vocab_size = 20000,
                 n_heads    = 4,
                 d_model    = 32,
                 d_k        = 8,
                 d_v        = 8,
                 d_lin      = 32,
                 n_layers   = 1,
                 max_len    = 32,
                 layer_type = 'SDPA',
                 drop_p     = 0.1):
        super().__init__()
        d_k = d_model // n_heads
        d_v = d_k
        self.encoder         = Encoder(n_heads,d_model,d_k,d_v,d_lin,n_layers,vocab_size,max_len,layer_type,drop_p)
        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)
        self.dropout1        = nn.Dropout(0.1)
        self.dense1          = nn.Linear(d_model, 6)
        self.dropout2        = nn.Dropout(0.1)
        self.dense2          = nn.Linear(6, 1)

    def forward(self, inp):
        x = self.encoder(inp)
        x = x.transpose(1,2)
        x = self.global_avg_pool(x).squeeze(2)
        x = self.dropout1(x)
        x = torch.relu(self.dense1(x))
        x = self.dropout2(x)
        res = self.dense2(x)
        return res

"""
## Training Loop
"""
def train_loop(device, model, optim, loader, loss_func, epoch, train=False):
    model.train(mode=train)
    total_loss    = 0
    total_correct = 0
    n_samples     = 0
    label         = 'Training' if train else 'Test'
    for reviews, _, labels in tqdm(loader, desc=f'{label} Epoch: {epoch}'):
        reviews = reviews.to(device)
        labels = labels.to(device)
        #Forward
        if train:
            optim.zero_grad()
            logits = model(reviews)
            loss = loss_func(logits, labels.reshape(logits.shape))
            loss.backward()
            optim.step()
        else:
            with torch.no_grad():
                logits = model(reviews)
                loss   = loss_func(logits, labels.reshape(logits.shape))
        #Predictions
        preds = (logits > 0.5).float()
        #Compute accuracy
        acc = torch.sum(preds == labels.reshape(logits.shape))
        #Track stats
        total_loss += reviews.shape[0] * loss
        n_samples += reviews.shape[0]
        total_correct += acc
    return total_loss / n_samples, total_correct / n_samples

"""
## Checkpoint Callbacks
"""
def save_checkpoint(model, optimizer, epoch, best_metric, filename):
    state = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch,
        'best_metric': best_metric
    }
    torch.save(state, filename)

def load_checkpoint(model, optimizer, filename):
    checkpoint = torch.load(filename)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    best_metric = checkpoint['best_metric']
    return model, optimizer, epoch, best_metric

def count_parameters(model):
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            continue
        if ("attention" in name.lower()) or ("w_o" in name.lower()):
            params = parameter.numel()
            total_params += params
    return total_params

def main():
    # Initialize hyperparameters
    vocab_size        = 20000
    batch_size        = 64
    d_model           = 32
    ff_dim            = 32
    max_len           = 32
    num_epochs        = 10
    num_runs          = 5
    n_heads           = 4
    n_layers          = 1
    drop_p            = 0.1
    val_split         = 0.1
    layers            = ['SDPA','Optimised', 'Efficient', 'Super']

    # Load data
    train_dataloader, val_dataloader, test_dataloader = get_dataloader("./IMDB_Dataset.csv",
                                                                       vocab_size,
                                                                       max_len,
                                                                       batch_size,
                                                                       val_split)
    for layer_type in layers:
        avg_train_loss  = 0
        avg_train_acc   = 0
        avg_test_loss   = 0
        avg_test_acc    = 0
        avg_model_size  = 0
        num_params      = 0
        run_times       = []
        for _ in range(num_runs):
            # Initialize model, optimizer, and criterion and train/test the model
            print(f"Working with layer type: {layer_type}")
            model_name  = 'best_model.pth'
            transformer = Transformer(vocab_size=vocab_size,
                                      n_heads=n_heads,
                                      n_layers=n_layers,
                                      d_model=d_model,
                                      d_lin=ff_dim,
                                      max_len=max_len,
                                      drop_p = drop_p,
                                      layer_type = layer_type)
            transformer = transformer.to(device)
            #Setup loss function and optimizer
            loss_func = nn.BCEWithLogitsLoss()
            #Performs slightly differently than Keras optimizer
            optim = torch.optim.Adam(transformer.parameters(),lr=1e-3)
            start_time       = time.time()
            best_val_acc     = -float('inf')
            for epoch in range(num_epochs):
                train_loss, train_acc = train_loop(device,transformer,optim,train_dataloader,loss_func,epoch,True)
                val_loss, val_acc     = train_loop(device,transformer,optim,val_dataloader,loss_func,epoch,False)
                #Print Results per epoch
                print(f" Epoch {epoch}: Train loss: {round(train_loss.item(), 4)} |  Train acc: {round(train_acc.item(), 4)} | \
                Val loss: {round(val_loss.item(), 4)} | Val acc: {round(val_acc.item(), 4)}")
                #Check if our model improved
                if val_acc >= best_val_acc:
                    best_val_acc = val_acc
                    save_checkpoint(transformer, optim, epoch, best_val_acc, model_name)
            end_time = time.time()
            # Check the best models performance
            best_model, best_optim, start_epoch, _ = load_checkpoint(transformer, optim, model_name)
            test_loss, test_acc   = train_loop(device,best_model,best_optim,test_dataloader,loss_func,start_epoch,False)
            print(f"Best Model: Test Acc {test_acc} | Test Loss {test_loss} \n")
            # Check the size of the best model
            param_size  = 0
            for param in best_model.parameters():
                param_size  += param.nelement() * param.element_size()
            buffer_size = 0
            for buffer in best_model.buffers():
                buffer_size += buffer.nelement() * buffer.element_size()
            size_all_mb = (param_size + buffer_size) / 1024**2
            # Count number of parameters
            num_params = count_parameters(transformer)
            # Accumulate results
            avg_train_loss += train_loss
            avg_train_acc  += train_acc
            avg_test_loss  += test_loss
            avg_test_acc   += test_acc
            avg_model_size += size_all_mb
            run_times.append(end_time-start_time)
        run_times.sort()
        med_run_time = run_times[len(run_times) // 2]
        file_name = f"{layer_type}_results_final.txt"
        f = open(file_name,"a")
        f.write(f"Average Train Acc over {num_runs} for {layer_type}: {avg_train_acc / num_runs} \n")
        f.write(f"Average Train Loss over {num_runs} for {layer_type}: {avg_train_loss / num_runs} \n")
        f.write(f"Average Test Acc over {num_runs} for {layer_type}: {avg_test_acc / num_runs} \n")
        f.write(f"Average Test Loss over {num_runs} for {layer_type}: {avg_test_loss / num_runs} \n")
        f.write(f"Average Model Size over {num_runs} for {layer_type}: {avg_model_size / num_runs} \n")
        f.write(f"Median Run Time over {num_runs} for {layer_type}: {med_run_time} \n")
        f.write(f"Number of parameters: {num_params} \n")
        f.write("\n")
        f.close()



if __name__ == '__main__':
    main()







Working with layer type: SDPA


Training Epoch: 0: 100%|██████████| 625/625 [00:06<00:00, 102.35it/s]
Test Epoch: 0: 100%|██████████| 63/63 [00:00<00:00, 308.78it/s]


 Epoch 0: Train loss: 0.6637 |  Train acc: 0.5527 |                 Val loss: 0.6217 | Val acc: 0.6515


Training Epoch: 1: 100%|██████████| 625/625 [00:05<00:00, 104.47it/s]
Test Epoch: 1: 100%|██████████| 63/63 [00:00<00:00, 186.72it/s]


 Epoch 1: Train loss: 0.573 |  Train acc: 0.6878 |                 Val loss: 0.5672 | Val acc: 0.6905


Training Epoch: 2: 100%|██████████| 625/625 [00:06<00:00, 102.30it/s]
Test Epoch: 2: 100%|██████████| 63/63 [00:00<00:00, 296.64it/s]


 Epoch 2: Train loss: 0.5118 |  Train acc: 0.7399 |                 Val loss: 0.5506 | Val acc: 0.7015


Training Epoch: 3:  40%|████      | 250/625 [00:02<00:03, 110.82it/s]