The below cell has the Super Attention Layer that has a single W_a matrix managed by the Multi Head Attention Module. Run it if you want to use that version of Super Attention (all other attention layers are the same)

In [1]:
import math
import torch
from torch import nn

class AttentionLayer(nn.Module):
    def __init__(self,
                 d_model : int,
                 d_q : int,
                 d_k : int,
                 d_v : int,
                 W_a : nn.Linear = None,
                 layer_type : str = 'SDPA',
                 idx : int = 0,
                 max_len : int = 32):
        super().__init__()
        self.d_model    = d_model
        self.d_q        = d_q
        self.d_k        = d_k
        self.d_v        = d_v
        self.layer_type = layer_type
        self.idx        = idx
        self.max_len    = max_len
        self.W_a        = W_a
        self._set_layer_type()

    def _set_layer_type(self):
        self.softmax = nn.Softmax(dim = 1)
        self.W_q     = nn.Linear(self.d_model,self.d_q)
        nn.init.xavier_uniform_(self.W_q.weight)
        nn.init.constant_(self.W_q.bias, 0)
        if self.layer_type == 'Optimised':
            self.W_k     = nn.Linear(self.d_model,self.d_k)
            nn.init.xavier_uniform_(self.W_k.weight)
            nn.init.constant_(self.W_k.bias, 0)
            self.forward = self._forward_optimised
        elif self.layer_type == 'Efficient':
            self.forward = self._forward_efficient
        elif self.layer_type == 'Super':
            self.forward = self._forward_super
        else:
            # Default to SDPA
            self.W_k     = nn.Linear(self.d_model,self.d_k)
            self.W_v     = nn.Linear(self.d_model,self.d_v)
            nn.init.xavier_uniform_(self.W_k.weight)
            nn.init.constant_(self.W_k.bias, 0)
            nn.init.xavier_uniform_(self.W_v.weight)
            nn.init.constant_(self.W_v.bias, 0)
            self.forward = self._forward_SDPA

    def _forward_SDPA(self, inp_q, inp_k, inp_v):
        Q     = self.W_q(inp_q)
        K     = self.W_k(inp_k)
        V     = self.W_v(inp_v)
        K_t   = K.permute(0,2,1)
        S     = self.softmax((Q @ K_t) / math.sqrt(self.d_q))
        H     = S @ V
        return H

    def _forward_optimised(self, inp_q : torch.Tensor, inp_k : torch.Tensor, inp_v : torch.Tensor):
        Q     = self.W_q(inp_q)
        K     = self.W_k(inp_k)
        K_t   = K.permute(0,2,1)
        S     = self.softmax((Q @ K_t) / math.sqrt(self.d_q))
        v_lo  = ((self.idx) * self.d_v)
        v_hi  = ((self.idx + 1) * self.d_v)
        V     = inp_v[:,:, v_lo : v_hi]
        H     = S @ V
        return H

    def _forward_efficient(self, inp_q : torch.Tensor, inp_k : torch.Tensor, inp_v : torch.Tensor):
        Q     = self.W_q(inp_q)
        lo    = ((self.idx) * self.d_k)
        hi    = ((self.idx + 1) * self.d_k)
        K_t   = inp_k[:, :, lo : hi].permute(0,2,1)
        S     = self.softmax((Q @ K_t) / math.sqrt(self.d_q))
        V     = inp_v[:,:, lo : hi]
        H     = S @ V
        return H

    def _forward_super(self, inp_q : torch.Tensor, inp_k : torch.Tensor, inp_v : torch.Tensor):
        Q     = self.W_q(inp_q)
        lo    = ((self.idx) * self.d_k)
        hi    = ((self.idx + 1) * self.d_k)
        K_t   = inp_k[:, :, lo : hi].permute(0,2,1)
        S     = self.softmax((Q @ K_t) / math.sqrt(self.d_q))
        V     = self.W_a(inp_v[:,:, lo : hi].permute(0,2,1)).permute(0,2,1)
        H     = S @ V
        return H

class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, d_model, d_k, d_v, max_len, layer_type):
        super().__init__()
        self.layers  = nn.Sequential()
        self.n_heads = n_heads
        self.d_model = d_model
        self.d_k     = d_k
        self.d_v     = d_v
        self.W_a     = None
        if layer_type == 'Super':
            self.W_a = nn.Linear(max_len,max_len)
            nn.init.xavier_uniform_(self.W_a.weight)
            nn.init.constant_(self.W_a.bias, 0)
        for i in range(n_heads):
            self.layers.add_module("Attention_Layer "+str(i),
                                   AttentionLayer(d_model,d_k,d_k,d_v,self.W_a,layer_type,i,max_len))
        self.W_o     = nn.Linear(n_heads * d_v, d_model)

    def forward(self, inp_q, inp_k, inp_v):
        for i, layer in enumerate(self.layers):
            if i == 0:
                H = layer(inp_q,inp_k,inp_v)
            else:
                h_i = layer(inp_q,inp_k,inp_v)
                h_cat = (H.clone(),h_i)
                H = torch.cat(h_cat,2)
        out = self.W_o(H)
        return out

The below cell has the Super Attention Layer that has a single W_a matrix FOR EACH Super Attention Layer, independent of the Multi Head Attention Module. Run it if you want to use that version of Super Attention (all other attention layers are the same)

In [None]:
import math
import torch
from torch import nn

class AttentionLayer(nn.Module):
    def __init__(self,
                 d_model : int,
                 d_q : int,
                 d_k : int,
                 d_v : int,
                 layer_type : str = 'SDPA',
                 idx : int = 0,
                 max_len : int = 32):
        super().__init__()
        self.d_model    = d_model
        self.d_q        = d_q
        self.d_k        = d_k
        self.d_v        = d_v
        self.layer_type = layer_type
        self.idx        = idx
        self.max_len    = max_len
        self._set_layer_type()


    def _set_layer_type(self):
        self.softmax = nn.Softmax(dim = 1)
        self.W_q     = nn.Linear(self.d_model,self.d_q)
        nn.init.xavier_uniform_(self.W_q.weight)
        nn.init.constant_(self.W_q.bias, 0)
        if self.layer_type == 'Optimised':
            self.W_k     = nn.Linear(self.d_model,self.d_k)
            nn.init.xavier_uniform_(self.W_k.weight)
            nn.init.constant_(self.W_k.bias, 0)
            self.forward = self._forward_optimised
        elif self.layer_type == 'Efficient':
            self.forward = self._forward_efficient
        elif self.layer_type == 'Super':
            self.forward = self._forward_super
            self.W_a     = nn.Linear(self.max_len,self.max_len)
            nn.init.xavier_uniform_(self.W_a.weight)
            nn.init.constant_(self.W_a.bias, 0)
        else:
            # Default to SDPA
            self.W_k     = nn.Linear(self.d_model,self.d_k)
            self.W_v     = nn.Linear(self.d_model,self.d_v)
            nn.init.xavier_uniform_(self.W_k.weight)
            nn.init.constant_(self.W_k.bias, 0)
            nn.init.xavier_uniform_(self.W_v.weight)
            nn.init.constant_(self.W_v.bias, 0)
            self.forward = self._forward_SDPA

    def _forward_SDPA(self, inp_q, inp_k, inp_v):
        Q     = self.W_q(inp_q)
        K     = self.W_k(inp_k)
        V     = self.W_v(inp_v)
        K_t   = K.permute(0,2,1)
        S     = self.softmax((Q @ K_t) / math.sqrt(self.d_q))
        H     = S @ V
        return H

    def _forward_optimised(self, inp_q : torch.Tensor, inp_k : torch.Tensor, inp_v : torch.Tensor):
        Q     = self.W_q(inp_q)
        K     = self.W_k(inp_k)
        K_t   = K.permute(0,2,1)
        S     = self.softmax((Q @ K_t) / math.sqrt(self.d_q))
        v_lo  = ((self.idx) * self.d_v)
        v_hi  = ((self.idx + 1) * self.d_v)
        V     = inp_v[:,:, v_lo : v_hi]
        H     = S @ V
        return H

    def _forward_efficient(self, inp_q : torch.Tensor, inp_k : torch.Tensor, inp_v : torch.Tensor):
        Q     = self.W_q(inp_q)
        lo    = ((self.idx) * self.d_k)
        hi    = ((self.idx + 1) * self.d_k)
        K_t   = inp_k[:, :, lo : hi].permute(0,2,1)
        S     = self.softmax((Q @ K_t) / math.sqrt(self.d_q))
        V     = inp_v[:,:, lo : hi]
        H     = S @ V
        return H

    def _forward_super(self, inp_q : torch.Tensor, inp_k : torch.Tensor, inp_v : torch.Tensor):
        Q     = self.W_q(inp_q)
        lo    = ((self.idx) * self.d_k)
        hi    = ((self.idx + 1) * self.d_k)
        K_t   = inp_k[:, :, lo : hi].permute(0,2,1)
        S     = self.softmax((Q @ K_t) / math.sqrt(self.d_q))
        V     = self.W_a(inp_v[:,:, lo : hi].permute(0,2,1)).permute(0,2,1)
        H     = S @ V
        return H


class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, d_model, d_k, d_v, max_len, layer_type):
        super().__init__()
        self.layers  = nn.Sequential()
        self.n_heads = n_heads
        self.d_model = d_model
        self.d_k     = d_k
        self.d_v     = d_v
        for i in range(n_heads):
            self.layers.add_module("Attention_Layer "+str(i),
                                   AttentionLayer(d_model,d_k,d_k,d_v,layer_type,i,max_len))
        self.W_o     = nn.Linear(n_heads * d_v, d_model)

    def forward(self, inp_q, inp_k, inp_v):
        for i, layer in enumerate(self.layers):
            if i == 0:
                H = layer(inp_q,inp_k,inp_v)
            else:
                h_i = layer(inp_q,inp_k,inp_v)
                h_cat = (H.clone(),h_i)
                H = torch.cat(h_cat,2)
        out = self.W_o(H)
        return out

The next cell initializes the Dataloader and runs the main script. Remember to download the IMDB_dataset.csv file from the github repo and either upload it to google colab or to have it in the same working directory locally.

In [None]:
"""
## "You Need to Pay Better Attention" Pytorch Transformer Example

## Paper Link: https://arxiv.org/abs/2403.01643

## Author: Nicholas Mesa-Cucalon (https://github.com/NMesaC)
"""
import torch
import time
import os

from torch import nn
from tqdm import tqdm


import math
import torch
from torch import nn

# Set device since some classes need info
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import re
import torch
import pandas as pd

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from typing import Dict, List, Tuple

torch.manual_seed(1019)

class IMDBDataset(Dataset):
    def __init__(self, csv_path: str, vocab_size: int = 10000, max_length: int = 200):
        self.data = pd.read_csv(csv_path)
        self.max_length = max_length
        self.vocab_size = vocab_size

        self.preprocess_data()
        self.build_vocabulary()
        self.tokenize_reviews()

    def preprocess_data(self):
        self.data['review'] = self.data['review'].apply(self.clean_text)
        self.data['sentiment'] = self.data['sentiment'].map({"positive": 1, "negative": 0})

    def clean_text(self, text: str) -> str:
        text = text.lower()
        text = re.sub(r"<br\s*/?>", " ", text)
        text = re.sub(r"[^a-z0-9\s]", "", text)
        return text.strip()

    def build_vocabulary(self):
        word_freq = Counter()
        for review in self.data['review']:
            word_freq.update(review.split())

        special_tokens = ['<PAD>', '<SOS>']
        common_words = [word for word, _ in word_freq.most_common(self.vocab_size - len(special_tokens))]
        self.vocab = {word: idx for idx, word in enumerate(special_tokens + common_words)}

    def tokenize_reviews(self):
        self.tokenized_reviews = []
        for review in self.data['review']:
            tokens = [self.vocab['<SOS>']]
            tokens.extend([self.vocab.get(word, self.vocab['<PAD>']) for word in review.split()[:self.max_length-1]])
            self.tokenized_reviews.append(torch.tensor(tokens))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, int]:
        return self.tokenized_reviews[idx], self.data['sentiment'].iloc[idx]

def collate_imdb(batch: List[Tuple[torch.Tensor, int]]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    reviews, sentiments = zip(*batch)
    padded_reviews = pad_sequence(reviews, batch_first=True, padding_value=0)
    lengths = torch.tensor([len(review) for review in reviews])
    sentiments = torch.tensor(sentiments, dtype=torch.float32)
    return padded_reviews, lengths, sentiments

def get_dataloader(csv_path: str, vocab_size: int, max_length: int, batch_size: int, val_split : float) -> Tuple[DataLoader, DataLoader, Dict[str, int]]:
    dataset = IMDBDataset(csv_path, vocab_size, max_length)
    train_size = 40000
    val_size   = (int)(train_size * val_split)
    train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, val_size, len(dataset) - train_size - val_size])
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,  collate_fn=collate_imdb)
    val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False, collate_fn=collate_imdb)
    test_loader  = DataLoader(test_dataset,  batch_size=batch_size, shuffle=False, collate_fn=collate_imdb)
    return train_loader, val_loader, test_loader


class AttentionLayer(nn.Module):
    def __init__(self,
                 d_model : int,
                 d_q : int,
                 d_k : int,
                 d_v : int,
                 W_a : nn.Linear = None,
                 layer_type : str = 'SDPA',
                 idx : int = 0,
                 max_len : int = 32):
        super().__init__()
        self.d_model    = d_model
        self.d_q        = d_q
        self.d_k        = d_k
        self.d_v        = d_v
        self.layer_type = layer_type
        self.idx        = idx
        self.max_len    = max_len
        self.W_a        = W_a
        self._set_layer_type()

    def _set_layer_type(self):
        self.softmax = nn.Softmax(dim = 1)
        self.W_q     = nn.Linear(self.d_model,self.d_q)
        nn.init.xavier_uniform_(self.W_q.weight)
        nn.init.constant_(self.W_q.bias, 0)
        if self.layer_type == 'Optimised':
            self.W_k     = nn.Linear(self.d_model,self.d_k)
            nn.init.xavier_uniform_(self.W_k.weight)
            nn.init.constant_(self.W_k.bias, 0)
            self.forward = self._forward_optimised
        elif self.layer_type == 'Efficient':
            self.forward = self._forward_efficient
        elif self.layer_type == 'Super':
            self.forward = self._forward_super
        else:
            # Default to SDPA
            self.W_k     = nn.Linear(self.d_model,self.d_k)
            self.W_v     = nn.Linear(self.d_model,self.d_v)
            nn.init.xavier_uniform_(self.W_k.weight)
            nn.init.constant_(self.W_k.bias, 0)
            nn.init.xavier_uniform_(self.W_v.weight)
            nn.init.constant_(self.W_v.bias, 0)
            self.forward = self._forward_SDPA

    def _forward_SDPA(self, inp_q, inp_k, inp_v):
        Q     = self.W_q(inp_q)
        K     = self.W_k(inp_k)
        V     = self.W_v(inp_v)
        K_t   = K.permute(0,2,1)
        S     = self.softmax((Q @ K_t) / math.sqrt(self.d_q))
        H     = S @ V
        return H

    def _forward_optimised(self, inp_q : torch.Tensor, inp_k : torch.Tensor, inp_v : torch.Tensor):
        Q     = self.W_q(inp_q)
        K     = self.W_k(inp_k)
        K_t   = K.permute(0,2,1)
        S     = self.softmax((Q @ K_t) / math.sqrt(self.d_q))
        v_lo  = ((self.idx) * self.d_v)
        v_hi  = ((self.idx + 1) * self.d_v)
        V     = inp_v[:,:, v_lo : v_hi]
        H     = S @ V
        return H

    def _forward_efficient(self, inp_q : torch.Tensor, inp_k : torch.Tensor, inp_v : torch.Tensor):
        Q     = self.W_q(inp_q)
        lo    = ((self.idx) * self.d_k)
        hi    = ((self.idx + 1) * self.d_k)
        K_t   = inp_k[:, :, lo : hi].permute(0,2,1)
        S     = self.softmax((Q @ K_t) / math.sqrt(self.d_q))
        V     = inp_v[:,:, lo : hi]
        H     = S @ V
        return H

    def _forward_super(self, inp_q : torch.Tensor, inp_k : torch.Tensor, inp_v : torch.Tensor):
        Q     = self.W_q(inp_q)
        lo    = ((self.idx) * self.d_k)
        hi    = ((self.idx + 1) * self.d_k)
        K_t   = inp_k[:, :, lo : hi].permute(0,2,1)
        S     = self.softmax((Q @ K_t) / math.sqrt(self.d_q))
        V     = self.W_a(inp_v[:,:, lo : hi].permute(0,2,1)).permute(0,2,1)
        H     = S @ V
        return H

class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, d_model, d_k, d_v, max_len, layer_type):
        super().__init__()
        self.layers  = nn.Sequential()
        self.n_heads = n_heads
        self.d_model = d_model
        self.d_k     = d_k
        self.d_v     = d_v
        self.W_a     = None
        if layer_type == 'Super':
            self.W_a = nn.Linear(max_len,max_len)
            nn.init.xavier_uniform_(self.W_a.weight)
            nn.init.constant_(self.W_a.bias, 0)
        for i in range(n_heads):
            self.layers.add_module("Attention_Layer "+str(i),
                                   AttentionLayer(d_model,d_k,d_k,d_v,self.W_a,layer_type,i,max_len))
        self.W_o     = nn.Linear(n_heads * d_v, d_model)

    def forward(self, inp_q, inp_k, inp_v):
        for i, layer in enumerate(self.layers):
            if i == 0:
                H = layer(inp_q,inp_k,inp_v)
            else:
                h_i = layer(inp_q,inp_k,inp_v)
                h_cat = (H.clone(),h_i)
                H = torch.cat(h_cat,2)
        out = self.W_o(H)
        return out



"""
## Transformer Block Module
"""
class TransformerBlock(nn.Module):
    def __init__(self, d_model, num_heads, ff_dim, layer_type = 'SDPA', max_len = 32, dropout_rate=0.1):
        super().__init__()
        d_k, d_v = d_model // num_heads
        self.att = MultiHeadAttention(num_heads, d_model, d_k, d_v, max_len, layer_type)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, d_model)
        )
        self.layernorm1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layernorm2 = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)

    def forward(self, x):
        attn_output, _ = self.att(x, x, x)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(x + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

"""
## Embedding Layer
"""
class TokenAndPositionEmbedding(nn.Module):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.maxlen = maxlen
        self.token_emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.pos_emb = nn.Embedding(num_embeddings=maxlen, embedding_dim=embed_dim)

    def forward(self, x):
        positions = torch.arange(self.maxlen, device=x.device)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions.unsqueeze(0)

"""
## Transformer-Encoder-Only Arch
"""
class EncoderLayer(nn.Module):
    def __init__(self, n_heads, d_model, d_k, d_v, d_lin, max_len, layer_type, drop_p):
        super().__init__()
        self.multi      = MultiHeadAttention(n_heads,d_model,d_k,d_v,max_len,layer_type)
        self.ff         = nn.Sequential(
                            nn.Linear(d_model,d_lin),
                            nn.ReLU(),
                            nn.Linear(d_lin,d_model)
                         )
        self.norm_multi = nn.LayerNorm(d_model, eps=1e-6)
        self.norm_ff    = nn.LayerNorm(d_model, eps=1e-6)
        self.drop_multi = nn.Dropout(drop_p)
        self.drop_ff    = nn.Dropout(drop_p)
    def forward(self, inp):
        multi = self.multi(inp,inp,inp)
        multi = self.drop_multi(multi)
        z     = self.norm_multi(inp + multi)
        ff    = self.ff(z)
        ff    = self.drop_ff(ff)
        return self.norm_ff(z + ff)

class Encoder(nn.Module):
    def __init__(self, n_heads, d_model, d_k, d_v, d_lin, n_layers, vocab_size, max_len, layer_type, drop_p):
        super().__init__()
        self.encoder_layers  = nn.Sequential()
        self.embedding       = TokenAndPositionEmbedding(max_len,vocab_size,d_model)
        self.n_layers        = n_layers
        for i in range(n_layers):
            self.encoder_layers.add_module("Encoder_Layer"+str(i),EncoderLayer(n_heads,
                                                                               d_model,
                                                                               d_k,
                                                                               d_v,
                                                                               d_lin,
                                                                               max_len,
                                                                               layer_type,
                                                                               drop_p))
    def forward(self, inp):
        embed_i = self.embedding(inp)
        for layer in self.encoder_layers:
            embed_i     = layer(embed_i)
        return embed_i

class Transformer(nn.Module):
    def __init__(self,
                 vocab_size = 20000,
                 n_heads    = 4,
                 d_model    = 32,
                 d_k        = 8,
                 d_v        = 8,
                 d_lin      = 32,
                 n_layers   = 1,
                 max_len    = 32,
                 layer_type = 'SDPA',
                 drop_p     = 0.1):
        super().__init__()
        d_k = d_model // n_heads
        d_v = d_k
        self.encoder         = Encoder(n_heads,d_model,d_k,d_v,d_lin,n_layers,vocab_size,max_len,layer_type,drop_p)
        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)
        self.dropout1        = nn.Dropout(0.1)
        self.dense1          = nn.Linear(d_model, 6)
        self.dropout2        = nn.Dropout(0.1)
        self.dense2          = nn.Linear(6, 1)

    def forward(self, inp):
        x = self.encoder(inp)
        x = x.transpose(1,2)
        x = self.global_avg_pool(x).squeeze(2)
        x = self.dropout1(x)
        x = torch.relu(self.dense1(x))
        x = self.dropout2(x)
        res = self.dense2(x)
        return res

"""
## Training Loop
"""
def train_loop(device, model, optim, loader, loss_func, epoch, train=False):
    model.train(mode=train)
    total_loss    = 0
    total_correct = 0
    n_samples     = 0
    label         = 'Training' if train else 'Test'
    for reviews, _, labels in tqdm(loader, desc=f'{label} Epoch: {epoch}'):
        reviews = reviews.to(device)
        labels = labels.to(device)
        #Forward
        if train:
            optim.zero_grad()
            logits = model(reviews)
            loss = loss_func(logits, labels.reshape(logits.shape))
            loss.backward()
            optim.step()
        else:
            with torch.no_grad():
                logits = model(reviews)
                loss   = loss_func(logits, labels.reshape(logits.shape))
        #Predictions
        preds = (logits > 0.5).float()
        #Compute accuracy
        acc = torch.sum(preds == labels.reshape(logits.shape))
        #Track stats
        total_loss += reviews.shape[0] * loss
        n_samples += reviews.shape[0]
        total_correct += acc
    return total_loss / n_samples, total_correct / n_samples

"""
## Checkpoint Callbacks
"""
def save_checkpoint(model, optimizer, epoch, best_metric, filename):
    state = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch,
        'best_metric': best_metric
    }
    torch.save(state, filename)

def load_checkpoint(model, optimizer, filename):
    checkpoint = torch.load(filename)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    best_metric = checkpoint['best_metric']
    return model, optimizer, epoch, best_metric

def count_parameters(model):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            continue
        if ("attention" in name.lower()) or ("w_o" in name.lower()):
            params = parameter.numel()
            table.add_row([name, params])
            total_params += params
    return total_params

def main():
    # Initialize hyperparameters
    vocab_size        = 20000
    batch_size        = 64
    d_model           = 32
    ff_dim            = 32
    max_len           = 32
    num_epochs        = 1
    num_runs          = 5
    n_heads           = 4
    n_layers          = 1
    drop_p            = 0.1
    val_split         = 0.1
    layers            = ['SDPA','Optimised', 'Efficient', 'Super']

    # Load data
    train_dataloader, val_dataloader, test_dataloader = get_dataloader("./IMDB_Dataset.csv",
                                                                       vocab_size,
                                                                       max_len,
                                                                       batch_size,
                                                                       val_split)
    for layer_type in layers:
        avg_train_loss  = 0
        avg_train_acc   = 0
        avg_test_loss   = 0
        avg_test_acc    = 0
        avg_model_size  = 0
        num_params      = 0
        run_times       = []
        for _ in range(num_runs):
            # Initialize model, optimizer, and criterion and train/test the model
            print(f"Working with layer type: {layer_type}")
            model_name  = 'best_model.pth'
            transformer = Transformer(vocab_size=vocab_size,
                                      n_heads=n_heads,
                                      n_layers=n_layers,
                                      d_model=d_model,
                                      d_lin=ff_dim,
                                      max_len=max_len,
                                      drop_p = drop_p,
                                      layer_type = layer_type)
            transformer = transformer.to(device)
            #Setup loss function and optimizer
            loss_func = nn.BCEWithLogitsLoss()
            #Performs slightly differently than Keras optimizer
            optim = torch.optim.Adam(transformer.parameters(),lr=1e-3)
            start_time       = time.time()
            best_val_acc     = -float('inf')
            for epoch in range(num_epochs):
                train_loss, train_acc = train_loop(device,transformer,optim,train_dataloader,loss_func,epoch,True)
                val_loss, val_acc     = train_loop(device,transformer,optim,val_dataloader,loss_func,epoch,False)
                #Print Results per epoch
                print(f" Epoch {epoch}: Train loss: {round(train_loss.item(), 4)} |  Train acc: {round(train_acc.item(), 4)} | \
                Val loss: {round(val_loss.item(), 4)} | Val acc: {round(val_acc.item(), 4)}")
                #Check if our model improved
                if val_acc >= best_val_acc:
                    best_val_acc = val_acc
                    save_checkpoint(transformer, optim, epoch, best_val_acc, model_name)
            end_time = time.time()
            # Check the best models performance
            best_model, best_optim, start_epoch, _ = load_checkpoint(transformer, optim, model_name)
            test_loss, test_acc   = train_loop(device,best_model,best_optim,test_dataloader,loss_func,start_epoch,False)
            print(f"Best Model: Test Acc {test_acc} | Test Loss {test_loss} \n")
            # Check the size of the best model
            param_size  = 0
            for param in best_model.parameters():
                param_size  += param.nelement() * param.element_size()
            buffer_size = 0
            for buffer in best_model.buffers():
                buffer_size += buffer.nelement() * buffer.element_size()
            size_all_mb = (param_size + buffer_size) / 1024**2
            # Count number of parameters
            num_params = count_parameters(transformer)
            # Accumulate results
            avg_train_loss += train_loss
            avg_train_acc  += train_acc
            avg_test_loss  += test_loss
            avg_test_acc   += test_acc
            avg_model_size += size_all_mb
            run_times.append(end_time-start_time)
        run_times.sort()
        med_run_time = run_times[len(run_times) // 2]
        file_name = f"{layer_type}_results_final.txt"
        f = open(file_name,"a")
        f.write(f"Average Train Acc over {num_runs} for {layer_type}: {avg_train_acc / num_runs} \n")
        f.write(f"Average Train Loss over {num_runs} for {layer_type}: {avg_train_loss / num_runs} \n")
        f.write(f"Average Test Acc over {num_runs} for {layer_type}: {avg_test_acc / num_runs} \n")
        f.write(f"Average Test Loss over {num_runs} for {layer_type}: {avg_test_loss / num_runs} \n")
        f.write(f"Average Model Size over {num_runs} for {layer_type}: {avg_model_size / num_runs} \n")
        f.write(f"Median Run Time over {num_runs} for {layer_type}: {med_run_time} \n")
        f.write(f"Number of parameters: {num_params} \n")
        f.write("\n")
        f.close()



if __name__ == '__main__':
    main()







Working with layer type: SDPA


Training Epoch: 0: 100%|██████████| 625/625 [00:06<00:00, 98.33it/s] 
Test Epoch: 0: 100%|██████████| 63/63 [00:00<00:00, 313.03it/s]


 Epoch 0: Train loss: 0.6637 |  Train acc: 0.5527 |                 Val loss: 0.6217 | Val acc: 0.6515


Training Epoch: 1: 100%|██████████| 625/625 [00:05<00:00, 113.81it/s]
Test Epoch: 1: 100%|██████████| 63/63 [00:00<00:00, 320.01it/s]


 Epoch 1: Train loss: 0.573 |  Train acc: 0.6878 |                 Val loss: 0.5672 | Val acc: 0.6905


Training Epoch: 2: 100%|██████████| 625/625 [00:06<00:00, 97.14it/s] 
Test Epoch: 2: 100%|██████████| 63/63 [00:00<00:00, 277.31it/s]


 Epoch 2: Train loss: 0.5118 |  Train acc: 0.7399 |                 Val loss: 0.5506 | Val acc: 0.7015


Training Epoch: 3: 100%|██████████| 625/625 [00:05<00:00, 106.83it/s]
Test Epoch: 3: 100%|██████████| 63/63 [00:00<00:00, 166.73it/s]


 Epoch 3: Train loss: 0.464 |  Train acc: 0.777 |                 Val loss: 0.5512 | Val acc: 0.7148


Training Epoch: 4: 100%|██████████| 625/625 [00:06<00:00, 96.86it/s] 
Test Epoch: 4: 100%|██████████| 63/63 [00:00<00:00, 301.29it/s]


 Epoch 4: Train loss: 0.4183 |  Train acc: 0.8081 |                 Val loss: 0.5604 | Val acc: 0.721


Training Epoch: 5: 100%|██████████| 625/625 [00:05<00:00, 111.36it/s]
Test Epoch: 5: 100%|██████████| 63/63 [00:00<00:00, 285.81it/s]


 Epoch 5: Train loss: 0.3787 |  Train acc: 0.8347 |                 Val loss: 0.5807 | Val acc: 0.7255


Training Epoch: 6: 100%|██████████| 625/625 [00:06<00:00, 97.27it/s] 
Test Epoch: 6: 100%|██████████| 63/63 [00:00<00:00, 308.43it/s]


 Epoch 6: Train loss: 0.335 |  Train acc: 0.8621 |                 Val loss: 0.6224 | Val acc: 0.7248


Training Epoch: 7: 100%|██████████| 625/625 [00:05<00:00, 112.12it/s]
Test Epoch: 7: 100%|██████████| 63/63 [00:00<00:00, 294.52it/s]


 Epoch 7: Train loss: 0.2974 |  Train acc: 0.8831 |                 Val loss: 0.6721 | Val acc: 0.7258


Training Epoch: 8: 100%|██████████| 625/625 [00:06<00:00, 96.46it/s] 
Test Epoch: 8: 100%|██████████| 63/63 [00:00<00:00, 305.85it/s]


 Epoch 8: Train loss: 0.2612 |  Train acc: 0.9015 |                 Val loss: 0.7337 | Val acc: 0.726


Training Epoch: 9: 100%|██████████| 625/625 [00:05<00:00, 113.95it/s]
Test Epoch: 9: 100%|██████████| 63/63 [00:00<00:00, 278.53it/s]


 Epoch 9: Train loss: 0.2334 |  Train acc: 0.9154 |                 Val loss: 0.7575 | Val acc: 0.718


Test Epoch: 8: 100%|██████████| 94/94 [00:00<00:00, 315.09it/s]


Best Model: Test Acc 0.7233332991600037 | Test Loss 0.7258501052856445 

Working with layer type: SDPA


Training Epoch: 0: 100%|██████████| 625/625 [00:06<00:00, 96.18it/s] 
Test Epoch: 0: 100%|██████████| 63/63 [00:00<00:00, 296.45it/s]


 Epoch 0: Train loss: 0.6906 |  Train acc: 0.5074 |                 Val loss: 0.6651 | Val acc: 0.5768


Training Epoch: 1: 100%|██████████| 625/625 [00:06<00:00, 99.94it/s] 
Test Epoch: 1: 100%|██████████| 63/63 [00:00<00:00, 273.27it/s]


 Epoch 1: Train loss: 0.6324 |  Train acc: 0.6362 |                 Val loss: 0.5989 | Val acc: 0.6765


Training Epoch: 2: 100%|██████████| 625/625 [00:06<00:00, 95.50it/s] 
Test Epoch: 2: 100%|██████████| 63/63 [00:00<00:00, 299.00it/s]


 Epoch 2: Train loss: 0.5622 |  Train acc: 0.7139 |                 Val loss: 0.5728 | Val acc: 0.6823


Training Epoch: 3: 100%|██████████| 625/625 [00:05<00:00, 110.27it/s]
Test Epoch: 3: 100%|██████████| 63/63 [00:00<00:00, 293.92it/s]


 Epoch 3: Train loss: 0.5082 |  Train acc: 0.7572 |                 Val loss: 0.5601 | Val acc: 0.723


Training Epoch: 4: 100%|██████████| 625/625 [00:06<00:00, 95.06it/s] 
Test Epoch: 4: 100%|██████████| 63/63 [00:00<00:00, 319.21it/s]


 Epoch 4: Train loss: 0.4605 |  Train acc: 0.7907 |                 Val loss: 0.5714 | Val acc: 0.7248


Training Epoch: 5: 100%|██████████| 625/625 [00:05<00:00, 111.04it/s]
Test Epoch: 5: 100%|██████████| 63/63 [00:00<00:00, 297.93it/s]


 Epoch 5: Train loss: 0.4177 |  Train acc: 0.8205 |                 Val loss: 0.6058 | Val acc: 0.725


Training Epoch: 6: 100%|██████████| 625/625 [00:06<00:00, 94.01it/s] 
Test Epoch: 6: 100%|██████████| 63/63 [00:00<00:00, 296.07it/s]


 Epoch 6: Train loss: 0.3765 |  Train acc: 0.8448 |                 Val loss: 0.6148 | Val acc: 0.712


Training Epoch: 7: 100%|██████████| 625/625 [00:05<00:00, 110.78it/s]
Test Epoch: 7: 100%|██████████| 63/63 [00:00<00:00, 286.00it/s]


 Epoch 7: Train loss: 0.3397 |  Train acc: 0.867 |                 Val loss: 0.6921 | Val acc: 0.719


Training Epoch: 8: 100%|██████████| 625/625 [00:06<00:00, 94.57it/s] 
Test Epoch: 8: 100%|██████████| 63/63 [00:00<00:00, 297.67it/s]


 Epoch 8: Train loss: 0.3059 |  Train acc: 0.8859 |                 Val loss: 0.7466 | Val acc: 0.722


Training Epoch: 9: 100%|██████████| 625/625 [00:05<00:00, 110.68it/s]
Test Epoch: 9: 100%|██████████| 63/63 [00:00<00:00, 293.83it/s]


 Epoch 9: Train loss: 0.2832 |  Train acc: 0.8987 |                 Val loss: 0.7928 | Val acc: 0.7265


Test Epoch: 9: 100%|██████████| 94/94 [00:00<00:00, 304.72it/s]


Best Model: Test Acc 0.7206666469573975 | Test Loss 0.8219985365867615 

Working with layer type: SDPA


Training Epoch: 0: 100%|██████████| 625/625 [00:06<00:00, 94.58it/s] 
Test Epoch: 0: 100%|██████████| 63/63 [00:00<00:00, 316.76it/s]


 Epoch 0: Train loss: 0.6663 |  Train acc: 0.5472 |                 Val loss: 0.6082 | Val acc: 0.6445


Training Epoch: 1: 100%|██████████| 625/625 [00:05<00:00, 110.57it/s]
Test Epoch: 1: 100%|██████████| 63/63 [00:00<00:00, 284.67it/s]


 Epoch 1: Train loss: 0.5703 |  Train acc: 0.6881 |                 Val loss: 0.5618 | Val acc: 0.6998


Training Epoch: 2: 100%|██████████| 625/625 [00:06<00:00, 92.74it/s] 
Test Epoch: 2: 100%|██████████| 63/63 [00:00<00:00, 308.26it/s]


 Epoch 2: Train loss: 0.507 |  Train acc: 0.7422 |                 Val loss: 0.5526 | Val acc: 0.7185


Training Epoch: 3: 100%|██████████| 625/625 [00:05<00:00, 109.96it/s]
Test Epoch: 3: 100%|██████████| 63/63 [00:00<00:00, 297.47it/s]


 Epoch 3: Train loss: 0.4578 |  Train acc: 0.7779 |                 Val loss: 0.5552 | Val acc: 0.704


Training Epoch: 4: 100%|██████████| 625/625 [00:06<00:00, 92.71it/s] 
Test Epoch: 4: 100%|██████████| 63/63 [00:00<00:00, 309.76it/s]


 Epoch 4: Train loss: 0.4147 |  Train acc: 0.8095 |                 Val loss: 0.5699 | Val acc: 0.72


Training Epoch: 5: 100%|██████████| 625/625 [00:05<00:00, 109.49it/s]
Test Epoch: 5: 100%|██████████| 63/63 [00:00<00:00, 288.09it/s]


 Epoch 5: Train loss: 0.3715 |  Train acc: 0.8391 |                 Val loss: 0.5929 | Val acc: 0.7195


Training Epoch: 6: 100%|██████████| 625/625 [00:06<00:00, 93.15it/s] 
Test Epoch: 6: 100%|██████████| 63/63 [00:00<00:00, 305.36it/s]


 Epoch 6: Train loss: 0.3328 |  Train acc: 0.8613 |                 Val loss: 0.6224 | Val acc: 0.7195


Training Epoch: 7: 100%|██████████| 625/625 [00:05<00:00, 113.58it/s]
Test Epoch: 7: 100%|██████████| 63/63 [00:00<00:00, 286.27it/s]


 Epoch 7: Train loss: 0.2975 |  Train acc: 0.8815 |                 Val loss: 0.6659 | Val acc: 0.7248


Training Epoch: 8: 100%|██████████| 625/625 [00:06<00:00, 95.51it/s] 
Test Epoch: 8: 100%|██████████| 63/63 [00:00<00:00, 312.05it/s]


 Epoch 8: Train loss: 0.2606 |  Train acc: 0.9013 |                 Val loss: 0.7339 | Val acc: 0.706


Training Epoch: 9: 100%|██████████| 625/625 [00:05<00:00, 113.51it/s]
Test Epoch: 9: 100%|██████████| 63/63 [00:00<00:00, 287.61it/s]


 Epoch 9: Train loss: 0.2354 |  Train acc: 0.9133 |                 Val loss: 0.7881 | Val acc: 0.7163


Test Epoch: 7: 100%|██████████| 94/94 [00:00<00:00, 295.77it/s]


Best Model: Test Acc 0.73416668176651 | Test Loss 0.6504974961280823 

Working with layer type: SDPA


Training Epoch: 0: 100%|██████████| 625/625 [00:06<00:00, 94.93it/s] 
Test Epoch: 0: 100%|██████████| 63/63 [00:00<00:00, 301.06it/s]


 Epoch 0: Train loss: 0.6618 |  Train acc: 0.5547 |                 Val loss: 0.6136 | Val acc: 0.6428


Training Epoch: 1: 100%|██████████| 625/625 [00:05<00:00, 112.76it/s]
Test Epoch: 1: 100%|██████████| 63/63 [00:00<00:00, 293.41it/s]


 Epoch 1: Train loss: 0.5772 |  Train acc: 0.6693 |                 Val loss: 0.5662 | Val acc: 0.6913


Training Epoch: 2: 100%|██████████| 625/625 [00:06<00:00, 95.76it/s] 
Test Epoch: 2: 100%|██████████| 63/63 [00:00<00:00, 285.30it/s]


 Epoch 2: Train loss: 0.5151 |  Train acc: 0.7184 |                 Val loss: 0.5453 | Val acc: 0.7195


Training Epoch: 3: 100%|██████████| 625/625 [00:05<00:00, 114.55it/s]
Test Epoch: 3: 100%|██████████| 63/63 [00:00<00:00, 285.40it/s]


 Epoch 3: Train loss: 0.466 |  Train acc: 0.751 |                 Val loss: 0.5581 | Val acc: 0.7193


Training Epoch: 4: 100%|██████████| 625/625 [00:06<00:00, 94.20it/s] 
Test Epoch: 4: 100%|██████████| 63/63 [00:00<00:00, 290.90it/s]


 Epoch 4: Train loss: 0.4203 |  Train acc: 0.7788 |                 Val loss: 0.5615 | Val acc: 0.7293


Training Epoch: 5: 100%|██████████| 625/625 [00:05<00:00, 111.32it/s]
Test Epoch: 5: 100%|██████████| 63/63 [00:00<00:00, 297.78it/s]


 Epoch 5: Train loss: 0.3766 |  Train acc: 0.8021 |                 Val loss: 0.5857 | Val acc: 0.7348


Training Epoch: 6: 100%|██████████| 625/625 [00:06<00:00, 94.97it/s] 
Test Epoch: 6: 100%|██████████| 63/63 [00:00<00:00, 321.23it/s]


 Epoch 6: Train loss: 0.3357 |  Train acc: 0.8629 |                 Val loss: 0.6311 | Val acc: 0.728


Training Epoch: 7: 100%|██████████| 625/625 [00:05<00:00, 113.75it/s]
Test Epoch: 7: 100%|██████████| 63/63 [00:00<00:00, 283.22it/s]


 Epoch 7: Train loss: 0.2924 |  Train acc: 0.8876 |                 Val loss: 0.6852 | Val acc: 0.7233


Training Epoch: 8: 100%|██████████| 625/625 [00:06<00:00, 95.62it/s] 
Test Epoch: 8: 100%|██████████| 63/63 [00:00<00:00, 303.27it/s]


 Epoch 8: Train loss: 0.2577 |  Train acc: 0.9054 |                 Val loss: 0.7332 | Val acc: 0.7268


Training Epoch: 9: 100%|██████████| 625/625 [00:05<00:00, 112.59it/s]
Test Epoch: 9: 100%|██████████| 63/63 [00:00<00:00, 294.16it/s]


 Epoch 9: Train loss: 0.2265 |  Train acc: 0.9216 |                 Val loss: 0.8225 | Val acc: 0.7238


Test Epoch: 5: 100%|██████████| 94/94 [00:00<00:00, 314.13it/s]


Best Model: Test Acc 0.734666645526886 | Test Loss 0.5884243845939636 

Working with layer type: SDPA


Training Epoch: 0: 100%|██████████| 625/625 [00:06<00:00, 94.46it/s] 
Test Epoch: 0: 100%|██████████| 63/63 [00:00<00:00, 317.33it/s]


 Epoch 0: Train loss: 0.6621 |  Train acc: 0.5483 |                 Val loss: 0.6123 | Val acc: 0.648


Training Epoch: 1: 100%|██████████| 625/625 [00:05<00:00, 113.70it/s]
Test Epoch: 1: 100%|██████████| 63/63 [00:00<00:00, 282.65it/s]


 Epoch 1: Train loss: 0.5703 |  Train acc: 0.6865 |                 Val loss: 0.5619 | Val acc: 0.7043


Training Epoch: 2: 100%|██████████| 625/625 [00:06<00:00, 95.42it/s] 
Test Epoch: 2: 100%|██████████| 63/63 [00:00<00:00, 291.63it/s]


 Epoch 2: Train loss: 0.508 |  Train acc: 0.738 |                 Val loss: 0.5424 | Val acc: 0.7295


Training Epoch: 3: 100%|██████████| 625/625 [00:05<00:00, 112.63it/s]
Test Epoch: 3: 100%|██████████| 63/63 [00:00<00:00, 286.27it/s]


 Epoch 3: Train loss: 0.4607 |  Train acc: 0.7743 |                 Val loss: 0.5443 | Val acc: 0.7305


Training Epoch: 4: 100%|██████████| 625/625 [00:06<00:00, 94.64it/s] 
Test Epoch: 4: 100%|██████████| 63/63 [00:00<00:00, 297.28it/s]


 Epoch 4: Train loss: 0.4182 |  Train acc: 0.8041 |                 Val loss: 0.5581 | Val acc: 0.7333


Training Epoch: 5: 100%|██████████| 625/625 [00:05<00:00, 112.48it/s]
Test Epoch: 5: 100%|██████████| 63/63 [00:00<00:00, 293.69it/s]


 Epoch 5: Train loss: 0.3731 |  Train acc: 0.8352 |                 Val loss: 0.5823 | Val acc: 0.7353


Training Epoch: 6: 100%|██████████| 625/625 [00:06<00:00, 94.67it/s] 
Test Epoch: 6: 100%|██████████| 63/63 [00:00<00:00, 322.48it/s]


 Epoch 6: Train loss: 0.3313 |  Train acc: 0.8625 |                 Val loss: 0.6222 | Val acc: 0.7355


Training Epoch: 7: 100%|██████████| 625/625 [00:05<00:00, 114.33it/s]
Test Epoch: 7: 100%|██████████| 63/63 [00:00<00:00, 283.99it/s]


 Epoch 7: Train loss: 0.2911 |  Train acc: 0.8841 |                 Val loss: 0.6674 | Val acc: 0.731


Training Epoch: 8: 100%|██████████| 625/625 [00:06<00:00, 95.19it/s]
Test Epoch: 8: 100%|██████████| 63/63 [00:00<00:00, 310.80it/s]


 Epoch 8: Train loss: 0.2504 |  Train acc: 0.9061 |                 Val loss: 0.7072 | Val acc: 0.7253


Training Epoch: 9: 100%|██████████| 625/625 [00:05<00:00, 113.54it/s]
Test Epoch: 9: 100%|██████████| 63/63 [00:00<00:00, 282.42it/s]


 Epoch 9: Train loss: 0.2176 |  Train acc: 0.9216 |                 Val loss: 0.7748 | Val acc: 0.7245


Test Epoch: 6: 100%|██████████| 94/94 [00:00<00:00, 300.71it/s]


Best Model: Test Acc 0.7294999957084656 | Test Loss 0.6290222406387329 

Working with layer type: Optimised


Training Epoch: 0: 100%|██████████| 625/625 [00:06<00:00, 96.98it/s] 
Test Epoch: 0: 100%|██████████| 63/63 [00:00<00:00, 307.18it/s]


 Epoch 0: Train loss: 0.6745 |  Train acc: 0.5314 |                 Val loss: 0.6215 | Val acc: 0.621


Training Epoch: 1: 100%|██████████| 625/625 [00:05<00:00, 113.79it/s]
Test Epoch: 1: 100%|██████████| 63/63 [00:00<00:00, 291.13it/s]


 Epoch 1: Train loss: 0.5828 |  Train acc: 0.675 |                 Val loss: 0.5648 | Val acc: 0.6855


Training Epoch: 2: 100%|██████████| 625/625 [00:06<00:00, 97.33it/s] 
Test Epoch: 2: 100%|██████████| 63/63 [00:00<00:00, 273.17it/s]


 Epoch 2: Train loss: 0.5169 |  Train acc: 0.7378 |                 Val loss: 0.5495 | Val acc: 0.7158


Training Epoch: 3: 100%|██████████| 625/625 [00:05<00:00, 113.78it/s]
Test Epoch: 3: 100%|██████████| 63/63 [00:00<00:00, 294.17it/s]


 Epoch 3: Train loss: 0.4642 |  Train acc: 0.7775 |                 Val loss: 0.5452 | Val acc: 0.7155


Training Epoch: 4: 100%|██████████| 625/625 [00:06<00:00, 96.60it/s]
Test Epoch: 4: 100%|██████████| 63/63 [00:00<00:00, 284.40it/s]


 Epoch 4: Train loss: 0.4162 |  Train acc: 0.8124 |                 Val loss: 0.5582 | Val acc: 0.7265


Training Epoch: 5: 100%|██████████| 625/625 [00:05<00:00, 113.43it/s]
Test Epoch: 5: 100%|██████████| 63/63 [00:00<00:00, 284.83it/s]


 Epoch 5: Train loss: 0.3667 |  Train acc: 0.8433 |                 Val loss: 0.5976 | Val acc: 0.7293


Training Epoch: 6: 100%|██████████| 625/625 [00:06<00:00, 99.60it/s]
Test Epoch: 6: 100%|██████████| 63/63 [00:00<00:00, 189.81it/s]


 Epoch 6: Train loss: 0.3168 |  Train acc: 0.8724 |                 Val loss: 0.6358 | Val acc: 0.7198


Training Epoch: 7: 100%|██████████| 625/625 [00:05<00:00, 113.46it/s]
Test Epoch: 7: 100%|██████████| 63/63 [00:00<00:00, 278.41it/s]


 Epoch 7: Train loss: 0.2681 |  Train acc: 0.8975 |                 Val loss: 0.7151 | Val acc: 0.7243


Training Epoch: 8: 100%|██████████| 625/625 [00:06<00:00, 101.31it/s]
Test Epoch: 8: 100%|██████████| 63/63 [00:00<00:00, 192.48it/s]


 Epoch 8: Train loss: 0.2266 |  Train acc: 0.9185 |                 Val loss: 0.7957 | Val acc: 0.7213


Training Epoch: 9: 100%|██████████| 625/625 [00:05<00:00, 107.68it/s]
Test Epoch: 9: 100%|██████████| 63/63 [00:00<00:00, 282.25it/s]


 Epoch 9: Train loss: 0.1922 |  Train acc: 0.9332 |                 Val loss: 0.8651 | Val acc: 0.7155


Test Epoch: 5: 100%|██████████| 94/94 [00:00<00:00, 294.03it/s]


Best Model: Test Acc 0.7224999666213989 | Test Loss 0.5998744964599609 

Working with layer type: Optimised


Training Epoch: 0: 100%|██████████| 625/625 [00:06<00:00, 98.30it/s]
Test Epoch: 0: 100%|██████████| 63/63 [00:00<00:00, 199.16it/s]


 Epoch 0: Train loss: 0.6686 |  Train acc: 0.5398 |                 Val loss: 0.6192 | Val acc: 0.612


Training Epoch: 1: 100%|██████████| 625/625 [00:05<00:00, 109.67it/s]
Test Epoch: 1: 100%|██████████| 63/63 [00:00<00:00, 276.70it/s]


 Epoch 1: Train loss: 0.5759 |  Train acc: 0.6851 |                 Val loss: 0.5599 | Val acc: 0.6923


Training Epoch: 2: 100%|██████████| 625/625 [00:06<00:00, 103.24it/s]
Test Epoch: 2: 100%|██████████| 63/63 [00:00<00:00, 193.97it/s]


 Epoch 2: Train loss: 0.5116 |  Train acc: 0.7436 |                 Val loss: 0.5397 | Val acc: 0.713


Training Epoch: 3: 100%|██████████| 625/625 [00:05<00:00, 104.84it/s]
Test Epoch: 3: 100%|██████████| 63/63 [00:00<00:00, 294.86it/s]


 Epoch 3: Train loss: 0.4615 |  Train acc: 0.78 |                 Val loss: 0.5459 | Val acc: 0.717


Training Epoch: 4: 100%|██████████| 625/625 [00:06<00:00, 101.51it/s]
Test Epoch: 4: 100%|██████████| 63/63 [00:00<00:00, 193.61it/s]


 Epoch 4: Train loss: 0.4135 |  Train acc: 0.8161 |                 Val loss: 0.5654 | Val acc: 0.7213


Training Epoch: 5: 100%|██████████| 625/625 [00:05<00:00, 105.89it/s]
Test Epoch: 5: 100%|██████████| 63/63 [00:00<00:00, 278.88it/s]


 Epoch 5: Train loss: 0.3625 |  Train acc: 0.8482 |                 Val loss: 0.6013 | Val acc: 0.7285


Training Epoch: 6: 100%|██████████| 625/625 [00:06<00:00, 101.17it/s]
Test Epoch: 6: 100%|██████████| 63/63 [00:00<00:00, 187.98it/s]


 Epoch 6: Train loss: 0.3131 |  Train acc: 0.8751 |                 Val loss: 0.6391 | Val acc: 0.7355


Training Epoch: 7: 100%|██████████| 625/625 [00:05<00:00, 106.32it/s]
Test Epoch: 7: 100%|██████████| 63/63 [00:00<00:00, 284.96it/s]


 Epoch 7: Train loss: 0.2636 |  Train acc: 0.9017 |                 Val loss: 0.7215 | Val acc: 0.718


Training Epoch: 8: 100%|██████████| 625/625 [00:06<00:00, 99.88it/s]
Test Epoch: 8: 100%|██████████| 63/63 [00:00<00:00, 195.94it/s]


 Epoch 8: Train loss: 0.2199 |  Train acc: 0.922 |                 Val loss: 0.7784 | Val acc: 0.7185


Training Epoch: 9: 100%|██████████| 625/625 [00:06<00:00, 104.16it/s]
Test Epoch: 9: 100%|██████████| 63/63 [00:00<00:00, 275.95it/s]


 Epoch 9: Train loss: 0.1805 |  Train acc: 0.9376 |                 Val loss: 0.8284 | Val acc: 0.72


Test Epoch: 6: 100%|██████████| 94/94 [00:00<00:00, 307.47it/s]


Best Model: Test Acc 0.7318333387374878 | Test Loss 0.6314769983291626 

Working with layer type: Optimised


Training Epoch: 0: 100%|██████████| 625/625 [00:06<00:00, 99.05it/s]
Test Epoch: 0: 100%|██████████| 63/63 [00:00<00:00, 198.00it/s]


 Epoch 0: Train loss: 0.6831 |  Train acc: 0.5168 |                 Val loss: 0.6344 | Val acc: 0.618


Training Epoch: 1: 100%|██████████| 625/625 [00:05<00:00, 110.59it/s]
Test Epoch: 1: 100%|██████████| 63/63 [00:00<00:00, 281.60it/s]


 Epoch 1: Train loss: 0.5937 |  Train acc: 0.6662 |                 Val loss: 0.575 | Val acc: 0.6913


Training Epoch: 2: 100%|██████████| 625/625 [00:05<00:00, 105.61it/s]
Test Epoch: 2: 100%|██████████| 63/63 [00:00<00:00, 208.28it/s]


 Epoch 2: Train loss: 0.5248 |  Train acc: 0.7299 |                 Val loss: 0.5536 | Val acc: 0.683


Training Epoch: 3: 100%|██████████| 625/625 [00:05<00:00, 106.19it/s]
Test Epoch: 3: 100%|██████████| 63/63 [00:00<00:00, 273.64it/s]


 Epoch 3: Train loss: 0.4742 |  Train acc: 0.7672 |                 Val loss: 0.5466 | Val acc: 0.727


Training Epoch: 4: 100%|██████████| 625/625 [00:05<00:00, 109.63it/s]
Test Epoch: 4: 100%|██████████| 63/63 [00:00<00:00, 214.52it/s]


 Epoch 4: Train loss: 0.4286 |  Train acc: 0.8032 |                 Val loss: 0.5462 | Val acc: 0.7213


Training Epoch: 5: 100%|██████████| 625/625 [00:06<00:00, 103.67it/s]
Test Epoch: 5: 100%|██████████| 63/63 [00:00<00:00, 279.01it/s]


 Epoch 5: Train loss: 0.3802 |  Train acc: 0.835 |                 Val loss: 0.587 | Val acc: 0.721


Training Epoch: 6: 100%|██████████| 625/625 [00:05<00:00, 110.92it/s]
Test Epoch: 6: 100%|██████████| 63/63 [00:00<00:00, 225.57it/s]


 Epoch 6: Train loss: 0.3282 |  Train acc: 0.8671 |                 Val loss: 0.624 | Val acc: 0.7183


Training Epoch: 7: 100%|██████████| 625/625 [00:06<00:00, 101.19it/s]
Test Epoch: 7: 100%|██████████| 63/63 [00:00<00:00, 262.33it/s]


 Epoch 7: Train loss: 0.2823 |  Train acc: 0.8928 |                 Val loss: 0.6866 | Val acc: 0.721


Training Epoch: 8: 100%|██████████| 625/625 [00:05<00:00, 113.17it/s]
Test Epoch: 8: 100%|██████████| 63/63 [00:00<00:00, 278.89it/s]


 Epoch 8: Train loss: 0.2404 |  Train acc: 0.9139 |                 Val loss: 0.7585 | Val acc: 0.719


Training Epoch: 9: 100%|██████████| 625/625 [00:06<00:00, 98.33it/s] 
Test Epoch: 9: 100%|██████████| 63/63 [00:00<00:00, 290.73it/s]


 Epoch 9: Train loss: 0.204 |  Train acc: 0.9311 |                 Val loss: 0.8437 | Val acc: 0.7178


Test Epoch: 3: 100%|██████████| 94/94 [00:00<00:00, 308.77it/s]


Best Model: Test Acc 0.715666651725769 | Test Loss 0.5557748079299927 

Working with layer type: Optimised


Training Epoch: 0: 100%|██████████| 625/625 [00:05<00:00, 112.17it/s]
Test Epoch: 0: 100%|██████████| 63/63 [00:00<00:00, 320.38it/s]


 Epoch 0: Train loss: 0.6693 |  Train acc: 0.5428 |                 Val loss: 0.6144 | Val acc: 0.6388


Training Epoch: 1: 100%|██████████| 625/625 [00:06<00:00, 97.87it/s] 
Test Epoch: 1: 100%|██████████| 63/63 [00:00<00:00, 291.65it/s]


 Epoch 1: Train loss: 0.5774 |  Train acc: 0.6833 |                 Val loss: 0.56 | Val acc: 0.699


Training Epoch: 2: 100%|██████████| 625/625 [00:05<00:00, 111.46it/s]
Test Epoch: 2: 100%|██████████| 63/63 [00:00<00:00, 310.97it/s]


 Epoch 2: Train loss: 0.512 |  Train acc: 0.7416 |                 Val loss: 0.5411 | Val acc: 0.714


Training Epoch: 3: 100%|██████████| 625/625 [00:06<00:00, 98.67it/s] 
Test Epoch: 3: 100%|██████████| 63/63 [00:00<00:00, 279.49it/s]


 Epoch 3: Train loss: 0.4628 |  Train acc: 0.7792 |                 Val loss: 0.5417 | Val acc: 0.7153


Training Epoch: 4: 100%|██████████| 625/625 [00:05<00:00, 113.12it/s]
Test Epoch: 4: 100%|██████████| 63/63 [00:00<00:00, 312.62it/s]


 Epoch 4: Train loss: 0.4151 |  Train acc: 0.8101 |                 Val loss: 0.5695 | Val acc: 0.7263


Training Epoch: 5: 100%|██████████| 625/625 [00:06<00:00, 99.16it/s] 
Test Epoch: 5: 100%|██████████| 63/63 [00:00<00:00, 281.94it/s]


 Epoch 5: Train loss: 0.3641 |  Train acc: 0.843 |                 Val loss: 0.6151 | Val acc: 0.7185


Training Epoch: 6: 100%|██████████| 625/625 [00:05<00:00, 112.31it/s]
Test Epoch: 6: 100%|██████████| 63/63 [00:00<00:00, 300.90it/s]


 Epoch 6: Train loss: 0.3152 |  Train acc: 0.8733 |                 Val loss: 0.6296 | Val acc: 0.725


Training Epoch: 7: 100%|██████████| 625/625 [00:06<00:00, 98.41it/s] 
Test Epoch: 7: 100%|██████████| 63/63 [00:00<00:00, 279.36it/s]


 Epoch 7: Train loss: 0.2663 |  Train acc: 0.8966 |                 Val loss: 0.707 | Val acc: 0.725


Training Epoch: 8: 100%|██████████| 625/625 [00:05<00:00, 113.63it/s]
Test Epoch: 8: 100%|██████████| 63/63 [00:00<00:00, 313.38it/s]


 Epoch 8: Train loss: 0.2223 |  Train acc: 0.9196 |                 Val loss: 0.7649 | Val acc: 0.7243


Training Epoch: 9: 100%|██████████| 625/625 [00:06<00:00, 97.58it/s] 
Test Epoch: 9: 100%|██████████| 63/63 [00:00<00:00, 289.42it/s]


 Epoch 9: Train loss: 0.1865 |  Train acc: 0.9339 |                 Val loss: 0.829 | Val acc: 0.72


Test Epoch: 4: 100%|██████████| 94/94 [00:00<00:00, 311.19it/s]


Best Model: Test Acc 0.7281666398048401 | Test Loss 0.5689165592193604 

Working with layer type: Optimised


Training Epoch: 0: 100%|██████████| 625/625 [00:05<00:00, 112.30it/s]
Test Epoch: 0: 100%|██████████| 63/63 [00:00<00:00, 322.05it/s]


 Epoch 0: Train loss: 0.6655 |  Train acc: 0.5444 |                 Val loss: 0.613 | Val acc: 0.6505


Training Epoch: 1: 100%|██████████| 625/625 [00:06<00:00, 98.68it/s] 
Test Epoch: 1: 100%|██████████| 63/63 [00:00<00:00, 283.29it/s]


 Epoch 1: Train loss: 0.5784 |  Train acc: 0.6777 |                 Val loss: 0.562 | Val acc: 0.6955


Training Epoch: 2: 100%|██████████| 625/625 [00:05<00:00, 112.81it/s]
Test Epoch: 2: 100%|██████████| 63/63 [00:00<00:00, 287.56it/s]


 Epoch 2: Train loss: 0.5144 |  Train acc: 0.7379 |                 Val loss: 0.5454 | Val acc: 0.7283


Training Epoch: 3: 100%|██████████| 625/625 [00:06<00:00, 99.28it/s] 
Test Epoch: 3: 100%|██████████| 63/63 [00:00<00:00, 289.61it/s]


 Epoch 3: Train loss: 0.465 |  Train acc: 0.7746 |                 Val loss: 0.5479 | Val acc: 0.7158


Training Epoch: 4: 100%|██████████| 625/625 [00:05<00:00, 112.18it/s]
Test Epoch: 4: 100%|██████████| 63/63 [00:00<00:00, 306.39it/s]


 Epoch 4: Train loss: 0.4156 |  Train acc: 0.8076 |                 Val loss: 0.5523 | Val acc: 0.731


Training Epoch: 5: 100%|██████████| 625/625 [00:06<00:00, 97.49it/s]
Test Epoch: 5: 100%|██████████| 63/63 [00:00<00:00, 282.09it/s]


 Epoch 5: Train loss: 0.3693 |  Train acc: 0.8372 |                 Val loss: 0.5894 | Val acc: 0.7303


Training Epoch: 6: 100%|██████████| 625/625 [00:05<00:00, 112.01it/s]
Test Epoch: 6: 100%|██████████| 63/63 [00:00<00:00, 308.53it/s]


 Epoch 6: Train loss: 0.3194 |  Train acc: 0.8681 |                 Val loss: 0.6252 | Val acc: 0.7318


Training Epoch: 7: 100%|██████████| 625/625 [00:06<00:00, 100.45it/s]
Test Epoch: 7: 100%|██████████| 63/63 [00:00<00:00, 248.93it/s]


 Epoch 7: Train loss: 0.2755 |  Train acc: 0.8919 |                 Val loss: 0.6998 | Val acc: 0.7203


Training Epoch: 8: 100%|██████████| 625/625 [00:05<00:00, 112.59it/s]
Test Epoch: 8: 100%|██████████| 63/63 [00:00<00:00, 300.08it/s]


 Epoch 8: Train loss: 0.2343 |  Train acc: 0.912 |                 Val loss: 0.7619 | Val acc: 0.727


Training Epoch: 9: 100%|██████████| 625/625 [00:06<00:00, 103.70it/s]
Test Epoch: 9: 100%|██████████| 63/63 [00:00<00:00, 188.86it/s]


 Epoch 9: Train loss: 0.201 |  Train acc: 0.9284 |                 Val loss: 0.8083 | Val acc: 0.7205


Test Epoch: 6: 100%|██████████| 94/94 [00:00<00:00, 225.93it/s]


Best Model: Test Acc 0.7288333177566528 | Test Loss 0.6218332648277283 

Working with layer type: Efficient


Training Epoch: 0: 100%|██████████| 625/625 [00:05<00:00, 119.46it/s]
Test Epoch: 0: 100%|██████████| 63/63 [00:00<00:00, 319.61it/s]


 Epoch 0: Train loss: 0.6662 |  Train acc: 0.5481 |                 Val loss: 0.6211 | Val acc: 0.615


Training Epoch: 1: 100%|██████████| 625/625 [00:05<00:00, 109.75it/s]
Test Epoch: 1: 100%|██████████| 63/63 [00:00<00:00, 198.27it/s]


 Epoch 1: Train loss: 0.5836 |  Train acc: 0.6741 |                 Val loss: 0.5677 | Val acc: 0.6923


Training Epoch: 2: 100%|██████████| 625/625 [00:05<00:00, 111.17it/s]
Test Epoch: 2: 100%|██████████| 63/63 [00:00<00:00, 286.29it/s]


 Epoch 2: Train loss: 0.5214 |  Train acc: 0.7333 |                 Val loss: 0.5477 | Val acc: 0.7135


Training Epoch: 3: 100%|██████████| 625/625 [00:05<00:00, 118.68it/s]
Test Epoch: 3: 100%|██████████| 63/63 [00:00<00:00, 210.40it/s]


 Epoch 3: Train loss: 0.4725 |  Train acc: 0.7711 |                 Val loss: 0.5509 | Val acc: 0.7255


Training Epoch: 4: 100%|██████████| 625/625 [00:05<00:00, 104.83it/s]
Test Epoch: 4: 100%|██████████| 63/63 [00:00<00:00, 310.31it/s]


 Epoch 4: Train loss: 0.429 |  Train acc: 0.8016 |                 Val loss: 0.5574 | Val acc: 0.7235


Training Epoch: 5: 100%|██████████| 625/625 [00:05<00:00, 119.53it/s]
Test Epoch: 5: 100%|██████████| 63/63 [00:00<00:00, 266.40it/s]


 Epoch 5: Train loss: 0.3855 |  Train acc: 0.8292 |                 Val loss: 0.5829 | Val acc: 0.728


Training Epoch: 6: 100%|██████████| 625/625 [00:06<00:00, 99.95it/s] 
Test Epoch: 6: 100%|██████████| 63/63 [00:00<00:00, 319.59it/s]


 Epoch 6: Train loss: 0.3409 |  Train acc: 0.8566 |                 Val loss: 0.6429 | Val acc: 0.7213


Training Epoch: 7: 100%|██████████| 625/625 [00:05<00:00, 118.68it/s]
Test Epoch: 7: 100%|██████████| 63/63 [00:00<00:00, 279.62it/s]


 Epoch 7: Train loss: 0.2943 |  Train acc: 0.885 |                 Val loss: 0.683 | Val acc: 0.7178


Training Epoch: 8: 100%|██████████| 625/625 [00:06<00:00, 100.49it/s]
Test Epoch: 8: 100%|██████████| 63/63 [00:00<00:00, 304.65it/s]


 Epoch 8: Train loss: 0.2502 |  Train acc: 0.9084 |                 Val loss: 0.7288 | Val acc: 0.7175


Training Epoch: 9: 100%|██████████| 625/625 [00:05<00:00, 118.45it/s]
Test Epoch: 9: 100%|██████████| 63/63 [00:00<00:00, 324.68it/s]


 Epoch 9: Train loss: 0.2093 |  Train acc: 0.9265 |                 Val loss: 0.8099 | Val acc: 0.7115


Test Epoch: 5: 100%|██████████| 94/94 [00:00<00:00, 319.67it/s]


Best Model: Test Acc 0.7238333225250244 | Test Loss 0.5824313759803772 

Working with layer type: Efficient


Training Epoch: 0: 100%|██████████| 625/625 [00:06<00:00, 101.04it/s]
Test Epoch: 0: 100%|██████████| 63/63 [00:00<00:00, 292.98it/s]


 Epoch 0: Train loss: 0.6638 |  Train acc: 0.5487 |                 Val loss: 0.6148 | Val acc: 0.6145


Training Epoch: 1: 100%|██████████| 625/625 [00:05<00:00, 118.80it/s]
Test Epoch: 1: 100%|██████████| 63/63 [00:00<00:00, 295.06it/s]


 Epoch 1: Train loss: 0.5724 |  Train acc: 0.6786 |                 Val loss: 0.5604 | Val acc: 0.6975


Training Epoch: 2: 100%|██████████| 625/625 [00:06<00:00, 101.34it/s]
Test Epoch: 2: 100%|██████████| 63/63 [00:00<00:00, 202.70it/s]


 Epoch 2: Train loss: 0.5097 |  Train acc: 0.7293 |                 Val loss: 0.5426 | Val acc: 0.7153


Training Epoch: 3: 100%|██████████| 625/625 [00:05<00:00, 118.47it/s]
Test Epoch: 3: 100%|██████████| 63/63 [00:00<00:00, 306.99it/s]


 Epoch 3: Train loss: 0.4636 |  Train acc: 0.7655 |                 Val loss: 0.5427 | Val acc: 0.7328


Training Epoch: 4: 100%|██████████| 625/625 [00:05<00:00, 109.67it/s]
Test Epoch: 4: 100%|██████████| 63/63 [00:00<00:00, 200.18it/s]


 Epoch 4: Train loss: 0.4228 |  Train acc: 0.797 |                 Val loss: 0.548 | Val acc: 0.7293


Training Epoch: 5: 100%|██████████| 625/625 [00:05<00:00, 111.66it/s]
Test Epoch: 5: 100%|██████████| 63/63 [00:00<00:00, 308.07it/s]


 Epoch 5: Train loss: 0.3811 |  Train acc: 0.8272 |                 Val loss: 0.5673 | Val acc: 0.7303


Training Epoch: 6: 100%|██████████| 625/625 [00:05<00:00, 113.75it/s]
Test Epoch: 6: 100%|██████████| 63/63 [00:00<00:00, 224.42it/s]


 Epoch 6: Train loss: 0.3387 |  Train acc: 0.8558 |                 Val loss: 0.6028 | Val acc: 0.7258


Training Epoch: 7: 100%|██████████| 625/625 [00:05<00:00, 104.30it/s]
Test Epoch: 7: 100%|██████████| 63/63 [00:00<00:00, 290.82it/s]


 Epoch 7: Train loss: 0.293 |  Train acc: 0.882 |                 Val loss: 0.6707 | Val acc: 0.7273


Training Epoch: 8: 100%|██████████| 625/625 [00:05<00:00, 116.74it/s]
Test Epoch: 8: 100%|██████████| 63/63 [00:00<00:00, 317.75it/s]


 Epoch 8: Train loss: 0.2502 |  Train acc: 0.9031 |                 Val loss: 0.7007 | Val acc: 0.727


Training Epoch: 9: 100%|██████████| 625/625 [00:06<00:00, 100.24it/s]
Test Epoch: 9: 100%|██████████| 63/63 [00:00<00:00, 275.19it/s]


 Epoch 9: Train loss: 0.2079 |  Train acc: 0.925 |                 Val loss: 0.7583 | Val acc: 0.7215


Test Epoch: 3: 100%|██████████| 94/94 [00:00<00:00, 322.12it/s]


Best Model: Test Acc 0.7210000157356262 | Test Loss 0.5390888452529907 

Working with layer type: Efficient


Training Epoch: 0: 100%|██████████| 625/625 [00:05<00:00, 115.55it/s]
Test Epoch: 0: 100%|██████████| 63/63 [00:00<00:00, 304.03it/s]


 Epoch 0: Train loss: 0.6692 |  Train acc: 0.5328 |                 Val loss: 0.6215 | Val acc: 0.6108


Training Epoch: 1: 100%|██████████| 625/625 [00:06<00:00, 98.72it/s] 
Test Epoch: 1: 100%|██████████| 63/63 [00:00<00:00, 297.40it/s]


 Epoch 1: Train loss: 0.5747 |  Train acc: 0.674 |                 Val loss: 0.5579 | Val acc: 0.6968


Training Epoch: 2: 100%|██████████| 625/625 [00:05<00:00, 114.65it/s]
Test Epoch: 2: 100%|██████████| 63/63 [00:00<00:00, 307.64it/s]


 Epoch 2: Train loss: 0.5065 |  Train acc: 0.7334 |                 Val loss: 0.541 | Val acc: 0.7235


Training Epoch: 3: 100%|██████████| 625/625 [00:06<00:00, 99.74it/s] 
Test Epoch: 3: 100%|██████████| 63/63 [00:00<00:00, 274.24it/s]


 Epoch 3: Train loss: 0.4569 |  Train acc: 0.7745 |                 Val loss: 0.5312 | Val acc: 0.7308


Training Epoch: 4: 100%|██████████| 625/625 [00:05<00:00, 114.53it/s]
Test Epoch: 4: 100%|██████████| 63/63 [00:00<00:00, 320.92it/s]


 Epoch 4: Train loss: 0.4151 |  Train acc: 0.8025 |                 Val loss: 0.5421 | Val acc: 0.7393


Training Epoch: 5: 100%|██████████| 625/625 [00:06<00:00, 98.59it/s] 
Test Epoch: 5: 100%|██████████| 63/63 [00:00<00:00, 299.55it/s]


 Epoch 5: Train loss: 0.3729 |  Train acc: 0.8335 |                 Val loss: 0.5807 | Val acc: 0.747


Training Epoch: 6: 100%|██████████| 625/625 [00:05<00:00, 112.82it/s]
Test Epoch: 6: 100%|██████████| 63/63 [00:00<00:00, 319.24it/s]


 Epoch 6: Train loss: 0.3282 |  Train acc: 0.8608 |                 Val loss: 0.6138 | Val acc: 0.7335


Training Epoch: 7: 100%|██████████| 625/625 [00:06<00:00, 100.45it/s]
Test Epoch: 7: 100%|██████████| 63/63 [00:00<00:00, 297.76it/s]


 Epoch 7: Train loss: 0.2808 |  Train acc: 0.8862 |                 Val loss: 0.6883 | Val acc: 0.7298


Training Epoch: 8: 100%|██████████| 625/625 [00:05<00:00, 112.61it/s]
Test Epoch: 8: 100%|██████████| 63/63 [00:00<00:00, 320.05it/s]


 Epoch 8: Train loss: 0.2354 |  Train acc: 0.911 |                 Val loss: 0.7456 | Val acc: 0.7315


Training Epoch: 9: 100%|██████████| 625/625 [00:06<00:00, 102.57it/s]
Test Epoch: 9: 100%|██████████| 63/63 [00:00<00:00, 287.20it/s]


 Epoch 9: Train loss: 0.1936 |  Train acc: 0.9316 |                 Val loss: 0.8496 | Val acc: 0.7238


Test Epoch: 5: 100%|██████████| 94/94 [00:00<00:00, 304.23it/s]


Best Model: Test Acc 0.7333333492279053 | Test Loss 0.5888476967811584 

Working with layer type: Efficient


Training Epoch: 0: 100%|██████████| 625/625 [00:05<00:00, 117.88it/s]
Test Epoch: 0: 100%|██████████| 63/63 [00:00<00:00, 308.27it/s]


 Epoch 0: Train loss: 0.6668 |  Train acc: 0.5414 |                 Val loss: 0.6273 | Val acc: 0.578


Training Epoch: 1: 100%|██████████| 625/625 [00:05<00:00, 106.51it/s]
Test Epoch: 1: 100%|██████████| 63/63 [00:00<00:00, 198.90it/s]


 Epoch 1: Train loss: 0.5797 |  Train acc: 0.6744 |                 Val loss: 0.5576 | Val acc: 0.6828


Training Epoch: 2: 100%|██████████| 625/625 [00:05<00:00, 115.69it/s]
Test Epoch: 2: 100%|██████████| 63/63 [00:00<00:00, 277.01it/s]


 Epoch 2: Train loss: 0.5146 |  Train acc: 0.7331 |                 Val loss: 0.5357 | Val acc: 0.7208


Training Epoch: 3: 100%|██████████| 625/625 [00:05<00:00, 116.59it/s]
Test Epoch: 3: 100%|██████████| 63/63 [00:00<00:00, 233.52it/s]


 Epoch 3: Train loss: 0.4672 |  Train acc: 0.7686 |                 Val loss: 0.5314 | Val acc: 0.7303


Training Epoch: 4: 100%|██████████| 625/625 [00:05<00:00, 104.88it/s]
Test Epoch: 4: 100%|██████████| 63/63 [00:00<00:00, 288.18it/s]


 Epoch 4: Train loss: 0.4252 |  Train acc: 0.7977 |                 Val loss: 0.5379 | Val acc: 0.7288


Training Epoch: 5: 100%|██████████| 625/625 [00:05<00:00, 120.69it/s]
Test Epoch: 5: 100%|██████████| 63/63 [00:00<00:00, 296.84it/s]


 Epoch 5: Train loss: 0.3815 |  Train acc: 0.827 |                 Val loss: 0.5776 | Val acc: 0.7225


Training Epoch: 6: 100%|██████████| 625/625 [00:06<00:00, 100.44it/s]
Test Epoch: 6: 100%|██████████| 63/63 [00:00<00:00, 290.63it/s]


 Epoch 6: Train loss: 0.3352 |  Train acc: 0.8564 |                 Val loss: 0.6237 | Val acc: 0.7348


Training Epoch: 7: 100%|██████████| 625/625 [00:05<00:00, 119.90it/s]
Test Epoch: 7: 100%|██████████| 63/63 [00:00<00:00, 291.54it/s]


 Epoch 7: Train loss: 0.2903 |  Train acc: 0.8833 |                 Val loss: 0.6637 | Val acc: 0.7295


Training Epoch: 8: 100%|██████████| 625/625 [00:06<00:00, 99.57it/s] 
Test Epoch: 8: 100%|██████████| 63/63 [00:00<00:00, 312.46it/s]


 Epoch 8: Train loss: 0.2455 |  Train acc: 0.9052 |                 Val loss: 0.7588 | Val acc: 0.7293


Training Epoch: 9: 100%|██████████| 625/625 [00:05<00:00, 120.10it/s]
Test Epoch: 9: 100%|██████████| 63/63 [00:00<00:00, 307.00it/s]


 Epoch 9: Train loss: 0.2053 |  Train acc: 0.9259 |                 Val loss: 0.796 | Val acc: 0.7218


Test Epoch: 6: 100%|██████████| 94/94 [00:00<00:00, 297.59it/s]


Best Model: Test Acc 0.7364999651908875 | Test Loss 0.6132660508155823 

Working with layer type: Efficient


Training Epoch: 0: 100%|██████████| 625/625 [00:06<00:00, 101.47it/s]
Test Epoch: 0: 100%|██████████| 63/63 [00:00<00:00, 318.46it/s]


 Epoch 0: Train loss: 0.6687 |  Train acc: 0.541 |                 Val loss: 0.6176 | Val acc: 0.6203


Training Epoch: 1: 100%|██████████| 625/625 [00:05<00:00, 120.49it/s]
Test Epoch: 1: 100%|██████████| 63/63 [00:00<00:00, 274.93it/s]


 Epoch 1: Train loss: 0.5833 |  Train acc: 0.6671 |                 Val loss: 0.5588 | Val acc: 0.707


Training Epoch: 2: 100%|██████████| 625/625 [00:06<00:00, 100.68it/s]
Test Epoch: 2: 100%|██████████| 63/63 [00:00<00:00, 308.86it/s]


 Epoch 2: Train loss: 0.5188 |  Train acc: 0.7279 |                 Val loss: 0.5341 | Val acc: 0.7233


Training Epoch: 3: 100%|██████████| 625/625 [00:05<00:00, 121.02it/s]
Test Epoch: 3: 100%|██████████| 63/63 [00:00<00:00, 280.44it/s]


 Epoch 3: Train loss: 0.4691 |  Train acc: 0.7631 |                 Val loss: 0.5307 | Val acc: 0.7275


Training Epoch: 4: 100%|██████████| 625/625 [00:05<00:00, 104.62it/s]
Test Epoch: 4: 100%|██████████| 63/63 [00:00<00:00, 204.57it/s]


 Epoch 4: Train loss: 0.4262 |  Train acc: 0.7989 |                 Val loss: 0.5447 | Val acc: 0.7338


Training Epoch: 5: 100%|██████████| 625/625 [00:05<00:00, 116.78it/s]
Test Epoch: 5: 100%|██████████| 63/63 [00:00<00:00, 265.68it/s]


 Epoch 5: Train loss: 0.3799 |  Train acc: 0.8293 |                 Val loss: 0.5583 | Val acc: 0.7315


Training Epoch: 6: 100%|██████████| 625/625 [00:05<00:00, 111.91it/s]
Test Epoch: 6: 100%|██████████| 63/63 [00:00<00:00, 230.87it/s]


 Epoch 6: Train loss: 0.3351 |  Train acc: 0.8578 |                 Val loss: 0.6171 | Val acc: 0.733


Training Epoch: 7: 100%|██████████| 625/625 [00:05<00:00, 107.57it/s]
Test Epoch: 7: 100%|██████████| 63/63 [00:00<00:00, 302.61it/s]


 Epoch 7: Train loss: 0.2874 |  Train acc: 0.8835 |                 Val loss: 0.6579 | Val acc: 0.7315


Training Epoch: 8: 100%|██████████| 625/625 [00:05<00:00, 118.53it/s]
Test Epoch: 8: 100%|██████████| 63/63 [00:00<00:00, 319.98it/s]


 Epoch 8: Train loss: 0.245 |  Train acc: 0.9047 |                 Val loss: 0.7339 | Val acc: 0.7245


Training Epoch: 9: 100%|██████████| 625/625 [00:06<00:00, 103.74it/s]
Test Epoch: 9: 100%|██████████| 63/63 [00:00<00:00, 283.13it/s]


 Epoch 9: Train loss: 0.2012 |  Train acc: 0.9262 |                 Val loss: 0.8081 | Val acc: 0.7323


Test Epoch: 4: 100%|██████████| 94/94 [00:00<00:00, 315.73it/s]


Best Model: Test Acc 0.7266666293144226 | Test Loss 0.5571174025535583 

Working with layer type: Super


Training Epoch: 0: 100%|██████████| 625/625 [00:06<00:00, 103.01it/s]
Test Epoch: 0: 100%|██████████| 63/63 [00:00<00:00, 215.20it/s]


 Epoch 0: Train loss: 0.6681 |  Train acc: 0.535 |                 Val loss: 0.6126 | Val acc: 0.624


Training Epoch: 1: 100%|██████████| 625/625 [00:06<00:00, 91.06it/s] 
Test Epoch: 1: 100%|██████████| 63/63 [00:00<00:00, 266.13it/s]


 Epoch 1: Train loss: 0.5715 |  Train acc: 0.6835 |                 Val loss: 0.5438 | Val acc: 0.706


Training Epoch: 2: 100%|██████████| 625/625 [00:06<00:00, 101.16it/s]
Test Epoch: 2: 100%|██████████| 63/63 [00:00<00:00, 213.77it/s]


 Epoch 2: Train loss: 0.505 |  Train acc: 0.7373 |                 Val loss: 0.5245 | Val acc: 0.7268


Training Epoch: 3: 100%|██████████| 625/625 [00:06<00:00, 93.65it/s] 
Test Epoch: 3: 100%|██████████| 63/63 [00:00<00:00, 242.05it/s]


 Epoch 3: Train loss: 0.4612 |  Train acc: 0.7693 |                 Val loss: 0.5126 | Val acc: 0.723


Training Epoch: 4: 100%|██████████| 625/625 [00:06<00:00, 96.86it/s]
Test Epoch: 4: 100%|██████████| 63/63 [00:00<00:00, 177.63it/s]


 Epoch 4: Train loss: 0.4265 |  Train acc: 0.7917 |                 Val loss: 0.5205 | Val acc: 0.7368


Training Epoch: 5: 100%|██████████| 625/625 [00:06<00:00, 97.33it/s] 
Test Epoch: 5: 100%|██████████| 63/63 [00:00<00:00, 260.78it/s]


 Epoch 5: Train loss: 0.3945 |  Train acc: 0.8133 |                 Val loss: 0.514 | Val acc: 0.7445


Training Epoch: 6: 100%|██████████| 625/625 [00:06<00:00, 90.01it/s]
Test Epoch: 6: 100%|██████████| 63/63 [00:00<00:00, 176.84it/s]


 Epoch 6: Train loss: 0.3641 |  Train acc: 0.832 |                 Val loss: 0.5458 | Val acc: 0.7465


Training Epoch: 7: 100%|██████████| 625/625 [00:06<00:00, 103.16it/s]
Test Epoch: 7: 100%|██████████| 63/63 [00:00<00:00, 259.29it/s]


 Epoch 7: Train loss: 0.3339 |  Train acc: 0.8505 |                 Val loss: 0.5928 | Val acc: 0.7305


Training Epoch: 8: 100%|██████████| 625/625 [00:06<00:00, 89.54it/s]
Test Epoch: 8: 100%|██████████| 63/63 [00:00<00:00, 270.62it/s]


 Epoch 8: Train loss: 0.3011 |  Train acc: 0.869 |                 Val loss: 0.6458 | Val acc: 0.7435


Training Epoch: 9: 100%|██████████| 625/625 [00:06<00:00, 102.27it/s]
Test Epoch: 9: 100%|██████████| 63/63 [00:00<00:00, 265.81it/s]


 Epoch 9: Train loss: 0.2688 |  Train acc: 0.8876 |                 Val loss: 0.6712 | Val acc: 0.7335


Test Epoch: 6: 100%|██████████| 94/94 [00:00<00:00, 263.49it/s]


Best Model: Test Acc 0.7423333525657654 | Test Loss 0.5536013841629028 

Working with layer type: Super


Training Epoch: 0: 100%|██████████| 625/625 [00:07<00:00, 88.58it/s] 
Test Epoch: 0: 100%|██████████| 63/63 [00:00<00:00, 252.91it/s]


 Epoch 0: Train loss: 0.6773 |  Train acc: 0.5263 |                 Val loss: 0.6259 | Val acc: 0.6155


Training Epoch: 1: 100%|██████████| 625/625 [00:06<00:00, 103.62it/s]
Test Epoch: 1: 100%|██████████| 63/63 [00:00<00:00, 267.23it/s]


 Epoch 1: Train loss: 0.5892 |  Train acc: 0.6692 |                 Val loss: 0.5618 | Val acc: 0.6695


Training Epoch: 2: 100%|██████████| 625/625 [00:07<00:00, 87.46it/s] 
Test Epoch: 2: 100%|██████████| 63/63 [00:00<00:00, 276.88it/s]


 Epoch 2: Train loss: 0.5191 |  Train acc: 0.7346 |                 Val loss: 0.5293 | Val acc: 0.7133


Training Epoch: 3: 100%|██████████| 625/625 [00:06<00:00, 102.34it/s]
Test Epoch: 3: 100%|██████████| 63/63 [00:00<00:00, 260.53it/s]


 Epoch 3: Train loss: 0.469 |  Train acc: 0.7701 |                 Val loss: 0.5158 | Val acc: 0.7368


Training Epoch: 4: 100%|██████████| 625/625 [00:07<00:00, 87.75it/s] 
Test Epoch: 4: 100%|██████████| 63/63 [00:00<00:00, 265.95it/s]


 Epoch 4: Train loss: 0.4286 |  Train acc: 0.7988 |                 Val loss: 0.5258 | Val acc: 0.737


Training Epoch: 5: 100%|██████████| 625/625 [00:06<00:00, 102.77it/s]
Test Epoch: 5: 100%|██████████| 63/63 [00:00<00:00, 182.86it/s]


 Epoch 5: Train loss: 0.393 |  Train acc: 0.8225 |                 Val loss: 0.5428 | Val acc: 0.747


Training Epoch: 6: 100%|██████████| 625/625 [00:07<00:00, 87.72it/s] 
Test Epoch: 6: 100%|██████████| 63/63 [00:00<00:00, 283.56it/s]


 Epoch 6: Train loss: 0.3569 |  Train acc: 0.8433 |                 Val loss: 0.5722 | Val acc: 0.752


Training Epoch: 7: 100%|██████████| 625/625 [00:06<00:00, 96.16it/s]
Test Epoch: 7: 100%|██████████| 63/63 [00:00<00:00, 176.27it/s]


 Epoch 7: Train loss: 0.3219 |  Train acc: 0.8652 |                 Val loss: 0.6065 | Val acc: 0.7465


Training Epoch: 8: 100%|██████████| 625/625 [00:06<00:00, 92.35it/s] 
Test Epoch: 8: 100%|██████████| 63/63 [00:00<00:00, 265.74it/s]


 Epoch 8: Train loss: 0.2825 |  Train acc: 0.8876 |                 Val loss: 0.6381 | Val acc: 0.7455


Training Epoch: 9: 100%|██████████| 625/625 [00:06<00:00, 89.55it/s]
Test Epoch: 9: 100%|██████████| 63/63 [00:00<00:00, 223.79it/s]


 Epoch 9: Train loss: 0.2455 |  Train acc: 0.9092 |                 Val loss: 0.7118 | Val acc: 0.7323


Test Epoch: 6: 100%|██████████| 94/94 [00:00<00:00, 277.21it/s]


Best Model: Test Acc 0.7393333315849304 | Test Loss 0.5834963917732239 

Working with layer type: Super


Training Epoch: 0: 100%|██████████| 625/625 [00:06<00:00, 96.75it/s]
Test Epoch: 0: 100%|██████████| 63/63 [00:00<00:00, 243.68it/s]


 Epoch 0: Train loss: 0.6862 |  Train acc: 0.5048 |                 Val loss: 0.6347 | Val acc: 0.6023


Training Epoch: 1: 100%|██████████| 625/625 [00:07<00:00, 88.56it/s]
Test Epoch: 1: 100%|██████████| 63/63 [00:00<00:00, 252.15it/s]


 Epoch 1: Train loss: 0.5876 |  Train acc: 0.6702 |                 Val loss: 0.5484 | Val acc: 0.7053


Training Epoch: 2: 100%|██████████| 625/625 [00:06<00:00, 99.64it/s]
Test Epoch: 2: 100%|██████████| 63/63 [00:00<00:00, 255.10it/s]


 Epoch 2: Train loss: 0.515 |  Train acc: 0.7325 |                 Val loss: 0.5233 | Val acc: 0.7218


Training Epoch: 3: 100%|██████████| 625/625 [00:07<00:00, 87.98it/s] 
Test Epoch: 3: 100%|██████████| 63/63 [00:00<00:00, 260.12it/s]


 Epoch 3: Train loss: 0.4696 |  Train acc: 0.7676 |                 Val loss: 0.5223 | Val acc: 0.7448


Training Epoch: 4: 100%|██████████| 625/625 [00:06<00:00, 103.41it/s]
Test Epoch: 4: 100%|██████████| 63/63 [00:00<00:00, 273.58it/s]


 Epoch 4: Train loss: 0.4328 |  Train acc: 0.7939 |                 Val loss: 0.5177 | Val acc: 0.7428


Training Epoch: 5: 100%|██████████| 625/625 [00:06<00:00, 90.19it/s] 
Test Epoch: 5: 100%|██████████| 63/63 [00:00<00:00, 250.72it/s]


 Epoch 5: Train loss: 0.4005 |  Train acc: 0.8156 |                 Val loss: 0.5304 | Val acc: 0.7463


Training Epoch: 6: 100%|██████████| 625/625 [00:06<00:00, 102.08it/s]
Test Epoch: 6: 100%|██████████| 63/63 [00:00<00:00, 271.01it/s]


 Epoch 6: Train loss: 0.3656 |  Train acc: 0.8379 |                 Val loss: 0.5408 | Val acc: 0.742


Training Epoch: 7: 100%|██████████| 625/625 [00:07<00:00, 88.68it/s] 
Test Epoch: 7: 100%|██████████| 63/63 [00:00<00:00, 258.99it/s]


 Epoch 7: Train loss: 0.3307 |  Train acc: 0.8591 |                 Val loss: 0.5844 | Val acc: 0.7448


Training Epoch: 8: 100%|██████████| 625/625 [00:06<00:00, 101.23it/s]
Test Epoch: 8: 100%|██████████| 63/63 [00:00<00:00, 190.21it/s]


 Epoch 8: Train loss: 0.2953 |  Train acc: 0.8816 |                 Val loss: 0.6134 | Val acc: 0.7478


Training Epoch: 9: 100%|██████████| 625/625 [00:06<00:00, 92.32it/s] 
Test Epoch: 9: 100%|██████████| 63/63 [00:00<00:00, 261.07it/s]


 Epoch 9: Train loss: 0.262 |  Train acc: 0.8994 |                 Val loss: 0.6653 | Val acc: 0.738


Test Epoch: 8: 100%|██████████| 94/94 [00:00<00:00, 281.06it/s]


Best Model: Test Acc 0.7448333501815796 | Test Loss 0.6306514143943787 

Working with layer type: Super


Training Epoch: 0: 100%|██████████| 625/625 [00:06<00:00, 94.19it/s]
Test Epoch: 0: 100%|██████████| 63/63 [00:00<00:00, 182.86it/s]


 Epoch 0: Train loss: 0.6714 |  Train acc: 0.5333 |                 Val loss: 0.6146 | Val acc: 0.625


Training Epoch: 1: 100%|██████████| 625/625 [00:06<00:00, 100.34it/s]
Test Epoch: 1: 100%|██████████| 63/63 [00:00<00:00, 259.42it/s]


 Epoch 1: Train loss: 0.5714 |  Train acc: 0.6744 |                 Val loss: 0.5512 | Val acc: 0.7033


Training Epoch: 2: 100%|██████████| 625/625 [00:06<00:00, 90.39it/s]
Test Epoch: 2: 100%|██████████| 63/63 [00:00<00:00, 185.74it/s]


 Epoch 2: Train loss: 0.508 |  Train acc: 0.73 |                 Val loss: 0.529 | Val acc: 0.7263


Training Epoch: 3: 100%|██████████| 625/625 [00:05<00:00, 104.99it/s]
Test Epoch: 3: 100%|██████████| 63/63 [00:00<00:00, 256.51it/s]


 Epoch 3: Train loss: 0.4658 |  Train acc: 0.7596 |                 Val loss: 0.5331 | Val acc: 0.706


Training Epoch: 4: 100%|██████████| 625/625 [00:06<00:00, 89.80it/s]
Test Epoch: 4: 100%|██████████| 63/63 [00:00<00:00, 276.75it/s]


 Epoch 4: Train loss: 0.4292 |  Train acc: 0.7864 |                 Val loss: 0.5278 | Val acc: 0.7458


Training Epoch: 5: 100%|██████████| 625/625 [00:05<00:00, 105.15it/s]
Test Epoch: 5: 100%|██████████| 63/63 [00:00<00:00, 264.19it/s]


 Epoch 5: Train loss: 0.3947 |  Train acc: 0.8087 |                 Val loss: 0.5398 | Val acc: 0.7483


Training Epoch: 6: 100%|██████████| 625/625 [00:07<00:00, 88.91it/s] 
Test Epoch: 6: 100%|██████████| 63/63 [00:00<00:00, 259.76it/s]


 Epoch 6: Train loss: 0.3608 |  Train acc: 0.8321 |                 Val loss: 0.551 | Val acc: 0.7483


Training Epoch: 7: 100%|██████████| 625/625 [00:05<00:00, 104.61it/s]
Test Epoch: 7: 100%|██████████| 63/63 [00:00<00:00, 262.33it/s]


 Epoch 7: Train loss: 0.3256 |  Train acc: 0.8533 |                 Val loss: 0.6017 | Val acc: 0.7425


Training Epoch: 8: 100%|██████████| 625/625 [00:06<00:00, 90.19it/s] 
Test Epoch: 8: 100%|██████████| 63/63 [00:00<00:00, 283.09it/s]


 Epoch 8: Train loss: 0.2911 |  Train acc: 0.8748 |                 Val loss: 0.6555 | Val acc: 0.74


Training Epoch: 9: 100%|██████████| 625/625 [00:05<00:00, 104.29it/s]
Test Epoch: 9: 100%|██████████| 63/63 [00:00<00:00, 239.66it/s]


 Epoch 9: Train loss: 0.2581 |  Train acc: 0.8933 |                 Val loss: 0.6626 | Val acc: 0.7415


Test Epoch: 6: 100%|██████████| 94/94 [00:00<00:00, 273.19it/s]


Best Model: Test Acc 0.7416666746139526 | Test Loss 0.5463690757751465 

Working with layer type: Super


Training Epoch: 0: 100%|██████████| 625/625 [00:07<00:00, 89.26it/s] 
Test Epoch: 0: 100%|██████████| 63/63 [00:00<00:00, 292.98it/s]


 Epoch 0: Train loss: 0.6857 |  Train acc: 0.5132 |                 Val loss: 0.6298 | Val acc: 0.619


Training Epoch: 1: 100%|██████████| 625/625 [00:05<00:00, 105.14it/s]
Test Epoch: 1: 100%|██████████| 63/63 [00:00<00:00, 272.22it/s]


 Epoch 1: Train loss: 0.5856 |  Train acc: 0.6741 |                 Val loss: 0.5553 | Val acc: 0.6848


Training Epoch: 2: 100%|██████████| 625/625 [00:07<00:00, 88.73it/s] 
Test Epoch: 2: 100%|██████████| 63/63 [00:00<00:00, 281.12it/s]


 Epoch 2: Train loss: 0.5154 |  Train acc: 0.7336 |                 Val loss: 0.5323 | Val acc: 0.7048


Training Epoch: 3: 100%|██████████| 625/625 [00:05<00:00, 104.31it/s]
Test Epoch: 3: 100%|██████████| 63/63 [00:00<00:00, 181.31it/s]


 Epoch 3: Train loss: 0.4673 |  Train acc: 0.7659 |                 Val loss: 0.5181 | Val acc: 0.7318


Training Epoch: 4: 100%|██████████| 625/625 [00:06<00:00, 91.26it/s] 
Test Epoch: 4: 100%|██████████| 63/63 [00:00<00:00, 251.79it/s]


 Epoch 4: Train loss: 0.4302 |  Train acc: 0.791 |                 Val loss: 0.5198 | Val acc: 0.722


Training Epoch: 5: 100%|██████████| 625/625 [00:06<00:00, 101.47it/s]
Test Epoch: 5: 100%|██████████| 63/63 [00:00<00:00, 191.08it/s]


 Epoch 5: Train loss: 0.398 |  Train acc: 0.8101 |                 Val loss: 0.5385 | Val acc: 0.7513


Training Epoch: 6: 100%|██████████| 625/625 [00:06<00:00, 93.12it/s] 
Test Epoch: 6: 100%|██████████| 63/63 [00:00<00:00, 280.57it/s]


 Epoch 6: Train loss: 0.3643 |  Train acc: 0.8325 |                 Val loss: 0.5704 | Val acc: 0.74


Training Epoch: 7: 100%|██████████| 625/625 [00:06<00:00, 98.82it/s]
Test Epoch: 7: 100%|██████████| 63/63 [00:00<00:00, 175.45it/s]


 Epoch 7: Train loss: 0.3311 |  Train acc: 0.852 |                 Val loss: 0.5764 | Val acc: 0.7468


Training Epoch: 8: 100%|██████████| 625/625 [00:06<00:00, 94.45it/s] 
Test Epoch: 8: 100%|██████████| 63/63 [00:00<00:00, 269.86it/s]


 Epoch 8: Train loss: 0.2983 |  Train acc: 0.8752 |                 Val loss: 0.6067 | Val acc: 0.7418


Training Epoch: 9: 100%|██████████| 625/625 [00:06<00:00, 94.79it/s]
Test Epoch: 9: 100%|██████████| 63/63 [00:00<00:00, 167.46it/s]


 Epoch 9: Train loss: 0.2684 |  Train acc: 0.8931 |                 Val loss: 0.6472 | Val acc: 0.7353


Test Epoch: 5: 100%|██████████| 94/94 [00:00<00:00, 237.23it/s]


Best Model: Test Acc 0.7379999756813049 | Test Loss 0.5473145842552185 

