In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import yaml
import missingno as msno
from icecream import ic
import math

import matplotlib.pyplot as plt
import seaborn as sns   
sns.set(style="whitegrid")

import copy
from tqdm import tqdm

from sklearn.metrics import f1_score
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split

from torch.utils.tensorboard import SummaryWriter
from imblearn.over_sampling import SMOTE

pd.set_option('display.max_columns', None)
%load_ext blackcellmagic
%load_ext autoreload

In [2]:
with open("config.yaml", "r") as file:
    config = yaml.safe_load(file)

In [3]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
# Load data
train_df = pd.read_csv(config["paths"]["ptb_train"], header=None)
test_df = pd.read_csv(config["paths"]["ptb_test"], header=None)

In [5]:
X_train_full = train_df.iloc[:, :-1].to_numpy()
y_train_full = train_df.iloc[:, -1].to_numpy()

X_test = test_df.iloc[:, :-1].to_numpy()
y_test = test_df.iloc[:, -1].to_numpy()

X_train_full = np.c_[X_train_full, np.zeros((X_train_full.shape[0], 3))]
X_test = np.c_[X_test, np.zeros((X_test.shape[0], 3))]

In [6]:
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, stratify=y_train_full, random_state=42
)

In [7]:
if config["preprocessing"]["use_smote"]:
    sm = SMOTE(random_state=42)
    X_train, y_train = sm.fit_resample(X_train, y_train)
    #ic(X_train.shape, y_train.shape)
    #ic(np.unique(y_train, return_counts=True))

In [8]:
class PTB_Dataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [9]:
def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [10]:
#### ATTENTION
def attention(query, key, value, mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = scores.softmax(dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

In [11]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.0):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = int(d_model // h)
        self.h = int(h)
        self.linears = clones(nn.Linear(d_model, d_model, dtype=torch.float32), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
    def forward(self, query, key, value, mask=None):
        if mask is not None:
            mask = mask.unsqueeze(1) # Same mask applied to all h heads.
            
        # 1) Do all the linear projections in batch from d_model => h x d_k
        query, key, value = [
            lin(x).view(-1, self.h, self.d_k).transpose(1, 2)
            for lin, x in zip(self.linears[:3], [query, key, value])
        ]
        
        # 2) Apply attention on all the projected vectors in batch.
        x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)

        # 3) "Concat" using a view and apply a final linear.
        x = (x.transpose(1, 2).contiguous().view( -1, self.h * self.d_k))
        
        del query
        del key
        del value
        out = self.linears[-1](x)
        return out


In [12]:
## BLOCKING
class LayerNorm(nn.Module):
    "Construct a layernorm module."
    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features).to(DEVICE))
        self.b_2 = nn.Parameter(torch.zeros(features).to(DEVICE))
        self.eps = eps
    
    def forward(self, x):
        #print(f'Layer Norm input type = {x.dtype}')
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2
        #print(f'Layer Norm result type = {result.dtype}')

In [13]:
class SublayerConnection(nn.Module):
    "A residual connection followed by a layer norm."
    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))

In [14]:
class EncoderBlock(nn.Module):
    "Encoder is made up of self-attn and feed forward"
    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        "Follow Figure 1 (left) for connections."
        ic(x.shape)
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        ic(x.shape)
        x = self.sublayer[1](x, self.feed_forward)
        ic(x.shape)
        return x


In [15]:
class Encoder(nn.Module):
    "Core encoder is a stack of N layers"
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask=None):
        "Pass the input (and mask) through each layer in turn."
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

In [16]:
class FeedForward(nn.Module):
    "Construct a FeedForward network with one hidden layer"
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff, dtype=torch.float32)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(d_ff, d_model, dtype=torch.float32)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [17]:
class Transformer(nn.Module):
    "Transformer Model"
    def __init__(self, input_size, num_classes, num_heads=8, N=6, d_ff=256, dropout=0.0):
        super().__init__()
        c = copy.deepcopy
        attn = MultiHeadedAttention(num_heads, input_size)
        ff = FeedForward(input_size, d_ff, dropout)
        self.encoder = Encoder(EncoderBlock(input_size, c(attn), c(ff), dropout), N)
        self.fc = nn.Linear(input_size, num_classes, dtype=torch.float32)
    
    def forward(self, x):
        x = self.encoder(x)
        ic(x.shape)
        x = self.fc(x)
        ic(x.shape)
        return x

In [18]:
class EarlyStopping:
    def __init__(self, patience, verbose = 0, mode = 'mim'):
        self.patience = patience
        self.best_metric = float('inf') if mode == 'min' else 0.0
        self.best_epoch = -1
        self.early_stop = False
        self.verbose = verbose
        self.mode = mode

    def __call__(self, val_metric, epoch_index):
        if self.mode == 'max' and val_metric >= self.best_metric or self.mode == 'min' and val_metric <= self.best_metric:
            self.early_stop = False
            self.best_metric = val_metric
            self.best_epoch = epoch_index
        else:
            self.early_stop = (epoch_index - self.best_epoch) >= self.patience
            if self.verbose:
                print(f'EarlyStopping: current epoch {epoch_index + 1} no improvement for metric, best metric = {self.best_metric:0.4f} in epoch = {self.best_epoch + 1}')


In [19]:
train_loader = DataLoader(
    PTB_Dataset(X_train, y_train), batch_size=config["dataloader"]["train_batch_size"], shuffle=True, pin_memory=True
)
val_loader = DataLoader(
    PTB_Dataset(X_val, y_val), batch_size=config["dataloader"]["val_batch_size"], shuffle=False, pin_memory=True
)

test_loader = DataLoader(
    PTB_Dataset(X_test, y_test), batch_size=config["dataloader"]["test_batch_size"], shuffle=False, pin_memory=True
)


In [32]:
def fit_epoch(model, train_loader, criterion, optimizer, epoch_index):
    
    model.train(True) #just in case - decided to switch model to train mode
    
    running_loss = 0.0
    running_corrects = 0
    processed_data = 0
    batches_total = len(train_loader) #returns number of batches
    y_preds = np.array([])
    y_true = np.array([])
    
    #displaying pretty progress bar for each epoch
    #with tqdm(desc=f"train epoch {epoch_index}:", leave=False, total=batches_total) as pbar_train:
    
    for inputs, labels in train_loader:

            inputs = inputs.to(DEVICE)
            labels = labels.type(torch.LongTensor).to(DEVICE)
            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            preds = torch.argmax(F.softmax(outputs,1), 1)
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)
            processed_data += inputs.size(0)
            y_preds = np.concatenate((y_preds, preds.cpu().numpy()))
            y_true = np.concatenate((y_true, labels.cpu().numpy()))
            
            zero_grad_count = 0
            total_grad_count = 0
        #     for param in model.parameters():
        #         if param.grad is not None:
        #                 zero_grad_count += torch.sum(param.grad == 0).item()
        #                 total_grad_count += torch.numel(param.grad)

        #     print(f"Number of zero gradients = {zero_grad_count} / {total_grad_count}")
                
            #pbar_train.update(1)
              
    train_loss = running_loss / processed_data
    train_acc = running_corrects.cpu().numpy() / processed_data
    train_f1 = f1_score(y_true, y_preds)
    return train_loss, train_acc, train_f1


In [33]:

def eval_epoch(model, val_loader, criterion, epoch_index):
    model.eval()
    running_loss = 0.0
    running_corrects = 0
    processed_size = 0
    y_preds = np.array([])
    y_true = np.array([])

    #displaying pretty progress bar for each epoch
    #with tqdm(desc=f"eval epoch {epoch_index}:", leave=False, total=len(val_loader)) as pbar_val:
            
    for inputs, labels in val_loader:
            inputs = inputs.to(DEVICE)
            labels = labels.type(torch.LongTensor).to(DEVICE)

            with torch.set_grad_enabled(False):
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                preds = torch.argmax(F.softmax(outputs, 1), 1)

            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)
            processed_size += inputs.size(0)
            y_preds = np.concatenate((y_preds, preds.cpu().numpy()))
            y_true = np.concatenate((y_true, labels.cpu().numpy()))
            #pbar_val.update(1)
    
    val_loss = running_loss / processed_size
    val_acc = running_corrects.cpu().numpy() / processed_size
    val_f1 = f1_score(y_true, y_preds)
    return val_loss, val_acc, val_f1


In [22]:

def train(train_loader, val_loader, model, epochs, \
          criterion, opt, scheduler = None, early_stopping: EarlyStopping = None):
    
    history = []
    log_template = "\nEpoch {ep:03d} train_loss: {t_loss:0.4f} \
    val_loss {v_loss:0.4f} train_acc {t_acc:0.4f}, val_acc {v_acc:0.4f}, \
    train_f1 {train_f1:0.4f}, val_f1 {val_f1:0.4f} \
    best_val_f1 = {best_val_f1:0.4f}"

    # adding storage for best weights, will be finally used 
    # when training for all epochs is finished
    best_model_wts = model.state_dict()
    best_f1 = 0.0

    with tqdm(desc="epoch", total=epochs) as pbar_outer:

        for epoch in range(epochs):
            
            train_loss, train_acc, train_f1 = fit_epoch(model, train_loader, criterion, opt, epoch)
            
            val_loss, val_acc, val_f1 = eval_epoch(model, val_loader, criterion, epoch)
            history.append((train_loss, train_acc, train_f1, val_loss, val_acc, val_f1))
            
            #remember model weights if current accuracy is better than "best_acc"
            if val_f1 > best_f1:
                best_f1 = val_f1
                best_model_wts = copy.deepcopy(model.state_dict())

            pbar_outer.update(1)
            tqdm.write(log_template.format(ep=epoch+1, t_loss=train_loss,\
                                           v_loss=val_loss, t_acc=train_acc, \
                                           train_f1 = train_f1, v_acc=val_acc, \
                                           val_f1 = val_f1, best_val_f1 = best_f1))
            
             # if scheduler is defined, do LR step after every epoch
            if scheduler is not None:
                scheduler.step(val_loss)
            
            if early_stopping is not None:
                early_stopping(val_f1, epoch)
                if early_stopping.early_stop:
                    print(f'Early stopping, epoch = {epoch + 1}, best VAL F1 = {best_f1:0.4f}')
                    break
            
    # finally - let's load the best model weights
    model.load_state_dict(best_model_wts)        
            
    return model, history


In [23]:

def predict(model, test_loader):
    
    with torch.no_grad():
        logits = []
        true_labels = []
    
        for inputs, labels in test_loader:
            inputs = inputs.to(DEVICE)
            model.eval()
            outputs = model(inputs).cpu()
            logits.append(outputs)
            true_labels.append(labels.cpu().numpy())
            
    probs = nn.functional.softmax(torch.cat(logits), dim=-1).numpy()
    true_labels = np.concatenate(true_labels)
    return probs, true_labels

In [24]:

def init_parameters(model):
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)


In [25]:
BATCH_SIZE = 32
INPUT_SIZE = X_train.shape[1]
N_CLASSES = len(np.unique(y_train))
DROPOUT = 0.3
LR = 5e-03
N_EPOCHS = 100
HUM_HEADS = 5
NUM_ENCODERS = 5
DIM_FF = 128

model_t = Transformer(INPUT_SIZE, N_CLASSES, HUM_HEADS, NUM_ENCODERS, d_ff = DIM_FF, dropout = DROPOUT)
#init_parameters(model_t)

model_t = model_t.to(DEVICE)

optimizer = optim.Adam(model_t.parameters(), lr=LR)
criterion = torch.nn.CrossEntropyLoss()
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=5, threshold=1e-05, verbose = 1)
early_stopping = EarlyStopping(patience = 10, verbose = 1, mode = 'max')



In [26]:
model_t, history = train(train_loader, val_loader, model_t, N_EPOCHS,\
                         criterion, optimizer, scheduler, early_stopping)

epoch:   0%|          | 0/100 [00:00<?, ?it/s]ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 2])
  preds = torch.argmax(F.softmax(outputs), 1)
ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 190])
ic| x.shape: torch.Size([32, 190])


KeyboardInterrupt: 

In [None]:
# testing the model and returning key metrics
probs, y_test = predict(model_t, test_loader)
y_pred = np.argmax(probs, axis=1)
test_f1 = f1_score(y_test, y_pred)
test_acc = np.mean(y_test == y_pred)
test_loss = criterion(torch.tensor(probs), torch.tensor(y_test)).item()

print(f"Test loss: {test_loss}, Test accuracy: {test_acc}, Test F1: {test_f1}")

Test loss: 0.3397256814387403, Test accuracy: 0.9725180350395053, Test F1: 0.9809342230695901


Test loss: 0.34013988103442655, Test accuracy: 0.9728615596015115, Test F1: 0.9810959559703278

In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [None]:
class TransformerModel(nn.Module):
    def __init__(self, input_size, num_classes, num_heads, num_encoders, dim_feedforward, dropout=0.1):
        super(TransformerModel, self).__init__()

        self.embedding = nn.Linear(input_size, dim_feedforward)
        self.positional_encoding = PositionalEncoding(dim_feedforward, dropout)

        # Custom encoder layer to extract attention
        self.encoder_layers = nn.ModuleList([
            TransformerEncoderLayerWithAttention(dim_feedforward, num_heads, dropout)
            for _ in range(num_encoders)
        ])
        
        self.output_layer = nn.Linear(dim_feedforward, num_classes)
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.positional_encoding(x)
        attentions = []
        
        for layer in self.encoder_layers:
            x, attn = layer(x)
            attentions.append(attn)
        
        x = x.mean(dim=1)
        x = self.output_layer(x)
        return x#, attentions

class TransformerEncoderLayerWithAttention(nn.Module):
    def __init__(self, d_model, nhead, dropout=0.1):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.linear1 = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_model, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src):
        src2, attn = self.self_attn(src, src, src, need_weights=True)
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.dropout(self.linear1(F.relu(self.linear2(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src, attn


In [None]:
def create_padding_mask(seq, pad_token=0):
    # Assuming `seq` is of shape [batch_size, sequence_length]
    # and padded positions are denoted by `pad_token` (e.g., 0)
    mask = (seq != pad_token)  # Shape: [batch_size, 1, 1, sequence_length]
    # `True` for non-pad tokens and `False` for pad tokens
    return mask

In [60]:
class TransformerModel(nn.Module):
    def __init__(
        self, input_size, model_size, num_classes, num_heads=8, num_layers=6, d_ff=256, dropout=0.1
    ):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Linear(input_size, input_size * model_size)
        #self.pos_encoder = PositionalEncoding(model_size, dropout)
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=model_size, nhead=num_heads, dim_feedforward = d_ff, 
            dropout=dropout, activation='gelu', layer_norm_eps=1e-6
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer=encoder_layers, num_layers=num_layers,
            norm = nn.LayerNorm(model_size)
        )
        self.output_layer = nn.Linear(2 * model_size, num_classes)
        self.input_size = input_size

    def forward(self, x):
        src = self.embedding(x) * math.sqrt(self.input_size)
        src = src.reshape(x.shape[0], x.shape[1], -1)
        #ic(src.shape)
        #src = self.pos_encoder(src)
        #ic(src.shape)
        output = self.transformer_encoder(src)
        #ic(output.shape)
        #ic(output.mean(dim=1).shape)
        # max pooling:
        avg_pooled = output.mean(dim=1)
        max_pooled = output.max(dim=1).values
        output = torch.cat([avg_pooled, max_pooled], dim=1)
        output = self.output_layer(output)
        #ic(output.shape)
        return output

In [54]:
def init_parameters(model):
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

In [63]:
BATCH_SIZE = 32
INPUT_SIZE = X_train.shape[1]
N_CLASSES = len(np.unique(y_train))
DROPOUT = 0.2
LR = 1e-3
N_EPOCHS = 100
NUM_HEADS = 5
NUM_ENCODERS = 5
DIM_FF = 128
MODEL_SIZE = 10

model_t = TransformerModel(
    num_classes=N_CLASSES,
    input_size=INPUT_SIZE,
    model_size=MODEL_SIZE,
    num_heads=NUM_HEADS,
    num_layers=NUM_ENCODERS,
    d_ff=DIM_FF,
    dropout=DROPOUT
)
model_t = model_t.to(DEVICE)
init_parameters(model_t)

optimizer = optim.AdamW(model_t.parameters(), lr=LR, weight_decay = config["adamw"]["weight_decay"])
criterion = torch.nn.CrossEntropyLoss()
#scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=5, threshold=1e-05, verbose = 1)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma = 0.95)
early_stopping = EarlyStopping(patience = 10, verbose = 1, mode = 'max')



In [64]:
model_t, history = train(train_loader, val_loader, model_t, N_EPOCHS,\
                         criterion, optimizer, scheduler=None, early_stopping=early_stopping )

                                                      


Epoch 001 train_loss: 0.5561     val_loss 0.4443 train_acc 0.7201, val_acc 0.7935,     train_f1 0.8299, val_f1 0.8589     best_val_f1 = 0.8589


                                                      


Epoch 002 train_loss: 0.4752     val_loss 0.3934 train_acc 0.7529, val_acc 0.8042,     train_f1 0.8380, val_f1 0.8611     best_val_f1 = 0.8611


                                                      


Epoch 003 train_loss: 0.4267     val_loss 0.4066 train_acc 0.7908, val_acc 0.7978,     train_f1 0.8570, val_f1 0.8556     best_val_f1 = 0.8611


epoch:   3%|▎         | 3/100 [00:06<03:26,  2.13s/it]

EarlyStopping: current epoch 3 no improvement for metric, best metric = 0.8611 in epoch = 2


                                                      


Epoch 004 train_loss: 0.4057     val_loss 0.3939 train_acc 0.8035, val_acc 0.8171,     train_f1 0.8639, val_f1 0.8692     best_val_f1 = 0.8692


                                                      


Epoch 005 train_loss: 0.3965     val_loss 0.4373 train_acc 0.8081, val_acc 0.8012,     train_f1 0.8663, val_f1 0.8500     best_val_f1 = 0.8692


epoch:   5%|▌         | 5/100 [00:10<03:22,  2.13s/it]

EarlyStopping: current epoch 5 no improvement for metric, best metric = 0.8692 in epoch = 4


                                                      


Epoch 006 train_loss: 0.3995     val_loss 0.3986 train_acc 0.8085, val_acc 0.8240,     train_f1 0.8681, val_f1 0.8806     best_val_f1 = 0.8806


                                                      


Epoch 007 train_loss: 0.3936     val_loss 0.4132 train_acc 0.8171, val_acc 0.8098,     train_f1 0.8736, val_f1 0.8554     best_val_f1 = 0.8806


epoch:   7%|▋         | 7/100 [00:14<03:18,  2.13s/it]

EarlyStopping: current epoch 7 no improvement for metric, best metric = 0.8806 in epoch = 6


                                                      


Epoch 008 train_loss: 0.3778     val_loss 0.4467 train_acc 0.8204, val_acc 0.8227,     train_f1 0.8746, val_f1 0.8789     best_val_f1 = 0.8806


epoch:   8%|▊         | 8/100 [00:17<03:16,  2.13s/it]

EarlyStopping: current epoch 8 no improvement for metric, best metric = 0.8806 in epoch = 6


                                                      


Epoch 009 train_loss: 0.3597     val_loss 0.3821 train_acc 0.8326, val_acc 0.8308,     train_f1 0.8834, val_f1 0.8798     best_val_f1 = 0.8806


epoch:   9%|▉         | 9/100 [00:19<03:14,  2.14s/it]

EarlyStopping: current epoch 9 no improvement for metric, best metric = 0.8806 in epoch = 6


                                                       


Epoch 010 train_loss: 0.3530     val_loss 0.3405 train_acc 0.8396, val_acc 0.8390,     train_f1 0.8881, val_f1 0.8805     best_val_f1 = 0.8806


epoch:  10%|█         | 10/100 [00:21<03:13,  2.15s/it]

EarlyStopping: current epoch 10 no improvement for metric, best metric = 0.8806 in epoch = 6


                                                       


Epoch 011 train_loss: 0.3351     val_loss 0.3188 train_acc 0.8471, val_acc 0.8639,     train_f1 0.8924, val_f1 0.9045     best_val_f1 = 0.9045


                                                       


Epoch 012 train_loss: 0.3329     val_loss 0.3014 train_acc 0.8457, val_acc 0.8600,     train_f1 0.8919, val_f1 0.9013     best_val_f1 = 0.9045


epoch:  12%|█▏        | 12/100 [00:25<03:09,  2.15s/it]

EarlyStopping: current epoch 12 no improvement for metric, best metric = 0.9045 in epoch = 11


                                                       


Epoch 013 train_loss: 0.3128     val_loss 0.3071 train_acc 0.8604, val_acc 0.8639,     train_f1 0.9023, val_f1 0.9017     best_val_f1 = 0.9045


epoch:  13%|█▎        | 13/100 [00:27<03:06,  2.14s/it]

EarlyStopping: current epoch 13 no improvement for metric, best metric = 0.9045 in epoch = 11


                                                       


Epoch 014 train_loss: 0.2982     val_loss 0.3018 train_acc 0.8677, val_acc 0.8751,     train_f1 0.9069, val_f1 0.9120     best_val_f1 = 0.9120


                                                       


Epoch 015 train_loss: 0.2988     val_loss 0.2760 train_acc 0.8668, val_acc 0.8738,     train_f1 0.9067, val_f1 0.9121     best_val_f1 = 0.9121


                                                       


Epoch 016 train_loss: 0.2823     val_loss 0.3122 train_acc 0.8718, val_acc 0.8635,     train_f1 0.9098, val_f1 0.9015     best_val_f1 = 0.9121


epoch:  16%|█▌        | 16/100 [00:34<03:00,  2.14s/it]

EarlyStopping: current epoch 16 no improvement for metric, best metric = 0.9121 in epoch = 15


                                                       


Epoch 017 train_loss: 0.2773     val_loss 0.3220 train_acc 0.8802, val_acc 0.8600,     train_f1 0.9161, val_f1 0.8957     best_val_f1 = 0.9121


epoch:  17%|█▋        | 17/100 [00:36<02:57,  2.14s/it]

EarlyStopping: current epoch 17 no improvement for metric, best metric = 0.9121 in epoch = 15


                                                       


Epoch 018 train_loss: 0.2639     val_loss 0.2583 train_acc 0.8846, val_acc 0.8866,     train_f1 0.9194, val_f1 0.9211     best_val_f1 = 0.9211


                                                       


Epoch 019 train_loss: 0.2537     val_loss 0.2636 train_acc 0.8889, val_acc 0.8849,     train_f1 0.9224, val_f1 0.9170     best_val_f1 = 0.9211


epoch:  19%|█▉        | 19/100 [00:40<02:53,  2.14s/it]

EarlyStopping: current epoch 19 no improvement for metric, best metric = 0.9211 in epoch = 18


                                                       


Epoch 020 train_loss: 0.2414     val_loss 0.2464 train_acc 0.8995, val_acc 0.8901,     train_f1 0.9302, val_f1 0.9219     best_val_f1 = 0.9219


                                                       


Epoch 021 train_loss: 0.2368     val_loss 0.2470 train_acc 0.9021, val_acc 0.8970,     train_f1 0.9320, val_f1 0.9298     best_val_f1 = 0.9298


                                                       


Epoch 022 train_loss: 0.2274     val_loss 0.2398 train_acc 0.9054, val_acc 0.9004,     train_f1 0.9341, val_f1 0.9304     best_val_f1 = 0.9304


                                                       


Epoch 023 train_loss: 0.2241     val_loss 0.2739 train_acc 0.9079, val_acc 0.8914,     train_f1 0.9360, val_f1 0.9234     best_val_f1 = 0.9304


epoch:  23%|██▎       | 23/100 [00:49<02:45,  2.14s/it]

EarlyStopping: current epoch 23 no improvement for metric, best metric = 0.9304 in epoch = 22


                                                       


Epoch 024 train_loss: 0.2243     val_loss 0.2456 train_acc 0.9075, val_acc 0.9000,     train_f1 0.9358, val_f1 0.9279     best_val_f1 = 0.9304


epoch:  24%|██▍       | 24/100 [00:51<02:43,  2.15s/it]

EarlyStopping: current epoch 24 no improvement for metric, best metric = 0.9304 in epoch = 22


                                                       


Epoch 025 train_loss: 0.2142     val_loss 0.2301 train_acc 0.9123, val_acc 0.9077,     train_f1 0.9391, val_f1 0.9354     best_val_f1 = 0.9354


                                                       


Epoch 026 train_loss: 0.2056     val_loss 0.2123 train_acc 0.9165, val_acc 0.9103,     train_f1 0.9418, val_f1 0.9373     best_val_f1 = 0.9373


                                                       


Epoch 027 train_loss: 0.1991     val_loss 0.2356 train_acc 0.9170, val_acc 0.9000,     train_f1 0.9422, val_f1 0.9309     best_val_f1 = 0.9373


epoch:  27%|██▋       | 27/100 [00:57<02:36,  2.14s/it]

EarlyStopping: current epoch 27 no improvement for metric, best metric = 0.9373 in epoch = 26


                                                       


Epoch 028 train_loss: 0.1959     val_loss 0.2009 train_acc 0.9192, val_acc 0.9167,     train_f1 0.9439, val_f1 0.9417     best_val_f1 = 0.9417


                                                       


Epoch 029 train_loss: 0.1921     val_loss 0.2178 train_acc 0.9205, val_acc 0.9060,     train_f1 0.9449, val_f1 0.9347     best_val_f1 = 0.9417


epoch:  29%|██▉       | 29/100 [01:02<02:31,  2.14s/it]

EarlyStopping: current epoch 29 no improvement for metric, best metric = 0.9417 in epoch = 28


                                                       


Epoch 030 train_loss: 0.1926     val_loss 0.2007 train_acc 0.9218, val_acc 0.9137,     train_f1 0.9456, val_f1 0.9397     best_val_f1 = 0.9417


epoch:  30%|███       | 30/100 [01:04<02:29,  2.14s/it]

EarlyStopping: current epoch 30 no improvement for metric, best metric = 0.9417 in epoch = 28


                                                       


Epoch 031 train_loss: 0.1851     val_loss 0.2021 train_acc 0.9254, val_acc 0.9150,     train_f1 0.9481, val_f1 0.9411     best_val_f1 = 0.9417


epoch:  31%|███       | 31/100 [01:06<02:27,  2.14s/it]

EarlyStopping: current epoch 31 no improvement for metric, best metric = 0.9417 in epoch = 28


                                                       


Epoch 032 train_loss: 0.1844     val_loss 0.1941 train_acc 0.9263, val_acc 0.9154,     train_f1 0.9489, val_f1 0.9411     best_val_f1 = 0.9417


epoch:  32%|███▏      | 32/100 [01:08<02:25,  2.14s/it]

EarlyStopping: current epoch 32 no improvement for metric, best metric = 0.9417 in epoch = 28


                                                       


Epoch 033 train_loss: 0.1862     val_loss 0.2064 train_acc 0.9244, val_acc 0.9073,     train_f1 0.9476, val_f1 0.9359     best_val_f1 = 0.9417


epoch:  33%|███▎      | 33/100 [01:10<02:23,  2.14s/it]

EarlyStopping: current epoch 33 no improvement for metric, best metric = 0.9417 in epoch = 28


                                                       


Epoch 034 train_loss: 0.1800     val_loss 0.2116 train_acc 0.9274, val_acc 0.9163,     train_f1 0.9496, val_f1 0.9401     best_val_f1 = 0.9417


epoch:  34%|███▍      | 34/100 [01:12<02:21,  2.14s/it]

EarlyStopping: current epoch 34 no improvement for metric, best metric = 0.9417 in epoch = 28


                                                       


Epoch 035 train_loss: 0.1775     val_loss 0.1792 train_acc 0.9275, val_acc 0.9231,     train_f1 0.9497, val_f1 0.9464     best_val_f1 = 0.9464


                                                       


Epoch 036 train_loss: 0.1698     val_loss 0.2050 train_acc 0.9349, val_acc 0.9197,     train_f1 0.9549, val_f1 0.9439     best_val_f1 = 0.9464


epoch:  36%|███▌      | 36/100 [01:17<02:16,  2.13s/it]

EarlyStopping: current epoch 36 no improvement for metric, best metric = 0.9464 in epoch = 35


                                                       


Epoch 037 train_loss: 0.1634     val_loss 0.2382 train_acc 0.9350, val_acc 0.9060,     train_f1 0.9550, val_f1 0.9332     best_val_f1 = 0.9464


epoch:  37%|███▋      | 37/100 [01:19<02:14,  2.14s/it]

EarlyStopping: current epoch 37 no improvement for metric, best metric = 0.9464 in epoch = 35


                                                       


Epoch 038 train_loss: 0.1629     val_loss 0.1894 train_acc 0.9334, val_acc 0.9240,     train_f1 0.9539, val_f1 0.9480     best_val_f1 = 0.9480


                                                       


Epoch 039 train_loss: 0.1628     val_loss 0.1857 train_acc 0.9366, val_acc 0.9188,     train_f1 0.9561, val_f1 0.9431     best_val_f1 = 0.9480


epoch:  39%|███▉      | 39/100 [01:23<02:09,  2.12s/it]

EarlyStopping: current epoch 39 no improvement for metric, best metric = 0.9480 in epoch = 38


                                                       


Epoch 040 train_loss: 0.1618     val_loss 0.1997 train_acc 0.9334, val_acc 0.9188,     train_f1 0.9539, val_f1 0.9434     best_val_f1 = 0.9480


epoch:  40%|████      | 40/100 [01:25<02:07,  2.12s/it]

EarlyStopping: current epoch 40 no improvement for metric, best metric = 0.9480 in epoch = 38


                                                       


Epoch 041 train_loss: 0.1496     val_loss 0.1826 train_acc 0.9421, val_acc 0.9317,     train_f1 0.9599, val_f1 0.9520     best_val_f1 = 0.9520


                                                       


Epoch 042 train_loss: 0.1508     val_loss 0.1816 train_acc 0.9420, val_acc 0.9304,     train_f1 0.9599, val_f1 0.9519     best_val_f1 = 0.9520


epoch:  42%|████▏     | 42/100 [01:29<02:02,  2.11s/it]

EarlyStopping: current epoch 42 no improvement for metric, best metric = 0.9520 in epoch = 41


                                                       


Epoch 043 train_loss: 0.1444     val_loss 0.2055 train_acc 0.9438, val_acc 0.9240,     train_f1 0.9611, val_f1 0.9458     best_val_f1 = 0.9520


epoch:  43%|████▎     | 43/100 [01:31<02:00,  2.11s/it]

EarlyStopping: current epoch 43 no improvement for metric, best metric = 0.9520 in epoch = 41


                                                       


Epoch 044 train_loss: 0.1420     val_loss 0.1836 train_acc 0.9456, val_acc 0.9330,     train_f1 0.9624, val_f1 0.9532     best_val_f1 = 0.9532


                                                       


Epoch 045 train_loss: 0.1416     val_loss 0.1707 train_acc 0.9444, val_acc 0.9360,     train_f1 0.9615, val_f1 0.9555     best_val_f1 = 0.9555


                                                       


Epoch 046 train_loss: 0.1524     val_loss 0.1822 train_acc 0.9388, val_acc 0.9270,     train_f1 0.9576, val_f1 0.9499     best_val_f1 = 0.9555


epoch:  46%|████▌     | 46/100 [01:38<01:53,  2.11s/it]

EarlyStopping: current epoch 46 no improvement for metric, best metric = 0.9555 in epoch = 45


                                                       


Epoch 047 train_loss: 0.1266     val_loss 0.1859 train_acc 0.9491, val_acc 0.9334,     train_f1 0.9648, val_f1 0.9544     best_val_f1 = 0.9555


epoch:  47%|████▋     | 47/100 [01:40<01:52,  2.11s/it]

EarlyStopping: current epoch 47 no improvement for metric, best metric = 0.9555 in epoch = 45


                                                       


Epoch 048 train_loss: 0.1395     val_loss 0.1896 train_acc 0.9445, val_acc 0.9283,     train_f1 0.9616, val_f1 0.9504     best_val_f1 = 0.9555


epoch:  48%|████▊     | 48/100 [01:42<01:50,  2.12s/it]

EarlyStopping: current epoch 48 no improvement for metric, best metric = 0.9555 in epoch = 45


                                                       


Epoch 049 train_loss: 0.1241     val_loss 0.2009 train_acc 0.9523, val_acc 0.9253,     train_f1 0.9670, val_f1 0.9494     best_val_f1 = 0.9555


epoch:  49%|████▉     | 49/100 [01:44<01:48,  2.12s/it]

EarlyStopping: current epoch 49 no improvement for metric, best metric = 0.9555 in epoch = 45


                                                       


Epoch 050 train_loss: 0.1230     val_loss 0.1587 train_acc 0.9539, val_acc 0.9450,     train_f1 0.9681, val_f1 0.9618     best_val_f1 = 0.9618


                                                       


Epoch 051 train_loss: 0.1199     val_loss 0.1729 train_acc 0.9579, val_acc 0.9369,     train_f1 0.9709, val_f1 0.9559     best_val_f1 = 0.9618


epoch:  51%|█████     | 51/100 [01:48<01:44,  2.13s/it]

EarlyStopping: current epoch 51 no improvement for metric, best metric = 0.9618 in epoch = 50


                                                       


Epoch 052 train_loss: 0.1189     val_loss 0.1618 train_acc 0.9565, val_acc 0.9386,     train_f1 0.9699, val_f1 0.9579     best_val_f1 = 0.9618


epoch:  52%|█████▏    | 52/100 [01:51<01:42,  2.13s/it]

EarlyStopping: current epoch 52 no improvement for metric, best metric = 0.9618 in epoch = 50


                                                       


Epoch 053 train_loss: 0.1182     val_loss 0.1747 train_acc 0.9539, val_acc 0.9386,     train_f1 0.9681, val_f1 0.9579     best_val_f1 = 0.9618


epoch:  53%|█████▎    | 53/100 [01:53<01:40,  2.14s/it]

EarlyStopping: current epoch 53 no improvement for metric, best metric = 0.9618 in epoch = 50


                                                       


Epoch 054 train_loss: 0.1128     val_loss 0.1725 train_acc 0.9569, val_acc 0.9407,     train_f1 0.9702, val_f1 0.9587     best_val_f1 = 0.9618


epoch:  54%|█████▍    | 54/100 [01:55<01:38,  2.14s/it]

EarlyStopping: current epoch 54 no improvement for metric, best metric = 0.9618 in epoch = 50


                                                       


Epoch 055 train_loss: 0.1164     val_loss 0.1740 train_acc 0.9564, val_acc 0.9382,     train_f1 0.9698, val_f1 0.9571     best_val_f1 = 0.9618


epoch:  55%|█████▌    | 55/100 [01:57<01:36,  2.14s/it]

EarlyStopping: current epoch 55 no improvement for metric, best metric = 0.9618 in epoch = 50


                                                       


Epoch 056 train_loss: 0.1044     val_loss 0.2152 train_acc 0.9614, val_acc 0.9317,     train_f1 0.9733, val_f1 0.9523     best_val_f1 = 0.9618


epoch:  56%|█████▌    | 56/100 [01:59<01:34,  2.14s/it]

EarlyStopping: current epoch 56 no improvement for metric, best metric = 0.9618 in epoch = 50


                                                       


Epoch 057 train_loss: 0.1080     val_loss 0.1648 train_acc 0.9587, val_acc 0.9459,     train_f1 0.9714, val_f1 0.9625     best_val_f1 = 0.9625


                                                       


Epoch 058 train_loss: 0.0987     val_loss 0.1938 train_acc 0.9645, val_acc 0.9334,     train_f1 0.9754, val_f1 0.9532     best_val_f1 = 0.9625


epoch:  58%|█████▊    | 58/100 [02:03<01:30,  2.14s/it]

EarlyStopping: current epoch 58 no improvement for metric, best metric = 0.9625 in epoch = 57


                                                       


Epoch 059 train_loss: 0.0979     val_loss 0.1564 train_acc 0.9662, val_acc 0.9433,     train_f1 0.9766, val_f1 0.9614     best_val_f1 = 0.9625


epoch:  59%|█████▉    | 59/100 [02:06<01:27,  2.14s/it]

EarlyStopping: current epoch 59 no improvement for metric, best metric = 0.9625 in epoch = 57


                                                       


Epoch 060 train_loss: 0.1042     val_loss 0.1692 train_acc 0.9641, val_acc 0.9429,     train_f1 0.9752, val_f1 0.9611     best_val_f1 = 0.9625


epoch:  60%|██████    | 60/100 [02:08<01:25,  2.15s/it]

EarlyStopping: current epoch 60 no improvement for metric, best metric = 0.9625 in epoch = 57


                                                       


Epoch 061 train_loss: 0.0927     val_loss 0.1520 train_acc 0.9663, val_acc 0.9493,     train_f1 0.9767, val_f1 0.9651     best_val_f1 = 0.9651


                                                       


Epoch 062 train_loss: 0.0987     val_loss 0.1718 train_acc 0.9655, val_acc 0.9446,     train_f1 0.9762, val_f1 0.9617     best_val_f1 = 0.9651


epoch:  62%|██████▏   | 62/100 [02:12<01:21,  2.15s/it]

EarlyStopping: current epoch 62 no improvement for metric, best metric = 0.9651 in epoch = 61


                                                       


Epoch 063 train_loss: 0.0951     val_loss 0.1394 train_acc 0.9654, val_acc 0.9519,     train_f1 0.9761, val_f1 0.9665     best_val_f1 = 0.9665


                                                       


Epoch 064 train_loss: 0.0883     val_loss 0.1607 train_acc 0.9682, val_acc 0.9468,     train_f1 0.9780, val_f1 0.9631     best_val_f1 = 0.9665


epoch:  64%|██████▍   | 64/100 [02:16<01:17,  2.14s/it]

EarlyStopping: current epoch 64 no improvement for metric, best metric = 0.9665 in epoch = 63


                                                       


Epoch 065 train_loss: 0.0920     val_loss 0.1482 train_acc 0.9661, val_acc 0.9493,     train_f1 0.9765, val_f1 0.9650     best_val_f1 = 0.9665


epoch:  65%|██████▌   | 65/100 [02:18<01:15,  2.14s/it]

EarlyStopping: current epoch 65 no improvement for metric, best metric = 0.9665 in epoch = 63


epoch:  65%|██████▌   | 65/100 [02:20<01:15,  2.16s/it]


KeyboardInterrupt: 