<a href="https://colab.research.google.com/github/LuigiSigillo/nlp2021-hw/blob/master/hw1/stud/nlp_hw1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports

In [11]:
from google.colab import drive
# general
import matplotlib.pyplot as plt
import numpy as np
import os
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
from typing import *
from torch.utils.tensorboard import SummaryWriter

# torch
import torch
import json
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import SGD

# NLTK
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')

# SKLEARN
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

drive.mount('/content/drive')
root_folder = '/content/drive/My Drive/NLP/nlp2021-hw1'
dataset_folder = os.path.join(root_folder,'data')

''' code to download and move the glove embeddings in the right folder '''
#! wget http://nlp.stanford.edu/data/wordvecs/glove.6B.zip
#! unzip -d data/glove.6B
#! cd '/content/drive/My Drive/NLP/nlp2021-hw1'
#!unzip '/content/drive/My Drive/NLP/nlp2021-hw1/glove.6B.zip'
# !mv glove.6B.200d.txt '/content/drive/My Drive/NLP/nlp2021-hw1/model'
!ls "{root_folder}/model/"

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
glove.6B.100d.txt  glove.6B.300d.txt  state_149.pt
glove.6B.200d.txt  glove.6B.50d.txt   state_79.pt


In [12]:
#@title Setup of parameters{ run: "auto" }
WE_LENGTH = "200" #@param [50,100,200,300]
METHOD = "avg" #@param ["avg","sum","weigthed_avg"]
USE_SEP = "True" #@param ["True", "False"]
WORDS_LIMIT = 140000 #@param {type:"slider", min:20000, max:200000, step:20000}


# First approach (word-level)


### Loading of Glove word embeddings

Added to the dictionary also the "UNK" and "SEP" words using a random vector

In [13]:
word_vectors = dict()
with open(root_folder+'/model/glove.6B.'+WE_LENGTH+'d.txt') as f:

    next(f)  # skip header

    for i, line in tqdm(enumerate(f), total=WORDS_LIMIT):

        if i == WORDS_LIMIT:
            break

        word, *vector = line.strip().split(' ')
        vector = torch.tensor([float(c) for c in vector])
        
        word_vectors[word] = vector
# word_vectors["UNK"] = np.mean(np.array(list(word_vectors.values()), dtype=np.float64), axis=0)
word_vectors["UNK"] = torch.tensor(np.random.random(int(WE_LENGTH)),dtype=torch.float)
if USE_SEP == "true":
    word_vectors["SEP"] = torch.tensor(np.random.random(int(WE_LENGTH)),dtype=torch.float)

HBox(children=(FloatProgress(value=0.0, max=140000.0), HTML(value='')))

Word-embedding-powered function $\phi$. Converts sentences to a vector by averaging the embeddings corresponding to each word in it

In [14]:
def phrase2vector(phrase: str, method: str, keyword: str) -> Optional[torch.Tensor]:
    phrases_word_vector = []
    if method == "avg":
        phrases_word_vector = [word_vectors[w] if w in word_vectors else word_vectors['UNK'] for w in phrase.split(' ')]
    elif method =="weigthed_avg":
        for w in phrase.split(' '):
            coeff = 1
            if w in word_vectors:
                if w == keyword:
                    coeff = 1.5
                phrases_word_vector.append(word_vectors[w]*coeff)
            else:
                phrases_word_vector.append(word_vectors['UNK'])

    if len(phrases_word_vector) == 0:
        return None

    phrases_word_vector = torch.stack(phrases_word_vector)  # tensor shape: (#words X #features)
    if method=="sum":
        return torch.sum(phrases_word_vector, dim=0)
    else:
        return torch.mean(phrases_word_vector, dim=0)

### Dataset class and interfaces

In [15]:
class SentencesDataset(torch.utils.data.Dataset):

    def __init__(self, dataset_path: str, phrase2vector):
        self.data_store = []
        self.init_structures(dataset_path, phrase2vector)

    def init_structures(self, dataset_path: str, phrase2vector) -> None:

        with open(dataset_path) as f:
            for json_string in f:
                single_json = json.loads(json_string)
                keyword = single_json['sentence1'][int(single_json['start1']):int(single_json['end1'])]
                sep = " " if USE_SEP == "false" else " SEP "
                sentence =  self.remove_stopwords(single_json['sentence1']) + sep + self.remove_stopwords(single_json['sentence2'])
                ground_t = np.float32(1) if single_json['label'] =='True' else np.float32(0)
                vector = phrase2vector(sentence,METHOD,keyword)
                if vector is None:
                    continue
                    
                self.data_store.append((vector,ground_t))

    def remove_stopwords(self,sent):
        stop_words = set(stopwords.words('english'))
        word_tokens = word_tokenize(sent)
        filtered_sentence = [w for w in word_tokens if not w in stop_words]
        filtered_sentence = []

        for w in word_tokens:
            if w not in stop_words:
                filtered_sentence.append(w)
        
        return " ".join(filtered_sentence)
    
    def __len__(self) -> int:
        return len(self.data_store)

    def __getitem__(self, idx: int) -> torch.Tensor:
        return self.data_store[idx]

In [16]:
class SentencesDataModule(nn.Module):

    def __init__(
        self, 
        data_train_path: str,
        data_dev_path: str,
        batch_size: int,
        collate_fn=None
    ) -> None:
        super().__init__()
        self.data_train_path = data_train_path
        self.data_dev_path = data_dev_path
        self.batch_size = batch_size
        self.collate_fn = collate_fn

        self.train_dataset = None
        self.validation_dataset = None

    def train_dataloader(self, *args, **kwargs) -> DataLoader:
        self.train_dataset = SentencesDataset(self.data_train_path, phrase2vector)
        return DataLoader(self.train_dataset, batch_size=self.batch_size)

    def val_dataloader(self, *args, **kwargs) -> Union[DataLoader, List[DataLoader]]:
        self.validation_dataset = SentencesDataset(self.data_dev_path, phrase2vector)
        return DataLoader(self.validation_dataset, batch_size=self.batch_size)


Loading and testing of the dataset

In [17]:
BATCH_SIZE = 32 #@param {type:"slider", min:8, max:64, step:8}
sentences_dm = SentencesDataModule(
    data_train_path=dataset_folder+'/train.jsonl',
    data_dev_path=dataset_folder+'/dev.jsonl',
    batch_size=BATCH_SIZE,
)
val_dataloader = sentences_dm.val_dataloader()
# print(word_vectors['test'])

for batch in val_dataloader:
    X, y = batch
    print(batch)
    print(f"batch X shape: {X.shape}")
    print(f"batch z shape: {y.shape}")
    break

[tensor([[ 0.1268,  0.1932,  0.2785,  ...,  0.1149,  0.0938,  0.1672],
        [ 0.0973,  0.1925,  0.2153,  ...,  0.0534,  0.0348,  0.0980],
        [ 0.2088,  0.1467,  0.1315,  ...,  0.1893, -0.1479,  0.2397],
        ...,
        [ 0.1161,  0.2735,  0.1302,  ...,  0.2060, -0.0544,  0.1701],
        [ 0.0340, -0.0131,  0.0375,  ...,  0.0378, -0.3205,  0.0692],
        [-0.0230,  0.0147,  0.0157,  ...,  0.0244, -0.3265,  0.1453]]), tensor([0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 0., 0., 0.,
        1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.])]
batch X shape: torch.Size([32, 200])
batch z shape: torch.Size([32])


### Training

Create the classifier class

In [18]:
class SentencesClassifier(nn.Module):

    def __init__(self, n_features: int, n_hidden: int):
        super().__init__()
        # classification function
        self.lin1 = torch.nn.Linear(n_features, n_hidden)
        self.output_layer = torch.nn.Linear(n_hidden, 1)
        
        # criterion
        self.loss_fn = torch.nn.BCELoss()
        

    def forward(self, x: torch.Tensor, y: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        # actual forward
        out = self.lin1(x)
        out = torch.relu(out)
        # compute logits (which are simply the out variable) and the actual probability distribution (pred, as it is the predicted distribution)
    
        logits = self.output_layer(out).squeeze(1)

        out = torch.sigmoid(logits)

        result = {'logits': logits, 'pred': out}

        # compute loss
        if y is not None:
            # torch optimizes its computation internally and takes as input the logits instead
            loss = self.loss(out, y)
            result['loss'] = loss

        return result

    def loss(self, pred, y):
        return self.loss_fn(pred, y)

Defining a trainer class to better separate our work.

In [22]:
class Trainer():
    def __init__(self, model, optimizer, device):

        self.device = device

        self.model = model
        self.optimizer = optimizer
        self.writer = SummaryWriter()
        # starts requires_grad for all layers
        self.model.train()  # we are using this model for training (some layers have different behaviours in train and eval mode)
        self.model.to(self.device)  # move model to GPU if available

    def train(self, train_dataset, eval_dataset, epochs=1):

        train_loss = 0.0
        for epoch in tqdm(range(epochs)):
            epoch_loss = 0.0
            len_train = 0
            epoch_val_loss = 0.0
            len_val_train = 0
            accuracy = 0
            f1 = 0
            self.model.train()
            # each element (sample) in train_dataset is a batch
            for step, sample in enumerate(train_dataset):
                # inputs in the batch
                inputs = sample[0].to(self.device)
                # outputs in the batch
                targets = sample[1].to(self.device)
                output_distribution = self.model(inputs)
                loss = self.model.loss(output_distribution['pred'], targets)  # compute loss
                # calculates the gradient and accumulates
                loss.backward()  # we backpropagate the loss
                # updates the parameters
                self.optimizer.step()
                self.optimizer.zero_grad()

                epoch_loss += loss.item()
                len_train += 1

            self.model.eval()
            for step, sample in enumerate(eval_dataset):
                # inputs in the batch
                inputs = sample[0].to(self.device)
                # outputs in the batch
                targets = sample[1].to(self.device)
                output_distribution = self.model(inputs)
                loss = self.model.loss(output_distribution['pred'], targets)  # compute loss    
                y_pred = (output_distribution['pred']>0.5).float().cpu()
                y_true = targets.cpu()
                accuracy += accuracy_score(y_true, y_pred)
                f1 += f1_score(y_true,y_pred)
                #accuracy += ((output_distribution['pred'] > 0.5) == targets).float().mean().item() #TODO
                epoch_val_loss += loss.item()
                len_val_train += 1
            
            avg_epoch_loss = epoch_loss / len_train
            avg_eval_loss = epoch_val_loss / len_val_train
            avg_accuracy_loss = accuracy / len_val_train
            avg_f1_score = f1/len_val_train
            self.writer.add_scalar("Train/loss", avg_epoch_loss, epoch)
            self.writer.add_scalar("Eval/loss", avg_eval_loss, epoch)
            self.writer.add_scalar("Eval/accuracy", avg_accuracy_loss, epoch)
            self.writer.add_scalar("Eval/F1_score", avg_f1_score, epoch)

            print('Epoch: {} avg loss = {:0.4f} avg_eval_loss = {:0.4f} avg_eval_acc = {:0.4f} avg_eval_f1 = {:0.4f}'.format(epoch, avg_epoch_loss, avg_eval_loss, avg_accuracy_loss, avg_f1_score))

            train_loss += avg_epoch_loss
            
        torch.save(self.model.state_dict(),os.path.join(root_folder +"/model", 'state_{}.pt'.format(epoch)))  # save the model state
        self.writer.flush()
        avg_epoch_loss = train_loss / epochs
        return avg_epoch_loss

Instanciating the classifier and tune some hyperparameters such as the number of hidden layers learning rate and number of epochs

In [25]:
sent_classifier = SentencesClassifier(
    n_features=int(WE_LENGTH), 
    n_hidden=150 #@param {type:"slider", min:50, max:300, step:50}
)
learning_rate = 0.0391 #@param {type:"slider", min:0.0001, max:0.1, step:0.001}
epochs = 200 #@param {type:"slider", min:50, max:300, step:10}

Defining the optimizer, we will use Stochastic gradient descent and instanciating the trainer to start the train.

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
optimizer = torch.optim.SGD(sent_classifier.parameters(), lr=learning_rate)

trainer = Trainer(sent_classifier, optimizer, device)

train_dataloader = sentences_dm.train_dataloader()
%load_ext tensorboard
%tensorboard --logdir=runs
avg_loss = trainer.train(train_dataloader,val_dataloader, epochs=epochs)
print(avg_loss)

Method to run a prediction on a personal test phrase and see the prob of beeing of the same context

In [27]:
def predict(model, phrase2vector, phrase: str, keyword: str):
    phrase_vector = phrase2vector(phrase,METHOD,keyword).to('cuda' if torch.cuda.is_available() else 'cpu')
    forward_out = model(phrase_vector.unsqueeze(0))  # add a dimension to create a one-item batch
    print(f"# Sentences: {phrase}")
    for i,prob in enumerate(forward_out["pred"]):
        print("\n {}".format( prob) )
predict(sent_classifier, phrase2vector, "The cat eats the mouse SEP Use the mouse to click on the button", "mouse")
predict(sent_classifier, phrase2vector, "The cat eats the mouse SEP The mouse escaped from the predator", "mouse")



# Sentences: The cat eats the mouse SEP Use the mouse to click on the button

 0.9029295444488525
# Sentences: The cat eats the mouse SEP The mouse escaped from the predator

 0.9874212741851807


#Second approach (sequence encoding with RNN)


Let's start by indexing each word in our vocabulary

In [6]:
word_index = dict()
vectors_store = []

# pad token, index = 0
vectors_store.append(torch.rand(int(WE_LENGTH)))

# unk token, index = 1
vectors_store.append(torch.rand(int(WE_LENGTH)))

for word, vector in word_vectors.items():
    word_index[word] = len(vectors_store)
    vectors_store.append(vector)

word_index = defaultdict(lambda: 1, word_index)  # default dict returns 1 (unk token) when unknown word
vectors_store = torch.stack(vectors_store)

In [7]:
vocabulary_size, hidden_features = vectors_store.shape
print(f"Vocabulary size: {vocabulary_size}")
print(f"Hidden features: {hidden_features}")
word_index['pezzo']  # let's see if the word_index gives to us the unk index (1)


Vocabulary size: 140003
Hidden features: 200


1

In [8]:
def review2indices(review: str,method: str,keyword: str) -> torch.Tensor:
    return torch.tensor([word_index[word] for word in review.split(' ')], dtype=torch.long)

In [9]:
def rnn_collate_fn(
    data_elements: List[Tuple[torch.Tensor, torch.Tensor]] # list of (x, y) pairs
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:

    X = [de[0] for de in data_elements]  # list of index tensors

    # to implement the many-to-one strategy
    X_lengths = torch.tensor([x.size(0) for x in X], dtype=torch.long)

    X = torch.nn.utils.rnn.pad_sequence(X, batch_first=True, padding_value=0)  #  shape (batch_size x max_seq_len)

    y = [de[1] for de in data_elements]
    y = torch.tensor(y)

    return X, X_lengths, y

In [10]:
class SentencesRNNDataModule(nn.Module):

    def __init__(
        self, 
        data_train_path: str,
        data_dev_path: str,
        batch_size: int,
        collate_fn=None
    ) -> None:
        super().__init__()
        self.data_train_path = data_train_path
        self.data_dev_path = data_dev_path
        self.batch_size = batch_size
        self.collate_fn = collate_fn

        self.train_dataset = None
        self.validation_dataset = None

    def train_dataloader(self, *args, **kwargs) -> DataLoader:
        self.train_dataset = SentencesDataset(self.data_train_path, review2indices)
        return DataLoader(self.train_dataset, batch_size=self.batch_size, collate_fn=rnn_collate_fn)

    def val_dataloader(self, *args, **kwargs) -> Union[DataLoader, List[DataLoader]]:
        self.validation_dataset = SentencesDataset(self.data_dev_path, review2indices)
        return DataLoader(self.validation_dataset, batch_size=self.batch_size, collate_fn=rnn_collate_fn)

In [11]:
with open(dataset_folder+'/train.jsonl') as f:
    for json_string in f:
        single_json = json.loads(json_string)
        keyword = single_json['sentence1'][int(single_json['start1']):int(single_json['end1'])]
        sep = " " if USE_SEP == "false" else " SEP "
        sentence =  single_json['sentence1'] + sep + single_json['sentence2']
        ground_t = np.float32(1) if single_json['label'] =='True' else np.float32(0)
        vector = review2indices(sentence,METHOD,keyword)
        if vector is None:
            continue
        else:
            print((sentence,vector, ground_t))
        break
sentences_rnn_dm = SentencesRNNDataModule(
    data_train_path=dataset_folder+'/train.jsonl',
    data_dev_path=dataset_folder+'/dev.jsonl',
    batch_size=32
)

('In that context of coordination and integration, Bolivia holds a key play in any process of infrastructure development. SEP A musical play on the same subject was also staged in Kathmandu for three days.', tensor([   1,   13, 4711,    4, 6410,    6,    1,    1, 2041,    8,  639,  283,
           7,  131,  547,    4, 2952,    1,    1,    1, 2149,  283,   14,    1,
         216, 1699,   16,   53, 4425,    7,    1,   11,   88,    1]), 0.0)


In [12]:

for batch in sentences_rnn_dm.val_dataloader():
    batch_X, batch_X_lengths, batch_y = batch
    print(batch_X)
    print(batch_X.shape)
    print(batch_y.shape)
    break

tensor([[    1,  8195,   954,  ...,  9580,     1,     3],
        [    1,  8195,   954,  ...,     0,     0,     0],
        [    1, 19372,   249,  ...,     0,     0,     0],
        ...,
        [    1,   360,  2604,  ...,     0,     0,     0],
        [    1,   750,  5518,  ...,     0,     0,     0],
        [    1,   750,  5518,  ...,     0,     0,     0]])
torch.Size([32, 44])
torch.Size([32])


In [13]:
class SentencesRecurrentClassifier(nn.Module):

    def __init__(
        self,
        vectors_store: torch.Tensor,
        n_hidden: int
    ) -> None:
        super().__init__()

        # embedding layer
        self.embedding = torch.nn.Embedding.from_pretrained(vectors_store)

        # recurrent layer
        self.rnn = torch.nn.LSTM(input_size=vectors_store.size(1), hidden_size=n_hidden, num_layers=1, batch_first=True)

        # classification head
        self.lin1 = torch.nn.Linear(n_hidden, n_hidden)
        self.lin2 = torch.nn.Linear(n_hidden, 1)

        # criterion
        self.loss_fn = torch.nn.BCELoss()
        self.device = 'cuda'

    def forward(
        self, 
        X: torch.Tensor, 
        X_length: torch.Tensor, 
        y: Optional[torch.Tensor] = None
    ) -> Dict[str, torch.Tensor]:

        # embedding words from indices
        embedding_out = self.embedding(X)

        # recurrent encoding
        recurrent_out = self.rnn(embedding_out)[0]
        
        # here we utilize the sequences length to retrieve the last token 
        # output for each sequence
        batch_size, seq_len, hidden_size = recurrent_out.shape

        # we flatten the recurrent output now I have a long sequence of batch x seq_len vectors 
        flattened_out = recurrent_out.reshape(-1, hidden_size)
        
        # and we use a simple trick to compute a tensor of the indices of the last token in each batch element
        last_word_relative_indices = X_length - 1
        # tensor of the start offsets of each element in the batch
        sequences_offsets = torch.arange(batch_size, device=self.device) * seq_len
        # e.g. (0, 5, 10, 15, ) + ( 3, 2, 1, 4 ) = ( 3, 7, 11, 19 )
        summary_vectors_indices = sequences_offsets + last_word_relative_indices

        # finaly we retrieve the vectors that should summarize every sentence.
        # (i.e. the last token in the sequence)
        summary_vectors = flattened_out[summary_vectors_indices]

        # now we can classify the reviews with a feedforward pass on the summary
        # vectors
        out = self.lin1(summary_vectors)
        out = torch.relu(out)
        # compute logits (which are simply the out variable) and the actual probability distribution (pred, as it is the predicted distribution)
        logits = self.lin2(out).squeeze(1)
        
        pred = torch.sigmoid(logits)

        result = {'logits': logits, 'pred': pred}

        # compute loss
        if y is not None:
            loss = self.loss(pred, y)
            result['loss'] = loss

        return result

    def loss(self, pred, y):
        return self.loss_fn(pred, y)

In [22]:
class Trainer():
    def __init__(self, model, optimizer, device):

        self.device = device

        self.model = model
        self.optimizer = optimizer

        # starts requires_grad for all layers
        self.model.train()  # we are using this model for training (some layers have different behaviours in train and eval mode)
        self.model.to(self.device)  # move model to GPU if available

    def train(self, train_dataset, eval_dataset, epochs=1):

        train_loss = 0.0
        for epoch in tqdm(range(epochs)):
            epoch_loss = 0.0
            len_train = 0
            epoch_val_loss = 0.0
            len_val_train = 0
            accuracy = 0
            self.model.train()
            # each element (sample) in train_dataset is a batch
            for step, sample in enumerate(train_dataset):
                # inputs in the batch
                inputs = sample[0].to(self.device)
                x_lenghts = sample[1].to(self.device)
                # outputs in the batch
                targets = sample[2].to(self.device)
                output_distribution = self.model(inputs, x_lenghts, targets)
                loss = output_distribution['loss']#self.model.loss(output_distribution['pred'], targets)  # compute loss

                # calculates the gradient and accumulates
                loss.backward()  # we backpropagate the loss
                # updates the parameters
                self.optimizer.step()
                self.optimizer.zero_grad()

                epoch_loss += loss.item()
                len_train += 1
                
            self.model.eval()
            for step, sample in enumerate(eval_dataset):
                # inputs in the batch
                inputs = sample[0].to(self.device)
                x_lenghts = sample[1].to(self.device)
                # outputs in the batch
                targets = sample[2].to(self.device)
                output_distribution = self.model(inputs, x_lenghts)
                loss = self.model.loss(output_distribution['pred'], targets)  # compute loss  
                #print(output_distribution['pred'], targets)  
                y_pred = (output_distribution['pred']>0.5).float().cpu()
                y_true = targets.cpu()
                accuracy += accuracy_score(y_true, y_pred)
                epoch_val_loss += loss.item()
                len_val_train += 1
            
            avg_epoch_loss = epoch_loss / len_train
            avg_eval_loss = epoch_val_loss / len_val_train
            avg_accuracy_loss = accuracy / len_val_train
            print('Epoch: {} avg loss = {:0.4f} eval loss = {:0.4f} ACC = {:0.4f}'.format(epoch, avg_epoch_loss, avg_eval_loss, avg_accuracy_loss))
            #print('Epoch: {} avg loss = {:0.4f}'.format(epoch, avg_epoch_loss))
            train_loss += avg_epoch_loss
            # torch.save(self.model.state_dict(),
            #            os.path.join(output_folder, 'state_{}.pt'.format(epoch)))  # save the model state
            
        avg_epoch_loss = train_loss / epochs
        return avg_epoch_loss

In [24]:
sentences_recurrent_classifier = SentencesRecurrentClassifier(vectors_store, n_hidden=128)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
optimizer = torch.optim.Adam(sentences_recurrent_classifier.parameters(), lr=0.0001)
trainer = Trainer(sentences_recurrent_classifier, optimizer, device)


train_dataloader = sentences_rnn_dm.train_dataloader()
avg_loss = trainer.train(train_dataloader,sentences_rnn_dm.val_dataloader(), epochs=50)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Epoch: 0 avg loss = 0.6941 eval loss = 0.6928 ACC = 0.5107
Epoch: 1 avg loss = 0.6935 eval loss = 0.6927 ACC = 0.5400
Epoch: 2 avg loss = 0.6931 eval loss = 0.6924 ACC = 0.5195
Epoch: 3 avg loss = 0.6926 eval loss = 0.6918 ACC = 0.5244
Epoch: 4 avg loss = 0.6917 eval loss = 0.6907 ACC = 0.5371
Epoch: 5 avg loss = 0.6901 eval loss = 0.6893 ACC = 0.5391
Epoch: 6 avg loss = 0.6878 eval loss = 0.6882 ACC = 0.5381
Epoch: 7 avg loss = 0.6851 eval loss = 0.6876 ACC = 0.5498
Epoch: 8 avg loss = 0.6819 eval loss = 0.6878 ACC = 0.5430
Epoch: 9 avg loss = 0.6779 eval loss = 0.6887 ACC = 0.5479
Epoch: 10 avg loss = 0.6728 eval loss = 0.6905 ACC = 0.5322
Epoch: 11 avg loss = 0.6665 eval loss = 0.6928 ACC = 0.5371
Epoch: 12 avg loss = 0.6584 eval loss = 0.6955 ACC = 0.5283
Epoch: 13 avg loss = 0.6487 eval loss = 0.7005 ACC = 0.5273
Epoch: 14 avg loss = 0.6376 eval loss = 0.7072 ACC = 0.5254
Epoch: 15 avg loss = 0.6250 eval loss = 0.7165 ACC = 0.5234
Epoch: 16 avg loss = 0.6102 eval loss = 0.7265 ACC