In [1]:
import torch
from torch import nn
from torch.utils.data import Dataset

In [2]:
from KoBERT.kobert.utils import get_tokenizer
from KoBERT.kobert.pytorch_kobert import get_pytorch_kobert_model

In [3]:
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

In [4]:
import gluonnlp as nlp

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

from tqdm import tqdm, tqdm_notebook
import sys

In [7]:
import random
random.seed(1004)

In [8]:
num_class = 3

In [9]:
device = torch.device("cuda:0")

In [10]:
bertmodel, vocab = get_pytorch_kobert_model() 
tokenizer = get_tokenizer()

# gluonnlp.data.BERTSPTokenizer(path, vocab, num_best=0, alpha=1.0, lower=True, max_input_chars_per_word=200)
# path : Path to the pre-trained subword tokenization model.
# vocab : Vocabulary for the corpus.
# num_best : default 0 – A scalar for sampling subwords. If num_best = {0,1}, no sampling is performed. If num_best > 1, 
#            then samples from the num_best results. If num_best < 0, then assume that num_best is infinite and samples 
#            from the all hypothesis (lattice) using forward-filtering-and-backward-sampling algorithm.
# alpha : A scalar for a smoothing parameter. Inverse temperature for probability rescaling.
# lower :  default True) – Whether the text strips accents and convert to lower case. If you use the BERT pre-training model,
#          lower is set to False when using the cased model, otherwise it is set to True.
# max_input_dchars_per_word : default 200
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model
using cached model
using cached model


In [11]:
import wandb

In [12]:
# !wandb login

In [14]:
## Setting parameters
# batch_size = config.batch_size
# num_epochs = config.epochs
# learning_rate = config.learning_rate
# drop_out_rate = config.dropout
# test_ratio = config.test_ratio
max_len = 512

In [15]:
# 이름 설정
filename = 'processed_labelled_df'
column_name = 'text'
labels = '구체적 기재'
# graph_title = '핵심자원'
# image_name = '핵심자원'
best_model_name = '구체적 기재_best_model.pt'

In [16]:
class BERTDataset(Dataset):
    def __init__(self, dataset, bert_tokenizer, max_len, pad, pair, column_name=column_name, labels=labels):
        
        transform = nlp.data.BERTSentenceTransform(bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        # BERTSentenceTransform
        # https://nlp.gluon.ai/_modules/gluonnlp/data/transforms.html
        # r"""BERT style data transformation.
        # Parameters
        #    ----------
        #    tokenizer : BERTTokenizer.
        #        Tokenizer for the sentences.
        #    max_seq_length : int.
        #        Maximum sequence length of the sentences.
        #    vocab : Vocab
        #        The vocabulary which has cls_token and sep_token registered.
        #        If vocab.cls_token is not present, vocab.bos_token is used instead.
        #        If vocab.sep_token is not present, vocab.eos_token is used instead.
        #    pad : bool, default True
        #        Whether to pad the sentences to maximum length.
        #    pair : bool, default True
        #        Whether to transform sentences or sentence pairs.
        #    """
        # vocab 인자를 넣지 않는 이유는 알 수 없음
        # https://nlp.gluon.ai/_modules/gluonnlp/data/transforms.html

        sent_data = []
    
        for i in range(len(dataset)):
            sent_data.append([str(dataset.iloc[i][column_name]), dataset.iloc[i][labels]])
        # dataset의 text컬럼과 라벨컬럼을 리스트로 묶어서 sent_data에 더함
        # 예) sent_data.append(['품질, 기술을 기반으로 반도체 및 디스플레이 생산에 효율과 수율을..., A])
        # sent_data = [[text, label],[],[]...]
        

        self.sentences = [transform([i[0]]) for i in sent_data]
        # sent_data의 text를 transform 
        # >array([  2, 993,   3,   1,   1,   1,  
        # >1,   1,   1,   1,   1,   1,   1,   1,  
        # >1,   1,   1,   1,   1,   1,   1,   1,  
        # >1,   1,   1,   1,   1,   1,   1,   1,  
        # >1,   1,   1,   1,   1,   1,   1,   1,
        # >array(3, dtype=int32),
        # >array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
        # > 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
        # > 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
        # > 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        # > 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
        
        self.labels = [np.int32(i[1]) for i in sent_data]
        # >array([0, 1, 0, 0, 0], dtype=int32),
        # >array([0, 0, 1, 0, 0], dtype=int32),
        # >array([0, 0, 0, 1, 0], dtype=int32),
        
    def __getitem__(self, i):
        return self.sentences[i] + (self.labels[i],)

    def __len__(self):
        return len(self.labels)


class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=num_class,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids=token_ids, token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask.float().to(token_ids.device))

        if self.dr_rate:
            out = self.dropout(pooler)
        softmax = nn.Softmax(dim=1)
        output = softmax(self.classifier(out))
        # output = self.classifier(out)
        # return self.classifier(out)
        return output

In [17]:
def read_data(path='./', filename=filename, labels=labels):
    df = pd.read_csv(path + '/' + filename + '.csv', sep=',')
    df = df.dropna(axis=0)

    if num_class == 5:
        df[labels] = df[labels].replace({'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4})
        df[labels] = pd.get_dummies(df[labels]).values[:, ::-1].tolist()  # one hot encoding
    elif num_class == 2:
        df = df.drop(df[df[labels] == 'C'].index)
        df[labels] = df[labels].replace({'A': 1, 'B': 1, 'D': 0, 'E': 0})
        df = df.dropna(axis=0)
    elif num_class == 3:
        # df[labels] = df[labels].replace({'A': 0, 'B': 0, 'C': 1, 'D': 2, 'E': 2})
        # df['label'] = df['label'].replace({'A': 0, 'B': 1, 'C': 2})
        df[labels] = df[labels].replace({0:0, 1:1, 2:2})
        df[labels] = pd.get_dummies(df[labels]).values[:, ::-1].tolist()  # one hot encoding
    else:
        print('wrong input\n')
        sys.exit()
    return df

In [18]:
def one_hot_ce_loss(outputs, targets):
    criterion = nn.CrossEntropyLoss()
    # nn.CrossEntropyLoss() : 다중 분류에 사용되는 손실함수, nn.LogSoftmax와 nn.NLLLoss의 연산의 조합
    # http://www.gisdeveloper.co.kr/?p=8668 참고
    
    _, labels = torch.max(targets, dim=1)
    # values, indices = torch.max(input, dim=1)
    # values : 행에서 최대값
    # indices : 최대값의 인덱스
    # https://technical-support.tistory.com/94 참고
    
    return criterion(outputs, labels)

In [19]:
def model(data=None, num_epochs=None, batch_size=None, test_ratio=None, drop_out_rate=None, learning_rate=None, graph=None, savemode=False, 
          column_name=column_name, labels=labels, graph_title=None, image_name=None, best_model_name=best_model_name):
    
    # wandb 초기값 설정
    wandb.init(project='bax_KoBERT', entity='junoe',
               config={"learning_rate": 1e-5,
                        "dropout": 0.2,
                        "test_ratio": 0.2,
                        "batch_size": 4,
                        "epochs": 5,
                        "architecture": "KoBERT",
                        "dataset": "BMC 515"})

    config = wandb.config
    
    x_train, x_test, y_train, y_test = train_test_split(data[column_name],
                                                        data[labels],
                                                        test_size=config.test_ratio,
                                                        shuffle=True,
                                                        stratify=data[labels],
                                                        random_state=42,
                                                        )
    df_train = pd.DataFrame()
    df_train[column_name] = x_train
    df_train[labels] = y_train
    df_test = pd.DataFrame()
    df_test[column_name] = x_test
    df_test[labels] = y_test

    model = BERTClassifier(bertmodel, dr_rate=config.dropout).to(device)

    data_train = BERTDataset(df_train, tok, max_len, True, False)
    data_test = BERTDataset(df_test, tok, max_len, True, False)

    train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=config.batch_size, num_workers=4)
    test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=config.batch_size, num_workers=4)
    # data_train을 배치사이즈 만큼 나눠서(gpu에 할당) 모델에 데이터를 로드
    # len(data_train) = 403
    # len(traindataloader) = 202
    # num_workers에 대한 설명 https://jybaek.tistory.com/799 참고

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate)
    # The optimizer allows us to apply different hyperpameters for specific parameter groups. 
    # For example, we can apply weight decay to all parameters other than bias and layer normalization terms
    # 무슨 말인지 모르겠다...
    # https://huggingface.co/transformers/v3.3.1/training.html 참고

    loss_fn = nn.CrossEntropyLoss()
    t_total = len(train_dataloader) * config.epochs # 202 * 30
    warmup_step = int(t_total * 0.1)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)
    # Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, 
    # after a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
    # optimizer (Optimizer) – The optimizer for which to schedule the learning rate.
    # num_warmup_steps (int) – The number of steps for the warmup phase.
    # num_training_steps (int) – The total number of training steps.
    # https://huggingface.co/transformers/main_classes/optimizer_schedules.html 참고

    train_acc_list = []
    test_acc_list = []
    train_loss_list = []
    test_loss_list = []

    for e in range(config.epochs):
        train_acc = 0.
        test_acc = 0.
        train_loss = 0.
        test_loss = 0.
        train_predict_list = []
        train_actual_list = []
        test_predict_list = []
        test_actual_list = []

        model.train()
        # 훈련모드로 변경

        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(train_dataloader):
            # batch_id : 배치사이즈
            #             > 0, 1, 2...

            # token_ids : train_dataloader에서 텍스트의 임베딩값을 tensor형식으로 배치사이즈 만큼 가져옴
            #            >tensor([[   2, 4893,  517,  ...,    1,    1,    1],
            #            >[   2, 1289, 6064,  ...,    1,    1,    1]], dtype=torch.int32) ...
            # valid_length : 받아온 token_ids에서 패딩을 제외한 사이즈
            #                > tensor([24], dtype=torch.int32)
            #                > tensor([81], dtype=torch.int32)
            # segment_ids : >tensor([[0, 0, 0,  ..., 0, 0, 0],
            #                        [0, 0, 0,  ..., 0, 0, 0]], dtype=torch.int32)
            # label : >tensor([[0, 1, 0, 0, 0],
            #         >      [0, 0, 1, 0, 0]], dtype=torch.int32)
            #         >tensor([[0, 0, 0, 1, 0],
            #         >      [0, 0, 1, 0, 0]], dtype=torch.int32)

            optimizer.zero_grad()
            # optimizer.zero_grad() : https://algopoolja.tistory.com/55 참고

            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            valid_length = valid_length
            label = label.long().to(device)
            # gpu 연산을 사용할 부분 뒤에는 .to(device) 를 붙여 gpu를 사용

            out = model(token_ids, valid_length, segment_ids)
            # out : token_ids, valid_length, segment_ids를 계산해서 나온 라벨의 softmax 값
            # >tensor([[0.2530, 0.1866, 0.1529, 0.2185, 0.1890]], device='cuda:0',
            # >grad_fn=<SoftmaxBackward>)
            # >tensor([[0.3168, 0.1648, 0.1788, 0.2057, 0.1338]], device='cuda:0',
            # >grad_fn=<SoftmaxBackward>)

            if num_class == 5:
                loss = one_hot_ce_loss(out, label)
            elif num_class == 2:
                loss = loss_fn(out, label)
            elif num_class == 3:
                loss = one_hot_ce_loss(out, label)
            else:
                print('wrong loss function\n')
                sys.exit()

            train_loss += loss.item()
            # loss.item() : 손실값

            loss.backward()
            # loss.backward() : 오차(error)를 역전파하기 위해 사용
            # https://tutorials.pytorch.kr/beginner/blitz/neural_networks_tutorial.html 참고

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            # clip_grad_norm_ : gradient exploding을 방지하여 학습의 안정화를 도모하기 위해 사용하는 방법
            # torch.nn.Module.parameters() : 모듈의 파라메터들을 iterator로 반환 --> 무슨 말인지 모르겠음
            # https://easy-going-programming.tistory.com/11 참고

            optimizer.step()
            # optimizer.step() : 매개변수가 갱신

            scheduler.step()  
            # Update learning rate schedule

            # train_acc += calc_accuracy_2(out, label)
            trained_value, predicted_value, actual_value = calc_accuracy_3(out, label)
            train_predict_list.append(predicted_value)
            # predict_value_lsit에 예측한 결과 담기

            train_actual_list.append(actual_value)
            # actual_value_list에 실제 결과 담기

    #         label.tolist()
    #         actual_value_list

            if num_class == 5:
                train_acc += trained_value
            elif num_class == 2:
                train_acc += trained_value
            elif num_class == 3:
                train_acc += trained_value

            else:
                print('wrong calc accuracy\n')
                sys.exit()
        
        train_f1 = f1_score(train_actual_list, train_predict_list, average='weighted')
        train_acc = train_acc / (batch_id + 1)
        train_loss = train_loss / (batch_id + 1)
        train_acc_list.append(train_acc / (batch_id + 1))
        train_loss_list.append(train_loss / (batch_id + 1))
        wandb.log({"train_f1":train_f1, "train_acc":train_acc, "train_loss":train_loss})
        
        print("epoch {} train acc {} F1 score {}".format(e + 1, train_acc / (batch_id + 1), train_f1))

        model.eval()
        # 모델 평가모드

        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            valid_length = valid_length
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)

            if num_class == 5:
                loss = one_hot_ce_loss(out, label)
            elif num_class == 2:
                loss = loss_fn(out, label)
            elif num_class == 3:
                loss = one_hot_ce_loss(out, label)
            else:
                print('wrong loss function\n')
                sys.exit()

            test_loss += loss.item()

            test_value, predicted_value, actual_value = calc_accuracy_3(out, label)
            test_predict_list.append(predicted_value)
            # predict_value_lsit에 예측한 결과 담기

            test_actual_list.append(actual_value)
            # actual_value_list에 실제 결과 담기

            if num_class == 5:
                test_acc += test_value
            elif num_class == 2:
                test_acc += test_value
            elif num_class == 3:
                test_acc += test_value
            else:
                print('wrong calc accuracy\n')
                sys.exit()
                
        test_f1 = f1_score(test_actual_list, test_predict_list, average='weighted')
        test_acc = test_acc / (batch_id + 1)
        test_loss = test_loss / (batch_id + 1)
        test_acc_list.append(test_acc / (batch_id + 1))
        test_loss_list.append(test_loss / (batch_id + 1))
        wandb.log({"test_f1":test_f1, "test_acc":test_acc, "test_loss":test_loss})
                      
        print("epoch {} test acc {} F1 score {}".format(e + 1, test_acc / (batch_id + 1), test_f1))


    #     if graph:
    #         train_x_values = range(1, len(train_acc_list) + 1)
    #         train_y_values = train_acc_list
    #         test_x_values = range(1, len(test_acc_list) + 1)
    #         test_y_values = test_acc_list
    #         train_loss_x_values = range(1, len(train_loss_list) + 1)
    #         train_loss_y_values = train_loss_list
    #         test_loss_x_values = range(1, len(test_loss_list) + 1)
    #         test_loss_y_values = test_loss_list

    #         plt.figure(1)
    #         plt.plot(train_x_values, train_y_values, label='train accuracy')
    #         plt.plot(test_x_values, test_y_values, label='test accuracy')
    #         plt.xlabel('epoch', fontproperties=fprop, fontsize=15)
    #         plt.ylabel('accuracy', fontproperties=fprop, fontsize=15)
    #         plt.ylim(0.0, 1.1)
    #         plt.title(graph_title + ' accuracy', fontproperties=fprop, fontsize=20)
    #         # plt.title(file, fontsize=20)
    #         plt.legend()
    #         plt.savefig('./result/' + image_name + '_' + str(batch_size) + '_'+ str(drop_out_rate) + '_' + str(learning_rate) + '_' + str(max_acc) + '_accuracy.png')

    #         plt.figure(2)
    #         plt.plot(train_loss_x_values, train_loss_y_values, label='train loss')
    #         plt.plot(test_loss_x_values, test_loss_y_values, label='test loss')
    #         plt.xlabel('epoch', fontproperties=fprop, fontsize=15)
    #         plt.ylabel('loss', fontproperties=fprop, fontsize=15)
    #         plt.title(graph_title + ' loss', fontproperties=fprop, fontsize=20)
    #         # plt.title(file, fontsize=20)
    #         plt.legend()
    #         plt.savefig('./result/'  + image_name + '_' + str(batch_size) + '_'+ str(drop_out_rate) + '_' + str(learning_rate) + '_' + str(min_loss) + '_loss.png')

    if savemode:
        torch.save(model.state_dict(), './' + best_model_name)
                           
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [20]:
# 5 class accuracy
def calc_accuracy_1(X, Y):
    # e.g. X = tensor([[0.2003, 0.1883, 0.1572, 0.2594, 0.1948]], device='cuda:0',
    #                grad_fn=<SoftmaxBackward>)
    #      Y = tensor([[0, 0, 1, 0, 0]], device='cuda:0')
    
    max_vals, max_indices = torch.max(X, 1)
    # e.g. max_vals = 0.2594
    #      max_indices = 3
    
    predicted_value = max_indices[0].tolist()
    # F1 score 을 위해 예측값 담기
    
    _, max_Y = torch.max(Y, 1)
    actual_value = max_Y[0].tolist()
    # F1 score 을 위해 실제값 담기
    
    encoding = []

    for i in range(len(Y)):
        if int(max_indices[i]) == 0:
            encoding.append([1, 0, 0, 0, 0])
        elif int(max_indices[i]) == 1:
            encoding.append([0, 1, 0, 0, 0])
        elif int(max_indices[i]) == 2:
            encoding.append([0, 0, 1, 0, 0])
        elif int(max_indices[i]) == 3:
            encoding.append([0, 0, 0, 1, 0])
        elif int(max_indices[i]) == 4:
            encoding.append([0, 0, 0, 0, 1])
    # tensor에서 가장 높은 값을 인덱스를 원-핫 인코딩 형식으로 인코딩
    
    encoding = torch.tensor(encoding).to(device)
    # e.g. encoding : >tensor([[0, 0, 0, 1, 0]], device='cuda:0')
    
    cor_matrix = (encoding == Y).tolist()
    # Y값 즉, 실제 값과 같으면 리스트로 변환
    # e.g. [[True, True, False, False, True]]
    
    correct = 0.

    for j in range(len(Y)):
        if all(cor_matrix[j]):
            # 함수 all 은 iterable 내의 모든 요소가 참이거나 혹은 iterable 이 비어 있다면 True 를 반환하고, 
            # 그 외의 경우에는 False 를 반환
            # https://codepractice.tistory.com/87 참고
            # e.g. cor_matrix[0] : [True, True, False, False, True]
            # e.g. all(cor_matrix[0]) : False
            
            correct += 1.

    train_acc = correct / len(Y)
    # train_acc : 실제값과 예측값이 같으면 1 아니면 0을 반환

    return train_acc, predicted_value, actual_value

In [21]:
# 3 class accuracy
def calc_accuracy_3(X, Y):
    max_vals, max_indices = torch.max(X, 1)
    
    predicted_value = max_indices[0].tolist()
     # F1 score 을 위해 예측값 담기

    _, max_Y = torch.max(Y, 1)
    actual_value = max_Y[0].tolist()
     # F1 score 을 위해 실제값 담기
    
    encoding = []

    for i in range(len(Y)):
        if int(max_indices[i]) == 0:
            encoding.append([1, 0, 0])
        elif int(max_indices[i]) == 1:
            encoding.append([0, 1, 0])
        elif int(max_indices[i]) == 2:
            encoding.append([0, 0, 1])
    encoding = torch.tensor(encoding).to(device)
    # tensor에서 가장 높은 값을 인덱스를 원-핫 인코딩 형식으로 인코딩

    cor_matrix = (encoding == Y).tolist()
    correct = 0.

    for j in range(len(Y)):
        if all(cor_matrix[j]):
            # 함수 all 은 iterable 내의 모든 요소가 참이거나 혹은 iterable 이 비어 있다면 True 를 반환하고, 
            # 그 외의 경우에는 False 를 반환
            # https://codepractice.tistory.com/87 참고
            # e.g. cor_matrix[0] : [True, True, False, False, True]
            # e.g. all(cor_matrix[0]) : False
            correct += 1.

    train_acc = correct / len(Y)
    # train_acc : 실제값과 예측값이 같으면 1 아니면 0을 반환
    
    return train_acc, predicted_value, actual_value

In [22]:
# 2 class accuracy
def calc_accuracy(X, Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [23]:
if __name__ == '__main__':

    data = read_data(path='./data/')

    ## exectue model
    model(data=data, savemode=True,  column_name=column_name, labels=labels,  best_model_name=best_model_name )

[34m[1mwandb[0m: wandb version 0.12.2 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


epoch 1 train acc 0.004807239136582147 F1 score 0.48178183894917187
epoch 1 test acc 0.01849112426035503 F1 score 0.39903846153846156
epoch 2 train acc 0.006009048920727684 F1 score 0.5787884445651436
epoch 2 test acc 0.02551775147928994 F1 score 0.7029171808583572
epoch 3 train acc 0.0066217362616646245 F1 score 0.6385220024482959
epoch 3 test acc 0.025887573964497042 F1 score 0.7880673248320307
epoch 4 train acc 0.007163728909416533 F1 score 0.6592101993996007
epoch 4 test acc 0.02403846153846154 F1 score 0.7628402366863904
epoch 5 train acc 0.007399377886699972 F1 score 0.7171970696547785
epoch 5 test acc 0.026257396449704144 F1 score 0.801724429416737
