In [1]:
!pip install fairseq
!pip install fastbpe
!pip install vncorenlp
!pip install transformers

Collecting fairseq
  Downloading fairseq-0.12.2.tar.gz (9.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m73.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l- \ | / - \ | done
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25l- \ | / - done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- \ done
Collecting hydra-core<1.1,>=1.0.7 (from fairseq)
  Downloading hydra_core-1.0.7-py3-none-any.whl.metadata (3.7 kB)
Collecting omegaconf<2.1 (from fairseq)
  Downloading omegaconf-2.0.6-py3-none-any.whl.metadata (3.0 kB)
Collecting sacrebleu>=1.4.12 (from fairseq)
  Downloading sacrebleu-2.4.0-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.4/57.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting bitarray (from fairseq)
  Downloading bitarray-2.9.2-cp310-cp

* fairseq: proj của Fb chuyên hỗ trợ các nghiên cứu và dự án liên quan đến model seq2seq
* fastBPE: package hỗ trợ tokenize word thành các subword
* vncorenlp: package NLP trong tiếng Việt

# Download pre-trained model PhoBERT

In [2]:
!wget https://public.vinai.io/PhoBERT_base_fairseq.tar.gz
!tar -xzvf PhoBERT_base_fairseq.tar.gz

--2024-03-07 08:12:51--  https://public.vinai.io/PhoBERT_base_fairseq.tar.gz
Resolving public.vinai.io (public.vinai.io)... 52.84.162.15, 52.84.162.17, 52.84.162.36, ...
Connecting to public.vinai.io (public.vinai.io)|52.84.162.15|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1243308020 (1.2G) [application/x-tar]
Saving to: 'PhoBERT_base_fairseq.tar.gz'


2024-03-07 08:13:44 (22.8 MB/s) - 'PhoBERT_base_fairseq.tar.gz' saved [1243308020/1243308020]

PhoBERT_base_fairseq/
PhoBERT_base_fairseq/bpe.codes
PhoBERT_base_fairseq/model.pt
PhoBERT_base_fairseq/dict.txt


# Load model

In [3]:
from fairseq.models.roberta import RobertaModel
phoBERT = RobertaModel.from_pretrained('PhoBERT_base_fairseq', checkpoint_file='model.pt')

2024-03-07 08:14:05.075089: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-07 08:14:05.075220: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-07 08:14:05.211760: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
1042301B [00:00, 28610031.50B/s]
456318B [00:00, 15492944.67B/s]


# BPE Tokenizer

In [4]:
from fairseq.data.encoders.fastbpe import fastBPE

class BPE():
    bpe_codes = 'PhoBERT_base_fairseq/bpe.codes'

args = BPE()
phoBERT.bpe = fastBPE(args)

# example:
tokens = phoBERT.encode('Tôn Ngộ Không đang đánh răng thì bị Đường Tăng gõ')
print('Tokens list: ', tokens)
print(len(tokens))  # BERT tự thêm các ký tự <s> và </s> đánh dấu start và end của câu 

phoBERT.decode(tokens)

Tokens list:  tensor([    0, 11623, 31433,   453,    52,   480,  2429,    54,    45,  2080,
         5922,  8121,     2])
13


Loading codes from PhoBERT_base_fairseq/bpe.codes ...
Read 64000 codes from the codes file.


'Tôn Ngộ Không đang đánh răng thì bị Đường Tăng gõ'

# Extract features từ RoBERTa

In [5]:
# Extract the last layer's features
last_layer_features = phoBERT.extract_features(tokens)
# assert last_layer_features.size() == torch.Size([1, 5, 1024])
print('token size: ', tokens.size())
print('size of last layer: ', last_layer_features.size())

# Extract all layer's features (layer 0 is the embedding layer)
all_layers = phoBERT.extract_features(tokens, return_all_hiddens=True)
print('number layer in all layers: ', len(all_layers))

# last_layer_features must equal to last layer in all_layers:
print('Last layer features: ', all_layers[-1] == last_layer_features)

token size:  torch.Size([13])
size of last layer:  torch.Size([1, 13, 768])
number layer in all layers:  13
Last layer features:  tensor([[[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         ...,
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False]]])


# Filling mask
- Download package VnCoreNLP để tokenize các sentences

In [6]:
!mkdir -p vncorenlp/models/wordsegmenter
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
!mv VnCoreNLP-1.1.1.jar vncorenlp/ 
!mv vi-vocab vncorenlp/models/wordsegmenter/
!mv wordsegmenter.rdr vncorenlp/models/wordsegmenter/

--2024-03-07 08:14:47--  https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27412575 (26M) [application/octet-stream]
Saving to: 'VnCoreNLP-1.1.1.jar'


2024-03-07 08:14:49 (209 MB/s) - 'VnCoreNLP-1.1.1.jar' saved [27412575/27412575]

--2024-03-07 08:14:50--  https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 526544 (514K) [application/octet-stream]
Sav

In [7]:
from vncorenlp import VnCoreNLP
rdrsegmenter = VnCoreNLP("vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')

text = 'Tôn Ngộ Không đang đánh răng thì bị Đường Tăng gõ'

# Tokenizer câu gốc và thay từ 'đánh' = <mask>
words = rdrsegmenter.tokenize(text)[0]
for i, token in enumerate(words):
    if token == 'đánh':
        words[i] = ' <mask>'
text_masked = ' '.join(words)
print(text_masked)

Tôn_Ngộ_Không đang  <mask> răng thì bị Đường Tăng gõ


In [8]:
# Timf 10 từ thích hợp với <mask>
import numpy as np

top_filled_words = phoBERT.fill_mask(text_masked, topk=10)
topk_probs = [item[1] for item in top_filled_words]

print('Total probability: ', topk_probs)
for i, output in enumerate(top_filled_words):
    print(output[0])

Total probability:  [0.7018280029296875, 0.07104982435703278, 0.06327974051237106, 0.043107498437166214, 0.008277255110442638, 0.00821719691157341, 0.007822426036000252, 0.006525953765958548, 0.006412186659872532, 0.005651082377880812]
Tôn_Ngộ_Không đang đánh răng thì bị Đường Tăng gõ
Tôn_Ngộ_Không đang nhổ răng thì bị Đường Tăng gõ
Tôn_Ngộ_Không đang chải răng thì bị Đường Tăng gõ
Tôn_Ngộ_Không đang xỉa răng thì bị Đường Tăng gõ
Tôn_Ngộ_Không đang nhe răng thì bị Đường Tăng gõ
Tôn_Ngộ_Không đang nghiến răng thì bị Đường Tăng gõ
Tôn_Ngộ_Không đang sửa răng thì bị Đường Tăng gõ
Tôn_Ngộ_Không đang niềng răng thì bị Đường Tăng gõ
Tôn_Ngộ_Không đang khám răng thì bị Đường Tăng gõ
Tôn_Ngộ_Không đang mài răng thì bị Đường Tăng gõ


# Classification

**Chuẩn bị dữ liệu**

In [9]:
!git clone https://github.com/duyvuleo/VNTC.git
!ls VNTC/Data/10Topics/Ver1.1

Cloning into 'VNTC'...
remote: Enumerating objects: 39, done.[K
remote: Total 39 (delta 0), reused 0 (delta 0), pack-reused 39[K
Unpacking objects: 100% (39/39), 160.90 MiB | 10.87 MiB/s, done.
Updating files: 100% (15/15), done.
Filtering content: 100% (2/2), 168.95 MiB | 61.07 MiB/s, done.
Stats.txt  Test_Full.rar  Train_Full.rar


In [10]:
%cd /kaggle/working/VNTC/Data/10Topics/Ver1.1
!apt install unrar

/kaggle/working/VNTC/Data/10Topics/Ver1.1



The following NEW packages will be installed:
  unrar
0 upgraded, 1 newly installed, 0 to remove and 30 not upgraded.
Need to get 113 kB of archives.
After this operation, 406 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/multiverse amd64 unrar amd64 1:5.6.6-2build1 [113 kB]
Fetched 113 kB in 0s (308 kB/s)

7[0;23r8[1ASelecting previously unselected package unrar.
(Reading database ... 113807 files and directories currently installed.)
Preparing to unpack .../unrar_1%3a5.6.6-2build1_amd64.deb ...
7[24;0f[42m[30mProgress: [  0%][49m[39m [..........................................................] 87[24;0f[42m[30mProgress: [ 20%][49m[39m [###########...............................................] 8Unpacking unrar (1:5.6.6-2build1) ...
7[24;0f[42m[30mProgress: [ 40%][49m[39m [#######################...................................] 8Setting up unrar (1:5.6.6-2bu

In [11]:
!unrar x Test_Full.rar /kaggle/working > /dev/null
!unrar x Train_Full.rar /kaggle/working > /dev/null
%cd /kaggle/working
!ls

/kaggle/working
PhoBERT_base_fairseq	     Test_Full	 VNTC		     vncorenlp
PhoBERT_base_fairseq.tar.gz  Train_Full  __notebook__.ipynb


**Đọc dữ liệu từ file .txt**

In [12]:
import os
from tqdm import tqdm

train_path = '/kaggle/working/Train_Full'
test_path = '/kaggle/working/Test_Full'

def read_txt(path):
    with open(path, 'r', encoding='utf-16') as f:
        data = f.read()
    return data

def make_data(root_path):
    texts = []
    labels = []
    
    for root, dirs, files in tqdm(os.walk(root_path)):
        for file_name in files:
            try:
                file_path = os.path.join(root, file_name)
                content = read_txt(file_path)
                
                # Tokenize word
                content = rdrsegmenter.tokenize(content)
                content = " ".join([' '.join(x) for x in content])
                label = root.split(os.path.sep)[-1] 
                texts.append(content)
                labels.append(label)
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")
    
    return texts, labels

text_train, label_train = make_data(train_path)
text_test, label_test = make_data(test_path) 

11it [06:55, 37.79s/it]
11it [10:40, 58.22s/it]


In [13]:
print(text_train[0], label_train[0])
print(len(text_train), len(text_test))

“ Má Blanche ” " Má Blanche " với trẻ mồ_côi Haiti Khi ôm_ấp một đứa trẻ tàn_tật trong một cô_nhi_viện mà cô thành_lập ở Haiti , Susie_Krabacher luôn nhớ lại cuộc_sống ngập_ngụa ma_tuý , nghề làm người_mẫu cho tạp_chí khiêu_dâm Playboy và nỗi đau của một tuổi_thơ bị lạm_dụng . “ Đó là một quãng thời_gian sống khác . Đây mới là việc tôi thật_sự muốn làm ” – Krabacher nói . 42 tuổi , tóc vàng , thân_hình “ bốc_lửa ” , nổi_bật ở một đất_nước đa_số là người da đen , Krabacher với biệt_danh là “ Má Blanche ” nói_chuyện thoải_mái về cuộc_đời mình . Chào_đời ở Alabama , Krabacher bị một người_thân lạm_dụng thân_xác lúc còn bé , sau đó được nhận làm con_nuôi đến lúc 12 tuổi . Cô sống tự_lập lúc 16 tuổi , cố_gắng kiếm sống ở nhà_hàng và làm tiếp_tân văn_phòng . Một năm sau đó , một người bạn gửi bức ảnh của Krabacher cho tạp_chí Playboy . “ Tôi đã trở_thành người chụp ảnh bìa cho Playboy vào lúc 20 tuổi ( năm 1983 ) ” – Krabacher nhớ lại . Năm năm sau , Krabacher nhận thấy kiểu sống gấp của mìn

**Encode các labels**

In [14]:
from sklearn.preprocessing import LabelEncoder

lb_encoder = LabelEncoder()
label_train_encoded = lb_encoder.fit_transform(label_train)
label_test_encoded = lb_encoder.transform(label_test)

print(lb_encoder.classes_)

['Chinh tri Xa hoi' 'Doi song' 'Khoa hoc' 'Kinh doanh' 'Phap luat'
 'Suc khoe' 'The gioi' 'The thao' 'Van hoa' 'Vi tinh']


In [15]:
# Chia tập train thành tập train và valid
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(text_train, label_train_encoded, test_size=0.1, stratify=label_train_encoded)

**Tokenizer các câu và padding về cùng độ dài**

In [16]:
import argparse
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary

parser = argparse.ArgumentParser()
parser.add_argument('--bpe-codes',
                    default="/kaggle/working/PhoBERT_base_fairseq/bpe.codes",
                    required=False,
                    type=str,
                    help='path to fastBPE BPE'
)
args, unknown = parser.parse_known_args()
bpe = fastBPE(args)


# Load dictionary
vocab = Dictionary()
vocab.add_from_file("/kaggle/working/PhoBERT_base_fairseq/dict.txt")

Loading codes from /kaggle/working/PhoBERT_base_fairseq/bpe.codes ...
Read 64000 codes from the codes file.


In [17]:
# max_sequence_length = 256

# def convert_lines(lines, vocab, bpe):
#     '''
#       lines: list các văn bản input
#       vocab: từ điển dùng để encoding subwords
#     '''
#     # Khởi tạo ma trận ouput
#     outputs = np.zeros((len(lines), max_sequence_length))
#     cls_id = 0
#     eos_id = 2
#     pad_id = 1
    
#     for ids, row in tqdm(enumerate(lines), total=len(lines)):
#         # Mã hóa subwords theo bpe
#         subwords = bpe.encode('<s>' + row + ' </s>')
#         input_ids = vocab.encode_line(subwords, append_eos=False, add_if_not_exist=False).long().tolist()
        
#         # Cắt input nếu độ dài vượt quá max_sequence_length
#         if len(input_ids) > max_sequence_length:
#             input_ids = input_ids[:max_sequence_length]
#             input_ids[-1] = eos_id
#         else:
#             input_ids = input_ids + [pad_id, ] * (max_sequence_length - len(input_ids))
        
#         outputs[ids, :] = np.array(input_ids)
#     return outputs

In [18]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch

MAX_LEN = 256

def convert_sents_ids(sentences):
    ids = []
    for word in tqdm(sentences):
        subwords = '<s> ' + bpe.encode(word) + ' </s>'
        encoded_sentence = vocab.encode_line(subwords, append_eos=True, add_if_not_exist=False).long().tolist()
        ids.append(encoded_sentence)
    ids = pad_sequences(ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
    return torch.tensor(ids)

X_train_ids = convert_sents_ids(X_train)
X_val_ids = convert_sents_ids(X_val)
X_test_ids = convert_sents_ids(text_test)

100%|██████████| 30383/30383 [02:20<00:00, 215.79it/s]
100%|██████████| 3376/3376 [00:15<00:00, 211.24it/s]
100%|██████████| 50373/50373 [04:06<00:00, 203.95it/s]


In [19]:
print(len(X_train[0]))
print(len(X_train_ids[0]))
print(X_train_ids[0])

1910
256
tensor([    0, 31553, 10157,   227, 17308,  5841,   976, 44495,    35,     9,
         1148, 12116,     4,  7725,   120,   298,   136,   403,   197,  1061,
            5,  3478,     9,  1148,   131,  2784,   185,   170,    40,    25,
          185,   280, 12901,    80,    95,   228,     4,    12,    26,    37,
          525,    18,    30,   188,    32, 17922,   976,     5,    22,  1584,
           72,    84,   204,   283,    32,   667,   170,    11,    36,     5,
         2820,     4,   523,  8417, 56756,  1175,   544,    38,  2388,    24,
         1269,   133,   228,    77,    21,    18,  7625,   403, 18295,    48,
           22,     4,   286,    16,  1148, 12116,    34,   835, 14060,  1992,
           15,  7597,     5,   157,    13,    55,     4,   525, 10157,   170,
           40,    12,    43,  2402,    35,  1148,    83,   644,    50,   474,
           48,   246,    15, 10673,     5,    22,   319,   311,    36,    26,
          283,   170,    75,  7895,   781,   824,     4

**Tạo dataloader từ text và labels**

In [20]:
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

def make_data_loader(ids, labels, BATCH_SIZE=32):
    labels = torch.tensor(labels)
    dataset = TensorDataset(ids, labels)
#     sampler = SequentialSampler(data)
    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True) 
    return dataloader


train_dataloader = make_data_loader(X_train_ids, y_train)
val_dataloader = make_data_loader(X_val_ids, y_val)
test_dataloader = make_data_loader(X_test_ids, label_test_encoded)

# Load model PhoBERT

In [21]:
from transformers import RobertaForSequenceClassification, RobertaConfig

NUM_LABELS = len(lb_encoder.classes_)
from fairseq.models.roberta import RobertaModel
pho_bert = RobertaModel.from_pretrained('PhoBERT_base_fairseq', checkpoint_file='model.pt')

pho_bert.register_classification_head('new_task', num_classes=NUM_LABELS)

class BPE():
    bpe_codes = 'PhoBERT_base_fairseq/bpe.codes'

args = BPE()
pho_bert.bpe = fastBPE(args)

pho_bert.cuda()
print('Done')

Loading codes from PhoBERT_base_fairseq/bpe.codes ...
Read 64000 codes from the codes file.


Done


In [22]:
from sklearn.metrics import accuracy_score, f1_score

def evaluate(logits, targets):
    """
    Đánh giá model sử dụng accuracy và f1 scores.
    Args:
        logits (B,C): torch.LongTensor. giá trị predicted logit cho class output.
        targets (B): torch.LongTensor. actual target indices.
    Returns:
        acc (float): the accuracy score
        f1 (float): the f1 score
    """
    # Tính accuracy score và f1_score
    logits = logits.detach().cpu().numpy()    
    y_pred = np.argmax(logits, axis = 1)
    targets = targets.detach().cpu().numpy()
    f1 = f1_score(targets, y_pred, average='weighted')
    acc = accuracy_score(targets, y_pred)
    return acc, f1

In [23]:
from transformers import AdamW
criteria = torch.nn.NLLLoss()

device = 'cuda'
epochs = 5

param_optimizer = list(pho_bert.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay': 0},
]

optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5, correct_bias=False)

for epoch in range(epochs):
    print(f'------------- Epochs {epoch + 1} -----------------')
    print('Trainning ...')
    pho_bert.train()
    sum_loss = 0
    sum_acc = 0
    sum_f1 = 0
    nb_train_steps = 0
    
    for i, (x_batch, y_batch) in tqdm(enumerate(train_dataloader)):
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        optimizer.zero_grad()
        
        y_preds = pho_bert.predict('new_task', x_batch)
        logits = torch.exp(y_preds)
        acc, f1 = evaluate(logits, y_batch)
        loss = criteria(y_preds, y_batch)
        loss.backward()
        optimizer.step()
        
        loss_val = loss.item()
        sum_loss += loss_val
        sum_acc += acc 
        sum_f1 += f1
        nb_train_steps += 1
    
    avg_train_loss = sum_loss / len(train_dataloader)
    
    print(" Accuracy: {0:.4f}".format(sum_acc/nb_train_steps))
    print(" F1 score: {0:.4f}".format(sum_f1/nb_train_steps))
    print(" Average training loss: {0:.4f}".format(avg_train_loss))
    
    print('Running Validation ...')
    
    pho_bert.eval()
    accs = []
    f1s = []
    with torch.no_grad():
        for x_batch, y_batch in tqdm(val_dataloader):
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)
            
            outputs = pho_bert.predict('new_task', x_batch)
            logits = torch.exp(outputs)
            acc, f1 = evaluate(logits, y_batch)
            accs.append(acc)
            f1s.append(f1)
    mean_acc = np.mean(accs)
    mean_f1 = np.mean(f1s) 
     
    print(" Accuracy: {0:.4f}".format(mean_acc))  
    print(" F1 score: {0:.4f}".format(mean_f1)) 



------------- Epochs 1 -----------------
Trainning ...


950it [12:02,  1.32it/s]


 Accuracy: 0.9001
 F1 score: 0.8973
 Average training loss: 0.3425
Running Validation ...


100%|██████████| 106/106 [00:24<00:00,  4.32it/s]


 Accuracy: 0.9233
 F1 score: 0.9215
------------- Epochs 2 -----------------
Trainning ...


950it [12:02,  1.31it/s]


 Accuracy: 0.9508
 F1 score: 0.9507
 Average training loss: 0.1535
Running Validation ...


100%|██████████| 106/106 [00:24<00:00,  4.32it/s]


 Accuracy: 0.9272
 F1 score: 0.9266
------------- Epochs 3 -----------------
Trainning ...


950it [12:03,  1.31it/s]


 Accuracy: 0.9690
 F1 score: 0.9690
 Average training loss: 0.1007
Running Validation ...


100%|██████████| 106/106 [00:24<00:00,  4.32it/s]


 Accuracy: 0.9337
 F1 score: 0.9338
------------- Epochs 4 -----------------
Trainning ...


950it [12:03,  1.31it/s]


 Accuracy: 0.9804
 F1 score: 0.9804
 Average training loss: 0.0663
Running Validation ...


100%|██████████| 106/106 [00:24<00:00,  4.33it/s]


 Accuracy: 0.9287
 F1 score: 0.9285
------------- Epochs 5 -----------------
Trainning ...


950it [12:03,  1.31it/s]


 Accuracy: 0.9866
 F1 score: 0.9866
 Average training loss: 0.0463
Running Validation ...


100%|██████████| 106/106 [00:24<00:00,  4.32it/s]

 Accuracy: 0.9269
 F1 score: 0.9268





In [24]:
from sklearn.metrics import classification_report

def test(dataloader): 
    pho_bert.eval()
    loss, acc, f1 = 0, 0, 0
    predicts = []
    real_values = []
    
    with torch.no_grad():
        for x_batch, y_batch in tqdm(dataloader):
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)
            outputs = pho_bert.predict('new_task', x_batch)
            logits = torch.exp(outputs)
            acc, f1 = evaluate(logits, y_batch)
            accs.append(acc)
            f1s.append(f1)
            
            logits = logits.detach().cpu().numpy()    
            predicts.append(np.argmax(logits, axis = 1))
            real_values.append(y_batch.detach().cpu().numpy())
            
    predicts = np.concatenate(predicts)
    real_values = np.concatenate(real_values)

    print('\n', classification_report(real_values, predicts))
    
test(test_dataloader)

100%|██████████| 1575/1575 [06:04<00:00,  4.32it/s]


               precision    recall  f1-score   support

           0       0.87      0.86      0.86      7567
           1       0.76      0.42      0.54      2036
           2       0.78      0.83      0.80      2096
           3       0.93      0.88      0.90      5276
           4       0.87      0.94      0.90      3788
           5       0.92      0.94      0.93      5417
           6       0.93      0.94      0.93      6716
           7       0.98      0.99      0.98      6667
           8       0.90      0.97      0.93      6250
           9       0.93      0.96      0.94      4560

    accuracy                           0.91     50373
   macro avg       0.89      0.87      0.87     50373
weighted avg       0.90      0.91      0.90     50373




