# Import

In [3]:
import pandas as pd
import os
import json
import importlib
import torch
from tqdm import tqdm

In [33]:
import src.preprocess.zalo_processor as zlp
importlib.reload(zlp)

<module 'src.preprocess.zalo_processor' from '/code/src/preprocess/zalo_processor.py'>

In [5]:
from src.feature_extraction import phobert_embedding as pho_emb
importlib.reload(pho_emb)

<module 'src.feature_extraction.phobert_embedding' from '/code/src/feature_extraction/phobert_embedding.py'>

# Set Variable

In [6]:
DATASET_PATH = 'Dataset'
max_seq_length = 128

# Load Data

In [7]:
# print(torch.cuda.device_count())
# print(torch.cuda.get_device_name(0))
# Cần có gpu nvidia
device = torch.device('cuda:0')
phoemb = pho_emb.PhoEmbedding()
phoemb.phobert.cuda()

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(64001, 768, padding_idx=1)
    (position_embeddings): Embedding(258, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [30]:

zltp = zlp.ZaloTextPreprocess()
zltp.load_data(DATASET_PATH)

In [34]:
pho_tpp = zlp.PhobertTextProcessor(phoemb.tokenizer) 

In [10]:
sample = zltp.train_df
tokens = pho_tpp.text_to_tokens(max_seq_length, sample.question[0], sents_b=sample.text[0])

In [11]:
tokens

['<s>',
 'Quang_Hải',
 'giành',
 'được',
 'chức',
 'vô_địch',
 'U21',
 'quốc_gia',
 'năm',
 'bao_nhiêu',
 'tuổi',
 '</s>',
 'Năm',
 '2013',
 ',',
 'Nguyễn_Quang_Hải',
 'giành',
 'chức',
 'vô_địch',
 'U21',
 'quốc_gia',
 '2013',
 'cùng',
 'với',
 'đội',
 'trẻ',
 'Hà_Nội',
 'T&T',
 'và',
 'tạo',
 'nên',
 'cú',
 'sốc',
 'khi',
 'trở_thành',
 'cầu_thủ',
 '16',
 'tuổi',
 'đầu_tiên',
 'giành',
 'được',
 'danh_hiệu',
 'vô_địch',
 'U21',
 'quốc_gia',
 '.',
 '</s>']

In [35]:
train_dataloader, val_dataloader = zltp.convert_to_dataloader(pho_tpp, max_seq_length)

100%|██████████| 18108/18108 [00:00<00:00, 1273823.57it/s]18108
0.2



# Model

In [36]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class SimpleModel(torch.nn.Module):
    def __init__(self, max_seq_length):
        # calls the init function of nn.Module.  Dont get confused by syntax,
        # just always do it in an nn.Module
        super(SimpleModel, self).__init__()
        self.conv1d = nn.Conv1d(768*4, 1, 1)
        self.out = nn.Linear(max_seq_length, 1)
    def forward(self, embs):
        x = self.conv1d(embs).squeeze()
        return torch.sigmoid(self.out(x))

model = SimpleModel(max_seq_length)
model.cuda()

SimpleModel(
  (conv1d): Conv1d(3072, 1, kernel_size=(1,), stride=(1,))
  (out): Linear(in_features=128, out_features=1, bias=True)
)

In [9]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
data = TensorDataset(input_ids, masks)
sampler = SequentialSampler(data)
dataloader = DataLoader(data, sampler=sampler, batch_size=2)

NameError: name 'input_ids' is not defined

In [28]:

with torch.no_grad():
    _, embs = phoemb.embvec(input_ids, masks)
    embs = embs.permute(0, 2, 1)
    label = model(embs)
    print(label)

tensor([[0.5007],
        [0.5089]], device='cuda:0')


In [37]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
criterion = nn.BCELoss()

In [38]:
def get_accuracy(y_true, y_prob):
    assert y_true.ndim == 1 and y_true.size() == y_prob.size()
    y_prob = y_prob > 0.5
    return (y_true == y_prob).sum().item()

In [39]:
# Usually you want to pass over the training data several times.
# 100 is much bigger than on a real data set, but real datasets have more than
# two instances.  Usually, somewhere between 5 and 30 epochs is reasonable.
default_weight = torch.tensor([0.9]*64).to(device)
for epoch in range(10):
    print('Epoch {}/{}'.format(epoch, 10 - 1))
    print('-' * 10)
    model.train()
    running_loss = 0.0
    counter = 0
    tk0 = tqdm(train_dataloader, total=int(len(train_dataloader)))
    for i, batch in enumerate(tk0):
        input_ids = batch[0].to(device)
        masks = batch[1].to(device)
        label = batch[2].to(device).float()
        _, embs = phoemb.embvec(input_ids, masks)
        embs = embs.permute(0, 2, 1)

        # Step 1. Remember that PyTorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()
        with torch.set_grad_enabled(True):
        
            # Step 3. Run our forward pass.
            pred = model(embs).squeeze()
            # Step 4. Compute the loss, gradients, and update the parameters by
            # calling optimizer.step()
            if label.size(0) == 64:
                criterion.weight = default_weight
            else:
                criterion.weight = torch.tensor([0.9]*label.size(0)).to(device)
            loss = criterion(pred, label)
            loss.backward()
            optimizer.step() 
        counter += 1 
        running_loss += loss.item() * embs.size(0)  
        tk0.set_postfix(loss=(running_loss / (counter * train_dataloader.batch_size)))
   
    model.eval()
    print("Eval")
    with torch.no_grad():
        ac = 0
        for batch in tqdm(val_dataloader):
            input_ids = batch[0].to(device)
            masks = batch[1].to(device)
            label = batch[2].to(device).float()
            _, embs = phoemb.embvec(input_ids, masks)
            embs = embs.permute(0, 2, 1)
            # Step 3. Run our forward pass.
            pred = model(embs).squeeze()
            ac += get_accuracy(pred, label)
        print('Accuracy:', ac/len(val_dataloader.dataset))

  0%|          | 0/906 [00:00<?, ?it/s]Epoch 0/9
----------
100%|██████████| 906/906 [01:33<00:00,  9.73it/s, loss=0.659]
  0%|          | 1/227 [00:00<00:23,  9.52it/s]Eval
100%|██████████| 227/227 [00:22<00:00,  9.98it/s]
  0%|          | 1/906 [00:00<01:47,  8.46it/s, loss=0.608]Accuracy: 0.0
Epoch 1/9
----------
 17%|█▋        | 158/906 [00:16<01:17,  9.65it/s, loss=0.645]


KeyboardInterrupt: 