In [1]:
# Mount Google Drive at /content/drive so the files stored there are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Project root on the mounted Drive. Later cells build dataset and checkpoint paths by
# plain string concatenation with this prefix, hence the trailing slash.
path = "/content/drive/My Drive/Aspect-based Sentiment Analysis/"

In [3]:
# Install the third-party dependencies for this notebook.
# NOTE(review): versions are unpinned, so a fresh run may resolve different releases;
# vncorenlp is installed but not imported by any visible cell.
!pip install fairseq
!pip3 install vncorenlp
!pip3 install transformers
!pip install fastBPE
!pip install keras-radam

Collecting fairseq
[?25l  Downloading https://files.pythonhosted.org/packages/67/bf/de299e082e7af010d35162cb9a185dc6c17db71624590f2f379aeb2519ff/fairseq-0.9.0.tar.gz (306kB)
[K     |████████████████████████████████| 307kB 3.0MB/s 
Collecting sacrebleu
[?25l  Downloading https://files.pythonhosted.org/packages/a3/c4/8e948f601a4f9609e8b2b58f31966cb13cf17b940b82aa3e767f01c42c52/sacrebleu-1.4.14-py3-none-any.whl (64kB)
[K     |████████████████████████████████| 71kB 7.0MB/s 
Collecting portalocker
  Downloading https://files.pythonhosted.org/packages/89/a6/3814b7107e0788040870e8825eebf214d72166adf656ba7d4bf14759a06a/portalocker-2.0.0-py2.py3-none-any.whl
Building wheels for collected packages: fairseq
  Building wheel for fairseq (setup.py) ... [?25l[?25hdone
  Created wheel for fairseq: filename=fairseq-0.9.0-cp36-cp36m-linux_x86_64.whl size=2046433 sha256=682b8b745503e4086acda8fcc3f9add56c7ff76238caf490e82d299eaa8a8c2f
  Stored in directory: /root/.cache/pip/wheels/37/3e/1b/0fa30695

In [4]:
import pandas as pd, numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold
from transformers import *
import tokenizers

In [5]:
# Download the pretrained PhoBERT-base archive and unpack it into ./PhoBERT_base_transformers/.
!wget https://public.vinai.io/PhoBERT_base_transformers.tar.gz
!tar -xzvf PhoBERT_base_transformers.tar.gz

--2020-10-18 09:18:34--  https://public.vinai.io/PhoBERT_base_transformers.tar.gz
Resolving public.vinai.io (public.vinai.io)... 13.226.49.2, 13.226.49.111, 13.226.49.123, ...
Connecting to public.vinai.io (public.vinai.io)|13.226.49.2|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 322405979 (307M) [application/x-tar]
Saving to: ‘PhoBERT_base_transformers.tar.gz’


2020-10-18 09:18:37 (121 MB/s) - ‘PhoBERT_base_transformers.tar.gz’ saved [322405979/322405979]

PhoBERT_base_transformers/
PhoBERT_base_transformers/config.json
PhoBERT_base_transformers/bpe.codes
PhoBERT_base_transformers/model.bin
PhoBERT_base_transformers/dict.txt


In [6]:
# Directory of the unpacked PhoBERT files; used as the prefix for config.json and model.bin
# when the model is built.
phoBERT = 'PhoBERT_base_transformers'

In [7]:
import pickle
import pandas as pd
# Load the train / dev / test review tables of the VLSP2018 data from Drive.
# The next cell reads the `review` column of each table.
df = pd.read_csv(path+"VLSP2018/review_train.csv")
df_dev = pd.read_csv(path+'VLSP2018/review_dev.csv')
df_test = pd.read_csv(path+'VLSP2018/review_test.csv')

In [8]:
def _load_pickle(file_path):
  """Unpickle a single object from `file_path` and close the file afterwards."""
  # NOTE(security): pickle can execute arbitrary code while loading; only use this
  # on label files that come from a trusted source.
  with open(file_path, 'rb') as handle:
    return pickle.load(handle)

# Review texts for each split, as plain Python lists.
data_train = df.review.tolist()
data_dev = df_dev.review.tolist()
data_test = df_test.review.tolist()

# Matching label structures for each split, stored as pickles next to the CSVs.
label_train = _load_pickle(path+'VLSP2018/sentiment_train.pkl')
label_dev = _load_pickle(path+'VLSP2018/sentiment_dev.pkl')
label_test = _load_pickle(path+'VLSP2018/sentiment_test.pkl')

In [9]:
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary
# Minimal args-style holder passed to fairseq's fastBPE wrapper below.
class BPE():
  # Location of the BPE merge codes unpacked from the PhoBERT archive.
  bpe_codes = 'PhoBERT_base_transformers/bpe.codes'

# NOTE(review): presumably fastBPE reads `args.bpe_codes` to load the codes —
# confirm against the installed fairseq version.
args = BPE()
bpe = fastBPE(args)

In [10]:
# Build the subword vocabulary from PhoBERT's dict.txt; convert_lines uses it to map
# BPE subwords to integer ids.
vocab = Dictionary()
vocab.add_from_file('PhoBERT_base_transformers/dict.txt')

In [11]:
import numpy as np
from tqdm import tqdm
max_sequence_length = 256  # default number of token ids kept per text

def convert_lines(lines, vocab, bpe, max_len=None):
  '''
  Encode raw texts into fixed-length token-id and attention-mask matrices.

  Each text is wrapped as '<s> text </s>', split into subwords with
  `bpe.encode`, and mapped to integer ids with `vocab.encode_line`. Rows
  longer than the target length are truncated and forced to end with the
  end-of-sentence id; shorter rows are right-padded with the padding id.

  Args:
    lines: list of input texts (strings).
    vocab: dictionary used to encode subwords; must provide
      `encode_line(...)` whose result supports `.long().tolist()`.
    bpe: byte-pair encoder providing `encode(text)`.
    max_len: target row length; defaults to the module-level
      `max_sequence_length` when omitted.

  Returns:
    (outputs, mask_token): two int32 arrays of shape (len(lines), max_len).
    `mask_token` is 1 where `outputs` differs from the padding id, else 0.
  '''
  if max_len is None:
    max_len = max_sequence_length

  # Ids of the special tokens: eos (end of sentence) and padding.
  eos_id = 2
  pad_id = 1

  # Start from all-padding rows so that only the real tokens need to be written.
  outputs = np.full((len(lines), max_len), pad_id, dtype=np.int32)
  mask_token = np.zeros((len(lines), max_len), dtype=np.int32)

  for idx, row in tqdm(enumerate(lines), total=len(lines)):
    # Split into subwords with byte pair encoding (bpe), then map to ids.
    subwords = bpe.encode('<s> ' + row + ' </s>')
    input_ids = vocab.encode_line(subwords, append_eos=False, add_if_not_exist=False).long().tolist()
    # Truncate inputs longer than max_len, keeping eos as the final token.
    if len(input_ids) > max_len:
      input_ids = input_ids[:max_len]
      input_ids[-1] = eos_id
    outputs[idx, :len(input_ids)] = input_ids
    # The mask is derived for every row, including truncated ones; previously it
    # was only computed in the padding branch, so a truncated row either raised
    # or silently reused the mask of the preceding row.
    mask_token[idx, :] = outputs[idx, :] != pad_id
  return outputs, mask_token

In [12]:
# Encode every split into a (token-id matrix, attention-mask matrix) pair.
X_train, X_train_mask = convert_lines(data_train, vocab, bpe)
X_dev, X_dev_mask = convert_lines(data_dev, vocab, bpe)
X_test, X_test_mask = convert_lines(data_test, vocab, bpe)

100%|██████████| 2961/2961 [00:01<00:00, 1707.45it/s]
100%|██████████| 1290/1290 [00:00<00:00, 1692.30it/s]
100%|██████████| 500/500 [00:00<00:00, 756.52it/s]


In [13]:
# Convert the unpickled labels to float32 arrays, the targets for the sigmoid output layer.
# NOTE(review): presumably each array is (n_samples, 36) to match the Dense(36) head — TODO confirm.
y_train = np.array(label_train).astype('float32')
y_dev = np.array(label_dev).astype('float32')
y_test = np.array(label_test).astype('float32')

In [14]:
from tensorflow.keras.layers import Dense, Input, Flatten,SpatialDropout1D,Bidirectional, BatchNormalization
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, Concatenate, Dropout,GlobalMaxPool1D,Lambda,MaxPool1D
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Bidirectional,LSTM,GRU
from tensorflow.keras.optimizers import Adam
from keras_radam.training import RAdamOptimizer

In [15]:
# Number of output units; kept in one place so the config and the output layer agree.
NUM_LABELS = 36

# Load the PhoBERT configuration shipped with the downloaded archive.
config = RobertaConfig.from_pretrained(
  phoBERT+'/config.json',
  output_hidden_states=True,
  num_labels=NUM_LABELS
  )
# from_pt=True: the archive contains a PyTorch checkpoint that is converted to TF weights.
model_bert = TFRobertaModel.from_pretrained(phoBERT+'/model.bin', config=config, from_pt=True)

# The input width follows max_sequence_length (the row length produced by convert_lines)
# instead of a second hard-coded 256, so the two cannot drift apart.
input_ids = Input(shape=(max_sequence_length,), name='input_token', dtype='int32')
input_mask = Input(shape=(max_sequence_length,), name='mask_token', dtype='int32')

# First output of the encoder is the per-token hidden-state sequence; position 0 is the
# '<s>' token added by convert_lines and is used as the sentence representation.
pretrain = model_bert(input_ids, attention_mask=input_mask)[0]
cls_token = pretrain[:,0,:]
dropout_layer = Dropout(0.2)(cls_token)

# Feed-forward classification head on top of the sentence representation.
dense_layer = Dense(768, activation='relu')(dropout_layer)
dense_layer = Dense(512, activation='relu')(dense_layer)
dense_layer = Dense(256, activation='relu')(dense_layer)
# Independent sigmoid per output unit (multi-label prediction).
preds = Dense(NUM_LABELS, activation='sigmoid')(dense_layer)

model = Model([input_ids,input_mask], preds)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaModel were initialized from the PyTorch model.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [16]:
def f1(y_true, y_pred):
    """Soft (Dice-style) F1 over the whole batch.

    Both tensors are flattened and the score is computed directly on the
    predicted values, without thresholding them first.

    Args:
        y_true: ground-truth tensor.
        y_pred: prediction tensor with the same number of elements.

    Returns:
        Scalar tensor ``(2 * sum(y_true * y_pred) + eps) / (sum(y_true) + sum(y_pred) + eps)``.
    """
    y_true = K.flatten(y_true)
    y_pred = K.flatten(y_pred)
    overlap = K.sum(y_true * y_pred)
    # The smoothing term is added after doubling the overlap. Previously it was
    # inside the doubled term, so all-zero inputs produced 2.0 instead of 1.0.
    return (2 * overlap + K.epsilon()) / (K.sum(y_true) + K.sum(y_pred) + K.epsilon())

In [17]:
# Training hyperparameters: LR feeds the optimizer, EPOCHS and BATCH_SIZE feed model.fit.
LR = 3e-5
EPOCHS = 10
BATCH_SIZE = 16

In [18]:
import tensorflow_addons as tfa
# Alternative optimizer (Rectified Adam) and loss (sigmoid focal cross-entropy).
# NOTE(review): neither `optimizer` nor `loss` is referenced by the model.compile call
# in this notebook, which uses Adam and 'binary_crossentropy' instead.
optimizer = tfa.optimizers.RectifiedAdam(lr=LR)
loss = tfa.losses.SigmoidFocalCrossEntropy()

In [19]:
# Compile with binary cross-entropy on the sigmoid outputs, Adam at learning rate LR,
# and the custom soft f1 as the reported metric.
model.compile(loss='binary_crossentropy',
              optimizer=Adam(learning_rate=LR),
              metrics=[f1])

In [20]:
from tensorflow.keras.callbacks import ModelCheckpoint 
# Checkpoint callback configured to keep only the model with the highest validation f1.
# NOTE(review): callback_list is not passed to any model.fit call visible in this notebook,
# so as written this checkpoint is never produced.
checkpoint = ModelCheckpoint(path+'phoBERT_checkpoint.h5', monitor='val_f1', verbose=1, save_best_only=True, mode='max')
callback_list = [checkpoint]

In [21]:
# Train for EPOCHS epochs. Validation uses the dev split so the test split stays
# held out for the final evaluation instead of being monitored during training.
model.fit(
    [X_train, X_train_mask], y_train,
    epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1,
    validation_data=([X_dev, X_dev_mask], y_dev),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f971f1df7b8>

In [22]:
# Continue training the same model for another EPOCHS epochs. Validation uses the dev
# split so the test split stays held out for the final evaluation.
model.fit(
    [X_train, X_train_mask], y_train,
    epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1,
    validation_data=([X_dev, X_dev_mask], y_dev),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f971a124198>

In [23]:
# Continue training the same model for another EPOCHS epochs. Validation uses the dev
# split so the test split stays held out for the final evaluation.
model.fit(
    [X_train, X_train_mask], y_train,
    epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1,
    validation_data=([X_dev, X_dev_mask], y_dev),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f971a0e8cf8>

In [25]:
# Continue training the same model for another EPOCHS epochs. Validation uses the dev
# split so the test split stays held out for the final evaluation.
model.fit(
    [X_train, X_train_mask], y_train,
    epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1,
    validation_data=([X_dev, X_dev_mask], y_dev),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f958f026fd0>

In [24]:
  # Attempt to persist the full model to Drive.
  # NOTE(review): the recorded output of this cell ends in a TypeError, so this save
  # did not complete as written.
  model.save(path+'phoBERT_checkpoint1.model')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


TypeError: ignored

In [None]:
# Alternative full-model save through tf.keras.models.save_model.
# NOTE(review): this cell has no execution count, so it is unverified whether it succeeds.
tf.keras.models.save_model(model,path+'phoBERT_checkpoint')