In [1]:
# Mount Google Drive at /content/drive; the project path and data paths used
# later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import sys

# Put the project folder on the import path so that project-local modules
# stored there can be imported (presumably `utils`, `crf` and `model`).
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same entry to sys.path again.
PROJECT_DIR = '/content/drive/MyDrive/NER-MOCK'
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)

In [4]:
!pip install tensorflow-addons



In [5]:
from utils import *
import pickle
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from crf import CRF
from model import *

from tensorflow.keras.layers import Embedding, Bidirectional, Dense, LSTM, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras import backend as K

from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.model_selection import train_test_split
%matplotlib inline 

In [6]:
import random

# Seed every random number generator the notebook can draw from, not only
# TensorFlow's, so that a fresh run starts from the same state.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

### Data processing

In [7]:
def processed_data(path, name='pkl'):
    """
    Run the whole preprocessing pipeline on one dataset file.

    Argument:
    --path: location of the data file
    --name: file format, 'pkl' or 'txt'

    Return:
    --tokens, labels: the pair produced by word_to_sequences
    """
    # Read the file as [word, tag] entries and merge the tags
    merged = merge_tags(load_data_tags(path, name=name))
    # Cut sequences whose length is >= 256
    sequences = cutting_sequences(merged)
    # Turn the sequences into tokens and their labels
    tokens, labels = word_to_sequences(sequences)

    return tokens, labels

In [12]:
def convert_to_tensor(tokens, labels, vocabs, max_len=256):
    """
    Turn token and label sequences into arrays a model can be trained on.

    Argument:
    --tokens: tokens data
    --labels: tokens labels
    --vocabs: vocabulary handed to transform_x
    --max_len: length handed to transform_x and transform_y (default 256)

    Return:
    --X: tensor for data (result of transform_x(...).fit)
    --y: tensor for labels (result of transform_y(...).to_onehot)
    --target_names: tag values exposed by the label transformer
    """
    token_encoder = transform_x(max_len, vocabs)
    label_encoder = transform_y(max_len)

    encoded_tokens = token_encoder.fit(tokens)
    encoded_labels = label_encoder.to_onehot(labels)
    # Read tag_values only after to_onehot, in case it is populated while encoding.
    tag_names = label_encoder.tag_values
    return encoded_tokens, encoded_labels, tag_names

In [8]:
# Pickled train / validation splits inside the project's vlsp2018 data folder
vlsp2018_dir = '/content/drive/MyDrive/NER-MOCK/data/vlsp2018'
train_path = vlsp2018_dir + '/train_word.pkl'
val_path = vlsp2018_dir + '/val_word.pkl'

In [9]:
# Convert to tokens
train_tokens, train_labels = processed_data(train_path, name='pkl')
test_tokens, test_labels = processed_data(val_path, name='pkl')

In [10]:
# Read the vocabulary file, drop singleton axes and convert to a Python list.
# The DataFrame gets its own name so `vocabs` only ever refers to the list.
vocab_frame = pd.read_csv('/content/drive/MyDrive/NER-MOCK/vocabs.csv')
vocabs = vocab_frame.to_numpy().squeeze().tolist()

In [13]:
# Encode both splits with the same vocabulary and sequence length.
# Only the tag names returned for the training split are kept.
X_train, y_train, target_names = convert_to_tensor(
    tokens=train_tokens, labels=train_labels, vocabs=vocabs, max_len=256)
X_test, y_test, _ = convert_to_tensor(
    tokens=test_tokens, labels=test_labels, vocabs=vocabs, max_len=256)

In [14]:
# Sanity check: shapes of the train and test inputs and labels
tuple(array.shape for array in (X_train, y_train, X_test, y_test))

((9592, 256), (9592, 256, 6), (3262, 256), (3262, 256, 6))

#### Model

In [27]:
class MyCallback(tf.keras.callbacks.Callback):
    """Report per-tag metrics on the held-out split after every epoch.

    The evaluation arrays (`X_test`, `y_test`) and the tag names
    (`target_names`) are read from the notebook namespace when the epoch ends.

    Besides printing the classification report, the macro-averaged F1 is
    stored in the epoch `logs` under the key ``val_f1`` (unless that key is
    already present), so callbacks placed after this one in the callback list
    can monitor it.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate the model being trained and publish ``val_f1``.

        Arguments:
        --epoch: index of the epoch that just finished (unused)
        --logs: metric dict supplied by Keras; updated in place when given
        """
        # Use the model Keras attached to this callback rather than whatever
        # the global name `model` happens to be bound to.
        scores = self.model.predict(X_test)

        # Collapse the class axis to label ids and score every position at once.
        # NOTE(review): all positions are scored, presumably including padding — confirm.
        y_true = np.argmax(y_test, axis=-1).flatten()
        y_pred = np.argmax(scores, axis=-1).flatten()

        # Explicit label ids keep the report aligned with target_names even if
        # some tag is absent from both y_true and y_pred.
        label_ids = list(range(len(target_names)))
        report = classification_report(y_true, y_pred, labels=label_ids,
                                       target_names=target_names, digits=4)
        print(report)

        # Expose the score to later callbacks without overriding a value that
        # a compiled metric may already have written.
        if logs is not None and 'val_f1' not in logs:
            logs['val_f1'] = f1_score(y_true, y_pred, labels=label_ids, average='macro')
callbacks = MyCallback()

In [28]:
# Weights-only checkpoint that is overwritten only when the monitored value
# reaches a new maximum.
# NOTE(review): this relies on a `val_f1` entry being present in the epoch
# logs (from a compiled metric or an earlier callback) — confirm it is produced.
filepath = '/content/drive/MyDrive/NER-MOCK/crf_model_weights.h5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath,
    monitor='val_f1',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

In [29]:
# Training schedule
batch_size, num_epochs = 64, 30

# Model dimensions
embedding_dim = 300
n_tags = len(target_names)
# Two ids beyond the vocabulary — presumably padding and unknown words; TODO confirm.
vocab_size = 2 + len(vocabs)

In [30]:
# Configure the model builder, then build the Keras model
learning_rate = 8e-4
create_model = BiLSRM_CRF(embedding_dim, vocab_size, n_tags, lr=learning_rate)
model = create_model.build()

In [31]:
# Print the layer-by-layer architecture and parameter counts of the built model.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 300)         3869100   
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 400)         801600    
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 400)         0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, None, 400)         961600    
_________________________________________________________________
dropout_3 (Dropout)          (None, None, 400)         0         
_________________________________________________________________
dense_1 (Dense)              (None, None, 6)           2406      
_________________________________________________________________
crf_1 (CRF)                  (None, None, 6)          

In [33]:
# Train the model, validating on the held-out split after each epoch.
# Callback order matters: the reporting callback runs before the checkpoint.
hist = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=num_epochs,
    batch_size=batch_size,
    callbacks=[callbacks, model_checkpoint_callback],
)