In [1]:
# Mount Google Drive at /content/drive so the project files stored there are
# reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# NOTE(review): this silences every warning raised through the `warnings`
# module for the rest of the session, not just a specific noisy one.
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Add the project folder to the import path; presumably this is where the
# local modules imported later (utils, crf, model, val) live — confirm.
import sys
sys.path.append('/content/drive/MyDrive/NER-MOCK')

In [4]:
# Work from the project folder: the relative paths used later in this notebook
# (data/vlsp2018/..., vocabs.csv, the weights file, the CSV exports) resolve against it.
%cd /content/drive/MyDrive/NER-MOCK

/content/drive/MyDrive/NER-MOCK


In [5]:
# !pip install pyvi
# !pip install transformers
!pip install tensorflow_addons



In [93]:
from utils import *
import functools
import pickle
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from crf import CRF
from model import *
from val import *

from tensorflow.keras.layers import Embedding, Bidirectional, Dense, LSTM, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras import backend as K

from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.model_selection import train_test_split
%matplotlib inline 

In [7]:
# Seed TensorFlow's global random generator.
# NOTE(review): NumPy and Python's `random` are not seeded in this cell.
tf.random.set_seed(42)

### Data processing

In [8]:
def processed_data(path, name='pkl'):
    """
    Run the whole preprocessing pipeline on one dataset file.

    Argument:
    --path: location of the dataset file
    --name: format flag forwarded to the loader ('pkl' or 'txt')

    Return:
    --tokens: first value produced by word_to_sequences
    --labels: second value produced by word_to_sequences
    """
    # Load [word, tag] data, merge the tags, then cut the sequences
    # (presumably to the 256-token limit used elsewhere — see cutting_sequences).
    sequences = cutting_sequences(merge_tags(load_data_tags(path, name=name)))

    # Separate the prepared sequences into tokens and their labels
    tokens, labels = word_to_sequences(sequences)
    return tokens, labels

In [47]:
def convert_to_tensor(tokens, labels, vocabs, max_len=256):
    """
    Encode token sequences and their tag sequences for the model.

    Argument:
    --tokens: token sequences to encode
    --labels: tag sequences that go with `tokens`
    --vocabs: vocabulary handed to transform_x
    --max_len: length handed to both transformers (default 256)

    Return:
    --X: result of transform_x(max_len, vocabs).fit(tokens)
    --y: result of transform_y(max_len).to_onehot(labels)
    --target_names: the label transformer's tag_values
    --idx2tag: the label transformer's idx2tag
    """
    token_encoder = transform_x(max_len, vocabs)
    label_encoder = transform_y(max_len)

    encoded_tokens = token_encoder.fit(tokens)
    onehot_labels = label_encoder.to_onehot(labels)

    # The tag metadata is read only after to_onehot has run, as before.
    return (
        encoded_tokens,
        onehot_labels,
        label_encoder.tag_values,
        label_encoder.idx2tag,
    )

In [48]:
# NOTE(review): the split is read from val_word.pkl but the variables are named test_*.
val_path = 'data/vlsp2018/val_word.pkl'
# Convert to tokens
test_tokens, test_labels = processed_data(val_path, name='pkl')

# Vocabulary list read from vocabs.csv.
# squeeze() presumably flattens a single-column frame into a flat list —
# TODO confirm vocabs.csv has exactly one column.
df = pd.read_csv('vocabs.csv')
vocabs = df.values.squeeze().tolist()

# target_names and idx2tag become notebook globals here; analysis_csv reads idx2tag.
X_test, y_test, target_names, idx2tag = convert_to_tensor(test_tokens, test_labels, vocabs, max_len=256)
X_test.shape, y_test.shape

((3262, 256), (3262, 256, 6))

#### Model

In [15]:
# Weights file passed to model.load_weights (relative to the working directory).
filepath = 'crf_model_weights.h5'

In [16]:
# Model/configuration constants.
# NOTE(review): batch_size and num_epochs are not referenced by the cells shown here.
batch_size = 64
num_epochs = 30
embedding_dim = 300
# presumably +2 reserves ids for padding and unknown tokens — confirm against transform_x
vocab_size = len(vocabs) + 2
# One output class per tag name (6 in the recorded run, matching y_test's last dimension).
n_tags = len(target_names)

In [17]:
# Build the network and print its layer summary.
# NOTE(review): the class name is spelled `BiLSRM_CRF` (presumably meant
# "BiLSTM"); it must match the definition in the star-imported project modules.
create_model = BiLSRM_CRF(embedding_dim, vocab_size, n_tags, lr=8e-4)
model = create_model.build()
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 300)         3869100   
_________________________________________________________________
bidirectional (Bidirectional (None, None, 400)         801600    
_________________________________________________________________
dropout (Dropout)            (None, None, 400)         0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 400)         961600    
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 400)         0         
_________________________________________________________________
dense (Dense)                (None, None, 6)           2406      
_________________________________________________________________
crf (CRF)                    (None, None, 6)           3

In [18]:
# Restore previously saved weights from `filepath` into the freshly built model.
model.load_weights(filepath)

In [33]:
# Token-level evaluation: take the arg-max class at every position of every
# sequence, flatten both arrays, and build a per-class report.
# NOTE(review): padded positions are scored as well. In the recorded output the
# PADDING class accounts for 671656 of 835072 positions, so the accuracy and
# weighted-average rows are dominated by it; the LOC/ORG/PER/MISC rows are the
# informative ones. Consider masking padding before reporting.
y_pred_pre = model.predict(X_test)
y_true = np.argmax(y_test, axis=-1).flatten()
y_pred = np.argmax(y_pred_pre, axis=-1).flatten()
report = classification_report(y_true, y_pred, target_names=target_names, digits=4)
print(report)

              precision    recall  f1-score   support

           O     0.9837    0.9933    0.9885    147219
         LOC     0.8509    0.8373    0.8441      4874
         ORG     0.8644    0.7541    0.8055      6372
         PER     0.8942    0.7994    0.8442      4333
        MISC     0.5784    0.4951    0.5336       618
     PADDING     1.0000    1.0000    1.0000    671656

    accuracy                         0.9946    835072
   macro avg     0.8619    0.8132    0.8360    835072
weighted avg     0.9944    0.9946    0.9944    835072



#### Real text

In [20]:
# Sample sentence with irregular casing, used for the qualitative check below.
text = 'Ken Ko Ở Trong Team 4 nGười nữa à anh : ANH VINH , TRÂM ANH , TLOO , KEN Đó'

In [21]:
# Run the project's `evaluate` helper on the sample sentence; the result is
# handed to `visualize` in a later cell.
list_test = evaluate(model, text, max_len=256, vocabs=vocabs)

In [25]:
# Project helper called for its displayed output; presumably renders a legend
# for the tag visualization — TODO confirm against its definition.
text_visualize()

In [26]:
# Display the result that `evaluate` returned for the sample sentence.
visualize(list_test)

#### Error analysis (CSV export)

In [113]:
def analysis_csv(model, tensor, tokens, labels, tag_lookup=None):
    """
    Collect the sequences whose predicted tags differ from the true tags.

    Argument:
    --model: object whose predict(tensor) returns per-position class scores
    --tensor: encoded inputs handed to model.predict
    --tokens: original token sequences, in the same order as `tensor`
    --labels: true tag sequences aligned with `tokens`
    --tag_lookup: optional mapping from class index to tag string; when it is
      omitted the notebook-level `idx2tag` is used, as before

    Return:
    --df: DataFrame with the columns tokens / true_tags / pred_tags, holding one
      row per sequence whose predicted tags are not equal to its true tags.
      The columns are present even when there are no mismatches.
    """
    if tag_lookup is None:
        # Backward-compatible fallback to the global set by convert_to_tensor.
        tag_lookup = idx2tag

    scores = model.predict(tensor)
    pred_ids = np.argmax(scores, axis=-1).tolist()

    # Keep one predicted tag per real token; positions beyond the sequence
    # length (presumably padding) are dropped. Slicing also avoids an
    # IndexError if a prediction row is shorter than its token sequence.
    pred_tags = [
        [tag_lookup[tag_id] for tag_id in row[:len(sequence)]]
        for sequence, row in zip(tokens, pred_ids)
    ]

    # Only sequences with at least one disagreement are reported.
    rows = [
        {'tokens': sequence, 'true_tags': tags, 'pred_tags': pred}
        for sequence, tags, pred in zip(tokens, labels, pred_tags)
        if tags != pred
    ]
    return pd.DataFrame(rows, columns=['tokens', 'true_tags', 'pred_tags'])

In [117]:
# Export the validation sequences whose predicted tags differ from the true tags.
# to_csv is called with defaults, so the DataFrame index is written as the first column.
df_test_csv = analysis_csv(model, X_test, test_tokens, test_labels)
df_test_csv.to_csv('analysis_test.csv')

In [118]:
# Repeat the preprocessing and encoding for the training split.
# NOTE(review): this rebinds the globals target_names and idx2tag; analysis_csv
# reads the global idx2tag when it runs, so re-running earlier cells after this
# one uses the mapping derived from the training labels.
train_path = 'data/vlsp2018/train_word.pkl'
train_tokens, train_labels = processed_data(train_path, name='pkl')
X_train, y_train, target_names, idx2tag = convert_to_tensor(train_tokens, train_labels, vocabs, max_len=256)

In [119]:
# Export the training sequences whose predicted tags differ from the true tags.
df_train_csv = analysis_csv(model, X_train, train_tokens, train_labels)
df_train_csv.to_csv('analysis_train.csv')