In [1]:
import numpy as np
import pandas as pd
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Embedding, LSTM, Bidirectional, Dense
from keras.models import Sequential, load_model
from keras_contrib.layers import CRF
from keras.callbacks import ModelCheckpoint
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Config
max_len = 80
batch_size = 64
epochs = 100
embedding = 64
hidden_size = 50

## Load data

In [3]:
def load_data(path_file = r'D:\Documents\Đồ án tốt nghiệp\Mobile-pricing\Mobile\NER\tagger-model\train.txt'):
    sentences = []
    sentence = []
    word = []
    tag = []
    with open(path_file, encoding='utf8') as file_in:
        for line in file_in:
            if line == '\n': 
                sentences.append(sentence)
                sentence = []
            else:
                w, t = line.lower().split()
                word.append(w)
                tag.append(t)
                sentence.append((w, t))
    return sentences, word, tag

## Process data

In [8]:
def process_data(sentence, word, tag):
    # Tạo dict word, tag
    word = list(set(word))
    tag = list(set(tag))
    num_word = len(word) + 2
    num_tag = len(tag) + 1
    
    # Tạo dict của word to index
    word_index = {w: i + 2 for i, w in enumerate(word)}
    word_index['UNK'] = 1
    word_index['PAD'] = 0
    # index to word
    index_word = {i: w for w, i in word_index.items()}
    # word to vector
    X = [[word_index[w[0]] for w in s]for s in sentence]
    # padding 
    X = pad_sequences(X, maxlen=max_len, padding='post', truncating='post', value = word_index['PAD'])
    
    # Tạo dict của tag to index 
    tag_index = {t : i + 1 for i, t in enumerate(tag)}
    tag_index['PAD'] = 0
    # Tạo index to tag
    index_tag = {i : t for t, i in tag_index.items()}
    # tag to vector
    y = [[tag_index[t[1]] for t in s]for s in sentence]
    # padd
    y = pad_sequences(y, maxlen=max_len, padding='post', truncating='post', value=tag_index['PAD'])
    
    # Chuyển y về dạng one hot vector
    y = [to_categorical(i, num_classes=num_tag) for i in y]
    return X, y, word_index, index_word, tag_index, index_tag, num_tag, num_word

In [9]:
def process_data_test(sentence, word_index, tag_index, word):
    # word to vector
    X = []
    for s in sentence:
        line = []
        for w in s:
            if(w[0] not in set(word)):
                line.append(1)
            else:
                line.append(word_index[w[0]])
        X.append(line)
                
    # padding 
    X = pad_sequences(X, maxlen=max_len, padding='post', truncating='post', value = word_index['PAD'])
    
    # tag to vector
    y = [[tag_index[t[1]] for t in s]for s in sentence]
    # padd
    y = pad_sequences(y, maxlen=max_len, padding='post', truncating='post', value=tag_index['PAD'])
    
    # Chuyển y về dạng one hot vector
    y = [to_categorical(i, num_classes=num_tag) for i in y]
    return X, y

## Build model

In [10]:
def build_model(num_word, num_tag):
    model = Sequential()
    model.add(Embedding(input_dim = num_word+1, output_dim=embedding, input_length=max_len))
    model.add(Bidirectional(LSTM(units=hidden_size, return_sequences=True)))
    model.add(Dense(hidden_size, activation='relu'))
    crf = CRF(num_tag)
    model.add(crf)
    model.summary()
    model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
    return model

In [11]:
sentence_train, word, tag = load_data()
sentence_test , word_test, tag_test = load_data(r'D:\Documents\Đồ án tốt nghiệp\Mobile-pricing\Mobile\NER\tagger-model\test.txt')
X_train, y_train, word_index, index_word, tag_index, index_tag, num_tag, num_word = process_data(sentence_train, word, tag)
X_test, y_test = process_data_test(sentence_test, word_index, tag_index, word)

In [14]:
if not os.path.exists("model.hdf5"):
    model = build_model(num_word, num_tag)
    checkpoint = ModelCheckpoint(filepath = 'model.hdf5', save_best_only = True, monitor='val_loss')
    history = model.fit(X_train, np.array(y_train), batch_size=batch_size, epochs=epochs,
                        validation_split=0.1, callbacks=[checkpoint])
else:
    model = build_model(num_word, num_tag)
    model.load_weights("model.hdf5")

Instructions for updating:
Colocations handled automatically by placer.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 80, 64)            197056    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 80, 100)           46000     
_________________________________________________________________
dense_1 (Dense)              (None, 80, 50)            5050      
_________________________________________________________________
crf_1 (CRF)                  (None, 80, 36)            3204      
Total params: 251,310
Trainable params: 251,310
Non-trainable params: 0
_________________________________________________________________




Instructions for updating:
Use tf.cast instead.
Train on 1440 samples, validate on 161 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoc

Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


## Kiểm thử

In [15]:
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)
y_test_true = np.argmax(y_test, axis=-1)

# Kiểm thử F1-score
y_pred = [[index_tag[i] for i in row]for row in y_pred]
y_test_true = [[index_tag[i] for i in row]for row in y_test_true]
print('F1-score:', f1_score(y_test_true, y_pred))

F1-score: 0.7855375832540439


