In [32]:
from keras.models import Sequential, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers.crf import CRF
from keras_contrib.utils import save_load_utils
from keras_contrib.metrics import crf_accuracy
from keras_contrib.losses import crf_loss
from keras.utils import to_categorical
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.models import load_model
from keras import backend as K
import tensorflow as tf
import pickle, os

In [33]:
# Base directory for the BERT files used by this notebook, resolved against the
# current working directory of the kernel.
_BERT_RELATIVE_DIR = 'bert/bert_model/uncased_L-12_H-768_A-12'
BERT_BASE = os.path.join(os.getcwd(), _BERT_RELATIVE_DIR)

In [34]:
from utils import *

In [35]:
class LSTMmodel:
    """BiLSTM + CRF sequence tagger wrapped around a Keras ``Sequential`` model.

    Layer stack built in ``__init__``:
    Bidirectional(LSTM) -> Dropout -> TimeDistributed(Dense) -> CRF.
    Each sample is expected to have shape ``(input_length, para_emb_dim)``.
    """

    def __init__(self, input_length, para_emb_dim, num_tags, hidden_dim=200, dropout=0.5):
        """Build and compile the network.

        Args:
            input_length: number of time steps per sample.
            para_emb_dim: size of the feature vector at each time step.
            num_tags: number of tags; used as the unit count of the Dense and CRF layers.
            hidden_dim: unit count of the LSTM that is wrapped by ``Bidirectional``.
            dropout: rate passed to the ``Dropout`` layer.
        """
        self.num_tags = num_tags
        self.model = Sequential()
        self.model.add(Bidirectional(LSTM(hidden_dim, return_sequences=True),
                                     input_shape=(input_length, para_emb_dim)))
        self.model.add(Dropout(dropout))
        # A second, identical BiLSTM + Dropout pair is disabled in this version:
        # self.model.add(Bidirectional(LSTM(hidden_dim, return_sequences=True), input_shape=(input_length, para_emb_dim)))
        # self.model.add(Dropout(dropout))
        self.model.add(TimeDistributed(Dense(self.num_tags)))
        crf = CRF(self.num_tags)
        self.model.add(crf)
        # The CRF layer is trained with its own loss/metric from keras_contrib.
        self.model.compile('rmsprop', loss=crf_loss, metrics=[crf_accuracy])

    def save_model(self, filepath):
        """Write the model's weights to ``filepath`` via keras_contrib's ``save_all_weights``."""
        save_load_utils.save_all_weights(self.model, filepath)

    def restore_model(self, filepath):
        """Load weights from ``filepath`` into ``self.model`` via keras_contrib's ``load_all_weights``."""
        save_load_utils.load_all_weights(self.model, filepath)

    def train(self, trainX, trainY, batch_size=32, epochs=10, validation_split=0.1, verbose=1):
        """Fit the model and return whatever ``self.model.fit`` returns.

        Both ``trainX`` and ``trainY`` are converted to NumPy arrays first. A plain
        nested list for ``trainX`` would otherwise be interpreted by Keras as a list
        of separate model inputs instead of one batch tensor.

        Args:
            trainX: array-like of input sequences.
            trainY: array-like of targets.
            batch_size, epochs, validation_split, verbose: forwarded to ``fit``.
        """
        return self.model.fit(np.asarray(trainX), np.asarray(trainY),
                              batch_size=batch_size, epochs=epochs,
                              validation_split=validation_split, verbose=verbose)

    @staticmethod
    def myloss(y_true, y_pred):
        """Manually computed categorical cross-entropy over the last axis.

        Not used by ``compile`` in this class (which uses ``crf_loss``). Written
        purely with Keras backend ops so it does not depend on ``tf.log``, which
        no longer exists in TensorFlow 2.x.
        """
        # Normalise so the last axis sums to 1 (new tensor; the argument is not modified).
        y_pred = y_pred / K.sum(y_pred, axis=-1, keepdims=True)
        # Clip away exact 0/1 so the logarithm stays finite.
        y_pred = K.clip(y_pred, K.epsilon(), 1. - K.epsilon())
        return -K.sum(y_true * K.log(y_pred), axis=-1)
        

In [36]:
# Load the previously saved training data. `load_train_data` is not defined in this
# notebook; presumably it is provided by `from utils import *` -- confirm.
#
# Disabled recipe (apparently how the saved data was produced from the raw JSON files):
#     from Dataprocessor import Dataprocessor
#     filelist = [('data/%d.json' % i) for i in range(500)]
#     processor = Dataprocessor()
#     train_texts, train_tags, train_rawtags = processor.load_data(filelist)
#     save_train_data(train_texts, train_tags, train_rawtags)
train_texts, train_tags, train_rawtags = load_train_data()

# Sanity check: the three collections should have the same length.
print(*map(len, (train_texts, train_tags, train_rawtags)))

114975 114975 114975


In [37]:
from bert_utils import get_all_features

# Paths of the three BERT files, each located directly under BERT_BASE.
bert_config_file, vocab_file, bert_checkpoint = (
    os.path.join(BERT_BASE, filename)
    for filename in ('bert_config.json', 'vocab.txt', 'bert_model.ckpt')
)

# Feature extraction / loading is disabled in this cell; get_train() and get_test()
# read the pickled feature files themselves.
# feature = get_all_features(train_texts[:], bert_config_file, vocab_file, bert_checkpoint)
# with open('save_model/feature_1.pk', 'rb') as f:
#     feature = pickle.load(f)
# print(len(feature))


In [38]:
# Hyper-parameters shared by the model and by the padding code in get_train()/get_test().
INPUT_LENGTH = 100        # time steps per sample (sequences are padded/truncated to this)
PARAGRAPH_EMB_DIM = 768   # feature-vector size per time step
NUM_TAGS = 12             # number of tag classes

# Build the tagger and print its layer summary.
model = LSTMmodel(
    input_length=INPUT_LENGTH,
    para_emb_dim=PARAGRAPH_EMB_DIM,
    num_tags=NUM_TAGS,
)
model.model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_3 (Bidirection (None, 100, 400)          1550400   
_________________________________________________________________
dropout_3 (Dropout)          (None, 100, 400)          0         
_________________________________________________________________
time_distributed_3 (TimeDist (None, 100, 12)           4812      
_________________________________________________________________
crf_3 (CRF)                  (None, 100, 12)           324       
Total params: 1,555,536
Trainable params: 1,555,536
Non-trainable params: 0
_________________________________________________________________


In [39]:
def _pad_features_and_tags(feature, tags):
    """Pad/truncate every (features, tags) pair to INPUT_LENGTH and one-hot the tags.

    Pairs are formed with ``zip``, so surplus items of the longer argument are ignored.
    New lists are built for every sample: the inner lists of ``tags`` (which belong to
    the global ``train_tags``) and of ``feature`` are never modified.

    Returns:
        (X, Y): ``X`` is a list of samples, each a list of INPUT_LENGTH vectors
        (short samples are filled with ``np.zeros(PARAGRAPH_EMB_DIM)``);
        ``Y`` is a list with one ``to_categorical(..., num_classes=NUM_TAGS)``
        array per sample (short tag sequences are filled with tag 0).
    """
    X, rawY = [], []  # X is 3D: article, paragraph, embedding; rawY is 2D: article, paragraph
    for sample_feats, sample_tags in zip(feature, tags):
        feats = list(sample_feats[:INPUT_LENGTH])
        missing = INPUT_LENGTH - len(feats)
        feats.extend(np.zeros(PARAGRAPH_EMB_DIM) for _ in range(missing))

        labels = list(sample_tags[:INPUT_LENGTH])
        labels.extend([0] * (INPUT_LENGTH - len(labels)))  # tag 0 marks padding

        X.append(feats)
        rawY.append(labels)

    Y = [to_categorical(y, num_classes=NUM_TAGS) for y in rawY]  # Y is now 3D
    return X, Y


def get_train(i):
    """Return ``(X, Y)`` training data for shard ``i``.

    Features come from ``save_model/feature_<i>.pk``; tags come from the matching
    5000-item slice of the global ``train_tags``.

    NOTE: ``pickle.load`` executes arbitrary code from the file -- only use it on
    feature files produced by this project.
    """
    with open('save_model/feature_%d.pk' % i, 'rb') as f:
        feature = pickle.load(f)

    tags = train_tags[i*5000:(i+1)*5000]
    if i == 1:
        # Drop train_tags[5000:6000]: those items are the test set (see get_test).
        # Presumably feature_1.pk therefore starts at item 6000 -- confirm.
        tags = tags[1000:]
    if i == 0:
        # feature_0.pk also holds the test items (5000:6000); keep only the first 5000.
        feature = feature[0:5000]

    return _pad_features_and_tags(feature, tags)


def get_test():
    """Return ``(X, Y)`` for the test set: items 5000-5999 of shard 0 and of ``train_tags``.

    NOTE: ``pickle.load`` executes arbitrary code from the file -- only use it on
    feature files produced by this project.
    """
    with open('save_model/feature_0.pk', 'rb') as f:
        feature = pickle.load(f)[5000:6000]
    tags = train_tags[5000:6000]
    return _pad_features_and_tags(feature, tags)

In [40]:
# Build the evaluation set once when this cell runs (items 5000-5999, see get_test);
# test_accuracy() reads these two globals.
testX, testY = get_test()
def test_accuracy():
    # Predict on test
    test_pred = model.model.predict(np.array(testX), verbose=1)
    truecnt = 0
    falsecnt = 0
    for (i, pred) in enumerate(test_pred):
        for j, p in enumerate(pred):
            if np.argmax(testY[i][j]) != 0:
                if np.argmax(p) == np.argmax(testY[i][j]):
                    truecnt += 1
                else:
                    falsecnt += 1
    acc = truecnt/(truecnt+falsecnt)
    print('True: %d, False: %d, Acc: %f' % (truecnt, falsecnt, acc))
    return acc

In [41]:
# Disabled cell: the code below is wrapped in a bare string literal, so it is never
# executed; the cell only echoes the string as its output. Its pad/truncate logic is
# the same as in get_train()/get_test(). NOTE(review): it reads a `feature` variable
# that no active code in the visible cells defines -- it would fail if re-enabled as is.
'''
# load data
tags = train_tags[6000:10000]
X, rawY = [], [] # X is 3D: article, paragraph, embedding; Y is 2D: article, paragraph
for f, t in zip(feature, tags):
    while len(f) < INPUT_LENGTH:
        f.append(np.zeros(PARAGRAPH_EMB_DIM))
    while len(t) < INPUT_LENGTH:
        t.append(0)
    f = f[0:INPUT_LENGTH]
    t = t[0:INPUT_LENGTH]
    X.append(f)
    rawY.append(t)
    
Y = [to_categorical(y, num_classes=NUM_TAGS) for y in rawY] # Y is now 3D

data_size = len(X)
train_size = int(data_size * 0.9)
trainX, trainY = X[:train_size], Y[:train_size]
testX, testY = X[train_size:], Y[train_size:]
'''

'\n# load data\ntags = train_tags[6000:10000]\nX, rawY = [], [] # X is 3D: article, paragraph, embedding; Y is 2D: article, paragraph\nfor f, t in zip(feature, tags):\n    while len(f) < INPUT_LENGTH:\n        f.append(np.zeros(PARAGRAPH_EMB_DIM))\n    while len(t) < INPUT_LENGTH:\n        t.append(0)\n    f = f[0:INPUT_LENGTH]\n    t = t[0:INPUT_LENGTH]\n    X.append(f)\n    rawY.append(t)\n    \nY = [to_categorical(y, num_classes=NUM_TAGS) for y in rawY] # Y is now 3D\n\ndata_size = len(X)\ntrain_size = int(data_size * 0.9)\ntrainX, trainY = X[:train_size], Y[:train_size]\ntestX, testY = X[train_size:], Y[train_size:]\n'

In [42]:
# train
# Resume from an earlier checkpoint, then make several passes over all feature shards.
# After each shard: evaluate on the test set and write a weights checkpoint.
N_ROUNDS = 3   # passes over the full set of shards
N_SHARDS = 6   # shards are loaded through get_train(0) ... get_train(5)

model.model.load_weights('save_model/base_1_4.h5')
for i in range(N_ROUNDS):
    for j in range(N_SHARDS):
        trainX, trainY = get_train(j)
        model.model.fit(np.array(trainX), np.array(trainY), batch_size=32, epochs=4, validation_split=0.1)
        acc = test_accuracy()  # prints the True/False/Acc line itself
        fname = 'save_model/base_100_%d_%d.h5' % (i+1, j+1)
        model.model.save_weights(fname)
        # Report only after save_weights() has returned, so a failed write is never
        # announced as saved.
        print('Saved to ' + fname)

Train on 4500 samples, validate on 500 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
True: 24972, False: 9989, Acc: 0.714282
Saved to save_model/base_100_1_1.h5
Train on 3600 samples, validate on 400 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
True: 25232, False: 9729, Acc: 0.721718
Saved to save_model/base_100_1_2.h5
Train on 4500 samples, validate on 500 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
True: 25221, False: 9740, Acc: 0.721404
Saved to save_model/base_100_1_3.h5
Train on 4500 samples, validate on 500 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
True: 25281, False: 9680, Acc: 0.723120
Saved to save_model/base_100_1_4.h5
Train on 4500 samples, validate on 500 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
True: 25053, False: 9908, Acc: 0.716598
Saved to save_model/base_100_1_5.h5
Train on 4500 samples, validate on 500 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
True: 25157, False: 9804, Acc: 0.719573
Saved to save_model/base_100_1_6.h5
Train on 4500 samples,

Epoch 3/4
Epoch 4/4
True: 25426, False: 9535, Acc: 0.727268
Saved to save_model/base_100_2_5.h5
Train on 4500 samples, validate on 500 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
True: 25387, False: 9574, Acc: 0.726152
Saved to save_model/base_100_2_6.h5
Train on 4500 samples, validate on 500 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
True: 25426, False: 9535, Acc: 0.727268
Saved to save_model/base_100_3_1.h5
Train on 3600 samples, validate on 400 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
True: 25271, False: 9690, Acc: 0.722834
Saved to save_model/base_100_3_2.h5
Train on 4500 samples, validate on 500 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
True: 25412, False: 9549, Acc: 0.726867
Saved to save_model/base_100_3_3.h5
Train on 4500 samples, validate on 500 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
True: 25416, False: 9545, Acc: 0.726981
Saved to save_model/base_100_3_4.h5
Train on 4500 samples, validate on 500 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Tr

In [43]:
# plot
# plt.style.use("ggplot")
# plt.figure(figsize=(12,12))
# plt.plot(hist["acc"])
# plt.plot(hist["val_acc"])
# plt.show()