In [1]:
from keras.models import Sequential, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers.crf import CRF
from keras_contrib.utils import save_load_utils
from keras_contrib.metrics import crf_accuracy
from keras_contrib.losses import crf_loss
from keras.utils import to_categorical
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.models import load_model
from keras import backend as K
import tensorflow as tf
import pickle, os

Using TensorFlow backend.


In [2]:
# Directory holding the pretrained BERT files (config, vocab, checkpoint).
# It is resolved against the current working directory, so the resulting path
# depends on where the kernel was started.
# NOTE(review): the directory name follows the usual BERT release naming
# (uncased, 12 layers, hidden size 768, 12 heads) -- confirm against the download.
BERT_BASE = os.path.join(os.getcwd(), 'bert/bert_model/uncased_L-12_H-768_A-12')

In [3]:
from utils import *

In [4]:
class LSTMmodel:
    """Bidirectional-LSTM sequence tagger topped with a CRF layer.

    The network is a Keras ``Sequential`` stack
    (BiLSTM -> Dropout -> TimeDistributed(Dense) -> CRF) compiled with the
    keras_contrib CRF loss and CRF accuracy metric, using the 'rmsprop'
    optimizer.

    Attributes:
        num_tags: number of output tags per timestep.
        model: the compiled Keras model.
    """

    def __init__(self, input_length, para_emb_dim, num_tags, hidden_dim=200, dropout=0.5):
        """Build and compile the network.

        Args:
            input_length: number of timesteps in every input sample.
            para_emb_dim: length of the feature vector at each timestep.
            num_tags: number of tags the Dense and CRF layers emit.
            hidden_dim: units of the LSTM (per direction).
            dropout: rate of the Dropout layer placed after the BiLSTM.
        """
        self.num_tags = num_tags
        self.model = Sequential()
        self.model.add(Bidirectional(LSTM(hidden_dim, return_sequences=True), input_shape=(input_length, para_emb_dim)))
        self.model.add(Dropout(dropout))
        # Optional second BiLSTM + Dropout pair, currently disabled:
        # self.model.add(Bidirectional(LSTM(hidden_dim, return_sequences=True), input_shape=(input_length, para_emb_dim)))
        # self.model.add(Dropout(dropout))
        self.model.add(TimeDistributed(Dense(self.num_tags)))
        crf = CRF(self.num_tags)
        self.model.add(crf)
        self.model.compile('rmsprop', loss=crf_loss, metrics=[crf_accuracy])

    def save_model(self, filepath):
        """Write the model's weights to ``filepath`` via keras_contrib's save_load_utils."""
        save_load_utils.save_all_weights(self.model, filepath)

    def restore_model(self, filepath):
        """Load weights from ``filepath`` into the already-built model."""
        save_load_utils.load_all_weights(self.model, filepath)

    def train(self, trainX, trainY, batch_size=32, epochs=10, validation_split=0.1, verbose=1):
        """Fit the model and return whatever ``model.fit`` returns.

        Args:
            trainX: input samples (array-like; lists are converted to an ndarray).
            trainY: target samples (array-like; lists are converted to an ndarray).
            batch_size, epochs, validation_split, verbose: forwarded to ``fit``.
        """
        # Keras interprets a plain Python list as "one array per model input",
        # so a list of samples has to be turned into a single ndarray first.
        # Previously only trainY was converted.
        return self.model.fit(np.asarray(trainX), np.asarray(trainY), batch_size=batch_size, epochs=epochs,
                              validation_split=validation_split, verbose=verbose)

    @staticmethod
    def myloss(y_true, y_pred):
        """Hand-written categorical cross-entropy over the last axis.

        ``y_pred`` is normalised to sum to 1 along the last axis and clipped to
        [epsilon, 1 - epsilon] before the log is taken. The model above is
        compiled with ``crf_loss``, not with this function.
        """
        y_pred /= tf.reduce_sum(y_pred, -1, True)
        # manual computation of crossentropy
        y_pred = K.clip(y_pred, K.epsilon(), 1. - K.epsilon())
        return -tf.reduce_sum(y_true * tf.log(y_pred), -1)
        

In [5]:
# One-off preprocessing, kept for reference. Its result was cached with
# save_train_data(), so the notebook only reloads the cache below.
#
#   from Dataprocessor import Dataprocessor
#   filelist = [('data/%d.json' % i) for i in range(500)]
#   processor = Dataprocessor()
#   train_texts, train_tags, train_rawtags = processor.load_data(filelist)
#   save_train_data(train_texts, train_tags, train_rawtags)

train_texts, train_tags, train_rawtags = load_train_data()
# Sanity check: the three parallel collections should have the same length.
print(*(len(part) for part in (train_texts, train_tags, train_rawtags)))

114975 114975 114975


In [6]:
from bert_utils import get_all_features

# Paths of the config, vocabulary and checkpoint files inside BERT_BASE.
bert_config_file, vocab_file, bert_checkpoint = (
    os.path.join(BERT_BASE, filename)
    for filename in ('bert_config.json', 'vocab.txt', 'bert_model.ckpt')
)
    
# feature = get_all_features(train_texts[:], bert_config_file, vocab_file, bert_checkpoint)
# with open('save_model/feature_1.pk', 'rb') as f:
#     feature = pickle.load(f)
# print(len(feature))


In [7]:
# Model dimensions shared by the data-loading helpers and the network.
INPUT_LENGTH = 100        # timesteps per sample; sequences are padded/truncated to this
PARAGRAPH_EMB_DIM = 768   # length of the feature vector at each timestep
NUM_TAGS = 12             # number of tag classes

model = LSTMmodel(
    input_length=INPUT_LENGTH,
    para_emb_dim=PARAGRAPH_EMB_DIM,
    num_tags=NUM_TAGS,
)
model.model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 100, 400)          1550400   
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 400)          0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 100, 12)           4812      
_________________________________________________________________
crf_1 (CRF)                  (None, 100, 12)           324       
Total params: 1,555,536
Trainable params: 1,555,536
Non-trainable params: 0
_________________________________________________________________


In [33]:
def get_train(i, chunk_size=5000):
    """Load feature chunk ``i`` with its tags, padded/truncated to INPUT_LENGTH.

    Reads ``save_model/feature_<i>.pk`` and pairs its entries with
    ``train_tags[i*chunk_size:(i+1)*chunk_size]``.

    Args:
        i: index of the feature pickle / tag slice.
        chunk_size: number of tag sequences per chunk (default 5000).
            NOTE(review): assumed to equal the number of entries in each
            feature pickle -- confirm against the code that wrote them.

    Returns:
        (X, Y): ``X`` is a list of samples, each a list of INPUT_LENGTH feature
        vectors (zero vectors of length PARAGRAPH_EMB_DIM as padding); ``Y`` is
        a list of one-hot arrays produced by ``to_categorical`` with
        NUM_TAGS classes (tag 0 as padding).
    """
    # NOTE: pickle.load can execute arbitrary code; only load caches produced
    # by this project.
    with open('save_model/feature_%d.pk' % i, 'rb') as fh:
        feature = pickle.load(fh)

    tags = train_tags[i * chunk_size:(i + 1) * chunk_size]

    X, rawY = [], []  # X is 3D: article, paragraph, embedding; rawY is 2D: article, paragraph
    for feats, tag_seq in zip(feature, tags):
        # Build fresh lists instead of appending in place: the slice above only
        # copies the outer list, so appending would permanently alter the
        # sequences stored in the global train_tags.
        feats = list(feats[:INPUT_LENGTH])
        feats.extend(np.zeros(PARAGRAPH_EMB_DIM) for _ in range(INPUT_LENGTH - len(feats)))
        tag_seq = list(tag_seq[:INPUT_LENGTH])
        tag_seq.extend([0] * (INPUT_LENGTH - len(tag_seq)))
        X.append(feats)
        rawY.append(tag_seq)

    Y = [to_categorical(y, num_classes=NUM_TAGS) for y in rawY]  # Y is now 3D
    return X, Y

        
def get_test(i):
    """Load chunk ``i`` for evaluation.

    Evaluation data is prepared exactly like training data (same pickle, same
    tag slice, same padding and one-hot encoding), so this delegates to
    ``get_train`` instead of repeating its body.

    Args:
        i: index of the feature pickle / tag slice.

    Returns:
        (X, Y) as returned by ``get_train(i)``.
    """
    return get_train(i)

In [31]:
# Chunk used for evaluation; the training loop skips this chunk id.
# These two lines must run before test_accuracy is defined, because its default
# arguments are bound to testX/testY at definition time.
testPickleID = 12
testX, testY = get_test(testPickleID)
def test_accuracy(X=testX, Y=testY, min_len=6):
    """Score the global ``model`` on (X, Y), ignoring padding.

    For every sample, positions are scored up to (not including) the first
    position whose gold tag is 0 -- the value the loaders pad with. Note that a
    genuine tag 0 inside a sequence therefore also ends scoring for that sample.

    Args:
        X: input samples, converted with ``np.array`` before prediction.
        Y: one-hot gold tags, indexed as ``Y[sample][position]``.
        min_len: minimum number of scored positions a sample needs to be
            eligible as the "best" sample (default 6).

    Returns:
        (acc, _max, _maxarg): overall accuracy over all scored positions
        (0.0 when nothing was scored), the best per-sample accuracy among
        eligible samples, and the index of that sample.
    """
    # Predict on test
    test_pred = model.model.predict(np.array(X), verbose=1)
    truecnt = 0
    falsecnt = 0
    _max = 0
    _maxarg = 0
    for i, pred in enumerate(test_pred):
        gold = np.argmax(np.asarray(Y[i]), axis=-1)
        guess = np.argmax(pred, axis=-1)

        # Number of real (non-padding) positions: index of the first gold tag 0,
        # or the whole sequence when no padding is present. This replaces the
        # previous reliance on the inner loop variable after the loop, which was
        # off by one without padding and undefined for an empty prediction.
        pad_positions = np.flatnonzero(gold == 0)
        length = int(pad_positions[0]) if pad_positions.size else len(gold)
        length = min(length, len(guess))

        tcnt = int(np.sum(guess[:length] == gold[:length]))
        fcnt = length - tcnt

        sample_acc = tcnt / length if length != 0 else 0
        if sample_acc > _max and length >= min_len:
            _max = sample_acc
            _maxarg = i
        truecnt += tcnt
        falsecnt += fcnt

    total = truecnt + falsecnt
    # Guard against ZeroDivisionError when no position was scored at all.
    acc = truecnt / total if total != 0 else 0.0
    print('True: %d, False: %d, Acc: %f' % (truecnt, falsecnt, acc))
    return acc, _max, _maxarg

In [17]:
# Disabled earlier version of the data preparation, kept as a bare string
# literal so it is not executed (the cell merely echoes the string).
# It relies on a `feature` variable that the active cells no longer define.
'''
# load data
tags = train_tags[6000:10000]
X, rawY = [], [] # X is 3D: article, paragraph, embedding; Y is 2D: article, paragraph
for f, t in zip(feature, tags):
    while len(f) < INPUT_LENGTH:
        f.append(np.zeros(PARAGRAPH_EMB_DIM))
    while len(t) < INPUT_LENGTH:
        t.append(0)
    f = f[0:INPUT_LENGTH]
    t = t[0:INPUT_LENGTH]
    X.append(f)
    rawY.append(t)
    
Y = [to_categorical(y, num_classes=NUM_TAGS) for y in rawY] # Y is now 3D

data_size = len(X)
train_size = int(data_size * 0.9)
trainX, trainY = X[:train_size], Y[:train_size]
testX, testY = X[train_size:], Y[train_size:]
'''

'\n# load data\ntags = train_tags[6000:10000]\nX, rawY = [], [] # X is 3D: article, paragraph, embedding; Y is 2D: article, paragraph\nfor f, t in zip(feature, tags):\n    while len(f) < INPUT_LENGTH:\n        f.append(np.zeros(PARAGRAPH_EMB_DIM))\n    while len(t) < INPUT_LENGTH:\n        t.append(0)\n    f = f[0:INPUT_LENGTH]\n    t = t[0:INPUT_LENGTH]\n    X.append(f)\n    rawY.append(t)\n    \nY = [to_categorical(y, num_classes=NUM_TAGS) for y in rawY] # Y is now 3D\n\ndata_size = len(X)\ntrain_size = int(data_size * 0.9)\ntrainX, trainY = X[:train_size], Y[:train_size]\ntestX, testY = X[train_size:], Y[train_size:]\n'

In [18]:
# train
# Resume from an earlier checkpoint, then make 5 passes over the feature chunks
# 15..0 (skipping the held-out chunk), fitting one epoch per chunk and scoring
# the held-out chunk after each fit. Weights are saved once per pass.
model.model.load_weights('save_model/base_100n_1_5.h5')
for i in range(5):
    for j in range(15, -1, -1):
        if j == testPickleID:
            continue
        trainX, trainY = get_train(j)
        model.model.fit(np.array(trainX), np.array(trainY), batch_size=32, epochs=1, validation_split=0.1)
        acc, _m, _marg = test_accuracy()
    fname = 'save_model/base_100nn_%d.h5' % (i+1)
    # Save first and report afterwards, so the message is only printed when the
    # weights file was actually written.
    model.model.save_weights(fname)
    print('Saved to ' + fname)

Train on 4500 samples, validate on 500 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
True: 127159, False: 46702, Acc: 0.731383
Saved to save_model/base_100n_1_12.h5
Train on 4500 samples, validate on 500 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
True: 127143, False: 46718, Acc: 0.731291
Saved to save_model/base_100n_1_11.h5
Train on 4500 samples, validate on 500 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
True: 127045, False: 46816, Acc: 0.730727
Saved to save_model/base_100n_1_10.h5
Train on 4500 samples, validate on 500 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
True: 127176, False: 46685, Acc: 0.731481
Saved to save_model/base_100n_1_9.h5
Train on 4500 samples, validate on 500 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
True: 127015, False: 46846, Acc: 0.730555
Saved to save_model/base_100n_1_8.h5
Train on 4500 samples, validate on 500 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
True: 126417, False: 47444, Acc: 0.727115
Saved to save_model/base_100n_1_7.h5
T

KeyboardInterrupt: 

In [None]:
# plot
# plt.style.use("ggplot")
# plt.figure(figsize=(12,12))
# plt.plot(hist["acc"])
# plt.plot(hist["val_acc"])
# plt.show()

In [32]:
# Reload a saved checkpoint and score it on each of the 16 feature chunks,
# printing chunk id, accuracy, best per-sample accuracy and that sample's index.
# NOTE(review): in the recorded output chunk 1 scores ~0.24 while the other
# chunks score ~0.74 -- worth checking that feature_1.pk lines up with its tags.
model.model.load_weights('save_model/base_100n_1_5.h5')
for i in range(16):
    trainX, trainY = get_train(i)
    acc, _m, _marg = test_accuracy(trainX, trainY)
    print(i, acc, _m, _marg)

True: 127643, False: 43690, Acc: 0.745000
0 0.7449995038900854 1.0 3
True: 32108, False: 103180, Acc: 0.237331
1 0.23733073147655373 0.9473684210526315 3718
True: 126840, False: 45364, Acc: 0.736568
2 0.7365682562542101 1.0 25
True: 124491, False: 43727, Acc: 0.740058
3 0.7400575443769395 1.0 6
True: 128801, False: 41533, Acc: 0.756167
4 0.7561672948442472 1.0 4
True: 128398, False: 43906, Acc: 0.745183
5 0.7451829324914105 1.0 25
True: 127299, False: 45097, Acc: 0.738410
6 0.7384104039536881 1.0 32
True: 125879, False: 45687, Acc: 0.733706
7 0.7337059790401361 1.0 4


KeyboardInterrupt: 

In [34]:
# Inspect one article. 8718 = 1 * 5000 + 3718, i.e. sample 3718 of chunk 1 --
# presumably picked because the per-chunk evaluation reported it as the
# best-scoring sample of that chunk (TODO confirm).
print(train_texts[8718])

["The National College of Arts (Urdu: قومی کالج هنر\u202c\u200e or NCA) is a public art school located in Lahore Punjab, Pakistan. NCA is the oldest art school in Pakistan and the second oldest in South Asia. As of 2016, the college is ranked as Pakistan's top art school. NCA maintains five departments in fine art, design film and TV, musicology and architecture and consists of over 800 students. The college runs faculty and student exchange programs with School of Fine Arts, University of New South Wales, École nationale supérieure des Beaux-Arts and the Instituto Superior de Arte. It also hosts the UNESCO Chair in architecture.", "NCA was originally founded in 1875 as the Mayo School of Industrial Arts and was one of two art colleges created by the British Crown in British India in reaction to the Arts & Crafts Movement. The Mayo School of Industrial Arts was named in honor of the recently assassinated British Viceroy Lord Mayo in 1875. John Lockwood Kipling becoming the school's fir

In [37]:
# Write the paragraphs of article 8718 to a text file, one paragraph per line.
# The text contains non-ASCII characters, so the encoding is set explicitly;
# relying on the platform default can raise UnicodeEncodeError on systems whose
# default is not UTF-8.
with open('save_model/example.txt', 'w', encoding='utf-8') as f:
    for line in train_texts[8718]:
        f.write(line + '\n')