In [3]:
from keras.models import Sequential, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers.crf import CRF
from keras_contrib.utils import save_load_utils
from keras_contrib.metrics import crf_accuracy
from keras_contrib.losses import crf_loss
from keras.utils import to_categorical
import pandas as pd
import matplotlib.pyplot as plt

import json
import os


In [4]:
def get_json(fn):
    with open(fn, 'r', encoding='utf-8') as f:
        text = ''.join(f.readlines())
    return json.loads(text)


class Dataprocessor:
    def __init__(self):
        self.id2tag = {0: 'PAD', 1: 'B0', 2: 'B1', 3: 'B2', 4: 'B3', 5: 'B4', 6: 'B-1', 
                       7: 'B-2', 8: 'B-3', 9: 'B-4', 10: 'M', 11: 'E'}
        self.tag2id = {}
        for i in self.id2tag:
            self.tag2id[self.id2tag[i]] = i
        self.max_paragraph = 0
    
    def getid(self, tag):
        return self.tag2id[tag]

    def gettag(self, id):
        return self.id2tag[id]

    def section_split(self, datalist):
        for js in datalist:
            del js["simple"]
            new_full = []
            for i in js["full"]["sectionContents"]:
                if len(i["text"]) == 0 or i["isContentSection"] == False:
                    continue
                i["text"] = i["text"].split('\n')
                new_full.append(i)
            self.max_paragraph = max(self.max_paragraph, len(new_full))
            js["full"]["sectionContents"] = new_full

    def tagging(self, data):
        for js in data:
            last = None
            for i in js["full"]["sectionContents"]:
                i["tags"] = []
                i["raw_tags"] = []
                for id, x in enumerate(i["text"]):
                    if last is None:
                        last = i["depth"]
                    th = i["depth"] - last
                    if id == len(i["text"]) - 1:
                        tag = "E"
                    else:
                        tag = "M"
                    if id == 0:
                        tag = "B%d"%(th)
                    i["tags"].append(self.getid(tag))
                    i["raw_tags"].append(tag)
                last = i["depth"]
    
    def load_data(self, jsonfilelist):
        article_texts = []
        article_tags = []
        article_rawtags = []
        for f in jsonfilelist:
            print('loading %s...' % (os.path.basename(f)))
            data = get_json(f)
            self.section_split(data)
            self.tagging(data)
            for article in data:
                paragraphs = []
                tags = []
                rawtags = []
                for sec in article['full']['sectionContents']:
                    paragraphs.extend(sec['text'])
                    tags.extend(sec['tags'])
                    rawtags.extend(sec['raw_tags'])
                article_texts.append(paragraphs)
                article_tags.append(tags)
                article_rawtags.append(rawtags)
        
        print('Finish loading data! Total %d articles.' % (len(article_texts)))
        return article_texts, article_tags, article_rawtags


In [15]:
data_path = './data/'
filelist = [(data_path + '%d.json' % i) for i in range(1)]
processor = Dataprocessor()
train_texts, train_tags, train_rawtags = processor.load_data(filelist)
print(len(train_texts), len(train_tags))
print(train_texts[1], train_tags[1])

loading 0.json...
Finish loading data! Total 226 articles.
226 226
['The 1996 African Cup of Nations was the 20th edition of the Africa Cup of Nations, the soccer championship of Africa (CAF). It was hosted by South Africa, who replaced original hosts Kenya. The field expanded for the first time to sixteen teams, split into four groups of four; the top two teams in each group advancing to the quarterfinals. However, Nigeria withdrew from the tournament at the final moment under pressure from then-dictator Sani Abacha, reducing the field to fifteen. South Africa won its first championship, beating Tunisia in the final 2−0.', '\xa0Algeria', '\xa0Angola', '\xa0Burkina Faso', '\xa0Cameroon', '\xa0Ivory Coast', '\xa0Egypt', '\xa0Gabon', '\xa0Ghana', '\xa0Liberia', '\xa0Mozambique', '\xa0Nigeria (holders)*', '\xa0Sierra Leone', '\xa0South Africa (hosts)', '\xa0Tunisia', '\xa0Zaire', '\xa0Zambia', "* Nigeria withdrew prior to the start of the finals. Guinea, as the best side to not qualify, w

In [51]:
from Dataprocessor import Dataprocessor

filelist = [('data/%d.json' % i) for i in range(500)]
processor = Dataprocessor()
train_texts, train_tags, train_rawtags = processor.load_data(filelist)

loading 0.json...
loading 1.json...
loading 2.json...
loading 3.json...
loading 4.json...
loading 5.json...
loading 6.json...
loading 7.json...
loading 8.json...
loading 9.json...
loading 10.json...
loading 11.json...
loading 12.json...
loading 13.json...
loading 14.json...
loading 15.json...
loading 16.json...
loading 17.json...
loading 18.json...
loading 19.json...
loading 20.json...
loading 21.json...
loading 22.json...
loading 23.json...
loading 24.json...
loading 25.json...
loading 26.json...
loading 27.json...
loading 28.json...
loading 29.json...
loading 30.json...
loading 31.json...
loading 32.json...
loading 33.json...
loading 34.json...
loading 35.json...
loading 36.json...
loading 37.json...
loading 38.json...
loading 39.json...
loading 40.json...
loading 41.json...
loading 42.json...
loading 43.json...
loading 44.json...
loading 45.json...
loading 46.json...
loading 47.json...
loading 48.json...
loading 49.json...
loading 50.json...
loading 51.json...
loading 52.json...
loa

loading 416.json...
loading 417.json...
loading 418.json...
loading 419.json...
loading 420.json...
loading 421.json...
loading 422.json...
loading 423.json...
loading 424.json...
loading 425.json...
loading 426.json...
loading 427.json...
loading 428.json...
loading 429.json...
loading 430.json...
loading 431.json...
loading 432.json...
loading 433.json...
loading 434.json...
loading 435.json...
loading 436.json...
loading 437.json...
loading 438.json...
loading 439.json...
loading 440.json...
loading 441.json...
loading 442.json...
loading 443.json...
loading 444.json...
loading 445.json...
loading 446.json...
loading 447.json...
loading 448.json...
loading 449.json...
loading 450.json...
loading 451.json...
loading 452.json...
loading 453.json...
loading 454.json...
loading 455.json...
loading 456.json...
loading 457.json...
loading 458.json...
loading 459.json...
loading 460.json...
loading 461.json...
loading 462.json...
loading 463.json...
loading 464.json...
loading 465.json...


In [57]:
from bert_utils import get_all_features
import os
BERT_BASE = os.path.join(os.getcwd(), 'bert/bert_model/uncased_L-12_H-768_A-12')

sample_sum = 200

bert_config_file = os.path.join(BERT_BASE, 'bert_config.json')
vocab_file = os.path.join(BERT_BASE, 'vocab.txt')
bert_checkpoint = os.path.join(BERT_BASE, 'bert_model.ckpt')
    
feature = get_all_features(train_texts[0:sample_sum], bert_config_file, vocab_file, bert_checkpoint)
print(len(feature))

# 8520: paragraphs
# 6000: 
# 0: item
# 1: 14.. sentence per item
# 2: 768 vec

Total 8520 paragraphs
0 ...
2000 ...
4000 ...
6000 ...
8000 ...
200


In [74]:
print(np.array(train_texts[0]).shape)
print(np.array(feature).shape)
print(np.array(train_tags[0]).shape)
print(train_texts[0])
print(train_tags[0])

                


(14,)
(200,)
(100,)
["The 1992 Republican National Convention was held in the Astrodome in Houston, Texas, from August 17 to August 20, 1992. The convention nominated President George H. W. Bush and Vice President Dan Quayle for reelection. It was Bush's fourth consecutive appearance as a candidate on a major party ticket; only Bush and Franklin D. Roosevelt have been nominated on four consecutive presidential tickets. Richard M. Nixon and Roosevelt were nominated five times, but not consecutively.", "The convention is notable in that it featured the last major address of former President and Bush's Predecessor Ronald Reagan's long political career. In his speech, Reagan told Americans that:", "Whatever else history may say about me when I'm gone, I hope it will record that I appealed to your best hopes, not your worst fears, to your confidence rather than your doubts. My dream is that you will travel the road ahead with liberty's lamp guiding your steps and opportunity's arm steadying

In [32]:
class LSTMmodel:
    def __init__(self, input_length, para_emb_dim, num_tags, hidden_dim=200, dropout=0.5):
        self.num_tags = num_tags
        self.model = Sequential()
        self.model.add(Bidirectional(LSTM(hidden_dim, return_sequences=True), input_shape=(input_length, para_emb_dim)))
        self.model.add(Dropout(dropout))
        # self.model.add(Bidirectional(LSTM(hidden_dim, return_sequences=True), input_shape=(input_length, para_emb_dim)))
        # self.model.add(Dropout(dropout))
        self.model.add(TimeDistributed(Dense(self.num_tags)))
        crf = CRF(self.num_tags)
        self.model.add(crf)
        self.model.compile('rmsprop', loss=crf_loss, metrics=[crf_accuracy])
    
    def save_model(self, filepath):
        save_load_utils.save_all_weights(self.model, filepath)
    
    def restore_model(self, filepath):
        save_load_utils.load_all_weights(self.model, filepath)
        
    def train(self, trainX, trainY, batch_size=32, epochs=10, validation_split=0.1, verbose=1):
        return self.model.fit(trainX, np.array(trainY), batch_size=batch_size, epochs=epochs, 
                             validation_split=validation_split, verbose=verbose)

In [33]:
INPUT_LENGTH = 100
PARAGRAPH_EMB_DIM = 768
NUM_TAGS = 12

model = LSTMmodel(INPUT_LENGTH, PARAGRAPH_EMB_DIM, NUM_TAGS)
model.model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 100, 400)          1550400   
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 400)          0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 100, 12)           4812      
_________________________________________________________________
crf_1 (CRF)                  (None, 100, 12)           324       
Total params: 1,555,536
Trainable params: 1,555,536
Non-trainable params: 0
_________________________________________________________________


In [59]:
# load data
import numpy as np

tags = train_tags[0:sample_sum]
X, rawY = [], [] # X is 3D: article, paragraph, embedding; Y is 2D: article, paragraph
for f, t in zip(feature, tags):
    
    while len(f) < INPUT_LENGTH:
        f.append(np.zeros(PARAGRAPH_EMB_DIM))
        t.append(0)
    f = f[0:INPUT_LENGTH]
    t = t[0:INPUT_LENGTH]
    X.append(f)
    rawY.append(t)
    
Y = [to_categorical(y, num_classes=NUM_TAGS) for y in rawY] # Y is now 3D

data_size = len(X)
train_size = int(data_size * 0.9)
trainX, trainY = X[:train_size], Y[:train_size]
testX, testY = X[train_size:], Y[train_size:]

In [60]:
# train
history = model.model.fit(np.array(trainX), np.array(trainY), batch_size=32, epochs=10, validation_split=0.1)

Train on 162 samples, validate on 18 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [61]:
# Predict on test
test_pred = model.model.predict(np.array(testX), verbose=1)



In [62]:
truecnt = 0
falsecnt = 0
for (i, pred) in enumerate(test_pred):
    for j, p in enumerate(pred):
        if np.argmax(testY[i][j]) != 0:
            if np.argmax(p) == np.argmax(testY[i][j]):
                truecnt += 1
            else:
                falsecnt += 1
print(truecnt, falsecnt, truecnt/(truecnt+falsecnt))

409 326 0.5564625850340136


In [64]:
model.save_model("model_save/init.h5")

In [67]:
# plot
# plt.style.use("ggplot")
# plt.figure(figsize=(12,12))
# plt.plot(history["acc"])
# plt.plot(history["val_acc"])
# plt.show()

In [77]:
import csv
def save_train_tsv(texts, tags):
    
    file_path = "./data/train.tsv"

    with open(file_path, 'w') as file:
        tsv_writer = csv.writer(file, delimiter='\t')
        
        for text, tag in zip(texts, tags):
            for i in range(len(text)):
                tsv_writer.writerow([text[i], tag[i]])

save_train_tsv(train_texts, train_tags)

In [81]:
def read_tsv():
    
    file_path = "./data/train.tsv"
    with open(file_path, 'r') as file:
        reader = csv.reader(file, delimiter='\t')
        for row in reader:
            print(row)
            break

read_tsv()

["The 1992 Republican National Convention was held in the Astrodome in Houston, Texas, from August 17 to August 20, 1992. The convention nominated President George H. W. Bush and Vice President Dan Quayle for reelection. It was Bush's fourth consecutive appearance as a candidate on a major party ticket; only Bush and Franklin D. Roosevelt have been nominated on four consecutive presidential tickets. Richard M. Nixon and Roosevelt were nominated five times, but not consecutively.", '1']


In [105]:
import csv
from bert import modeling

from bert.run_classifier import DataProcessor

data_dir = './data/'

class MyProcessor(DataProcessor):
    

    def get_train_examples(self, data_dir):
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        return ["0", "1", "2", "3"]

    def _create_examples(self, lines, set_type):
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            if set_type == "test":
                text_a = tokenization.convert_to_unicode(line[0])
                label = "0"
            else:
                text_a = tokenization.convert_to_unicode(line[1])
                label = tokenization.convert_to_unicode(line[0])
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples

class EditTsv:
    
    file_path = "./data/"
    train_rate = 0.8
    test_rate = 0.2

    @staticmethod
    def read_tsv():

        with open(EditTsv.file_path + 'train.tsv', 'r') as file:
            reader = csv.reader(tsvfile, delimiter='\t')
            for row in reader:
                print(row)
                input()

    @staticmethod
    def save_train_tsv(texts, tags):
        
        with open(EditTsv.file_path + 'train.tsv', 'w') as file:
            tsv_writer = csv.writer(file, delimiter='\t')
            for text, tag in zip(texts, tags):
                for i in len(text):
                    tsv_writer.writerow([text[i], tag[i]])

    @staticmethod
    def save_dev_tsv(texts, tags):

        with open(file_path + 'dev.tsv', 'w') as file:
            tsv_writer = csv.writer(file, delimiter='\t')
            for text, tag in zip(texts, tags):
                for i in len(text):
                    tsv_writer.writerow([text[i], tag[i]])
    
    @staticmethod
    def save_test_tsv(texts, tags):    
        
        with open(EditTsv.file_path + 'test.tsv', 'w') as file:
            tsv_writer = csv.writer(file, delimiter='\t')
            for text, tag in zip(texts, tags):
                for i in len(text):
                    tsv_writer.writerow([text[i], tag[i]])
                    
processors = {
      "myproc": MyProcess
}

ModuleNotFoundError: No module named 'modeling'