<a href="https://colab.research.google.com/github/Herais/NLP_Learning_by_Selective_Data/blob/main/Open.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
####[1]####
# Cells to run before this one: None
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Navigate to project directory
import os
if not os.path.exists('/content/drive/MyDrive/Github'):
  !mkdir Github
path_github   = '/content/drive/MyDrive/Github'
os.chdir(path_github)

# git clone
#!git clone https://github.com/Herais/NLP_Learning_by_Selective_Data.git
path_wd = path_github + '/' + 'NLP_Learning_by_Selective_Data'
os.chdir(path_wd)
!pwd

# INSTALL REQUIRED PACKAGES
!pip install -r requirements.txt
%tensorflow_version 1.x

Mounted at /content/drive
/content/drive/MyDrive/Github/NLP_Learning_by_Selective_Data


In [None]:
####[2]####
# LOAD AND EXPLORE DATASETS
# Cells to run before this one: [1]  (defines path_wd)

# Import Libraries
import numpy as np
import pandas as pd

# Path
path_datasets ='/datasets'

# Load Dataset: Movie Title and Summary
path_dataset_movie = '/Movie_Title_and_Summary'
df_movie = pd.read_json(path_wd + path_datasets + path_dataset_movie + '/train.json', orient='records', lines=True)
# Only the last expression of a cell is rendered automatically, so the movie
# preview has to be displayed explicitly or it is silently discarded.
display(df_movie.head(10))

# Load Dataset: Computer Research Literature (CSL) Title and Abstract
path_dataset_csl = '/CSL'
df_csl = pd.read_json(path_wd + path_datasets + path_dataset_csl + '/train_new_2500.json', orient='records', lines=True)
df_csl.head(10)

In [None]:
####[3]####
# LOAD TRAINED MODELS
# Cells to run before this one: [1],[2]
# Loads three previously saved Keras models from <path_wd>/models.

#Load Libraries
from tensorflow import keras

# Set Path
path_models = '/models'

# Download Existing Models if Necessary
# 1. model StarChaser
# 2. model Scholar
# 3. model Knowitall

#Load Models
# the Star Chaser Model trained with movie dataset only
model_StarChaser = keras.models.load_model(path_wd + path_models + '/model_StarChaser.bin')

# the Scholar Model trained with Computer Science Literature only
model_Scholar = keras.models.load_model(path_wd + path_models + '/model_Scholar.bin')

# the Know-it-all Model Trained with both movie and cscl dataset
# Bound to its own name so it no longer overwrites model_Scholar.
model_Knowitall = keras.models.load_model(path_wd + path_models + '/model_Knowitall.bin')

In [3]:
# SET MODEL PARAMETERS
# Cells to run before this one: [1],[2]
# Defines the decoding/training hyper-parameters, the paths of the pretrained
# Chinese BERT files and the tokenizer used by the prediction cells.

# Set Model Parameter
epochs = 10 # number of epochs to train
batch_size = 8 # number of samples per batch
maxlen = 256 # length budget shared by the input text and the generated title
topk = 1 # beam size passed to beam search

# Set Pretrained Model Parameters (BERT)
# Defined here so this cell does not depend on the training cell further down,
# which is otherwise the only place this name is assigned.
path_pretrained_model = '/pretrained_model'
config_path = path_wd + path_pretrained_model + '/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = path_wd + path_pretrained_model + '/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = path_wd + path_pretrained_model + '/chinese_L-12_H-768_A-12/vocab.txt'

# Load Vocab
from bert4keras.tokenizers import Tokenizer, load_vocab

# keep_tokens is passed to build_transformer_model by the model cells so the
# model vocabulary matches token_dict.
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

In [5]:
# LOAD TRAINED MODELS
# Cells to run before this one:
# [1] Mount Drive, Install Requirements (defines path_wd)
# SET MODEL PARAMETERS cell (defines config_path, checkpoint_path, keep_tokens)

# Load Libraries
from tensorflow import keras
from bert4keras.models import build_transformer_model

# Set Path
path_models = '/models'


def _load_unilm_model(weights_file):
    """Build a UniLM-style BERT model and load fine-tuned weights into it.

    weights_file: file name of the weights inside <path_wd>/models.
    Returns the model with the weights loaded.
    """
    unilm = build_transformer_model(
        config_path,
        checkpoint_path,
        application='unilm',
        keep_tokens=keep_tokens,  # include only tokens in keep tokens
    )
    unilm.load_weights(path_wd + path_models + '/' + weights_file)
    return unilm


# the Star Chaser Model trained with movie dataset only
model_StarChaser = _load_unilm_model('best_model_StarChaser.weights')

# the Scholar Model trained with Computer Science Literature only
model_Scholar = _load_unilm_model('best_model_Scholar.weights')

# the Know-it-all Model Trained with both movie and cscl dataset
model_Knowitall = _load_unilm_model('best_model_Knowitall.weights')


Using TensorFlow backend.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [3]:
# MAKE PREDICTION
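# Cells to run before this one:
# [1] Mount Drive, Install Requirements (defines path_wd)
# SET MODEL PARAMETERS cell (defines tokenizer, maxlen, topk)
# LOAD TRAINED MODELS cell (defines model_Knowitall)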

# Load Library
import pandas as pd
import numpy as np
import json
from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder

# Set Path
path_datasets ='/datasets'
path_predictions = '/predictions'
path_dataset_movie = '/Movie_Title_and_Summary'
path_dataset_csl = '/CSL'
# Input: CSL test split. Output: titles predicted by the Know-it-all model.
path_test = ''.join([path_wd, path_datasets, path_dataset_csl, '/test.json'])
path_prediction = ''.join([path_wd, path_predictions, '/model_Knowitall_predicts_CSL_Titles.json'])

# Load Test Data (one JSON object per line)
with open(path_test, encoding='utf-8') as f:
    test_data = [json.loads(line.strip()) for line in f]

# Decoder: seq2seq
class AutoTitle(AutoRegressiveDecoder):
    """Seq2seq decoder that generates a title with model_Knowitall."""

    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        """Return the model output at the last position for the current beams.

        inputs: [token_ids, segment_ids] of the source text.
        output_ids: ids generated so far; appended to the source ids with
            segment id 1.
        """
        src_token_ids, src_segment_ids = inputs
        generated_segment_ids = np.ones_like(output_ids)
        all_token_ids = np.concatenate([src_token_ids, output_ids], 1)
        all_segment_ids = np.concatenate(
            [src_segment_ids, generated_segment_ids], 1
        )
        # CHANGE MODEL NAME HERE TO SWITCH PREDICTING MODELS:
        probas = model_Knowitall.predict([all_token_ids, all_segment_ids])
        return probas[:, -1]

    def generate(self, text, topk=1):
        """Encode `text`, run beam search with beam size `topk`, decode the result."""
        # Reserve self.maxlen positions of the global maxlen for the output.
        content_budget = maxlen - self.maxlen
        encoded = tokenizer.encode(text, maxlen=content_budget)
        token_ids, segment_ids = encoded
        best_ids = self.beam_search([token_ids, segment_ids], topk=topk)
        return tokenizer.decode(best_ids)

autotitle = AutoTitle(start_id=None, end_id=tokenizer._token_end_id, maxlen=32)

# Write to Prediction Path
# One JSON object per line: {"id": ..., "title": <predicted title>}.
import os
# Make sure the output folder exists; a missing folder is one possible cause
# of a FileNotFoundError when the file is opened for writing.
os.makedirs(os.path.dirname(path_prediction), exist_ok=True)
# The titles are written with ensure_ascii=False, so fix the file encoding
# explicitly instead of relying on the platform default.
with open(path_prediction, 'w', encoding='utf-8') as f:
    for data in test_data:
        pred_title = ''.join(autotitle.generate(data["abst"], topk)).lower()
        output = {}
        output["id"] = data["id"]
        output["title"] = pred_title
        f.write(json.dumps(output, ensure_ascii = False))
        f.write('\n')

FileNotFoundError: ignored

In [10]:
# RESULT ANALYSIS
# Previous

In [None]:
# TRAIN MORE MODELS
# Cells to run before this one: [1]
# [1] Mount Drive, Install Requirements

# Load Libraries
from __future__ import print_function
import numpy as np
import json
from tqdm import tqdm
from bert4keras.backend import keras, K
from bert4keras.layers import Loss
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, open
from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder
from keras.models import Model
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import utils  # LocalLibraries

# Set Path (both are relative to path_wd)
path_models, path_datasets = '/models', '/datasets'

# Set Name of this model
# Used when saving the best weights and the final model.
########################################
model_name = 'model_ScholarNew'
########################################

# Load Train and Dev Data
path_train = ''.join([path_wd, path_datasets, '/CSL/train_new_2500.json'])
path_dev = ''.join([path_wd, path_datasets, '/CSL/dev.json'])
train_data = utils.load_data(path_train)
valid_data = utils.load_data(path_dev)
# Could use pandas, but pandas has limitation on cell string length

# Set Model Parameter
epochs = 10       # number of training epochs
batch_size = 16   # samples per training batch
maxlen = 256      # length budget for the encoded content + title
topk = 1          # beam size used when decoding

# Set BERT Path
# Download pretrained BERT model if necesary
path_pretrained_model = '/pretrained_model'
path_bert_chinese = path_wd + path_pretrained_model + '/chinese_L-12_H-768_A-12'
if not os.path.exists(path_bert_chinese):
 # Download Model
  !wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip -P './pretrained_model/'
  %cd pretrained_model
  !unzip chinese_L-12_H-768_A-12.zip
  !rm -f chinese_L-12_H-768_A-12.zip
  os.chdir(path_wd)
  !pwd
config_path = path_wd + path_pretrained_model + '/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = path_wd + path_pretrained_model + '/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = path_wd + path_pretrained_model + '/chinese_L-12_H-768_A-12/vocab.txt'

# Load Vocab and Tokenize
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
 
# Data Generator
class data_generator(DataGenerator):
    """Data generator.

    Encodes each (title, content) sample as one sequence (content first,
    title second) and yields padded batches of token ids and segment ids.
    """
    def __iter__(self, random=False):
        token_rows, segment_rows = [], []
        for is_end, (title, content) in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(
                content, title, maxlen=maxlen
            )
            token_rows.append(token_ids)
            segment_rows.append(segment_ids)
            batch_is_full = len(token_rows) == self.batch_size
            if not (batch_is_full or is_end):
                continue
            # Emit the batch once it is full or the data is exhausted.
            padded_tokens = sequence_padding(token_rows)
            padded_segments = sequence_padding(segment_rows)
            yield [padded_tokens, padded_segments], None
            token_rows, segment_rows = [], []
train_generator = data_generator(train_data, batch_size)

# Loss Computation
class CrossEntropy(Loss):
    """Cross-entropy loss with the input part masked out.
    """
    def compute_loss(self, inputs, mask=None):
        token_ids, segment_ids, predictions = inputs
        targets = token_ids[:, 1:]        # target token ids
        weights = segment_ids[:, 1:]      # segment ids mark exactly the part to predict
        shifted = predictions[:, :-1]     # predicted sequence, shifted by one position
        per_token = K.sparse_categorical_crossentropy(targets, shifted)
        # Average the loss over the positions selected by the mask only.
        return K.sum(per_token * weights) / K.sum(weights)

# Decoder: seq2seq
class AutoTitle(AutoRegressiveDecoder):
    """Seq2seq decoder that generates a title with the global `model`."""

    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        """Return the model output at the last position for the current beams."""
        context_ids, context_segments = inputs
        # Ids generated so far are appended to the source with segment id 1.
        joined_ids = np.concatenate([context_ids, output_ids], 1)
        joined_segments = np.concatenate(
            [context_segments, np.ones_like(output_ids)], 1
        )
        # `model` is the model built and trained later in this cell.
        step_output = model.predict([joined_ids, joined_segments])
        return step_output[:, -1]

    def generate(self, text, topk=1):
        """Encode `text`, run beam search with beam size `topk`, decode the result."""
        # Reserve self.maxlen positions of the global maxlen for the output.
        max_c_len = maxlen - self.maxlen
        source = list(tokenizer.encode(text, maxlen=max_c_len))
        winner_ids = self.beam_search(source, topk=topk)  # beam search
        return tokenizer.decode(winner_ids)

autotitle = AutoTitle(start_id=None, end_id=tokenizer._token_end_id, maxlen=32)

# Callback
class Evaluator(keras.callbacks.Callback):
    """Callback that scores the model on the dev data after every epoch.

    Computes ROUGE-1/2/L and BLEU for generated titles and saves the model
    weights whenever the BLEU score improves.
    """
    def __init__(self, model_name=''):
        """model_name: prefix of the weights file written for the best epoch."""
        super(Evaluator, self).__init__()
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.
        self.valid_data = valid_data
        self.model_name = model_name # save weights given model name

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate on the stored dev data and keep the best-BLEU weights."""
        import os
        metrics = self.evaluate(self.valid_data)  # evaluate the model
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            # The target folder is not created anywhere else in this cell.
            os.makedirs('./weights', exist_ok=True)
            model.save_weights('./weights/' + self.model_name + '_best_model.weights')  # save the model
        metrics['best_bleu'] = self.best_bleu
        print('valid_data:', metrics)

    def evaluate(self, data, topk=1):
        """Return average ROUGE-1/2/L F-scores and BLEU over `data`.

        data: iterable of (title, content) pairs.
        topk: beam size used when generating titles.
        Samples with an empty prediction count towards the total and
        contribute 0 to every metric. Empty `data` yields all-zero metrics.
        """
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
        for title, content in tqdm(data):
            total += 1
            # Space-separate the characters so each one is scored as a token.
            title = ' '.join(title).lower()
            pred_title = ' '.join(autotitle.generate(content, topk)).lower()
            if pred_title.strip():
                scores = self.rouge.get_scores(hyps=pred_title, refs=title)
                rouge_1 += scores[0]['rouge-1']['f']
                rouge_2 += scores[0]['rouge-2']['f']
                rouge_l += scores[0]['rouge-l']['f']
                bleu += sentence_bleu(
                    references=[title.split(' ')],
                    hypothesis=pred_title.split(' '),
                    smoothing_function=self.smooth
                )
        if total:  # avoid dividing by zero when there is nothing to evaluate
            rouge_1 /= total
            rouge_2 /= total
            rouge_l /= total
            bleu /= total
        return {
            'rouge-1': rouge_1,
            'rouge-2': rouge_2,
            'rouge-l': rouge_l,
            'bleu': bleu,
        }
evaluator = Evaluator(model_name)

# Initialize Model with BERT
unilm = build_transformer_model(
    config_path,
    checkpoint_path,
    application='unilm',
    keep_tokens=keep_tokens,  # reduce vocab size
)
# Attach the masked cross-entropy loss layer and wrap everything into the
# model that is trained below and used by AutoTitle / Evaluator as `model`.
output = CrossEntropy(2)(unilm.inputs + unilm.outputs)
model = Model(unilm.inputs, output)
model.compile(optimizer=Adam(1e-5))
model.summary()

# Train Model
fit_options = dict(
    steps_per_epoch=len(train_generator),
    epochs=epochs,
    callbacks=[evaluator],
)
model.fit(train_generator.forfit(), **fit_options)

# Save Trained Model
models_dir = path_wd + path_models
utils.save_model_with_dt(models_dir, model, model_name)

  'be expecting any data to be passed to {0}.'.format(name))


Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, None)         0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Embedding-Token (Embedding)     multiple             10432512    Input-Token[0][0]                
                                                                 MLM-Norm[0][0]                   
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, None, 768)    1536        Input-Segment[0][0]        