# A14: Named Entity Disambiguation

## Data Set Overview
- Stocks(Entity Dictionary)
    - company_2_code_sub.txt: 
        - ./drive/My Drive/fwwb_workspace/data_from_fwwb/company_2_code_sub.txt
    - company_2_code_full.txt
        - ./drive/My Drive/fwwb_workspace/data_from_fwwb/company_2_code_full.txt
- NED Corpus
    - train.json
        - ./drive/My Drive/fwwb_workspace/data_from_fwwb/train.json
    - dev.json
        - ./drive/My Drive/fwwb_workspace/data_from_fwwb/dev.json
- Complementary Corpus
    - raw_texts.txt
        - ./drive/My Drive/fwwb_workspace/data_from_fwwb/raw_texts.txt

## Configurations

In [15]:
%tensorflow_version 1.x
!pip install kashgari-tf
!pip install jieba



In [0]:
import jieba
import jieba.posseg as pseg
import os


# Root of the workspace directory that holds the data, BERT files and models.
WORKSPACE_PATH = os.path.join('.', 'drive', 'My Drive', 'fwwb_workspace')

# Tokenization granularity of the corpus.
MODE = 'word' # 'char', 'word', 'pos-tagged'
# Whether the saved stock names are loaded into jieba as a user dictionary.
CUSTOMIZED_DICT = True

# Whether a BERT embedding replaces the model's default embedding.
USE_BERT = False
BERT_PATH = os.path.join(WORKSPACE_PATH, 'bert')


# `params` is later unpacked into NerCorpus.load_data / NerCorpus.json2iob.
if MODE == 'char':

    params = {
        'unit': 'char',
    }

elif MODE == 'word':

    params = {
        'unit': 'word',
        'tokenize_func': jieba.lcut,
    }

elif MODE == 'pos-tagged':

    params = {
        'unit': 'pos-tagged',
        # the raw text is handed to the POS segmenter unchanged
        'tokenize_func': lambda x: x,
        'pseg_func': pseg.lcut,
    }

else:
    # Fail here with a clear message instead of leaving `params` undefined
    # and crashing later with an unrelated error.
    raise ValueError(f"unknown MODE {MODE!r}; expected 'char', 'word' or 'pos-tagged'")


# From here on MODE doubles as a suffix for the saved model names.
if USE_BERT:
    if MODE != 'char':
        raise Exception("BERT must be applied under 'char' MODE")
    MODE += '_bert'

if CUSTOMIZED_DICT:
    if MODE not in ['word', 'pos-tagged']:
        raise Exception("'char' mode won't trigger the user dictionary application")
    MODE += '_cd'

## Loading Data
Now that all the data files have been listed above, let's load them for training.

### Stocks
Stocks are dictionary-like, i.e. the data is stored in key-value form. The keys are the "stock_name" values (for hash lookup), and the values are pandas.Series objects holding the details of a specific stock.

In [17]:
import pandas as pd
import logging


logging.getLogger().setLevel(logging.DEBUG)


class Stocks(dict):
    """
    Dictionary of stocks keyed by "stock_name".

    Each value is the pandas.Series row holding the details of that stock
    (the columns of the tables read in ``load``).
    """

    __sub_path__ = os.path.join(WORKSPACE_PATH, 'data_from_fwwb', 'company_2_code_sub.txt')
    __full_path__ = os.path.join(WORKSPACE_PATH, 'data_from_fwwb', 'company_2_code_full.txt')
    __dict__path__ = os.path.join(WORKSPACE_PATH, 'data_from_fwwb', 'stock_names.txt')

    @classmethod
    def load(cls):
        """
        Build a ``Stocks`` instance from the "sub" and "full" stock tables.

        Rows of the full table whose ``stock_name`` is missing from the sub
        table are appended with ``kb_id = -1``.

        Returns:
            a ``Stocks`` mapping ``stock_name`` -> row (pandas.Series)
        """
        # NOTE(review): the sub table is read with its own header row, which
        # presumably provides at least 'kb_id' and 'stock_name' -- confirm.
        stocks_sub = pd.read_csv(cls.__sub_path__, sep='\t')
        stocks_full = pd.read_csv(cls.__full_path__, sep='\t', names=['stock_name', 'stock_full_name', 'stock_code'])
        # Set default id = -1
        stocks_full.insert(0, 'kb_id', -1)

        # difference_set = stocks_full - stocks_sub
        only_in_full = ~stocks_full['stock_name'].isin(stocks_sub['stock_name'])
        difference_set = stocks_full.loc[only_in_full]

        table = pd.concat([stocks_sub, difference_set], copy=False)
        table.index = table['stock_name']
        # the index is the stock name, so it can serve directly as the key
        dic = {name: row for name, row in table.iterrows()}

        logging.debug(f"loaded {len(dic)} stocks from {cls.__sub_path__} and {cls.__full_path__}.")

        return cls(dic)

    def save(self):
        """
        Write one "<stock_name> nt" line per stock to the dictionary file.

        The path written to is stored in ``self.dict_path``.
        """
        dict_path = self.__dict__path__
        with open(dict_path, 'w', encoding='utf-8') as file:
            # Emit the tag on every line; joining with ' nt\n' left the last
            # entry without its tag.
            file.writelines(f"{name} nt\n" for name in self)
        logging.debug(f"saved {len(self)} stock names to {dict_path}.")
        self.dict_path = dict_path


# Build the stock dictionary and write its names to the dictionary file.
stocks = Stocks.load()
stocks.save()

# Register the stock names with jieba only when the user dictionary is enabled.
if CUSTOMIZED_DICT:
    jieba.load_userdict(stocks.dict_path)

DEBUG:root:loaded 11444 stocks from ./drive/My Drive/fwwb_workspace/data_from_fwwb/company_2_code_sub.txt and ./drive/My Drive/fwwb_workspace/data_from_fwwb/company_2_code_full.txt.
DEBUG:root:saved 11444 stock names to ./drive/My Drive/fwwb_workspace/data_from_fwwb/stock_names.txt.


Let's peek at several samples from it.

In [18]:
# Print the first few stocks as a sanity check.
n_items = 3
keys = list(stocks.keys())[:n_items]
# Iterate over the keys actually present so that a dictionary with fewer
# than n_items entries does not raise IndexError.
for key in keys:
    print(f'"{key}"\n----------')
    print(f"{stocks[key]}")
    print("# =============================================== #\n")

"美丽生态"
----------
kb_id                         0
stock_name                 美丽生态
stock_full_name    深圳美丽生态股份有限公司
stock_code                   10
Name: 美丽生态, dtype: object

"大悦城"
----------
kb_id                          1
stock_name                   大悦城
stock_full_name    大悦城控股集团股份有限公司
stock_code                    31
Name: 大悦城, dtype: object

"农产品"
----------
kb_id                           2
stock_name                    农产品
stock_full_name    深圳市农产品集团股份有限公司
stock_code                     61
Name: 农产品, dtype: object



### NED Corpus
There are two JSON files containing the train set and the validation set respectively. We need to reformat them into character-level or word-level token lists (optionally with POS tags) and the corresponding IOB tag lists for training.

In [19]:
# Given Train and Valid Set
import json
import logging
from typing import Tuple, List, Dict, Callable
from tqdm import tqdm_notebook
from kashgari import utils


logging.getLogger().setLevel(logging.DEBUG)


class NerCorpus(object):
    """
    Convert the NED corpus between its JSON form and token / IOB-tag sequences.

    features: ``[['海', '钓', '比', '赛', '地', '点', '在', '厦', '门', ...], ...]``
    labels: ``[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', ...], ...]``

    Sample::

        train_x, train_y = NerCorpus.load_data('train')
        valid_x, valid_y = NerCorpus.load_data('valid')
    """
    __train_path__ = os.path.join(WORKSPACE_PATH, 'data_from_fwwb', 'train.json')
    # NOTE: the attribute name is misspelled ("vaild"); it is kept unchanged
    # because code outside this class may reference it.
    __vaild_path__ = os.path.join(WORKSPACE_PATH, 'data_from_fwwb', 'dev.json')


    @classmethod
    def json2iob(cls, 
                 json_data: Dict,
                 unit: str = 'char',
                 tokenize_func: Callable = None,
                 pseg_func: Callable = None,
                 shuffle: bool = True) -> Tuple[List[List[str]], List[List[str]]]:
        """
        Turn JSON samples into parallel token and IOB-tag sequences.

        Args:
            json_data: iterable of dicts with a 'text' entry and an optional
                'lab_result' list whose items carry 'mention' and 'offset'
                (character offset of the mention inside 'text').
            unit: 'char', 'word' or 'pos-tagged'.
            tokenize_func: text -> tokens; required for 'word' / 'pos-tagged'.
            pseg_func: tokens -> (word, flag) pairs; required for 'pos-tagged'.
                Tokens are then emitted as "word|flag".
            shuffle: shuffle features and labels in unison before returning.

        Returns:
            (x_data, y_data): token sequences and their IOB tag sequences.

        Raises:
            Exception: a required tokenize / pos-segment function is missing.
            ValueError: ``unit`` is not one of the supported values.
        """
        x_data, y_data = [], []
        for text_and_tags in tqdm_notebook(json_data):
            text = text_and_tags['text']

            # Inclusive (first_char, last_char) span of every labelled mention.
            mentions = []
            for result in text_and_tags.get('lab_result') or []:
                offset = int(result['offset'])
                mentions.append((offset, offset + len(result['mention']) - 1))

            if unit == 'char':
                x = list(text)
                y = ['O'] * len(x)
                for offset, ends_at in mentions:
                    for i in range(offset, ends_at + 1):
                        y[i] = 'B' if i == offset else 'I'

            elif unit in ('word', 'pos-tagged'):
                if tokenize_func is None:
                    raise Exception("tokenize function must be given when unit = 'word' or 'pos-tagged")

                tokens = tokenize_func(text)

                if unit == 'pos-tagged':
                    if pseg_func is None:
                        raise Exception("pos segment function must be given when unit = 'pos-tagged'")

                    pairs = [(w, f) for w, f in pseg_func(tokens)]
                    # Keep the bare words for length accounting; recovering
                    # them by splitting "word|flag" breaks on words that
                    # themselves contain "|".
                    words = [w for w, _ in pairs]
                    x = [w + "|" + f for w, f in pairs]
                else:
                    words = x = tokens

                y = ['O'] * len(x)

                # Inclusive character span covered by each token.
                token_spans = []
                char_idx = 0
                for word in words:
                    token_spans.append((char_idx, char_idx + len(word) - 1))
                    char_idx += len(word)

                # Tag every mention (not just the last one): the token that
                # contains the mention start is 'B', tokens starting inside
                # the mention afterwards are 'I'.
                for offset, ends_at in mentions:
                    for token_idx, (tok_start, tok_end) in enumerate(token_spans):
                        if tok_start <= offset <= tok_end:
                            y[token_idx] = "B"
                        elif offset <= tok_start <= ends_at:
                            y[token_idx] = "I"

            else:
                raise ValueError(f"unsupported unit: {unit!r}")

            x_data.append(x)
            y_data.append(y)

        if shuffle:
            x_data, y_data = utils.unison_shuffled_copies(x_data, y_data)

        return x_data, y_data

    @classmethod
    def load_data(cls,
                  subset_name: str = 'train',
                  unit: str = 'char',
                  tokenize_func: Callable = None,
                  pseg_func: Callable = None,
                  shuffle: bool = True) -> Tuple[List[List[str]], List[List[str]]]:
        """
        Load a corpus file and convert it with ``json2iob``.

        Args:
            subset_name: 'train' reads the train file; any other value reads
                the validation (dev) file.
            unit, tokenize_func, pseg_func, shuffle: forwarded to ``json2iob``.

        Returns:
            (x_data, y_data) as produced by ``json2iob``.
        """
        if subset_name == 'train':
            file_path = cls.__train_path__
        else:
            file_path = cls.__vaild_path__

        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
            x_data, y_data = cls.json2iob(data, unit, tokenize_func, pseg_func, shuffle)

            logging.debug(f"loaded {len(x_data)} samples from {file_path}. Sample:\n"
                          f"x[0]: {x_data[0]}\n"
                          f"y[0]: {y_data[0]}")
            return x_data, y_data
    
    @classmethod
    def iob2json(cls, x_data: List[List[str]], y_data: List[List[str]], y_probs: List[List[str]] = None) -> List[Dict]:
        """
        Turn token / IOB-tag sequences back into JSON-like samples.

        Only mentions found in the module-level ``stocks`` dictionary are
        reported; their 'kb_id' is taken from there.

        Args:
            x_data: token sequences; "word|flag" tokens are reduced to "word".
            y_data: IOB tag sequences; a sequence may be shorter than its
                token sequence, in which case the extra tokens are ignored.
            y_probs: optional per-token values, indexed like ``y_data``; the
                value at a mention's 'B' token becomes its 'confidence'
                (1.0 when ``y_probs`` is not given).

        Returns:
            one dict per sample with 'text_id', 'text' and, when at least one
            known mention was found, 'lab_result'.
        """
        json_data = []
        for text_id, (x, y) in tqdm_notebook(enumerate(zip(x_data, y_data)), total=len(x_data)):
            # process if in "pos-tagged" mode
            words = []
            for token in x:
                parts = token.split("|")
                words.append(parts[0] if len(parts) == 2 else token)

            # Collect mentions as [text, offset, confidence]. Each mention is
            # tracked together with its own offset, so an 'I' without an open
            # 'B' can no longer shift mentions against offsets, and no
            # in-band separator string is needed.
            found = []
            open_mention = None
            char_idx = 0
            # zip stops at the shorter sequence (the model may truncate tags)
            for idx, (word, tag) in enumerate(zip(words, y)):
                if tag == 'B':
                    confidence = y_probs[text_id][idx] if y_probs else 1.0
                    open_mention = [word, char_idx, confidence]
                    found.append(open_mention)
                elif tag == 'I' and open_mention is not None:
                    open_mention[0] += word
                else:
                    # any other tag (or an orphan 'I') closes the mention
                    open_mention = None

                char_idx += len(word)

            item = {'text_id': text_id, 'text': ''.join(words)}
            labels = []
            for mention, offset, confidence in found:
                if mention in stocks:
                    detail = stocks[mention]
                    labels.append({
                        "kb_id": detail['kb_id'],
                        "mention": mention,
                        "offset": offset,
                        "confidence": confidence})

            if labels:
                item["lab_result"] = labels

            json_data.append(item)

        return json_data

# Load both corpus splits with the tokenization settings chosen in `params`.
train_x, train_y = NerCorpus.load_data(subset_name='train', **params)
valid_x, valid_y = NerCorpus.load_data(subset_name='valid', **params)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=15718.0), HTML(value='')))

DEBUG:root:loaded 15718 samples from ./drive/My Drive/fwwb_workspace/data_from_fwwb/train.json. Sample:
x[0]: ['市场', '投资', '而言', ',', '广发证券', '策略', '研究', '首席', '分析师', '戴康', '表示', ',', '二季度', '市场', '有', '三个', '预期', '差会', '得到', '纠偏']
y[0]: ['O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']





HBox(children=(FloatProgress(value=0.0, max=1630.0), HTML(value='')))

DEBUG:root:loaded 1630 samples from ./drive/My Drive/fwwb_workspace/data_from_fwwb/dev.json. Sample:
x[0]: ['在', '2018', '年', '四季度', 'A股', '市场', '震荡', '盘整', '行情', '中', '，', '公募', '基金', '加仓', '的', '个股', '主要', '是', '细分', '行业龙头', '股', '，', '例如', '税控', '信息', '龙头股', '航天信息', '、', '医疗器械', '龙头股', '迈瑞', '医疗', '、', '金融', '软件', '龙头股', '恒生电子', '、', '传媒', '娱乐', '龙头股', '芒果超媒', '等', '均', '被', '不同', '程度', '的', '增持']
y[0]: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']





Still, let's peek at some data to check that it was processed correctly.

In [20]:
# Print the first few training samples next to their IOB tags.
n_items = 5
# Slicing keeps this safe when fewer than n_items samples are available.
for i, (x_i, y_i) in enumerate(zip(train_x[:n_items], train_y[:n_items])):
    print(f"x[{i}]({len(x_i)}): {x_i}")
    # label the tag sequence as y (it was printed as "x" before)
    print(f"y[{i}]({len(y_i)}): {y_i}")

x[0](20): ['市场', '投资', '而言', ',', '广发证券', '策略', '研究', '首席', '分析师', '戴康', '表示', ',', '二季度', '市场', '有', '三个', '预期', '差会', '得到', '纠偏']
x[0](20): ['O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
x[1](10): ['大名城', 'B', '：', '关于', '股东', '办理', '股票', '质押', '的', '公告']
x[1](10): ['B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
x[2](46): ['报', '7.04', '元', '，', '涨', '4.30%', '；', '驰宏锌锗', '报', '4.40', '元', '，', '涨', '4.27%', '；', '罗平', '锌', '电报', '6.94', '元', '，', '涨', '4.20%', '；', '西部资源', '报', '3.49', '元', '，', '涨', '4.18%', '。', '注', '：', '以上', '信息', '仅供参考', '，', '不', '对', '您', '构成', '任何', '投资', '建议', '。']
x[2](46): ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
x[3](69): ['《', '《', '《', '黑山谷', '旅游', '地产', '项目', '进展', '情况', '◎', '2012', '年', '12', '月', '8', '日'

And don't forget the complementary corpus. First, transform it into a JSON-like structure, then feed it to the same class.

In [21]:
# Each line of the complementary corpus is "<mention>\t<text>".
comp_path = os.path.join(WORKSPACE_PATH, "data_from_fwwb", "raw_texts.txt")

json_data = []

# Read explicitly as UTF-8, like every other file in this notebook, instead
# of relying on the platform default encoding.
with open(comp_path, 'r', encoding='utf-8') as file:
    for ne_text in file:
        ne_text = ne_text.strip()
        if not ne_text:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        # split on the first tab only, so a tab inside the text is preserved
        mention, text = ne_text.split('\t', 1)
        # the corpus gives no offset; use the first occurrence of the mention
        offset = text.index(mention)
        lab_result = [{'mention': mention, 'offset': offset}]
        item = {"text_id": -1, "text": text, "lab_result": lab_result}
        json_data.append(item)

comp_x, comp_y = NerCorpus.json2iob(json_data, **params)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=211793.0), HTML(value='')))




Checking...

In [22]:
# Print the first few complementary samples next to their IOB tags.
n_items = 5
# Slicing keeps this safe when fewer than n_items samples are available.
for i, (x_i, y_i) in enumerate(zip(comp_x[:n_items], comp_y[:n_items])):
    print(f"x[{i}]({len(x_i)}): {x_i}")
    # label the tag sequence as y (it was printed as "x" before)
    print(f"y[{i}]({len(y_i)}): {y_i}")

x[0](48): ['截至', '3', '月', '3', '日', '晚间', ',', '大智慧', '公告', '称', '累计', '收到', '上海市第一中级人民法院', '发来', '的', '民事诉讼', '《', '应诉', '通知书', '》', '及', '相关', '法律文书', '合计', '960', '例', ',', '法院', '已', '受理', '的', '原', '告诉', '大智慧', '证券', '虚假', '陈述', '责任', '纠纷案', '所涉', '诉讼请求', '金额', '合计', '为', '18', ',', '507.16', '万元']
x[0](48): ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
x[1](42): ['大商所', '油脂', '油料', '期货', '、', '期权', '总成交', '3.82', '亿手', '（', '单边', '，', '下同', '）', '，', '日均', '成交', '189.83', '万手', '，', '日均', '成交额', '660.68', '亿元', '，', '日均', '持仓量', '280.15', '万手', '，', '成交量', '、', '持仓量', '分别', '占', '国内', '农产品', '期货市场', '的', '40.72%', '、', '43.22%']
x[1](42): ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 

## Building Model
After data preprocessing, we come to model building section.

### Kashgari
First, to quickly test a baseline, we use the kashgari library with a basic embedding layer, BiLSTM and CRF.

In [0]:
# Number of epochs passed to every training run below.
MAX_EPOCHS = 10
# Passed as the `checkpoint` flag of load_or_train.
CHECKPOINT = True

# Root directories for finished models and for checkpoints; load_or_train
# appends the model name to each of them.
MODEL_SAVE_ROOT_PATH = os.path.join(WORKSPACE_PATH, "models")
CP_ROOT_PATH = os.path.join(WORKSPACE_PATH, 'checkpoints')

#### BiLSTM_Model

In [0]:
import kashgari
from kashgari.embeddings import BERTEmbedding
from kashgari.utils import load_model
from keras.callbacks import ModelCheckpoint


def load_or_train(model, model_name, train_x, train_y, valid_x, valid_y, use_bert=False, checkpoint=False, epochs=10):
    """
    Return a trained model, reusing saved work when it exists.

    Resolution order:
      1. a directory for ``model_name`` under MODEL_SAVE_ROOT_PATH exists:
         load it and return immediately, without training;
      2. ``checkpoint`` is set and a directory for ``model_name`` under
         CP_ROOT_PATH exists: load and compile that model, then train;
      3. otherwise train the ``model`` that was passed in.

    Args:
        model: model instance used when nothing can be loaded.
        model_name: sub-directory name under the model / checkpoint roots.
        train_x, train_y: training tokens and tags.
        valid_x, valid_y: validation tokens and tags.
        use_bert: give a fresh model a BERTEmbedding built from BERT_PATH.
        checkpoint: build and save a fresh model to the checkpoint directory
            and add a ModelCheckpoint callback during training.
        epochs: number of training epochs.

    Returns:
        the loaded or freshly trained model.
    """
    final_dir = os.path.join(MODEL_SAVE_ROOT_PATH, model_name)
    ckpt_dir = os.path.join(CP_ROOT_PATH, model_name)

    # Case 1: a finished model is already on disk.
    if os.path.isdir(final_dir):
        logging.info("previous trained model found, model loaded")
        return load_model(final_dir)

    if checkpoint and os.path.isdir(ckpt_dir):
        # Case 2: resume from the checkpoint directory.
        logging.info("checkpoint found, continue the training steps")
        model = load_model(ckpt_dir)
        model.compile_model()
    else:
        # Case 3: brand new model.
        if use_bert:
            logging.info("using BERT as embedding layer")
            model.embedding = BERTEmbedding(BERT_PATH, task=kashgari.LABELING, sequence_length=100)

        if checkpoint:
            # build and save once so the checkpoint directory exists
            model.build_model(train_x, train_y, valid_x, valid_y)
            model.save(ckpt_dir)

    if checkpoint:
        # NOTE(review): the weights file is written inside the checkpoint
        # directory, presumably so load_model(ckpt_dir) picks it up -- confirm.
        weights_file = os.path.join(ckpt_dir, 'model_weights.h5')
        callbacks = [ModelCheckpoint(weights_file, monitor='acc', verbose=1, save_best_only=True)]
    else:
        callbacks = []

    fit_options = dict(
        x_validate=valid_x,
        y_validate=valid_y,
        epochs=epochs,
        callbacks=callbacks,
        batch_size=512,
    )
    model.fit(train_x, train_y, **fit_options)
    model.save(final_dir)

    return model

In [0]:
from kashgari.tasks.labeling import BiLSTM_Model
# BiLSTM tagger trained on the NED train set plus the complementary corpus.
model = BiLSTM_Model()
model_name = f"bilstm_with_{MODE}"

model = load_or_train(
    model,
    model_name,
    train_x + comp_x,
    train_y + comp_y,
    valid_x,
    valid_y,
    use_bert=USE_BERT,
    checkpoint=CHECKPOINT,
    epochs=MAX_EPOCHS,
)

DEBUG:root:need to build after build_word2idx
INFO:root:checkpoint found, continue the training steps
DEBUG:root:need to build after build_word2idx


Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, 100)]        0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, 100)]        0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 100, 768), ( 16226304    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 100, 768)     1536        Input-Segment[0][0]              
____________________________________________________________________________________________

INFO:root:model saved to /content/drive/My Drive/fwwb_workspace/models/bilstm_with_char_bert


Well, let's check it out...    
Correct results can make you proud, but that's all they do; what we really need to examine are the wrong results.

In [0]:
def score(model, true_x, true_y):
    """
    Compare predicted mentions with the true mentions, sample by sample.

    Every sample whose mention lists differ is printed. A sample counts as
    correct when both lists are equal, as "null" when they differ and the
    prediction is empty, and as a mistake otherwise. Accuracy and both error
    counts are printed at the end; nothing is returned.

    Args:
        model: object whose ``predict(true_x)`` yields one tag sequence per sample.
        true_x: token sequences.
        true_y: reference IOB tag sequences.
    """
    pred_y = model.predict(true_x)

    def mentions_of(item):
        # mention strings of one sample; empty when it has no 'lab_result'
        return [label['mention'] for label in item.get('lab_result', [])]

    n_correct = n_mistake = n_null = 0
    true_json = NerCorpus.iob2json(true_x, true_y)
    pred_json = NerCorpus.iob2json(true_x, pred_y)

    for gold_item, pred_item, gold_tags, pred_tags in zip(true_json, pred_json, true_y, pred_y):
        text = gold_item["text"]
        gold = mentions_of(gold_item)
        predicted = mentions_of(pred_item)

        if gold == predicted:
            n_correct += 1
            continue

        # mismatch: show the sample before classifying the error
        print(f"\nText:{text}")
        print("----------------------------------------")
        print(f"Predicted Mentions: {predicted}")
        print(f"True Mentions: {gold}")
        print(f"Predicted IOB: {pred_tags}")
        print(f"True IOB: {gold_tags}")

        if not predicted:
            n_null += 1
        else:
            n_mistake += 1

    accuracy = round(n_correct / len(pred_y) * 100, 4)
    print(f"\nAcc: {accuracy}%")
    print(f"Count of mistake predictions: {n_mistake}")
    print(f"Count of null predictions: {n_null}")

In [0]:
# Evaluate the current model on the validation set.
score(model, true_x=valid_x, true_y=valid_y)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=1630), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1630), HTML(value='')))



Text:”量化护航多维度掘金投资机会据悉,在进入浙商担任衍生品及量化投资总经理前,查晓磊曾任博时基金博士后研究员、策略分析师,具备深厚的量化研究功底及丰富的量化投资研究从业经验,目前正在发行中的浙商大数据智选消费,也注入了其所擅长的量化投资因子
----------------------------------------
Predicted Mentions: []
True Mentions: ['金博士']
Predicted IOB: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
True IOB: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 

Not bad, the accuracy is above 90%. But there's still some work to do...

#### BiLSTM_CRF_Model

In [25]:
from kashgari.tasks.labeling import BiLSTM_CRF_Model

# BiLSTM + CRF tagger trained on the NED train set plus the complementary corpus.
model = BiLSTM_CRF_Model()
model_name = f"bilstm_crf_with_{MODE}"

model = load_or_train(
    model,
    model_name,
    train_x + comp_x,
    train_y + comp_y,
    valid_x,
    valid_y,
    use_bert=USE_BERT,
    checkpoint=CHECKPOINT,
    epochs=MAX_EPOCHS,
)

DEBUG:root:need to build after build_word2idx
INFO:root:previous trained model found, model loaded
DEBUG:root:need to build after build_word2idx




In [26]:
# Evaluate the current model on the validation set.
score(model, true_x=valid_x, true_y=valid_y)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=1630.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1630.0), HTML(value='')))



Text:23日早盘，海南板块快速走强。截至发稿，海南橡胶涨停，*ST罗顿触及涨停，海南瑞泽、罗牛山、海南高速等个股涨超3%。(责任编辑：DF302)
----------------------------------------
Predicted Mentions: ['罗牛山']
True Mentions: ['海南高速']
Predicted IOB: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
True IOB: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Text:0.80元，涨7.62%；海航基础报5.47元，涨7.05%；世嘉科技报51.16元，涨5.31%；天目湖报32.58元，涨5.27%；益丰药房报55.33元，涨4.99%。注：以上信息仅供参考，不对
----------------------------------------
Predicted Mentions: ['海航基础', '天目湖']
True Mentions: ['天目湖']
Predicted IOB: ['O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',

#### BiGRU_Model

In [0]:
from kashgari.tasks.labeling import BiGRU_Model

# BiGRU tagger trained on the NED train set plus the complementary corpus.
model = BiGRU_Model()
model_name = f"bigru_with_{MODE}"

model = load_or_train(
    model,
    model_name,
    train_x + comp_x,
    train_y + comp_y,
    valid_x,
    valid_y,
    use_bert=USE_BERT,
    checkpoint=CHECKPOINT,
    epochs=MAX_EPOCHS,
)

DEBUG:root:need to build after build_word2idx
INFO:root:using BERT as embedding layer
DEBUG:root:build label2idx dict finished, contains 4 labels.


Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, 100)]        0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, 100)]        0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 100, 768), ( 16226304    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 100, 768)     1536        Input-Segment[0][0]              
____________________________________________________________________________________________

INFO:root:model saved to /content/drive/My Drive/fwwb_workspace/checkpoints/bigru_with_char_bert


Epoch 1/10
  4/445 [..............................] - ETA: 16:38 - loss: 0.0155 - acc: 0.9951
Epoch 00001: acc improved from -inf to 0.98577, saving model to ./drive/My Drive/fwwb_workspace/checkpoints/bigru_with_char_bert/model_weights.h5
Epoch 2/10
  4/445 [..............................] - ETA: 10:16 - loss: 0.0074 - acc: 0.9975
Epoch 00002: acc improved from 0.98577 to 0.99702, saving model to ./drive/My Drive/fwwb_workspace/checkpoints/bigru_with_char_bert/model_weights.h5
Epoch 3/10
  4/445 [..............................] - ETA: 10:19 - loss: 0.0069 - acc: 0.9978
Epoch 00003: acc improved from 0.99702 to 0.99809, saving model to ./drive/My Drive/fwwb_workspace/checkpoints/bigru_with_char_bert/model_weights.h5
Epoch 4/10
  4/445 [..............................] - ETA: 10:18 - loss: 0.0049 - acc: 0.9983
Epoch 00004: acc improved from 0.99809 to 0.99850, saving model to ./drive/My Drive/fwwb_workspace/checkpoints/bigru_with_char_bert/model_weights.h5
Epoch 5/10
  4/445 [...........

INFO:root:model saved to /content/drive/My Drive/fwwb_workspace/models/bigru_with_char_bert


In [0]:
# Evaluate the current model on the validation set.
score(model, true_x=valid_x, true_y=valid_y)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=1630), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1630), HTML(value='')))



Text:”量化护航多维度掘金投资机会据悉,在进入浙商担任衍生品及量化投资总经理前,查晓磊曾任博时基金博士后研究员、策略分析师,具备深厚的量化研究功底及丰富的量化投资研究从业经验,目前正在发行中的浙商大数据智选消费,也注入了其所擅长的量化投资因子
----------------------------------------
Predicted Mentions: []
True Mentions: ['金博士']
Predicted IOB: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
True IOB: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 

#### BiGRU_CRF_Model

In [0]:
from kashgari.tasks.labeling import BiGRU_CRF_Model

# BiGRU + CRF tagger trained on the NED train set plus the complementary corpus.
model = BiGRU_CRF_Model()
model_name = f"bigru_crf_with_{MODE}"

model = load_or_train(
    model,
    model_name,
    train_x + comp_x,
    train_y + comp_y,
    valid_x,
    valid_y,
    use_bert=USE_BERT,
    checkpoint=CHECKPOINT,
    epochs=MAX_EPOCHS,
)

DEBUG:root:need to build after build_word2idx
INFO:root:previous trained model found, model loaded
DEBUG:root:need to build after build_word2idx


In [0]:
# Evaluate the current model on the validation set.
score(model, true_x=valid_x, true_y=valid_y)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=1630), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1630), HTML(value='')))



Text:。盘面上，稀土永磁概念股全线杀跌，软件板块掀涨停潮，中国软件等多股涨停，海南板块午后爆发，罗牛山、大东海A等多股封板。海通证券则指出，目前外围环境依旧复杂多变，国内高频经济数据也较为低迷，指数在目
----------------------------------------
Predicted Mentions: ['罗牛山', '大东海']
True Mentions: ['大东海']
Predicted IOB: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'O', 'B', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
True IOB: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'O', 'O', 'O', 'O'

#### CNN_LSTM_Model

In [0]:
from kashgari.tasks.labeling import CNN_LSTM_Model

# CNN + LSTM tagger trained on the NED train set plus the complementary corpus.
model = CNN_LSTM_Model()
model_name = f"cnn_lstm_with_{MODE}"

model = load_or_train(
    model,
    model_name,
    train_x + comp_x,
    train_y + comp_y,
    valid_x,
    valid_y,
    use_bert=USE_BERT,
    checkpoint=CHECKPOINT,
    epochs=MAX_EPOCHS,
)

DEBUG:root:need to build after build_word2idx
INFO:root:checkpoint found, continue the training steps
DEBUG:root:need to build after build_word2idx


Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, 100)]        0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, 100)]        0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 100, 768), ( 16226304    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 100, 768)     1536        Input-Segment[0][0]              
____________________________________________________________________________________________

INFO:root:model saved to /content/drive/My Drive/fwwb_workspace/models/cnn_lstm_with_char_bert


In [0]:
# Evaluate the current model on the validation set.
score(model, true_x=valid_x, true_y=valid_y)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=1630), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1630), HTML(value='')))



Text:板，国泰君安、招商证券下探近3%。市北高新领涨地产板块，新黄浦、泛海控股涨幅超6%，阳光城、金融街、大名城等80逾股上涨。（文章来源：中国网地产）(责任编辑：DF052)
----------------------------------------
Predicted Mentions: ['阳光城', '大名城']
True Mentions: ['大名城']
Predicted IOB: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'O', 'B', 'O', 'O', 'O', 'B', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
True IOB: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '

### Sklearn
Now let's build a sklearn model with manually extracted features.

In [0]:
# Release some memory. The name may already be gone (cell re-run, or no
# model was ever trained), which must not abort the notebook.
try:
    del model
except NameError:
    pass

In [0]:
from sklearn.linear_model import Perceptron
from sklearn.feature_extraction import DictVectorizer
from tqdm import tqdm_notebook


def feature_detector(tokens, index, history):
    # pad the sequence with placeholders
    tokens = ["__START2__", "__START1__"] + tokens + ["__END1__", "__END2__"]
    history = ["__START2__", "__START1__"] + history

    # shift the index with 2, to accommodate the padding
    index += 2

    word = tokens[index]
    prevword = tokens[index - 1]
    prevprevword = tokens[index - 2]
    nextword = tokens[index + 1]
    nextnextword = tokens[index + 2]
    previob = history[-1]
    prevpreviob = history[-2]

    feat_dict = {
        'word': word,
        
        'prev-iob': previob,
        'prev-word': prevword,
        'prev-prev-iob': prevpreviob,
        'prev-prev-word': prevprevword,

        'next-word': nextword,
        'next-next-word': nextnextword, 
    }

    return feat_dict


class ScikitLearnBasedModel(object):
    """Token-level IOB tagger built from a scikit-learn style classifier.

    Every token is turned into a feature dict by ``feature_detector``,
    vectorized with a ``DictVectorizer`` and classified on its own; the tags
    predicted so far are fed back as history features for the next token.
    """

    def __init__(self, clf, feature_detector):
        """Store the classifier and the feature extractor.

        Args:
            clf: classifier exposing ``partial_fit(X, y, classes=...)`` and
                ``predict(X)``.
            feature_detector: callable ``(tokens, index, history) -> dict``.
        """
        self.feature_detector = feature_detector
        self.clf = clf
        self.vec = DictVectorizer()

    def train(self, input_x, input_y, batch_size=512, epoch=5):
        """Fit the vectorizer, then train the classifier in mini-batches.

        Args:
            input_x: list of token lists, one per sentence.
            input_y: list of IOB tag lists aligned with ``input_x``.
            batch_size: number of token samples per ``partial_fit`` call.
            epoch: number of passes over the whole training data.
        """
        logging.info("extracting features...")
        samples, labels = [], []
        for sentence, tags in tqdm_notebook(zip(input_x, input_y), total=len(input_x)):
            positions = range(len(sentence))
            # During training the gold tags before each position act as history.
            samples.extend(
                self.feature_detector(sentence, pos, tags[:pos]) for pos in positions
            )
            labels.extend(tags[pos] for pos in positions)

        logging.info("vectorizing...")
        matrix = self.vec.fit_transform(samples)
        logging.info("training...")

        # partial_fit needs the complete label set up front.
        classes = list(set(labels))
        for epoch_no in range(1, epoch + 1):
            print(f"Epoch: {epoch_no}/{epoch}")
            for start in tqdm_notebook(range(0, len(labels), batch_size)):
                batch = slice(start, start + batch_size)
                self.clf.partial_fit(matrix[batch], labels[batch], classes=classes)

    def predict(self, input_x):
        """Tag every sentence token by token.

        Args:
            input_x: iterable of token lists.

        Returns:
            A list with one list of predicted tags per input sentence.
        """
        predictions = []
        for sentence in tqdm_notebook(input_x):
            tags = []
            for pos in range(len(sentence)):
                # The tags predicted so far are the history for this token.
                features = self.feature_detector(sentence, pos, list(tags))
                encoded = self.vec.transform(features)
                prediction = self.clf.predict(encoded)
                tags.append(prediction.item())
            predictions.append(tags)
        return predictions


# The classifier has to implement "partial_fit" so it can be trained batch by
# batch; otherwise training crashes for lack of memory.
clf = Perceptron() # have to implement "partial_fit", or be crashed due to memory lacking

# Train on the NED training data plus the complementary data for 10 epochs.
sk_model = ScikitLearnBasedModel(clf, feature_detector)
sk_model.train(train_x + comp_x, train_y + comp_y, epoch=10)

Then evaluate it the same way as the previous model...

In [0]:
# Evaluate the scikit-learn based model on the validation split.
# (`score`, `valid_x` and `valid_y` come from earlier notebook cells.)
score(sk_model, valid_x, valid_y)

Okay, a fair baseline

# Predict
Now for the final part: load the test data, predict, then format the results into the required structure.

## Load Test Data

In [27]:
class TestData(object):
    """Loader for the unlabeled test file used in the final prediction step.

    Each line of the file is expected to hold two tab-separated fields; the
    first one is ignored and the second one is the sentence to tokenize.
    """

    # The attribute name (including its spelling) is kept as-is because code
    # outside this class may refer to it.
    __vaild_path__ = os.path.join(WORKSPACE_PATH, 'data_from_fwwb', 'test.txt')

    @classmethod
    def tokenize(cls, data, unit, tokenize_func, pseg_func):
        """Turn raw test lines into token sequences.

        Args:
            data: iterable of raw lines, each ``<first field>\\t<sentence>``.
            unit: one of ``'char'``, ``'word'`` or ``'pos-tagged'``; selects
                which processing steps are applied to the sentence.
            tokenize_func: tokenizer applied for ``'word'`` and
                ``'pos-tagged'``.
            pseg_func: applied after ``tokenize_func`` for ``'pos-tagged'``.

        Returns:
            A list with one processed sentence per input line.

        Raises:
            KeyError: if ``unit`` is not one of the supported values.
            ValueError: if a line contains no tab separator.
        """
        pipelines = {
            'char': [list],
            'word': [tokenize_func],
            'pos-tagged': [tokenize_func, pseg_func],
        }
        pipeline = pipelines[unit]

        xs = []
        for line in data:
            # Split on the first tab only, so a sentence that itself contains
            # a tab is kept whole instead of breaking the unpacking.
            _, sent = line.strip().split('\t', 1)
            for step in pipeline:
                sent = step(sent)
            xs.append(sent)

        return xs

    @classmethod
    def render_xs(cls, unit: str = 'char', tokenize_func: Callable = None, pseg_func: Callable = None):
        """Read the test file and return its tokenized sentences.

        Args:
            unit: tokenization unit, forwarded to :meth:`tokenize`.
            tokenize_func: tokenizer, forwarded to :meth:`tokenize`.
            pseg_func: POS segmenter, forwarded to :meth:`tokenize`.

        Returns:
            The list of processed sentences, one per line of the file.
        """
        file_path = cls.__vaild_path__

        with open(file_path, 'r', encoding='utf-8') as file:
            data = file.readlines()

        x_data = cls.tokenize(data, unit, tokenize_func, pseg_func)

        # An empty file must not crash the debug message with an IndexError.
        first_sample = x_data[0] if x_data else None
        logging.debug(f"loaded {len(x_data)} samples from {file_path}. Sample:\n"
                      f"x[0]: {first_sample}")
        return x_data

# Load and tokenize the test sentences using the settings chosen in `params`.
xs = TestData.render_xs(**params)

DEBUG:root:loaded 1621 samples from ./drive/My Drive/fwwb_workspace/data_from_fwwb/test.txt. Sample:
x[0]: ['壹加壹', '表示', ',', '本次', '股权', '司法', '冻结', '不会', '对', '公司', '生产', '经营', '产生', '影响', ',', '如', '本次', '诉讼', '判决', '对', '马九虎', '不利', ',', '相关', '权利', '人', '对', '本次', '被', '冻结', '股份', '申请', '采取', '司法', '强制执行', '措施', ',', '不会', '导致', '公司', '控股', '股东', '和', '实际', '控制', '人', '的', '变更']


## Feed into the Model

In [0]:
import numpy as np

# ys collects one label sequence per sample; y_probs collects the matching
# confidences (the largest value of each prediction vector).
ys, y_probs = [], []
# Kashgari can't figure out the probabilities itself, so we compute them
# manually from the raw output of the underlying model.
# NOTE(review): an earlier cell runs `del model`; make sure the intended model
# is bound to `model` again before executing this cell.
tensor = model.embedding.process_x_dataset(xs)
pred = model.tf_model.predict(tensor)
# Invert label2idx so that a predicted index can be mapped back to its label.
ind2lab = dict((v, k) for k,v in model.embedding.processor.label2idx.items())
for sample_prob in pred:
    # NOTE(review): presumably sample_prob holds one prediction vector per
    # (padded) sequence position, so yp / y may be longer than the sentence in
    # xs; confirm that NerCorpus.iob2json copes with that.
    yp = [float(max(item)) for item in sample_prob]
    y = [ind2lab[np.argmax(item)] for item in sample_prob]
    y_probs.append(yp)
    ys.append(y)

## Format the Result

In [29]:
# Combine the tokens, predicted labels and confidences into the result
# structure (NerCorpus.iob2json is defined in an earlier notebook cell).
res = NerCorpus.iob2json(xs, ys, y_probs)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=1621.0), HTML(value='')))




## Save into JSON file

In [0]:
result_path = os.path.join(WORKSPACE_PATH, 'result.json')
# Open the file explicitly as UTF-8: with ensure_ascii=False the non-ASCII text
# is written verbatim, which fails or produces a differently encoded file when
# the platform's default encoding is not UTF-8.
with open(result_path, 'w', encoding='utf-8') as file:
    # An array of JSON objects doesn't match the key-value pair format; modify
    # the output manually if necessary.
    json.dump(res, file, indent=4, ensure_ascii=False)