In [None]:
# Setup the google colab
# !pip install xmnlp

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
isColab = False
googlePath = "./drive/MyDrive/L101Project/" if isColab else "./"
cpuPools = 2 if isColab else 8

In [3]:
import tensorflow as tf
from tensorflow import keras
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
# tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

In [4]:
# from tensorflow.compat.v1 import ConfigProto
# from tensorflow.compat.v1 import Session

# config = ConfigProto()
# config.gpu_options.allow_growth = True
# session = Session(config=config)
# tf.compat.v1.disable_eager_execution()


In [5]:
import csv
import numpy as np
import xmnlp
import itertools
from collections import Counter
from keras.utils import to_categorical
import pandas as pd

import collections
import operator
from functools import reduce
import json
from multiprocessing import Pool
from keras.preprocessing.sequence import pad_sequences

In [6]:
# Preprocess the training and testing data.

In [7]:
nerTrain = googlePath + "./corpus/ner/ner_train.txt"
nerDev = googlePath + "./corpus/ner/ner_dev.txt"
nerTest = googlePath + "./corpus/ner/ner_test.txt"

In [8]:
train = pd.read_table(nerTrain, header=None, names=['token', 'label'])  # don't drop the empty lines yet, they show up as NaN in the data frame
dev = pd.read_table(nerDev, header=None, names=['token', 'label'])  # don't drop the empty lines yet, they show up as NaN in the data frame
test = pd.read_table(nerTest, header=None, names=['token', 'label'])  # don't drop the empty lines yet, they show up as NaN in the data frame

In [9]:
train.head(1)

Unnamed: 0,token,label
0,海,O


In [10]:
set(train["label"])

{'B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O', nan}

In [11]:
# Label translation and token idx

token_vocab = train.token.unique().tolist()
oov = len(token_vocab) 

def token_index(tok):
  ind = tok
  if not pd.isnull(tok):  # new since last time: deal with the empty lines which we didn't drop yet
    if tok in token_vocab:  # if token in vocabulary
      ind = token_vocab.index(tok)
    else:  # else it's OOV
      ind = oov
  return ind

def bio_index(bio):
  ind = bio
  if not pd.isnull(bio):  # deal with empty lines
    if bio=='B-LOC':
      ind = 0
    elif bio=='B-ORG':
      ind = 1
    elif bio=='B-PER':
      ind = 2
    elif bio=='I-LOC':
      ind = 3
    elif bio=='I-ORG':
      ind = 4
    elif bio=='I-PER':
      ind = 5
    elif bio=='O':
      ind = 6
  return ind


def bio_only_idx(bio):
  ind = bio
  if not pd.isnull(bio):  # deal with empty lines
    if bio=='B-LOC':
      ind = 0
    elif bio=='B-ORG':
      ind = 0
    elif bio=='B-PER':
      ind = 0
    elif bio=='I-LOC':
      ind = 1
    elif bio=='I-ORG':
      ind = 1
    elif bio=='I-PER':
      ind = 1
    elif bio=='O':
      ind = 2
  return ind

# pass a data frame through our feature extractor
def extract_features(txt_orig,istest=False):
  txt = txt_orig.copy()
  tokinds = [token_index(u) for u in txt['token']]
  txt['token_indices'] = tokinds
  if not istest:  # can't do this with the test set
    bioints = [bio_index(b) for b in txt['label']]
    bioints_only = [bio_only_idx(b) for b in txt['label']]
    txt['bio'] = bioints
    txt['bio_only'] = bioints_only

    return txt

train_copy = extract_features(train)
train_copy.head(n=30)

Unnamed: 0,token,label,token_indices,bio,bio_only
0,海,O,0.0,6.0,2.0
1,钓,O,1.0,6.0,2.0
2,比,O,2.0,6.0,2.0
3,赛,O,3.0,6.0,2.0
4,地,O,4.0,6.0,2.0
5,点,O,5.0,6.0,2.0
6,在,O,6.0,6.0,2.0
7,厦,B-LOC,7.0,0.0,0.0
8,门,I-LOC,8.0,3.0,1.0
9,与,O,9.0,6.0,2.0


In [12]:
def tokens2sequences(txt_orig,istest=False):

    txt = txt_orig.copy()
    txt['sequence_num'] = 0
    seqcount = 0
    for i in txt.index:  # in each row...
        txt.loc[i,'sequence_num'] = seqcount  # set the sequence number
        if pd.isnull(txt.loc[i,'token']):  # increment sequence counter at empty lines
            seqcount += 1
    # now drop the empty lines, group by sequence number and output df of sequence lists
    txt = txt.dropna()
    if istest:  # test set doesn't have labels
        txt_seqs = txt.groupby(['sequence_num'],as_index=False)[['token', 'label', 'token_indices', 'bio', "bio_only"]].agg(lambda x: list(x))
    else:
        txt_seqs = txt.groupby(['sequence_num'],as_index=False)[['token', 'label', 'token_indices', 'bio', "bio_only"]].agg(lambda x: list(x))
    return txt_seqs

print("This cell takes a little while to run: be patient :)")
train_seqs = tokens2sequences(train_copy)
train_seqs.head(1)

This cell takes a little while to run: be patient :)


Unnamed: 0,sequence_num,token,label,token_indices,bio,bio_only
0,0,"[海, 钓, 比, 赛, 地, 点, 在, 厦, 门, 与, 金, 门, 之, 间, 的, ...","[O, O, O, O, O, O, O, B-LOC, I-LOC, O, B-LOC, ...","[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...","[6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 0.0, 3.0, ...","[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 1.0, ..."


In [13]:
dev_copy = extract_features(dev)
dev_seqs = tokens2sequences(dev_copy)
test_copy = extract_features(test)
test_seqs = tokens2sequences(test_copy)

In [14]:
# train_seqs.to_csv("./trains_seq.csv", sep='\t', encoding='utf-8')
# dev_seqs.to_csv("./dev_seqs.csv", sep='\t', encoding='utf-8')
# test_seqs.to_csv("./test_seq.csv", sep='\t', encoding='utf-8')
from ast import literal_eval
train_seqs_temp = pd.read_table("./trains_seq.csv",converters={"token": literal_eval,
                                                               "token_indices": literal_eval,
                                                               "label": literal_eval,
                                                               "bio": literal_eval,
                                                               "bio_only": literal_eval})
test_seqs_temp = pd.read_table("./test_seq.csv",converters={"token": literal_eval,
                                                               "token_indices": literal_eval,
                                                               "label": literal_eval,
                                                               "bio": literal_eval,
                                                               "bio_only": literal_eval}) 
dev_seqs_temp = pd.read_table("./dev_seqs.csv",converters={"token": literal_eval,
                                                               "token_indices": literal_eval,
                                                               "label": literal_eval,
                                                               "bio": literal_eval,
                                                               "bio_only": literal_eval})

In [15]:
## Create Corpus for Chinese Word Segamentation

In [16]:
# pkuTrain = googlePath + "./corpus/ner/ner_train.txt"
# pkuTest = googlePath + "./corpus/cws/icwb2-data/testing/pku_test.utf8"

# f = open(pkuTest, "r")
# temp = f.readlines()
# con = 0
# for item in temp:
#     con+=1
#     if(item == "\n"):
#         print(con)

# import pandas as pd
# pkuTrain = googlePath + "./corpus/cws/icwb2-data/training/pku_training.utf8"
# pkuTest = googlePath + "./corpus/cws/icwb2-data/testing/pku_test.utf8"
# train = pd.read_table(pkuTrain,  encoding='utf8', header=None, names=['input'])  # don't drop the empty lines yet, they show up as NaN in the data frame
# # train.head(n=1)

# pkuTest = googlePath  + "./corpus/cws/icwb2-data/testing/pku_test.utf8"
# test = pd.read_table(pkuTest,  encoding='utf8', header=None, names=['input']) 
# test["raws"] = test["input"]
# # test.head(n=1)

In [17]:
# def postPorcess(dataset):
#     data_temp = dataset.copy()
#     tokens = data_temp["token"]
#     data_temp["raws"] 
# #     print(tokens)
#     print("".join(tokens))

# postPorcess(train_seqs_temp)

In [18]:
import ast

def prepross_data(train_Data):
    inputs = train_Data["token"][:]
    print(len(inputs))
    raws = []
    tokenList = []
    
    for item in inputs:
        tokenList.append(item)
        item = "".join(item)
        raws.append(item)
        tokens = (item)
    train_Data["raws"] = raws
    train_Data["tokenList"] = tokenList
    dictionary_train_word = list(set(reduce(operator.add,train_Data["tokenList"])))
    dictionary_train_char =  list(set(reduce(operator.add,train_Data["raws"])))    
    return train_Data, dictionary_train_char, dictionary_train_word

train, train_dic_char, train_dic_word  = prepross_data(train_seqs_temp)
dev, _, _  = prepross_data(dev_seqs_temp)
test, _, _  = prepross_data(test_seqs_temp)

20864
2318
4636


In [19]:
train.head(1)

Unnamed: 0.1,Unnamed: 0,sequence_num,token,label,token_indices,bio,bio_only,raws,tokenList
0,0,0,"[海, 钓, 比, 赛, 地, 点, 在, 厦, 门, 与, 金, 门, 之, 间, 的, ...","[O, O, O, O, O, O, O, B-LOC, I-LOC, O, B-LOC, ...","[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...","[6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 0.0, 3.0, ...","[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 1.0, ...",海钓比赛地点在厦门与金门之间的海域。,"[海, 钓, 比, 赛, 地, 点, 在, 厦, 门, 与, 金, 门, 之, 间, 的, ..."


In [20]:
# Get redical dictionary

# Get redical of a chinese character:
def str_get_redical(st):
    return xmnlp.radical(st)

def get_redical_dic_from_char_dic(char_dic):
    return list(set(str_get_redical("".join(char_dic))))

train_dic_redical = get_redical_dic_from_char_dic(train_dic_char)
print(len(train_dic_redical))

(Lazy Load) Loading model...
235


In [21]:
# Build up subcharacter dictionary
# Build up Chinese sub-character unites diactionary.
chaiZi_Dic = {}
with open(googlePath + "corpus/chaizi/chaizi-jt.txt", 'r') as f:
    reader = csv.reader(f,delimiter='\t')
    for row in reader:
        chaiZi_Dic[row[0]] = row[1]
        
def getSubChar(charCN):
    if(charCN in chaiZi_Dic.keys()):
        return chaiZi_Dic[charCN].split(" ")
#     elif charCN in train_dic_char:
#         return [charCN]
    else:
        return [None]

In [22]:
def str_get_subchar(st):
#     for item in st:
#         print(item)
#         print(getSubChar(item))
    return [getSubChar(item) for item in st]

def get_subchar_dic_from_char_dic(char_dic):
    return list(set(itertools.chain.from_iterable(str_get_subchar("".join(char_dic)))))


def get_top200_subchar_dic_from_char_dic(char_dic):
    allsub = (list(itertools.chain.from_iterable(str_get_subchar("".join(char_dic)))))
    c = Counter(allsub)
    ans = list(map(lambda x : x[0] , (c.most_common(200))))
    return ans
# train_dic_subchar = get_subchar_dic_from_char_dic(train_dic_char)
train_dic_subchar_top200 = get_top200_subchar_dic_from_char_dic(train_dic_char)


train_dic_subchar = get_subchar_dic_from_char_dic(train_dic_char)
print(len(train_dic_subchar))

1142


In [23]:
# Validate the pre-processing
def validate_data(data):
    tkList = data["tokenList"]
    raws = data["raws"]
    for (tokens, raw) in zip(tkList,raws):
#         print(tokens)
        
#         print(raw)
        temp  = "".join(tokens)
#         print(temp)
        assert(temp == raw)
        if not (temp == raw):
            print(temp)
            print(raw)
            print(tokens)
validate_data(train)

In [24]:
# print(len(train_dic_char))
# print(len(set(train_dic_char)))
# print(len(train_dic_word))
# print(len(set(train_dic_word)))
# with open(googlePath + "./corpus/cws/icwb2-data/gold/pku_training_words.utf8", 'r') as f:
#     content = f.readlines()  
#     print(len(content))

# Make bag of subchar model.


# 	letter = 
# 	letter[value] = 1



# str_to_bagOfChar("迈向充满希望的新世纪——一九九八年新年讲话（附图片１张）")[2][1]

In [25]:
# Feature extraction

oov = len(train_dic_char)
print(oov)

oov_redical = len(train_dic_redical)
print(oov_redical)

oov_subchar = len(train_dic_subchar)
print(oov_subchar)

def token_index(tok):
    ind = tok
    if tok in train_dic_char:  # if token in vocabulary
        ind = train_dic_char.index(tok)
    else:  # else it's OOV
        ind = oov
    return ind

def str_token_index(string):
    return [token_index(x) for x in (string)]




def redical_index(red):
    ind = oov_redical
    if red in train_dic_redical:  # if token in vocabulary
        ind = train_dic_redical.index(red)
    return ind

def str_red_index(string):
    return [redical_index(x) for x in str_get_redical(string)]


def subchar_index(subchar):
    ind = oov_subchar
    if subchar in train_dic_subchar:
        ind = train_dic_subchar.index(subchar)
    return ind

def str_subchar_index(string):
    return [[subchar_index(temp) for temp in x] for x in str_get_subchar(string)]

def char_to_bagOfChar(cnChar):
    ans = [0 for _ in range(len(train_dic_subchar_top200)+1)]
    for item in (getSubChar(cnChar)):
        if item in train_dic_subchar_top200:
            idx = train_dic_subchar_top200.index(item)
        else:
            idx = len(train_dic_subchar_top200)
        ans[idx] += 1
    return (ans)

def str_to_bagOfChar(cnStr):
    return [char_to_bagOfChar(cnChar) for cnChar in cnStr]
        



# # Get Begin Middle End Single sequence
# # B 0 M 1 E 2 S 3

# def str_bmes_idx(tokenList):
#     answer = []
#     for item in tokenList:
#         if len(item) == 0:
#             raise NameErro("Zero Length Word")
#         if len(item) == 1:
#             answer.append(3)
#         else:
#             answer.append(0)
#             for item in range(len(item) - 2):
#                 answer.append(1)
#             answer.append(2)
#     return answer

def extract_features(data_set, isTest=False):
    data_temp = data_set.copy()

    # Idx for chars
    with Pool(8) as p:
        tokinds = p.map(str_token_index,data_temp['raws'])
#     tokinds = [list(map(token_index, u)) for u in data_temp['raws']]

    with Pool(8) as p:
        redinds = p.map(str_red_index,data_temp['raws'])
        
    with Pool(8) as p:
        subcharidx = p.map(str_subchar_index,data_temp['raws'])
        
#     subcharidx_bag = list(m0ap(str_to_bagOfChar,data_temp['raws']))
        

    data_temp["tokenIdx"] = tokinds
    data_temp["redIdx"] = redinds
    data_temp["subcharIdx"] = subcharidx
#     data_temp["subcharIdxBag"] = subcharidx_bag
    
    # BIO
#     if(not isTest):
#         data_temp["bmes"] = [str_bmes_idx(u) for u in data_temp['tokenList']]
#         assert (list(map(len,data_temp["bmes"])) == list(map(len,data_temp["tokenIdx"])))
    
    return data_temp

4312
235
1142


In [26]:
%%time
# MultiThreading
temp = extract_features(train)
train_feature = temp.drop(columns=["token","token_indices"])
# train_feature.head(5)

CPU times: user 1.54 s, sys: 256 ms, total: 1.79 s
Wall time: 12.4 s


In [27]:
train_feature.head(1)

Unnamed: 0.1,Unnamed: 0,sequence_num,label,bio,bio_only,raws,tokenList,tokenIdx,redIdx,subcharIdx
0,0,0,"[O, O, O, O, O, O, O, B-LOC, I-LOC, O, B-LOC, ...","[6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 0.0, 3.0, ...","[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 1.0, ...",海钓比赛地点在厦门与金门之间的海域。,"[海, 钓, 比, 赛, 地, 点, 在, 厦, 门, 与, 金, 门, 之, 间, 的, ...","[166, 2518, 995, 4050, 1673, 2546, 4064, 790, ...","[121, 99, 156, 4, 131, 8, 131, 192, 201, 170, ...","[[527, 971], [566, 948], [55, 55], [810, 1021,..."


In [28]:
%%time
temp = extract_features(dev)
dev_feature = temp.drop(columns=["token","token_indices"])
temp = extract_features(test)
test_feature = temp.drop(columns=["token","token_indices"])
# test_feature.head(5)

CPU times: user 440 ms, sys: 532 ms, total: 972 ms
Wall time: 5.46 s


In [29]:
test_feature.head(5)

Unnamed: 0.1,Unnamed: 0,sequence_num,label,bio,bio_only,raws,tokenList,tokenIdx,redIdx,subcharIdx
0,0,0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, ...","[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, ...",我们变而以书会友，以书结缘，把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。,"[我, 们, 变, 而, 以, 书, 会, 友, ，, 以, 书, 结, 缘, ，, 把, ...","[4205, 10, 2053, 3285, 3856, 2545, 2936, 3958,...","[162, 220, 225, 33, 139, 86, 139, 225, 194, 13...","[[1066, 190], [152, 236], [693, 544], [1071, 1..."
1,1,1,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, ...","[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, ...",为了跟踪国际最新食品工艺、流行趋势，大量搜集海外专业书刊资料是提高技艺的捷径。,"[为, 了, 跟, 踪, 国, 际, 最, 新, 食, 品, 工, 艺, 、, 流, 行, ...","[2665, 1607, 3912, 2504, 994, 2033, 3484, 326,...","[203, 42, 9, 9, 132, 80, 25, 48, 183, 112, 24,...","[[1107, 846, 1027, 1107], [951, 926], [568, 98..."
2,2,2,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, ...","[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, ...",其中线装古籍逾千册；民国出版物几百种；珍本四册、稀见本四百余册，出版时间跨越三百余年。,"[其, 中, 线, 装, 古, 籍, 逾, 千, 册, ；, 民, 国, 出, 版, 物, ...","[1503, 2373, 1721, 2301, 2258, 324, 1980, 2754...","[69, 40, 232, 70, 112, 138, 188, 89, 227, 194,...","[[246, 1071, 357], [977, 604], [771, 718], [88..."
3,3,3,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, ...","[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, ...",有的古木交柯，春机荣欣，从诗人句中得之，而入画中，观之令人心驰。,"[有, 的, 古, 木, 交, 柯, ，, 春, 机, 荣, 欣, ，, 从, 诗, 人, ...","[2077, 2120, 2258, 3978, 861, 3957, 1137, 1310...","[95, 113, 112, 174, 176, 174, 194, 215, 174, 1...","[[1071, 846, 103], [123, 948], [954, 977], [95..."
4,4,4,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, ...","[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, ...",不过重在晋趣，略增明人气息，妙在集古有道、不露痕迹罢了。,"[不, 过, 重, 在, 晋, 趣, ，, 略, 增, 明, 人, 气, 息, ，, 妙, ...","[751, 4085, 1386, 4064, 3780, 2109, 1137, 4199...","[170, 188, 233, 131, 215, 106, 194, 190, 131, ...","[[1071, 846, 604, 1107], [848, 564], [846, 107..."


In [30]:
def find_longest_sequence(data_with_features):
#     assert (np.max(list(map(len, data_with_features["tokenIdx"])))) == (np.max(list(map(len, data_with_features["bmes"]))))
    return (np.max(list(map(len, data_with_features["tokenIdx"]))))

train_longest = find_longest_sequence(train_feature)
print(train_longest)
test_longest = find_longest_sequence(test_feature)
print(test_longest)
dev_longest = find_longest_sequence(dev_feature)
print(dev_longest)
seq_longest = np.max([train_longest,test_longest,dev_longest])
print(seq_longest)

subchar_seq_length = np.max(list(map(lambda x : len(getSubChar(x)),train_dic_char)))
subchar_seq_length = 3
subsubchar_padtok = oov_subchar + 1
seq_length = seq_longest

def padd_char(seq):
    temp_char_seqs_padded = []
    for item in seq["subcharIdx"]:
        temp_pad = pad_sequences(item, maxlen=subchar_seq_length,
                                  dtype='int32', padding='post', truncating='post', value=subsubchar_padtok)
  
        a = temp_pad
        b = [[subsubchar_padtok for i in range(subchar_seq_length)] for _ in range(0, seq_length - len(temp_pad))]
        if len(b) == 0:
            c = a
        else:
            c = np.concatenate((a, b))
        
    # print(len(c))
        temp_char_seqs_padded.append(c)
  # print(len(temp_char_seqs_padded))
    return temp_char_seqs_padded

574
577
568
577


In [31]:
train_subchar_seqs_padded = padd_char(train_feature)

In [32]:
seq_length = seq_longest

# a new dummy token index, one more than OOV
padtok = oov+1
red_padtok = oov_redical + 1
print('The padding token index is %i' % padtok)

# use pad_sequences, padding or truncating at the end of the sequence (default is 'pre')
train_seqs_padded = pad_sequences(train_feature['tokenIdx'].tolist(), maxlen=seq_length,
                                  dtype='int32', padding='post', truncating='post', value=padtok)
# print('Example of padded token sequence:')
# print(train_seqs_padded[1])


train_red_padded = pad_sequences(train_feature['redIdx'].tolist(), maxlen=seq_length,
                                  dtype='int32', padding='post', truncating='post', value=red_padtok)


train_subchar_seqs_padded = padd_char(train_feature)


# Prepare Dev set.

dev_seqs_padded = pad_sequences(dev_feature['tokenIdx'].tolist(), maxlen=seq_length,
                                  dtype='int32', padding='post', truncating='post', value=padtok)

dev_red_padded = pad_sequences(dev_feature['redIdx'].tolist(), maxlen=seq_length,
                                  dtype='int32', padding='post', truncating='post', value=red_padtok)

dev_subchar_seqs_padded = padd_char(dev_feature)



# Prepare Test set.
test_seqs_padded = pad_sequences(test_feature['tokenIdx'].tolist(), maxlen=seq_length,
                                  dtype='int32', padding='post', truncating='post', value=padtok)

test_rad_padded = pad_sequences(test_feature['redIdx'].tolist(), maxlen=seq_length,
                                  dtype='int32', padding='post', truncating='post', value=red_padtok)

test_subchar_seqs_padded = padd_char(test_feature)


The padding token index is 4313


In [33]:
# # get lists of named entity labels, padded with a null label (=3)
# # Full BIOs


# padlab = 7
# n_labs = 8

# train_labs_padded = pad_sequences(train_feature['bio'].tolist(), maxlen=seq_length,
#                                   dtype='int32', padding='post', truncating='post', value=padlab)

# # convert those labels to one-hot encoding
# train_labs_onehot = [to_categorical(i, num_classes=n_labs) for i in train_labs_padded]

# # # follow the print outputs below to see how the labels are transformed
# # print('Length of input sequence: %i' % len(train_labs_padded[1]))
# # print('Length of label sequence: %i' % len(train_labs_onehot[1]))
# # print(train_labs_padded[1][:11])
# # print(train_labs_onehot[1][:11])



# dev_labs_padded = pad_sequences(dev_feature['bio'].tolist(), maxlen=seq_length,
#                                   dtype='int32', padding='post', truncating='post', value=padlab)

# dev_labs_onehot = [to_categorical(i, num_classes=n_labs) for i in dev_labs_padded]


In [34]:
# get lists of named entity labels, padded with a null label (=3)
# Full BIOs


padlab = 3
n_labs = 4

train_labs_padded = pad_sequences(train_feature['bio_only'].tolist(), maxlen=seq_length,
                                  dtype='int32', padding='post', truncating='post', value=padlab)

# convert those labels to one-hot encoding
train_labs_onehot = [to_categorical(i, num_classes=n_labs) for i in train_labs_padded]

# # follow the print outputs below to see how the labels are transformed
# print('Length of input sequence: %i' % len(train_labs_padded[1]))
# print('Length of label sequence: %i' % len(train_labs_onehot[1]))
# print(train_labs_padded[1][:11])
# print(train_labs_onehot[1][:11])



dev_labs_padded = pad_sequences(dev_feature['bio_only'].tolist(), maxlen=seq_length,
                                  dtype='int32', padding='post', truncating='post', value=padlab)

dev_labs_onehot = [to_categorical(i, num_classes=n_labs) for i in dev_labs_padded]


In [35]:
# # use deep copy to ensure we aren't updating original values
# import copy
# train_weights_onehot = copy.deepcopy(train_labs_onehot)

# # our first-pass class weights: normal for named entities (0 and 1), down-weighted for non named entities (2 and 3)
# class_wts = [1,1,.1,.1]

# # apply our weights to the label lists
# for i,labs in enumerate(train_weights_onehot):
#     for j,lablist in enumerate(labs):
#         lablistaslist = lablist.tolist()
#         whichismax = lablistaslist.index(max(lablistaslist))
#         train_weights_onehot[i][j][whichismax] = class_wts[whichismax]
        
        
# dev_weights_onehot = copy.deepcopy(dev_labs_onehot)

# # our first-pass class weights: normal for named entities (0 and 1), down-weighted for non named entities (2 and 3)
# class_wts = [1,1,.1,.1]

# # apply our weights to the label lists
# for i,labs in enumerate(dev_weights_onehot):
#     for j,lablist in enumerate(labs):
#         lablistaslist = lablist.tolist()
#         whichismax = lablistaslist.index(max(lablistaslist))
#         dev_weights_onehot[i][j][whichismax] = class_wts[whichismax]

# # # what's this like, before and after?
# # print('Initial one-hot label encoding:')
# # print(train_labs_onehot[1][:11])

# # print('Weighted label encoding:')
# # print(dev_weights_onehot[1][:11])

In [36]:
# load Keras and TensorFlow



# our final vocab size is the padding token + 1 (OR length of vocab + OOV + PAD)
vocab_size = padtok+1
red_size = red_padtok+1
subchar_size = subsubchar_padtok + 1

print(vocab_size==len(train_dic_char)+2)
embed_size = 128 # y an embedding size of 128 (could tune this)

# list of metrics to use: true & false positives, negatives, accuracy, precision, recall, area under the curve
METRICS = [
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
]

# our model has the option for an label prediction bias, it's sequential, starts with an embedding layer, then bi-LSTM,
# a dropout layer follows for regularisation, and a dense final layer with softmax activation to output class probabilities
# we compile with the Adam optimizer at a low learning rate, use categorical cross-entropy as our loss function
def make_model(metrics = METRICS, output_bias=None):
    if output_bias is not None:        
        output_bias = tf.keras.initializers.Constant(output_bias)
    tok_input1 = keras.layers.Input(shape=(seq_length,), dtype='int32', name='tok_input1')
    red_input2 = keras.layers.Input(shape=(seq_length,), dtype='int32', name='red_input2')
    subchar_input3 = keras.layers.Input(shape=(seq_length,subchar_seq_length), dtype='int32', name='char_input3')
    emb_char = keras.layers.TimeDistributed(keras.layers.Embedding(output_dim=embed_size, input_dim=subchar_size, input_length=3,  mask_zero=True, trainable=True))(subchar_input3)
    char_enc = keras.layers.TimeDistributed(keras.layers.Bidirectional(keras.layers.LSTM(units=50, return_sequences=False, dropout=0.2,recurrent_dropout=0.2)))(emb_char)
    x1 = keras.layers.Embedding(output_dim=embed_size, input_dim=vocab_size,  input_length=seq_length,  mask_zero=True, trainable=True)(tok_input1)
    x2 = keras.layers.Embedding(output_dim=embed_size, input_dim=red_size,  input_length=seq_length, mask_zero=True, trainable=True)(red_input2)
    x_cancat = keras.layers.concatenate([x1,x2])
#     x_cancat = (x1)
    x_lstm = keras.layers.Bidirectional(keras.layers.LSTM(units=50, return_sequences=True, dropout=0.2))(x_cancat)
    x_drop = keras.layers.Dropout(0.5)(x_lstm)
    main_output = keras.layers.TimeDistributed(keras.layers.Dense(n_labs, activation='softmax', bias_initializer=output_bias))(x_drop)
    model = keras.models.Model(inputs=[tok_input1,red_input2, subchar_input3], outputs= main_output)
#     model = keras.models.Model(inputs=[tok_input1, pos_input2, char_input3], outputs= main_output)
    
    model.compile(optimizer=keras.optimizers.Adam(lr=1e-3), loss=keras.losses.CategoricalCrossentropy(), metrics=metrics)
    return model


# Old lstm model
#     model = keras.Sequential([
#         keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=seq_length, mask_zero=True, trainable=True),
#         keras.layers.Bidirectional(keras.layers.LSTM(units=50, return_sequences=True, dropout=0.2)),  # 2 directions, 50 units each, concatenated (can change this)
#         keras.layers.Dropout(0.5),
#         keras.layers.TimeDistributed(keras.layers.Dense(n_labs, activation='softmax', bias_initializer=output_bias)),
#     ])


# early stopping criteria based on area under the curve: will stop if no improvement after 10 epochs
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc', verbose=1, patience=10, mode='max', restore_best_weights=True)

# the number of training epochs we'll use, and the batch size (how many texts are input at once)
EPOCHS = 100
BATCH_SIZE = 128

print('**Defining a neural network**')
model = make_model()
model.summary()

True
**Defining a neural network**
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
tok_input1 (InputLayer)         [(None, 577)]        0                                            
__________________________________________________________________________________________________
red_input2 (InputLayer)         [(None, 577)]        0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 577, 128)     552192      tok_input1[0][0]                 
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 577, 128)     30336       red_input2[0][0]                 
___________________________________________________________

In [37]:
# %%time
# # evaluate our initial model
# results = model.evaluate(X, y, batch_size=BATCH_SIZE, verbose=0)
# print("Loss: {:0.4f}".format(results[0]))

In [38]:
# figure out the label distribution in our fixed-length texts
from collections import Counter
all_labs = [l for lab in train_labs_padded for l in lab]
label_count = Counter(all_labs)
total_labs = len(all_labs)
print(label_count)
print(total_labs)

initial_bias=[(label_count[0]/total_labs), (label_count[1]/total_labs),
              (label_count[2]/total_labs), (label_count[3]/total_labs)]

# initial_bias=[(label_count[0]/total_labs), (label_count[1]/total_labs),
#               (label_count[2]/total_labs), (label_count[3]/total_labs), (label_count[4]/total_labs),
#               (label_count[5]/total_labs), (label_count[6]/total_labs), (label_count[7]/total_labs)]

print('Initial bias:')
print(initial_bias)

Counter({3: 11059348, 2: 869087, 1: 76101, 0: 33992})
12038528
Initial bias:
[0.002823601024975811, 0.006321453918618622, 0.0721921317955152, 0.9186628132608904]


In [39]:
# prepare sequences and labels as numpy arrays, check dimensions
X = np.array(train_seqs_padded)
X_red = np.array(train_red_padded)
X_sub = np.array(train_subchar_seqs_padded)
y = np.array(train_labs_onehot)
# y = np.array(train_weights_onehot)


# print('Input sequence dimensions (n.docs, seq.length):')
# print(X.shape)
# print('Label dimensions (n.docs, seq.length, one-hot encoding of 4 NER labels):')
# print(y.shape)

In [40]:
X_dev = np.array(dev_seqs_padded)
X_red_dev = np.array(dev_red_padded)
X_sub_dev = np.array(dev_subchar_seqs_padded)
y_dev = np.array(dev_labs_onehot)
# y_dev = np.array(dev_weights_onehot)


In [None]:
# re-initiate model with bias
model = make_model(output_bias=initial_bias)
# and fit...
model.fit([X,X_red,X_sub], y, batch_size=128, epochs=50, callbacks = [early_stopping],   validation_data=([X_dev,X_red_dev,X_sub_dev], y_dev))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50

In [103]:
X_test = np.array(test_seqs_padded)
X_test_rad = np.array(test_rad_padded)
X_test_subchar = np.array(test_subchar_seqs_padded)

# preds = np.argmax(model.predict(X_test), axis=-1)
preds = np.argmax(model.predict([X_test,X_test_rad,X_test_subchar]), axis=-1)
flat_preds = [p for pred in preds for p in pred]
print(Counter(flat_preds))

Counter({3: 2455784, 2: 187794, 1: 22252, 0: 9142})


In [104]:
len(preds)

4636

In [106]:
test_seqs_temp['prediction'] = ''
for i in test_seqs_temp.index:
    this_seq_length = len(test_seqs_temp['tokenList'][i])
    test_seqs_temp['prediction'][i] = preds[i][:this_seq_length].astype(int)
# test_seqs_temp.head(1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0.1,Unnamed: 0,sequence_num,token,label,token_indices,bio,bio_only,raws,tokenList,prediction
0,0,0,"[我, 们, 变, 而, 以, 书, 会, 友, ，, 以, 书, 结, 缘, ，, 把, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[102.0, 408.0, 138.0, 44.0, 224.0, 517.0, 122....","[6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, ...","[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, ...",我们变而以书会友，以书结缘，把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。,"[我, 们, 变, 而, 以, 书, 会, 友, ，, 以, 书, 结, 缘, ，, 把, ...","[2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."


In [107]:
test_long = test_seqs_temp.set_index('sequence_num').apply(pd.Series.explode).reset_index()
# test_long.head(1)

Unnamed: 0.1,sequence_num,Unnamed: 0,token,label,token_indices,bio,bio_only,raws,tokenList,prediction
0,0,0,我,O,102,6,2,我们变而以书会友，以书结缘，把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。,我,2


In [108]:
# re-using the BIO integer-to-character function from last time
def reverse_bio(ind):
  bio = 'O'  # for any pad=3 predictions
  if ind==0:
    bio = 'B'
  elif ind==1:
    bio = 'I'
  elif ind==2:
    bio = 'O'
  return bio

bio_labs = [reverse_bio(b) for b in test_long['bio_only']]
test_long['bio_only'] = bio_labs
pred_labs = [reverse_bio(b) for b in test_long['prediction']]
test_long['prediction'] = pred_labs

test_long.head()
test_long.prediction.value_counts()

O    187803
I     22252
B      9142
Name: prediction, dtype: int64

In [109]:
def wnut_evaluate(txt):
  '''row by row entity evaluation: we evaluate by whole named entities'''
  tp = 0; fp = 0; fn = 0
  in_entity = 0
  for i in txt.index:
    if txt['prediction'][i]=='B' and txt['bio_only'][i]=='B':
      if in_entity==1:  # if there's a preceding named entity which didn't have intervening O...
        tp += 1  # count a true positive
      in_entity = 1  # start tracking this entity (don't count it until we know full span of entity)
    elif txt['prediction'][i]=='B':
      fp += 1  # if not a B in gold annotations, it's a false positive
      in_entity = 0
    elif txt['prediction'][i]=='I' and txt['bio_only'][i]=='I':
      next  # correct entity continuation: do nothing
    elif txt['prediction'][i]=='I' and txt['bio_only'][i]=='B':
      fn += 1  # if a new entity should have begun, it's a false negative
      in_entity = 0
    elif txt['prediction'][i]=='I':  # if gold is O...
      if in_entity==1:  # and if tracking an entity, then the span is too long
        fp += 1  # it's a false positive
      in_entity = 0
    elif txt['prediction'][i]=='O':
      if txt['bio_only'][i]=='B':
        fn += 1  # false negative if there's B in gold but no predicted B
        if in_entity==1:  # also check if there was a named entity in progress
          tp += 1  # count a true positive
      elif txt['bio_only'][i]=='I':
        if in_entity==1:  # if this should have been a continued named entity, the span is too short
          fn += 1  # count a false negative
      elif txt['bio_only'][i]=='O':
        if in_entity==1:  # if a named entity has ended in right place
          tp += 1  # count a true positive
      in_entity = 0

  if in_entity==1:  # catch any final named entity
    tp += 1

  print('Sum of TP and FP = %i' % (tp+fp))
  print('Sum of TP and FN = %i' % (tp+fn))
  print('True positives = %i, False positives = %i, False negatives = %i' % (tp, fp, fn))
  prec = tp / (tp+fp)
  rec = tp / (tp+fn)
  f1 = (2*(prec*rec)) / (prec+rec)
  print('Precision = %.3f, Recall = %.3f, F1 = %.3f (max=1)' % (prec, rec, f1))
 
wnut_evaluate(test_long)

Sum of TP and FP = 8652
Sum of TP and FN = 6167
True positives = 5314, False positives = 3338, False negatives = 853
Precision = 0.614, Recall = 0.862, F1 = 0.717 (max=1)


In [110]:
test_long.to_csv('lstm21epocweighted1hot.txt', sep='\t', index=False)

In [102]:
# lstm21epoc
# Sum of TP and FP = 6239
# Sum of TP and FN = 6921
# True positives = 4895, False positives = 1344, False negatives = 2026
# Precision = 0.785, Recall = 0.707, F1 = 0.744 (max=1)


In [33]:
# def postEditPred(pred):
#     print(pred)
#     ans = []
#     for item in pred:
#         if item == 0 or item == 3:
#             ans.append(1)
#         else:
#             ans.append(0)
            
#     assert len(ans) == len(pred)
#     return ans

# def splitSentence(st, pred):
#     temp_st = st
#     temp_pred = pred
#     temp_pred[0] = 2
#     buf = []
#     result = []
#     for (pre,char) in zip(temp_pred,temp_st):
#         if(pre == 0 or pre == 3):
#             result.append(buf)
#             buf = []
#         buf.append(char)
#     if(len(buf) > 0):
#         result.append(buf)
#     return result
    
    



# def sentencePrediction(dataset,prediction):
#     data_temp = dataset.copy()
#     assert len(data_temp) == len(prediction)
#     raw_sents = data_temp["raws"]
#     assert len(raw_sents) == len(prediction)
#     ans = []
#     for (st,pred) in zip(raw_sents,prediction):
#         sptSt = splitSentence(st,pred)
#         ans.append(sptSt)
#     assert len(ans) == len(data_temp)
#     data_temp["tokenList"]= ans
#     return data_temp
    
# def convertToPredSts(dataset):
#     data_temp = dataset.copy()
#     tokenList = data_temp["tokenList"]
#     ans = []
    
#     for item in tokenList:
#         temp_st = list(map(lambda x : "".join(x), item))
#         temp = "  ".join(temp_st)
#         temp = temp + "  "
#         ans.append(temp)
#     return ans

In [None]:
result_pred = sentencePrediction(test_feature,preds)
output_sts = convertToPredSts(result_pred)

f = open("./ans.txt", "w")
f.write("\n".join(output_sts))
f.close()

In [None]:
f = open(googlePath + "./this.txt", "w")
f.write("\n".join(output_sts))
f.close()