In [91]:
import os
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm

In [92]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Preprocessing
* Change input data (ex. train.txt) into CRF model input format (ex. train.data)
    * CRF model input format (ex. train.data):
        ```
        肝 O
        功 O
        能 O
        6 B-med_exam
        8 I-med_exam
        ```

In [93]:
file_path='/content/drive/MyDrive/Colab Notebooks/NER/SampleData_deid.txt'

In [94]:
def loadInputFile(path):
    trainingset = list()  # store trainingset [content,content,...]
    position = list()  # store position [article_id, start_pos, end_pos, entity_text, entity_type, ...]
    mentions = dict()  # store mentions[mention] = Type

    # new list
    word_length = list()
    word_position = list()
    with open(file_path, 'r', encoding='utf8') as f:
        file_text=f.read().encode('utf-8').decode('utf-8-sig')
    datas=file_text.split('\n\n--------------------\n\n')[:-1]
    for data in datas:
        data=data.split('\n')
        content=data[0]
        # print(content + '\n')
        trainingset.append(content)
        annotations=data[1:]
        for annot in annotations[1:]:
            
            annot=annot.split('\t') #annot= article_id, start_pos, end_pos, entity_text, entity_type
            position.extend(annot)
            mentions[annot[3]]=annot[4]

            word_length.append(int(annot[2]) - int(annot[1]))
            word_position.append(annot[1])                  
    
    return trainingset, position, mentions, word_length, word_position

ts, pos, men, word_length, word_position = loadInputFile(file_path)

print(len(men))
print(len(pos))
print(len(ts))
print(len(word_length))
print(len(word_position))
# print(pos[:20])

255
2230
26
446
446


In [116]:
def CRFFormatData(trainingset, position, path):
    if (os.path.isfile(path)):
        os.remove(path)
    outputfile = open(path, 'a', encoding= 'utf-8')

    # output file lines
    count = 0 # annotation counts in each content
    tagged = list()
    for article_id in range(len(trainingset)):
        trainingset_split = list(trainingset[article_id])
        while '' or ' ' in trainingset_split:
            if '' in trainingset_split:
                trainingset_split.remove('')
            else:
                trainingset_split.remove(' ')        
        start_tmp = 0
        for position_idx in range(0,len(position),5):
            if int(position[position_idx]) == article_id:
                count += 1
                if count == 1:
                    start_pos = int(position[position_idx+1])
                    end_pos = int(position[position_idx+2])
                    entity_type=position[position_idx+4]
                    if start_pos == 0:
                        token = list(trainingset[article_id][start_pos:end_pos])
                        whole_token = trainingset[article_id][start_pos:end_pos]
                        for token_idx in range(len(token)):
                            if len(token[token_idx].replace(' ','')) == 0:
                                continue
                            # BIO states
                            if token_idx == 0:
                                label = 'B-'+entity_type
                            else:
                                label = 'I-'+entity_type
                            
                            output_str = token[token_idx] + ' ' + label + ' ' + start_pos + ' ' + str(end_pos - start_pos) + '\n'
                            outputfile.write(output_str)

                    else:
                        token = list(trainingset[article_id][0:start_pos])
                        whole_token = trainingset[article_id][0:start_pos]
                        for token_idx in range(len(token)):
                            if len(token[token_idx].replace(' ','')) == 0:
                                continue
                            
                            output_str = token[token_idx] + ' ' + 'O' + ' ' + start_pos + ' '  + '0' + '\n'
                            outputfile.write(output_str)

                        token = list(trainingset[article_id][start_pos:end_pos])
                        whole_token = trainingset[article_id][start_pos:end_pos]
                        for token_idx in range(len(token)):
                            if len(token[token_idx].replace(' ','')) == 0:
                                continue
                            # BIO states
                            if token[0] == '':
                                if token_idx == 1:
                                    label = 'B-'+entity_type
                                else:
                                    label = 'I-'+entity_type
                            else:
                                if token_idx == 0:
                                    label = 'B-'+entity_type
                                else:
                                    label = 'I-'+entity_type

                            output_str = token[token_idx] + ' ' + label + ' ' + start_pos + ' '  + str(end_pos - start_pos) + '\n'
                            outputfile.write(output_str)

                    start_tmp = end_pos
                else:
                    start_pos = int(position[position_idx+1])
                    end_pos = int(position[position_idx+2])
                    entity_type=position[position_idx+4]
                    if start_pos<start_tmp:
                        continue
                    else:
                        token = list(trainingset[article_id][start_tmp:start_pos])
                        whole_token = trainingset[article_id][start_tmp:start_pos]
                        for token_idx in range(len(token)):
                            if len(token[token_idx].replace(' ','')) == 0:
                                continue
                            output_str = token[token_idx] + ' ' + 'O' + ' ' + start_pos + ' '  + '0' + '\n'
                            outputfile.write(output_str)

                    token = list(trainingset[article_id][start_pos:end_pos])
                    whole_token = trainingset[article_id][start_pos:end_pos]
                    for token_idx in range(len(token)):
                        if len(token[token_idx].replace(' ','')) == 0:
                            continue
                        # BIO states
                        if token[0] == '':
                            if token_idx == 1:
                                label = 'B-'+entity_type
                            else:
                                label = 'I-'+entity_type
                        else:
                            if token_idx == 0:
                                label = 'B-'+entity_type
                            else:
                                label = 'I-'+entity_type
                        
                        output_str = token[token_idx] + ' ' + label + ' ' + start_pos + ' '  + str(end_pos - start_pos) + '\n'
                        outputfile.write(output_str)
                    start_tmp = end_pos

        token = list(trainingset[article_id][start_tmp:])
        whole_token = trainingset[article_id][start_tmp:]
        for token_idx in range(len(token)):
            if len(token[token_idx].replace(' ','')) == 0:
                continue

            
            output_str = token[token_idx] + ' ' + 'O' + ' ' + start_pos + ' '  + '0' + '\n'
            outputfile.write(output_str)

        count = 0
    
        output_str = '\n'
        outputfile.write(output_str)
        ID = trainingset[article_id]

        if article_id%10 == 0:
            print('Total complete articles:', article_id)

    # close output file
    outputfile.close()

In [117]:
trainingset, position, mentions, word_length, word_position=loadInputFile(file_path)

In [118]:
data_path='/content/drive/MyDrive/Colab Notebooks/NER/sample.data'
CRFFormatData(trainingset, position, data_path)

Total complete articles: 0
Total complete articles: 10
Total complete articles: 20


## NER model
### CRF (Conditional Random Field model)
* Using `sklearn-crfsuite` API

    (you may try `CRF++`, `python-crfsuite`, `pytorch-crfsuite`(neural network version))

In [119]:
!pip install sklearn-crfsuite
import sklearn_crfsuite

from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn_crfsuite.metrics import flat_classification_report



In [120]:
def CRF(x_train, y_train, x_test, y_test):
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )
    crf.fit(x_train, y_train)
    # print(crf)
    y_pred = crf.predict(x_test)
    y_pred_mar = crf.predict_marginals(x_test)

    # print(y_pred_mar)

    labels = list(crf.classes_)
    labels.remove('O')
    f1score = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)
    sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0])) # group B and I results
    print(flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))
    return y_pred, y_pred_mar, f1score

## Model Input: 
* input features:
    * word vector: pretrained traditional chinese word embedding by Word2Vec-CBOW
    
    (you may try add some other features, ex. pos-tag, word_length, word_position, ...) 

In [121]:
import numpy as np

In [122]:
# load pretrained word vectors
# get a dict of tokens (key) and their pretrained word vectors (value)
# pretrained word2vec CBOW word vector: https://fgc.stpi.narl.org.tw/activity/videoDetail/4b1141305ddf5522015de5479f4701b1
dim = 0
word_vecs= {}
# open pretrained word vector file
with open('/content/drive/MyDrive/Colab Notebooks/NER/cna.cbow.cwe_p.tar_g.512d.0.txt') as f:
    for line in f:
        tokens = line.strip().split()

        # there 2 integers in the first line: vocabulary_size, word_vector_dim
        if len(tokens) == 2:
            dim = int(tokens[1])
            continue
    
        word = tokens[0] 
        vec = np.array([ float(t) for t in tokens[1:] ])
        word_vecs[word] = vec


In [123]:
print('vocabulary_size: ',len(word_vecs),' word_vector_dim: ',vec.shape)

vocabulary_size:  158566  word_vector_dim:  (512,)


Here we split data into training dataset and testing dataset,
however, we'll provide `development data` and `test data` which is real testing dataset.

You should upload prediction on `development data` and `test data` to system, not this splitted testing dataset.

In [124]:
# load `train.data` and separate into a list of labeled data of each text
# return:
#   data_list: a list of lists of tuples, storing tokens and labels (wrapped in tuple) of each text in `train.data`
#   traindata_list: a list of lists, storing training data_list splitted from data_list
#   testdata_list: a list of lists, storing testing data_list splitted from data_list
from sklearn.model_selection import train_test_split
def Dataset(data_path):
    with open(data_path, 'r', encoding='utf-8') as f:
        data=f.readlines()#.encode('utf-8').decode('utf-8-sig')
    data_list, data_list_tmp = list(), list()
    article_id_list=list()
    idx=0
    for row in data:
        data_tuple = tuple()
        if row == '\n':
            article_id_list.append(idx)
            idx+=1
            data_list.append(data_list_tmp)
            data_list_tmp = []
        else:
            print('Row:' + row)
            row = row.strip('\n').split(' ')
            data_tuple = (row[0], row[1], row[2])
            print(data_tuple)

            data_list_tmp.append(data_tuple)
    if len(data_list_tmp) != 0:
        data_list.append(data_list_tmp)
    
    # here we random split data into training dataset and testing dataset
    # but you should take `development data` or `test data` as testing data
    # At that time, you could just delete this line, 
    # and generate data_list of `train data` and data_list of `development/test data` by this function
    traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list=train_test_split(data_list,
                                                                                                    article_id_list,
                                                                                                    test_size=0.33,
                                                                                                    random_state=42)
    
    return data_list, traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list 

In [125]:
# look up word vectors
# turn each word into its pretrained word vector
# return a list of word vectors corresponding to each token in train.data
def Word2Vector(data_list, embedding_dict):
    embedding_list = list()

    # No Match Word (unknown word) Vector in Embedding
    unk_vector=np.random.rand(*(list(embedding_dict.values())[0].shape))

    for idx_list in range(len(data_list)):
        embedding_list_tmp = list()
        for idx_tuple in range(len(data_list[idx_list])):            
            key = data_list[idx_list][idx_tuple][0] # token
            # print(str(idx_tuple) + key)
            if key in embedding_dict:
                value = embedding_dict[key]
            else:
                value = unk_vector
            embedding_list_tmp.append(value)
        embedding_list.append(embedding_list_tmp)
    return embedding_list

In [126]:
# input features: pretrained word vectors of each token
# return a list of feature dicts, each feature dict corresponding to each token
def Feature(embed_list):
    feature_list = list()
    for idx_list in range(len(embed_list)):
        feature_list_tmp = list()
        for idx_tuple in range(len(embed_list[idx_list])):
            feature_dict = dict()
            for idx_vec in range(len(embed_list[idx_list][idx_tuple])):
                feature_dict['dim_' + str(idx_vec+1)] = embed_list[idx_list][idx_tuple][idx_vec]
                
            feature_list_tmp.append(feature_dict)

        feature_list.append(feature_list_tmp)        
    return feature_list

In [127]:
# get the labels of each tokens in train.data
# return a list of lists of labels
def Preprocess(data_list):
    label_list = list()
    for idx_list in range(len(data_list)):
        label_list_tmp = list()
        for idx_tuple in range(len(data_list[idx_list])):
            label_list_tmp.append(data_list[idx_list][idx_tuple][1])
        label_list.append(label_list_tmp)
    return label_list

## Training

In [137]:
data_list, traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list = Dataset(data_path)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

('還', 'O', '0')
Row:蠻 O 0

('蠻', 'O', '0')
Row:多 O 0

('多', 'O', '0')
Row:次 O 0

('次', 'O', '0')
Row:的 O 0

('的', 'O', '0')
Row:， O 0

('，', 'O', '0')
Row:然 O 0

('然', 'O', '0')
Row:後 O 0

('後', 'O', '0')
Row:… O 0

('…', 'O', '0')
Row:… O 0

('…', 'O', '0')
Row:應 O 0

('應', 'O', '0')
Row:該 O 0

('該', 'O', '0')
Row:就 O 0

('就', 'O', '0')
Row:是 O 0

('是', 'O', '0')
Row:約 O 0

('約', 'O', '0')
Row:了 O 0

('了', 'O', '0')
Row:蠻 O 0

('蠻', 'O', '0')
Row:多 O 0

('多', 'O', '0')
Row:次 O 0

('次', 'O', '0')
Row:的 O 0

('的', 'O', '0')
Row:， O 0

('，', 'O', '0')
Row:然 O 0

('然', 'O', '0')
Row:後 O 0

('後', 'O', '0')
Row:都 O 0

('都', 'O', '0')
Row:蠻 O 0

('蠻', 'O', '0')
Row:固 O 0

('固', 'O', '0')
Row:定 O 0

('定', 'O', '0')
Row:的 O 0

('的', 'O', '0')
Row:這 O 0

('這', 'O', '0')
Row:樣 O 0

('樣', 'O', '0')
Row:子 O 0

('子', 'O', '0')
Row:。 O 0

('。', 'O', '0')
Row:醫 O 0

('醫', 'O', '0')
Row:師 O 0

('師', 'O', '0')
Row:： O 0

('：', 'O', '0')


In [138]:
# Load Word Embedding
trainembed_list = Word2Vector(traindata_list, word_vecs)
testembed_list = Word2Vector(testdata_list, word_vecs)

# CRF - Train Data (Augmentation Data)
x_train = Feature(trainembed_list)
y_train = Preprocess(traindata_list)

# CRF - Test Data (Golden Standard)
x_test = Feature(testembed_list)
y_test = Preprocess(testdata_list)

In [139]:
y_pred, y_pred_mar, f1score = CRF(x_train, y_train, x_test, y_test)

              precision    recall  f1-score   support

  B-location      0.000     0.000     0.000        15
  I-location      0.000     0.000     0.000        41
  B-med_exam      0.500     0.030     0.057        33
  I-med_exam      1.000     0.050     0.095        80
     B-money      0.364     0.333     0.348        12
     I-money      0.353     0.171     0.231        35
      B-name      0.200     0.143     0.167         7
      I-name      0.333     0.100     0.154        10
      B-time      0.623     0.387     0.478       111
      I-time      0.801     0.442     0.569       265

   micro avg      0.683     0.291     0.408       609
   macro avg      0.417     0.166     0.210       609
weighted avg      0.656     0.291     0.375       609



In [140]:
f1score

0.37499174379701783

In [132]:
y_pred

[['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-time',
  'I-time',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-time',
  'I-time',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',


In [133]:
y_pred_mar

[[{'B-location': 1.576140989481551e-05,
   'B-med_exam': 2.0349677171010466e-05,
   'B-money': 2.1415631543444366e-06,
   'B-name': 2.5667555350755234e-08,
   'B-time': 3.826102623539972e-06,
   'I-location': 0.32574220283128313,
   'I-med_exam': 0.0005168103859106142,
   'I-money': 4.129492180684617e-05,
   'I-name': 5.299035127525697e-08,
   'I-time': 0.0033150795519423224,
   'O': 0.6703424548982803},
  {'B-location': 1.3736967582453505e-09,
   'B-med_exam': 6.184320005986601e-07,
   'B-money': 3.707641381388487e-07,
   'B-name': 7.852750626111617e-07,
   'B-time': 5.562261111413439e-07,
   'I-location': 0.0007369571068025866,
   'I-med_exam': 4.505889468051608e-06,
   'I-money': 1.9826994656517793e-06,
   'I-name': 6.300462941303267e-08,
   'I-time': 6.435235229120161e-08,
   'O': 0.9992540948762462},
  {'B-location': 1.5900049297249853e-07,
   'B-med_exam': 6.181116955826502e-06,
   'B-money': 2.0159536273437343e-06,
   'B-name': 3.113417636031172e-07,
   'B-time': 2.4692900050300

## Output data
* Change model output into `output.tsv` 
* Only accept this output format uploading to competition system

In [134]:
output="article_id\tstart_position\tend_position\tentity_text\tentity_type\n"
for test_id in range(len(y_pred)):
    pos=0
    start_pos=None
    end_pos=None
    entity_text=None
    entity_type=None
    for pred_id in range(len(y_pred[test_id])):
        if y_pred[test_id][pred_id][0]=='B':
            start_pos=pos
            entity_type=y_pred[test_id][pred_id][2:]
        elif start_pos is not None and y_pred[test_id][pred_id][0]=='I' and y_pred[test_id][pred_id+1][0]=='O':
            end_pos=pos
            entity_text=''.join([testdata_list[test_id][position][0] for position in range(start_pos,end_pos+1)])
            line=str(testdata_article_id_list[test_id])+'\t\t'+str(start_pos)+'\t\t'+str(end_pos+1)+'\t\t'+entity_text+'\t\t'+entity_type
            output+=line+'\n'
        pos+=1     

In [135]:
output_path='/content/drive/MyDrive/Colab Notebooks/NER/output.tsv'
with open(output_path,'w',encoding='utf-8') as f:
    f.write(output)

In [136]:
print(output)

article_id	start_position	end_position	entity_text	entity_type
8		52		54		前天		time
8		68		70		昨天		time
8		189		193		二十分鐘		time
8		293		295		五年		time
8		540		544		兩個禮拜		time
8		726		728		前天		time
8		730		732		前天		time
8		858		860		前天		time
8		898		900		前天		time
8		1549		1551		五天		time
8		1622		1626		五天禮拜		time
8		2352		2354		去喬		time
8		2560		2563		兩個月		time
16		51		55		九、十點		time
16		60		64		九、十點		time
16		122		124		三年		time
16		130		132		三年		time
16		247		249		三年		time
16		462		464		焦慮		name
0		1268		1271		8公分		med_exam
0		1358		1362		三多路上		time
0		2576		2578		五天		time
0		2604		2609		3月18號		time
0		2665		2670		二到禮拜四		time
24		48		51		三個月		time
24		53		55		七公		time
24		113		115		三年		time
24		141		144		三年間		time
24		1381		1384		一點點		time
24		1381		1410		一點點麻煩啦，不比這個……民眾：不能馬上拿啦。醫師：啊明明		time
24		1858		1861		二個月		time
24		1869		1872		二個月		time
24		1986		1989		五個月		time
11		18		21		九千七		money
11		61		64		零四百		money
11		67		70		零五百		money
11		83		86		九千七		money
11		135		139		三個禮拜		time
11		51

## Note
* You may try `python-crfsuite` to train an neural network for NER tagging optimized by gradient descent back propagation
    * [Documentation](https://github.com/scrapinghub/python-crfsuite)
* You may try `CRF++` tool for NER tagging by CRF model
    * [Documentation](http://taku910.github.io/crfpp/)
    * Need design feature template
    * Can only computed in CPU
* You may try other traditional chinese word embedding (ex. fasttext, bert, ...) for input features
* You may try add other features for NER model, ex. POS-tag, word_length, word_position, ...
* You should upload the prediction output on `development data` or `test data` provided later to the competition system. Note don't upload prediction output on the splitted testing dataset like this baseline example.

-----------------------------------------------------