In [27]:
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm

In [29]:
## Functions for converting .muc files to .conll files

import os
import re
import sys
from nltk import word_tokenize

# Flat entity: a single <ENAMEX TYPE...>...</ENAMEX> element whose content
# contains no "<" (so no tags inside); group 1 captures the entity text.
stringRegex = "<ENAMEX TYPE[^>]*>([^<]*)</ENAMEX>"
# Nested entity: an ENAMEX element whose content is made of further ENAMEX
# elements with optional text around them. The nesting is spelled out by hand,
# four tag levels deep in total. Because the outer group is repeated with "*",
# an element with empty content also matches (group 1 is then None).
nestStringRegex = "<ENAMEX TYPE[^>]*>([^<]*<ENAMEX TYPE[^>]*>([^<]*<ENAMEX TYPE[^>]*>([^<]*<ENAMEX TYPE[^>]*>[^<]*</ENAMEX>[^<]*|[^<]*)*</ENAMEX>[^<]*|[^<]*)*</ENAMEX>[^<]*)*</ENAMEX>"


def split_tokenize(sentence):
    """Split an ENAMEX-annotated sentence into words and entity chunks.

    The sentence is first cut around nested entities (``nestStringRegex``),
    then the remaining pieces are cut around flat entities (``stringRegex``).
    Text outside any entity is tokenized with ``word_tokenize``; every entity
    is kept as one token that still carries its ``<ENAMEX ...>`` markup.

    Args:
        sentence: one line of MUC-style text.

    Returns:
        list[str]: plain word tokens and whole ENAMEX chunks, in sentence order.
    """
    regex = re.compile(stringRegex)
    nestRegex = re.compile(nestStringRegex)

    # Pass 1: isolate nested entities.
    nestData = []
    i = 0
    regexMatcher = nestRegex.search(sentence, i)
    while regexMatcher is not None:
        nestData.append(sentence[i:regexMatcher.start()])
        nestData.append(regexMatcher.group())
        # Resume exactly at the end of the match. Skipping one extra character
        # lost whatever followed the closing tag whenever it was not a space
        # (e.g. ")" or the "<" of an adjacent entity).
        i = regexMatcher.end()
        regexMatcher = nestRegex.search(sentence, i)
    if i != len(sentence):
        nestData.append(sentence[i:])

    # Pass 2: inside every piece, isolate flat entities.
    data = []
    for d in nestData:
        i = 0
        regexMatcher = regex.search(d, i)
        while regexMatcher is not None:
            if nestRegex.search(d, i) is not None:
                # The piece is itself a nested entity: keep it whole.
                data.append(d)
                i = len(d)
                break
            data.append(d[i:regexMatcher.start()])
            data.append(regexMatcher.group())
            i = regexMatcher.end()
            regexMatcher = regex.search(d, i)
        if i != len(d):
            data.append(d[i:])

    # Entity chunks pass through untouched; everything else is word-tokenized
    # (surrounding whitespace is handled by the tokenizer).
    tokens = []
    for d in data:
        if "<ENAMEX" in d:
            tokens.append(d)
        else:
            tokens.extend(word_tokenize(d))
    return tokens

def merger_nest_entities(sentence):
    """Turn the first flat ENAMEX element of ``sentence`` into tagged words.

    The element's text is tokenized with ``word_tokenize``; the first word gets
    ``B-<label>`` and every following word ``I-<label>``.

    Args:
        sentence: text containing at least one flat ENAMEX element.

    Returns:
        tuple: ``(textMatch, textMerger, nestData_sent)`` where ``textMatch``
        is the matched markup, ``textMerger`` is the space-joined
        ``word_TAG`` replacement text and ``nestData_sent`` is the list of
        ``[word, tag]`` pairs.
    """
    entity = re.compile(stringRegex).search(sentence)
    textMatch = entity.group()
    label_match = re.compile("<ENAMEX TYPE=\"([^<]*)\">").search(textMatch)

    words = word_tokenize(entity.groups()[0])
    head = words[0]
    label = label_match.groups()[0]

    nestData_sent = [[head, "B-" + label]]
    nestData_sent.extend([word, "I-" + label] for word in words[1:])
    textMerger = " ".join(word + "_" + tag for word, tag in nestData_sent)

    return textMatch, textMerger, nestData_sent

def make_text(data):
    """Serialize tagged sentences as tab-separated, CoNLL-style text.

    Args:
        data: list of sentences; each sentence is a list of ``[word, tags]``
            rows where ``tags`` is a sequence of tag strings.

    Returns:
        str: one line per row (``word``, ``_``, ``_``, then the tags, all
        tab-separated) with an empty line after every sentence.
    """
    pieces = []
    for sentence in data:
        for row in sentence:
            word, tags = row[0], row[1]
            pieces.append("\t".join([word, "_", "_", *tags]) + "\n")
        pieces.append("\n")
    return "".join(pieces)

def make_form(data):
    """Convert ENAMEX-annotated sentences into CoNLL-style text.

    Every sentence is split with ``split_tokenize``. Plain tokens are kept as
    they are; entity chunks are expanded into ``word_TAG`` strings with
    ``B-``/``I-`` prefixes. For nested chunks, flat inner elements are
    rewritten in place by ``merger_nest_entities`` until no nesting is left,
    so a word collects one ``_TAG`` suffix per enclosing entity. The strings
    are then split on ``"_"`` into ``[word, tags]`` rows, the tags are
    reversed (the tag added last comes first) and every row is padded with
    ``"O"`` up to the largest tag count seen anywhere in ``data``.

    Args:
        data: iterable of sentence strings with ``<ENAMEX TYPE="...">`` markup.

    Returns:
        str: the result of ``make_text`` on the padded rows.

    Note:
        Calls ``sys.exit`` when a nested chunk matches with an empty group 1.
    """
    new_data = []
    num_maximum_entities = 0
    for d in data:
        tokens = split_tokenize(d)
        # Each token is either a plain word or a whole ENAMEX chunk (flat or nested).
        # Target layout (CoNLL 2003 style):
        # col 0     col 1       col 2           col 3       col 4 ...
        # word      POS (n/a)   Phrase (n/a)    NER_main    NER_extension
        data_sent = []
        for token in tokens:
            # Entity chunk.
            if "<ENAMEX" in token:
                nestRegex = re.compile(nestStringRegex)
                regex = re.compile(stringRegex)
                # Flat (non-nested) entity.
                if nestRegex.search(token) is None:
                    regexMatcher = regex.search(token)

                    textMatch = regexMatcher.group()
                    # Pull the label out of the opening tag.
                    g = "<ENAMEX TYPE=\"([^<]*)\">"
                    e = re.compile(g)
                    labeledEntity = e.search(textMatch)

                    # First word gets B-<label>, the rest I-<label>.
                    ws = word_tokenize(regexMatcher.groups()[0])
                    data_sent.append(ws[0] + "_" + "B-"+labeledEntity.groups()[0])
                    for w in range(1, len(ws)):
                        data_sent.append(ws[w] + "_" + "I-" + labeledEntity.groups()[0])

                # Nested entity.
                else:
                    # Group 1 is None when the outer repetition matched zero
                    # times, i.e. the element has no content at all.
                    if nestRegex.search(token).groups()[0] is None:
                        print(token)
                        sys.exit("Error: NER is not exits")

                    # Replace flat inner elements by their "word_TAG" text
                    # until the chunk is no longer nested.
                    while nestRegex.search(token) is not None and regex.search(token) is not None:
                        textMatch, textMerger, nestData_sent = merger_nest_entities(token)

                        g = token.replace(textMatch, textMerger)
                        token = g

                    # What is left is one flat element whose text already
                    # carries the inner tags; tag it once more for this level.
                    textMatch, textMerger, nestData_sent = merger_nest_entities(token)
                    for d_s in nestData_sent:
                        data_sent.append(d_s[0]+"_"+d_s[1])

            else:
                data_sent.append(token)
        pass

        # NOTE(review): "_" is the word/tag separator, so a token that itself
        # contains "_" is split here and its tail is treated as tags.
        for i in range(len(data_sent)):
            labeledEntities = data_sent[i].split("_")

            word = labeledEntities[0]
            entities = labeledEntities[1:]
            if num_maximum_entities < len(entities):
                num_maximum_entities = len(entities)
            # Tags were appended innermost-first; reverse so the last one leads.
            entities = entities[::-1]

            data_sent[i] = [word, entities]
        pass
        new_data.append(data_sent)

    # Pad every row to the same number of tag columns.
    for i in range(len(new_data)):
        for j in range(len(new_data[i])):
            while len(new_data[i][j][1]) < num_maximum_entities:
                new_data[i][j][1].append("O")

    return make_text(new_data)


def get_sentence_origin(sentence):
    """Return ``sentence`` with its ENAMEX markup removed.

    Opening tags whose TYPE consists of ``A-Z``, ``|`` or ``-`` and all closing
    tags are deleted, spaces are trimmed from both ends and each double space
    is replaced by a single one (one pass).
    """
    without_tags = re.sub(r'<ENAMEX TYPE="[A-Z|\-]*">|</ENAMEX>', '', sentence)
    trimmed = without_tags.strip(" ")
    return trimmed.replace("  ", " ")


def read_input_file(filename):
    """Read a UTF-8 text file and return its lines without newline characters.

    Empty lines are kept (as empty strings).
    """
    with open(filename, "r", encoding="utf-8") as source:
        return [raw.strip("\n") for raw in source.readlines()]


def write_output_file(filename, text):
    """Write ``text`` to ``filename`` as UTF-8, replacing any existing content."""
    with open(filename, "w", encoding="utf-8") as sink:
        sink.write(text)


def convert(path_in, path_out):
    """Convert every file in ``path_in`` to CoNLL format inside ``path_out``.

    Each input file is read with ``read_input_file``, converted with
    ``make_form`` and written under the same base name with a ``.conll``
    extension. ``path_out`` is created when missing, including parents.

    Args:
        path_in: directory holding the annotated input files.
        path_out: directory receiving the ``.conll`` files.
    """
    list_files = os.listdir(path_in)
    # makedirs also covers nested output paths, unlike os.mkdir.
    os.makedirs(path_out, exist_ok=True)
    for file in tqdm(list_files):
        data = read_input_file(os.path.join(path_in, file))
        output = make_form(data)
        # Swap the real extension instead of cutting a fixed three characters,
        # which mangled names whose extension is not three letters long.
        stem = os.path.splitext(file)[0]
        write_output_file(os.path.join(path_out, stem + ".conll"), output)
    print("Done!")

In [23]:
## Make sure dir_out exists; create it (empty) when it does not.
## input: dir_out (without a trailing /)

def checkDir(dir_out):
    """Create ``dir_out`` when it does not exist yet.

    Missing parent directories are created as well; calling this on an
    existing directory is a no-op.
    """
    # os.mkdir could not create nested paths such as "out/train".
    os.makedirs(dir_out, exist_ok=True)

In [30]:
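## Insert a space between a closing </ENAMEX> tag and a following punctuation
## mark (one of , : . ? !):  </ENAMEX>punc => </ENAMEX> punc
## input: path of the source dir, path of the destination dir (without a trailing /)
## output: None (the fixed files are written to the destination dir)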

def add_space_tag(dir_in, dir_out):
    """Copy every file of ``dir_in`` to ``dir_out`` with spaced closing tags.

    In each line, ``</ENAMEX>`` directly followed by one of ``,:.?!`` gets a
    space inserted between the tag and the punctuation mark.
    """
    closing = '</ENAMEX>'
    replacements = [(closing + mark, closing + ' ' + mark) for mark in ',:.?!']

    checkDir(dir_out)

    for file_name in tqdm(os.listdir(dir_in)):
        fixed_lines = []
        with open(dir_in + '/' + file_name, 'r', encoding='utf-8') as source:
            for line in source.readlines():
                for glued, spaced in replacements:
                    line = line.replace(glued, spaced)
                fixed_lines.append(line)
        with open(dir_out + '/' + file_name, 'w', encoding='utf-8') as sink:
            sink.writelines(fixed_lines)

In [31]:
## Move the files of dir_in into dir_out/train, dir_out/dev and dir_out/test:
## names containing 'dev' go to dev, names containing 'test' go to test, the rest to train.
## input: dir_in, dir_out (without a trailing /)

import shutil
def train_dev_test_split(dir_in, dir_out):
    """Move the files of ``dir_in`` into per-split folders under ``dir_out``.

    A file whose name contains ``dev`` goes to ``dev``; otherwise, one whose
    name contains ``test`` goes to ``test``; everything else goes to ``train``.
    Files are moved, so ``dir_in`` no longer holds them afterwards.
    """
    checkDir(dir_out)
    for split in ['train', 'test', 'dev']:
        checkDir(dir_out + '/' + split)

    for file_name in tqdm(os.listdir(dir_in)):
        if 'dev' in file_name:
            split = 'dev'
        elif 'test' in file_name:
            split = 'test'
        else:
            split = 'train'
        shutil.move(dir_in + '/' + file_name, dir_out + '/' + split + '/' + file_name)

In [22]:
## Merge all files of a directory into a single file named <dir_path>_merge.txt
## input: dir_path, the directory containing the files to merge (without a trailing /)

def merge_files(dir_path):
    """Concatenate every file of ``dir_path`` into ``<dir_path>_merge.txt``.

    Files are merged in sorted name order so the result is reproducible
    across runs and machines (``os.listdir`` order is arbitrary). A file
    whose last line lacks a newline gets one, so its last row is not fused
    with the first row of the next file.

    Args:
        dir_path: directory with the files to merge (no trailing slash).
    """
    merged = []
    for file_name in sorted(os.listdir(dir_path)):
        with open(os.path.join(dir_path, file_name), 'r', encoding='utf-8') as f:
            lines = f.readlines()
        if lines and not lines[-1].endswith('\n'):
            lines[-1] += '\n'
        merged.extend(lines)
    with open(dir_path + '_merge.txt', 'w', encoding='utf-8') as f:
        f.writelines(merged)

In [21]:
## Normalize a merged file so that every token row has at least 6 tab-separated columns
## input: file_path of the merged file, file_path_new for the normalized output

def convert2normalize(file_path, file_path_new):
    """Rewrite ``file_path`` so token rows have at least six columns.

    Per line (stripped, split on tabs):
      * 3 columns: ``O`` is appended as the fourth column;
      * 4 or 5 columns: padded with ``?`` up to 6;
      * 2 columns: replaced by an empty line;
      * 1 column or 6 and more: kept unchanged.
    The result is written to ``file_path_new``, one row per line.
    """
    print('convert ' + file_path + 'to atleast 6 columns format')
    normalized = []
    with open(file_path, 'r', encoding='utf-8') as source:
        for raw in source.readlines():
            cols = raw.strip().split('\t')
            if len(cols) == 3:
                cols.append('O')
            width = len(cols)
            if 4 <= width < 6:
                cols.extend(['?'] * (6 - width))
            elif width != 1 and width < 4:
                cols = ['']
            normalized.append('\t'.join(cols))
    with open(file_path_new, 'w', encoding='utf-8') as sink:
        sink.writelines(row + '\n' for row in normalized)

In [20]:
## Convert the 6-column format to a 2-column format (word, main NER tag)
## input: file_path of the original file, file_path_new for the output

def convert2columns2(file_path, file_path_new):
    """Reduce a multi-column file to ``word<TAB>tag`` rows.

    Rows with more than three columns keep columns 0 and 3. Every other row
    becomes an empty line; when such a row has more than one column it is
    also printed together with the file path and its line index.
    """
    print('convert ' + file_path + 'to 2 columns format')
    rows = []
    with open(file_path, 'r', encoding='utf-8') as source:
        for line_no, raw in enumerate(source.readlines()):
            cols = raw.strip().split('\t')
            if len(cols) > 3:
                rows.append(cols[0] + '\t' + cols[3])
                continue
            if len(cols) > 1:
                print(cols, file_path, line_no)
            rows.append('')
    with open(file_path_new, 'w', encoding='utf-8') as sink:
        sink.writelines(row + '\n' for row in rows)

In [19]:
## merge subtags and BI- tags
## input: name of tag origin: string
## output: name of merge-tag: string

def merge_subtag(text):
    """Collapse a detailed tag to its top-level entity name.

    Returns the first name of the list below that occurs as a substring of
    ``text`` (so ``B-``/``I-`` prefixes and sub-type suffixes are dropped), or
    ``text`` unchanged when none occurs. List order matters: ``PERSONTYPE``
    is tried before ``PERSON``.
    """
    NER = ['DATETIME', 'PERSONTYPE', 'PERSON', 'ORGANIZATION', 'PRODUCT', 'EVENT', 'LOCATION', 'URL', 'PHONENUMBER', 'QUANTITY', 'IP', 'ADDRESS', 'SKILL', 'EMAIL', 'MISCELLANEOUS']
    return next((name for name in NER if name in text), text)

In [18]:
## convert data 2 columns to merge tag format 2 columns
## input: file name origin, file_name new

def convert_merge_subtag(file_path, file_new):
    """Copy a 2-column file, replacing each tag by ``merge_subtag(tag)``.

    Lines with a single column (e.g. empty separator lines) are copied as they
    are after stripping.
    """
    print('convert ' + file_path + 'to merge subtag format')
    rows = []
    with open(file_path, 'r', encoding='utf-8') as source:
        for raw in source.readlines():
            cols = raw.strip().split('\t')
            if len(cols) > 1:
                cols[1] = merge_subtag(cols[1])
            rows.append('\t'.join(cols))
    with open(file_new, 'w', encoding='utf-8') as sink:
        sink.writelines(row + '\n' for row in rows)

In [17]:
## convert from 2 columns format to pkl file
## input: file_txt 2 columns format
## output: create pickle file with same name (change extension)

def convert2pkl(file_txt):
    """Pickle a 2-column ``word<TAB>tag`` file as a list of sentences.

    Sentences are separated by blank lines; each sentence becomes a list of
    ``(word, tag)`` tuples. The pickle is written next to the input with the
    extension replaced by ``.pkl``.

    Args:
        file_txt: path of the 2-column text file.

    Raises:
        IndexError: when a non-blank line has fewer than two columns.
    """
    print('convert ' + file_txt + 'to pkl')
    res = []
    temp = []
    with open(file_txt, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if stripped:
                cols = stripped.split('\t')
                temp.append((cols[0], cols[1]))
            else:
                # Blank (or whitespace-only) line closes the current sentence.
                if temp:
                    res.append(temp)
                temp = []
    # Keep the last sentence when the file does not end with a blank line.
    if temp:
        res.append(temp)
    # Only swap the extension: str.replace('txt', 'pkl') rewrote every "txt"
    # in the path and overwrote the input when the path contained none.
    new_file = os.path.splitext(file_txt)[0] + '.pkl'
    with open(new_file, 'wb') as f:
        pickle.dump(res, f)

In [16]:
## merge BI tag
## input: tag origin: string
## output: merge tag: string

def merge_BItag(tag):
    """Remove a leading ``B-`` or ``I-`` chunk prefix from ``tag``.

    Only the prefix is removed; ``str.replace`` used to delete ``B-``/``I-``
    anywhere in the tag, corrupting labels such as ``WEB-ID``. Tags without
    such a prefix (e.g. ``O``) are returned unchanged.
    """
    if tag.startswith(('B-', 'I-')):
        return tag[2:]
    return tag

In [15]:
## read dataset from pickle file to dataframe
## input: file path pickle, is_merge: boolean (merge BI-tag or not)
## output: dataframe

def read_dts(file_path, is_merge=True):
    """Load a pickled dataset into a flat ``word``/``tag`` DataFrame.

    Args:
        file_path: pickle holding a list of sentences, each a list of
            ``(word, tag)`` pairs.
        is_merge: when true, every tag is passed through ``merge_BItag``.

    Returns:
        pandas.DataFrame: one row per token with columns ``word`` and ``tag``.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
    with open(file_path, 'rb') as source:
        sentences = pickle.load(source)

    pairs = [pair for sentence in sentences for pair in sentence]
    words = [word for word, _ in pairs]
    tags = [tag for _, tag in pairs]

    df = pd.DataFrame(words, columns=['word'])
    df['tag'] = tags
    if is_merge:
        df['tag'] = df['tag'].apply(merge_BItag)
    return df

In [14]:
## create vocab of dataset with {word: {tag: count of tag}}
## input: dataframe
## output: vocab: dict

def create_vocab_dict(df):
    """Count, for every word, how often it carries each tag.

    Args:
        df: DataFrame with ``word`` and ``tag`` columns.

    Returns:
        dict: ``{word: {tag: count}}``; every inner dict has an entry for
        every tag present in ``df`` (0 when the word never has that tag).
    """
    zero_counts = {tag: 0 for tag in df['tag'].unique()}
    vocab = {}
    # Iterate over values rather than df['word'][i]: that is a label lookup
    # and raises KeyError as soon as the index is not 0..n-1 (e.g. after
    # filtering), besides being much slower.
    for word, tag in zip(df['word'], df['tag']):
        if word not in vocab:
            vocab[word] = zero_counts.copy()
        vocab[word][tag] += 1
    return vocab

In [33]:
import spacy
from spacy import displacy

# Entity labels passed to displaCy and the colour name paired with each one
# (by position).
# NOTE(review): confirm that displaCy accepts all of these colour names.
NER = ['DATETIME', 'PERSONTYPE', 'PERSON', 'ORGANIZATION', 'PRODUCT', 'EVENT', 'LOCATION', 'URL', 'PHONENUMBER', 'QUANTITY', 'IP', 'ADDRESS', 'SKILL', 'EMAIL', 'MISCELLANEOUS']
colors = ['Apricot', 'Brown', 'Olive', 'Teal', 'Pink', 'Black', 'Red', 'Orange', 'Yellow', 'Lime', 'Green', 'Cyan', 'Blue', 'Purple',
          'Grey']
COLORS = dict(zip(NER, colors))
OPTIONS = {'ents': NER, 'colors': COLORS}
    
## Visualize a tagged sentence with displaCy
## input: predictions as a list of (word, tag) pairs

def visualize_spacy(arr):
    """Render ``(word, tag)`` pairs as highlighted entities with displaCy.

    Words are joined with single spaces; consecutive words carrying the same
    tag are merged into one character span. Returns ``None``; nothing is
    rendered for an empty input.
    """
    if len(arr) < 1:
        return None

    text = ' '.join(word for word, _ in arr)

    spans = []
    cursor = 0
    for word, tag in arr:
        stop = cursor + len(word)
        if spans and tag == spans[-1][2]:
            # Same tag as the previous span: extend it to cover this word.
            spans[-1] = [spans[-1][0], stop, tag]
        else:
            spans.append([cursor, stop, tag])
        cursor = stop + 1  # skip the joining space

    # NOTE(review): tags are compared with the integer 0, so string tags such
    # as 'O' are never filtered out here - confirm this is intended.
    ents = [{'start': start, 'end': end, 'label': label}
            for start, end, label in spans if label != 0]
    ex = [{'text': text, 'ents': ents}]
    displacy.render(ex, manual=True, jupyter=True, style='ent', options=OPTIONS)

In [32]:
## pipeline load dts from all-muc-folder
## End-to-end pipeline: raw MUC files -> per-split pickled (word, tag) sentences.
LIST_FOLDER = ['train', 'dev', 'test']
DIR_MUC = 'MyNER21/NER-Data-Muc'  ## original MUC files; point this at your dataset
DIR_MUC_FIXED = 'demo_fixed'  ## MUC files after add_space_tag (any name)
DIR_CONLL =  'demo_conll'  ## CoNLL files written by convert (any name)
DIR_DTS =   'demo_dts'  ## receives the train/, dev/ and test/ sub-folders
print('Add space after TAG HTML END and punc:')
add_space_tag(DIR_MUC, DIR_MUC_FIXED)  ## put a space between </ENAMEX> and punctuation
print('Convert muc format to conll format:')
convert(DIR_MUC_FIXED, DIR_CONLL)  ## MUC markup -> CoNLL columns
print('Train dev test split:')
train_dev_test_split(DIR_CONLL, DIR_DTS)  ## moves the files out of DIR_CONLL into the split folders
for i in LIST_FOLDER:
    ## For each split: merge its files into DIR_DTS/<split>_merge.txt, pad the
    ## rows to 6 columns, keep word + main tag, collapse sub-tags, then pickle
    ## the result next to DIR_DTS/<split>_merge_subtag.txt.
    merge_files(DIR_DTS + '/' + i)
    convert2normalize(DIR_DTS+'/'+i+'_merge.txt', DIR_DTS+'/'+i+'_fixed.txt')
    convert2columns2(DIR_DTS+'/'+i+'_fixed.txt', DIR_DTS+'/'+i+'_2columns.txt')
    convert_merge_subtag(DIR_DTS+'/'+i+'_2columns.txt', DIR_DTS+'/'+i+'_merge_subtag.txt')
    convert2pkl(DIR_DTS+'/'+i+'_merge_subtag.txt')

100%|████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 3133.17it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:19<00:00, 78.88it/s]


Done!


100%|████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 7762.74it/s]
