# Loading Datasets

In [1]:
import json
import random
import re
from tqdm import tqdm


def print_in_color(s, cint=31, end='\n'):
    """Print ``s`` wrapped in an ANSI SGR escape sequence.

    Args:
        s: Value to print; it is formatted with ``str.format``.
        cint: SGR code placed in the opening escape sequence (default 31).
        end: Passed straight through to ``print`` as its ``end`` argument.
    """
    # Opening sequence carries the code; the trailing ``\x1b[0m`` resets it.
    colored = '\x1b[{}m{}\x1b[0m'.format(cint, s)
    print(colored, end=end)

In [2]:
# Load the three dataset splits in a fixed order, so that datasets[0] is
# train, datasets[1] is val and datasets[2] is test.
datasets = []
for split in ['train', 'val', 'test']:
    # Read explicitly as UTF-8: the posts contain Chinese text and emoji, and
    # the platform default encoding is not guaranteed to be UTF-8.
    with open('../../dataset/Weibo/raw/post/{}.json'.format(split), 'r', encoding='utf-8') as f:
        pieces = json.load(f)
    # Report the split size once the file handle has been released.
    print(split, len(pieces))
    datasets.append(pieces)

train 3816
val 1272
test 1274


# Example

In [3]:
from LAC import LAC

# Shared analyzer instance used by every cell below. Those cells read
# res[0] of `lac.run(...)` as the token list and res[1] as the parallel tag list.
# NOTE(review): 'rank' mode presumably returns an extra per-token field that
# this notebook never reads — confirm against the LAC documentation.
lac = LAC(mode='rank')

In [4]:
# Read the stylistic ("pattern") vocabulary, one entry per line.
# Explicit UTF-8: the file contains full-width punctuation and Chinese words,
# so relying on the platform default encoding is not portable.
with open('./resources/pattern_words_Chinese.txt', 'r', encoding='utf-8') as f:
    # Kept as a list here so the preview slice below works.
    pattern_words = [line.strip() for line in f]
len(pattern_words), pattern_words[:10]

(29809, ['!', '！', '?', '？', ',', '，', '.', '。', '[', '【'])

In [5]:
def extract_chinese(txt):
    """Return ``txt`` reduced to CJK ideographs and selected Chinese punctuation.

    Characters kept are those in U+4E00..U+9FA5 plus the marks
    。；，：“”（）、？《》. Everything else (ASCII, digits, whitespace,
    emoji, ...) is dropped; the relative order of kept characters is preserved.

    Args:
        txt: Input string.

    Returns:
        A new string made of the kept characters only (possibly empty).
    """
    keep = re.compile(
        "[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]")
    # The character class matches exactly one character per match.
    kept_chars = [match.group(0) for match in keep.finditer(txt)]
    return "".join(kept_chars)


def analysis_a_result(res):
    """Pretty-print one analyzer result with color-coded tokens.

    First prints the tokens joined back into a single string, then prints
    every token followed by a space:

    * red (default color of ``print_in_color``) when its tag is one of
      PER / LOC / ORG / TIME,
    * green (``cint=32``) when the token is listed in ``pattern_words``,
    * uncolored otherwise.

    Args:
        res: Indexable result where ``res[0]`` is the token sequence and
            ``res[1]`` is the tag sequence aligned with it.
    """
    tokens = res[0]
    print(''.join(tokens), '\n')

    for position, tag in enumerate(res[1]):
        token = tokens[position]

        if tag in ('PER', 'LOC', 'ORG', 'TIME'):
            # red for entities
            print_in_color('{}'.format(token), end=' ')
            continue

        if token in pattern_words:
            # green for stylistic tokens
            print_in_color('{}'.format(token), cint=32, end=' ')
            continue

        # everything else is printed without color
        print('{}'.format(tokens[position]), end=' ')


def handle_a_text(text):
    """Analyze ``text`` with the shared ``lac`` instance, with one fallback.

    The raw text is tried first. If that call raises, the text is reduced with
    ``extract_chinese`` and analyzed again.

    Args:
        text: Raw post content.

    Returns:
        Whatever ``lac.run`` returns for the raw text, or for the filtered
        text when the first attempt failed.

    Raises:
        Any exception raised by the fallback ``lac.run`` call propagates to
        the caller.
    """
    try:
        return lac.run(text)
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and SystemExit
        # must still be able to stop a long-running loop instead of silently
        # triggering the fallback.
        return lac.run(extract_chinese(text))

In [12]:
# Pick one split at random, then one post from that split.
t = random.choice(random.choice(datasets))
print(t['label'])
print()

# Go through handle_a_text rather than calling lac.run directly, so the demo
# uses the same fallback as the Execute cell and does not crash on a post the
# analyzer cannot process as-is.
analysis_a_result(handle_a_text(t['content']))

fake

兄弟姐妹们，大兵求您帮忙，希望能尽自己一点微薄之力号召更多有爱的人❤️伸出您爱心之手寻找失踪孩子🙏 🙏 🙏 孩子在义乌被别人拐走—急找孩子，求转，求帮忙实验小学 寻人启事 13940292999。有线索酬金10万 帮忙扩散，今天上午一个三岁多小女孩在锦绣花园小区附近被人拐走了  

兄弟姐妹们 [32m，[0m 大兵 求 您 帮忙 [32m，[0m [32m希望[0m 能 尽 自己 [32m一点[0m 微薄 之 力 号召 更多 有 爱的人 [31m❤️[0m 伸出 您 [32m爱心[0m 之 手 寻找 [32m失踪[0m 孩子🙏 🙏 🙏  孩子 在 [31m义乌[0m 被 别人 拐走 — 急 找 孩子 [32m，[0m 求 转 [32m，[0m 求 帮忙 [31m实验小学[0m   寻人启事   13940292999。 有线索 酬金 10万   帮忙 扩散 [32m，[0m [31m今天[0m [31m上午[0m 一个三岁 多 小女孩 在 [31m锦绣花园小区[0m 附近 被 人 拐走 了   

# Execute

In [8]:
# Tag every token of every post as ENTITY / PATTERN / OTHERS and store the
# list of (token, category) pairs in-place under p['words'].

# Analyzer tags that are treated as entities.
ENTITY_TAGS = ('PER', 'LOC', 'ORG', 'TIME')

# Set membership keeps the per-token lookup cheap; converting an existing set
# again is harmless, so the cell can be re-run.
pattern_words = set(pattern_words)

for pieces in datasets:
    for p in tqdm(pieces):
        res = handle_a_text(p['content'])
        words = []

        try:
            for i, tag in enumerate(res[1]):
                word = res[0][i]
                if tag in ENTITY_TAGS:
                    # entity
                    category = 'ENTITY'
                elif word in pattern_words:
                    # pattern
                    category = 'PATTERN'
                else:
                    category = 'OTHERS'

                words.append((word, category))
        except Exception as exc:
            # Best effort: keep whatever was tagged before the failure, but
            # report it instead of swallowing it silently. Catching Exception
            # (not a bare except) lets Ctrl-C interrupt the run.
            print('Tagging failed after {} token(s): {!r}'.format(len(words), exc))

        p['words'] = words

        # Surface posts that ended up with no tokens at all.
        if len(p['words']) == 0:
            print(p)

 88%|████████▊ | 3374/3816 [00:10<00:01, 370.95it/s]

{'content': 'Dream Room  🌟', 'label': 'real', 'words': []}


100%|██████████| 3816/3816 [00:11<00:00, 322.38it/s]
100%|██████████| 1272/1272 [00:04<00:00, 306.83it/s]
100%|██████████| 1274/1274 [00:04<00:00, 260.96it/s]


In [9]:
# Persist the annotated splits.
# NOTE: this overwrites the same files the loading cell reads from.
for split_index, split_name in enumerate(['train', 'val', 'test']):
    # Serialize BEFORE opening the file: mode 'w' truncates immediately, so a
    # serialization error raised afterwards would leave the dataset file empty.
    payload = json.dumps(datasets[split_index], indent=4, ensure_ascii=False)
    # ensure_ascii=False emits Chinese text and emoji verbatim, so the file
    # must be written as UTF-8 regardless of the platform default encoding.
    with open('../../dataset/Weibo/raw/post/{}.json'.format(split_name), 'w', encoding='utf-8') as f:
        f.write(payload)