# Loading Datasets

In [16]:
import json
import random
import re
from tqdm import tqdm
import requests

In [8]:
# Load the raw train/val/test splits into `datasets` (one list per split, in
# that order). At this point the posts do not contain segmented and tagged
# words yet.
datasets = []
for t in ['train', 'val', 'test']:
    # Read as UTF-8 explicitly: the files are written back with
    # ensure_ascii=False, so relying on the platform default encoding can
    # fail or mis-decode on systems whose locale is not UTF-8.
    with open('../../dataset/Twitter/raw/post/{}.json'.format(t), 'r', encoding='utf-8') as f:
        pieces = json.load(f)
    print(t, len(pieces))
    datasets.append(pieces)

train 8822
val 2943
test 2944


# Example

In [9]:
# Read the pattern-word lexicon, one entry per line, with surrounding
# whitespace removed. The lexicon contains non-ASCII entries (e.g. full-width
# punctuation), so decode it as UTF-8 explicitly instead of relying on the
# platform default encoding.
with open('./resources/pattern_words_English.txt', 'r', encoding='utf-8') as f:
    pattern_words = [line.strip() for line in f]
len(pattern_words), pattern_words[:10]

(21126, ['!', '！', '?', '？', ',', '，', '.', '。', '[', '【'])

In [13]:
# Endpoint of the hosted TexSmart service that was used for the original run.
# It seems to be unavailable now; the SDK version can be used instead.
# See https://ai.tencent.com/ailab/nlp/texsmart/zh/index.html#instructions
api = "https://texsmart.qq.com/api"

# Options sent with every request: English input, word segmentation, CRF
# POS tagging and coarse-grained CRF NER are enabled; syntactic parsing and
# SRL are disabled.
opt = dict(
    input_spec=dict(lang="en"),
    word_seg=dict(enable=True),
    pos_tagging=dict(enable=True, alg="crf"),
    ner=dict(enable=True, alg="crf", fine_grained=False),
    syntactic_parsing=dict(enable=False),
    srl=dict(enable=False),
)

In [14]:
def handle_a_text(index, text, timeout=30):
    """Send one text to the TexSmart endpoint and return the parsed response.

    Args:
        index: Value sent as ``echo_data``; callers compare it with the
            ``echo_data`` field of the response to match replies to requests.
        text: The raw text to analyse (sent as ``str``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run forever.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    req_str = json.dumps(
        {
            'str': text,
            'options': opt,
            'echo_data': index
        }
    ).encode()

    r = requests.post(api, data=req_str, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    r.raise_for_status()
    # Force UTF-8 so r.text does not depend on charset guessing.
    r.encoding = "utf-8"
    return json.loads(r.text)

In [None]:
# Pick one split at random, then one post from that split, show its label
# followed by a blank line, and display the service response for its content.
split = random.sample(datasets, 1)[0]
t = random.sample(split, 1)[0]
print(t['label'], end='\n\n')

handle_a_text(0, t['content'])

In [None]:
# An example of the response dict returned by handle_a_text for a single post:
# {'header': {'time_cost_ms': 56.465,
#   'time_cost': 0.056465,
#   'core_time_cost_ms': 56.383,
#   'ret_code': 'succ'},
#   'norm_str': 'It took a few seasons, but the first couple to sleep in the same bed on TV was Fred and Wilma. Yabba Dabba Do! ',
#   'lang': 'en',
#   'word_list': [{'str': 'It', 'hit': [0, 2, 0, 1], 'tag': 'PRP'},
#   {'str': 'took', 'hit': [3, 4, 1, 1], 'tag': 'VBD'},
#   {'str': 'a', 'hit': [8, 1, 2, 1], 'tag': 'DT'},
#   {'str': 'few', 'hit': [10, 3, 3, 1], 'tag': 'JJ'},
#   {'str': 'seasons', 'hit': [14, 7, 4, 1], 'tag': 'NNS'},
#   {'str': ',', 'hit': [21, 1, 5, 1], 'tag': ','},
#   {'str': 'but', 'hit': [23, 3, 6, 1], 'tag': 'CC'},
#   {'str': 'the', 'hit': [27, 3, 7, 1], 'tag': 'DT'},
#   {'str': 'first', 'hit': [31, 5, 8, 1], 'tag': 'JJ'},
#   {'str': 'couple', 'hit': [37, 6, 9, 1], 'tag': 'NN'},
#   {'str': 'to', 'hit': [44, 2, 10, 1], 'tag': 'IN'},
#   {'str': 'sleep', 'hit': [47, 5, 11, 1], 'tag': 'NN'},
#   {'str': 'in', 'hit': [53, 2, 12, 1], 'tag': 'IN'},
#   {'str': 'the', 'hit': [56, 3, 13, 1], 'tag': 'DT'},
#   {'str': 'same', 'hit': [60, 4, 14, 1], 'tag': 'JJ'},
#   {'str': 'bed', 'hit': [65, 3, 15, 1], 'tag': 'NN'},
#   {'str': 'on', 'hit': [69, 2, 16, 1], 'tag': 'IN'},
#   {'str': 'TV', 'hit': [72, 2, 17, 1], 'tag': 'NN'},
#   {'str': 'was', 'hit': [75, 3, 18, 1], 'tag': 'VBD'},
#   {'str': 'Fred', 'hit': [79, 4, 19, 1], 'tag': 'NNP'},
#   {'str': 'and', 'hit': [84, 3, 20, 1], 'tag': 'CC'},
#   {'str': 'Wilma', 'hit': [88, 5, 21, 1], 'tag': 'NNP'},
#   {'str': '.', 'hit': [93, 1, 22, 1], 'tag': '.'},
#   {'str': 'Yabba', 'hit': [95, 5, 23, 1], 'tag': 'NNP'},
#   {'str': 'Dabba', 'hit': [101, 5, 24, 1], 'tag': 'NNP'},
#   {'str': 'Do', 'hit': [107, 2, 25, 1], 'tag': 'NNP'},
#   {'str': '!', 'hit': [109, 1, 26, 1], 'tag': '.'}],
#   'phrase_list': [{'str': 'It', 'hit': [0, 2, 0, 1], 'tag': 'PRP'},
#   {'str': 'took', 'hit': [3, 4, 1, 1], 'tag': 'VBD'},
#   {'str': 'a', 'hit': [8, 1, 2, 1], 'tag': 'DT'},
#   {'str': 'few', 'hit': [10, 3, 3, 1], 'tag': 'JJ'},
#   {'str': 'seasons', 'hit': [14, 7, 4, 1], 'tag': 'NNS'},
#   {'str': ',', 'hit': [21, 1, 5, 1], 'tag': ','},
#   {'str': 'but', 'hit': [23, 3, 6, 1], 'tag': 'CC'},
#   {'str': 'the', 'hit': [27, 3, 7, 1], 'tag': 'DT'},
#   {'str': 'first', 'hit': [31, 5, 8, 1], 'tag': 'JJ'},
#   {'str': 'couple', 'hit': [37, 6, 9, 1], 'tag': 'NN'},
#   {'str': 'to', 'hit': [44, 2, 10, 1], 'tag': 'IN'},
#   {'str': 'sleep', 'hit': [47, 5, 11, 1], 'tag': 'NN'},
#   {'str': 'in', 'hit': [53, 2, 12, 1], 'tag': 'IN'},
#   {'str': 'the', 'hit': [56, 3, 13, 1], 'tag': 'DT'},
#   {'str': 'same', 'hit': [60, 4, 14, 1], 'tag': 'JJ'},
#   {'str': 'bed', 'hit': [65, 3, 15, 1], 'tag': 'NN'},
#   {'str': 'on TV', 'hit': [69, 5, 16, 2], 'tag': 'IN'},
#   {'str': 'was', 'hit': [75, 3, 18, 1], 'tag': 'VBD'},
#   {'str': 'Fred', 'hit': [79, 4, 19, 1], 'tag': 'JJ'},
#   {'str': 'and', 'hit': [84, 3, 20, 1], 'tag': 'CC'},
#   {'str': 'Wilma', 'hit': [88, 5, 21, 1], 'tag': 'NNP'},
#   {'str': '.', 'hit': [93, 1, 22, 1], 'tag': '.'},
#   {'str': 'Yabba Dabba Do', 'hit': [95, 14, 23, 3], 'tag': 'UH'},
#   {'str': '!', 'hit': [109, 1, 26, 1], 'tag': '.'}],
#   'entity_list': [{'str': 'Fred',
#     'hit': [79, 4, 19, 1],
#     'type': {'name': 'person.generic', 'i18n': 'person', 'path': '/'},
#     'tag': 'person.generic',
#     'tag_i18n': 'person'},
#    {'str': 'Wilma.',
#     'hit': [88, 6, 21, 2],
#     'type': {'name': 'person.generic', 'i18n': 'person', 'path': '/'},
#     'tag': 'person.generic',
#     'tag_i18n': 'person'}],
#   'syntactic_parsing_str': '',
#   'srl_str': '',
#   'echo_data': 0
# }

# Execute

In [None]:
# Convert the lexicon to a set once so the per-token membership test is O(1).
pattern_words = set(pattern_words)

for pieces in datasets:
    for i, p in enumerate(tqdm(pieces)):
        res = handle_a_text(i, p['content'])
        # The service echoes back the index we sent; a mismatch would mean
        # the response does not belong to this post.
        assert res['echo_data'] == i
        words = []

        try:
            for w in res['word_list']:
                word = w['str']
                if word in pattern_words:
                    word_type = 'PATTERN'
                elif w['tag'] in ('NNP', 'NNPS'):
                    # Proper-noun POS tags are treated as entities.
                    word_type = 'ENTITY'
                else:
                    word_type = 'OTHERS'

                # Store the category computed above. (Previously this appended
                # the unrelated global `t`, so every word got a wrong type.)
                words.append((word, word_type))
        except (KeyError, TypeError):
            # Best effort: a response without a usable 'word_list' (or with
            # malformed entries) keeps whatever words were collected so far.
            # Posts that end up with no words are printed below for
            # inspection. Other exceptions, including KeyboardInterrupt,
            # are deliberately not swallowed.
            pass

        p['words'] = words

        if len(p['words']) == 0:
            print(p)

In [9]:
# Write each split back to the file it was loaded from, now including the
# 'words' field. NOTE: this overwrites the input files in place.
for i, split_name in enumerate(['train', 'val', 'test']):
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding must be pinned to UTF-8 rather than left to the platform
    # default (which can raise UnicodeEncodeError or produce non-UTF-8 JSON).
    with open('../../dataset/Twitter/raw/post/{}.json'.format(split_name), 'w', encoding='utf-8') as f:
        json.dump(datasets[i], f, indent=4, ensure_ascii=False)