In [1]:
# default_exp predefined_problems.test_data


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# Pre-defined Problems

Preprocessing functions for the pre-defined test problems.

In [6]:
# export
import re
import string
import random
import numpy as np


from bert_multitask_learning.predefined_problems.ner_data import gold_horse_ent_type_process_fn, read_ner_data, gold_horse_segment_process_fn
from bert_multitask_learning.preproc_decorator import preprocessing_fn


def get_weibo_fake_cls_fn(file_path):
    """Build the ``weibo_fake_cls`` test problem bound to ``file_path``.

    Arguments:
        file_path -- forwarded to ``read_ner_data`` as ``file_pattern``

    Returns:
        The ``weibo_fake_cls`` function wrapped by ``preprocessing_fn``.
    """
    @preprocessing_fn
    def weibo_fake_cls(params, mode):
        """Fake classification problem used to exercise multi-problem support.

        Each example is labelled '1' when its target sequence holds more
        than one distinct value and '0' otherwise. Only the first 20
        examples of the selected split are returned.

        Arguments:
            params {Params} -- params (not read by this function)
            mode {mode} -- 'train' selects the train split, anything else
                selects the eval split
        """
        split_name = 'train' if mode == 'train' else 'eval'
        split = read_ner_data(
            file_pattern=file_path,
            proc_fn=gold_horse_ent_type_process_fn)[split_name]
        sentences = split['inputs']
        tag_sequences = split['target']

        # Collapse every tag sequence into a single binary label.
        labels = []
        for tags in tag_sequences:
            has_several_tags = len(set(tags)) > 1
            labels.append('1' if has_several_tags else '0')

        return sentences[:20], labels[:20]
    return weibo_fake_cls

def get_weibo_fake_ner_fn(file_path):
    """Build the ``weibo_fake_ner`` test problem bound to ``file_path``.

    Arguments:
        file_path -- forwarded to ``read_ner_data`` as ``file_pattern``

    Returns:
        The ``weibo_fake_ner`` function wrapped by ``preprocessing_fn``.
    """
    @preprocessing_fn
    def weibo_fake_ner(params, mode):
        """Return the first 20 inputs and targets of the selected split.

        Arguments:
            params {Params} -- params (not read by this function)
            mode {mode} -- 'train' selects the train split, anything else
                selects the eval split
        """
        split_name = 'train' if mode == 'train' else 'eval'
        split = read_ner_data(
            file_pattern=file_path,
            proc_fn=gold_horse_ent_type_process_fn)[split_name]
        sentences = split['inputs']
        tag_sequences = split['target']

        return sentences[:20], tag_sequences[:20]
    return weibo_fake_ner

# def get_weibo_pretrain_fn(file_path):
#     @preprocessing_fn
#     def weibo_pretrain(params, mode):

#         sentence_split = r'[.!?。？！]'

#         data = read_ner_data(file_pattern=file_path,
#                              proc_fn=gold_horse_segment_process_fn)
#         if mode == 'train':
#             data = data['train']
#         else:
#             data = data['eval']
#         inputs_list = data['inputs']

#         segmented_list = []
#         for document in inputs_list:
#             segmented_list.append([])
#             doc_string = ''.join(document)
#             splited_doc = re.split(sentence_split, doc_string)
#             for sentence in splited_doc:
#                 if sentence:
#                     segmented_list[-1].append(list(sentence))
#         segmented_list = [doc for doc in segmented_list if doc]

#         return segmented_list
#     return weibo_pretrain


def get_weibo_fake_multi_cls_fn(file_path):
    """Build the ``weibo_fake_multi_cls`` test problem bound to ``file_path``.

    Arguments:
        file_path -- forwarded to ``read_ner_data`` as ``file_pattern``

    Returns:
        The ``weibo_fake_multi_cls`` function wrapped by ``preprocessing_fn``.
    """
    @preprocessing_fn
    def weibo_fake_multi_cls(params, mode):
        """Fake multimodal, multi-label problem built from the NER inputs.

        Every kept example becomes a dict with the original tokens under
        'text' and a random 5x10 array under 'image'. Its target is a
        random-length prefix of the lowercase alphabet (0 to 25 letters),
        given as a list of single characters.

        Arguments:
            params {Params} -- params (not read by this function)
            mode {mode} -- 'train' selects the train split, anything else
                selects the eval split

        Returns:
            (inputs, targets) for at most 20 examples.
        """
        data = read_ner_data(file_pattern=file_path,
                             proc_fn=gold_horse_ent_type_process_fn)
        split = data['train'] if mode == 'train' else data['eval']

        # Truncate BEFORE generating fake data so random arrays and labels
        # are only created for the examples that are actually returned.
        # Both RNGs are still consumed in the same order for those items.
        # Slicing the raw inputs matches the sibling problems in this file.
        texts = split['inputs'][:20]

        # fake multimodal
        inputs_list = [
            {'text': text, 'image': np.random.uniform(size=(5, 10))}
            for text in texts]

        # create fake target
        target_list = [
            list(string.ascii_lowercase[:random.randint(0, 25)])
            for _ in inputs_list]

        return inputs_list, target_list
    return weibo_fake_multi_cls


def get_weibo_masklm(file_path):
    """Build the ``weibo_masklm`` test problem bound to ``file_path``.

    Arguments:
        file_path -- forwarded to ``read_ner_data`` as ``file_pattern``

    Returns:
        The ``weibo_masklm`` generator function wrapped by
        ``preprocessing_fn``.
    """
    @preprocessing_fn
    def weibo_masklm(params, mode):
        """Yield the first 20 inputs of the selected split, one at a time.

        Every input is paired with the constant target 'a'.
        NOTE(review): 'a' looks like a placeholder label -- confirm against
        how the masked-LM problem type consumes targets.

        Arguments:
            params {Params} -- params (not read by this function)
            mode {mode} -- 'train' selects the train split, anything else
                selects the eval split
        """
        split_name = 'train' if mode == 'train' else 'eval'
        split = read_ner_data(
            file_pattern=file_path,
            proc_fn=gold_horse_ent_type_process_fn)[split_name]
        sentences = split['inputs']

        for sentence in sentences[:20]:
            yield sentence, 'a'
    return weibo_masklm
