In [1]:
import numpy as np
import pandas as pd

In [None]:
class WeiboProcessor(DataProcessor):
    """Processor for the Weibo data classification set.

    Every split is read from a comma-separated, UTF-8 encoded file inside ``data_dir``
    whose first row is a header and whose remaining rows hold ``text, label``:

    * ``train.csv``   -- training examples
    * ``test.csv``    -- dev (evaluation) examples
    * ``unlabel.csv`` -- unlabeled examples
    """

    def get_train_examples(self, data_dir):
        """Return the examples stored in ``<data_dir>/train.csv``."""
        return self._create_examples(os.path.join(data_dir, "train.csv"), "train")

    def get_dev_examples(self, data_dir):
        """Return the examples stored in ``<data_dir>/test.csv`` (used as the dev set)."""
        return self._create_examples(os.path.join(data_dir, "test.csv"), "dev")

    def get_test_examples(self, data_dir) -> List[InputExample]:
        """No separate test split is implemented; always raises ``NotImplementedError``."""
        raise NotImplementedError()

    def get_unlabeled_examples(self, data_dir) -> List[InputExample]:
        """Return the examples stored in ``<data_dir>/unlabel.csv``."""
        # Tagged "unlabeled" (not "dev") so the generated guids cannot collide with
        # those of the dev split.
        return self._create_examples(os.path.join(data_dir, "unlabel.csv"), "unlabeled")

    def get_labels(self):
        """Return the label strings as they appear in the label column of the CSV files."""
        return ["0", "1"]

    @staticmethod
    def _create_examples(path: str, set_type: str) -> List[InputExample]:
        """Read ``path`` and turn every data row into an ``InputExample``.

        :param path: path of the CSV file to read
        :param set_type: prefix for the example guids (``"<set_type>-<row index>"``)
        :return: one example per non-blank data row, in file order
        :raises ValueError: if a non-blank data row does not have exactly two columns
        """
        examples = []

        with open(path, encoding='utf-8') as f:
            reader = csv.reader(f, delimiter=',')
            for idx, row in enumerate(reader):
                # Row 0 is the header; csv.reader yields [] for blank lines.
                if idx == 0 or not row:
                    continue
                if len(row) != 2:
                    raise ValueError(
                        "%s: row %d has %d columns, expected 2 (text, label)" % (path, idx, len(row))
                    )

                text, label = row
                guid = "%s-%s" % (set_type, idx)
                # Flatten line breaks to spaces and strip '/', '"' and '#' characters.
                text_a = text.replace('\n\n', ' ').replace('\n', ' ').replace('/','').replace('"', '').replace('#','')

                examples.append(InputExample(guid=guid, text_a=text_a, label=label))

        return examples
    
    
class WeiboPVP(PVP):
    """Pattern-verbalizer pairs for the two-label ("0" / "1") Weibo task."""

    # Words standing in for each label in the patterns with id 0 and 2.
    VERBALIZER_0 = {
        "0": ["坏", "差"],
        "1": ["好", "棒"]
    }
    
    # Words standing in for each label in the question pattern (id 1). That pattern
    # asks whether the text is negative, so label "0" is answered with "对".
    VERBALIZER_1 = {
        "0": ["对"],
        "1": ["错"]
    }
    
    def get_parts(self, example: InputExample) -> FilledPattern:
        """Build the pattern for ``example`` according to ``self.pattern_id``.

        Returns a pair of part lists: the first holds the shortenable text, the
        pattern wording, the mask and the closing wording; the second is empty.
        Raises ``ValueError`` for a pattern id other than 0, 1 or 2.
        """
        body = self.shortenable(example.text_a)

        # pattern id -> (wording before the mask, wording after the mask)
        templates = {
            0: ("。我认为，这段话的作者心情是", '的。'),
            1: ('。提问：这些文字是消极的吗？回答：', '。'),
            2: ("。由前面可知，作者的情绪是", '的。'),
        }
        if self.pattern_id not in templates:
            raise ValueError("No pattern implemented for id {}".format(self.pattern_id))

        lead, tail = templates[self.pattern_id]
        return [body, lead, self.mask, tail], []

    def verbalize(self, label) -> List[str]:
        """Return the verbalizer words for ``label`` under the current pattern id."""
        # Patterns 0 and 2 share one verbalizer; every other id uses the second one.
        table = WeiboPVP.VERBALIZER_0 if self.pattern_id in (0, 2) else WeiboPVP.VERBALIZER_1
        return table[label]

In [1]:
import pandas as pd

In [10]:
# Load the raw corpus, recode the 'emotion' column into a 0/1 'emo' column
# (-1 -> 0, 1 -> 1) and keep only the text and the recoded label.
train = (
    pd.read_csv('two_class.csv')
    .assign(emo=lambda frame: frame['emotion'].map({-1: 0, 1: 1}))
    .loc[:, ['context', 'emo']]
)

In [19]:
# Positional split of `train`: the first 1500 rows form the unlabeled pool,
# all remaining rows form the evaluation set.
# NOTE(review): WeiboProcessor.get_dev_examples reads "test.csv", while this cell
# writes 'eval.csv' -- presumably the file is renamed before training; confirm.
split_at = 1500
unlabel = train.iloc[:split_at]
eval_2 = train.iloc[split_at:]

eval_2.to_csv('eval.csv', index=False)
unlabel.to_csv('unlabel.csv', index=False)

In [12]:
# Draw few-shot training sets of several sizes and write each one to
# "weibo_train_<size>.csv".
#
# Rows from position 1500 onwards are written out as the evaluation split, so the
# training subsets are drawn only from the rows before that point; sampling from
# the whole frame would leak evaluation rows into training.
# The boundary must stay equal to the one used for the eval/unlabeled split.
TRAIN_POOL_END = 1500
# Fixed seed so that re-running the notebook reproduces the same subsets.
SAMPLE_SEED = 42

train_pool = train.iloc[:TRAIN_POOL_END]

# Without both classes in the pool the resampling loop below could never finish.
if not {0, 1} <= set(train_pool['emo']):
    raise ValueError("the training pool must contain both classes 0 and 1")

for num in [10, 50, 100, 500]:
    attempt = 0
    while True:
        # A different seed per attempt keeps retries deterministic while still
        # producing a different draw each time.
        new_train = train_pool.sample(n=num, random_state=SAMPLE_SEED + attempt)
        # to ensure both classes exist in the sub-sample
        if {0, 1} <= set(new_train['emo']):
            break
        attempt += 1

    new_train.to_csv("weibo_train_" + str(num) + '.csv', index=False)

In [None]:
from transformers import BertTokenizer, BertModel

# Tokenizer for the same checkpoint name that is passed to the PET run below.
# NOTE(review): neither `tokenizer` nor `BertModel` is used in the visible cells.
checkpoint_name = "bert-base-chinese"
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)

In [None]:
# Train and evaluate PET with the CLI script provided by https://github.com/timoschick/pet.
# The run uses pattern ids 0, 1 and 2 with the "bert-base-chinese" checkpoint, reads
# its data from /content and writes its output to /content/result1.
# NOTE(review): "--task_name weibo" presumably selects the WeiboProcessor / WeiboPVP
# classes defined above, which would have to be registered inside the PET code -- confirm.
# NOTE(review): WeiboProcessor looks for train.csv, test.csv and unlabel.csv in the data
# directory, while the cells above write eval.csv and weibo_train_<n>.csv -- presumably
# the files are renamed or copied by hand before this cell runs; verify.
%run cli.py \
--method pet \
--pattern_ids 0 1 2 \
--data_dir "/content" \
--model_type bert \
--model_name_or_path "bert-base-chinese" \
--task_name weibo \
--output_dir "/content/result1" \
--do_train \
--do_eval \
--pet_per_gpu_eval_batch_size 16 \
--pet_per_gpu_train_batch_size 16 \
--pet_gradient_accumulation_steps 16 \
--pet_max_steps 250 \
--pet_max_seq_length 256 \
--pet_repetitions 1