In [1]:
import os
# Point both HTTP and HTTPS traffic at a proxy on 127.0.0.1:7890. This is set
# before `datasets` is imported below.
# NOTE(review): presumably required to reach the dataset host from this
# machine — confirm, and adjust or remove when running elsewhere.
os.environ.update({
    'HTTP_PROXY': 'http://127.0.0.1:7890',
    'HTTPS_PROXY': 'http://127.0.0.1:7890',
})

import json
import random
from datasets import load_dataset

from settings import LLM_CLS_FORMAT, HUMAN_FORMAT, categories, LABELS_DICT

In [2]:
# Load the "abstract" configuration of the ccdv/patent-classification dataset.
# The result is kept in `ds_abstract` and indexed by split name further down.
ds_abstract = load_dataset(
    path="ccdv/patent-classification",
    name="abstract",
)

## 原始数据集构造

In [3]:
# Display the loaded dataset object to inspect its splits, features and row counts.
ds_abstract

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})

In [4]:
def convert_LLM(dataset, output_file, num=-1, shuffle=False, seed=None):
    """Convert classification items into instruction-style records and save them as JSON.

    Every item of ``dataset`` becomes a dict with three keys:

    * ``instruction`` -- ``LLM_CLS_FORMAT`` formatted with the newline-joined
      ``categories`` (identical for every record),
    * ``input`` -- ``HUMAN_FORMAT`` formatted with the item's ``"text"``,
    * ``output`` -- ``LABELS_DICT`` indexed by the item's ``"label"``.

    The list of records is written to ``output_file`` as pretty-printed JSON
    (indent 2, non-ASCII characters kept as-is) followed by a newline.

    Args:
        dataset: Iterable of mappings that provide ``"text"`` and ``"label"``.
        output_file: Destination path. Missing parent directories are created.
        num: Maximum number of records to keep. Values ``<= 0`` keep everything.
        shuffle: If true, shuffle the records before truncating to ``num``.
        seed: Optional seed for the shuffle. When given, a private
            ``random.Random(seed)`` is used so the result is reproducible and
            the global RNG state is left untouched. When ``None`` (default),
            the module-level ``random.shuffle`` is used.

    Raises:
        KeyError: If an item has no ``"text"`` or ``"label"`` entry. A label
            that ``LABELS_DICT`` cannot resolve propagates that lookup's error.
    """
    # The instruction text does not depend on the item, so build it once.
    instruction = LLM_CLS_FORMAT.format(categories="\n".join(categories))

    res = [
        {
            "instruction": instruction,
            "input": HUMAN_FORMAT.format(input=item["text"]),
            "output": LABELS_DICT[item["label"]],
        }
        for item in dataset
    ]

    if shuffle:
        if seed is None:
            random.shuffle(res)
        else:
            random.Random(seed).shuffle(res)

    if num > 0:
        res = res[:num]

    # Create the target directory on demand so a fresh checkout does not fail.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly instead of relying on the locale encoding.
    with open(output_file, 'w', encoding='utf-8') as w:
        w.write(json.dumps(res, ensure_ascii=False, indent=2) + '\n')

In [5]:
# Write shuffled training subsets of increasing size from the train split.
# convert_LLM shuffles with the module-level `random` generator, so seed it
# first: otherwise every run writes different samples under the same file names.
random.seed(42)
for num in [100, 500, 1000, 2000]:
    convert_LLM(
        dataset=ds_abstract["train"],
        output_file=f"data/llm_train_{num}.json",
        num=num,
        shuffle=True,
    )

In [6]:
# Convert the full validation and test splits (no shuffling, no truncation).
for split_name, target_path in (
    ("validation", "data/llm_valid.json"),
    ("test", "data/llm_test.json"),
):
    convert_LLM(ds_abstract[split_name], target_path)