# 预训练数据集打包
主要有两个步骤：
- tokenization：将文本转换为token
- packing the data：将所有样本的token拼接成一个长序列，再切分为固定长度（max_seq_length）的片段

## 1、tokenization

加载02_data_preparation.ipynb中保存的数据

In [None]:
import datasets

# Load the parquet file at ./data/preprocessed_dataset.parquet as the "train"
# split, bind it to `dataset`, and print it.
dataset = datasets.load_dataset(
    "parquet",
    data_files="./data/preprocessed_dataset.parquet",
    split="train"
)
print(dataset)

In [None]:
# Rebind `dataset` to shard 0 of 10, so the following cells operate on that
# shard only.
dataset = dataset.shard(num_shards=10, index=0)  # num_shards is the number of shards; index is the index of the current shard
print(dataset)

加载分词器

In [None]:
from transformers import AutoTokenizer
# Path (or name) of the model whose tokenizer is loaded below.
model_path_or_name = "./models/upstage/SOLAR-10.7B-v1.0"
# `tokenizer` is used as a notebook-level global by the tokenization cells.
tokenizer = AutoTokenizer.from_pretrained(
    model_path_or_name,
    use_fast=False  # explicitly request the non-"fast" tokenizer implementation
)

In [None]:
# Sanity check: display the tokens the tokenizer produces for a short sentence.
tokenizer.tokenize("I'm a short sentence")

In [None]:
def tokenization(example):
    """Attach token ids and a token count to a single dataset record.

    The record's ``"text"`` field is tokenized with the notebook-level
    ``tokenizer``, the tokens are converted to ids, and the ids are wrapped
    with the tokenizer's BOS (begin of sequence) id in front and EOS (end of
    sequence) id at the back.

    Args:
        example: A mapping with a ``"text"`` entry. It is modified in place.

    Returns:
        The same ``example`` object, with two entries set:
        ``"input_ids"`` (the BOS + ids + EOS list) and ``"num_tokens"``
        (the length of that list, used later to count the total number of
        tokens in the dataset).
    """
    pieces = tokenizer.tokenize(example["text"])
    inner_ids = tokenizer.convert_tokens_to_ids(pieces)

    # Surround the converted ids with the begin/end-of-sequence markers.
    wrapped_ids = [tokenizer.bos_token_id, *inner_ids, tokenizer.eos_token_id]

    example["input_ids"] = wrapped_ids
    example["num_tokens"] = len(wrapped_ids)
    return example

In [None]:
# Run `tokenization` over the dataset and rebind `dataset` to the result.
# load_from_cache_file=False is passed so a previously cached result is not
# reused (per the parameter name; see the datasets `map` documentation).
dataset = dataset.map(tokenization, load_from_cache_file=False)
print(dataset)

In [None]:
# Inspect the record at index 3 to check the columns added by tokenization.
sample = dataset[3]

# Only the first 30 elements of the text and of the id list are printed.
print("text", sample["text"][:30])
print("\ninput_ids", sample["input_ids"][:30])
print("\nnum_tokens", sample["num_tokens"])

统计数据集中所有的token数量

In [None]:
import numpy as np
# Display the total token count: the sum of the per-record "num_tokens" column.
np.sum(dataset["num_tokens"])

## 2、packing the data

In [None]:
# Join every record's "input_ids" into one array and print its length.
input_ids = np.concatenate(dataset["input_ids"])
print(len(input_ids))

In [None]:
# Number of token ids per packed sequence (the row width used when reshaping).
max_seq_length = 32

In [None]:
# Largest multiple of max_seq_length that does not exceed len(input_ids).
total_length = len(input_ids) - len(input_ids) % max_seq_length  # guarantees exact divisibility
print(total_length)

In [None]:
# Drop the trailing ids beyond total_length, then print the resulting shape.
input_ids = input_ids[:total_length]
print(input_ids.shape)

In [None]:
# Reshape into rows of max_seq_length ids each and cast to int32.
input_ids_reshaped = input_ids.reshape(-1, max_seq_length).astype(np.int32)
# Bare expression: the notebook displays the resulting shape.
input_ids_reshaped.shape

In [None]:
# Display the type of the reshaped object.
type(input_ids_reshaped)

In [None]:
# Convert the reshaped array to nested Python lists and build a Dataset whose
# only column is "input_ids", then print it.
input_ids_list = input_ids_reshaped.tolist()
packaged_pretrain_dataset = datasets.Dataset.from_dict(
    {"input_ids": input_ids_list}
)
print(packaged_pretrain_dataset)

保存数据

In [None]:
# Write the packed dataset to ./data/packaged_pretrain_dataset.parquet.
packaged_pretrain_dataset.to_parquet("./data/packaged_pretrain_dataset.parquet")