In [16]:
import os
# Route both HTTP and HTTPS traffic from this process through the local proxy.
LOCAL_PROXY = "http://127.0.0.1:8889"
for proxy_var in ("http_proxy", "https_proxy"):
    os.environ[proxy_var] = LOCAL_PROXY

In [17]:
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict, Dataset
import datasets

In [18]:
# Hub source of this corpus: load_dataset("pleisto/wikipedia-cn-20230720-filtered").
# A copy stored on disk is loaded here instead.
raw_ds = DatasetDict.load_from_disk("datasets/wikipedia-cn-20230720-filtered")

# Keep only the first 3000 training rows so the demo trains quickly.
# `select` takes the subset directly from the loaded split instead of
# converting the rows to Python lists and rebuilding a Dataset from them;
# min() keeps the slice-like tolerance for a split shorter than 3000 rows.
train_split = raw_ds["train"]
ds = train_split.select(range(min(3000, len(train_split))))


In [19]:
# Show the dataset summary (column names and row count) for the subset.
ds

Dataset({
    features: ['source', 'completion'],
    num_rows: 3000
})

In [20]:
# Look at one raw record to see what a `completion` text looks like.
ds[0]

{'source': 'wikipedia.zh2307',
 'completion': '昭通机场（ZPZT）是位于中国云南昭通的民用机场，始建于1935年，1960年3月开通往返航班“昆明－昭通”，原来属军民合用机场。1986年机场停止使用。1991年11月扩建，于1994年2月恢复通航。是西南地区「文明机场」，通航城市昆明。 机场占地1957亩，飞行区等级为4C，有一条跑道，长2720米，宽48米，可供波音737及以下机型起降。机坪面积6600平方米，停机位2个，航站楼面积1900平方米。位于城东6公里处，民航路与金鹰大道交叉处。\n航点\n客服电话\n昭通机场客服电话：0870-2830004'}

In [21]:
# Tokenizer that belongs to the Bloom checkpoint fine-tuned in this notebook.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="Langboat/bloom-389m-zh",
)

def process_func(examples):
    """Tokenize a batch of documents for causal language-model training.

    Args:
        examples: A batch mapping as passed by ``Dataset.map(batched=True)``;
            only the ``"completion"`` column (a list of strings) is read.

    Returns:
        The tokenizer output for the batch, truncated to at most 128 tokens
        per text.

    Note:
        The EOS token is appended *before* truncation, so a text longer than
        the limit presumably loses it again (right-side truncation is the
        usual default -- confirm for this tokenizer).
    """
    eos = tokenizer.eos_token
    texts = []
    for completion in examples["completion"]:
        texts.append(completion + eos)
    return tokenizer(texts, max_length=128, truncation=True)

In [22]:
# Tokenize the whole subset in batches and drop the raw text columns, so that
# only the tokenizer's output columns remain.
tokenized_ds = ds.map(
    function=process_func,
    batched=True,
    remove_columns=ds.column_names,
)
tokenized_ds

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map: 100%|██████████| 3000/3000 [00:00<00:00, 9858.79 examples/s]


Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 3000
})

In [23]:
from torch.utils.data import DataLoader

# Small loader used only to inspect what the collator produces.
# mlm=False selects the causal (non-masked) language-modeling mode.
preview_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
dl = DataLoader(tokenized_ds, batch_size=2, collate_fn=preview_collator)

In [24]:
# Fetch the first (index, batch) pair from the loader to inspect a collated batch.
next(enumerate(dl))

You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


(0,
 {'input_ids': tensor([[ 8948,  1262,  9971,   916,    61,    51, 21447,  6496,  5317,  2140,
          15952,  8948, 32923, 23296,  9971,   355, 39145, 31097,   355, 11747,
          12196,  1359, 18943,  5612, 17102,  1006, 21189,  4571,  8948,  1262,
            755,   355, 14122,  3157,  1920,  1157,  1038,   866,  9971,   420,
          20446,  9971,  8979,  2436,   420, 11823,  4621, 26650,   355,   937,
          11121,  5216,  7442,  1262,  3891,   420,   583, 10051,  3728,  1258,
          11358,  9971,  1263,   355,  1262,  3891,  4845, 21189,   420,   210,
           9971, 18398, 23055, 15229,   355, 11968,  1473, 14861, 34739,    38,
            355, 38274, 24049,   355,  1490,  2057,  1393,  2211,   355,  8124,
           2524,  2211,   355, 21761, 23861, 23527,  1107,  4744, 37360,  1491,
           3796,   420,  1366, 10937,  6272,  2568,   462, 12640,   355,  3924,
           1366,  1389, 14931,   355, 39662,  6272, 18627, 12640,   420,  5317,
           2039,  2073

In [25]:
# Load the causal-LM weights from the same checkpoint the tokenizer came from.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="Langboat/bloom-389m-zh",
)

In [39]:
# Training configuration for the causal-LM fine-tuning run.
# The output directory is named after the model actually being trained
# (the previous "macbert" name was a leftover from a different model).
training_args = TrainingArguments(
    output_dir="./models/bloom-389m-zh-clm",
    # 8 samples per forward pass, accumulated over 8 passes:
    # 64 samples per device contribute to each optimizer step.
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    logging_steps=10,  # report the training loss every 10 optimizer steps
    num_train_epochs=1,
)

In [40]:
# Collator in causal (non-masked) language-modeling mode for the training batches.
clm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Bundle model, configuration, data and collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=clm_collator,
    train_dataset=tokenized_ds,
)

In [41]:
# Check which device the model currently sits on before starting training.
model.device

device(type='cuda', index=0)

In [42]:
import torch

# Hand cached but currently unused GPU memory back before training starts.
torch.cuda.empty_cache()

# Run the fine-tuning loop; the returned TrainOutput is shown as the cell result.
train_result = trainer.train()
train_result

Step,Training Loss
10,3.4433
20,4.1598
30,3.8589
40,3.7781


TrainOutput(global_step=46, training_loss=3.7952080602231235, metrics={'train_runtime': 83.469, 'train_samples_per_second': 35.941, 'train_steps_per_second': 0.551, 'total_flos': 683528341684224.0, 'train_loss': 3.7952080602231235, 'epoch': 0.98})

In [44]:
from transformers import pipeline

# Wrap the trained model and its tokenizer in a text-generation pipeline on GPU 0.
pipe = pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model,
    device=0,
)

In [47]:
# Sample a continuation of the prompt; max_length=100 caps the total length
# (prompt plus generated tokens).
prompt = "杭州市一个美丽的"
pipe(prompt, max_length=100, do_sample=True)

[{'generated_text': '杭州市一个美丽的旅游景区，位于广东省深圳南山经济区）南滨路街道，由深圳市南山区旅游发展委员会管理，于2014年建成，面积约为3平方公里，建筑面积约为12万平方米。与深圳南山南山新区联合运营。该景区于2013年12月23日开幕，为南山区首家景区。\n历史\n该景区前身是于1964年的北滨步行街。南山区政府将该景区以石城为主题，修建石垣、石墙、石柱等，并以“石城'}]