In [1]:
# Load the model and its tokenizer from the same hub id.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

modelName = "line-corporation/japanese-large-lm-3.6b-instruction-sft"

# The two loads are independent of each other; the tokenizer goes first here.
tokenizer = AutoTokenizer.from_pretrained(modelName)

# bfloat16 weights; device placement is delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(modelName, device_map="auto", torch_dtype=torch.bfloat16)

# Find the linear-projection submodules that an adapter can be attached to.
import re  # no longer needed by this section; kept in case other cells use it

# Walk the module tree and type-check every submodule. The previous version
# regex-parsed str(model.modules), which is the repr of a *bound method* and
# only worked because that repr happens to embed the model's own repr.
# rpartition(".")[2] keeps the last component of the dotted module path
# (e.g. "...attention.dense" -> "dense"); the set removes the per-layer
# duplicates and sorted() makes the printed list reproducible across runs.
linearLayerNames = sorted({
    qualifiedName.rpartition(".")[2]
    for qualifiedName, module in model.named_modules()
    if isinstance(module, torch.nn.Linear) and qualifiedName
})
print(linearLayerNames)

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


['dense', 'query_key_value', 'dense_h_to_4h', 'dense_4h_to_h', 'embed_out']




In [None]:
from peft import get_peft_model, LoraConfig, TaskType

# LoRA settings. Adapters are attached only to modules named
# "query_key_value"; every other option is spelled out explicitly.
loraConfig = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["query_key_value"],
    r=4,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
    fan_in_fan_out=False,
)

# Set up the LoRA model
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

modelName = "cyberagent/open-calm-7b"

# Tokenizer and base weights are loaded from the same hub id.
tokenizer = AutoTokenizer.from_pretrained(modelName, legacy=True)

baseModel = AutoModelForCausalLM.from_pretrained(modelName, device_map="auto", torch_dtype=torch.float16)

# Combine the base model with the adapter settings in loraConfig.
model = get_peft_model(baseModel, loraConfig)

# Prepare the data
# Load the JSON file; its rows are read from the 'train' split below.
import datasets
query = datasets.load_dataset("json", data_files="./trainData/mixed_questions_100.json")

# Prompt template: one user turn followed by one system turn.
template = (
    "ユーザー:{instruction}\n"
    "システム:{output}\n"
)

# Build the list of training texts.
dataList = []

for d in query['train']:
    # Only records with an empty 'input' field are used.
    if d['input'] != '':
        continue
    pText = template.format_map(d)
    # Drop texts of 1024 characters or more (character count, not tokens).
    if len(pText) >= 1024:
        continue
    dataList.append(pText)

# trainDatasetの構築
from torch.utils.data import Dataset

class MyDataset(Dataset):
    """Pre-tokenized text samples for causal-LM training.

    Every text in ``dataList`` is encoded once in the constructor with
    ``tokenizer.encode``; ``tokenizer.eos_token_id`` is appended and the
    result is stored as a ``torch.LongTensor`` under the key ``'input_ids'``.
    """

    def __init__(self, dataList, tokenizer):
        self.tokenizer = tokenizer
        # One {'input_ids': LongTensor} dict per text, in input order.
        self.features = [
            {
                'input_ids': torch.LongTensor(
                    self.tokenizer.encode(text) + [self.tokenizer.eos_token_id]
                )
            }
            for text in dataList
        ]

    def __len__(self):
        """Return the number of encoded samples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the feature dict stored at position ``idx``."""
        return self.features[idx]

trainDataset = MyDataset(dataList, tokenizer)


# Configure the Trainer and run training
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling

# mlm=False selects causal (non-masked) language-model collation.
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# One checkpoint per epoch is written under output_dir; loss is logged
# every 10 steps.
trainingArgs = TrainingArguments(
    output_dir="./tunedModels/LoRA/TEST",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_strategy="epoch",
    logging_steps=10,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=trainingArgs,
    train_dataset=trainDataset,
    data_collator=collator,
)

trainer.train()



Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.25it/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,5.2224
20,5.0693
30,4.7297


TrainOutput(global_step=39, training_loss=4.9111217596592045, metrics={'train_runtime': 16.9266, 'train_samples_per_second': 17.724, 'train_steps_per_second': 2.304, 'total_flos': 242738807832576.0, 'train_loss': 4.9111217596592045, 'epoch': 3.0})

In [None]:
# Check the training result
# Load the base model
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftConfig, PeftModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Must be the same base model the adapter was trained on in the training
# cell. The id used to be truncated to "cyberagent/open-calm-", which is not
# a loadable model id.
modelName = "cyberagent/open-calm-7b"

baseModel = AutoModelForCausalLM.from_pretrained(
    modelName,
    device_map = None,
    torch_dtype = torch.float16,
    offload_folder = "./offload",
    low_cpu_mem_usage = True,
)

tokenizer = AutoTokenizer.from_pretrained(
    modelName,
    legacy = True,
)

# Load the LoRA adapter saved by the training run on top of the base model.
loraName = "./tunedModels/LoRA/TEST/checkpoint-39"

model = PeftModel.from_pretrained(baseModel, loraName, device_map="auto", offload_folder="./tunedModels/LoRA/TEST/offload")

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.26s/it]


In [111]:
# Same template as the one used to build the fine-tuning texts.
template = (
    "ユーザー:{instruction}\n"
    "システム:{output}\n"
)

q = "質問をしてください"
d = {
    "instruction": q,
    "output": "",
}
# In the training texts the final "\n" comes *after* the answer. With an empty
# answer the rendered prompt would therefore end in "システム:\n", i.e. at the
# position where the training samples are already finished. Strip that newline
# so generation continues directly after "システム:", as in training.
pText = template.format_map(d).rstrip("\n")

# Encode the prompt and move it to the device the model will run on.
input_ids = tokenizer.encode(
    pText,
    return_tensors="pt"
).to(device)

model = model.to(device)

# Number of prompt tokens; used below to cut the prompt off the decoded text.
startPos = input_ids.shape[1]

# Token ids that end generation: the sentence-final "。" plus the tokenizer's
# own EOS. Passing only the ids of "。" as eos_token_id replaces the default
# EOS, so the end-of-sample token appended to every training text would no
# longer stop generation. add_special_tokens=False keeps BOS/EOS markers that
# some tokenizers add during encode() out of the "。" ids.
stopTokenIds = tokenizer.encode("。", add_special_tokens=False)
if tokenizer.eos_token_id is not None and tokenizer.eos_token_id not in stopTokenIds:
    stopTokenIds.append(tokenizer.eos_token_id)

with torch.no_grad():
    tokens = model.generate(
        input_ids = input_ids,
        max_new_tokens = 200,
        temperature = 0.7,
        do_sample = True,
        pad_token_id = tokenizer.pad_token_id,
        eos_token_id = stopTokenIds,
    )

# Decode only the newly generated part.
output = tokenizer.decode(
    tokens[0][startPos:],
    skip_special_tokens = True
)

print(output)

KeyboardInterrupt: 

In [4]:
#------------------------------------------
#  Model and tokenizer setup
#------------------------------------------

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "cyberagent/open-calm-small"

# Tokenizer and bfloat16 model weights come from the same hub id.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
)

#----------------------------------
#  Load the data
#----------------------------------

import datasets
query = datasets.load_dataset("json", data_files="./trainData/mixed_questions_100.json")

#----------------------------------
#  Prompt templates
#----------------------------------

# "w_input" is for records that carry an 'input' text, "wo_input" for
# records without one; both end with the {output} slot.
template = {
    "w_input": (
        "以下はタスクを記述した指示と入力です。入力はタスクで参照されている文章です。指示を適切に満たす応答を書きなさい。\n\n"
        "### 指示:\n{instruction}\n\n"
        "### 入力:\n{input}\n\n"
        "### 応答:\n{output}"
    ),
    "wo_input": (
        "以下はタスクを記述した指示と入力です。入力はタスクで参照されている文章です。指示を適切に満たす応答を書きなさい。\n\n"
        "### 指示:\n{instruction}\n\n"
        "### 応答:\n{output}"
    )
}

#------------------------------------------
#  Build the list of data (prompts)
#------------------------------------------

datalist = []
for d in query['train']:
    # Pick the template variant by whether the record has an 'input' text.
    ptext = template['wo_input' if d['input'] == '' else 'w_input'].format_map(d)
    # Keep only prompts shorter than 1500 characters.
    if len(ptext) >= 1500:
        continue
    datalist.append(ptext)
        
#------------------------------------------
#  Build train_dataset
#------------------------------------------

from torch.utils.data import Dataset

class MyDataset(Dataset):
    """Dataset of prompts that are tokenized once, at construction time.

    Each item is a dict ``{'input_ids': LongTensor}`` holding the ids returned
    by ``tokenizer.encode`` for one prompt, followed by
    ``tokenizer.eos_token_id``.
    """

    def __init__(self, datalist, tokenizer):
        self.tokenizer = tokenizer

        def to_feature(prompt):
            # Token ids of the prompt with the EOS id appended.
            ids = self.tokenizer.encode(prompt) + [self.tokenizer.eos_token_id]
            return {'input_ids': torch.LongTensor(ids)}

        self.features = [to_feature(prompt) for prompt in datalist]

    def __len__(self):
        """Number of stored samples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Feature dict at position ``idx``."""
        return self.features[idx]

train_dataset = MyDataset(datalist, tokenizer)

#------------------------------------------
#  Trainer setup and training run
#------------------------------------------

from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling

# mlm=False selects causal (non-masked) language-model collation.
collator = DataCollatorForLanguageModeling(
    tokenizer,
    mlm=False,
)

training_args = TrainingArguments(
    output_dir='./output',
    per_device_train_batch_size=1,
    num_train_epochs=5,
    save_steps=2000,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=collator,
)

trainer.train()


Step,Training Loss
500,0.7917


TrainOutput(global_step=500, training_loss=0.7916690673828125, metrics={'train_runtime': 30.3763, 'train_samples_per_second': 16.46, 'train_steps_per_second': 16.46, 'total_flos': 20729644416000.0, 'train_loss': 0.7916690673828125, 'epoch': 5.0})

In [75]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Weights come from the checkpoint written by the training run; the tokenizer
# is loaded from the base model id.
model = AutoModelForCausalLM.from_pretrained("./output/checkpoint-500")
tokenizer = AutoTokenizer.from_pretrained("cyberagent/open-calm-small")

# Prompt templates, with and without an 'input' section; both end with the
# {output} slot.
template = {
    "w_input": (
        "以下はタスクを記述した指示と入力です。入力はタスクで参照されている文章です。指示を適切に満たす応答を書きなさい。\n\n"
        "### 指示:\n{instruction}\n\n"
        "### 入力:\n{input}\n\n"
        "### 応答:\n{output}"
    ),
    "wo_input": (
        "以下はタスクを記述した指示と入力です。入力はタスクで参照されている文章です。指示を適切に満たす応答を書きなさい。\n\n"
        "### 指示:\n{instruction}\n\n"
        "### 応答:\n{output}"
    )
}

# An empty 'output' makes the rendered prompt stop right after the response
# header, which is where generation picks up.
d = {
    'instruction': "質問を作成して",
    'output': "",
}

ptext = template['wo_input'].format_map(d)

# Encode the prompt. The tensor is named input_ids rather than `input`, which
# would shadow the built-in input() for the rest of the kernel session.
input_ids = tokenizer.encode(
    ptext,
    return_tensors="pt"
)
# Single unpadded sequence: every position is a real token. The mask is passed
# explicitly because pad_token_id is set to the EOS id below, so padding cannot
# be told apart from EOS by token id alone.
attention_mask = torch.ones_like(input_ids)

# Number of prompt tokens; used to cut the prompt off the decoded text.
start_pos = input_ids.shape[1]

with torch.no_grad():
    tokens = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=64,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        temperature=0.7,
        top_p=0.9,
    )

# Decode only the newly generated part.
output = tokenizer.decode(tokens[0][start_pos:], skip_special_tokens=True)
print(output)


今、どんな気持ちですか?
