# ref
https://qiita.com/m__k/items/2c4e476d7ac81a3a44af

# init

## pip install

In [None]:
!pip install -Uqq git+https://github.com/huggingface/peft.git
!pip install -Uqq transformers datasets accelerate bitsandbytes
!pip install scipy

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m68.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m76.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m106.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.2/492.2 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

## Library

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler

import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType
from datasets import load_dataset

from sklearn.model_selection import KFold
import os
import json
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from collections import OrderedDict
import warnings
warnings.filterwarnings('ignore')

## config

In [None]:
class cfg:
    """Experiment settings, kept as class attributes and read as ``cfg.<name>``."""

    # Reproducibility
    seed = 77

    # Base model and the name under which the LoRA adapter is saved
    model_name = 'cyberagent/open-calm-7b'
    peft_name = 'lora-calm-7b'

    # Dataset identifiers
    dataset = 'shunk031/JGLUE'
    dataset_name = 'JCommonsenseQA'

    # Cross-validation
    n_folds = 5
    split_method = 'KFold'

    # Batch sizes and tokenisation length
    train_bs = valid_bs = test_bs = 2
    max_length = 256
    use_amp = True

    # Optimisation
    n_epochs = 3
    lr = 3e-4
    weight_decay = 1e-6

## helper function

In [None]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every CUDA
    device), and asks cuDNN for deterministic kernels.

    Args:
        seed: Integer seed applied to all generators.

    Side effects:
        Sets ``PYTHONHASHSEED`` in ``os.environ``, changes the global cuDNN
        flags and installs an ``'ignore'`` warnings filter.
    """
    # Function-scope import: ``random`` is not imported at file level.
    import random

    # Only affects interpreters started after this point (e.g. worker
    # processes); the hash seed of the running interpreter is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe to call without CUDA; covers every visible GPU, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False
    warnings.simplefilter('ignore')


class AverageMeter(object):
    """Keep the most recent value together with a running weighted mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Reset the last value, mean, total and sample count to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as seen ``n`` times and recompute the mean.

        ``val`` becomes the latest value, ``val * n`` is added to the running
        total and ``n`` to the sample count.
        """
        weighted = val * n
        self.val = val
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count

## path

In [None]:
# Project locations (presumably on a Google Drive mounted in Colab — confirm).
ROOT_DIR = Path('/content/drive/MyDrive/llm')
DATA_DIR = Path('/content/drive/MyDrive/llm/data/jglue')

# Output directory of this trial; created together with any missing parents.
SAVE_DIR = ROOT_DIR.joinpath('outputs', 'trial0001')
SAVE_DIR.mkdir(parents=True, exist_ok=True)

DATASET_NAME = 'jcommonsenseqa'

# Tokenizer

# Prompt

In [None]:
# Load the tokenizer that belongs to the base model named in cfg.model_name.
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/323 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/3.23M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

In [None]:
# Show each special token next to its id. Every row prints the token that
# matches its own label (the bos/eos rows used to print each other's token).
print(tokenizer.special_tokens_map)
print("bos_token :", tokenizer.bos_token, ",", tokenizer.bos_token_id)
print("eos_token :", tokenizer.eos_token, ",", tokenizer.eos_token_id)
print("unk_token :", tokenizer.unk_token, ",", tokenizer.unk_token_id)
print("pad_token :", tokenizer.pad_token, ",", tokenizer.pad_token_id)

{'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|padding|>'}
bos_token : <|endoftext|> , 0
eos_token : <|endoftext|> , 0
unk_token : <|endoftext|> , 0
pad_token : <|padding|> , 1


In [None]:
# def jcommonsenseqa_templete_02(data_point, is_inference=False):
#     if not is_inference:
#         return f"""質問と回答の選択肢を入力として受け取り、選択肢から回答を選択してください。なお、回答は選択肢の番号(例:0)でするものとします。

# 質問:{data_point['question']}
# 選択肢:0.{data_point['choice0']},1.{data_point['choice1']},2.{data_point['choice2']}, 3.{data_point['choice3']},4.{data_point['choice4']}
# 回答:{data_point['label']}""", {data_point['label']}
#     else:
#         return f"""質問と回答の選択肢を入力として受け取り、選択肢から回答を選択してください。なお、回答は選択肢の番号(例:0)でするものとします。

# 質問:{data_point['question']}
# 選択肢:0.{data_point['choice0']},1.{data_point['choice1']},2.{data_point['choice2']}, 3.{data_point['choice3']},4.{data_point['choice4']}
# 回答:""", data_point['label']

In [None]:
def jcommonsenseqa_templete_02(data_point, is_inference=False):
    """Render one JCommonsenseQA record as a prompt string.

    Args:
        data_point: Mapping with the keys ``question``, ``choice0`` ..
            ``choice4`` and ``label``.
        is_inference: When false (the default) the label and the module-level
            ``tokenizer``'s EOS token are appended after the answer marker, so
            the text is a complete training example. When true the prompt ends
            right after the answer marker.

    Returns:
        A ``(text, label)`` tuple; ``label`` is ``data_point['label']`` as is.
    """
    question = data_point['question']
    options = (
        f"0.{data_point['choice0']},1.{data_point['choice1']},"
        f"2.{data_point['choice2']}, 3.{data_point['choice3']},"
        f"4.{data_point['choice4']}"
    )
    prompt = (
        "質問と回答の選択肢を入力として受け取り、選択肢から回答を選択してください。"
        "なお、回答は選択肢の番号(例:0)でするものとします。\n"
        "\n"
        f"質問:{question}\n"
        f"選択肢:{options}\n"
        "回答:"
    )
    answer = data_point['label']
    if is_inference:
        return prompt, answer
    return prompt + str(answer) + tokenizer.eos_token, answer

In [None]:
CUTOFF_LEN = 256  # maximum number of tokens kept per example


def tokenize(prompt, tokenizer):
    """Tokenize ``prompt`` with a single EOS marker at its end.

    The EOS string comes from ``tokenizer.eos_token`` rather than a hard-coded
    literal, so the function follows whichever tokenizer is passed in. It is
    not appended again when ``prompt`` already ends with it.

    Args:
        prompt: Text to encode.
        tokenizer: Callable tokenizer exposing an ``eos_token`` attribute.

    Returns:
        A dict with the ``input_ids`` and ``attention_mask`` entries of the
        tokenizer output, truncated to ``CUTOFF_LEN`` tokens and not padded.
    """
    eos = tokenizer.eos_token or ""
    # Skip the suffix when there is no EOS token or the prompt already has one.
    if eos and not prompt.endswith(eos):
        prompt = prompt + eos
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding=False,
    )
    return {
        "input_ids": result["input_ids"],
        "attention_mask": result["attention_mask"],
    }

# load and split data

In [None]:
def _read_jsonl(path):
    """Parse a JSON Lines file into a list of objects, closing the file afterwards."""
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines are skipped so a trailing newline does not break parsing.
        return [json.loads(line) for line in f if line.strip()]


def load_split_data(name):
    """Load one JGLUE task and assign cross-validation folds.

    Looks under ``DATA_DIR`` for exactly one ``{name}*/train*.json`` file and
    exactly one ``{name}*/valid*.json`` file, both in JSON Lines format. The
    ``train`` file becomes the train/validation pool and the ``valid`` file is
    used as the held-out test set.

    Args:
        name: Task name; one of ``jcommonsenseqa``, ``jnli``, ``jsquad``,
            ``jsts`` or ``marc_ja``.

    Returns:
        ``(train_valid_df, test_df)``. ``train_valid_df`` has an extra integer
        ``fold`` column holding the fold in which each row is validation data.

    Raises:
        AssertionError: If ``name`` is unknown or the number of matching files
            is not exactly one each.
        NotImplementedError: If ``cfg.split_method`` is not ``'KFold'``.
    """
    assert name in {'jcommonsenseqa', 'jnli', 'jsquad', 'jsts', 'marc_ja'}
    train_valid_paths = list(DATA_DIR.glob(f'{name}*/train*.json'))
    test_paths = list(DATA_DIR.glob(f'{name}*/valid*.json'))
    assert len(train_valid_paths) == 1 and len(test_paths) == 1, (
        f'{len(train_valid_paths)} train data path(s) and '
        f'{len(test_paths)} valid data path(s) found; expected exactly one of each.'
    )

    # Train/validation pool with fold assignment.
    train_valid_df = pd.DataFrame(_read_jsonl(train_valid_paths[0]))
    train_valid_df['fold'] = -1
    if cfg.split_method == 'KFold':
        splitter = KFold(n_splits=cfg.n_folds, shuffle=True, random_state=cfg.seed)
        for fold, (_, valid_fold_indices) in enumerate(splitter.split(train_valid_df)):
            # KFold yields positions; they match the labels of the fresh
            # default index, so label-based .loc selects the same rows.
            train_valid_df.loc[valid_fold_indices, 'fold'] = fold
    else:
        raise NotImplementedError(
            f'split_method {cfg.split_method!r} is not supported')

    # Held-out test set.
    test_df = pd.DataFrame(_read_jsonl(test_paths[0]))
    return train_valid_df, test_df

# Dataset

In [None]:
class JglueDataset(Dataset):
    """Dataset that serves each row of a DataFrame as a plain dict."""

    def __init__(self, df):
        # One dict per row, keyed by column name.
        self.features = df.to_dict(orient='records')

    def __len__(self):
        """Return the number of rows."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the record stored at position ``idx``."""
        record = self.features[idx]
        return record

# DataLoader

In [None]:
def collate_fn(samples):
    '''
        Build one padded causal-LM training batch from raw dataset records.

        Each record is rendered with jcommonsenseqa_templete_02 in training
        mode and tokenized with the module-level tokenizer, padded/truncated
        to cfg.max_length.

        Returns:
            batch (
                input_ids: (bs, max_length)
                attention_mask: (bs, max_length)
                labels: (bs, max_length), a copy of input_ids with padded
                    positions set to -100
            )
    '''
    prompts = [jcommonsenseqa_templete_02(sample)[0] for sample in samples]
    batch = tokenizer(
        text=prompts,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
        add_special_tokens=True,
        max_length=cfg.max_length
    )
    labels = batch['input_ids'].clone()
    # Mask by attention_mask rather than by comparing with pad_token_id, so a
    # tokenizer whose pad token equals its EOS token keeps the real EOS as a
    # training target.
    labels[batch['attention_mask'] == 0] = -100
    batch['labels'] = labels
    return batch

In [None]:
def prepare_dataloader(train_features, valid_features, test_features):
    """Wrap three DataFrames in ``JglueDataset`` and build a loader for each.

    All loaders share ``collate_fn``, ``os.cpu_count()`` workers, pinned
    memory and ``drop_last=False``. Only the training loader shuffles. Batch
    sizes come from ``cfg.train_bs`` / ``cfg.valid_bs`` / ``cfg.test_bs``.

    Returns:
        ``(train_dl, valid_dl, test_dl)``.
    """
    def _make_loader(dataset, batch_size, shuffle):
        # Single place for the options common to every split.
        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=os.cpu_count(),
            drop_last=False,
            pin_memory=True,
            collate_fn=collate_fn,
        )

    datasets = [
        JglueDataset(frame)
        for frame in (train_features, valid_features, test_features)
    ]
    batch_sizes = (cfg.train_bs, cfg.valid_bs, cfg.test_bs)
    shuffle_flags = (True, False, False)

    train_dl, valid_dl, test_dl = (
        _make_loader(dataset, batch_size, shuffle)
        for dataset, batch_size, shuffle in zip(datasets, batch_sizes, shuffle_flags)
    )
    return train_dl, valid_dl, test_dl

In [None]:
# Load the JCommonsenseQA data; train_valid_df carries a 'fold' column.
train_valid_df, test_df = load_split_data('jcommonsenseqa')

In [None]:
# Build loaders with fold 0 held out as validation; only the training loader is kept here.
train_dl, _, _ = prepare_dataloader(train_valid_df.query('fold!=0'), train_valid_df.query('fold==0'), test_df)

In [None]:
# Pull a single batch from the training loader for inspection.
bs = next(iter(train_dl))

In [None]:
# Inspect the token ids of the first example in the batch. Index 0 is used
# because a batch holds only cfg.train_bs (= 2) rows, so index 2 is out of range.
bs['input_ids'][0]

tensor([ 4347, 29329, 45596,  9491,   457, 21996,   245, 10259,   332,  7066,
        39203,   247,  1612,   245,  7066,   257, 10259, 37970,     9,  1218,
           27,    17,    10,   252, 40796,   247,   186,   186,  4347,    27,
          577, 40485,   855,  1031,  9385,  5856,  8868, 14237, 12529,    32,
          186, 10259,    27,    17,    15,  1576,   587,    13,    18,    15,
        34076,    13,    19,    15,  2697,    13,  2410,    15, 19167,    13,
           21,    15,  1210,  1635,  6528,   186,  7066,    27,    19,     0,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1, 

# model

In [None]:
# Prepare the model: load the base causal LM named in cfg.model_name with
# load_in_8bit=True and device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    cfg.model_name,
    load_in_8bit=True,
    device_map="auto",
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/42.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.93G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

# LoRA

In [None]:
# LoRA parameters
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Preprocess the model for training
# NOTE(review): newer PEFT releases appear to replace
# prepare_model_for_int8_training with prepare_model_for_kbit_training —
# confirm against the installed PEFT version.
model = prepare_model_for_int8_training(model)

# Prepare the LoRA model
model = get_peft_model(model, lora_config)

# Check the number of trainable parameters
model.print_trainable_parameters()

trainable params: 4,194,304 || all params: 6,876,176,384 || trainable%: 0.06099762085451472


# train function

In [None]:
def train_function(
    cfg,
    fold,
    epoch,
    train_loader,
    valid_loader,
    tokenizer,
    model,
    optimizer,
    sheduler,
    scaler,
    device
):
    """Train ``model`` for one epoch and return the mean training loss.

    Args:
        cfg: Config object; ``n_epochs`` is read for the progress label and
            ``use_amp`` (optional) to decide whether autocast may be used.
        fold: Current fold index (not used inside this function).
        epoch: Current epoch index, shown in the progress bar.
        train_loader: Iterable of batches providing ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        valid_loader: Not used inside this function.
        tokenizer: Not used inside this function.
        model: Model called with ``input_ids``/``attention_mask``/``labels``
            whose output exposes ``.loss``.
        optimizer: Optimizer stepped once per batch through ``scaler``.
        sheduler: Not used inside this function (the caller passes ``None``).
        scaler: ``GradScaler`` used for the backward pass and optimizer step.
        device: Device the batch tensors are moved to.

    Returns:
        The mean loss over the epoch, weighted by the number of examples in
        each batch (``0`` when ``train_loader`` is empty).
    """
    model.train()
    model.config.use_cache = False
    pbar = tqdm(enumerate(train_loader), total=len(train_loader))
    # The label does not change within an epoch, so set it once.
    pbar.set_description(
        f'[TRAIN epoch {epoch}/{cfg.n_epochs}]')

    train_losses = AverageMeter()
    # Autocast is enabled only when both the config and the scaler ask for
    # mixed precision, so a disabled scaler keeps full-precision behaviour.
    use_amp = bool(getattr(cfg, 'use_amp', False)) and scaler.is_enabled()

    for step, batch in pbar:
        input_ids = batch['input_ids'].to(device).long()
        attention_mask = batch['attention_mask'].to(device).long()
        labels = batch['labels'].to(device).long()
        with autocast(enabled=use_amp):
            output = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
        loss = output.loss
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad(set_to_none=True)
        # Weight by batch size so a smaller final batch does not skew the mean.
        train_losses.update(loss.item(), n=input_ids.size(0))
        pbar.set_postfix(OrderedDict(loss=train_losses.avg))

    return train_losses.avg

# main

In [None]:
# Fix all random seeds before anything stochastic runs (shuffling, dropout).
seed_everything(cfg.seed)

train_valid_df, test_df = load_split_data('jcommonsenseqa')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for fold in range(cfg.n_folds):
    # Only fold 0 is trained; the single global `model` is reused as is.
    if fold != 0:
        continue

    train_loader, valid_loader, _ = prepare_dataloader(train_valid_df.query('fold!=@fold'), train_valid_df.query('fold==@fold'), test_df)

    # Hand the optimizer only the parameters that require gradients, so it
    # does not have to track the frozen ones.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(
        trainable_params, lr=cfg.lr, weight_decay=cfg.weight_decay)

    # Gradient scaling is kept disabled, as in the original run.
    scaler = GradScaler(enabled=False)

    for epoch in range(cfg.n_epochs):
        train_function(cfg, fold, epoch, train_loader, valid_loader, tokenizer, model, optimizer, None, scaler, device)

  0%|          | 0/3576 [00:00<?, ?it/s]

  0%|          | 0/3576 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
# Collect unreachable Python objects first so any tensors they still hold are
# freed, then release the cached GPU memory that is no longer occupied.
import gc
gc.collect()
torch.cuda.empty_cache()

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7e59361dc940>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1442, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.10/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "/usr/lib/python3.10/multiprocessing/popen_fork.py", line 40, in wait
    if not wait([self.sentinel], timeout):
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 931, in wait
    ready = selector.select(timeout)
  File "/usr/lib/python3.10/selectors.py", line 416, in select
    fd_event_list = self._selector.poll(timeout)
KeyboardInterrupt: 


1995

# Custom metrics
[ref](https://qiita.com/m__k/items/2c4e476d7ac81a3a44af#compute_metrics%E3%82%92%E8%87%AA%E4%BD%9C%E3%81%99%E3%82%8B)

In [None]:
# Text generation function
def generate(instruction, input=None, maxTokens=256):
    """Generate and print the model's answer for one JCommonsenseQA record.

    Args:
        instruction: Record mapping accepted by ``jcommonsenseqa_templete_02``.
        input: Unused; kept for interface compatibility.
        maxTokens: Unused; kept for interface compatibility (generation is
            limited to 4 new tokens).

    Returns:
        The text that follows the answer marker, or ``None`` when no EOS
        token or no answer marker is found in the output. The same text, or a
        warning, is also printed.
    """
    # Inference prompt (no label / EOS appended).
    prompt = jcommonsenseqa_templete_02(instruction, True)[0]
    # Follow the model's own device instead of assuming CUDA.
    device = next(model.parameters()).device
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    print(encoded.input_ids.shape)
    with torch.no_grad():
        outputs = model.generate(
            input_ids=encoded.input_ids,
            attention_mask=encoded.attention_mask,
            max_new_tokens=4,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.9,
            top_p=0.75,
            top_k=4,
            no_repeat_ngram_size=2,
        )
    outputs = outputs[0].tolist()

    # Decoding is considered complete only if an EOS token was produced.
    if tokenizer.eos_token_id not in outputs:
        print('Warning: no <eos> detected ignoring output')
        return None

    eos_index = outputs.index(tokenizer.eos_token_id)
    decoded = tokenizer.decode(outputs[:eos_index])

    # Keep only the response part that follows the answer marker.
    sentinel = "回答:"
    sentinelLoc = decoded.find(sentinel)
    if sentinelLoc < 0:
        print('Warning: Expected prompt template to be emitted.  Ignoring output.')
        return None

    answer = decoded[sentinelLoc+len(sentinel):]
    print(answer)
    return answer

In [None]:
def convert_int(x):
    """Return ``x`` normalised to a decimal integer string, or ``None``.

    ``None`` is returned when ``int(x)`` rejects the value. Only conversion
    errors are caught, so ``KeyboardInterrupt`` and ``SystemExit`` propagate.
    """
    try:
        return str(int(x))
    except (TypeError, ValueError, OverflowError):
        return None

def custom_compute_metrics(res):
    """Compute answer accuracy by regenerating each answer with the model.

    For every example the label token ids up to the first EOS are decoded,
    the text up to and including the answer marker is used as the prompt, and
    the integer after the marker is the gold label. The module-level model
    then generates up to 4 new tokens (with sampling) and the integer found
    after the marker in its output is the prediction. ``res.predictions`` is
    not used.

    Args:
        res: Evaluation result exposing ``label_ids`` (a NumPy array of token
            ids, one row per example).

    Returns:
        ``{'accuracy': n_correct / n_examples}``; ``0.0`` when there are no
        examples.

    Raises:
        AssertionError: If a gold label cannot be parsed as an integer.
        ValueError: If a label row contains no EOS token.
    """
    labels = res.label_ids.tolist()
    n_data = len(labels)
    if n_data == 0:
        return {'accuracy': 0.0}

    sentinel = "回答:"
    # generate() needs its inputs on the same device as the model.
    device = next(model.parameters()).device
    n_correct = 0

    for label_id in labels:
        # Rebuild the prompt and the gold label from the label token ids.
        eos_index = label_id.index(tokenizer.eos_token_id)
        decoded = tokenizer.decode(label_id[:eos_index])
        prompt_end = decoded.find(sentinel) + len(sentinel)
        label = convert_int(decoded[prompt_end:])
        assert label is not None
        encoded = tokenizer(decoded[:prompt_end], return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=encoded.input_ids,
                attention_mask=encoded.attention_mask,
                max_new_tokens=4,
                pad_token_id=tokenizer.eos_token_id,
                do_sample=True,
                temperature=0.9,
                top_p=0.75,
                top_k=4,
                no_repeat_ngram_size=2,
            )
        outputs = outputs[0].tolist()

        # Without an EOS token or an answer marker the example counts as wrong.
        pred = None
        if tokenizer.eos_token_id in outputs:
            eos_index = outputs.index(tokenizer.eos_token_id)
            decoded = tokenizer.decode(outputs[:eos_index])
            # Keep only the response part that follows the answer marker.
            sentinelLoc = decoded.find(sentinel)
            if sentinelLoc >= 0:
                pred = convert_int(decoded[sentinelLoc+len(sentinel):])

        if pred is not None:
            n_correct += int(pred == label)

    return {'accuracy': n_correct / n_data}

# Trainer

In [None]:
# Step intervals for evaluation, checkpointing and logging.
eval_steps = 200
save_steps = 200
logging_steps = 20

# Prepare the trainer
# NOTE(review): train_data and val_data are not defined anywhere in the visible
# part of this notebook — confirm where they are created before running this cell.
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data.remove_columns(['label']),
    eval_dataset=val_data.remove_columns(['label']),
    args=transformers.TrainingArguments(
        num_train_epochs=3,
        learning_rate=3e-4,
        logging_steps=logging_steps,
        evaluation_strategy="steps",
        save_strategy="steps",
        eval_steps=eval_steps,
        save_steps=save_steps,
        output_dir=str(SAVE_DIR),
        report_to="none",
        save_total_limit=3,
        push_to_hub=False,
        auto_find_batch_size=True
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    compute_metrics=custom_compute_metrics,
)

In [None]:
# Run training. use_cache is switched off for training and restored afterwards;
# try/finally restores it even when training is interrupted.
model.config.use_cache = False
try:
    trainer.train()
finally:
    model.config.use_cache = True

# Save the LoRA model
trainer.model.save_pretrained(cfg.peft_name)