<a href="https://colab.research.google.com/github/takedatmh/toyama/blob/main/Toyama_Uni_2_8_FineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Libraries required for LLM fine-tuning.
# `-q` keeps pip's output short; the leading `!` runs the command in the
# notebook's shell rather than in Python.
!pip install -q \
  transformers \
  datasets \
  accelerate \
  bitsandbytes \
  peft \
  sentencepiece \
  scipy \
  evaluate \
  huggingface-hub


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m83.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
from datasets import load_dataset
from huggingface_hub import login
from peft import (
    LoraConfig,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Check which device is available (is the Colab T4 GPU visible?).
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on {device}")

# 1. Prepare the model and tokenizer.
# model_name = "rinna/japanese-gpt2-small"
model_name = "elyza/ELYZA-japanese-Llama-2-7b"

# 8-bit quantization settings.
# `llm_int8_has_fp16_weight` is left at False: the base weights are frozen
# under LoRA, so there is no need to keep 16-bit main weights, and storing
# them as int8 is what actually reduces GPU memory usage.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# `tokenize_fn` below pads to a fixed length, which requires a pad token.
# Fall back to EOS if the checkpoint does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Without this argument `bnb_config` is never applied and the model is
    # loaded as plain fp16.
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,  # dtype for the non-quantized modules (T4 has no bf16)
)

# PEFT's documented preparation step for training adapters on top of a
# quantized base model; must run before `get_peft_model`.
model = prepare_model_for_kbit_training(model)

# # Instead, move the model to the device explicitly after loading.
# # (Not needed with device_map="auto".)
# model.to(device)

# # 2. LoRA settings for GPT-2 style models such as rinna/japanese-gpt2-small (PEFT)
# lora_config = LoraConfig(
#     r=8,
#     lora_alpha=32,
#     target_modules=["c_proj", "c_attn"],  # Changed target modules to match GPT2 architecture
#     lora_dropout=0.05,
#     bias="none",
#     task_type=TaskType.CAUSAL_LM
# )

# 2. LoRA settings for Llama-2 style models (ELYZA).
#   Module name   Role / location
#   q_proj        Attention query projection
#   k_proj        Attention key projection
#   v_proj        Attention value projection
#   o_proj        Attention output projection
#   gate_proj     MLP (FFN) gating layer
#   up_proj       MLP (FFN) intermediate projection
#   down_proj     MLP (FFN) output projection
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # Attention
        "gate_proj", "up_proj", "down_proj",     # MLP
    ],
)

model = get_peft_model(model, lora_config)

# 3. Load and preprocess the dataset.
# NOTE: Yelp reviews are only a placeholder fine-tuning corpus; they should be
# replaced with Japanese text matching the prompts used at inference time.
dataset = load_dataset("yelp_review_full", split="train[:1%]")  # small slice for a quick check


def tokenize_fn(examples):
    """Tokenize a batch of examples, truncating/padding each text to 128 tokens.

    Args:
        examples: A batch from `dataset.map(..., batched=True)`; its "text"
            entry is passed to the tokenizer.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )


# Drop the raw columns ("text" and the star-rating "label"): the causal-LM
# collator below builds its own labels from the input ids, so the
# classification label must not reach the model.
tokenized_dataset = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=dataset.column_names,
)

# 4. Training arguments.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch size of 8 per device
    learning_rate=2e-4,
    num_train_epochs=1,  # ideally around 3 epochs
    fp16=True,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="no",
    report_to="none",
)

# 5. Define the trainer and start training.
# mlm=False selects causal (next-token) language modelling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()


Running on cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/6.72k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/299M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/23.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6500 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,2.651
20,2.4555
30,2.4381
40,2.4621
50,2.4372
60,2.439
70,2.4951
80,2.4277
90,2.409
100,2.3971


TrainOutput(global_step=812, training_loss=2.3820426258547553, metrics={'train_runtime': 934.0883, 'train_samples_per_second': 6.959, 'train_steps_per_second': 0.869, 'total_flos': 3.306328265903309e+16, 'train_loss': 2.3820426258547553, 'epoch': 0.9993846153846154})

# Fine-Tuning後のモデルを利用して推論(Chat)を実行


In [None]:
from transformers import GenerationConfig

# Switch to evaluation mode for inference.
model.eval()

# A more structured prompt (it should match the format the LoRA adapter was
# trained on).
prompt = "質問：日本で一番高い山は何ですか？\n回答："

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# Number of prompt tokens; used below to cut the prompt off the output.
prompt_length = inputs["input_ids"].shape[1]

# Generation settings.
generation_config = GenerationConfig(
    max_new_tokens=128,
    do_sample=True,
    top_p=0.95,
    temperature=0.7,
    repetition_penalty=1.1,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
)

with torch.no_grad():
    output = model.generate(
        **inputs,
        generation_config=generation_config,
    )

# Separate the answer from the prompt at the token level. The returned
# sequence starts with the prompt tokens, so slicing by the prompt length is
# exact. Removing the prompt with a string replace on the decoded text fails
# whenever decoding does not reproduce the prompt byte-for-byte, and would
# also delete any repetition of the prompt inside the answer.
new_tokens = output[0][prompt_length:]
generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
print("🧠 回答:", generated_text.strip())

🧠 回答: 富士山。\n\n僕は昨年まで、40-60日/年に何度も来ていただけに、この評価は結構難しくなりました。\n\nある人が言ったように、「The best way to learn a city is to live in it for one year, but the second best way is to visit it once.\" I'm not sure that the second best way will even do this restaurant justice.\n\nA
