In [None]:
import torch
from transformers import AutoTokenizer, Gemma3ForSequenceClassification

In [None]:
from transformers import AutoTokenizer, Gemma3ForSequenceClassification

In [None]:
from huggingface_hub import login
# Authenticate against the Hugging Face Hub so gated checkpoints can be downloaded.
# NOTE(review): new_session=False presumably reuses an already-stored token instead of
# prompting again — confirm against the huggingface_hub documentation.
login(new_session=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Show the GPU model, driver/CUDA versions and current memory usage of this session.
!nvidia-smi

Tue Sep 30 00:28:51 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   39C    P0             51W /  400W |       5MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
"""
Configuration for training and inference of the Gemma-3 sequence classifier.
"""

# Model configuration
VER = 2  # Version tag; interpolated into WANDB_RUN_NAME below
# NOTE(review): the recorded run output in this notebook reports
# "google/gemma-3-12b-it" (~12.3B parameters) — confirm which checkpoint is intended.
MODEL_NAME = "google/gemma-3-27b-it"
MODEL_TYPE = "gemma3"  # Model family identifier
EPOCHS = 3  # Number of training epochs
MAX_LEN = 512  # Maximum number of tokens per example (longer inputs are truncated)

# Directory settings
OUTPUT_DIR = f"/content/drive/MyDrive/map/gemma-3"

# Training parameters
TRAIN_BATCH_SIZE =8  # Per-device training batch size
EVAL_BATCH_SIZE = 8  # Per-device evaluation batch size
GRADIENT_ACCUMULATION_STEPS = 8  # Effective batch size = TRAIN_BATCH_SIZE * this value
LEARNING_RATE = 2e-4
LOGGING_STEPS = 50
SAVE_STEPS = 229
# Alternative value left by the author: 1772
EVAL_STEPS = 229
# Alternative value left by the author: 1772


# Data paths
TRAIN_DATA_PATH = '/content/drive/MyDrive/map/train.csv'
TEST_DATA_PATH = '/content/drive/MyDrive/map/test.csv'

# Model save paths
BEST_MODEL_PATH = f"{OUTPUT_DIR}/best"
LABEL_ENCODER_PATH = f"{OUTPUT_DIR}/label_encoder.joblib"

# Other settings
RANDOM_SEED = 42
VALIDATION_SPLIT =0.2  # Fraction of the training data held out for validation
# Alternative value left by the author: 0.0000001

# GPU settings
CUDA_VISIBLE_DEVICES = "0"  # GPU device to use. Set to None to use all available GPUs

# Submission settings
SUBMISSION_OUTPUT_PATH = 'submission.csv'

# WandB settings
USE_WANDB = True  # Set to False to disable WandB
WANDB_PROJECT = "gemma3-math-misconceptions"
# NOTE(review): the run name still says "phi-4" although MODEL_NAME is a Gemma-3 checkpoint.
WANDB_RUN_NAME = f"phi-4-ver{VER}"
WANDB_ENTITY = None  # Set your WandB entity (username or team name) if needed

# Early stopping settings
USE_EARLY_STOPPING = True
EARLY_STOPPING_PATIENCE = 10  # Max evaluations without improvement (evaluation runs every EVAL_STEPS)
EARLY_STOPPING_THRESHOLD = 0.001  # Minimum change that counts as an improvement

# LoRA configuration
LORA_RANK = 32  # LoRA rank
LORA_ALPHA = 64  # LoRA scaling parameter (alpha / rank = 2)
LORA_TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]  # Attention and MLP projections to adapt
LORA_DROPOUT = 0.1  # LoRA dropout rate
LORA_BIAS = "none"  # Bias handling: "none", "all", "lora_only"
USE_DORA = False  # Set to True to use DoRA (Weight-Decomposed Low-Rank Adaptation)

# Memory optimization settings
USE_GRADIENT_CHECKPOINTING = True  # Intended toggle for gradient checkpointing
USE_8BIT_ADAM = False  # Use 8-bit Adam optimizer for memory efficiency
MAX_GRAD_NORM = 1.0  # Gradient clipping value

# Attention implementation settings
# "eager": standard PyTorch implementation
# "flash_attention_2": Flash Attention 2 implementation (faster, lower memory)
ATTENTION_IMPLEMENTATION = "eager"  # Options: "eager", "flash_attention_2"

In [None]:
"""
Shared utility functions.
"""

import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from datasets import Dataset
import torch


def prepare_correct_answers(train_data):
    """Derive the presumed-correct multiple-choice answer for every question.

    Rows whose ``Category`` has ``True`` as the text before its first
    underscore are treated as correct answers. For each ``QuestionId`` the
    ``MC_Answer`` that occurs most often among those rows is kept.

    Args:
        train_data: DataFrame with ``QuestionId``, ``MC_Answer`` and
            ``Category`` columns.

    Returns:
        DataFrame with one row per ``QuestionId`` and the columns
        ``QuestionId``, ``MC_Answer`` and ``is_correct`` (always 1), suitable
        for a left merge on ``['QuestionId', 'MC_Answer']``.
    """
    # Vectorised prefix test instead of a Python-level row-wise apply.
    is_true = train_data['Category'].str.split('_').str[0] == 'True'
    correct = train_data.loc[is_true].copy()

    # Frequency of each (question, answer) pair among the correct rows.
    correct['c'] = correct.groupby(['QuestionId', 'MC_Answer']).MC_Answer.transform('count')

    # A stable sort keeps the original row order among equal counts, so the
    # answer picked for a question is deterministic when counts tie.
    correct = correct.sort_values('c', ascending=False, kind='mergesort')

    # The first row per question is now its most frequent correct answer.
    correct = correct.drop_duplicates(['QuestionId'])[['QuestionId', 'MC_Answer']].copy()
    correct['is_correct'] = 1
    return correct


def format_input(row):
    """Render one data row as a Gemma-3 chat-style prompt string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            ``QuestionText``, ``MC_Answer``, ``StudentExplanation`` and
            ``is_correct``.

    Returns:
        The prompt text: a user turn containing the question, the student's
        answer, whether that answer is correct ("Yes"/"No") and the student's
        explanation, followed by the opening of the model turn.
    """
    # Any truthy ``is_correct`` value is reported as "Yes".
    status = "Yes" if row['is_correct'] else "No"

    # An earlier experiment also listed every misconception label as explicit
    # choices inside the prompt; that variant is not part of the prompt.
    # NOTE(review): the prompt hard-codes "<bos>"; if the tokenizer also
    # prepends a BOS token the sequence may start with two of them — confirm.
    segments = [
        "<bos><start_of_turn>user\n",
        "[Mathematical Misconception Analysis Task]\n\n",
        f"Question: {row['QuestionText']}\n",
        f"Student's Answer: {row['MC_Answer']}\n",
        f"Correct?: {status}\n",
        f"Student's Explanation: {row['StudentExplanation']}\n\n",
        "Let me analyze this mathematical misconception...\n",
        "<end_of_turn>\n",
        "<start_of_turn>model\n",
    ]
    return "".join(segments)


def tokenize_dataset(dataset, tokenizer, max_len):
    """Tokenize the ``text`` column of a dataset and expose torch-formatted columns.

    Args:
        dataset: Object offering ``map``, ``column_names`` and ``set_format``
            (used here like a ``datasets.Dataset``) with a ``text`` column.
        tokenizer: Callable tokenizer applied to each batch of texts.
        max_len: Maximum sequence length; longer inputs are truncated.

    Returns:
        The mapped dataset, formatted as torch with ``input_ids``,
        ``attention_mask`` and, when present, ``label``.
    """
    def _encode(batch):
        # No padding here: padding is left to the data collator at batch time.
        return tokenizer(
            batch['text'],
            padding=False,
            truncation=True,
            max_length=max_len,
            return_tensors=None,  # plain Python lists are what ``map`` expects
        )

    tokenized = dataset.map(_encode, batched=True, batch_size=100)

    # Keep the label column only when the dataset actually has one.
    keep = ['input_ids', 'attention_mask']
    if 'label' in tokenized.column_names:
        keep.append('label')
    tokenized.set_format(type='torch', columns=keep)
    return tokenized


def compute_map3(eval_pred):
    """Compute MAP@3 from the top-3 ranked classes of each example.

    An example scores 1, 1/2 or 1/3 when its true label is ranked first,
    second or third respectively, and 0 otherwise; the result is the mean
    over all examples.

    Args:
        eval_pred: ``(logits, labels)`` pair, where ``logits`` is array-like of
            shape (n_examples, n_classes) and ``labels`` holds one class index
            per example.

    Returns:
        Dict ``{"map@3": float}``.
    """
    logits, labels = eval_pred
    scores = np.asarray(logits, dtype=np.float64)
    labels = np.asarray(labels).reshape(-1)

    # Rank on the raw logits. Softmax is monotonic, so it cannot improve the
    # ranking, but in float32 it underflows to exactly 0 for classes far below
    # the maximum, turning distinct logits into ties and mis-ranking the tail.
    # The stable sort makes genuine ties resolve by class index.
    top3 = np.argsort(-scores, axis=1, kind="stable")[:, :3]

    # Each row of ``top3`` holds distinct class ids, so at most one position hits.
    hits = top3 == labels[:, None]
    weights = 1.0 / np.arange(1, top3.shape[1] + 1)
    score = float((hits * weights).sum())
    return {"map@3": score / len(labels)}


def create_submission(predictions, test_data, label_encoder):
    """Build the submission table from model predictions.

    Args:
        predictions: Object whose ``predictions`` attribute holds the logits,
            array-like of shape (n_rows, n_classes).
        test_data: DataFrame with a ``row_id`` column, aligned row-for-row
            with the predictions.
        label_encoder: Fitted encoder whose ``inverse_transform`` maps class
            indices back to label strings.

    Returns:
        DataFrame with ``row_id`` and ``Category:Misconception``; the latter
        holds up to three space-separated labels, best first.
    """
    logits = np.asarray(predictions.predictions, dtype=np.float64)

    # Rank on the raw logits: softmax does not change the order but can
    # underflow to 0 in float32 and collapse low-scoring classes into ties.
    top3 = np.argsort(-logits, axis=1, kind="stable")[:, :3]

    # Decode all indices in one call, then restore the (n_rows, k) layout.
    decoded = np.asarray(label_encoder.inverse_transform(top3.ravel()))
    top3_labels = decoded.reshape(top3.shape)
    pred_strings = [" ".join(row) for row in top3_labels]

    submission = pd.DataFrame({
        'row_id': test_data.row_id.values,
        'Category:Misconception': pred_strings
    })
    return submission


In [None]:
"""
Custom padding data collator for the sequence-classification model.
"""
import torch
from dataclasses import dataclass
from typing import Dict, List, Union
from transformers import PreTrainedTokenizerBase
from transformers.file_utils import PaddingStrategy


@dataclass
class DataCollatorWithPadding:
    """
    Collate tokenized examples into a batch, right-padding to the longest example.

    ``input_ids`` are padded with the tokenizer's ``pad_token_id`` and
    ``attention_mask`` with 0. A per-example ``label`` is stacked and returned
    under the key ``labels``. Any other feature keys are left out of the batch.

    The ``padding``, ``max_length`` and ``pad_to_multiple_of`` fields are
    accepted but not read by ``__call__``.
    """
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: int = None
    pad_to_multiple_of: int = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Every sequence in the batch is padded up to the longest one.
        longest = max(len(example["input_ids"]) for example in features)

        # Value appended to the right of each padded field.
        fill_values = {"input_ids": self.tokenizer.pad_token_id, "attention_mask": 0}

        batch = {}
        for name in features[0].keys():
            if name == "label":
                # Labels are scalars per example and need no padding.
                batch[name] = torch.tensor([example[name] for example in features], dtype=torch.long)
                continue
            if name not in fill_values:
                # Only input_ids and attention_mask are carried into the batch.
                continue

            rows = []
            for example in features:
                values = example[name]
                if torch.is_tensor(values):
                    # Work on plain lists so padding is simple concatenation.
                    values = values.tolist()
                rows.append(values + [fill_values[name]] * (longest - len(values)))
            batch[name] = torch.tensor(rows, dtype=torch.long)

        # The Trainer expects the target under "labels", not "label".
        if "label" in batch:
            batch["labels"] = batch.pop("label")

        return batch

In [None]:
"""
Training script: LoRA fine-tuning of the configured model for sequence classification.
"""

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    AutoConfig
)
from datasets import Dataset
import joblib
import torch
import torch.nn as nn
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from transformers import AutoModel
import wandb
from transformers import EarlyStoppingCallback, TrainerCallback
import gc

# # カスタムモジュールのインポート
# from config import *
# from utils import prepare_correct_answers, format_input, tokenize_dataset, compute_map3
# from data_collator import DataCollatorWithPadding


class SaveBestMap3Callback(TrainerCallback):
    """Trainer callback that saves the model whenever ``eval_map@3`` reaches a new best.

    The model and tokenizer are written to ``<save_dir>/best_map3``, which is
    overwritten on every improvement.
    """

    def __init__(self, save_dir, tokenizer):
        self.save_dir = save_dir
        self.tokenizer = tokenizer
        self.best_map3 = 0.0  # best score seen so far; only strictly higher scores trigger a save

    def on_evaluate(self, args, state, control, metrics, model=None, **kwargs):
        """Report the current MAP@3 and persist the model if it beats the best so far."""
        score = metrics.get('eval_map@3', 0.0)
        step = state.global_step
        total_steps = state.max_steps if state.max_steps else "N/A"

        print(f"\n[Step {step}/{total_steps}] 評価実行 - MAP@3スコア: {score:.4f}")

        # No improvement: report the standing best and leave the saved model untouched.
        if not score > self.best_map3:
            print(f"現在のベストMAP@3スコア: {self.best_map3:.4f} (変更なし)")
            return control

        self.best_map3 = score

        # Dedicated directory for the best-scoring weights.
        best_map3_path = os.path.join(self.save_dir, 'best_map3')
        os.makedirs(best_map3_path, exist_ok=True)

        # With a PEFT-wrapped model, save_pretrained stores only the adapter weights.
        model.save_pretrained(best_map3_path)
        self.tokenizer.save_pretrained(best_map3_path)

        print(f"🎉 新しいベストMAP@3スコア更新: {score:.4f} (Step {step}) - モデルを {best_map3_path} に保存しました")
        return control


class Phi4ForSequenceClassification(nn.Module):
    """Backbone loaded through ``AutoModel`` with a linear classification head.

    The sequence representation is the hidden state of the last attended
    token of each example, followed by dropout and a linear layer.
    """

    def __init__(self, model_name, num_labels, attn_implementation="eager"):
        """Load the backbone and create the classification head.

        Args:
            model_name: Checkpoint name or path passed to ``AutoModel.from_pretrained``.
            num_labels: Number of output classes.
            attn_implementation: Attention backend forwarded to the backbone.
        """
        super().__init__()
        from transformers import AutoModel
        self.phi = AutoModel.from_pretrained(
            model_name,
            trust_remote_code=True,
            attn_implementation=attn_implementation
        )
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.phi.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Classify each sequence in the batch.

        Args:
            input_ids: Token ids of shape (batch, seq_len).
            attention_mask: Optional mask of shape (batch, seq_len), non-zero
                for real tokens. When omitted, the final position is pooled.
            labels: Optional class indices; when given, cross-entropy loss is computed.

        Returns:
            An object with ``loss`` (``None`` when no labels are given) and ``logits``.
        """
        outputs = self.phi(input_ids=input_ids, attention_mask=attention_mask)
        hidden = outputs.last_hidden_state

        if attention_mask is None:
            pooled_output = hidden[:, -1, :]
        else:
            # Pool at the last *attended* position of each row. Taking index -1
            # would read a PAD position for right-padded batches. Multiplying
            # the mask by the position index and taking argmax finds the last
            # real token under either left or right padding.
            positions = torch.arange(hidden.size(1), device=hidden.device)
            last_idx = (attention_mask.to(positions.dtype) * positions).argmax(dim=1)
            batch_idx = torch.arange(hidden.size(0), device=hidden.device)
            pooled_output = hidden[batch_idx, last_idx]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        # Lightweight ad-hoc result object exposing ``loss`` and ``logits``.
        return type('Output', (), {'loss': loss, 'logits': logits})()


def main():
    """Fine-tune the configured model with LoRA for misconception classification.

    Steps, in order: start the WandB run (optional), load and label-encode
    the training CSV, build prompts, tokenize, split into train/validation,
    load the sequence-classification model in bf16, wrap it with a LoRA
    adapter, train with the Hugging Face ``Trainer`` (evaluating MAP@3 every
    ``EVAL_STEPS``), run a final evaluation and save the adapter, tokenizer
    and label encoder under ``OUTPUT_DIR``.

    All settings are read from the module-level configuration constants.
    Returns nothing; results are written to disk and printed.
    """

    # Header kept for log continuity (the config.py dump that used to follow is disabled).
    print("=" * 80)
    print("Configuration Settings (config.py):")

    # Initialise Weights & Biases
    if USE_WANDB:
        wandb.init(
            project=WANDB_PROJECT,
            name=WANDB_RUN_NAME,
            entity=WANDB_ENTITY,
            config={
                "model_name": MODEL_NAME,
                "epochs": EPOCHS,
                "max_len": MAX_LEN,
                "train_batch_size": TRAIN_BATCH_SIZE,
                "eval_batch_size": EVAL_BATCH_SIZE,
                "learning_rate": LEARNING_RATE,
                "early_stopping_patience": EARLY_STOPPING_PATIENCE if USE_EARLY_STOPPING else None,
                "lora_rank": LORA_RANK,
                "lora_alpha": LORA_ALPHA,
                "lora_target_modules": LORA_TARGET_MODULES,
                "lora_dropout": LORA_DROPOUT,
                "lora_bias": LORA_BIAS,
                "use_dora": USE_DORA,
                "attention_implementation": ATTENTION_IMPLEMENTATION,
            }
        )

    # GPU selection
    if CUDA_VISIBLE_DEVICES is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = CUDA_VISIBLE_DEVICES
        print(f"Using CUDA device(s): {CUDA_VISIBLE_DEVICES}")

    # Clear cached GPU memory and collect garbage before loading the model
    torch.cuda.empty_cache()
    gc.collect()

    # Create the output directory
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # --- Load and preprocess the data ---
    print("Loading and preprocessing training data...")
    le = LabelEncoder()
    train = pd.read_csv(TRAIN_DATA_PATH)
    train.Misconception = train.Misconception.fillna('NA')
    # The classification target is the "Category:Misconception" pair.
    train['target'] = train.Category + ":" + train.Misconception
    train['label'] = le.fit_transform(train['target'])
    n_classes = len(le.classes_)
    print(f"Train shape: {train.shape} with {n_classes} target classes")

    # --- Feature engineering ---
    print("Performing feature engineering...")
    correct = prepare_correct_answers(train)
    train = train.merge(correct, on=['QuestionId','MC_Answer'], how='left')
    # Rows that did not match a known-correct answer get is_correct = 0.
    train.is_correct = train.is_correct.fillna(0)

    # --- Format the input text ---
    print("Formatting input text...")
    train['text'] = train.apply(format_input, axis=1)
    print("Example prompt for our LLM:")
    print(train.text.values[0])

    # --- Initialise the tokenizer ---
    print("Initializing tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

    # Padding token: fall back to the EOS token when the tokenizer defines
    # none. (A hard-coded Phi-4 token string and id would be wrong for any
    # other vocabulary.)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # --- Token length analysis ---
    print("Analyzing token lengths...")
    lengths = [len(tokenizer.encode(t, truncation=False)) for t in train['text']]
    plt.figure(figsize=(10, 6))
    plt.hist(lengths, bins=50)
    plt.title("Token Length Distribution")
    plt.xlabel("Number of tokens")
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.savefig(f'{OUTPUT_DIR}/token_length_distribution.png')
    plt.close()

    over_limit = (np.array(lengths) > MAX_LEN).sum()
    print(f"There are {over_limit} train sample(s) with more than {MAX_LEN} tokens")

    # --- Train/validation split ---
    print("Splitting data into train and validation sets...")
    train_df, val_df = train_test_split(train, test_size=VALIDATION_SPLIT, random_state=RANDOM_SEED)
    COLS = ['text','label']
    train_ds = Dataset.from_pandas(train_df[COLS])
    val_ds = Dataset.from_pandas(val_df[COLS])

    # --- Tokenize the datasets ---
    print("Tokenizing datasets...")
    train_ds = tokenize_dataset(train_ds, tokenizer, MAX_LEN)
    val_ds = tokenize_dataset(val_ds, tokenizer, MAX_LEN)

    # --- Save the label encoder ---
    print(f"Saving label encoder to: {LABEL_ENCODER_PATH}")
    joblib.dump(le, LABEL_ENCODER_PATH)

    # --- Initialise the model ---
    print("Initializing model...")
    print(f"Using attention implementation: {ATTENTION_IMPLEMENTATION}")
    model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=n_classes,
            trust_remote_code=True,
            device_map=None,  # no automatic device mapping
            torch_dtype=torch.bfloat16,  # load weights in BF16
            low_cpu_mem_usage=True,  # reduce CPU memory use while loading
            attn_implementation=ATTENTION_IMPLEMENTATION  # attention backend
        )
    # The classification head locates the last non-pad token via this id.
    model.config.pad_token_id = tokenizer.pad_token_id

    # --- Configure the LoRA adapter ---
    print("Configuring LoRA adapter...")
    lora_config = LoraConfig(
        r=LORA_RANK,  # LoRA rank
        lora_alpha=LORA_ALPHA,  # LoRA scaling parameter
        target_modules=LORA_TARGET_MODULES,  # modules to adapt
        lora_dropout=LORA_DROPOUT,
        bias=LORA_BIAS,
        task_type=TaskType.SEQ_CLS,
        use_dora=USE_DORA  # whether to use DoRA
    )

    # Wrap the base model with the PEFT adapter
    model = get_peft_model(model, lora_config)
    print("Number of trainable parameters:")
    model.print_trainable_parameters()

    if USE_GRADIENT_CHECKPOINTING:
        # Make the embedding outputs require grad so checkpointed segments
        # still receive gradients while the base weights are frozen.
        if hasattr(model, 'enable_input_require_grads'):
            model.enable_input_require_grads()

        # Enable gradient checkpointing on the underlying model
        if hasattr(model.base_model, 'gradient_checkpointing_enable'):
            model.base_model.gradient_checkpointing_enable()
        elif hasattr(model, 'gradient_checkpointing_enable'):
            model.gradient_checkpointing_enable()

    # Place the model on a single GPU
    if torch.cuda.is_available():
        model = model.cuda()

    # Allow TF32 matmul/cuDNN kernels
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # --- Training arguments ---
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        do_train=True,
        do_eval=True,
        eval_strategy="steps",
        save_strategy="steps",
        eval_steps=EVAL_STEPS,
        save_steps=SAVE_STEPS,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=EVAL_BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        logging_dir=f"{OUTPUT_DIR}/logs",
        logging_steps=LOGGING_STEPS,
        metric_for_best_model="map@3",
        greater_is_better=True,
        load_best_model_at_end=True,
        report_to="wandb" if USE_WANDB else "none",
        bf16=True,  # train in BF16
        gradient_checkpointing=USE_GRADIENT_CHECKPOINTING,  # honour the config flag
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        remove_unused_columns=False,  # the collator selects the fields it needs
        lr_scheduler_type="cosine",
        warmup_ratio=0.0,  # no warmup
        save_total_limit=2,
        max_grad_norm=MAX_GRAD_NORM,  # gradient clipping
        optim="adamw_bnb_8bit" if USE_8BIT_ADAM else "adamw_torch",
        seed=RANDOM_SEED,  # same seed as the train/validation split
    )

    # --- Trainer setup and training ---
    print("Setting up trainer...")

    # Estimate optimizer steps per epoch for the log (single device assumed).
    # Ceiling division at both levels: a partial final batch and a partial
    # final accumulation group each still produce a step.
    # NOTE(review): the Trainer's own step count is authoritative — confirm
    # the rounding against the installed transformers version.
    batches_per_epoch = -(-len(train_ds) // TRAIN_BATCH_SIZE)
    steps_per_epoch = max(-(-batches_per_epoch // GRADIENT_ACCUMULATION_STEPS), 1)
    total_steps = steps_per_epoch * EPOCHS
    print(f"\nDataset statistics:")
    print(f"Training samples: {len(train_ds)}")
    print(f"Validation samples: {len(val_ds)}")
    print(f"Batch size: {TRAIN_BATCH_SIZE} (with gradient accumulation: {TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS})")
    print(f"Steps per epoch: {steps_per_epoch}")
    print(f"Total training steps: {total_steps}")
    print(f"Evaluation interval: every {EVAL_STEPS} steps (~{EVAL_STEPS/steps_per_epoch:.2f} epochs)")
    print(f"Early stopping after {EARLY_STOPPING_PATIENCE} evaluations without improvement")

    # Custom collator that pads each batch to its longest sequence
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=MAX_LEN)

    callbacks = []

    # Save the adapter whenever MAP@3 improves
    save_best_callback = SaveBestMap3Callback(save_dir=OUTPUT_DIR, tokenizer=tokenizer)
    callbacks.append(save_best_callback)
    print(f"SaveBestMap3Callback enabled - モデルは {OUTPUT_DIR}/best_map3 に保存されます")

    if USE_EARLY_STOPPING:
        # EARLY_STOPPING_PATIENCE is used directly as a number of evaluations
        early_stopping_callback = EarlyStoppingCallback(
            early_stopping_patience=EARLY_STOPPING_PATIENCE,
            early_stopping_threshold=EARLY_STOPPING_THRESHOLD
        )
        callbacks.append(early_stopping_callback)
        print(f"Early stopping enabled:")
        print(f"  - Patience (evaluations without improvement): {EARLY_STOPPING_PATIENCE}")
        print(f"  - Threshold: {EARLY_STOPPING_THRESHOLD}")

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        processing_class=tokenizer,  # replaces the deprecated `tokenizer=` argument
        data_collator=data_collator,
        compute_metrics=compute_map3,
        callbacks=callbacks,
    )

    print("Starting training...")
    trainer.train()

    # --- Final evaluation after training ---
    print("\n" + "="*60)
    print("トレーニング完了 - 最終評価を実行中...")
    print("="*60)
    final_eval_results = trainer.evaluate()
    final_map3 = final_eval_results.get('eval_map@3', 0.0)
    print(f"\n🏁 最終評価結果:")
    print(f"   最終MAP@3スコア: {final_map3:.4f}")
    print(f"   全体のベストMAP@3スコア: {save_best_callback.best_map3:.4f}")

    # Safety net: save explicitly if the final evaluation is a new best.
    # NOTE(review): trainer.evaluate() presumably already fired the callback's
    # on_evaluate, in which case this branch never runs — confirm.
    if final_map3 > save_best_callback.best_map3:
        print(f"🎉 最終評価で新しいベストスコア達成！ {final_map3:.4f} > {save_best_callback.best_map3:.4f}")
        save_best_callback.best_map3 = final_map3
        best_map3_path = os.path.join(OUTPUT_DIR, 'best_map3')
        os.makedirs(best_map3_path, exist_ok=True)
        model.save_pretrained(best_map3_path)
        tokenizer.save_pretrained(best_map3_path)
        print(f"   最終ベストモデルを {best_map3_path} に保存しました")

    # --- Save the model ---
    print("\nSaving model...")
    # Only the LoRA adapter is written by save_pretrained on the PEFT model
    model.save_pretrained(BEST_MODEL_PATH)
    # Save the tokenizer alongside it
    tokenizer.save_pretrained(BEST_MODEL_PATH)

    print("Training completed successfully!")
    print(f"Model saved to: {BEST_MODEL_PATH}")
    print(f"Label encoder saved to: {LABEL_ENCODER_PATH}")

    # Close the WandB run
    if USE_WANDB:
        wandb.finish()


# In a notebook kernel __name__ is "__main__", so executing this cell starts training.
if __name__ == "__main__":
    main()

Configuration Settings (config.py):


[34m[1mwandb[0m: Currently logged in as: [33mmasakazu[0m ([33mISIC[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Using CUDA device(s): 0
Loading and preprocessing training data...
Train shape: (36696, 9) with 65 target classes
Performing feature engineering...
Formatting input text...
Example prompt for our LLM:
<bos><start_of_turn>user
[Mathematical Misconception Analysis Task]

Question: What fraction of the shape is not shaded? Give your answer in its simplest form. [Image: A triangle split into 9 equal smaller triangles. 6 of them are shaded.]
Student's Answer: \( \frac{1}{3} \)
Correct?: Yes
Student's Explanation: 0ne third is equal to tree nineth

Let me analyze this mathematical misconception...
<end_of_turn>
<start_of_turn>model

Initializing tokenizer...


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Analyzing token lengths...
There are 0 train sample(s) with more than 512 tokens
Splitting data into train and validation sets...
Tokenizing datasets...


Map:   0%|          | 0/29356 [00:00<?, ? examples/s]

Map:   0%|          | 0/7340 [00:00<?, ? examples/s]

Saving label encoder to: /content/drive/MyDrive/map/gemma-3/label_encoder.joblib
Initializing model...
Using attention implementation: eager


config.json:   0%|          | 0.00/916 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/109k [00:00<?, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/4.60G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Some weights of Gemma3ForSequenceClassification were not initialized from the model checkpoint at google/gemma-3-12b-it and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Configuring LoRA adapter...
Number of trainable parameters:
trainable params: 137,162,496 || all params: 12,324,737,136 || trainable%: 1.1129


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1, 'bos_token_id': 2}.


Setting up trainer...

Dataset statistics:
Training samples: 29356
Validation samples: 7340
Batch size: 8 (with gradient accumulation: 64)
Steps per epoch: 458
Total training steps: 1374
Evaluation interval: every 229 steps (~0.50 epochs)
Early stopping after 10 evaluations without improvement
SaveBestMap3Callback enabled - モデルは /content/drive/MyDrive/map/gemma-3/best_map3 に保存されます
Early stopping enabled:
  - Patience (evaluations without improvement): 10
  - Threshold: 0.001
Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss,Map@3
229,4.8531,0.477143,0.912988
458,3.5114,0.423126,0.92257
687,2.6792,0.344717,0.936649
916,2.2109,0.300867,0.943029
1145,1.3197,0.336024,0.947048
1374,1.0737,0.327338,0.947548



[Step 229/1377] 評価実行 - MAP@3スコア: 0.9130
🎉 新しいベストMAP@3スコア更新: 0.9130 (Step 229) - モデルを /content/drive/MyDrive/map/gemma-3/best_map3 に保存しました

[Step 458/1377] 評価実行 - MAP@3スコア: 0.9226
🎉 新しいベストMAP@3スコア更新: 0.9226 (Step 458) - モデルを /content/drive/MyDrive/map/gemma-3/best_map3 に保存しました

[Step 687/1377] 評価実行 - MAP@3スコア: 0.9366
🎉 新しいベストMAP@3スコア更新: 0.9366 (Step 687) - モデルを /content/drive/MyDrive/map/gemma-3/best_map3 に保存しました

[Step 916/1377] 評価実行 - MAP@3スコア: 0.9430
🎉 新しいベストMAP@3スコア更新: 0.9430 (Step 916) - モデルを /content/drive/MyDrive/map/gemma-3/best_map3 に保存しました

[Step 1145/1377] 評価実行 - MAP@3スコア: 0.9470
🎉 新しいベストMAP@3スコア更新: 0.9470 (Step 1145) - モデルを /content/drive/MyDrive/map/gemma-3/best_map3 に保存しました

[Step 1374/1377] 評価実行 - MAP@3スコア: 0.9475
🎉 新しいベストMAP@3スコア更新: 0.9475 (Step 1374) - モデルを /content/drive/MyDrive/map/gemma-3/best_map3 に保存しました

トレーニング完了 - 最終評価を実行中...



[Step 1377/1377] 評価実行 - MAP@3スコア: 0.9475
現在のベストMAP@3スコア: 0.9475 (変更なし)

🏁 最終評価結果:
   最終MAP@3スコア: 0.9475
   全体のベストMAP@3スコア: 0.9475

Saving model...
Training completed successfully!
Model saved to: /content/drive/MyDrive/map/gemma-3/best
Label encoder saved to: /content/drive/MyDrive/map/gemma-3/label_encoder.joblib


0,1
eval/loss,█▆▃▁▂▂▂
eval/map@3,▁▃▆▇███
eval/runtime,█▄▂▁▁▃▃
eval/samples_per_second,▁▅▇██▆▆
eval/steps_per_second,▁▅▇██▆▆
train/epoch,▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇█████
train/global_step,▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇█████
train/grad_norm,█▄▇▄▂▄▂▂▃▄▄▂▂▁▂▁▁▂▃▂▂▂▂▂▁▂▂
train/learning_rate,████▇▇▇▇▆▆▆▅▅▄▄▄▃▃▃▂▂▂▁▁▁▁▁
train/loss,█▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/loss,0.32734
eval/map@3,0.94755
eval/runtime,211.0655
eval/samples_per_second,34.776
eval/steps_per_second,4.349
total_flos,8.087918423930392e+17
train/epoch,3
train/global_step,1377
train/grad_norm,19.46146
train/learning_rate,0.0


In [None]:
from google.colab import runtime

# NOTE(review): presumably disconnects and releases the Colab runtime once the
# preceding cells have finished, so the GPU stops being billed — confirm.
runtime.unassign()