In [None]:
"""
設定ファイル - Phi-4 モデルのトレーニングと推論用設定
"""

# Model configuration
VER = 2
MODEL_NAME = "microsoft/Phi-4"
MODEL_TYPE = "phi"  # Phi-4 model type
EPOCHS = 2  # Reduce epochs for initial testing
MAX_LEN = 250  # Phi-4 supports longer context

# Directory settings
OUTPUT_DIR = f"/content/drive/MyDrive/map/phi-4-2epoch"

# Training parameters
TRAIN_BATCH_SIZE =4  # Smaller batch size for Phi-4
EVAL_BATCH_SIZE = 8  # Eval batch size
GRADIENT_ACCUMULATION_STEPS = 16  # Increased for effective batch size
LEARNING_RATE = 2e-4
LOGGING_STEPS = 50
SAVE_STEPS = 229

# 1772
EVAL_STEPS = 229
# 1772


# Data paths
TRAIN_DATA_PATH = '/content/drive/MyDrive/map/train.csv'
TEST_DATA_PATH = '/content/drive/MyDrive/map/test.csv'

# Model save paths
BEST_MODEL_PATH = f"{OUTPUT_DIR}/best"
LABEL_ENCODER_PATH = f"{OUTPUT_DIR}/label_encoder.joblib"

# Other settings
RANDOM_SEED = 42
VALIDATION_SPLIT = 0.2
# 0.0000001

# GPU settings
CUDA_VISIBLE_DEVICES = "0"  # GPU device to use. Set to None to use all available GPUs

# Submission settings
SUBMISSION_OUTPUT_PATH = 'submission.csv'

# WandB settings
USE_WANDB = True  # Set to False to disable WandB
WANDB_PROJECT = "phi-4-math-misconceptions"
WANDB_RUN_NAME = f"phi-4-ver{VER}"
WANDB_ENTITY = None  # Set your WandB entity (username or team name) if needed

# Early stopping settings
USE_EARLY_STOPPING = True
EARLY_STOPPING_PATIENCE = 10  # 改善が見られない評価回数の上限（評価はEVAL_STEPSごとに実行される）
EARLY_STOPPING_THRESHOLD = 0.001  # 改善とみなす最小変化量

# LoRA configuration for Phi-4
LORA_RANK = 128  # LoRAのランク - optimized for Phi-4
LORA_ALPHA = 256  # LoRAのスケーリングパラメータ - 1:1 ratio with rank
LORA_TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]  # Phi-4 target modules
LORA_DROPOUT = 0.1  # LoRAのドロップアウト率 - reduced for Phi-4
LORA_BIAS = "none"  # biasの扱い: "none", "all", "lora_only"
USE_DORA = False  # DoRA (Weight-Decomposed Low-Rank Adaptation) を使用する場合はTrue

# Memory optimization settings
USE_GRADIENT_CHECKPOINTING = True  # Enable gradient checkpointing
USE_8BIT_ADAM = False  # Use 8-bit Adam optimizer for memory efficiency
MAX_GRAD_NORM = 1.0  # Gradient clipping value

ATTENTION_IMPLEMENTATION = "eager"  # Options: "eager", "flash_attention_2"

In [None]:
MAPPING_PATH = '/content/drive/MyDrive/map/question_answer_choice_mapping.csv'

In [1]:
"""
共通ユーティリティ関数 - 選択肢付きバージョン
"""

import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from datasets import Dataset
import torch


def prepare_correct_answers(train_data):
    """正解答案データを準備"""
    idx = train_data.apply(lambda row: row.Category.split('_')[0] == 'True', axis=1)
    correct = train_data.loc[idx].copy()
    correct['c'] = correct.groupby(['QuestionId','MC_Answer']).MC_Answer.transform('count')
    correct = correct.sort_values('c', ascending=False)
    correct = correct.drop_duplicates(['QuestionId'])[['QuestionId','MC_Answer']]
    correct['is_correct'] = 1
    return correct


def prepare_answer_choices(train_data, mapping_file=MAPPING_PATH):
    """各問題のMC_Answer選択肢を準備（マッピングファイルを使用、小文字ラベル）"""
    # マッピングファイルを読み込み
    mapping_df = pd.read_csv(mapping_file)

    # 各QuestionIdごとに選択肢を作成
    choices_list = []

    for question_id in train_data['QuestionId'].unique():
        # 該当QuestionIdのマッピングを取得
        question_mapping = mapping_df[mapping_df['QuestionId'] == question_id].copy()

        if len(question_mapping) > 0:
            # Choice（A,B,C,D）でソート
            question_mapping = question_mapping.sort_values('Choice')
            # 選択肢文字列を作成（小文字ラベル）
            choice_items = []
            choice_mapping = {}  # MC_Answer -> choice label のマッピング
            for _, row in question_mapping.iterrows():
                lowercase_choice = row['Choice'].lower()  # A -> a, B -> b, etc.
                choice_items.append(f"{lowercase_choice}. {row['MC_Answer']}")
                choice_mapping[row['MC_Answer']] = lowercase_choice
            answer_choices_str = '\n'.join(choice_items)
        else:
            # マッピングがない場合は従来の番号方式にフォールバック
            question_answers = train_data[train_data['QuestionId'] == question_id]['MC_Answer'].unique()
            choice_items = []
            choice_mapping = {}
            for i, ans in enumerate(question_answers):
                lowercase_choice = chr(ord('a') + i)  # a, b, c, d, ...
                choice_items.append(f"{lowercase_choice}. {ans}")
                choice_mapping[ans] = lowercase_choice
            answer_choices_str = '\n'.join(choice_items)

        choices_list.append({
            'QuestionId': question_id,
            'answer_choices_str': answer_choices_str,
            'choice_mapping': choice_mapping  # MC_Answer -> choice label のマッピングも保存
        })

    choices = pd.DataFrame(choices_list)
    return choices


def format_input(row):
    """入力データをモデル用プロンプトにフォーマット（選択肢付き、回答をラベルに変換）"""
    if row["is_correct"]:
        status = "Yes"
    else:
        status = "No"

    # MC_Answerを選択肢ラベル（a, b, c, d）に変換
    student_answer_label = row.get('choice_label', row['MC_Answer'])  # フォールバック

    # Qwen2.5-Math用の数学タスクに特化したプロンプト（選択肢付き）
    prompt = (
        "<|user|>\n"
        f"[Mathematical Misconception Analysis Task]\n\n"
        f"Question: {row['QuestionText']}\n"
        f"Answer: {row['MC_Answer']}\n"
        f"Correct?: {status}\n"
        f"Explanation: {row['StudentExplanation']}\n"
        "<|end|>\n"
        "<|assistant|>\n"
        "<think>\n"
        "Let me analyze this mathematical misconception...\n"
        "</think>\n\n"
    )
    return prompt


def tokenize_dataset(dataset, tokenizer, max_len):
    """データセットをトークナイズ"""
    def tokenize(batch):
        # パディングはDataCollatorで行うため、ここではトークナイズのみ
        return tokenizer(
            batch['text'],
            padding=False,  # パディングはDataCollatorに任せる
            truncation=True,
            max_length=max_len,
            return_tensors=None  # map時は'None'を使用
        )

    dataset = dataset.map(tokenize, batched=True, batch_size=100)
    # columnsの設定時にlabelを保持
    columns = ['input_ids', 'attention_mask', 'label'] if 'label' in dataset.column_names else ['input_ids', 'attention_mask']
    dataset.set_format(type='torch', columns=columns)
    return dataset


def compute_map3(eval_pred):
    """Top-3 予測に基づくMAP@3を計算"""
    print(f"[DEBUG] *** compute_map3 function called! ***")
    print(f"[DEBUG] eval_pred type: {type(eval_pred)}")
    print(f"[DEBUG] eval_pred: {eval_pred}")

    try:
        logits, labels = eval_pred
        print(f"[DEBUG] compute_map3 called with logits shape: {logits.shape}, labels shape: {labels.shape}")

        probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
        top3 = np.argsort(-probs, axis=1)[:, :3]
        score = 0.0
        for i, label in enumerate(labels):
            ranks = top3[i]
            if ranks[0] == label:
                score += 1.0
            elif ranks[1] == label:
                score += 1.0 / 2
            elif ranks[2] == label:
                score += 1.0 / 3

        map3_score = score / len(labels)
        result = {"eval_map@3": map3_score}
        print(f"[DEBUG] compute_map3 returning: {result}")
        print(f"[DEBUG] *** compute_map3 function completed successfully! ***")
        return result

    except Exception as e:
        print(f"[ERROR] compute_map3 failed: {e}")
        print(f"[ERROR] Exception type: {type(e)}")
        import traceback
        print(f"[ERROR] Full traceback: {traceback.format_exc()}")
        return {"eval_map@3": 0.0}


def create_submission(predictions, test_data, label_encoder):
    """予測結果から提出用ファイルを作成"""
    probs = torch.nn.functional.softmax(torch.tensor(predictions.predictions), dim=1).numpy()
    top3 = np.argsort(-probs, axis=1)[:, :3]
    flat = top3.flatten()
    decoded = label_encoder.inverse_transform(flat)
    top3_labels = decoded.reshape(top3.shape)
    pred_strings = [" ".join(r) for r in top3_labels]

    submission = pd.DataFrame({
        'row_id': test_data.row_id.values,
        'Category:Misconception': pred_strings
    })
    return submission

NameError: name 'MAPPING_PATH' is not defined

In [None]:
"""
カスタムデータコレーター for Qwen3モデル
"""
import torch
from dataclasses import dataclass
from typing import Dict, List, Union
from transformers import PreTrainedTokenizerBase
from transformers.file_utils import PaddingStrategy


@dataclass
class DataCollatorWithPadding:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: int = None
    pad_to_multiple_of: int = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # バッチの最大長を取得
        max_length = max(len(feature["input_ids"]) for feature in features)

        # パディング
        batch = {}
        for key in features[0].keys():
            if key == "label":
                # ラベルはパディング不要
                batch[key] = torch.tensor([f[key] for f in features], dtype=torch.long)
            elif key in ["input_ids", "attention_mask"]:
                # input_idsとattention_maskをパディング
                padded = []
                for feature in features:
                    # tensorをlistに変換
                    if torch.is_tensor(feature[key]):
                        feature_list = feature[key].tolist()
                    else:
                        feature_list = feature[key]

                    remainder = [self.tokenizer.pad_token_id if key == "input_ids" else 0] * (max_length - len(feature_list))
                    padded_feature = feature_list + remainder
                    padded.append(padded_feature)
                batch[key] = torch.tensor(padded, dtype=torch.long)

        # labelsフィールドを追加（Trainerが期待するため）
        if "label" in batch:
            batch["labels"] = batch.pop("label")  # labelを削除してlabelsに変更

        return batch

In [None]:
"""
Qwen-3-0.6B モデルトレーニングスクリプト - QuestionIDベースのマスク付き損失関数バージョン
"""

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    AutoConfig
)
from transformers.modeling_outputs import SequenceClassifierOutput
from datasets import Dataset
import joblib
import torch
import torch.nn as nn
import torch.nn.functional as F
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from transformers import AutoModel
import wandb
from transformers import EarlyStoppingCallback, TrainerCallback
import pickle
from collections import defaultdict

# # カスタムモジュールのインポート（選択肢付きバージョン）
# from config import *
# from utils_with_choices import prepare_correct_answers, prepare_answer_choices, format_input, compute_map3
# from data_collator import DataCollatorWithPadding


class SaveBestMap3Callback(TrainerCallback):
    """eval_map@3が最高値を更新した際にモデルを保存するコールバック"""
    def __init__(self, save_dir, tokenizer):
        self.save_dir = save_dir
        self.tokenizer = tokenizer
        self.best_map3 = 0.0

    def on_evaluate(self, args, state, control, metrics, model=None, **kwargs):
        current_map3 = metrics.get('eval_map@3', 0.0)

        if current_map3 > self.best_map3:
            self.best_map3 = current_map3

            # 専用ディレクトリに保存
            best_map3_path = os.path.join(self.save_dir, 'best_map3')
            os.makedirs(best_map3_path, exist_ok=True)

            # LoRAアダプターのみを保存
            model.save_pretrained(best_map3_path)
            self.tokenizer.save_pretrained(best_map3_path)

            print(f"\n新しいベストMAP@3スコア: {current_map3:.4f} - モデルを {best_map3_path} に保存しました")

        return control


class Qwen2ForSequenceClassificationWithMaskedLoss(nn.Module):
    """Qwen2モデルを分類タスク用にカスタマイズ - マスク付き損失関数版"""
    def __init__(self, model_name, num_labels, question_label_map=None,attn_implementation="eager"):
        super().__init__()
        from transformers import AutoModel
        # self.qwen = AutoModel.from_pretrained(model_name, trust_remote_code=True)
        # self.phi = AutoModel.from_pretrained(
        #     model_name,
        #     trust_remote_code=True,
        #     attn_implementation=attn_implementation
        # )
        self.qwen  = AutoModel.from_pretrained(
            model_name,
            trust_remote_code=True,
            attn_implementation=attn_implementation,
            device_map="auto",
            # load_in_8bit=True,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,

        )
        if hasattr(self.qwen.config, "use_cache"):
          self.qwen.config.use_cache = False
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.qwen.config.hidden_size, num_labels)
        self.num_labels = num_labels

        # PEFTライブラリとの互換性のためconfigを追加
        self.config = self.qwen.config
        self.config.num_labels = num_labels

        # QuestionIdごとの有効ラベルマップ
        self.question_label_map = question_label_map

        # マスク値（無効なラベルに適用する大きな負の値）
        self.mask_value = -65000
        # -1e10

    def forward(self, input_ids, attention_mask=None, labels=None, question_ids=None, **kwargs):
        # Transformersが渡す追加の引数（inputs_embeds等）をkwargsで受け取る
        # 基本のqwenモデルに必要な引数のみを渡す
        outputs = self.qwen(input_ids=input_ids, attention_mask=attention_mask)
        # 最後のトークンの隠れ状態を使用
        pooled_output = outputs.last_hidden_state[:, -1, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        # QuestionIdベースのマスクを適用
        if question_ids is not None and self.question_label_map is not None:
            masked_logits = self.apply_question_mask(logits, question_ids)
            # デバッグ: 最初のバッチで確認
            if torch.rand(1).item() < 0.001:  # 0.1%の確率でデバッグ出力
                print(f"[MASK DEBUG] Original logits range: [{logits.min():.3f}, {logits.max():.3f}]")
                print(f"[MASK DEBUG] Masked logits range: [{masked_logits.min():.3f}, {masked_logits.max():.3f}]")
                print(f"[MASK DEBUG] Question IDs: {question_ids[:3].tolist()}")
        else:
            masked_logits = logits

        loss = None

        if labels is not None:
            # マスクされたlogitsで損失を計算
            loss_fct = nn.CrossEntropyLoss()
            # ラベルの形状をチェック
            if labels.dim() > 1:
                labels = labels.view(-1)
            if masked_logits.dim() == 3:
                masked_logits = masked_logits.view(-1, self.num_labels)

            # 損失を計算
            try:
                loss = loss_fct(masked_logits, labels)

                # 最初の数回でデバッグ情報を表示
                if torch.rand(1).item() < 0.001:  # 0.1%の確率でデバッグ出力
                    print(f"[LOSS DEBUG] Computed loss: {loss.item():.6f}")
                    print(f"[LOSS DEBUG] Labels: {labels[:5].tolist()}")
                    print(f"[LOSS DEBUG] Masked logits shape: {masked_logits.shape}")

                # 損失がNaNまたは無限大でないことを確認
                if torch.isnan(loss) or torch.isinf(loss):
                    print(f"Warning: Invalid loss detected: {loss}")
                    print(f"masked_logits stats: min={masked_logits.min()}, max={masked_logits.max()}, mean={masked_logits.mean()}")
                    print(f"labels stats: min={labels.min()}, max={labels.max()}")
                    # NaN/Infの場合は大きな損失値を設定
                    loss = torch.tensor(100.0, requires_grad=True, device=masked_logits.device)

            except Exception as e:
                print(f"Error computing loss: {e}")
                print(f"masked_logits shape: {masked_logits.shape}, labels shape: {labels.shape}")
                # エラーが発生した場合は大きな損失値を設定
                loss = torch.tensor(100.0, requires_grad=True, device=masked_logits.device)

        # Accelerateライブラリ互換のため、SequenceClassifierOutputを使用
        return SequenceClassifierOutput(loss=loss, logits=masked_logits)

    def apply_question_mask(self, logits, question_ids):
        """QuestionIdごとに無効なラベルをマスクする"""
        batch_size = logits.size(0)
        device = logits.device

        # マスクを作成（初期値は全てマスク）
        mask = torch.full_like(logits, self.mask_value)

        for i in range(batch_size):
            q_id = question_ids[i].item() if torch.is_tensor(question_ids[i]) else question_ids[i]

            if q_id in self.question_label_map:
                valid_labels = self.question_label_map[q_id]
                # 有効なラベルのみ元のlogitsを保持
                for label_idx in valid_labels:
                    mask[i, label_idx] = 0

        # マスクを適用（無効なラベルには大きな負の値を加算）
        masked_logits = logits + mask

        return masked_logits


def create_question_label_mapping(train_df):
    """QuestionIdごとの有効なラベル（誤概念）のマッピングを作成"""
    question_label_map = defaultdict(set)

    for _, row in train_df.iterrows():
        question_id = row['QuestionId']
        label = row['label']
        question_label_map[question_id].add(label)

    # setをlistに変換
    question_label_map = {q_id: list(labels) for q_id, labels in question_label_map.items()}

    # 統計情報を表示
    label_counts = [len(labels) for labels in question_label_map.values()]
    print(f"\n=== QuestionId-Label Mapping Statistics ===")
    print(f"Total unique questions: {len(question_label_map)}")
    print(f"Average labels per question: {np.mean(label_counts):.2f}")
    print(f"Min labels per question: {np.min(label_counts)}")
    print(f"Max labels per question: {np.max(label_counts)}")
    print(f"Median labels per question: {np.median(label_counts):.1f}")

    # ヒストグラムを作成
    plt.figure(figsize=(10, 6))
    plt.hist(label_counts, bins=30, edgecolor='black')
    plt.xlabel('Number of labels per question')
    plt.ylabel('Number of questions')
    plt.title('Distribution of Labels per QuestionId')
    plt.grid(True, alpha=0.3)
    plt.savefig('question_label_distribution.png')
    plt.close()

    return dict(question_label_map)


def tokenize_dataset_with_question_id(dataset, tokenizer, max_len):
    """データセットのトークナイズ（QuestionId付き）"""
    def tokenize(batch):
        # パディングはDataCollatorで行うため、ここではトークナイズのみ
        tokenized = tokenizer(
            batch['text'],
            padding=False,  # パディングはDataCollatorに任せる
            truncation=True,
            max_length=max_len,
            return_tensors=None  # map時は'None'を使用
        )
        # QuestionIdをそのまま保持
        tokenized['question_ids'] = batch['QuestionId']
        return tokenized

    tokenized_dataset = dataset.map(
        tokenize,
        batched=True,
        remove_columns=['text', 'QuestionId']  # textとQuestionIdを削除、labelは保持
    )

    return tokenized_dataset


class DataCollatorWithQuestionId:
    """QuestionIdを含むカスタムデータコレーター"""
    def __init__(self, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, features):
        # # デバッグ: 入力データの構造を確認
        # print(f"[COLLATOR DEBUG] Features keys: {list(features[0].keys())}")
        # print(f"[COLLATOR DEBUG] First feature sample: {features[0]}")

        # バッチの最大長を取得
        max_length = max(len(feature["input_ids"]) for feature in features)

        # パディング
        batch = {}
        for key in features[0].keys():
            if key in ["label", "labels"]:
                # ラベルはパディング不要（labelをlabelsに変換）
                labels = [f[key] for f in features]
                # print(f"[COLLATOR DEBUG] Processing labels: {labels[:5]}...")
                batch["labels"] = torch.tensor(labels, dtype=torch.long)
            elif key in ["input_ids", "attention_mask"]:
                # input_idsとattention_maskをパディング
                padded = []
                for feature in features:
                    # tensorをlistに変換
                    if torch.is_tensor(feature[key]):
                        feature_list = feature[key].tolist()
                    else:
                        feature_list = feature[key]

                    remainder = [self.tokenizer.pad_token_id if key == "input_ids" else 0] * (max_length - len(feature_list))
                    padded_feature = feature_list + remainder
                    padded.append(padded_feature)
                batch[key] = torch.tensor(padded, dtype=torch.long)
            elif key == "question_ids":
                # question_idsはパディング不要
                batch[key] = torch.tensor([f[key] for f in features], dtype=torch.long)

        return batch




# WandBの初期化
if USE_WANDB:
    wandb.init(
        project=WANDB_PROJECT,
        name=WANDB_RUN_NAME + "_masked_loss",
        entity=WANDB_ENTITY,
        config={
            "model_name": MODEL_NAME,
            "epochs": EPOCHS,
            "max_len": MAX_LEN,
            "train_batch_size": TRAIN_BATCH_SIZE,
            "eval_batch_size": EVAL_BATCH_SIZE,
            "learning_rate": LEARNING_RATE,
            "early_stopping_patience": EARLY_STOPPING_PATIENCE if USE_EARLY_STOPPING else None,
            "lora_rank": LORA_RANK,
            "lora_alpha": LORA_ALPHA,
            "lora_target_modules": LORA_TARGET_MODULES,
            "lora_dropout": LORA_DROPOUT,
            "lora_bias": LORA_BIAS,
            "with_choices": True,
            "masked_loss": True,  # マスク付き損失関数を使用
        }
    )

# GPU設定
if CUDA_VISIBLE_DEVICES is not None:
    os.environ['CUDA_VISIBLE_DEVICES'] = CUDA_VISIBLE_DEVICES
    print(f"Using CUDA device(s): {CUDA_VISIBLE_DEVICES}")

# 出力ディレクトリの作成
output_dir_masked = OUTPUT_DIR + "_masked_loss"
os.makedirs(output_dir_masked, exist_ok=True)

# --- データの読み込みと前処理 ---
print("Loading and preprocessing training data...")
le = LabelEncoder()
train = pd.read_csv(TRAIN_DATA_PATH)

# --- QuestionId 32835のQuestionTextを更新 ---
print("Updating QuestionId 32835...")
new_question_text = "Which number is the greatest? Options: 6.0000 6.2 6.079 6.0001"
mask_32835 = train['QuestionId'] == 32835
update_count = mask_32835.sum()

if update_count > 0:
    original_text = train[mask_32835]['QuestionText'].iloc[0]
    print(f"Found {update_count} rows with QuestionId 32835")
    print(f"Original: {original_text[:80]}...")
    print(f"Updated to: {new_question_text}")
    train.loc[mask_32835, 'QuestionText'] = new_question_text
else:
    print("No rows found with QuestionId 32835")

# フィルタリングを行わず全データを使用
print(f"Using all data without filtering: {train.shape[0]} rows")

train.Misconception = train.Misconception.fillna('NA')
train['target'] = train.Category + ":" + train.Misconception
train['label'] = le.fit_transform(train['target'])
n_classes = len(le.classes_)
print(f"Train shape: {train.shape} with {n_classes} target classes")

# --- 特徴量エンジニアリング ---
print("Performing feature engineering...")
correct = prepare_correct_answers(train)
train = train.merge(correct, on=['QuestionId','MC_Answer'], how='left')
train.is_correct = train.is_correct.fillna(0)

# --- 選択肢データの準備 ---
print("Preparing answer choices for each question...")
choices = prepare_answer_choices(train)
train = train.merge(choices[['QuestionId', 'answer_choices_str']], on='QuestionId', how='left')

# --- MC_Answerを選択肢ラベルに変換 ---
print("Converting MC_Answer to choice labels...")
def get_choice_label(row):
    question_id = row['QuestionId']
    mc_answer = row['MC_Answer']
    # 該当するchoice_mappingを取得
    choice_mapping = choices[choices['QuestionId'] == question_id]['choice_mapping'].iloc[0]
    return choice_mapping.get(mc_answer, mc_answer)  # マッピングがない場合は元の値

train['choice_label'] = train.apply(get_choice_label, axis=1)

# --- 入力テキストのフォーマット ---
print("Formatting input text with answer choices...")
train['text'] = train.apply(format_input, axis=1)
print("Example prompt for our LLM with choices:")
print(train.text.values[0])

# --- QuestionId-Labelマッピングの作成 ---
print("\nCreating QuestionId-Label mapping for masked loss...")
question_label_map = create_question_label_mapping(train)

# マッピングを保存
mapping_path = f"{output_dir_masked}/question_label_mapping.pkl"
with open(mapping_path, 'wb') as f:
    pickle.dump(question_label_map, f)
print(f"Question-Label mapping saved to: {mapping_path}")

# --- トークナイザーの初期化 ---
print("Initializing tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# パディングトークンの設定
# Qwen3モデルの場合、特別なトークンIDを使用
if tokenizer.pad_token is None:
    # 語彙内の安全なトークンIDを使用
    # Qwenモデルでは、0番のトークンがUNKNOWNトークンとして使われることが多い
    tokenizer.pad_token_id = 0
    tokenizer.pad_token = tokenizer.decode([0])

# --- トークン長の分析 ---
print("Analyzing token lengths...")
lengths = [len(tokenizer.encode(t, truncation=False)) for t in train['text']]
plt.figure(figsize=(10, 6))
plt.hist(lengths, bins=50)
plt.title("Token Length Distribution (With Choices and Masked Loss)")
plt.xlabel("Number of tokens")
plt.ylabel("Frequency")
plt.grid(True)
plt.savefig(f'{output_dir_masked}/token_length_distribution_masked.png')
plt.close()

over_limit = (np.array(lengths) > MAX_LEN).sum()
print(f"There are {over_limit} train sample(s) with more than {MAX_LEN} tokens")

# 選択肢が追加されたことによるトークン長の増加を表示
avg_length = np.mean(lengths)
max_length = np.max(lengths)
print(f"Average token length: {avg_length:.1f}, Max token length: {max_length}")

# --- データの分割 ---
if VALIDATION_SPLIT > 0:
    print("Splitting data into train and validation sets...")
    train_df, val_df = train_test_split(train, test_size=VALIDATION_SPLIT, random_state=RANDOM_SEED)

    # QuestionIdを含むカラムを選択
    COLS = ['text', 'label', 'QuestionId']
    train_ds = Dataset.from_pandas(train_df[COLS])
    val_ds = Dataset.from_pandas(val_df[COLS])

    # --- データセットのトークナイズ ---
    print("Tokenizing datasets with QuestionIds...")
    train_ds = tokenize_dataset_with_question_id(train_ds, tokenizer, MAX_LEN)
    val_ds = tokenize_dataset_with_question_id(val_ds, tokenizer, MAX_LEN)
else:
    print("Using all data for training (no validation split)...")
    train_df = train
    val_df = None

    # QuestionIdを含むカラムを選択
    COLS = ['text', 'label', 'QuestionId']
    train_ds = Dataset.from_pandas(train_df[COLS])
    val_ds = None

    # --- データセットのトークナイズ ---
    print("Tokenizing training dataset with QuestionIds...")
    train_ds = tokenize_dataset_with_question_id(train_ds, tokenizer, MAX_LEN)

# --- Label Encoderの保存 ---
label_encoder_path = f"{output_dir_masked}/label_encoder.joblib"
print(f"Saving label encoder to: {label_encoder_path}")
joblib.dump(le, label_encoder_path)


# --- モデルの初期化 ---
print("Initializing model with masked loss...")
# カスタムクラスを直接使用（マスク付き損失関数のため）
model = Qwen2ForSequenceClassificationWithMaskedLoss(MODEL_NAME, n_classes, question_label_map)

# パディングトークンIDを設定
if hasattr(model.config, 'pad_token_id'):
    model.config.pad_token_id = tokenizer.pad_token_id


# --- LoRAアダプターの設定 ---
print("Configuring LoRA adapter...")
lora_config = LoraConfig(
    r=LORA_RANK,  # LoRAのランク
    lora_alpha=LORA_ALPHA,  # LoRAのスケーリングパラメータ
    target_modules=LORA_TARGET_MODULES,  # 対象モジュール
    lora_dropout=LORA_DROPOUT,
    bias=LORA_BIAS,
    task_type=TaskType.SEQ_CLS,
)

# PEFTモデルの作成
model = get_peft_model(model, lora_config)
print("Number of trainable parameters:")
model.print_trainable_parameters()

# シングルGPUに設定
if torch.cuda.is_available():
    model = model.cuda()


training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    warmup_ratio=0.0,  # ウォームアップを無効化
    learning_rate=LEARNING_RATE,
    logging_steps=LOGGING_STEPS,
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="map@3",
    greater_is_better=True,
    push_to_hub=False,
    bf16=True,  # BF16を使用
    gradient_checkpointing=False,  # メモリ効率化のため有効化
    dataloader_num_workers=4,
    remove_unused_columns=False,  # カラムを削除しない
    lr_scheduler_type="cosine",  # コサインスケジューラーを使用
    max_grad_norm=MAX_GRAD_NORM,  # Gradient clipping
    optim="adamw_bnb_8bit" if USE_8BIT_ADAM else "adamw_torch",  # 8-bit Adam optimizer
    report_to="wandb" if USE_WANDB else "none",
    run_name=WANDB_RUN_NAME + "_v2" if USE_WANDB else None,
)
# --- トレーナーのセットアップとトレーニング ---
print("Setting up trainer...")

# エポックあたりのステップ数を計算
steps_per_epoch = len(train_ds) // (TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS)  # gradient_accumulation_stepsを考慮
total_steps = steps_per_epoch * EPOCHS
print(f"\nDataset statistics:")
print(f"Training samples: {len(train_ds)}")
print(f"Validation samples: {len(val_ds) if val_ds is not None else 0}")
print(f"Batch size: {TRAIN_BATCH_SIZE} (with gradient accumulation: {TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS})")
print(f"Steps per epoch: {steps_per_epoch}")
print(f"Total training steps: {total_steps}")
if val_ds is not None:
    print(f"Evaluation interval: every {EVAL_STEPS} steps (~{EVAL_STEPS/steps_per_epoch:.2f} epochs)")
    print(f"Early stopping after {EARLY_STOPPING_PATIENCE} evaluations without improvement")
else:
    print("No validation - training without evaluation")

# カスタムデータコレーターを使用（QuestionId付き）
data_collator = DataCollatorWithQuestionId(tokenizer=tokenizer, max_length=MAX_LEN)

# コールバックの設定
callbacks = []

# SaveBestMap3Callbackを追加（validationがある場合のみ）
if val_ds is not None:
    save_best_callback = SaveBestMap3Callback(save_dir=output_dir_masked, tokenizer=tokenizer)
    callbacks.append(save_best_callback)
    print(f"SaveBestMap3Callback enabled - モデルは {output_dir_masked}/best_map3 に保存されます")

    if USE_EARLY_STOPPING:
        # EARLY_STOPPING_PATIENCEは評価回数として直接使用
        early_stopping_callback = EarlyStoppingCallback(
            early_stopping_patience=EARLY_STOPPING_PATIENCE,
            early_stopping_threshold=EARLY_STOPPING_THRESHOLD
        )
        callbacks.append(early_stopping_callback)
        print(f"Early stopping enabled:")
        print(f"  - Patience (evaluations without improvement): {EARLY_STOPPING_PATIENCE}")
        print(f"  - Threshold: {EARLY_STOPPING_THRESHOLD}")
else:
    print("No validation - callbacks disabled")

# デバッグ: compute_map3関数の確認
print(f"[DEBUG] compute_map3 function: {compute_map3}")
print(f"[DEBUG] val_ds is not None: {val_ds is not None}")
compute_metrics_func = compute_map3 if val_ds is not None else None
print(f"[DEBUG] compute_metrics will be set to: {compute_metrics_func}")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics_func,
    callbacks=callbacks,
)

# Trainerが初期化された後にcompute_metricsが正しく設定されているか確認
# print(f"[DEBUG] Trainer.compute_metrics: {trainer.compute_metrics}")
# if hasattr(trainer, 'compute_metrics') and trainer.compute_metrics:
#     print("[DEBUG] compute_metrics is properly set in Trainer")
# else:
#     print("[DEBUG] WARNING: compute_metrics is NOT set in Trainer")

print("\nStarting training with masked loss...")
print("Note: Invalid labels for each QuestionId will be masked during training.")
trainer.train()

# # --- 最終的なMAP@3スコアを表示 ---
# if val_ds is not None:
#     print("\nEvaluating on validation set...")
#     eval_results = trainer.evaluate()
#     print(f"\nValidation MAP@3: {eval_results.get('eval_map@3', 'N/A'):.4f}")

# # --- モデルの保存 ---
# best_model_path = f"{output_dir_masked}/best"
# print("\nSaving model...")
# # LoRAアダプターのみを保存
# model.save_pretrained(best_model_path)
# # トークナイザーも保存
# tokenizer.save_pretrained(best_model_path)

# print("Training completed successfully!")
# print(f"Model saved to: {best_model_path}")
# print(f"Label encoder saved to: {label_encoder_path}")
# print(f"Question-Label mapping saved to: {mapping_path}")

# # WandBの終了
# if USE_WANDB:
#     wandb.finish()



In [None]:
"""
訓練直後のモデルを直接マージして保存するスクリプト
train_mask_loss.pyの訓練完了後に実行する想定
"""

import torch
import os
import pickle
import joblib
from peft import PeftModel
from transformers import AutoTokenizer


def save_trained_model_directly(
    trained_peft_model,  # trainer.train()後のPEFTモデル（既に学習済み）
    tokenizer,
    output_dir,
    label_encoder_path,
    question_label_mapping_path
):
    """
    訓練済みのPEFTモデルを直接マージして保存

    Args:
        trained_peft_model: 訓練済みのPEFTモデル（trainer.model）
        tokenizer: 使用したトークナイザー
        output_dir: 保存先ディレクトリ
        label_encoder_path: ラベルエンコーダーのパス
        question_label_mapping_path: Question-Labelマッピングのパス
    """

    print("=" * 50)
    print("Saving trained model directly...")
    print("=" * 50)

    # 1. PEFTモデルをマージ
    print("\n1. Merging LoRA adapter with base model...")
    # merge_and_unload()でLoRAアダプターをベースモデルに統合
    merged_model = trained_peft_model.merge_and_unload()

    # 2. 出力ディレクトリの作成
    os.makedirs(output_dir, exist_ok=True)

    # 3. マージ済みモデル（qwen部分）の保存
    print(f"\n2. Saving merged base model (qwen) to: {output_dir}")
    # qwen部分を保存（AutoModelで読み込み可能な形式）
    merged_model.qwen.save_pretrained(output_dir, safe_serialization=True)

    # 4. classifierレイヤーの重みを保存
    classifier_path = os.path.join(output_dir, "classifier_weights.pt")
    print(f"\n3. Saving classifier weights to: {classifier_path}")
    torch.save({
        'classifier_state_dict': merged_model.classifier.state_dict(),
        'hidden_size': merged_model.qwen.config.hidden_size,
        'num_labels': merged_model.num_labels,
        'dropout': 0.1,
    }, classifier_path)

    # 5. トークナイザーも保存
    print(f"\n4. Saving tokenizer to: {output_dir}")
    tokenizer.save_pretrained(output_dir)

    # 6. Question-Labelマッピングをコピー
    question_label_map = question_label_mapping_path

    output_mapping_path = os.path.join(output_dir, "question_label_mapping.pkl")
    print(f"\n5. Saving question-label mapping to: {output_mapping_path}")
    with open(output_mapping_path, 'wb') as f:
        pickle.dump(question_label_map, f)

    # 7. ラベルエンコーダーをコピー
    le = label_encoder_path
    output_encoder_path = os.path.join(output_dir, "label_encoder.joblib")
    print(f"\n6. Saving label encoder to: {output_encoder_path}")
    joblib.dump(le, output_encoder_path)

    # 8. モデル設定情報を保存
    config_path = os.path.join(output_dir, "model_config.pkl")
    print(f"\n7. Saving model configuration to: {config_path}")
    with open(config_path, 'wb') as f:
        pickle.dump({
            'n_classes': merged_model.num_labels,
            'mask_value': -1e10,
            'dropout': 0.1,
        }, f)

    print("\n" + "=" * 50)
    print("Model saved successfully!")
    print(f"All files saved to: {output_dir}")
    print("\nSaved files:")
    print(f"  - Merged base model: {output_dir}/")
    print(f"  - Classifier weights: {classifier_path}")
    print(f"  - Question-label mapping: {output_mapping_path}")
    print(f"  - Label encoder: {output_encoder_path}")
    print(f"  - Model config: {config_path}")
    print("=" * 50)

    return merged_model


# train_mask_loss.pyの最後に追加する使用例
"""
使用例（train_mask_loss.pyの訓練完了後に追加）:

from save_trained_model_directly import save_trained_model_directly

# trainer.train()の後で実行
merged_model = save_trained_model_directly(
    trained_peft_model=model,  # trainer.train()後のモデル
    tokenizer=tokenizer,
    output_dir="./phi4-merged-complete",
    label_encoder_path=label_encoder_path,
    question_label_mapping_path=mapping_path
)

# Kaggleにアップロードする際はこのディレクトリをzipして使用
"""

In [None]:
merged_model = save_trained_model_directly(
    trained_peft_model=model,  # trainer.train()後のモデル
    tokenizer=tokenizer,
    output_dir="./phi4-merged",
    label_encoder_path=le,
    question_label_mapping_path=question_label_map
)

Saving trained model directly...

1. Merging LoRA adapter with base model...

2. Saving merged base model (qwen) to: ./phi4-merged

3. Saving classifier weights to: ./phi4-merged/classifier_weights.pt

4. Saving tokenizer to: ./phi4-merged

5. Saving question-label mapping to: ./phi4-merged/question_label_mapping.pkl

6. Saving label encoder to: ./phi4-merged/label_encoder.joblib

7. Saving model configuration to: ./phi4-merged/model_config.pkl

Model saved successfully!
All files saved to: ./phi4-merged

Saved files:
  - Merged base model: ./phi4-merged/
  - Classifier weights: ./phi4-merged/classifier_weights.pt
  - Question-label mapping: ./phi4-merged/question_label_mapping.pkl
  - Label encoder: ./phi4-merged/label_encoder.joblib
  - Model config: ./phi4-merged/model_config.pkl


In [None]:
"""

Google ColabでKaggleにモデルをアップロードするスクリプト
/content/phi4-reasoning-merged ディレクトリをKaggleデータセットとしてアップロード

"""

# Google Colabで実行するコード
# このスクリプトをColabにコピーして実行してください

# ========================================
# 1. Kaggle APIのセットアップ
# ========================================
print("=" * 50)
print("Step 1: Setting up Kaggle API")
print("=" * 50)

# kaggle.json をアップロード（既にある場合はスキップ）
from google.colab import files
import os

if not os.path.exists('/root/.kaggle/kaggle.json'):
    print("Please upload your kaggle.json file...")
    uploaded = files.upload()  # kaggle.json を選ぶ

    # Kaggle APIのインストールと設定
    !pip -q install kaggle
    !mkdir -p ~/.kaggle
    !cp kaggle.json ~/.kaggle/
    !chmod 600 ~/.kaggle/kaggle.json
    print("Kaggle API setup completed!")
else:
    print("kaggle.json already exists. Skipping upload.")
    !pip -q install kaggle

# 動作確認
print("\nTesting Kaggle API connection...")
!kaggle datasets list -s "titanic" | head -n 3

# ========================================
# 2. データセットのメタデータ作成
# ========================================
print("\n" + "=" * 50)
print("Step 2: Creating dataset metadata")
print("=" * 50)

import json

# 設定（必要に応じて変更）
USERNAME = "masasato1999"           # あなたのKaggleユーザー名
DATASET_SLUG = "phi4-merged-masked"  # データセット名（英数字とハイフンのみ）
DST = "/content/phi4-merged"  # アップロードするディレクトリ

# ディレクトリの存在確認
if not os.path.exists(DST):
    print(f"ERROR: Directory {DST} not found!")
    print("Please make sure the model is saved at the correct location.")
else:
    print(f"Model directory found: {DST}")

    # ディレクトリの内容を確認
    print("\nDirectory contents:")
    !ls -la $DST

    # メタデータの作成
    meta = {
        "title": "Phi-4 Reasoning Merged with Masked Loss",
        "id": f"{USERNAME}/{DATASET_SLUG}",
        "licenses": [{"name": "other"}],
        "isPrivate": True  # まずは非公開でアップロード
    }

    metadata_path = f"{DST}/dataset-metadata.json"
    with open(metadata_path, "w") as f:
        json.dump(meta, f, indent=2)

    print(f"\nMetadata created at: {metadata_path}")
    print("Metadata content:")
    print(json.dumps(meta, indent=2))

# ========================================
# 3. Kaggleへのアップロード
# ========================================
print("\n" + "=" * 50)
print("Step 3: Uploading to Kaggle")
print("=" * 50)

print(f"Uploading directory: {DST}")
print(f"Dataset will be available at: kaggle.com/datasets/{USERNAME}/{DATASET_SLUG}")
print("\nThis may take several minutes depending on the model size...")

# アップロード実行
!kaggle datasets create -p $DST --dir-mode zip

# # ========================================
# # 4. アップロード確認
# # ========================================
# print("\n" + "=" * 50)
# print("Step 4: Verifying upload")
# print("=" * 50)

# # アップロードされたデータセットを確認
# print("\nChecking if dataset was uploaded successfully...")
# !kaggle datasets list -u $USERNAME | grep $DATASET_SLUG

# print("\n" + "=" * 50)
# print("Upload completed!")
# print("=" * 50)
# print(f"\n📦 Dataset URL: https://www.kaggle.com/datasets/{USERNAME}/{DATASET_SLUG}")
# print("\n⚠️  Note: The dataset is currently PRIVATE.")
# print("To make it public, go to the dataset page and change the visibility settings.")
# print("\n🔧 To use in Kaggle notebook:")
# print(f"   Input path: /kaggle/input/{DATASET_SLUG}/")

# # ========================================
# # 5. データセットの更新が必要な場合
# # ========================================
# print("\n" + "=" * 50)
# print("For future updates:")
# print("=" * 50)
# print("If you need to update this dataset later, use:")
# print(f"!kaggle datasets version -p {DST} -m 'Update message'")

Step 1: Setting up Kaggle API
Please upload your kaggle.json file...


Saving kaggle.json to kaggle (1).json
Kaggle API setup completed!

Testing Kaggle API connection...
ref                                  title                                                size  lastUpdated                 downloadCount  voteCount  usabilityRating  
-----------------------------------  ---------------------------------------------  ----------  --------------------------  -------------  ---------  ---------------  
heptapod/titanic                     Titanic                                             11090  2017-05-16 08:14:22.210000         129925       1730  0.7058824        

Step 2: Creating dataset metadata
Model directory found: /content/phi4-merged

Directory contents:
total 27639236
drwxr-xr-x 2 root root       4096 Oct  2 12:15 .
drwxr-xr-x 1 root root       4096 Oct  2 12:18 ..
-rw-r--r-- 1 root root        462 Oct  2 12:10 chat_template.jinja
-rw-r--r-- 1 root root    1333565 Oct  2 12:10 classifier_weights.pt
-rw-r--r-- 1 root root       3514 Oct  2 12:09

In [None]:
from google.colab import runtime

runtime.unassign()