In [1]:
!pip install --no-index --no-deps /kaggle/input/bitsandbytes-20250725/bitsandbytes/bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl

Processing /kaggle/input/bitsandbytes-20250725/bitsandbytes/bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.46.1


In [None]:
%%writefile deepseek_0_946.py

import os
import gc
import pandas as pd
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset
import joblib
import torch

try:
    from peft import PeftModel, PeftConfig
    PEFT_AVAILABLE = True
except ImportError:
    PEFT_AVAILABLE = False
    print("Warning: PEFT not available, will use base model only")


# Model configuration
# NOTE: this script only runs inference. main() reads MODEL_NAME, MAX_LEN,
# EVAL_BATCH_SIZE, the two data paths, BEST_MODEL_PATH, LABEL_ENCODER_PATH and
# SUBMISSION_OUTPUT_PATH. The remaining constants are not read by any function
# in this script (presumably carried over from the training configuration).
VER = 2
MODEL_NAME = "/kaggle/input/deepseek-r1/transformers/deepseek-r1-distill-qwen-14b/2"
MODEL_TYPE = "qwen2"  # DeepSeek-R1-Distill-Qwen is based on the Qwen2 architecture
EPOCHS = 3  # Number of training epochs (not used for inference)
MAX_LEN = 250  # Prompts are truncated to this many tokens in tokenize_dataset()

# Directory settings
OUTPUT_DIR = "/kaggle/input/deepseek-r1-distill-qwen-14b-cv0.9455-fulltrain/transformers/default/1/ver_2"

# Training parameters
TRAIN_BATCH_SIZE = 8  # Per-device training batch size (not used for inference)
EVAL_BATCH_SIZE = 8  # Passed to TrainingArguments as per_device_eval_batch_size in main()
GRADIENT_ACCUMULATION_STEPS = 8  # Training-only setting (not used for inference)
LEARNING_RATE = 2e-4
LOGGING_STEPS = 50
SAVE_STEPS = 200
EVAL_STEPS = 200


# Data paths
TRAIN_DATA_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/train.csv'
TEST_DATA_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/test.csv'

# Model save paths
BEST_MODEL_PATH = f"{OUTPUT_DIR}/checkpoint-1722"  # LoRA adapter checkpoint loaded by main()
LABEL_ENCODER_PATH = f"{OUTPUT_DIR}/label_encoder.joblib"

# Other settings
RANDOM_SEED = 42
VALIDATION_SPLIT = 0.0000001

# GPU settings
# NOTE(review): this value is never exported to the environment in this script,
# so it does not restrict which GPUs are used.
CUDA_VISIBLE_DEVICES = "0,1"

# Submission settings
SUBMISSION_OUTPUT_PATH = 'deepseek_r1_submission.csv'

# WandB settings
# NOTE(review): main() passes report_to="none", so these have no effect here.
USE_WANDB = True  # Set to False to disable WandB
WANDB_PROJECT = "deepseek-r1-14b-math-misconceptions"
WANDB_RUN_NAME = f"deepseek-r1-14b-ver{VER}"
WANDB_ENTITY = None  # Set your WandB entity (username or team name) if needed

# Early stopping settings
USE_EARLY_STOPPING = True
EARLY_STOPPING_PATIENCE = 10  # Max evaluations without improvement (one evaluation every EVAL_STEPS)
EARLY_STOPPING_THRESHOLD = 0.001  # Minimum change that counts as an improvement

# LoRA configuration
LORA_RANK = 32  # LoRA rank
LORA_ALPHA = 64  # LoRA scaling parameter
LORA_TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]  # Modules that receive adapters
LORA_DROPOUT = 0.1  # LoRA dropout rate
LORA_BIAS = "none"  # Bias handling: "none", "all", "lora_only"

# Memory optimization settings
USE_GRADIENT_CHECKPOINTING = True  # Enable gradient checkpointing
USE_8BIT_ADAM = False  # Set to True to use the 8-bit Adam optimizer
MAX_GRAD_NORM = 1.0  # Gradient clipping value

def prepare_correct_answers(train_data):
    """Derive the (assumed) correct multiple-choice answer for each question.

    Rows whose ``Category`` starts with the ``True`` prefix (the text before the
    first underscore) are treated as rows where the student picked the right
    answer. For every question the answer that occurs most often among those
    rows is kept.

    Args:
        train_data: DataFrame with ``QuestionId``, ``MC_Answer`` and
            ``Category`` columns. It is not modified.

    Returns:
        DataFrame with one row per ``QuestionId`` and the columns
        ``QuestionId``, ``MC_Answer`` and ``is_correct`` (always 1).
    """
    # Vectorised prefix test instead of a row-wise apply; a missing Category
    # simply compares unequal to 'True' and is excluded.
    category_prefix = train_data['Category'].str.split('_').str[0]
    correct = train_data.loc[category_prefix == 'True'].copy()
    # Frequency of each (question, answer) pair among the "True" rows.
    correct['c'] = correct.groupby(['QuestionId','MC_Answer']).MC_Answer.transform('count')
    # Most frequent pair first, so drop_duplicates keeps it for each question.
    correct = correct.sort_values('c', ascending=False)
    correct = correct.drop_duplicates(['QuestionId'])[['QuestionId','MC_Answer']]
    correct['is_correct'] = 1
    return correct


def format_input(row):
    """Render one example as the plain-text prompt fed to the classifier.

    Args:
        row: Mapping-like record providing ``is_correct``, ``QuestionText``,
            ``MC_Answer`` and ``StudentExplanation``.

    Returns:
        The prompt string. It ends with an empty ``<think>`` block after the
        ``Assistant:`` marker.
    """
    status = "Yes" if row["is_correct"] else "No"

    # Simple User/Assistant layout for DeepSeek-R1; the pieces are joined
    # without any separator, so every newline below is part of the prompt.
    pieces = [
        "User: [Mathematical Misconception Analysis Task]\n\n",
        f"Question: {row['QuestionText']}\n",
        f"Answer: {row['MC_Answer']}\n",
        f"Correct?: {status}\n",
        f"Explanation: {row['StudentExplanation']}\n\n",
        "Assistant: <think>\n\n</think>\n\n",
    ]
    return "".join(pieces)


def tokenize_dataset(dataset, tokenizer, max_len):
    """Tokenize the ``text`` column of a dataset and expose it as torch tensors.

    Args:
        dataset: Object offering ``map``, ``column_names`` and ``set_format``
            (a ``datasets.Dataset`` in this script) with a ``text`` column.
        tokenizer: Callable tokenizer applied to each batch of texts.
        max_len: Maximum sequence length; longer inputs are truncated.

    Returns:
        The mapped dataset, formatted as ``torch`` with ``input_ids`` and
        ``attention_mask`` (plus ``label`` when that column is present).
    """
    def _encode(batch):
        # No padding here: the data collator pads each batch later.
        return tokenizer(
            batch['text'],
            truncation=True,
            max_length=max_len,
            padding=False,
            return_tensors=None,  # plain lists are expected inside map()
        )

    tokenized = dataset.map(_encode, batched=True, batch_size=100)

    # Keep the label column only if the dataset actually has one.
    kept_columns = ['input_ids', 'attention_mask']
    if 'label' in tokenized.column_names:
        kept_columns.append('label')
    tokenized.set_format(type='torch', columns=kept_columns)
    return tokenized


def compute_map3(eval_pred):
    """Compute MAP@3 from logits and integer labels.

    Args:
        eval_pred: Pair ``(logits, labels)``; ``logits`` has one row per
            example and one column per class.

    Returns:
        ``{"map@3": score}`` where each example contributes 1, 1/2 or 1/3 when
        its label is ranked first, second or third, and nothing otherwise.
    """
    logits, labels = eval_pred
    class_probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
    best_three = np.argsort(-class_probs, axis=1)[:, :3]

    # Credit awarded for a hit at rank 1, 2 and 3 respectively.
    rank_gains = (1.0, 1.0 / 2, 1.0 / 3)
    total = 0.0
    for row_no, target in enumerate(labels):
        ranked = best_three[row_no]
        total += next(
            (gain for candidate, gain in zip(ranked, rank_gains) if candidate == target),
            0.0,
        )
    return {"map@3": total / len(labels)}


def create_submission(predictions, test_data, label_encoder, filter_true_false = True):
    """Turn raw logits into ranked label predictions restricted per question.

    For every test row the logits are limited to the labels listed for that
    row's question (and, optionally, to the ``True_``/``False_`` half matching
    ``is_correct``), soft-maxed over the remaining candidates and sorted.

    Args:
        predictions: Object with a ``predictions`` attribute holding one row of
            logits per test row, indexed in label-encoder order.
        test_data: DataFrame with ``row_id``, ``QuestionId`` and ``is_correct``.
        label_encoder: Fitted encoder exposing ``classes_`` and
            ``inverse_transform``.
        filter_true_false: When true, rows with ``is_correct == 1`` only keep
            ``True_*`` labels and rows with ``is_correct == 0`` only keep
            ``False_*`` labels.

    Returns:
        DataFrame with ``row_id``, ``QuestionId``, ``is_correct``, ``probs``
        (candidate probabilities, best first), ``preds`` (matching labels) and
        ``Category:Misconception`` (top three labels joined by spaces).
    """

    # Labels that are allowed for each known question id.
    question_label_choices = {
        31772: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Incomplete', 'True_Misconception:WNB',
            'False_Neither:NA', 'False_Misconception:WNB',
            'False_Misconception:Incomplete', 'False_Correct:NA',
        ],
        31774: [
            'False_Neither:NA', 'False_Misconception:SwapDividend',
            'False_Misconception:Mult', 'False_Correct:NA',
            'False_Misconception:FlipChange', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:SwapDividend',
            'True_Misconception:Mult', 'True_Misconception:FlipChange',
        ],
        31777: [
            'False_Correct:NA', 'False_Misconception:Incomplete',
            'False_Neither:NA', 'False_Misconception:Irrelevant',
            'False_Misconception:Wrong_Fraction', 'True_Correct:NA',
            'True_Neither:NA',
        ],
        31778: [
            'False_Neither:NA', 'False_Misconception:Additive',
            'False_Misconception:Irrelevant', 'False_Correct:NA',
            'False_Misconception:WNB', 'True_Neither:NA',
            'True_Correct:NA', 'True_Misconception:Irrelevant',
            'True_Misconception:Additive',
        ],
        32829: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Not_variable', 'False_Neither:NA',
            'False_Misconception:Adding_terms', 'False_Correct:NA',
            'False_Misconception:Not_variable', 'False_Misconception:Inverse_operation',
        ],
        32833: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Inversion', 'True_Misconception:Duplication',
            'False_Misconception:Duplication', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Inversion',
            'False_Misconception:Wrong_Operation',
        ],
        32835: [
            'False_Misconception:Whole_numbers_larger', 'False_Neither:NA',
            'False_Correct:NA', 'False_Misconception:Longer_is_bigger',
            'False_Misconception:Ignores_zeroes', 'False_Misconception:Shorter_is_bigger',
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Whole_numbers_larger', 'True_Misconception:Shorter_is_bigger',
            'True_Misconception:Longer_is_bigger',
        ],
        33471: [
            'True_Neither:NA', 'True_Correct:NA',
            'True_Misconception:Wrong_fraction', 'False_Correct:NA',
            'False_Misconception:Incomplete', 'False_Neither:NA',
            'False_Misconception:Wrong_fraction',
        ],
        33472: [
            'True_Neither:NA', 'True_Correct:NA',
            'True_Misconception:Adding_across', 'True_Misconception:Denominator-only_change',
            'True_Misconception:Incorrect_equivalent_fraction_addition', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Denominator-only_change',
            'False_Misconception:Incorrect_equivalent_fraction_addition',
            'False_Misconception:Adding_across',
        ],
        33474: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Division', 'True_Misconception:Subtraction',
            'False_Neither:NA', 'False_Misconception:Subtraction',
            'False_Misconception:Division', 'False_Correct:NA',
        ],
        76870: [
            'False_Misconception:Unknowable', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Definition',
            'False_Misconception:Interior', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Definition',
        ],
        89443: [
            'False_Neither:NA', 'False_Misconception:Positive',
            'False_Misconception:Tacking', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Tacking',
            'True_Misconception:Positive', 'False_Correct:NA',
        ],
        91695: [
            'False_Neither:NA', 'False_Misconception:Wrong_term',
            'False_Correct:NA', 'False_Misconception:Firstterm',
            'True_Correct:NA', 'True_Misconception:Wrong_term',
            'True_Neither:NA', 'True_Misconception:Firstterm',
        ],
        104665: [
            'False_Neither:NA', 'False_Misconception:Base_rate',
            'False_Correct:NA', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Base_rate',
            'True_Misconception:Multiplying_by_4', 'False_Misconception:Multiplying_by_4',
        ],
        109465: [
            'False_Neither:NA', 'False_Correct:NA',
            'False_Misconception:Certainty', 'False_Misconception:Scale',
            'True_Correct:NA', 'True_Neither:NA',
        ],
    }

    classes = np.asarray(label_encoder.classes_)
    all_label_ids = list(range(len(classes)))

    # Split the label space by its True_/False_ prefix. A prefix test is used
    # (rather than a substring test) so a misconception name that happens to
    # contain "True" cannot be misfiled.
    true_classes = {idx for idx, name in enumerate(classes) if str(name).startswith('True_')}
    false_classes = set(all_label_ids) - true_classes

    # Translate each question's label names into label-encoder indices.
    question_label_choice_ids = {
        qid: [int(x) for x in np.where(np.isin(classes, choices))[0]]
        for qid, choices in question_label_choices.items()
    }

    test_probabilities = []
    test_predictions = []
    test_top3_predictions = []

    for qid, correct, row in zip(test_data.QuestionId.tolist(), test_data.is_correct.tolist(), predictions.predictions):

        # A question id that is not in the table (or none of whose labels are
        # known to the encoder) falls back to the full label space instead of
        # raising KeyError.
        candidate_idx = question_label_choice_ids.get(qid) or all_label_ids

        # Optionally narrow the candidates using the True/False information.
        if filter_true_false:
            if correct == 1:
                narrowed = [c for c in candidate_idx if c in true_classes]
            elif correct == 0:
                narrowed = [c for c in candidate_idx if c in false_classes]
            else:
                narrowed = candidate_idx
            # Never leave a row without candidates: keep the unfiltered list
            # if the filter would remove everything.
            if narrowed:
                candidate_idx = narrowed

        candidate_logits = np.asarray(row)[candidate_idx]

        # Probabilities are renormalised over the surviving candidates only.
        candidate_probs = torch.nn.functional.softmax(torch.tensor(candidate_logits), dim=-1).numpy()

        top_k = np.argsort(-candidate_probs)

        # Map positions within the candidate list back to label-encoder ids.
        topk_idx = np.array(candidate_idx)[top_k]

        topk_probs = candidate_probs[top_k].tolist()
        topk_preds = label_encoder.inverse_transform(topk_idx).tolist()

        test_probabilities.append(topk_probs)
        test_predictions.append(topk_preds)
        test_top3_predictions.append(" ".join(topk_preds[:3]))

    test_submission_data = pd.DataFrame({
        "row_id": test_data.row_id.tolist(),
        "QuestionId": test_data.QuestionId.tolist(),
        "is_correct": test_data.is_correct.tolist(),
        "probs": test_probabilities,
        "preds": test_predictions,
        'Category:Misconception': test_top3_predictions
    })

    return test_submission_data


def main():
    """Run test-set inference with the LoRA-adapted classifier and write a CSV.

    Steps, in order: load the label encoder, load the base model in float16 and
    attach the LoRA adapter stored at BEST_MODEL_PATH, make sure a pad token is
    configured, derive ``is_correct`` for the test rows from the training data,
    build and tokenize the prompts, predict through ``Trainer.predict`` and
    save the frame returned by ``create_submission`` to SUBMISSION_OUTPUT_PATH.

    Raises:
        ImportError: if the ``peft`` package could not be imported.
    """

    # Release cached CUDA memory and run the garbage collector before loading
    torch.cuda.empty_cache()
    gc.collect()

    # CUDA allocator tuning.
    # NOTE(review): this is set after the torch.cuda call above; confirm the
    # allocator has not already read its configuration by this point.
    # `os` is already imported at module level, so this local import is redundant.
    import os
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    # Only reports the GPU count; device placement is handled by device_map below
    if torch.cuda.device_count() > 1:
        print(f"Found {torch.cuda.device_count()} GPUs")

    print("Loading label encoder...")
    # Load the fitted label encoder; its class count sizes the classification head
    le = joblib.load(LABEL_ENCODER_PATH)
    n_classes = len(le.classes_)

    print("Loading trained model and tokenizer...")

    if PEFT_AVAILABLE:
        # Base model + LoRA adapter path
        print(f"Loading fine-tuned LoRA model from: {BEST_MODEL_PATH}")
        print(f"Loading base model from: {MODEL_NAME}")

        # Load the base model in float16
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=n_classes,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map="auto",  # let the loader distribute the model across available devices
            low_cpu_mem_usage=True  # reduce CPU memory use while loading
        )

        # Attach the LoRA adapter
        model = PeftModel.from_pretrained(model, BEST_MODEL_PATH)

        # Switch to evaluation mode
        model.eval()
        # No explicit .to('cuda'): device_map="auto" has already placed the model

        # The tokenizer comes from the base model directory
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
        print("Successfully loaded LoRA fine-tuned model")
    else:
        # Without PEFT the adapter cannot be loaded, so fail early
        raise ImportError("PEFT is required to load the fine-tuned model. Please install peft: pip install peft")

    # Fall back to the EOS token when the tokenizer defines no pad token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    # Propagate the pad token id to the model config (through the PeftModel wrapper)
    if hasattr(model, 'base_model'):
        model.base_model.config.pad_token_id = tokenizer.pad_token_id
        # Also set it on the wrapped inner model
        if hasattr(model.base_model, 'model'):
            model.base_model.model.config.pad_token_id = tokenizer.pad_token_id
    else:
        model.config.pad_token_id = tokenizer.pad_token_id

    print("Loading test data...")
    # Read the test set
    test = pd.read_csv(TEST_DATA_PATH)

    print("Loading training data for correct answers...")
    # The correct answer per question is derived from the training data
    train = pd.read_csv(TRAIN_DATA_PATH)
    train.Misconception = train.Misconception.fillna('NA')
    correct = prepare_correct_answers(train)

    print("Preprocessing test data...")
    # Flag test rows whose (question, answer) pair matches a known correct answer;
    # rows without a match get is_correct = 0
    test = test.merge(correct, on=['QuestionId','MC_Answer'], how='left')
    test.is_correct = test.is_correct.fillna(0)
    test['text'] = test.apply(format_input, axis=1)

    print("Tokenizing test data...")
    # Tokenize the prompts
    ds_test = Dataset.from_pandas(test[['text']])
    ds_test = tokenize_dataset(ds_test, tokenizer, MAX_LEN)

    # Collator that pads each batch
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    print("Running inference...")

    # Allow TF32 matmul/cuDNN kernels
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # The Trainer is used purely as a batched prediction loop
    trainer = Trainer(
        model=model,
        processing_class=tokenizer,  # takes the place of the older `tokenizer` argument
        data_collator=data_collator,  # pads every batch on the fly
        args=TrainingArguments(
            output_dir="./tmp",  # scratch directory (required argument)
            report_to="none",    # disable wandb and other reporters
            per_device_eval_batch_size=EVAL_BATCH_SIZE,  # from the config constants above
            fp16=True,  # use float16
            dataloader_pin_memory=True,  # pinned host memory for the data loader
            dataloader_num_workers=2,  # parallel data loading workers
        )
    )
    # Run prediction with gradient tracking disabled
    with torch.no_grad():
        predictions = trainer.predict(ds_test)

    print("Creating submission file...")
    # Build the submission frame from the logits
    submission = create_submission(predictions, test, le)

    # Save to disk
    submission.to_csv(SUBMISSION_OUTPUT_PATH, index=False)
    print(f"Submission file saved to: {SUBMISSION_OUTPUT_PATH}")
    print("\nSubmission preview:")
    print(submission.head())
    print(f"\nSubmission shape: {submission.shape}")


# Run inference only when executed as a script, not when imported
if __name__ == "__main__":
    main()

Writing deepseek_0_946.py


In [None]:
%%writefile qwen3_14b_0_946.py

import os
import gc
import pandas as pd
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset
import joblib
import torch

try:
    from peft import PeftModel, PeftConfig
    PEFT_AVAILABLE = True
except ImportError:
    PEFT_AVAILABLE = False
    print("Warning: PEFT not available, will use base model only")

# Model configuration
# NOTE: this script only runs inference. main() reads MODEL_NAME, MAX_LEN,
# EVAL_BATCH_SIZE, the two data paths, BEST_MODEL_PATH, LABEL_ENCODER_PATH and
# SUBMISSION_OUTPUT_PATH. The remaining constants are not read by any function
# in this script (presumably carried over from the training configuration).
VER = 2
MODEL_NAME = "/kaggle/input/qwen-3/transformers/14b/1"
# NOTE(review): MODEL_NAME points at a Qwen3 checkpoint while this says "qwen2";
# the value is not read anywhere in this script.
MODEL_TYPE = "qwen2"
EPOCHS = 3  # Number of training epochs (not used for inference)
MAX_LEN = 250  # Prompts are truncated to this many tokens in tokenize_dataset()

# Directory settings
OUTPUT_DIR = "/kaggle/input/qwen3-14b-lb0.945-fulltrain/transformers/default/1/ver_2"

# Training parameters
TRAIN_BATCH_SIZE = 4  # Per-device training batch size (not used for inference)
EVAL_BATCH_SIZE = 4  # Passed to TrainingArguments as per_device_eval_batch_size in main()
GRADIENT_ACCUMULATION_STEPS = 16  # Training-only setting (not used for inference)
LEARNING_RATE = 2e-4
LOGGING_STEPS = 50
SAVE_STEPS = 200
EVAL_STEPS = 200


# Data paths
TRAIN_DATA_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/train.csv'
TEST_DATA_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/test.csv'

# Model save paths
BEST_MODEL_PATH = f"{OUTPUT_DIR}/checkpoint-1722"  # LoRA adapter checkpoint loaded by main()
LABEL_ENCODER_PATH = f"{OUTPUT_DIR}/label_encoder.joblib"

# Other settings
RANDOM_SEED = 42
VALIDATION_SPLIT = 0.00000001

# GPU settings
# NOTE(review): this value is never exported to the environment in this script,
# so it does not restrict which GPUs are used.
CUDA_VISIBLE_DEVICES = "0,1"

# Submission settings
SUBMISSION_OUTPUT_PATH = 'qwen3_14b_submission.csv'

# WandB settings
# NOTE(review): main() passes report_to="none", so these have no effect here.
USE_WANDB = True  # Set to False to disable WandB
WANDB_PROJECT = "qwen3-14b-math-misconceptions"
WANDB_RUN_NAME = f"qwen3-14b-ver{VER}"
WANDB_ENTITY = None  # Set your WandB entity (username or team name) if needed

# Early stopping settings
USE_EARLY_STOPPING = True
EARLY_STOPPING_PATIENCE = 10  # Max evaluations without improvement (one evaluation every EVAL_STEPS)
EARLY_STOPPING_THRESHOLD = 0.001  # Minimum change that counts as an improvement

# LoRA configuration
LORA_RANK = 128  # LoRA rank
LORA_ALPHA = 256  # LoRA scaling parameter
LORA_TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]  # Modules that receive adapters
LORA_DROPOUT = 0.1  # LoRA dropout rate
LORA_BIAS = "none"  # Bias handling: "none", "all", "lora_only"

# Memory optimization settings
USE_GRADIENT_CHECKPOINTING = True  # Enable gradient checkpointing
USE_8BIT_ADAM = False  # Set to True to use the 8-bit Adam optimizer
MAX_GRAD_NORM = 1.0  # Gradient clipping value

def prepare_correct_answers(train_data):
    """Derive the (assumed) correct multiple-choice answer for each question.

    Rows whose ``Category`` starts with the ``True`` prefix (the text before the
    first underscore) are treated as rows where the student picked the right
    answer. For every question the answer that occurs most often among those
    rows is kept.

    Args:
        train_data: DataFrame with ``QuestionId``, ``MC_Answer`` and
            ``Category`` columns. It is not modified.

    Returns:
        DataFrame with one row per ``QuestionId`` and the columns
        ``QuestionId``, ``MC_Answer`` and ``is_correct`` (always 1).
    """
    # Vectorised prefix test instead of a row-wise apply; a missing Category
    # simply compares unequal to 'True' and is excluded.
    category_prefix = train_data['Category'].str.split('_').str[0]
    correct = train_data.loc[category_prefix == 'True'].copy()
    # Frequency of each (question, answer) pair among the "True" rows.
    correct['c'] = correct.groupby(['QuestionId','MC_Answer']).MC_Answer.transform('count')
    # Most frequent pair first, so drop_duplicates keeps it for each question.
    correct = correct.sort_values('c', ascending=False)
    correct = correct.drop_duplicates(['QuestionId'])[['QuestionId','MC_Answer']]
    correct['is_correct'] = 1
    return correct


def format_input(row):
    """Render one example as the chat-style prompt fed to the classifier.

    Args:
        row: Mapping-like record providing ``is_correct``, ``QuestionText``,
            ``MC_Answer`` and ``StudentExplanation``.

    Returns:
        The prompt string, wrapped in ``<|im_start|>``/``<|im_end|>`` markers
        and ending with an empty ``<think>`` block in the assistant turn.
    """
    status = "Yes" if row["is_correct"] else "No"

    # NOTE(review): there is no newline between "<|im_start|>user" and the task
    # header. Presumably the adapter was trained on exactly this text, so the
    # layout is kept byte-for-byte - confirm before changing it.
    pieces = [
        "<|im_start|>user",
        "[Mathematical Misconception Analysis Task]\n\n",
        f"Question: {row['QuestionText']}\n",
        f"Answer: {row['MC_Answer']}\n",
        f"Correct?: {status}\n",
        f"Explanation: {row['StudentExplanation']}\n\n",
        "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n",
    ]
    return "".join(pieces)


def tokenize_dataset(dataset, tokenizer, max_len):
    """Tokenize the ``text`` column of a dataset and expose it as torch tensors.

    Args:
        dataset: Object offering ``map``, ``column_names`` and ``set_format``
            (a ``datasets.Dataset`` in this script) with a ``text`` column.
        tokenizer: Callable tokenizer applied to each batch of texts.
        max_len: Maximum sequence length; longer inputs are truncated.

    Returns:
        The mapped dataset, formatted as ``torch`` with ``input_ids`` and
        ``attention_mask`` (plus ``label`` when that column is present).
    """
    def _encode(batch):
        # No padding here: the data collator pads each batch later.
        return tokenizer(
            batch['text'],
            truncation=True,
            max_length=max_len,
            padding=False,
            return_tensors=None,  # plain lists are expected inside map()
        )

    tokenized = dataset.map(_encode, batched=True, batch_size=100)

    # Keep the label column only if the dataset actually has one.
    kept_columns = ['input_ids', 'attention_mask']
    if 'label' in tokenized.column_names:
        kept_columns.append('label')
    tokenized.set_format(type='torch', columns=kept_columns)
    return tokenized


def compute_map3(eval_pred):
    """Compute MAP@3 from logits and integer labels.

    Args:
        eval_pred: Pair ``(logits, labels)``; ``logits`` has one row per
            example and one column per class.

    Returns:
        ``{"map@3": score}`` where each example contributes 1, 1/2 or 1/3 when
        its label is ranked first, second or third, and nothing otherwise.
    """
    logits, labels = eval_pred
    class_probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
    best_three = np.argsort(-class_probs, axis=1)[:, :3]

    # Credit awarded for a hit at rank 1, 2 and 3 respectively.
    rank_gains = (1.0, 1.0 / 2, 1.0 / 3)
    total = 0.0
    for row_no, target in enumerate(labels):
        ranked = best_three[row_no]
        total += next(
            (gain for candidate, gain in zip(ranked, rank_gains) if candidate == target),
            0.0,
        )
    return {"map@3": total / len(labels)}


def create_submission(predictions, test_data, label_encoder, filter_true_false = True):
    """Turn raw logits into ranked label predictions restricted per question.

    For every test row the logits are limited to the labels listed for that
    row's question (and, optionally, to the ``True_``/``False_`` half matching
    ``is_correct``), soft-maxed over the remaining candidates and sorted.

    Args:
        predictions: Object with a ``predictions`` attribute holding one row of
            logits per test row, indexed in label-encoder order.
        test_data: DataFrame with ``row_id``, ``QuestionId`` and ``is_correct``.
        label_encoder: Fitted encoder exposing ``classes_`` and
            ``inverse_transform``.
        filter_true_false: When true, rows with ``is_correct == 1`` only keep
            ``True_*`` labels and rows with ``is_correct == 0`` only keep
            ``False_*`` labels.

    Returns:
        DataFrame with ``row_id``, ``QuestionId``, ``is_correct``, ``probs``
        (candidate probabilities, best first), ``preds`` (matching labels) and
        ``Category:Misconception`` (top three labels joined by spaces).
    """

    # Labels that are allowed for each known question id.
    question_label_choices = {
        31772: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Incomplete', 'True_Misconception:WNB',
            'False_Neither:NA', 'False_Misconception:WNB',
            'False_Misconception:Incomplete', 'False_Correct:NA',
        ],
        31774: [
            'False_Neither:NA', 'False_Misconception:SwapDividend',
            'False_Misconception:Mult', 'False_Correct:NA',
            'False_Misconception:FlipChange', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:SwapDividend',
            'True_Misconception:Mult', 'True_Misconception:FlipChange',
        ],
        31777: [
            'False_Correct:NA', 'False_Misconception:Incomplete',
            'False_Neither:NA', 'False_Misconception:Irrelevant',
            'False_Misconception:Wrong_Fraction', 'True_Correct:NA',
            'True_Neither:NA',
        ],
        31778: [
            'False_Neither:NA', 'False_Misconception:Additive',
            'False_Misconception:Irrelevant', 'False_Correct:NA',
            'False_Misconception:WNB', 'True_Neither:NA',
            'True_Correct:NA', 'True_Misconception:Irrelevant',
            'True_Misconception:Additive',
        ],
        32829: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Not_variable', 'False_Neither:NA',
            'False_Misconception:Adding_terms', 'False_Correct:NA',
            'False_Misconception:Not_variable', 'False_Misconception:Inverse_operation',
        ],
        32833: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Inversion', 'True_Misconception:Duplication',
            'False_Misconception:Duplication', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Inversion',
            'False_Misconception:Wrong_Operation',
        ],
        32835: [
            'False_Misconception:Whole_numbers_larger', 'False_Neither:NA',
            'False_Correct:NA', 'False_Misconception:Longer_is_bigger',
            'False_Misconception:Ignores_zeroes', 'False_Misconception:Shorter_is_bigger',
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Whole_numbers_larger', 'True_Misconception:Shorter_is_bigger',
            'True_Misconception:Longer_is_bigger',
        ],
        33471: [
            'True_Neither:NA', 'True_Correct:NA',
            'True_Misconception:Wrong_fraction', 'False_Correct:NA',
            'False_Misconception:Incomplete', 'False_Neither:NA',
            'False_Misconception:Wrong_fraction',
        ],
        33472: [
            'True_Neither:NA', 'True_Correct:NA',
            'True_Misconception:Adding_across', 'True_Misconception:Denominator-only_change',
            'True_Misconception:Incorrect_equivalent_fraction_addition', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Denominator-only_change',
            'False_Misconception:Incorrect_equivalent_fraction_addition',
            'False_Misconception:Adding_across',
        ],
        33474: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Division', 'True_Misconception:Subtraction',
            'False_Neither:NA', 'False_Misconception:Subtraction',
            'False_Misconception:Division', 'False_Correct:NA',
        ],
        76870: [
            'False_Misconception:Unknowable', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Definition',
            'False_Misconception:Interior', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Definition',
        ],
        89443: [
            'False_Neither:NA', 'False_Misconception:Positive',
            'False_Misconception:Tacking', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Tacking',
            'True_Misconception:Positive', 'False_Correct:NA',
        ],
        91695: [
            'False_Neither:NA', 'False_Misconception:Wrong_term',
            'False_Correct:NA', 'False_Misconception:Firstterm',
            'True_Correct:NA', 'True_Misconception:Wrong_term',
            'True_Neither:NA', 'True_Misconception:Firstterm',
        ],
        104665: [
            'False_Neither:NA', 'False_Misconception:Base_rate',
            'False_Correct:NA', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Base_rate',
            'True_Misconception:Multiplying_by_4', 'False_Misconception:Multiplying_by_4',
        ],
        109465: [
            'False_Neither:NA', 'False_Correct:NA',
            'False_Misconception:Certainty', 'False_Misconception:Scale',
            'True_Correct:NA', 'True_Neither:NA',
        ],
    }

    classes = np.asarray(label_encoder.classes_)
    all_label_ids = list(range(len(classes)))

    # Split the label space by its True_/False_ prefix. A prefix test is used
    # (rather than a substring test) so a misconception name that happens to
    # contain "True" cannot be misfiled.
    true_classes = {idx for idx, name in enumerate(classes) if str(name).startswith('True_')}
    false_classes = set(all_label_ids) - true_classes

    # Translate each question's label names into label-encoder indices.
    question_label_choice_ids = {
        qid: [int(x) for x in np.where(np.isin(classes, choices))[0]]
        for qid, choices in question_label_choices.items()
    }

    test_probabilities = []
    test_predictions = []
    test_top3_predictions = []

    for qid, correct, row in zip(test_data.QuestionId.tolist(), test_data.is_correct.tolist(), predictions.predictions):

        # A question id that is not in the table (or none of whose labels are
        # known to the encoder) falls back to the full label space instead of
        # raising KeyError.
        candidate_idx = question_label_choice_ids.get(qid) or all_label_ids

        # Optionally narrow the candidates using the True/False information.
        if filter_true_false:
            if correct == 1:
                narrowed = [c for c in candidate_idx if c in true_classes]
            elif correct == 0:
                narrowed = [c for c in candidate_idx if c in false_classes]
            else:
                narrowed = candidate_idx
            # Never leave a row without candidates: keep the unfiltered list
            # if the filter would remove everything.
            if narrowed:
                candidate_idx = narrowed

        candidate_logits = np.asarray(row)[candidate_idx]

        # Probabilities are renormalised over the surviving candidates only.
        candidate_probs = torch.nn.functional.softmax(torch.tensor(candidate_logits), dim=-1).numpy()

        top_k = np.argsort(-candidate_probs)

        # Map positions within the candidate list back to label-encoder ids.
        topk_idx = np.array(candidate_idx)[top_k]

        topk_probs = candidate_probs[top_k].tolist()
        topk_preds = label_encoder.inverse_transform(topk_idx).tolist()

        test_probabilities.append(topk_probs)
        test_predictions.append(topk_preds)
        test_top3_predictions.append(" ".join(topk_preds[:3]))

    test_submission_data = pd.DataFrame({
        "row_id": test_data.row_id.tolist(),
        "QuestionId": test_data.QuestionId.tolist(),
        "is_correct": test_data.is_correct.tolist(),
        "probs": test_probabilities,
        "preds": test_predictions,
        'Category:Misconception': test_top3_predictions
    })

    return test_submission_data


def main():
    """Run test-set inference with the LoRA-adapted classifier and write a CSV.

    Steps, in order: load the label encoder, load the base model in float16 and
    attach the LoRA adapter stored at BEST_MODEL_PATH, make sure a pad token is
    configured, derive ``is_correct`` for the test rows from the training data,
    build and tokenize the prompts, predict through ``Trainer.predict`` and
    save the frame returned by ``create_submission`` to SUBMISSION_OUTPUT_PATH.

    Raises:
        ImportError: if the ``peft`` package could not be imported.
    """

    # Release cached CUDA memory and run the garbage collector before loading
    torch.cuda.empty_cache()
    gc.collect()

    # CUDA allocator tuning.
    # NOTE(review): this is set after the torch.cuda call above; confirm the
    # allocator has not already read its configuration by this point.
    # `os` is already imported at module level, so this local import is redundant.
    import os
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    # Only reports the GPU count; device placement is handled by device_map below
    if torch.cuda.device_count() > 1:
        print(f"Found {torch.cuda.device_count()} GPUs")

    print("Loading label encoder...")
    # Load the fitted label encoder; its class count sizes the classification head
    le = joblib.load(LABEL_ENCODER_PATH)
    n_classes = len(le.classes_)

    print("Loading trained model and tokenizer...")

    if PEFT_AVAILABLE:
        # Base model + LoRA adapter path
        print(f"Loading fine-tuned LoRA model from: {BEST_MODEL_PATH}")
        print(f"Loading base model from: {MODEL_NAME}")

        # Load the base model in float16
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=n_classes,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map="auto",  # let the loader distribute the model across available devices
            low_cpu_mem_usage=True  # reduce CPU memory use while loading
        )

        # Attach the LoRA adapter
        model = PeftModel.from_pretrained(model, BEST_MODEL_PATH)

        # Switch to evaluation mode
        model.eval()
        # No explicit .to('cuda'): device_map="auto" has already placed the model

        # The tokenizer comes from the base model directory
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
        print("Successfully loaded LoRA fine-tuned model")
    else:
        # Without PEFT the adapter cannot be loaded, so fail early
        raise ImportError("PEFT is required to load the fine-tuned model. Please install peft: pip install peft")

    # Fall back to the EOS token when the tokenizer defines no pad token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    # Propagate the pad token id to the model config (through the PeftModel wrapper)
    if hasattr(model, 'base_model'):
        model.base_model.config.pad_token_id = tokenizer.pad_token_id
        # Also set it on the wrapped inner model
        if hasattr(model.base_model, 'model'):
            model.base_model.model.config.pad_token_id = tokenizer.pad_token_id
    else:
        model.config.pad_token_id = tokenizer.pad_token_id

    print("Loading test data...")
    # Read the test set
    test = pd.read_csv(TEST_DATA_PATH)

    print("Loading training data for correct answers...")
    # The correct answer per question is derived from the training data
    train = pd.read_csv(TRAIN_DATA_PATH)
    train.Misconception = train.Misconception.fillna('NA')
    correct = prepare_correct_answers(train)

    print("Preprocessing test data...")
    # Flag test rows whose (question, answer) pair matches a known correct answer;
    # rows without a match get is_correct = 0
    test = test.merge(correct, on=['QuestionId','MC_Answer'], how='left')
    test.is_correct = test.is_correct.fillna(0)
    test['text'] = test.apply(format_input, axis=1)

    print("Tokenizing test data...")
    # Tokenize the prompts
    ds_test = Dataset.from_pandas(test[['text']])
    ds_test = tokenize_dataset(ds_test, tokenizer, MAX_LEN)

    # Collator that pads each batch
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    print("Running inference...")

    # Allow TF32 matmul/cuDNN kernels
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # The Trainer is used purely as a batched prediction loop
    trainer = Trainer(
        model=model,
        processing_class=tokenizer,  # takes the place of the older `tokenizer` argument
        data_collator=data_collator,  # pads every batch on the fly
        args=TrainingArguments(
            output_dir="./tmp",  # scratch directory (required argument)
            report_to="none",    # disable wandb and other reporters
            per_device_eval_batch_size=EVAL_BATCH_SIZE,  # from the config constants above
            fp16=True,  # use float16
            dataloader_pin_memory=True,  # pinned host memory for the data loader
            dataloader_num_workers=2,  # parallel data loading workers
        )
    )
    # Run prediction with gradient tracking disabled
    with torch.no_grad():
        predictions = trainer.predict(ds_test)

    print("Creating submission file...")
    # Build the submission frame from the logits
    submission = create_submission(predictions, test, le)

    # Save to disk
    submission.to_csv(SUBMISSION_OUTPUT_PATH, index=False)
    print(f"Submission file saved to: {SUBMISSION_OUTPUT_PATH}")
    print("\nSubmission preview:")
    print(submission.head())
    print(f"\nSubmission shape: {submission.shape}")


# Run inference only when executed as a script, not when imported
if __name__ == "__main__":
    main()

Writing qwen3_14b_0_946.py


In [None]:
%%writefile qwen3_32b_0_947.py

import os
import gc
import pandas as pd
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset
import joblib
import torch

try:
    from peft import PeftModel, PeftConfig
    PEFT_AVAILABLE = True
except ImportError:
    PEFT_AVAILABLE = False
    print("Warning: PEFT not available, will use base model only")

# Model configuration
# NOTE: the inference code in this script only reads MODEL_NAME, MAX_LEN,
# EVAL_BATCH_SIZE, the data paths, BEST_MODEL_PATH, LABEL_ENCODER_PATH and
# SUBMISSION_OUTPUT_PATH. The other constants are not referenced below and
# appear to be carried over from the training script (TODO confirm).
VER = 2  # Version tag; used below only to build WANDB_RUN_NAME
MODEL_NAME = "/kaggle/input/qwen-3/transformers/32b/1"  # Path the base model and tokenizer are loaded from
MODEL_TYPE = "qwen2"  # Model-family tag; not referenced by the code in this script
EPOCHS = 3  # Number of training epochs; not referenced by the code in this script
MAX_LEN = 300  # Maximum number of tokens per example (longer inputs are truncated)

# Directory settings
OUTPUT_DIR = f"/kaggle/input/qwen3-32b-9468/transformers/default/1/ver_2"  # Directory holding the adapter and label encoder

# Training parameters
TRAIN_BATCH_SIZE = 16  # Per-device training batch size; not referenced by the code in this script
EVAL_BATCH_SIZE = 16  # Per-device batch size used for prediction
GRADIENT_ACCUMULATION_STEPS = 2  # Gradient accumulation steps; not referenced by the code in this script
LEARNING_RATE = 2e-4
LOGGING_STEPS = 50
SAVE_STEPS = 200
EVAL_STEPS = 200


# Data paths
TRAIN_DATA_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/train.csv'
TEST_DATA_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/test.csv'

# Model save paths
BEST_MODEL_PATH = f"{OUTPUT_DIR}/best"  # LoRA adapter directory loaded with PeftModel.from_pretrained
LABEL_ENCODER_PATH = f"{OUTPUT_DIR}/label_encoder.joblib"  # Label encoder loaded with joblib

# Other settings
RANDOM_SEED = 42
VALIDATION_SPLIT = 0.2

# GPU settings
# NOTE(review): this value is never applied in this script; the model is
# loaded with device_map="auto" instead.
CUDA_VISIBLE_DEVICES = "0"  # GPU device to use. Set to None to use all available GPUs

# Submission settings
SUBMISSION_OUTPUT_PATH = 'qwen3_32b_0_947_submission.csv'  # CSV written by main()

# WandB settings
USE_WANDB = True  # Set to False to disable WandB (main() passes report_to="none" regardless)
WANDB_PROJECT = "qwen3-32b-math-misconceptions"
WANDB_RUN_NAME = f"qwen3-32b-ver{VER}"
WANDB_ENTITY = None  # Set your WandB entity (username or team name) if needed

# Early stopping settings
USE_EARLY_STOPPING = True
EARLY_STOPPING_PATIENCE = 10  # Maximum number of evaluations without improvement (evaluation runs every EVAL_STEPS)
EARLY_STOPPING_THRESHOLD = 0.001  # Minimum change that counts as an improvement

# LoRA configuration
LORA_RANK = 16  # LoRA rank
LORA_ALPHA = 32  # LoRA scaling parameter (2x the rank)
LORA_TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]  # Modules the adapter targets
LORA_DROPOUT = 0.1  # LoRA dropout rate
LORA_BIAS = "none"  # Bias handling: "none", "all", "lora_only"

# Memory optimization settings
USE_GRADIENT_CHECKPOINTING = True  # Enable gradient checkpointing
USE_8BIT_ADAM = True  # Use 8-bit Adam optimizer for memory efficiency
MAX_GRAD_NORM = 1.0  # Gradient clipping value

def prepare_correct_answers(train_data):
    """Derive one "correct" multiple-choice answer per question from the training data.

    Rows whose ``Category`` starts with the ``True`` token (text before the
    first underscore) are kept; for every question the answer that occurs
    most often among those rows is selected.

    Args:
        train_data: DataFrame with ``QuestionId``, ``MC_Answer`` and ``Category`` columns.

    Returns:
        DataFrame with columns ``QuestionId``, ``MC_Answer`` and ``is_correct``
        (always 1), one row per question.
    """
    # Boolean mask of rows labelled as answered correctly.
    is_true_row = [category.split('_')[0] == 'True' for category in train_data.Category]
    true_rows = train_data.loc[is_true_row].copy()

    # Attach the frequency of each (question, answer) pair, then keep the
    # most frequent answer of every question.
    pair_counts = true_rows.groupby(['QuestionId', 'MC_Answer']).MC_Answer.transform('count')
    ranked = true_rows.assign(c=pair_counts).sort_values('c', ascending=False)
    best_per_question = ranked.drop_duplicates(['QuestionId'])[['QuestionId', 'MC_Answer']]
    return best_per_question.assign(is_correct=1)


def format_input(row):
    """Build the chat-style prompt text for one data row.

    Args:
        row: mapping with ``QuestionText``, ``MC_Answer``, ``StudentExplanation``
            and ``is_correct`` entries.

    Returns:
        The prompt string: a user turn containing the task fields followed by
        an assistant turn with an empty ``<think>`` section.
    """
    status = "Yes" if row["is_correct"] else "No"

    # The pieces are concatenated verbatim; no separators are inserted.
    pieces = [
        "<|im_start|>user",
        f"[Mathematical Misconception Analysis Task]\n\n",
        f"Question: {row['QuestionText']}\n",
        f"Answer: {row['MC_Answer']}\n",
        f"Correct?: {status}\n",
        f"Explanation: {row['StudentExplanation']}\n\n",
        "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n",
    ]
    return "".join(pieces)


def tokenize_dataset(dataset, tokenizer, max_len):
    """Tokenize the ``text`` column of a dataset and expose the result as torch tensors.

    Args:
        dataset: object providing ``map``, ``column_names`` and ``set_format``.
        tokenizer: callable applied to each batch of texts.
        max_len: maximum token length; longer inputs are truncated.

    Returns:
        The mapped dataset, formatted so that ``input_ids``, ``attention_mask``
        and (when present) ``label`` are returned as torch tensors.
    """
    def _encode(examples):
        # No padding here: padding is left to the data collator at batch time.
        return tokenizer(
            examples['text'],
            padding=False,
            truncation=True,
            max_length=max_len,
            return_tensors=None,  # plain Python lists are expected inside map()
        )

    encoded = dataset.map(_encode, batched=True, batch_size=100)

    # Keep the label column in the tensor view only when the dataset has one.
    tensor_columns = ['input_ids', 'attention_mask']
    if 'label' in encoded.column_names:
        tensor_columns.append('label')
    encoded.set_format(type='torch', columns=tensor_columns)
    return encoded


def compute_map3(eval_pred):
    """Compute MAP@3 from a ``(logits, labels)`` pair.

    Each example scores 1, 1/2 or 1/3 when its label is the first, second or
    third highest-probability class respectively, and 0 otherwise.

    Args:
        eval_pred: pair of per-example class logits and integer labels.

    Returns:
        ``{"map@3": mean score over all examples}``.
    """
    logits, labels = eval_pred
    probabilities = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
    ranked_top3 = np.argsort(-probabilities, axis=1)[:, :3]

    # Credit awarded for a hit at rank 1, 2 and 3.
    rank_credit = (1.0, 1.0 / 2, 1.0 / 3)

    total = 0.0
    for row_number, label in enumerate(labels):
        ranking = ranked_top3[row_number]
        for position, credit in enumerate(rank_credit):
            if ranking[position] == label:
                total += credit
                break
    return {"map@3": total / len(labels)}


def create_submission(predictions, test_data, label_encoder, filter_true_false = True):
    """Rank the admissible labels of every test row and build the submission table.

    For each row the candidate classes are restricted to the labels listed for
    its question in the table below and, optionally, to the True_*/False_*
    family that matches the row's ``is_correct`` flag. A softmax is taken over
    the candidate logits only and the candidates are sorted by probability.

    Fallbacks (so that a single unexpected row cannot abort the whole run):
      * a ``QuestionId`` missing from the table, or one whose listed labels are
        all unknown to the encoder, uses every encoder class as candidates;
      * if the True/False filter would remove every candidate, the unfiltered
        candidates are used instead.

    Args:
        predictions: object whose ``predictions`` attribute yields one vector
            of class logits per test row, in the same order as ``test_data``.
        test_data: DataFrame with ``row_id``, ``QuestionId`` and ``is_correct``.
        label_encoder: object with ``classes_`` and ``inverse_transform``.
        filter_true_false: when True, keep only ``True_*`` classes for rows with
            ``is_correct == 1`` and only the other classes for ``is_correct == 0``.

    Returns:
        DataFrame with ``row_id``, ``QuestionId``, ``is_correct``, ``probs``
        (sorted candidate probabilities), ``preds`` (sorted candidate labels)
        and ``Category:Misconception`` (top three labels joined by spaces).
    """
    # Labels that are admissible for each question.
    question_label_choices = {
        31772: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Incomplete', 'True_Misconception:WNB',
            'False_Neither:NA', 'False_Misconception:WNB',
            'False_Misconception:Incomplete', 'False_Correct:NA',
        ],
        31774: [
            'False_Neither:NA', 'False_Misconception:SwapDividend',
            'False_Misconception:Mult', 'False_Correct:NA',
            'False_Misconception:FlipChange', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:SwapDividend',
            'True_Misconception:Mult', 'True_Misconception:FlipChange',
        ],
        31777: [
            'False_Correct:NA', 'False_Misconception:Incomplete',
            'False_Neither:NA', 'False_Misconception:Irrelevant',
            'False_Misconception:Wrong_Fraction', 'True_Correct:NA',
            'True_Neither:NA',
        ],
        31778: [
            'False_Neither:NA', 'False_Misconception:Additive',
            'False_Misconception:Irrelevant', 'False_Correct:NA',
            'False_Misconception:WNB', 'True_Neither:NA',
            'True_Correct:NA', 'True_Misconception:Irrelevant',
            'True_Misconception:Additive',
        ],
        32829: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Not_variable', 'False_Neither:NA',
            'False_Misconception:Adding_terms', 'False_Correct:NA',
            'False_Misconception:Not_variable', 'False_Misconception:Inverse_operation',
        ],
        32833: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Inversion', 'True_Misconception:Duplication',
            'False_Misconception:Duplication', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Inversion',
            'False_Misconception:Wrong_Operation',
        ],
        32835: [
            'False_Misconception:Whole_numbers_larger', 'False_Neither:NA',
            'False_Correct:NA', 'False_Misconception:Longer_is_bigger',
            'False_Misconception:Ignores_zeroes', 'False_Misconception:Shorter_is_bigger',
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Whole_numbers_larger', 'True_Misconception:Shorter_is_bigger',
            'True_Misconception:Longer_is_bigger',
        ],
        33471: [
            'True_Neither:NA', 'True_Correct:NA',
            'True_Misconception:Wrong_fraction', 'False_Correct:NA',
            'False_Misconception:Incomplete', 'False_Neither:NA',
            'False_Misconception:Wrong_fraction',
        ],
        33472: [
            'True_Neither:NA', 'True_Correct:NA',
            'True_Misconception:Adding_across', 'True_Misconception:Denominator-only_change',
            'True_Misconception:Incorrect_equivalent_fraction_addition', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Denominator-only_change',
            'False_Misconception:Incorrect_equivalent_fraction_addition', 'False_Misconception:Adding_across',
        ],
        33474: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Division', 'True_Misconception:Subtraction',
            'False_Neither:NA', 'False_Misconception:Subtraction',
            'False_Misconception:Division', 'False_Correct:NA',
        ],
        76870: [
            'False_Misconception:Unknowable', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Definition',
            'False_Misconception:Interior', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Definition',
        ],
        89443: [
            'False_Neither:NA', 'False_Misconception:Positive',
            'False_Misconception:Tacking', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Tacking',
            'True_Misconception:Positive', 'False_Correct:NA',
        ],
        91695: [
            'False_Neither:NA', 'False_Misconception:Wrong_term',
            'False_Correct:NA', 'False_Misconception:Firstterm',
            'True_Correct:NA', 'True_Misconception:Wrong_term',
            'True_Neither:NA', 'True_Misconception:Firstterm',
        ],
        104665: [
            'False_Neither:NA', 'False_Misconception:Base_rate',
            'False_Correct:NA', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Base_rate',
            'True_Misconception:Multiplying_by_4', 'False_Misconception:Multiplying_by_4',
        ],
        109465: [
            'False_Neither:NA', 'False_Correct:NA',
            'False_Misconception:Certainty', 'False_Misconception:Scale',
            'True_Correct:NA', 'True_Neither:NA',
        ],
    }

    class_names = np.asarray(label_encoder.classes_)
    all_class_ids = list(range(len(class_names)))

    # A class belongs to the "True" family only when its name starts with the
    # 'True_' prefix (a substring test could also match a misconception name);
    # every other class is treated as a "False" class.
    true_class_ids = {
        idx for idx, name in enumerate(class_names) if str(name).startswith('True_')
    }

    # Translate the per-question label names into label-encoder indices.
    question_label_choice_ids = {
        qid: [int(i) for i in np.where(np.isin(class_names, labels))[0]]
        for qid, labels in question_label_choices.items()
    }

    test_probabilities = []
    test_predictions = []
    test_top3_predictions = []

    rows = zip(test_data.QuestionId.tolist(), test_data.is_correct.tolist(), predictions.predictions)
    for qid, correct, row in rows:
        # Unknown question (or no usable label for it): consider every class
        # instead of raising KeyError.
        candidate_idx = question_label_choice_ids.get(qid) or all_class_ids

        if filter_true_false:
            if correct == 1:
                filtered = [c for c in candidate_idx if c in true_class_ids]
            elif correct == 0:
                filtered = [c for c in candidate_idx if c not in true_class_ids]
            else:
                filtered = candidate_idx
            # Never filter down to nothing: an empty candidate set would
            # produce an empty prediction string for this row.
            if filtered:
                candidate_idx = filtered

        candidate_logits = np.asarray(row)[candidate_idx]
        candidate_probs = torch.nn.functional.softmax(torch.tensor(candidate_logits), dim=-1).numpy()

        # Positions of the candidates sorted by decreasing probability.
        order = np.argsort(-candidate_probs)

        # Map positions within the candidate list back to encoder indices.
        topk_idx = np.array(candidate_idx)[order]
        topk_probs = candidate_probs[order].tolist()
        topk_preds = label_encoder.inverse_transform(topk_idx).tolist()

        test_probabilities.append(topk_probs)
        test_predictions.append(topk_preds)
        test_top3_predictions.append(" ".join(topk_preds[:3]))

    test_submission_data = pd.DataFrame({
        "row_id": test_data.row_id.tolist(),
        "QuestionId": test_data.QuestionId.tolist(),
        "is_correct": test_data.is_correct.tolist(),
        "probs": test_probabilities,
        "preds": test_predictions,
        'Category:Misconception': test_top3_predictions
    })

    return test_submission_data


def main():
    """Run test-set inference with the LoRA fine-tuned classifier and save the submission.

    Loads the label encoder, the 4-bit quantized base model with its LoRA
    adapter and the tokenizer, builds prompts for the test rows, predicts
    with a ``Trainer`` and writes the table produced by ``create_submission``
    to ``SUBMISSION_OUTPUT_PATH``.

    Raises:
        ImportError: if ``peft`` could not be imported when the module loaded.
    """
    # Release cached GPU memory and collect garbage before loading anything.
    torch.cuda.empty_cache()
    gc.collect()

    # CUDA allocator setting.
    import os
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    # Report when more than one GPU is visible.
    gpu_count = torch.cuda.device_count()
    if gpu_count > 1:
        print(f"Found {gpu_count} GPUs")

    print("Loading label encoder...")
    label_encoder = joblib.load(LABEL_ENCODER_PATH)
    num_labels = len(label_encoder.classes_)

    print("Loading trained model and tokenizer...")

    # The fine-tuned weights are a LoRA adapter, so PEFT is mandatory.
    if not PEFT_AVAILABLE:
        raise ImportError("PEFT is required to load the fine-tuned model. Please install peft: pip install peft")

    print(f"Loading fine-tuned LoRA model from: {BEST_MODEL_PATH}")
    print(f"Loading base model from: {MODEL_NAME}")

    # Base model: 4-bit NF4 weights with double quantization, float16 compute.
    from transformers import BitsAndBytesConfig

    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )
    base_model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=num_labels,
        trust_remote_code=True,
        quantization_config=nf4_config,
        device_map="auto",  # let the loader place the weights on the available devices
        low_cpu_mem_usage=True
    )

    # Attach the LoRA adapter and switch to inference mode. No .to('cuda')
    # call is made: placement is handled by device_map above.
    model = PeftModel.from_pretrained(base_model, BEST_MODEL_PATH)
    model.eval()

    # The tokenizer comes from the base model directory.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    print("Successfully loaded LoRA fine-tuned model")

    # Fall back to the EOS token when the tokenizer defines no padding token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    # Propagate the padding id to the model configuration(s).
    pad_id = tokenizer.pad_token_id
    if not hasattr(model, 'base_model'):
        model.config.pad_token_id = pad_id
    else:
        wrapped = model.base_model
        wrapped.config.pad_token_id = pad_id
        if hasattr(wrapped, 'model'):
            wrapped.model.config.pad_token_id = pad_id

    print("Loading test data...")
    test_df = pd.read_csv(TEST_DATA_PATH)

    print("Loading training data for correct answers...")
    # The training data is only used to look up the correct answer per question.
    train_df = pd.read_csv(TRAIN_DATA_PATH)
    train_df['Misconception'] = train_df['Misconception'].fillna('NA')
    correct_answers = prepare_correct_answers(train_df)

    print("Preprocessing test data...")
    # Flag each test row as correct/incorrect, then render its prompt.
    test_df = test_df.merge(correct_answers, on=['QuestionId','MC_Answer'], how='left')
    test_df['is_correct'] = test_df['is_correct'].fillna(0)
    test_df['text'] = test_df.apply(format_input, axis=1)

    print("Tokenizing test data...")
    test_dataset = tokenize_dataset(Dataset.from_pandas(test_df[['text']]), tokenizer, MAX_LEN)

    # Pads each batch when it is collated.
    collator = DataCollatorWithPadding(tokenizer=tokenizer)

    print("Running inference...")

    # Allow TF32 matmul/cuDNN kernels.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    inference_args = TrainingArguments(
        output_dir="./tmp",  # required argument; scratch directory
        report_to="none",    # no experiment tracking
        per_device_eval_batch_size=EVAL_BATCH_SIZE,
        fp16=True,
        dataloader_pin_memory=True,
        dataloader_num_workers=2,
    )
    trainer = Trainer(
        model=model,
        processing_class=tokenizer,
        data_collator=collator,
        args=inference_args
    )
    with torch.no_grad():
        predictions = trainer.predict(test_dataset)

    print("Creating submission file...")
    submission = create_submission(predictions, test_df, label_encoder)

    submission.to_csv(SUBMISSION_OUTPUT_PATH, index=False)
    print(f"Submission file saved to: {SUBMISSION_OUTPUT_PATH}")
    print("\nSubmission preview:")
    print(submission.head())
    print(f"\nSubmission shape: {submission.shape}")


# Run the inference pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Writing qwen3_32b_0_947.py


In [None]:
%%writefile phi_4_0_948_fulltrain.py

import os
import gc
import pandas as pd
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset
import joblib
import torch

try:
    from peft import PeftModel, PeftConfig
    PEFT_AVAILABLE = True
except ImportError:
    PEFT_AVAILABLE = False
    print("Warning: PEFT not available, will use base model only")

# Model configuration
# NOTE: the inference code in this script only reads MODEL_NAME, MAX_LEN,
# EVAL_BATCH_SIZE, the data paths, BEST_MODEL_PATH, LABEL_ENCODER_PATH and
# SUBMISSION_OUTPUT_PATH. The other constants are not referenced below and
# appear to be carried over from the training script (TODO confirm).
VER = 2  # Version tag; used below only to build WANDB_RUN_NAME
MODEL_NAME = "/kaggle/input/ms-phi4/transformers/default/1/phi-4"  # Path the base model and tokenizer are loaded from
MODEL_TYPE = "phi"  # Phi-4 model type; not referenced by the code in this script
EPOCHS = 3  # Number of training epochs; not referenced by the code in this script
MAX_LEN = 250  # Maximum number of tokens per example (longer inputs are truncated)

# Directory settings
OUTPUT_DIR = f"/kaggle/input/phi-4-cv0965-fulltrain/transformers/default/1/ver_2_0965ft"  # Directory holding the adapter and label encoder

# Training parameters
TRAIN_BATCH_SIZE = 4  # Per-device training batch size; not referenced by the code in this script
EVAL_BATCH_SIZE = 4  # Per-device batch size used for prediction
GRADIENT_ACCUMULATION_STEPS = 16  # Gradient accumulation steps; not referenced by the code in this script
LEARNING_RATE = 2e-4
LOGGING_STEPS = 50
SAVE_STEPS = 200
EVAL_STEPS = 200


# Data paths
TRAIN_DATA_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/train.csv'
TEST_DATA_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/test.csv'

# Model save paths
BEST_MODEL_PATH = f"{OUTPUT_DIR}/checkpoint-1722"  # LoRA adapter checkpoint loaded with PeftModel.from_pretrained
LABEL_ENCODER_PATH = f"{OUTPUT_DIR}/label_encoder.joblib"  # Label encoder loaded with joblib

# Other settings
RANDOM_SEED = 42
VALIDATION_SPLIT = 0.2

# GPU settings
# NOTE(review): this value is never applied in this script; the model is
# loaded with device_map="auto" instead.
CUDA_VISIBLE_DEVICES = "0"  # GPU device to use. Set to None to use all available GPUs

# Submission settings
SUBMISSION_OUTPUT_PATH = 'phi_4_0_948_fulltrain_submission.csv'  # CSV written by main()

# WandB settings
USE_WANDB = True  # Set to False to disable WandB (main() passes report_to="none" regardless)
WANDB_PROJECT = "phi-4-math-misconceptions"
WANDB_RUN_NAME = f"phi-4-ver{VER}"
WANDB_ENTITY = None  # Set your WandB entity (username or team name) if needed

# Early stopping settings
USE_EARLY_STOPPING = True
EARLY_STOPPING_PATIENCE = 10  # Maximum number of evaluations without improvement (evaluation runs every EVAL_STEPS)
EARLY_STOPPING_THRESHOLD = 0.001  # Minimum change that counts as an improvement

# LoRA configuration for Phi-4
LORA_RANK = 64  # LoRA rank
LORA_ALPHA = 128  # LoRA scaling parameter (2x the rank)
LORA_TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]  # Phi-4 target modules
LORA_DROPOUT = 0.1  # LoRA dropout rate
LORA_BIAS = "none"  # Bias handling: "none", "all", "lora_only"

# Memory optimization settings
USE_GRADIENT_CHECKPOINTING = True  # Enable gradient checkpointing
USE_8BIT_ADAM = False  # 8-bit Adam optimizer is disabled for this model
MAX_GRAD_NORM = 1.0  # Gradient clipping value

def prepare_correct_answers(train_data):
    """Derive one "correct" multiple-choice answer per question from the training data.

    Rows whose ``Category`` starts with the ``True`` token (text before the
    first underscore) are kept; for every question the answer that occurs
    most often among those rows is selected.

    Args:
        train_data: DataFrame with ``QuestionId``, ``MC_Answer`` and ``Category`` columns.

    Returns:
        DataFrame with columns ``QuestionId``, ``MC_Answer`` and ``is_correct``
        (always 1), one row per question.
    """
    # Boolean mask of rows labelled as answered correctly.
    is_true_row = [category.split('_')[0] == 'True' for category in train_data.Category]
    true_rows = train_data.loc[is_true_row].copy()

    # Attach the frequency of each (question, answer) pair, then keep the
    # most frequent answer of every question.
    pair_counts = true_rows.groupby(['QuestionId', 'MC_Answer']).MC_Answer.transform('count')
    ranked = true_rows.assign(c=pair_counts).sort_values('c', ascending=False)
    best_per_question = ranked.drop_duplicates(['QuestionId'])[['QuestionId', 'MC_Answer']]
    return best_per_question.assign(is_correct=1)


def format_input(row):
    """Build the Phi-4 chat-style prompt text for one data row.

    Args:
        row: mapping with ``QuestionText``, ``MC_Answer``, ``StudentExplanation``
            and ``is_correct`` entries.

    Returns:
        The prompt string: a user turn containing the task fields followed by
        an assistant turn that opens with a fixed ``<think>`` section.
    """
    status = "Yes" if row["is_correct"] else "No"

    # The pieces are concatenated verbatim; no separators are inserted.
    pieces = [
        "<|user|>\n",
        f"[Mathematical Misconception Analysis Task]\n\n",
        f"Question: {row['QuestionText']}\n",
        f"Answer: {row['MC_Answer']}\n",
        f"Correct?: {status}\n",
        f"Explanation: {row['StudentExplanation']}\n",
        "<|end|>\n",
        "<|assistant|>\n",
        "<think>\n",
        "Let me analyze this mathematical misconception...\n",
        "</think>\n\n",
    ]
    return "".join(pieces)


def tokenize_dataset(dataset, tokenizer, max_len):
    """Tokenize the ``text`` column of a dataset and expose the result as torch tensors.

    Args:
        dataset: object providing ``map``, ``column_names`` and ``set_format``.
        tokenizer: callable applied to each batch of texts.
        max_len: maximum token length; longer inputs are truncated.

    Returns:
        The mapped dataset, formatted so that ``input_ids``, ``attention_mask``
        and (when present) ``label`` are returned as torch tensors.
    """
    def _encode(examples):
        # No padding here: padding is left to the data collator at batch time.
        return tokenizer(
            examples['text'],
            padding=False,
            truncation=True,
            max_length=max_len,
            return_tensors=None,  # plain Python lists are expected inside map()
        )

    encoded = dataset.map(_encode, batched=True, batch_size=100)

    # Keep the label column in the tensor view only when the dataset has one.
    tensor_columns = ['input_ids', 'attention_mask']
    if 'label' in encoded.column_names:
        tensor_columns.append('label')
    encoded.set_format(type='torch', columns=tensor_columns)
    return encoded


def compute_map3(eval_pred):
    """Compute MAP@3 from a ``(logits, labels)`` pair.

    Each example scores 1, 1/2 or 1/3 when its label is the first, second or
    third highest-probability class respectively, and 0 otherwise.

    Args:
        eval_pred: pair of per-example class logits and integer labels.

    Returns:
        ``{"map@3": mean score over all examples}``.
    """
    logits, labels = eval_pred
    probabilities = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
    ranked_top3 = np.argsort(-probabilities, axis=1)[:, :3]

    # Credit awarded for a hit at rank 1, 2 and 3.
    rank_credit = (1.0, 1.0 / 2, 1.0 / 3)

    total = 0.0
    for row_number, label in enumerate(labels):
        ranking = ranked_top3[row_number]
        for position, credit in enumerate(rank_credit):
            if ranking[position] == label:
                total += credit
                break
    return {"map@3": total / len(labels)}


def create_submission(predictions, test_data, label_encoder, filter_true_false = True):
    """Rank the admissible labels of every test row and build the submission table.

    For each row the candidate classes are restricted to the labels listed for
    its question in the table below and, optionally, to the True_*/False_*
    family that matches the row's ``is_correct`` flag. A softmax is taken over
    the candidate logits only and the candidates are sorted by probability.

    Fallbacks (so that a single unexpected row cannot abort the whole run):
      * a ``QuestionId`` missing from the table, or one whose listed labels are
        all unknown to the encoder, uses every encoder class as candidates;
      * if the True/False filter would remove every candidate, the unfiltered
        candidates are used instead.

    Args:
        predictions: object whose ``predictions`` attribute yields one vector
            of class logits per test row, in the same order as ``test_data``.
        test_data: DataFrame with ``row_id``, ``QuestionId`` and ``is_correct``.
        label_encoder: object with ``classes_`` and ``inverse_transform``.
        filter_true_false: when True, keep only ``True_*`` classes for rows with
            ``is_correct == 1`` and only the other classes for ``is_correct == 0``.

    Returns:
        DataFrame with ``row_id``, ``QuestionId``, ``is_correct``, ``probs``
        (sorted candidate probabilities), ``preds`` (sorted candidate labels)
        and ``Category:Misconception`` (top three labels joined by spaces).
    """
    # Labels that are admissible for each question.
    question_label_choices = {
        31772: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Incomplete', 'True_Misconception:WNB',
            'False_Neither:NA', 'False_Misconception:WNB',
            'False_Misconception:Incomplete', 'False_Correct:NA',
        ],
        31774: [
            'False_Neither:NA', 'False_Misconception:SwapDividend',
            'False_Misconception:Mult', 'False_Correct:NA',
            'False_Misconception:FlipChange', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:SwapDividend',
            'True_Misconception:Mult', 'True_Misconception:FlipChange',
        ],
        31777: [
            'False_Correct:NA', 'False_Misconception:Incomplete',
            'False_Neither:NA', 'False_Misconception:Irrelevant',
            'False_Misconception:Wrong_Fraction', 'True_Correct:NA',
            'True_Neither:NA',
        ],
        31778: [
            'False_Neither:NA', 'False_Misconception:Additive',
            'False_Misconception:Irrelevant', 'False_Correct:NA',
            'False_Misconception:WNB', 'True_Neither:NA',
            'True_Correct:NA', 'True_Misconception:Irrelevant',
            'True_Misconception:Additive',
        ],
        32829: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Not_variable', 'False_Neither:NA',
            'False_Misconception:Adding_terms', 'False_Correct:NA',
            'False_Misconception:Not_variable', 'False_Misconception:Inverse_operation',
        ],
        32833: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Inversion', 'True_Misconception:Duplication',
            'False_Misconception:Duplication', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Inversion',
            'False_Misconception:Wrong_Operation',
        ],
        32835: [
            'False_Misconception:Whole_numbers_larger', 'False_Neither:NA',
            'False_Correct:NA', 'False_Misconception:Longer_is_bigger',
            'False_Misconception:Ignores_zeroes', 'False_Misconception:Shorter_is_bigger',
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Whole_numbers_larger', 'True_Misconception:Shorter_is_bigger',
            'True_Misconception:Longer_is_bigger',
        ],
        33471: [
            'True_Neither:NA', 'True_Correct:NA',
            'True_Misconception:Wrong_fraction', 'False_Correct:NA',
            'False_Misconception:Incomplete', 'False_Neither:NA',
            'False_Misconception:Wrong_fraction',
        ],
        33472: [
            'True_Neither:NA', 'True_Correct:NA',
            'True_Misconception:Adding_across', 'True_Misconception:Denominator-only_change',
            'True_Misconception:Incorrect_equivalent_fraction_addition', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Denominator-only_change',
            'False_Misconception:Incorrect_equivalent_fraction_addition', 'False_Misconception:Adding_across',
        ],
        33474: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Division', 'True_Misconception:Subtraction',
            'False_Neither:NA', 'False_Misconception:Subtraction',
            'False_Misconception:Division', 'False_Correct:NA',
        ],
        76870: [
            'False_Misconception:Unknowable', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Definition',
            'False_Misconception:Interior', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Definition',
        ],
        89443: [
            'False_Neither:NA', 'False_Misconception:Positive',
            'False_Misconception:Tacking', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Tacking',
            'True_Misconception:Positive', 'False_Correct:NA',
        ],
        91695: [
            'False_Neither:NA', 'False_Misconception:Wrong_term',
            'False_Correct:NA', 'False_Misconception:Firstterm',
            'True_Correct:NA', 'True_Misconception:Wrong_term',
            'True_Neither:NA', 'True_Misconception:Firstterm',
        ],
        104665: [
            'False_Neither:NA', 'False_Misconception:Base_rate',
            'False_Correct:NA', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Base_rate',
            'True_Misconception:Multiplying_by_4', 'False_Misconception:Multiplying_by_4',
        ],
        109465: [
            'False_Neither:NA', 'False_Correct:NA',
            'False_Misconception:Certainty', 'False_Misconception:Scale',
            'True_Correct:NA', 'True_Neither:NA',
        ],
    }

    class_names = np.asarray(label_encoder.classes_)
    all_class_ids = list(range(len(class_names)))

    # A class belongs to the "True" family only when its name starts with the
    # 'True_' prefix (a substring test could also match a misconception name);
    # every other class is treated as a "False" class.
    true_class_ids = {
        idx for idx, name in enumerate(class_names) if str(name).startswith('True_')
    }

    # Translate the per-question label names into label-encoder indices.
    question_label_choice_ids = {
        qid: [int(i) for i in np.where(np.isin(class_names, labels))[0]]
        for qid, labels in question_label_choices.items()
    }

    test_probabilities = []
    test_predictions = []
    test_top3_predictions = []

    rows = zip(test_data.QuestionId.tolist(), test_data.is_correct.tolist(), predictions.predictions)
    for qid, correct, row in rows:
        # Unknown question (or no usable label for it): consider every class
        # instead of raising KeyError.
        candidate_idx = question_label_choice_ids.get(qid) or all_class_ids

        if filter_true_false:
            if correct == 1:
                filtered = [c for c in candidate_idx if c in true_class_ids]
            elif correct == 0:
                filtered = [c for c in candidate_idx if c not in true_class_ids]
            else:
                filtered = candidate_idx
            # Never filter down to nothing: an empty candidate set would
            # produce an empty prediction string for this row.
            if filtered:
                candidate_idx = filtered

        candidate_logits = np.asarray(row)[candidate_idx]
        candidate_probs = torch.nn.functional.softmax(torch.tensor(candidate_logits), dim=-1).numpy()

        # Positions of the candidates sorted by decreasing probability.
        order = np.argsort(-candidate_probs)

        # Map positions within the candidate list back to encoder indices.
        topk_idx = np.array(candidate_idx)[order]
        topk_probs = candidate_probs[order].tolist()
        topk_preds = label_encoder.inverse_transform(topk_idx).tolist()

        test_probabilities.append(topk_probs)
        test_predictions.append(topk_preds)
        test_top3_predictions.append(" ".join(topk_preds[:3]))

    test_submission_data = pd.DataFrame({
        "row_id": test_data.row_id.tolist(),
        "QuestionId": test_data.QuestionId.tolist(),
        "is_correct": test_data.is_correct.tolist(),
        "probs": test_probabilities,
        "preds": test_predictions,
        'Category:Misconception': test_top3_predictions
    })

    return test_submission_data


def main():
    """Run test-set inference with the LoRA fine-tuned Phi-4 classifier and save the submission.

    Loads the label encoder, the float16 base model with its LoRA adapter and
    the tokenizer, builds prompts for the test rows, predicts with a
    ``Trainer`` and writes the table produced by ``create_submission`` to
    ``SUBMISSION_OUTPUT_PATH``.

    Raises:
        ImportError: if ``peft`` could not be imported when the module loaded.
    """
    # Release cached GPU memory and collect garbage before loading anything.
    torch.cuda.empty_cache()
    gc.collect()

    # CUDA allocator setting.
    import os
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    # Report when more than one GPU is visible.
    gpu_count = torch.cuda.device_count()
    if gpu_count > 1:
        print(f"Found {gpu_count} GPUs")

    print("Loading label encoder...")
    label_encoder = joblib.load(LABEL_ENCODER_PATH)
    num_labels = len(label_encoder.classes_)

    print("Loading trained model and tokenizer...")

    # The fine-tuned weights are a LoRA adapter, so PEFT is mandatory.
    if not PEFT_AVAILABLE:
        raise ImportError("PEFT is required to load the fine-tuned model. Please install peft: pip install peft")

    print(f"Loading fine-tuned LoRA model from: {BEST_MODEL_PATH}")
    print(f"Loading base model from: {MODEL_NAME}")

    # Base model is loaded without quantization, in float16.
    base_model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=num_labels,
        trust_remote_code=True,
        device_map="auto",  # let the loader place the weights on the available devices
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True
    )

    # Attach the LoRA adapter and switch to inference mode. No .to('cuda')
    # call is made: placement is handled by device_map above.
    model = PeftModel.from_pretrained(base_model, BEST_MODEL_PATH)
    model.eval()

    # The tokenizer comes from the base model directory.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    print("Successfully loaded LoRA fine-tuned model")

    # Only applied when the tokenizer defines no padding token.
    # NOTE(review): confirm that token id 100257 is the intended padding id
    # for this tokenizer.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = "<|finetune_right_pad_id|>"
        tokenizer.pad_token_id = 100257

    # Propagate the padding id to the model configuration(s).
    pad_id = tokenizer.pad_token_id
    if not hasattr(model, 'base_model'):
        model.config.pad_token_id = pad_id
    else:
        wrapped = model.base_model
        wrapped.config.pad_token_id = pad_id
        if hasattr(wrapped, 'model'):
            wrapped.model.config.pad_token_id = pad_id

    print("Loading test data...")
    test_df = pd.read_csv(TEST_DATA_PATH)

    print("Loading training data for correct answers...")
    # The training data is only used to look up the correct answer per question.
    train_df = pd.read_csv(TRAIN_DATA_PATH)
    train_df['Misconception'] = train_df['Misconception'].fillna('NA')
    correct_answers = prepare_correct_answers(train_df)

    print("Preprocessing test data...")
    # Flag each test row as correct/incorrect, then render its prompt.
    test_df = test_df.merge(correct_answers, on=['QuestionId','MC_Answer'], how='left')
    test_df['is_correct'] = test_df['is_correct'].fillna(0)
    test_df['text'] = test_df.apply(format_input, axis=1)

    print("Tokenizing test data...")
    test_dataset = tokenize_dataset(Dataset.from_pandas(test_df[['text']]), tokenizer, MAX_LEN)

    # Pads each batch when it is collated.
    collator = DataCollatorWithPadding(tokenizer=tokenizer)

    print("Running inference...")

    # Allow TF32 matmul/cuDNN kernels.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    inference_args = TrainingArguments(
        output_dir="./tmp",  # required argument; scratch directory
        report_to="none",    # no experiment tracking
        per_device_eval_batch_size=EVAL_BATCH_SIZE,
        fp16=True,
        dataloader_pin_memory=True,
        dataloader_num_workers=2,
    )
    trainer = Trainer(
        model=model,
        processing_class=tokenizer,
        data_collator=collator,
        args=inference_args
    )
    with torch.no_grad():
        predictions = trainer.predict(test_dataset)

    print("Creating submission file...")
    submission = create_submission(predictions, test_df, label_encoder)

    submission.to_csv(SUBMISSION_OUTPUT_PATH, index=False)
    print(f"Submission file saved to: {SUBMISSION_OUTPUT_PATH}")
    print("\nSubmission preview:")
    print(submission.head())
    print(f"\nSubmission shape: {submission.shape}")


# Run the inference pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Writing phi_4_0_948_fulltrain.py


In [None]:
%%writefile phi_4_reasoning_0_948.py

import os
import gc
import pandas as pd
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset
import joblib
import torch

try:
    from peft import PeftModel, PeftConfig
    PEFT_AVAILABLE = True
except ImportError:
    PEFT_AVAILABLE = False
    print("Warning: PEFT not available, will use base model only")

# Model configuration
# NOTE(review): main() in this script reads MODEL_NAME, MAX_LEN, EVAL_BATCH_SIZE,
# the data paths, BEST_MODEL_PATH, LABEL_ENCODER_PATH and SUBMISSION_OUTPUT_PATH.
# The other constants are not referenced by the functions in this script and
# look like leftovers from the training configuration - confirm before removing.
VER = 2
MODEL_NAME = "/kaggle/input/phi4-reasoning-plus/transformers/default/1/Phi-4-reasoning-plus"
MODEL_TYPE = "phi"  # Phi-4 model type
EPOCHS = 3  # Number of training epochs
MAX_LEN = 250  # Maximum token length; inputs are truncated to this many tokens

# Directory settings
OUTPUT_DIR = f"/kaggle/input/phi-4-reasoning-plus09476-ft/transformers/default/1/ver_2_9476ft"

# Training parameters
TRAIN_BATCH_SIZE = 4  # Per-device training batch size
EVAL_BATCH_SIZE = 4  # Per-device evaluation batch size (used for inference in main())
GRADIENT_ACCUMULATION_STEPS = 16  # Gradient accumulation steps
LEARNING_RATE = 2e-4
LOGGING_STEPS = 50
SAVE_STEPS = 200
EVAL_STEPS = 200


# Data paths
TRAIN_DATA_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/train.csv'
TEST_DATA_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/test.csv'

# Model save paths
BEST_MODEL_PATH = f"{OUTPUT_DIR}/checkpoint-1722"  # LoRA adapter checkpoint loaded by main()
LABEL_ENCODER_PATH = f"{OUTPUT_DIR}/label_encoder.joblib"

# Other settings
RANDOM_SEED = 42
VALIDATION_SPLIT = 0.0000001

# GPU settings
# NOTE(review): main() never applies this value; the model is placed with device_map="auto".
CUDA_VISIBLE_DEVICES = "0,1"  # GPU device indices

# Submission settings
SUBMISSION_OUTPUT_PATH = 'phi_4_reasoning_0_948_submission.csv'

# WandB settings
# NOTE(review): the Trainer built in main() passes report_to="none", so these are not used for inference.
USE_WANDB = True  # Set to False to disable WandB
WANDB_PROJECT = "phi-4-reasoning-math-misconceptions"
WANDB_RUN_NAME = f"phi-4-reasoning-ver{VER}"
WANDB_ENTITY = None  # Set your WandB entity (username or team name) if needed

# Early stopping settings
USE_EARLY_STOPPING = True
EARLY_STOPPING_PATIENCE = 10  # Max number of evaluations without improvement (one evaluation every EVAL_STEPS)
EARLY_STOPPING_THRESHOLD = 0.001  # Minimum change that counts as an improvement

# LoRA configuration for Phi-4
LORA_RANK = 64  # LoRA rank
LORA_ALPHA = 128  # LoRA scaling parameter (twice LORA_RANK)
LORA_TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]  # Modules that receive LoRA adapters
LORA_DROPOUT = 0.1  # LoRA dropout rate
LORA_BIAS = "none"  # Bias handling: "none", "all", "lora_only"

# Memory optimization settings
USE_GRADIENT_CHECKPOINTING = True  # Enable gradient checkpointing
USE_8BIT_ADAM = False  # Whether to use the 8-bit Adam optimizer
MAX_GRAD_NORM = 1.0  # Gradient clipping value

def prepare_correct_answers(train_data):
    """Derive the presumed correct multiple-choice answer for each question.

    Rows whose ``Category`` has ``True`` as the text before its first underscore
    mark answers that were graded correct. For every question, the answer that
    occurs most often among those rows is kept.

    Args:
        train_data: DataFrame with ``QuestionId``, ``MC_Answer`` and ``Category``
            columns.

    Returns:
        DataFrame with one row per ``QuestionId`` and the columns ``QuestionId``,
        ``MC_Answer`` and ``is_correct`` (always 1).
    """
    # Vectorised prefix test. A missing Category compares as False, so such rows
    # are simply skipped instead of raising inside a row-wise lambda.
    prefix = train_data['Category'].str.split('_').str[0]
    correct = train_data.loc[prefix.eq('True')].copy()

    # Count how often each (question, answer) pair was graded correct and keep
    # the most frequent answer per question.
    correct['c'] = correct.groupby(['QuestionId', 'MC_Answer']).MC_Answer.transform('count')
    correct = correct.sort_values('c', ascending=False)
    correct = correct.drop_duplicates(['QuestionId'])[['QuestionId', 'MC_Answer']]
    correct['is_correct'] = 1
    return correct


def format_input(row):
    """Render one data row as the chat-style prompt fed to the classifier.

    Args:
        row: Mapping-like record providing ``is_correct``, ``QuestionText``,
            ``MC_Answer`` and ``StudentExplanation``.

    Returns:
        The prompt string: a user turn with the task fields, followed by an
        assistant turn that opens with a fixed ``<think>`` section.
    """
    status = "Yes" if row["is_correct"] else "No"

    # User turn: task header plus the per-row fields.
    user_turn = "".join([
        "<|user|>\n",
        "[Mathematical Misconception Analysis Task]\n\n",
        f"Question: {row['QuestionText']}\n",
        f"Answer: {row['MC_Answer']}\n",
        f"Correct?: {status}\n",
        f"Explanation: {row['StudentExplanation']}\n",
        "<|end|>\n",
    ])

    # Assistant turn: fixed preamble containing the think tags.
    assistant_turn = "".join([
        "<|assistant|>\n",
        "<think>\n",
        "Let me analyze this mathematical misconception...\n",
        "</think>\n\n",
    ])
    return user_turn + assistant_turn


def tokenize_dataset(dataset, tokenizer, max_len):
    """Tokenize the ``text`` column and expose the result as torch tensors.

    No padding is applied here; sequences are only truncated to ``max_len`` and
    padding is left to the data collator at batch time.

    Args:
        dataset: Dataset object providing ``map``, ``column_names`` and
            ``set_format``, with a ``text`` column.
        tokenizer: Callable tokenizer applied to each batch of texts.
        max_len: Maximum number of tokens kept per example.

    Returns:
        The mapped dataset, formatted as torch with ``input_ids`` and
        ``attention_mask`` (plus ``label`` when that column exists).
    """
    def _encode(batch):
        # Plain Python lists are returned (return_tensors=None) because this
        # runs inside dataset.map.
        encode_kwargs = dict(
            padding=False,
            truncation=True,
            max_length=max_len,
            return_tensors=None,
        )
        return tokenizer(batch['text'], **encode_kwargs)

    tokenized = dataset.map(_encode, batched=True, batch_size=100)

    # Keep the label column in the tensor view only when the dataset has one.
    tensor_columns = ['input_ids', 'attention_mask']
    if 'label' in tokenized.column_names:
        tensor_columns.append('label')
    tokenized.set_format(type='torch', columns=tensor_columns)
    return tokenized


def compute_map3(eval_pred):
    """Compute MAP@3 from the three highest-scoring classes of each row.

    Args:
        eval_pred: Pair ``(logits, labels)`` where ``logits`` holds one score
            vector per example and ``labels`` the true class index per example.

    Returns:
        ``{"map@3": value}``: the mean over examples of 1, 1/2 or 1/3 when the
        true label is ranked first, second or third, and 0 otherwise.
    """
    logits, labels = eval_pred
    probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()
    top3 = np.argsort(-probs, axis=1)[:, :3]

    # Credit awarded for a hit at rank position 0, 1 and 2.
    credit = (1.0, 1.0 / 2, 1.0 / 3)
    total = 0.0
    for i, label in enumerate(labels):
        ranked = top3[i]
        for position in range(3):
            if ranked[position] == label:
                total += credit[position]
                break
    return {"map@3": total / len(labels)}


def create_submission(predictions, test_data, label_encoder, filter_true_false = True):
    """Turn raw model outputs into per-row ranked label predictions.

    For every test row the logits are restricted to the labels listed for that
    row's question in ``question_label_choices`` (and, optionally, to the
    ``True_*`` or ``False_*`` half matching ``is_correct``), re-normalised with
    a softmax over the remaining candidates and sorted from most to least
    likely.

    Args:
        predictions: Object exposing a ``predictions`` attribute that yields one
            logit vector per test row, indexable by a list of class ids.
        test_data: DataFrame with ``row_id``, ``QuestionId`` and ``is_correct``
            columns, aligned row-for-row with ``predictions.predictions``.
        label_encoder: Fitted encoder exposing ``classes_`` and
            ``inverse_transform``.
        filter_true_false: When true, rows with ``is_correct == 1`` keep only
            ``True_*`` labels and rows with ``is_correct == 0`` keep only the
            remaining labels.

    Returns:
        DataFrame with the columns ``row_id``, ``QuestionId``, ``is_correct``,
        ``probs`` (candidate probabilities, descending), ``preds`` (the matching
        labels) and ``Category:Misconception`` (top three labels joined by
        spaces).
    """
    # Labels that are valid for each question.
    question_label_choices = {
        31772: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Incomplete', 'True_Misconception:WNB',
            'False_Neither:NA', 'False_Misconception:WNB',
            'False_Misconception:Incomplete', 'False_Correct:NA',
        ],
        31774: [
            'False_Neither:NA', 'False_Misconception:SwapDividend',
            'False_Misconception:Mult', 'False_Correct:NA',
            'False_Misconception:FlipChange', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:SwapDividend',
            'True_Misconception:Mult', 'True_Misconception:FlipChange',
        ],
        31777: [
            'False_Correct:NA', 'False_Misconception:Incomplete',
            'False_Neither:NA', 'False_Misconception:Irrelevant',
            'False_Misconception:Wrong_Fraction', 'True_Correct:NA',
            'True_Neither:NA',
        ],
        31778: [
            'False_Neither:NA', 'False_Misconception:Additive',
            'False_Misconception:Irrelevant', 'False_Correct:NA',
            'False_Misconception:WNB', 'True_Neither:NA',
            'True_Correct:NA', 'True_Misconception:Irrelevant',
            'True_Misconception:Additive',
        ],
        32829: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Not_variable', 'False_Neither:NA',
            'False_Misconception:Adding_terms', 'False_Correct:NA',
            'False_Misconception:Not_variable', 'False_Misconception:Inverse_operation',
        ],
        32833: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Inversion', 'True_Misconception:Duplication',
            'False_Misconception:Duplication', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Inversion',
            'False_Misconception:Wrong_Operation',
        ],
        32835: [
            'False_Misconception:Whole_numbers_larger', 'False_Neither:NA',
            'False_Correct:NA', 'False_Misconception:Longer_is_bigger',
            'False_Misconception:Ignores_zeroes', 'False_Misconception:Shorter_is_bigger',
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Whole_numbers_larger', 'True_Misconception:Shorter_is_bigger',
            'True_Misconception:Longer_is_bigger',
        ],
        33471: [
            'True_Neither:NA', 'True_Correct:NA',
            'True_Misconception:Wrong_fraction', 'False_Correct:NA',
            'False_Misconception:Incomplete', 'False_Neither:NA',
            'False_Misconception:Wrong_fraction',
        ],
        33472: [
            'True_Neither:NA', 'True_Correct:NA',
            'True_Misconception:Adding_across', 'True_Misconception:Denominator-only_change',
            'True_Misconception:Incorrect_equivalent_fraction_addition', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Denominator-only_change',
            'False_Misconception:Incorrect_equivalent_fraction_addition', 'False_Misconception:Adding_across',
        ],
        33474: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Division', 'True_Misconception:Subtraction',
            'False_Neither:NA', 'False_Misconception:Subtraction',
            'False_Misconception:Division', 'False_Correct:NA',
        ],
        76870: [
            'False_Misconception:Unknowable', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Definition',
            'False_Misconception:Interior', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Definition',
        ],
        89443: [
            'False_Neither:NA', 'False_Misconception:Positive',
            'False_Misconception:Tacking', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Tacking',
            'True_Misconception:Positive', 'False_Correct:NA',
        ],
        91695: [
            'False_Neither:NA', 'False_Misconception:Wrong_term',
            'False_Correct:NA', 'False_Misconception:Firstterm',
            'True_Correct:NA', 'True_Misconception:Wrong_term',
            'True_Neither:NA', 'True_Misconception:Firstterm',
        ],
        104665: [
            'False_Neither:NA', 'False_Misconception:Base_rate',
            'False_Correct:NA', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Base_rate',
            'True_Misconception:Multiplying_by_4', 'False_Misconception:Multiplying_by_4',
        ],
        109465: [
            'False_Neither:NA', 'False_Correct:NA',
            'False_Misconception:Certainty', 'False_Misconception:Scale',
            'True_Correct:NA', 'True_Neither:NA',
        ],
    }

    classes = np.asarray(label_encoder.classes_)
    all_class_ids = list(range(len(classes)))

    # Class ids of the "True_*" labels, matched on the prefix so a misconception
    # name that merely contains "True" cannot be misfiled.
    true_class_ids = {
        idx for idx, name in enumerate(classes) if str(name).startswith('True_')
    }

    # Translate the per-question label names into label-encoder indices.
    question_label_choice_ids = {
        qid: [int(i) for i in np.where(np.isin(classes, names))[0]]
        for qid, names in question_label_choices.items()
    }

    test_probabilities = []
    test_predictions = []
    test_top3_predictions = []

    rows = zip(
        test_data.QuestionId.tolist(),
        test_data.is_correct.tolist(),
        predictions.predictions,
    )
    for qid, correct, row in rows:
        # A question missing from the table falls back to the whole label space
        # instead of aborting the submission with a KeyError.
        candidate_idx = question_label_choice_ids.get(qid, all_class_ids)

        # Optionally keep only the half of the label space that agrees with the
        # known correctness of the chosen answer.
        if filter_true_false:
            if correct == 1:
                candidate_idx = [c for c in candidate_idx if c in true_class_ids]
            elif correct == 0:
                candidate_idx = [c for c in candidate_idx if c not in true_class_ids]

        candidate_logits = row[candidate_idx]
        candidate_probs = torch.nn.functional.softmax(
            torch.tensor(candidate_logits), dim=-1
        ).numpy()

        # Positions within the candidate list, most probable first.
        order = np.argsort(-candidate_probs)

        # Map positions back to label-encoder indices, then to label names.
        topk_idx = np.array(candidate_idx)[order]
        topk_probs = candidate_probs[order].tolist()
        topk_preds = label_encoder.inverse_transform(topk_idx).tolist()

        test_probabilities.append(topk_probs)
        test_predictions.append(topk_preds)
        test_top3_predictions.append(" ".join(topk_preds[:3]))

    test_submission_data = pd.DataFrame({
        "row_id": test_data.row_id.tolist(),
        "QuestionId": test_data.QuestionId.tolist(),
        "is_correct": test_data.is_correct.tolist(),
        "probs": test_probabilities,
        "preds": test_predictions,
        'Category:Misconception': test_top3_predictions
    })

    return test_submission_data


def main():
    """Run inference with the LoRA fine-tuned classifier and write the submission CSV.

    Steps: load the label encoder, load the base model with the LoRA adapter on
    top, build prompts for the test set, predict with a ``Trainer`` and save the
    table produced by ``create_submission`` to ``SUBMISSION_OUTPUT_PATH``.

    Raises:
        ImportError: If the ``peft`` package could not be imported.
    """
    # Release cached GPU memory and collect garbage before loading anything.
    torch.cuda.empty_cache()
    gc.collect()

    # CUDA allocator setting.
    import os
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    # Report when more than one GPU is visible.
    gpu_count = torch.cuda.device_count()
    if gpu_count > 1:
        print(f"Found {gpu_count} GPUs")

    print("Loading label encoder...")
    label_encoder = joblib.load(LABEL_ENCODER_PATH)
    num_labels = len(label_encoder.classes_)

    print("Loading trained model and tokenizer...")

    # The fine-tuned weights are a LoRA adapter, so PEFT is mandatory.
    if not PEFT_AVAILABLE:
        raise ImportError("PEFT is required to load the fine-tuned model. Please install peft: pip install peft")

    print(f"Loading fine-tuned LoRA model from: {BEST_MODEL_PATH}")
    print(f"Loading base model from: {MODEL_NAME}")

    # Base model in float16, spread over the available devices by device_map.
    base_model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=num_labels,
        trust_remote_code=True,
        device_map="auto",
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
    )

    # Attach the LoRA adapter and switch to evaluation mode.
    model = PeftModel.from_pretrained(base_model, BEST_MODEL_PATH)
    model.eval()

    # The tokenizer comes from the base model directory.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    print("Successfully loaded LoRA fine-tuned model")

    # Provide a padding token when the tokenizer does not define one.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = "<|finetune_right_pad_id|>"
        tokenizer.pad_token_id = 100349

    # Propagate the pad token id to the model config(s); a PEFT wrapper keeps
    # the real model under base_model (and possibly base_model.model).
    pad_id_source = tokenizer
    if hasattr(model, 'base_model'):
        model.base_model.config.pad_token_id = pad_id_source.pad_token_id
        if hasattr(model.base_model, 'model'):
            model.base_model.model.config.pad_token_id = pad_id_source.pad_token_id
    else:
        model.config.pad_token_id = pad_id_source.pad_token_id

    print("Loading test data...")
    test_df = pd.read_csv(TEST_DATA_PATH)

    print("Loading training data for correct answers...")
    # The training set is only used to look up the correct answer per question.
    train_df = pd.read_csv(TRAIN_DATA_PATH)
    train_df.Misconception = train_df.Misconception.fillna('NA')
    correct_answers = prepare_correct_answers(train_df)

    print("Preprocessing test data...")
    # Flag test rows whose chosen answer is the known correct one, then build prompts.
    test_df = test_df.merge(correct_answers, on=['QuestionId','MC_Answer'], how='left')
    test_df.is_correct = test_df.is_correct.fillna(0)
    test_df['text'] = test_df.apply(format_input, axis=1)

    print("Tokenizing test data...")
    test_dataset = Dataset.from_pandas(test_df[['text']])
    test_dataset = tokenize_dataset(test_dataset, tokenizer, MAX_LEN)

    # Pads each batch dynamically.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    print("Running inference...")

    # Allow TF32 matmul / cuDNN kernels.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # The Trainer is used purely as a batched prediction loop.
    inference_args = TrainingArguments(
        output_dir="./tmp",
        report_to="none",
        per_device_eval_batch_size=EVAL_BATCH_SIZE,
        fp16=True,
        dataloader_pin_memory=True,
        dataloader_num_workers=2,
    )
    trainer = Trainer(
        model=model,
        processing_class=tokenizer,
        data_collator=data_collator,
        args=inference_args,
    )
    with torch.no_grad():
        predictions = trainer.predict(test_dataset)

    print("Creating submission file...")
    submission = create_submission(predictions, test_df, label_encoder)

    # Persist the result and show a short preview.
    submission.to_csv(SUBMISSION_OUTPUT_PATH, index=False)
    print(f"Submission file saved to: {SUBMISSION_OUTPUT_PATH}")
    print("\nSubmission preview:")
    print(submission.head())
    print(f"\nSubmission shape: {submission.shape}")


# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Writing phi_4_reasoning_0_948.py


In [None]:
%%writefile phi_new_loss_0_947.py
"""
設定ファイル - Deberta モデルのトレーニングと推論用設定
"""

# Model configuration
VER = 2
# Superseded settings, kept for reference:
# MODEL_NAME = "/kaggle/input/phi4-reasoning-plus/transformers/default/1/Phi-4-reasoning-plus"
# MODEL_TYPE = "qwen3"
MODEL_DIR = "/kaggle/input/phi4-merged-masked"  # Directory of the model to load
EPOCHS = 3  # Number of training epochs
MAX_LEN = 250  # Maximum token length per example


CUDA_VISIBLE_DEVICES = "0,1"  # GPU device indices 0 and 1

# Directory settings
OUTPUT_DIR = f"./"

# Training parameters
TRAIN_BATCH_SIZE = 2  # Per-device training batch size
EVAL_BATCH_SIZE = 8  # Per-device evaluation batch size
GRADIENT_ACCUMULATION_STEPS = 2  # Gradient accumulation steps
LEARNING_RATE = 2e-4
WARMUP_STEPS = 100  # Warmup steps for learning rate scheduler
WEIGHT_DECAY = 0.01  # Weight decay for regularization
LOGGING_STEPS = 50
SAVE_STEPS = 200
EVAL_STEPS = 200
USE_FP16 = True  # Use FP16 training for memory efficiency


# Data paths
TRAIN_DATA_PATH ='/kaggle/input/map-charting-student-math-misunderstandings/train.csv'
# Earlier runs pointed TRAIN_DATA_PATH at local variants under /home/sato/project/map/:
# train_32835_updated.csv, train_final.csv, train_changetext.csv,
# train_ocr_corrected_openai.csv, train_ocr_corrected_openai_checkpoint_30400.csv,
# train_ocr_corrected.csv and train.csv.
TEST_DATA_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/test.csv'

# Model save paths
BEST_MODEL_PATH = f"{OUTPUT_DIR}/best"  # Evaluates to ".//best" because OUTPUT_DIR already ends with "/"
LABEL_ENCODER_PATH = f"/kaggle/input/question-label-mapping/label_encoder.joblib"

# Base model path for pseudo labeling (path of the previously trained model)
BASE_MODEL_PATH = f"{OUTPUT_DIR}/best"  # Path of the model created by train.py

# Other settings
RANDOM_SEED = 42
VALIDATION_SPLIT = 0.2


# Submission settings
SUBMISSION_OUTPUT_PATH = 'submission.csv'

# WandB settings
USE_WANDB = True  # Set to False to disable WandB
WANDB_PROJECT = "qwen3-0.6b-ocr-correct"
WANDB_RUN_NAME = "qwen3-0.6b-origin-real_choice"
# Previous run name: "qwen3-0.6b-4o-mini"
WANDB_ENTITY = None  # Set your WandB entity (username or team name) if needed


# Early stopping settings
USE_EARLY_STOPPING = True
EARLY_STOPPING_PATIENCE = 10  # Max number of evaluations without improvement (one evaluation every EVAL_STEPS)
EARLY_STOPPING_THRESHOLD = 0.001  # Minimum change that counts as an improvement

# LoRA configuration
LORA_RANK = 16  # LoRA rank
LORA_ALPHA = 32  # LoRA scaling parameter
LORA_TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]  # Modules that receive LoRA adapters
LORA_DROPOUT = 0.1  # LoRA dropout rate
LORA_BIAS = "none"  # Bias handling: "none", "all", "lora_only"

# Settings for the three-stage training schedule
STAGE1_EPOCHS = 3  # Number of epochs in stage 1
STAGE3_EPOCHS = 1  # Number of epochs in stage 3
STAGE3_LEARNING_RATE_RATIO = 1.0  # Stage 3 learning rate as a multiple of the base learning rate
SKIP_STAGE1_TRAINING = False  # Whether to skip stage 1
STAGE1_PRETRAINED_MODEL_PATH = "./stage1_final"  # Trained weights to use when stage 1 is skipped
"""
共通ユーティリティ関数 - 選択肢付きバージョン
"""

import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from datasets import Dataset
import torch


def prepare_correct_answers(train_data):
    """Derive the presumed correct multiple-choice answer for each question.

    Rows whose ``Category`` has ``True`` as the text before its first underscore
    mark answers that were graded correct. For every question, the answer that
    occurs most often among those rows is kept.

    Args:
        train_data: DataFrame with ``QuestionId``, ``MC_Answer`` and ``Category``
            columns.

    Returns:
        DataFrame with one row per ``QuestionId`` and the columns ``QuestionId``,
        ``MC_Answer`` and ``is_correct`` (always 1).
    """
    # Vectorised prefix test. A missing Category compares as False, so such rows
    # are simply skipped instead of raising inside a row-wise lambda.
    prefix = train_data['Category'].str.split('_').str[0]
    correct = train_data.loc[prefix.eq('True')].copy()

    # Count how often each (question, answer) pair was graded correct and keep
    # the most frequent answer per question.
    correct['c'] = correct.groupby(['QuestionId', 'MC_Answer']).MC_Answer.transform('count')
    correct = correct.sort_values('c', ascending=False)
    correct = correct.drop_duplicates(['QuestionId'])[['QuestionId', 'MC_Answer']]
    correct['is_correct'] = 1
    return correct


def prepare_answer_choices(train_data, mapping_file='/kaggle/input/question-label-mapping/question_answer_choice_mapping.csv'):
    """Build the lettered answer-choice listing for every question in ``train_data``.

    Questions present in the mapping CSV use its ``Choice`` letters (lower-cased,
    sorted by ``Choice``). Questions absent from it get letters ``a``, ``b``, ...
    assigned to their distinct ``MC_Answer`` values in order of first appearance.

    Args:
        train_data: DataFrame with ``QuestionId`` and ``MC_Answer`` columns.
        mapping_file: CSV path with ``QuestionId``, ``Choice`` and ``MC_Answer``
            columns.

    Returns:
        DataFrame with one row per question and the columns ``QuestionId``,
        ``answer_choices_str`` (lines of ``"<letter>. <answer>"``) and
        ``choice_mapping`` (dict from answer text to letter).
    """
    mapping_df = pd.read_csv(mapping_file)

    def _render(labelled_answers):
        # Turn (letter, answer) pairs into the display text and the lookup dict.
        lines = []
        lookup = {}
        for letter, answer in labelled_answers:
            lines.append(f"{letter}. {answer}")
            lookup[answer] = letter
        return '\n'.join(lines), lookup

    records = []
    for question_id in train_data['QuestionId'].unique():
        known = mapping_df[mapping_df['QuestionId'] == question_id]

        if len(known) > 0:
            # Use the mapping file, ordered by its choice letter (A, B, C, D).
            ordered = known.sort_values('Choice')
            labelled = [
                (entry['Choice'].lower(), entry['MC_Answer'])
                for _, entry in ordered.iterrows()
            ]
        else:
            # No mapping available: letter the answers seen in the training data.
            seen_answers = train_data[train_data['QuestionId'] == question_id]['MC_Answer'].unique()
            labelled = [
                (chr(ord('a') + position), answer)
                for position, answer in enumerate(seen_answers)
            ]

        choices_text, answer_to_letter = _render(labelled)
        records.append({
            'QuestionId': question_id,
            'answer_choices_str': choices_text,
            'choice_mapping': answer_to_letter,
        })

    return pd.DataFrame(records)


def format_input(row):
    """Render one data row as the chat-style prompt fed to the classifier.

    The prompt contains the question, the student's answer text exactly as
    stored in ``MC_Answer`` (it is not converted to a choice letter and no list
    of choices is included), whether that answer is correct, and the student's
    explanation.

    Args:
        row: Mapping-like record providing ``is_correct``, ``QuestionText``,
            ``MC_Answer`` and ``StudentExplanation``. Only item access
            (``row[key]``) is required.

    Returns:
        The prompt string: a user turn with the task fields, followed by an
        assistant turn that opens with a fixed ``<think>`` section.
    """
    status = "Yes" if row["is_correct"] else "No"

    # The wording and layout must stay exactly as they are: this is the text
    # format the classifier receives.
    prompt = (
        "<|user|>\n"
        f"[Mathematical Misconception Analysis Task]\n\n"
        f"Question: {row['QuestionText']}\n"
        f"Answer: {row['MC_Answer']}\n"
        f"Correct?: {status}\n"
        f"Explanation: {row['StudentExplanation']}\n"
        "<|end|>\n"
        "<|assistant|>\n"
        "<think>\n"
        "Let me analyze this mathematical misconception...\n"
        "</think>\n\n"
    )
    return prompt


def tokenize_dataset(dataset, tokenizer, max_len):
    """Tokenize the ``text`` column and expose the result as torch tensors.

    No padding is applied here; sequences are only truncated to ``max_len`` and
    padding is left to the data collator at batch time.

    Args:
        dataset: Dataset object providing ``map``, ``column_names`` and
            ``set_format``, with a ``text`` column.
        tokenizer: Callable tokenizer applied to each batch of texts.
        max_len: Maximum number of tokens kept per example.

    Returns:
        The mapped dataset, formatted as torch with ``input_ids`` and
        ``attention_mask`` (plus ``label`` when that column exists).
    """
    def _encode(batch):
        # Plain Python lists are returned (return_tensors=None) because this
        # runs inside dataset.map.
        encode_kwargs = dict(
            padding=False,
            truncation=True,
            max_length=max_len,
            return_tensors=None,
        )
        return tokenizer(batch['text'], **encode_kwargs)

    tokenized = dataset.map(_encode, batched=True, batch_size=100)

    # Keep the label column in the tensor view only when the dataset has one.
    tensor_columns = ['input_ids', 'attention_mask']
    if 'label' in tokenized.column_names:
        tensor_columns.append('label')
    tokenized.set_format(type='torch', columns=tensor_columns)
    return tokenized


def compute_map3(eval_pred):
    """Compute MAP@3 from the three highest-scoring classes of each row.

    Progress and inputs are printed for debugging. Any exception raised while
    scoring is printed with its traceback and turned into a score of 0.0 rather
    than being propagated.

    Args:
        eval_pred: Pair ``(logits, labels)``; both must expose ``.shape``.

    Returns:
        ``{"eval_map@3": value}``: the mean over examples of 1, 1/2 or 1/3 when
        the true label is ranked first, second or third, and 0 otherwise
        (``0.0`` when scoring failed).
    """
    print("[DEBUG] *** compute_map3 function called! ***")
    print(f"[DEBUG] eval_pred type: {type(eval_pred)}")
    print(f"[DEBUG] eval_pred: {eval_pred}")

    try:
        logits, labels = eval_pred
        print(f"[DEBUG] compute_map3 called with logits shape: {logits.shape}, labels shape: {labels.shape}")

        probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()
        top3 = np.argsort(-probs, axis=1)[:, :3]

        # Credit awarded for a hit at rank position 0, 1 and 2.
        credit = (1.0, 1.0 / 2, 1.0 / 3)
        total = 0.0
        for i, label in enumerate(labels):
            ranked = top3[i]
            for position in range(3):
                if ranked[position] == label:
                    total += credit[position]
                    break

        result = {"eval_map@3": total / len(labels)}
        print(f"[DEBUG] compute_map3 returning: {result}")
        print("[DEBUG] *** compute_map3 function completed successfully! ***")
        return result

    except Exception as e:
        # Best effort: report the failure and hand back a zero score.
        import traceback
        print(f"[ERROR] compute_map3 failed: {e}")
        print(f"[ERROR] Exception type: {type(e)}")
        print(f"[ERROR] Full traceback: {traceback.format_exc()}")
        return {"eval_map@3": 0.0}


def create_submission(predictions_tuple, test_data, label_encoder, filter_true_false=True):
    """Turn per-row class scores into ranked label predictions for submission.

    For every test row the scores are restricted to the labels listed for that
    row's question in ``question_label_choices`` (and, optionally, to the
    ``True_*`` or ``False_*`` half matching ``is_correct``), normalised with a
    softmax over the remaining candidates and sorted from most to least likely.

    Args:
        predictions_tuple: Triple ``(all_predictions, all_row_ids, all_probs)``.
            Only ``all_probs`` is used: one score vector per test row, in the
            same order as ``test_data``.
        test_data: DataFrame with ``row_id``, ``QuestionId`` and ``is_correct``
            columns.
        label_encoder: Fitted encoder exposing ``classes_`` and
            ``inverse_transform``.
        filter_true_false: When true, rows with ``is_correct == 1`` keep only
            ``True_*`` labels and all other rows keep only the remaining labels.

    Returns:
        DataFrame with the columns ``row_id``, ``QuestionId``, ``is_correct``,
        ``probs`` (candidate probabilities, descending), ``preds`` (the matching
        labels) and ``Category:Misconception`` (top three labels joined by
        spaces).
    """
    # Only the score vectors are consumed here; the other two entries are unused.
    _all_predictions, _all_row_ids, all_probs = predictions_tuple

    # Labels that are valid for each question.
    question_label_choices = {
        31772: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Incomplete', 'True_Misconception:WNB',
            'False_Neither:NA', 'False_Misconception:WNB',
            'False_Misconception:Incomplete', 'False_Correct:NA',
        ],
        31774: [
            'False_Neither:NA', 'False_Misconception:SwapDividend',
            'False_Misconception:Mult', 'False_Correct:NA',
            'False_Misconception:FlipChange', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:SwapDividend',
            'True_Misconception:Mult', 'True_Misconception:FlipChange',
        ],
        31777: [
            'False_Correct:NA', 'False_Misconception:Incomplete',
            'False_Neither:NA', 'False_Misconception:Irrelevant',
            'False_Misconception:Wrong_Fraction', 'True_Correct:NA',
            'True_Neither:NA',
        ],
        31778: [
            'False_Neither:NA', 'False_Misconception:Additive',
            'False_Misconception:Irrelevant', 'False_Correct:NA',
            'False_Misconception:WNB', 'True_Neither:NA',
            'True_Correct:NA', 'True_Misconception:Irrelevant',
            'True_Misconception:Additive',
        ],
        32829: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Not_variable', 'False_Neither:NA',
            'False_Misconception:Adding_terms', 'False_Correct:NA',
            'False_Misconception:Not_variable', 'False_Misconception:Inverse_operation',
        ],
        32833: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Inversion', 'True_Misconception:Duplication',
            'False_Misconception:Duplication', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Inversion',
            'False_Misconception:Wrong_Operation',
        ],
        32835: [
            'False_Misconception:Whole_numbers_larger', 'False_Neither:NA',
            'False_Correct:NA', 'False_Misconception:Longer_is_bigger',
            'False_Misconception:Ignores_zeroes', 'False_Misconception:Shorter_is_bigger',
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Whole_numbers_larger', 'True_Misconception:Shorter_is_bigger',
            'True_Misconception:Longer_is_bigger',
        ],
        33471: [
            'True_Neither:NA', 'True_Correct:NA',
            'True_Misconception:Wrong_fraction', 'False_Correct:NA',
            'False_Misconception:Incomplete', 'False_Neither:NA',
            'False_Misconception:Wrong_fraction',
        ],
        33472: [
            'True_Neither:NA', 'True_Correct:NA',
            'True_Misconception:Adding_across', 'True_Misconception:Denominator-only_change',
            'True_Misconception:Incorrect_equivalent_fraction_addition', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Denominator-only_change',
            'False_Misconception:Incorrect_equivalent_fraction_addition', 'False_Misconception:Adding_across',
        ],
        33474: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Division', 'True_Misconception:Subtraction',
            'False_Neither:NA', 'False_Misconception:Subtraction',
            'False_Misconception:Division', 'False_Correct:NA',
        ],
        76870: [
            'False_Misconception:Unknowable', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Definition',
            'False_Misconception:Interior', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Definition',
        ],
        89443: [
            'False_Neither:NA', 'False_Misconception:Positive',
            'False_Misconception:Tacking', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Tacking',
            'True_Misconception:Positive', 'False_Correct:NA',
        ],
        91695: [
            'False_Neither:NA', 'False_Misconception:Wrong_term',
            'False_Correct:NA', 'False_Misconception:Firstterm',
            'True_Correct:NA', 'True_Misconception:Wrong_term',
            'True_Neither:NA', 'True_Misconception:Firstterm',
        ],
        104665: [
            'False_Neither:NA', 'False_Misconception:Base_rate',
            'False_Correct:NA', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Base_rate',
            'True_Misconception:Multiplying_by_4', 'False_Misconception:Multiplying_by_4',
        ],
        109465: [
            'False_Neither:NA', 'False_Correct:NA',
            'False_Misconception:Certainty', 'False_Misconception:Scale',
            'True_Correct:NA', 'True_Neither:NA',
        ],
    }

    classes = np.asarray(label_encoder.classes_)
    all_class_ids = list(range(len(classes)))

    # Class ids of the "True_*" labels, matched on the prefix so a misconception
    # name that merely contains "True" cannot be misfiled.
    true_class_ids = {
        idx for idx, name in enumerate(classes) if str(name).startswith('True_')
    }

    # Translate the per-question label names into label-encoder indices.
    question_label_choice_ids = {
        qid: [int(i) for i in np.where(np.isin(classes, names))[0]]
        for qid, names in question_label_choices.items()
    }

    test_probabilities = []
    test_predictions = []
    test_top3_predictions = []

    rows = zip(
        test_data.QuestionId.tolist(),
        test_data.is_correct.tolist(),
        all_probs,
    )
    for qid, correct, row_probs in rows:
        # A question missing from the table falls back to the whole label space
        # instead of aborting the submission with a KeyError.
        candidate_idx = question_label_choice_ids.get(qid, all_class_ids)

        # Apply the True/False constraint.
        if filter_true_false:
            if correct == 1:
                candidate_idx = [c for c in candidate_idx if c in true_class_ids]
            else:
                candidate_idx = [c for c in candidate_idx if c not in true_class_ids]

        # Accept plain lists as well as arrays for the per-row scores.
        row_scores = np.asarray(row_probs)

        # NOTE(review): the input is named all_probs but is treated as logits
        # here. If the caller already applied a softmax, the ranking is
        # unchanged but the values stored in "probs" are flattened - confirm
        # against predict_with_model.
        candidate_logits = row_scores[candidate_idx]
        candidate_probs = torch.nn.functional.softmax(
            torch.tensor(candidate_logits), dim=-1
        ).numpy()

        # Positions within the candidate list, most probable first.
        order = np.argsort(-candidate_probs)

        # Map positions back to label-encoder indices, then to label names.
        topk_idx = np.array(candidate_idx)[order]
        topk_probs = candidate_probs[order].tolist()
        topk_preds = label_encoder.inverse_transform(topk_idx).tolist()

        test_probabilities.append(topk_probs)
        test_predictions.append(topk_preds)
        test_top3_predictions.append(" ".join(topk_preds[:3]))

    test_submission_data = pd.DataFrame({
        "row_id": test_data.row_id.tolist(),
        "QuestionId": test_data.QuestionId.tolist(),
        "is_correct": test_data.is_correct.tolist(),
        "probs": test_probabilities,
        "preds": test_predictions,
        "Category:Misconception": test_top3_predictions
    })

    return test_submission_data


"""
保存された結合済みモデルとclassifierレイヤーを読み込むためのスクリプト
"""

import torch
import torch.nn as nn
import pickle
import os
from transformers import AutoModel, AutoTokenizer
import joblib

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    AutoConfig
)
from transformers.modeling_outputs import SequenceClassifierOutput
from datasets import Dataset
import joblib
import torch
import torch.nn as nn
import torch.nn.functional as F
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from transformers import AutoModel
import wandb
from transformers import EarlyStoppingCallback, TrainerCallback
import pickle
from collections import defaultdict
import os, json, torch
from transformers import AutoModel, AutoTokenizer
import torch.nn as nn


class Qwen2ForSequenceClassificationWithMaskedLoss(nn.Module):
    """Sequence classifier that reloads a merged backbone plus a saved linear head.

    The backbone is loaded with ``AutoModel`` from ``model_dir``; a dropout
    layer and an ``nn.Linear`` head are rebuilt from the pickled config and
    the head weights are restored from ``classifier_weights.pt``.  Logits can
    optionally be restricted to the labels registered for each QuestionId.
    """

    def __init__(self, model_dir):
        """Load every artifact stored in ``model_dir``.

        Args:
            model_dir: Directory holding the backbone weights together with
                ``model_config.pkl``, ``question_label_mapping.pkl`` and
                ``classifier_weights.pt``.
        """
        super().__init__()

        # 1. Pickled settings; the keys 'dropout' and 'n_classes' are read below.
        # NOTE(review): pickle.load can execute arbitrary code -- only point
        # model_dir at trusted artifacts.
        config_path = os.path.join(model_dir, "model_config.pkl")
        with open(config_path, 'rb') as f:
            config = pickle.load(f)

        # 2. QuestionId -> iterable of valid label indices (same pickle caveat).
        mapping_path = os.path.join(model_dir, "question_label_mapping.pkl")
        with open(mapping_path, 'rb') as f:
            self.question_label_map = pickle.load(f)

        # 3. Backbone (already merged), loaded in float16 and placed by device_map="auto".
        self.qwen = AutoModel.from_pretrained(
            model_dir,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True
        )

        # 4. Disable the KV cache when the config exposes the switch.
        if hasattr(self.qwen.config, "use_cache"):
            self.qwen.config.use_cache = False

        # 5. Dropout applied to the pooled hidden state.
        self.dropout = nn.Dropout(config['dropout'])

        # 6. Linear head created with the dtype/device of the backbone's first parameter.
        base_dtype = next(self.qwen.parameters()).dtype
        device = next(self.qwen.parameters()).device
        self.num_labels = config['n_classes']
        self.classifier = nn.Linear(
            self.qwen.config.hidden_size,
            self.num_labels
        ).to(dtype=base_dtype).to(device)

        # 7. Restore the trained head weights.
        classifier_path = os.path.join(model_dir, "classifier_weights.pt")
        classifier_data = torch.load(classifier_path, map_location=device)
        self.classifier.load_state_dict(classifier_data['classifier_state_dict'])

        # 8. Remaining attributes.
        self.mask_value = -65000.0  # additive penalty for invalid labels
        self.config = self.qwen.config
        self.config.num_labels = self.num_labels

        print(f"Model loaded successfully from: {model_dir}")
        print(f"  - Number of classes: {self.num_labels}")
        print(f"  - Question-label mapping: {len(self.question_label_map)} questions")
        print(f"  - Mask value: {self.mask_value}")

    def forward(self, input_ids, attention_mask=None, labels=None, question_ids=None, **kwargs):
        """Compute (optionally masked) logits and, when labels are given, the loss.

        Args:
            input_ids: Token ids passed to the backbone.
            attention_mask: Attention mask passed to the backbone.
            labels: Optional class indices; enables the cross-entropy loss.
            question_ids: Optional per-sample QuestionIds; when provided the
                logits are masked with ``apply_question_mask``.
            **kwargs: Accepted and ignored.

        Returns:
            SequenceClassifierOutput carrying ``loss`` (or None) and ``logits``.
        """
        outputs = self.qwen(input_ids=input_ids, attention_mask=attention_mask)
        # NOTE(review): position -1 is pooled regardless of attention_mask; with
        # right-padded batches that is a padding position for shorter
        # sequences -- confirm this matches how the head was trained.
        pooled_output = outputs.last_hidden_state[:, -1, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        # Restrict logits to the labels valid for each question when ids are supplied.
        if question_ids is not None and self.question_label_map is not None:
            masked_logits = self.apply_question_mask(logits, question_ids)
        else:
            masked_logits = logits

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            if labels.dim() > 1:
                labels = labels.view(-1)
            if masked_logits.dim() == 3:
                masked_logits = masked_logits.view(-1, self.num_labels)

            # Best effort: a NaN/Inf or failing loss is replaced by a constant
            # so that the surrounding loop keeps running.
            try:
                loss = loss_fct(masked_logits, labels)
                if torch.isnan(loss) or torch.isinf(loss):
                    loss = torch.tensor(100.0, requires_grad=True, device=masked_logits.device)
            except Exception as e:
                print(f"Error computing loss: {e}")
                loss = torch.tensor(100.0, requires_grad=True, device=masked_logits.device)

        return SequenceClassifierOutput(loss=loss, logits=masked_logits)

    def apply_question_mask(self, logits, question_ids):
        """Penalise the labels that are not valid for each sample's QuestionId.

        Rows whose QuestionId is missing from ``question_label_map`` (or maps
        to no labels) are returned unchanged.  Adding ``mask_value`` to every
        entry of such a row would not change its softmax mathematically, but
        at that magnitude float16 cannot tell the logits apart any more.

        Args:
            logits: Tensor indexed as ``[sample, label]``.
            question_ids: Per-sample QuestionIds (tensor or plain sequence).

        Returns:
            ``logits`` plus a tensor that is 0 for kept labels and
            ``mask_value`` for the others.
        """
        mask = torch.full_like(logits, self.mask_value)

        for i in range(logits.size(0)):
            q_id = question_ids[i].item() if torch.is_tensor(question_ids[i]) else question_ids[i]

            valid_labels = self.question_label_map.get(q_id, ())
            label_index = torch.as_tensor(
                list(valid_labels), dtype=torch.long, device=logits.device
            )
            if label_index.numel() == 0:
                # No label information for this question: keep the row as is.
                mask[i] = 0
            else:
                mask[i, label_index] = 0

        return logits + mask


def load_model_for_inference(model_dir):
    """
    Convenience loader returning everything needed for inference.

    Args:
        model_dir: Directory containing the saved model artifacts.

    Returns:
        model: The loaded classifier, switched to eval mode.
        tokenizer: Tokenizer loaded from the same directory.
        label_encoder: Label encoder restored from ``label_encoder.joblib``.
    """
    # nn.Module.eval() returns the module itself, so build and switch in one go.
    classifier = Qwen2ForSequenceClassificationWithMaskedLoss(model_dir).eval()

    tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    if tok.pad_token is None:
        # No pad token configured: fall back to token id 0.
        tok.pad_token_id = 0
        tok.pad_token = tok.decode([0])

    encoder = joblib.load(os.path.join(model_dir, "label_encoder.joblib"))

    return classifier, tok, encoder


"""
Phi-4 モデル予測スクリプト - マスク付き損失関数版で訓練されたモデル用
テストデータに対して予測を実行し、提出ファイルを生成
"""

import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from peft import PeftModel
import joblib
from datasets import Dataset
from tqdm import tqdm
import pickle
from collections import defaultdict

# カスタムモジュールのインポート
# from config import *
# from utils_with_choices import prepare_answer_choices, format_input
# from train_phi_4_best_2 import Phi4ForSequenceClassificationWithMaskedLoss

# 警告を無効化
import warnings
warnings.filterwarnings('ignore')

def prepare_correct_answers(train_data):
    """Derive one "correct" MC_Answer per QuestionId from the training data.

    Rows whose ``Category`` begins with the token ``True`` (the text before
    the first underscore) are kept; for every question the answer with the
    most such rows is selected.

    Args:
        train_data: DataFrame with ``QuestionId``, ``MC_Answer`` and
            ``Category`` columns.  Rows with a missing ``Category`` are
            treated as not correct.

    Returns:
        DataFrame with columns ``QuestionId``, ``MC_Answer`` and
        ``is_correct`` (always 1), one row per question.
    """
    # Vectorised replacement for a row-wise apply; missing categories compare False.
    is_true = train_data['Category'].str.split('_').str[0].eq('True')
    correct = train_data.loc[is_true].copy()

    # Number of "True" rows backing each (question, answer) pair.
    correct['c'] = correct.groupby(['QuestionId', 'MC_Answer']).MC_Answer.transform('count')
    correct = correct.sort_values('c', ascending=False)

    # After the sort, the first row of each question is its most frequent answer.
    correct = correct.drop_duplicates(['QuestionId'])[['QuestionId', 'MC_Answer']].copy()
    correct['is_correct'] = 1
    return correct


def format_input(row):
    """Render one sample as the prompt string fed to the model.

    Reads ``is_correct``, ``QuestionText``, ``MC_Answer`` and
    ``StudentExplanation`` from ``row`` and returns the chat-style prompt
    (user turn followed by an assistant turn opened with a think block).
    """
    status = "Yes" if row["is_correct"] else "No"

    # Prompt layout with the special think tags, assembled piece by piece.
    pieces = [
        "<|user|>\n",
        f"[Mathematical Misconception Analysis Task]\n\n",
        f"Question: {row['QuestionText']}\n",
        f"Answer: {row['MC_Answer']}\n",
        f"Correct?: {status}\n",
        f"Explanation: {row['StudentExplanation']}\n",
        "<|end|>\n",
        "<|assistant|>\n",
        "<think>\n",
        "Let me analyze this mathematical misconception...\n",
        "</think>\n\n",
    ]
    return "".join(pieces)

def prepare_test_data(test_df):
    """Preprocess the raw test frame into model-ready rows.

    Steps: overwrite the QuestionText of QuestionId 32835, attach the
    answer-choice string per question, map each MC_Answer to its choice
    label, derive ``is_correct`` from the training data and build the
    ``text`` prompt column.

    Args:
        test_df: Raw test DataFrame.  It is not modified; a copy is processed.

    Returns:
        New DataFrame with the added ``answer_choices_str``,
        ``choice_label``, ``is_correct`` and ``text`` columns.
    """
    print("Preparing test data...")

    # Work on a copy so the caller's frame is not left half-modified.
    test_df = test_df.copy()

    # --- Overwrite the QuestionText of QuestionId 32835 ---
    print("Updating QuestionId 32835...")
    new_question_text = "Which number is the greatest? Options: 6.0000 6.2 6.079 6.0001"
    mask_32835 = test_df['QuestionId'] == 32835
    update_count = mask_32835.sum()

    if update_count > 0:
        original_text = test_df.loc[mask_32835, 'QuestionText'].iloc[0]
        print(f"Found {update_count} rows with QuestionId 32835")
        print(f"Original: {original_text[:80]}...")
        print(f"Updated to: {new_question_text}")
        test_df.loc[mask_32835, 'QuestionText'] = new_question_text
    else:
        print("No rows found with QuestionId 32835")

    # --- Answer choices per question ---
    print("Preparing answer choices for each question...")
    choices = prepare_answer_choices(test_df)
    test_df = test_df.merge(choices[['QuestionId', 'answer_choices_str']], on='QuestionId', how='left')

    train = pd.read_csv(TRAIN_DATA_PATH)
    train['Misconception'] = train['Misconception'].fillna('NA')
    correct = prepare_correct_answers(train)

    # --- Map MC_Answer to its choice label ---
    print("Converting MC_Answer to choice labels...")
    # Build the QuestionId -> {MC_Answer: label} lookup once instead of
    # filtering `choices` for every row; the first row per question wins.
    first_per_question = choices.drop_duplicates('QuestionId')
    mapping_by_question = dict(
        zip(first_per_question['QuestionId'], first_per_question['choice_mapping'])
    )
    # Answers (or questions) without a mapping keep the raw MC_Answer.
    test_df['choice_label'] = [
        mapping_by_question.get(question_id, {}).get(mc_answer, mc_answer)
        for question_id, mc_answer in zip(test_df['QuestionId'], test_df['MC_Answer'])
    ]

    # --- Prompt text ---
    # is_correct comes from the training data: a (QuestionId, MC_Answer) pair
    # found in `correct` gets 1, every other row is filled with 0.
    print("Formatting input text with answer choices...")
    test_df = test_df.merge(correct, on=['QuestionId', 'MC_Answer'], how='left')
    test_df['is_correct'] = test_df['is_correct'].fillna(0)
    test_df['text'] = test_df.apply(format_input, axis=1)
    print(f"Test data shape: {test_df.shape}")
    print("Example test prompt:")
    print(test_df.text.values[0])

    return test_df


def tokenize_test_dataset(dataset, tokenizer, max_len):
    """Tokenize the test dataset while carrying QuestionId and row_id along.

    The ``text`` column is tokenized with truncation to ``max_len`` and no
    padding; ``QuestionId`` / ``row_id`` are re-exposed as ``question_ids`` /
    ``row_ids`` and the three source columns are dropped.
    """
    def _encode_batch(batch):
        encoded = tokenizer(
            batch['text'],
            padding=False,  # padding is left to the data collator
            truncation=True,
            max_length=max_len,
            return_tensors=None
        )
        # Keep the identifiers next to the token ids.
        encoded['question_ids'] = batch['QuestionId']
        encoded['row_ids'] = batch['row_id']
        return encoded

    return dataset.map(
        _encode_batch,
        batched=True,
        remove_columns=['text', 'QuestionId', 'row_id']
    )


class DataCollatorWithQuestionIdForTest:
    """Test-time collator that pads token fields and keeps question/row ids."""

    def __init__(self, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, features):
        """Collate a list of feature dicts into one batch dict.

        ``input_ids`` / ``attention_mask`` are padded on the right up to the
        longest ``input_ids`` in the batch and returned as long tensors,
        ``question_ids`` becomes a long tensor and ``row_ids`` stays a plain
        list.  Any other key is dropped.
        """
        longest = max(len(item["input_ids"]) for item in features)

        # Fill value used for each padded field.
        fill_values = {
            "input_ids": self.tokenizer.pad_token_id,
            "attention_mask": 0,
        }

        batch = {}
        for key in features[0].keys():
            if key in fill_values:
                rows = []
                for item in features:
                    # Tensors are turned into plain lists before padding.
                    seq = item[key].tolist() if torch.is_tensor(item[key]) else item[key]
                    rows.append(seq + [fill_values[key]] * (longest - len(seq)))
                batch[key] = torch.tensor(rows, dtype=torch.long)
            elif key == "question_ids":
                # One scalar per sample, no padding needed.
                batch[key] = torch.tensor([item[key] for item in features], dtype=torch.long)
            elif key == "row_ids":
                # Row ids stay a Python list.
                batch[key] = [item[key] for item in features]

        return batch

def predict_with_model(model, tokenizer, test_dataset, data_collator, batch_size=4, question_label_map=None):
    """Run batched inference and collect top-3 labels, row ids and probabilities.

    The model is called without question ids, so its logits are unmasked; the
    top-3 labels are then chosen among the labels registered for the sample's
    QuestionId, or among all labels when the question is not in the map.

    Args:
        model: Callable module returning an object with a ``logits`` attribute.
        tokenizer: Unused here; kept for interface compatibility.
        test_dataset: Dataset consumed by ``torch.utils.data.DataLoader``.
        data_collator: Collate function producing ``input_ids``,
            ``attention_mask``, ``question_ids`` and ``row_ids``.
        batch_size: Inference batch size.
        question_label_map: Optional mapping QuestionId -> valid label indices.

    Returns:
        Tuple ``(all_predictions, all_row_ids, all_probs)`` with one entry
        per sample; ``all_probs`` holds float32 probability vectors.
    """
    model.eval()

    # Inputs go to the current CUDA device as before; fall back to CPU when
    # CUDA is unavailable instead of crashing.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    all_predictions = []
    all_row_ids = []
    all_probs = []   # full probability vector of every sample

    # Data loader built on the custom collator; order is preserved.
    dataloader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=data_collator
    )

    print(f"Predicting on {len(test_dataset)} samples...")

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            question_ids = batch['question_ids']
            row_ids = batch['row_ids']

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            # Softmax in float32: half-precision probabilities round small
            # values together and can reorder low-probability labels.
            probs = torch.softmax(outputs.logits.float(), dim=-1).cpu().numpy()

            batch_predictions = []
            for i, q_id in enumerate(question_ids):
                q_id_value = q_id.item() if torch.is_tensor(q_id) else q_id
                sample_probs = probs[i]

                all_probs.append(sample_probs)

                if question_label_map and q_id_value in question_label_map:
                    # Rank only the labels registered for this question.
                    ranked = sorted(
                        question_label_map[q_id_value],
                        key=lambda label: sample_probs[label],
                        reverse=True,
                    )
                    top3_labels = ranked[:3]
                    # Pad with label 0 when fewer than three labels are valid.
                    top3_labels += [0] * (3 - len(top3_labels))
                    batch_predictions.append(top3_labels)
                else:
                    top3_indices = np.argsort(-sample_probs)[:3]
                    batch_predictions.append(top3_indices)

            all_predictions.extend(batch_predictions)
            all_row_ids.extend(row_ids)

    return all_predictions, all_row_ids, all_probs
def main():
    """Run the end-to-end prediction pipeline and write the prediction CSV.

    Loads the model, tokenizer, label encoder and question-label mapping from
    MODEL_DIR, preprocesses and tokenizes the test CSV, predicts, builds the
    submission frame and saves it to ``./phi4_masked_submission.csv``.

    Returns:
        The frame produced by ``create_submission``.
    """
    # GPU selection
    if CUDA_VISIBLE_DEVICES is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = CUDA_VISIBLE_DEVICES
        print(f"Using CUDA device(s): {CUDA_VISIBLE_DEVICES}")

    # Model, tokenizer and label encoder
    model, tokenizer, label_encoder = load_model_for_inference(MODEL_DIR)
    print(f"Number of classes: {len(label_encoder.classes_)}")

    # QuestionId -> valid labels mapping
    mapping_path = f"{MODEL_DIR}/question_label_mapping.pkl"
    print(f"Loading question-label mapping from: {mapping_path}")
    with open(mapping_path, 'rb') as handle:
        question_label_map = pickle.load(handle)
    print(f"Loaded mapping for {len(question_label_map)} questions")

    # --- Load and preprocess the test data ---
    print("\nLoading and preprocessing test data...")
    test_frame = prepare_test_data(pd.read_csv(TEST_DATA_PATH))

    # Only the columns needed downstream go into the dataset.
    test_ds = Dataset.from_pandas(test_frame[['text', 'QuestionId', 'row_id']])

    # --- Tokenize ---
    print("Tokenizing test dataset...")
    test_ds = tokenize_test_dataset(test_ds, tokenizer, MAX_LEN)

    # --- Collator ---
    collator = DataCollatorWithQuestionIdForTest(tokenizer=tokenizer, max_length=MAX_LEN)

    # --- Predict ---
    print("\nRunning predictions...")
    with torch.no_grad():
        predictions = predict_with_model(
            model,
            tokenizer,
            test_ds,
            collator,
            batch_size=EVAL_BATCH_SIZE,
            question_label_map=question_label_map
        )

    print("\nCreating submission file...")
    submission = create_submission(predictions, test_frame, label_encoder, filter_true_false=True)

    # --- Save ---
    submission.to_csv("./phi4_masked_submission.csv", index=False)
    print("Saved prediction file: ./phi4_masked_submission.csv")
    print(submission.head())

    return submission


# Script entry point: run the prediction pipeline only when this file is
# executed directly, keeping the resulting frame in `submission`.
if __name__ == "__main__":
    submission = main()

Writing phi_new_loss_0_947.py


In [None]:
# %%writefile phi_new_loss_0_947.py
# """
# 設定ファイル - Deberta モデルのトレーニングと推論用設定
# """

# # Model configuration
# VER = 2
# # MODEL_NAME = "/kaggle/input/phi4-reasoning-plus/transformers/default/1/Phi-4-reasoning-plus"
# # MODEL_TYPE = "qwen3"  # Add model type for proper handling
# MODEL_DIR = "/kaggle/input/phi4-merged-masked"
# EPOCHS = 3  # Reduce epochs for initial testing
# MAX_LEN = 250  # Increase max length for better context


# CUDA_VISIBLE_DEVICES = "0,1"  # Use GPU 1 only

# # Directory settings
# OUTPUT_DIR = f"./"

# # Training parameters
# TRAIN_BATCH_SIZE = 2  # Reduced for large model memory efficiency
# EVAL_BATCH_SIZE = 8  # Further reduced to avoid CUDA errors
# GRADIENT_ACCUMULATION_STEPS = 2  # Increased for memory efficiency
# LEARNING_RATE = 2e-4
# WARMUP_STEPS = 100  # Warmup steps for learning rate scheduler
# WEIGHT_DECAY = 0.01  # Weight decay for regularization
# LOGGING_STEPS = 50
# SAVE_STEPS = 200
# EVAL_STEPS = 200
# USE_FP16 = True  # Use FP16 training for memory efficiency


# # Data paths
# TRAIN_DATA_PATH ='/kaggle/input/map-charting-student-math-misunderstandings/train.csv'
# # '/home/sato/project/map/train_32835_updated.csv'
# # '/home/sato/project/map/train_final.csv'
# # '/home/sato/project/map/train_changetext.csv'
# # '/home/sato/project/map/train_ocr_corrected_openai.csv'
# # '/home/sato/project/map/train.csv'
# # '/home/sato/project/map/train_ocr_corrected_openai.csv'
# # '/home/sato/project/map/train.csv'
# # '/home/sato/project/map/train_ocr_corrected_openai_checkpoint_30400.csv'
# # '/home/sato/project/map/train.csv'
# # '/home/sato/project/map/train_ocr_corrected_openai_checkpoint_30400.csv'
# # '/home/sato/project/map/train_ocr_corrected.csv'
# TEST_DATA_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/test.csv'

# # Model save paths
# BEST_MODEL_PATH = f"{OUTPUT_DIR}/best"
# LABEL_ENCODER_PATH = f"/kaggle/input/question-label-mapping/label_encoder.joblib"

# # Base model path for pseudo labeling (事前訓練済みモデルのパス)
# BASE_MODEL_PATH = f"{OUTPUT_DIR}/best"  # train.pyで作成されたモデルのパス

# # Other settings
# RANDOM_SEED = 42
# VALIDATION_SPLIT = 0.2


# # Submission settings
# SUBMISSION_OUTPUT_PATH = 'submission.csv'

# # WandB settings
# USE_WANDB = True  # Set to False to disable WandB
# WANDB_PROJECT = "qwen3-0.6b-ocr-correct"
# WANDB_RUN_NAME = "qwen3-0.6b-origin-real_choice"
# # "qwen3-0.6b-4o-mini"
# WANDB_ENTITY = None  # Set your WandB entity (username or team name) if needed


# # Early stopping settings
# USE_EARLY_STOPPING = True
# EARLY_STOPPING_PATIENCE = 10  # 改善が見られない評価回数の上限（評価はEVAL_STEPSごとに実行される）
# EARLY_STOPPING_THRESHOLD = 0.001  # 改善とみなす最小変化量

# # LoRA configuration
# LORA_RANK = 16  # LoRAのランク
# LORA_ALPHA = 32  # LoRAのスケーリングパラメータ
# LORA_TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]  # 対象モジュール
# LORA_DROPOUT = 0.1  # LoRAのドロップアウト率
# LORA_BIAS = "none"  # biasの扱い: "none", "all", "lora_only"

# # 3段階学習用の設定
# STAGE1_EPOCHS = 3  # 第1段階のエポック数
# STAGE3_EPOCHS = 1  # 第3段階のエポック数
# STAGE3_LEARNING_RATE_RATIO = 1.0  # 第3段階の学習率比率（元の学習率に対する倍率）
# SKIP_STAGE1_TRAINING = False  # 第1段階をスキップするかどうか
# STAGE1_PRETRAINED_MODEL_PATH = "./stage1_final"  # 第1段階をスキップする場合の学習済み重みパス
# """
# 共通ユーティリティ関数 - 選択肢付きバージョン
# """

# import pandas as pd
# import numpy as np
# from transformers import AutoTokenizer
# from datasets import Dataset
# import torch


# def prepare_correct_answers(train_data):
#     """正解答案データを準備"""
#     idx = train_data.apply(lambda row: row.Category.split('_')[0] == 'True', axis=1)
#     correct = train_data.loc[idx].copy()
#     correct['c'] = correct.groupby(['QuestionId','MC_Answer']).MC_Answer.transform('count')
#     correct = correct.sort_values('c', ascending=False)
#     correct = correct.drop_duplicates(['QuestionId'])[['QuestionId','MC_Answer']]
#     correct['is_correct'] = 1
#     return correct


# def prepare_answer_choices(train_data, mapping_file='/kaggle/input/question-label-mapping/question_answer_choice_mapping.csv'):
#     """各問題のMC_Answer選択肢を準備（マッピングファイルを使用、小文字ラベル）"""
#     # マッピングファイルを読み込み
#     mapping_df = pd.read_csv(mapping_file)

#     # 各QuestionIdごとに選択肢を作成
#     choices_list = []

#     for question_id in train_data['QuestionId'].unique():
#         # 該当QuestionIdのマッピングを取得
#         question_mapping = mapping_df[mapping_df['QuestionId'] == question_id].copy()

#         if len(question_mapping) > 0:
#             # Choice（A,B,C,D）でソート
#             question_mapping = question_mapping.sort_values('Choice')
#             # 選択肢文字列を作成（小文字ラベル）
#             choice_items = []
#             choice_mapping = {}  # MC_Answer -> choice label のマッピング
#             for _, row in question_mapping.iterrows():
#                 lowercase_choice = row['Choice'].lower()  # A -> a, B -> b, etc.
#                 choice_items.append(f"{lowercase_choice}. {row['MC_Answer']}")
#                 choice_mapping[row['MC_Answer']] = lowercase_choice
#             answer_choices_str = '\n'.join(choice_items)
#         else:
#             # マッピングがない場合は従来の番号方式にフォールバック
#             question_answers = train_data[train_data['QuestionId'] == question_id]['MC_Answer'].unique()
#             choice_items = []
#             choice_mapping = {}
#             for i, ans in enumerate(question_answers):
#                 lowercase_choice = chr(ord('a') + i)  # a, b, c, d, ...
#                 choice_items.append(f"{lowercase_choice}. {ans}")
#                 choice_mapping[ans] = lowercase_choice
#             answer_choices_str = '\n'.join(choice_items)

#         choices_list.append({
#             'QuestionId': question_id,
#             'answer_choices_str': answer_choices_str,
#             'choice_mapping': choice_mapping  # MC_Answer -> choice label のマッピングも保存
#         })

#     choices = pd.DataFrame(choices_list)
#     return choices


# def format_input(row):
#     """入力データをモデル用プロンプトにフォーマット（選択肢付き、回答をラベルに変換）"""
#     if row["is_correct"]:
#         status = "Yes"
#     else:
#         status = "No"

#     # MC_Answerを選択肢ラベル（a, b, c, d）に変換
#     student_answer_label = row.get('choice_label', row['MC_Answer'])  # フォールバック

#     # Qwen2.5-Math用の数学タスクに特化したプロンプト（選択肢付き）
#     prompt = (
#         "<|user|>\n"
#         f"[Mathematical Misconception Analysis Task]\n\n"
#         f"Question: {row['QuestionText']}\n"
#         f"Answer: {row['MC_Answer']}\n"
#         f"Correct?: {status}\n"
#         f"Explanation: {row['StudentExplanation']}\n"
#         "<|end|>\n"
#         "<|assistant|>\n"
#         "<think>\n"
#         "Let me analyze this mathematical misconception...\n"
#         "</think>\n\n"
#     )
#     return prompt


# def tokenize_dataset(dataset, tokenizer, max_len):
#     """データセットをトークナイズ"""
#     def tokenize(batch):
#         # パディングはDataCollatorで行うため、ここではトークナイズのみ
#         return tokenizer(
#             batch['text'],
#             padding=False,  # パディングはDataCollatorに任せる
#             truncation=True,
#             max_length=max_len,
#             return_tensors=None  # map時は'None'を使用
#         )

#     dataset = dataset.map(tokenize, batched=True, batch_size=100)
#     # columnsの設定時にlabelを保持
#     columns = ['input_ids', 'attention_mask', 'label'] if 'label' in dataset.column_names else ['input_ids', 'attention_mask']
#     dataset.set_format(type='torch', columns=columns)
#     return dataset


# def compute_map3(eval_pred):
#     """Top-3 予測に基づくMAP@3を計算"""
#     print(f"[DEBUG] *** compute_map3 function called! ***")
#     print(f"[DEBUG] eval_pred type: {type(eval_pred)}")
#     print(f"[DEBUG] eval_pred: {eval_pred}")

#     try:
#         logits, labels = eval_pred
#         print(f"[DEBUG] compute_map3 called with logits shape: {logits.shape}, labels shape: {labels.shape}")

#         probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
#         top3 = np.argsort(-probs, axis=1)[:, :3]
#         score = 0.0
#         for i, label in enumerate(labels):
#             ranks = top3[i]
#             if ranks[0] == label:
#                 score += 1.0
#             elif ranks[1] == label:
#                 score += 1.0 / 2
#             elif ranks[2] == label:
#                 score += 1.0 / 3

#         map3_score = score / len(labels)
#         result = {"eval_map@3": map3_score}
#         print(f"[DEBUG] compute_map3 returning: {result}")
#         print(f"[DEBUG] *** compute_map3 function completed successfully! ***")
#         return result

#     except Exception as e:
#         print(f"[ERROR] compute_map3 failed: {e}")
#         print(f"[ERROR] Exception type: {type(e)}")
#         import traceback
#         print(f"[ERROR] Full traceback: {traceback.format_exc()}")
#         return {"eval_map@3": 0.0}


# def create_submission(predictions, test_data, label_encoder):
#     """予測結果から提出用ファイルを作成"""
#     probs = torch.nn.functional.softmax(torch.tensor(predictions.predictions), dim=1).numpy()
#     top3 = np.argsort(-probs, axis=1)[:, :3]
#     flat = top3.flatten()
#     decoded = label_encoder.inverse_transform(flat)
#     top3_labels = decoded.reshape(top3.shape)
#     pred_strings = [" ".join(r) for r in top3_labels]

#     submission = pd.DataFrame({
#         'row_id': test_data.row_id.values,
#         'Category:Misconception': pred_strings
#     })
#     return submission

# """
# 保存された結合済みモデルとclassifierレイヤーを読み込むためのスクリプト
# """

# import torch
# import torch.nn as nn
# import pickle
# import os
# from transformers import AutoModel, AutoTokenizer
# import joblib

# import os
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.preprocessing import LabelEncoder
# from sklearn.model_selection import train_test_split
# from transformers import (
#     AutoModelForSequenceClassification,
#     AutoTokenizer,
#     TrainingArguments,
#     Trainer,
#     AutoConfig
# )
# from transformers.modeling_outputs import SequenceClassifierOutput
# from datasets import Dataset
# import joblib
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from peft import LoraConfig, get_peft_model, TaskType, PeftModel
# from transformers import AutoModel
# import wandb
# from transformers import EarlyStoppingCallback, TrainerCallback
# import pickle
# from collections import defaultdict
# import os, json, torch
# from transformers import AutoModel, AutoTokenizer
# import torch.nn as nn


# class Qwen2ForSequenceClassificationWithMaskedLoss(nn.Module):
#     """保存された結合済みモデルを読み込むためのクラス"""

#     def __init__(self, model_dir):
#         super().__init__()

#         # 1. モデル設定の読み込み
#         config_path = os.path.join(model_dir, "model_config.pkl")
#         with open(config_path, 'rb') as f:
#             config = pickle.load(f)

#         # 2. Question-Labelマッピングの読み込み
#         mapping_path = os.path.join(model_dir, "question_label_mapping.pkl")
#         with open(mapping_path, 'rb') as f:
#             self.question_label_map = pickle.load(f)

#         # 3. ベースモデル（結合済み）の読み込み
#         self.qwen = AutoModel.from_pretrained(
#             model_dir,
#             trust_remote_code=True,
#             torch_dtype=torch.float16,
#             device_map="auto",
#             low_cpu_mem_usage=True
#         )

#         # 4. キャッシュの無効化
#         if hasattr(self.qwen.config, "use_cache"):
#             self.qwen.config.use_cache = False

#         # 5. Dropout層の初期化
#         self.dropout = nn.Dropout(config['dropout'])

#         # 6. Classifierレイヤーの初期化
#         base_dtype = next(self.qwen.parameters()).dtype
#         device = next(self.qwen.parameters()).device
#         self.num_labels = config['n_classes']
#         self.classifier = nn.Linear(
#             self.qwen.config.hidden_size,
#             self.num_labels
#         ).to(dtype=base_dtype).to(device)

#         # 7. 保存されたclassifier重みの読み込み
#         classifier_path = os.path.join(model_dir, "classifier_weights.pt")
#         classifier_data = torch.load(classifier_path, map_location=device)
#         self.classifier.load_state_dict(classifier_data['classifier_state_dict'])

#         # 8. その他の設定
#         self.mask_value = -65000.0
#         self.config = self.qwen.config
#         self.config.num_labels = self.num_labels

#         print(f"Model loaded successfully from: {model_dir}")
#         print(f"  - Number of classes: {self.num_labels}")
#         print(f"  - Question-label mapping: {len(self.question_label_map)} questions")
#         print(f"  - Mask value: {self.mask_value}")

#     def forward(self, input_ids, attention_mask=None, labels=None, question_ids=None, **kwargs):
#         """フォワードパス（train_mask_loss.pyと同じ実装）"""
#         outputs = self.qwen(input_ids=input_ids, attention_mask=attention_mask)
#         pooled_output = outputs.last_hidden_state[:, -1, :]
#         pooled_output = self.dropout(pooled_output)
#         logits = self.classifier(pooled_output)

#         # QuestionIdベースのマスクを適用
#         if question_ids is not None and self.question_label_map is not None:
#             masked_logits = self.apply_question_mask(logits, question_ids)
#         else:
#             masked_logits = logits

#         loss = None
#         if labels is not None:
#             loss_fct = nn.CrossEntropyLoss()
#             if labels.dim() > 1:
#                 labels = labels.view(-1)
#             if masked_logits.dim() == 3:
#                 masked_logits = masked_logits.view(-1, self.num_labels)

#             try:
#                 loss = loss_fct(masked_logits, labels)
#                 if torch.isnan(loss) or torch.isinf(loss):
#                     loss = torch.tensor(100.0, requires_grad=True, device=masked_logits.device)
#             except Exception as e:
#                 print(f"Error computing loss: {e}")
#                 loss = torch.tensor(100.0, requires_grad=True, device=masked_logits.device)

#         from transformers.modeling_outputs import SequenceClassifierOutput
#         return SequenceClassifierOutput(loss=loss, logits=masked_logits)

#     def apply_question_mask(self, logits, question_ids):
#         """QuestionIdごとに無効なラベルをマスクする"""
#         batch_size = logits.size(0)
#         device = logits.device

#         mask = torch.full_like(logits, self.mask_value)

#         for i in range(batch_size):
#             q_id = question_ids[i].item() if torch.is_tensor(question_ids[i]) else question_ids[i]

#             if q_id in self.question_label_map:
#                 valid_labels = self.question_label_map[q_id]
#                 for label_idx in valid_labels:
#                     mask[i, label_idx] = 0

#         masked_logits = logits + mask
#         return masked_logits


# def load_model_for_inference(model_dir):
#     """
#     推論用にモデルを読み込む便利関数

#     Args:
#         model_dir: 保存されたモデルのディレクトリ

#     Returns:
#         model: 読み込まれたモデル
#         tokenizer: トークナイザー
#         label_encoder: ラベルエンコーダー
#     """
#     # モデルの読み込み
#     model = Qwen2ForSequenceClassificationWithMaskedLoss(model_dir)
#     model.eval()

#     # トークナイザーの読み込み
#     tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
#     if tokenizer.pad_token is None:
#         tokenizer.pad_token_id = 0
#         tokenizer.pad_token = tokenizer.decode([0])

#     # ラベルエンコーダーの読み込み
#     label_encoder_path = os.path.join(model_dir, "label_encoder.joblib")
#     label_encoder = joblib.load(label_encoder_path)

#     return model, tokenizer, label_encoder


# """
# Phi-4 モデル予測スクリプト - マスク付き損失関数版で訓練されたモデル用
# テストデータに対して予測を実行し、提出ファイルを生成
# """

# import os
# import pandas as pd
# import numpy as np
# import torch
# import torch.nn as nn
# from transformers import AutoTokenizer, AutoModel
# from peft import PeftModel
# import joblib
# from datasets import Dataset
# from tqdm import tqdm
# import pickle
# from collections import defaultdict

# # カスタムモジュールのインポート
# # from config import *
# # from utils_with_choices import prepare_answer_choices, format_input
# # from train_phi_4_best_2 import Phi4ForSequenceClassificationWithMaskedLoss

# # 警告を無効化
# import warnings
# warnings.filterwarnings('ignore')

# def prepare_correct_answers(train_data):
#     """正解答案データを準備"""
#     idx = train_data.apply(lambda row: row.Category.split('_')[0] == 'True', axis=1)
#     correct = train_data.loc[idx].copy()
#     correct['c'] = correct.groupby(['QuestionId','MC_Answer']).MC_Answer.transform('count')
#     correct = correct.sort_values('c', ascending=False)
#     correct = correct.drop_duplicates(['QuestionId'])[['QuestionId','MC_Answer']]
#     correct['is_correct'] = 1
#     return correct


# def format_input(row):
#     """入力データをモデル用プロンプトにフォーマット"""
#     if row["is_correct"]:
#         status = "Yes"
#     else:
#         status = "No"

#     # Phi-4用のプロンプトフォーマット（特別なthinkタグを含む）
#     prompt = (
#         "<|user|>\n"
#         f"[Mathematical Misconception Analysis Task]\n\n"
#         f"Question: {row['QuestionText']}\n"
#         f"Answer: {row['MC_Answer']}\n"
#         f"Correct?: {status}\n"
#         f"Explanation: {row['StudentExplanation']}\n"
#         "<|end|>\n"
#         "<|assistant|>\n"
#         "<think>\n"
#         "Let me analyze this mathematical misconception...\n"
#         "</think>\n\n"
#     )
#     return prompt

# def prepare_test_data(test_df):
#     """テストデータの前処理"""
#     print("Preparing test data...")

#     # --- QuestionId 32835のQuestionTextを更新 ---
#     print("Updating QuestionId 32835...")
#     new_question_text = "Which number is the greatest? Options: 6.0000 6.2 6.079 6.0001"
#     mask_32835 = test_df['QuestionId'] == 32835
#     update_count = mask_32835.sum()

#     if update_count > 0:
#         original_text = test_df[mask_32835]['QuestionText'].iloc[0]
#         print(f"Found {update_count} rows with QuestionId 32835")
#         print(f"Original: {original_text[:80]}...")
#         print(f"Updated to: {new_question_text}")
#         test_df.loc[mask_32835, 'QuestionText'] = new_question_text
#     else:
#         print("No rows found with QuestionId 32835")

#     # --- 選択肢データの準備 ---
#     print("Preparing answer choices for each question...")
#     choices = prepare_answer_choices(test_df)
#     test_df = test_df.merge(choices[['QuestionId', 'answer_choices_str']], on='QuestionId', how='left')

#     train = pd.read_csv(TRAIN_DATA_PATH)
#     train.Misconception = train.Misconception.fillna('NA')
#     correct = prepare_correct_answers(train)

#     # --- MC_Answerを選択肢ラベルに変換 ---
#     print("Converting MC_Answer to choice labels...")
#     def get_choice_label(row):
#         question_id = row['QuestionId']
#         mc_answer = row['MC_Answer']
#         # 該当するchoice_mappingを取得
#         choice_mapping = choices[choices['QuestionId'] == question_id]['choice_mapping'].iloc[0]
#         return choice_mapping.get(mc_answer, mc_answer)  # マッピングがない場合は元の値

#     test_df['choice_label'] = test_df.apply(get_choice_label, axis=1)

#     # --- Format the input text (is_correct comes from merging the train-derived correct answers; unmatched rows are filled with 0) ---
#     print("Formatting input text with answer choices...")
#     # test_df['is_correct'] = False  # テストデータには正解情報がないためダミー値
#     # test_df['text'] = test_df.apply(format_input, axis=1)
#     test_df = test_df.merge(correct, on=['QuestionId','MC_Answer'], how='left')
#     test_df.is_correct = test_df.is_correct.fillna(0)
#     test_df['text'] = test_df.apply(format_input, axis=1)
#     print(f"Test data shape: {test_df.shape}")
#     print("Example test prompt:")
#     print(test_df.text.values[0])

#     return test_df


# def tokenize_test_dataset(dataset, tokenizer, max_len):
#     """テストデータセットのトークナイズ（QuestionId付き）"""
#     def tokenize(batch):
#         tokenized = tokenizer(
#             batch['text'],
#             padding=False,  # パディングはDataCollatorに任せる
#             truncation=True,
#             max_length=max_len,
#             return_tensors=None
#         )
#         # QuestionIdとrow_idを保持
#         tokenized['question_ids'] = batch['QuestionId']
#         tokenized['row_ids'] = batch['row_id']
#         return tokenized

#     tokenized_dataset = dataset.map(
#         tokenize,
#         batched=True,
#         remove_columns=['text', 'QuestionId', 'row_id']
#     )

#     return tokenized_dataset


# class DataCollatorWithQuestionIdForTest:
#     """テスト用QuestionIdを含むカスタムデータコレーター"""
#     def __init__(self, tokenizer, max_length):
#         self.tokenizer = tokenizer
#         self.max_length = max_length

#     def __call__(self, features):
#         # バッチの最大長を取得
#         max_length = max(len(feature["input_ids"]) for feature in features)

#         # パディング
#         batch = {}
#         for key in features[0].keys():
#             if key in ["input_ids", "attention_mask"]:
#                 # input_idsとattention_maskをパディング
#                 padded = []
#                 for feature in features:
#                     # tensorをlistに変換
#                     if torch.is_tensor(feature[key]):
#                         feature_list = feature[key].tolist()
#                     else:
#                         feature_list = feature[key]

#                     remainder = [self.tokenizer.pad_token_id if key == "input_ids" else 0] * (max_length - len(feature_list))
#                     padded_feature = feature_list + remainder
#                     padded.append(padded_feature)
#                 batch[key] = torch.tensor(padded, dtype=torch.long)
#             elif key == "question_ids":
#                 # question_idsはパディング不要
#                 batch[key] = torch.tensor([f[key] for f in features], dtype=torch.long)
#             elif key == "row_ids":
#                 # row_idsはリストのまま保持
#                 batch[key] = [f[key] for f in features]

#         return batch

# def predict_with_model(model, tokenizer, test_dataset, data_collator, batch_size=4, question_label_map=None):
#     """モデルで予測を実行（マスクなし予測後に有効ラベルからTOP3選択）"""
#     model.eval()

#     all_predictions = []
#     all_row_ids = []
#     all_question_ids = []
#     all_probs = []   # ← 確率ベクトル保存用リストを追加

#     # データローダーの作成（カスタムデータコレーターを使用）
#     dataloader = torch.utils.data.DataLoader(
#         test_dataset,
#         batch_size=batch_size,
#         shuffle=False,
#         collate_fn=data_collator
#     )

#     print(f"Predicting on {len(test_dataset)} samples...")

#     with torch.no_grad():
#         for batch in tqdm(dataloader, desc="Predicting"):
#             input_ids = batch['input_ids'].cuda()
#             attention_mask = batch['attention_mask'].cuda()
#             question_ids = batch['question_ids']
#             row_ids = batch['row_ids']

#             # 予測
#             outputs = model(
#                 input_ids=input_ids,
#                 attention_mask=attention_mask
#             )
#             probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()

#             batch_predictions = []
#             for i, q_id in enumerate(question_ids):
#                 q_id_value = q_id.item() if torch.is_tensor(q_id) else q_id
#                 sample_probs = probs[i]

#                 # 🔑 各サンプルの確率ベクトルを保存
#                 all_probs.append(sample_probs)

#                 if question_label_map and q_id_value in question_label_map:
#                     valid_labels = list(question_label_map[q_id_value])
#                     valid_probs = [(label, sample_probs[label]) for label in valid_labels]
#                     valid_probs.sort(key=lambda x: x[1], reverse=True)

#                     top3_labels = [label for label, _ in valid_probs[:3]]
#                     while len(top3_labels) < 3:
#                         top3_labels.append(0)

#                     batch_predictions.append(top3_labels)
#                 else:
#                     top3_indices = np.argsort(-sample_probs)[:3]
#                     batch_predictions.append(top3_indices)

#             all_predictions.extend(batch_predictions)
#             all_row_ids.extend(row_ids)
#             all_question_ids.extend([q.item() if torch.is_tensor(q) else q for q in question_ids])

#     return all_predictions, all_row_ids, all_probs


# def main():
#     """メイン予測関数"""

#     # GPU設定
#     if CUDA_VISIBLE_DEVICES is not None:
#         os.environ['CUDA_VISIBLE_DEVICES'] = CUDA_VISIBLE_DEVICES
#         print(f"Using CUDA device(s): {CUDA_VISIBLE_DEVICES}")



#     # モデルの読み込み
#     model, tokenizer, le = load_model_for_inference(MODEL_DIR)

#     n_classes = len(le.classes_)
#     print(f"Number of classes: {n_classes}")

#     # QuestionId-Labelマッピングの読み込み
#     mapping_path = f"{MODEL_DIR}/question_label_mapping.pkl"
#     print(f"Loading question-label mapping from: {mapping_path}")
#     with open(mapping_path, 'rb') as f:
#         question_label_map = pickle.load(f)
#     print(f"Loaded mapping for {len(question_label_map)} questions")

#     # --- テストデータの読み込みと前処理 ---
#     print("\nLoading and preprocessing test data...")
#     test_df = pd.read_csv(TEST_DATA_PATH)
#     print(f"Original test data shape: {test_df.shape}")

#     # 前処理の実行
#     test_df = prepare_test_data(test_df)

#     # データセットの作成
#     COLS = ['text', 'QuestionId', 'row_id']
#     test_ds = Dataset.from_pandas(test_df[COLS])

#     # # --- トークナイザーの初期化 ---
#     # print("\nInitializing tokenizer...")
#     # tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

#     # # パディングトークンの設定
#     # if tokenizer.pad_token is None:
#     #     tokenizer.pad_token_id = 0
#     #     tokenizer.pad_token = tokenizer.decode([0])

#     # --- テストデータのトークナイズ ---
#     print("Tokenizing test dataset...")
#     test_ds = tokenize_test_dataset(test_ds, tokenizer, MAX_LEN)

#     # --- モデルの読み込み ---
#     print("\nLoading model...")

#     # --- データコレーターの作成 ---
#     data_collator = DataCollatorWithQuestionIdForTest(tokenizer=tokenizer, max_length=MAX_LEN)

#     # --- 予測の実行 ---
#     print("\nRunning predictions...")
#     # predictions, row_ids = predict_with_model(
#     #     model,
#     #     tokenizer,
#     #     test_ds,
#     #     data_collator,
#     #     batch_size=EVAL_BATCH_SIZE,
#     #     question_label_map=question_label_map
#     # )
#     all_predictions, all_row_ids, probs_all = predict_with_model(
#     model,
#     tokenizer,
#     test_ds,
#     data_collator,
#     batch_size=EVAL_BATCH_SIZE,
#     question_label_map=question_label_map
#     )

#     # --- 予測結果の処理 ---
#     print("\nProcessing predictions...")



#     # --- 提出ファイルの作成 ---
#     submission_data = []
#     for row_id, pred_indices, prob_vector in zip(all_row_ids, all_predictions, probs_all):
#         # ラベルに変換
#         pred_labels = le.inverse_transform(range(len(prob_vector)))
#         prob_list = prob_vector.tolist()

#         row = {
#             "row_id": row_id,
#             "preds": str(list(pred_labels)),  # 全クラスのラベル
#             "probs": str(list(prob_list))     # 各ラベルの確率
#         }
#         submission_data.append(row)

#     submission_df = pd.DataFrame(submission_data)
#     submission_df = submission_df.sort_values("row_id")

#     submission_path = "./phi4_masked_submission.csv"
#     submission_df.to_csv(submission_path, index=False)
#     print(f"Saved prediction file for ensemble: {submission_path}")


#     # 統計情報の表示
#     print(f"\nSubmission statistics:")
#     print(f"Total predictions: {len(submission_df)}")
#     print(f"First 5 predictions:")
#     print(submission_df.head())

#     # ユニークな誤概念の数を計算
#     all_misconceptions = []
#     for misconceptions in submission_df['Category:Misconception'].values:
#         all_misconceptions.extend(misconceptions.split())
#     unique_misconceptions = len(set(all_misconceptions))
#     print(f"\nUnique misconceptions in predictions: {unique_misconceptions}")

#     print("\nPrediction completed successfully!")
#     return submission_df


# if __name__ == "__main__":
#     submission = main()

In [9]:
!python qwen3_32b_0_947.py

2025-09-23 10:35:38.704035: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758623738.888515      44 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758623738.944436      44 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Found 2 GPUs
Loading label encoder...
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
Loading trained model and tokenizer...
Loading fine-tuned LoRA model from: /kaggle/input/qwen3-32b-9468/transformers/default/1/ver_2/best
Loading base model from: /kaggle/input/qwen-3/transformers/32b/1
Loading checkpoint shards: 100%|████████████████| 17/17 [05:41<00:00, 20.07s/it]
Some weights 

In [10]:
!python phi_4_reasoning_0_948.py

2025-09-23 10:41:59.580180: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758624119.606795      81 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758624119.614434      81 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Found 2 GPUs
Loading label encoder...
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
Loading trained model and tokenizer...
Loading fine-tuned LoRA model from: /kaggle/input/phi-4-reasoning-plus09476-ft/transformers/default/1/ver_2_9476ft/checkpoint-1722
Loading base model from: /kaggle/input/phi4-reasoning-plus/transformers/default/1/Phi-4-reasoning-plus
Loading checkpoint shards

In [11]:
!python phi_4_0_948_fulltrain.py

2025-09-23 10:44:58.042286: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758624298.064934     112 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758624298.072009     112 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Found 2 GPUs
Loading label encoder...
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
Loading trained model and tokenizer...
Loading fine-tuned LoRA model from: /kaggle/input/phi-4-cv0965-fulltrain/transformers/default/1/ver_2_0965ft/checkpoint-1722
Loading base model from: /kaggle/input/ms-phi4/transformers/default/1/phi-4
Loading checkpoint shards: 100%|██████████████████| 6/6 [0

In [12]:
!python deepseek_0_946.py

2025-09-23 10:47:48.935993: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758624468.958211     143 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758624468.965103     143 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Found 2 GPUs
Loading label encoder...
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
Loading trained model and tokenizer...
Loading fine-tuned LoRA model from: /kaggle/input/deepseek-r1-distill-qwen-14b-cv0.9455-fulltrain/transformers/default/1/ver_2/checkpoint-1722
Loading base model from: /kaggle/input/deepseek-r1/transformers/deepseek-r1-distill-qwen-14b/2
Loading checkpoint sh

In [13]:
!python qwen3_14b_0_946.py

2025-09-23 10:50:39.946718: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758624639.969482     174 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758624639.976231     174 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Found 2 GPUs
Loading label encoder...
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
Loading trained model and tokenizer...
Loading fine-tuned LoRA model from: /kaggle/input/qwen3-14b-lb0.945-fulltrain/transformers/default/1/ver_2/checkpoint-1722
Loading base model from: /kaggle/input/qwen-3/transformers/14b/1
Loading checkpoint shards: 100%|██████████████████| 8/8 [02:33<00:00, 1

In [14]:
!python  phi_new_loss_0_947.py

2025-09-23 10:53:33.906704: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758624813.929240     205 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758624813.936130     205 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using CUDA device(s): 0,1
Loading checkpoint shards: 100%|██████████████████| 6/6 [04:20<00:00, 43.44s/it]
Model loaded successfully from: /kaggle/input/phi4-merged-masked
  - Number of classes: 65
  - Question-label mapping: 15 questions
  - Mask value: -65000.0
Number of classes: 65
Loading question-label mapping from: /kaggle/input/phi4-merged-masked/question_label_mapping.pkl
Loaded mapping for 15 questions

Loading 

In [15]:
# import pandas as pd
# a = pd.read_csv('/kaggle/working/phi4_masked_submission.csv')

In [16]:
# a

In [17]:
from collections import defaultdict

def get_top_k_ensemble(ll, k=3, weights=None):
    """Blend several ranked label lists into one top-k list by weighted rank voting.

    Each input string is a whitespace-separated ranking, best label first.
    Within a ranking of length n, the label at 0-based rank r earns
    ``(n - r) * weight`` points. Points are summed across all rankings and
    the k highest-scoring labels are returned.

    Args:
        ll: Sequence of whitespace-separated ranking strings.
        k: Maximum number of labels to return.
        weights: Optional per-ranking multipliers, one per entry of ``ll``.
            When omitted, every ranking gets weight 1.0.

    Returns:
        A single space-joined string of at most k labels, highest score
        first. Labels with equal scores keep first-seen order.

    Raises:
        ValueError: If ``weights`` is given and its length differs from the
            number of rankings.
    """
    # split() without a separator ignores leading, trailing and repeated
    # whitespace, so a stray space never becomes an empty-string "label".
    rankings = [ranking.split() for ranking in ll]

    if weights is None:
        weights = [1.0] * len(rankings)
    elif len(weights) != len(rankings):
        raise ValueError(
            f"expected {len(rankings)} weights (one per ranking), got {len(weights)}"
        )

    score = defaultdict(float)
    for weight, ranking in zip(weights, rankings):
        size = len(ranking)
        for rank, item in enumerate(ranking):
            score[item] += (size - rank) * weight

    # sorted() is stable, so ties fall back to dict insertion order.
    ranked = sorted(score.items(), key=lambda pair: -pair[1])
    return ' '.join(item for item, _ in ranked[:k])

# Smoke test: blend four space-separated rankings with the default (equal) weights.
list1, list2, list3, list4 = 'a b d f', 'b c a e', 'c e b', 'c e d'

print(get_top_k_ensemble(ll=[list1, list2, list3, list4], k=3))

c b a


In [18]:
import pandas as pd
import numpy as np

# Load every model's prediction CSV, order it by row_id and give it a fresh
# 0..n-1 index so the frames can later be walked in lockstep.
df1, df2, df3, df4, df5, df6 = (
    pd.read_csv(csv_path).sort_values('row_id').reset_index(drop=True)
    for csv_path in (
        '/kaggle/working/deepseek_r1_submission.csv',
        '/kaggle/working/qwen3_14b_submission.csv',
        '/kaggle/working/phi_4_reasoning_0_948_submission.csv',
        '/kaggle/working/phi_4_0_948_fulltrain_submission.csv',
        '/kaggle/working/qwen3_32b_0_947_submission.csv',
        '/kaggle/working/phi4_masked_submission.csv',
    )
)

In [None]:
# Blend the six models' class probabilities with a weighted average and keep
# the three most probable labels for every row.

# Per-model blend weights, in the same order as `model_frames`. They are
# deliberately unequal; edit them here to re-balance the ensemble.
weights = np.array([0.1, 0.1, 0.2, 0.2, 0.15, 0.25], dtype=float)
model_frames = [df1, df2, df3, df4, df5, df6]
if len(weights) != len(model_frames):
    raise ValueError(
        f"expected {len(model_frames)} weights (one per model), got {len(weights)}"
    )

# zip() below pairs rows purely by position and stops at the shortest frame,
# so make sure every frame carries exactly the same row_id sequence first.
reference_row_ids = df1['row_id'].tolist()
for model_number, frame in enumerate(model_frames[1:], start=2):
    if frame['row_id'].tolist() != reference_row_ids:
        raise ValueError(f"df{model_number} row_id values do not match df1")


def _label_sorted(prediction_row):
    """Return (labels, probs) for one prediction row, both ordered by label.

    NOTE(review): `preds` and `probs` are parsed with eval(), which executes
    whatever expression the CSV cell contains. That is only acceptable for
    files written by this notebook's own scripts; never point this at an
    untrusted CSV. ast.literal_eval would be the safer parser, provided the
    stored reprs are plain Python literals -- TODO confirm before switching.
    """
    pairs = sorted(
        zip(eval(prediction_row.probs), eval(prediction_row.preds)),
        key=lambda pair: pair[1],
    )
    return [label for _, label in pairs], [prob for prob, _ in pairs]


ensemble_rows = []
for model_rows in zip(*(frame.itertuples() for frame in model_frames)):
    per_model = [_label_sorted(model_row) for model_row in model_rows]

    # Sorting by label puts every model's probabilities in the same class
    # order; the first model's labels serve as the reference.
    choices = per_model[0][0]
    for model_number, (labels, _) in enumerate(per_model[1:], start=2):
        if labels != choices:
            raise ValueError(
                f"row_id {model_rows[0].row_id}: df{model_number} labels differ from df1"
            )

    weighted_probs = np.average(
        [probs for _, probs in per_model], axis=0, weights=weights
    )

    # Highest blended probability first; the sort is stable, so ties keep
    # label order.
    final_prob_preds = sorted(zip(choices, weighted_probs), key=lambda pair: -pair[1])

    ensemble_rows.append({
        "row_id": model_rows[0].row_id,
        "Category:Misconception": " ".join([label for label, _ in final_prob_preds[:3]]),
    })

# Explicit columns keep the submission schema even when there are no rows.
ensemble_predictions = pd.DataFrame(
    ensemble_rows, columns=["row_id", "Category:Misconception"]
)


In [20]:
# ensemble_predictions = []
# for r1, r2, r3, r4, r5 in zip(df1.itertuples(), df2.itertuples(), df3.itertuples(), df4.itertuples(), df5.itertuples()):

#     prob_preds_1 = sorted([(pb, pr) for pb, pr in zip(eval(r1.probs), eval(r1.preds))], key=lambda x: x[1])
#     prob_preds_2 = sorted([(pb, pr) for pb, pr in zip(eval(r2.probs), eval(r2.preds))], key=lambda x: x[1])
#     prob_preds_3 = sorted([(pb, pr) for pb, pr in zip(eval(r3.probs), eval(r3.preds))], key=lambda x: x[1])
#     prob_preds_4 = sorted([(pb, pr) for pb, pr in zip(eval(r4.probs), eval(r4.preds))], key=lambda x: x[1])
#     prob_preds_5 = sorted([(pb, pr) for pb, pr in zip(eval(r5.probs), eval(r5.preds))], key=lambda x: x[1])

#     # Should be same for all row_ids
#     choices = [x[1] for x in prob_preds_1]

#     mean_probs = np.mean([
#         [x[0] for x in prob_preds_1],
#         [x[0] for x in prob_preds_2],
#         [x[0] for x in prob_preds_3],
#         [x[0] for x in prob_preds_4],
#         [x[0] for x in prob_preds_5],
#     ],
#     axis=0)

#     final_prob_preds = sorted([(l, p) for l, p in zip(choices, mean_probs)], key=lambda x: -x[1])

#     row = {
#         "row_id": r1.row_id,
#         "Category:Misconception": " ".join([x[0] for x in final_prob_preds[:3]])
#     }

#     ensemble_predictions.append(row)

# ensemble_predictions = pd.DataFrame(ensemble_predictions)

In [21]:
# Write the blended top-3 predictions to the submission file, omitting the row index.
ensemble_predictions.to_csv(path_or_buf='submission.csv', index=False)

In [22]:
# Display the final submission frame (columns: row_id, Category:Misconception).
ensemble_predictions

Unnamed: 0,row_id,Category:Misconception
0,36696,True_Correct:NA True_Neither:NA True_Misconcep...
1,36697,False_Misconception:WNB False_Neither:NA False...
2,36698,True_Neither:NA True_Correct:NA True_Misconcep...
