In [None]:
!pip install --no-index --no-deps /kaggle/input/bitsandbytes-20250725/bitsandbytes/bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl

In [None]:
%%writefile deepseek_0_946.py

import os
import gc
import pandas as pd
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset
import joblib
import torch

try:
    from peft import PeftModel, PeftConfig
    PEFT_AVAILABLE = True
except ImportError:
    PEFT_AVAILABLE = False
    print("Warning: PEFT not available, will use base model only")


# Model configuration
# NOTE: the inference code in this file reads only MODEL_NAME, MAX_LEN,
# EVAL_BATCH_SIZE, the data paths, BEST_MODEL_PATH, LABEL_ENCODER_PATH and
# SUBMISSION_OUTPUT_PATH. The other constants look like training settings
# carried along with the checkpoint (presumably to mirror the training
# script; confirm before removing them).
VER = 2
MODEL_NAME = "/kaggle/input/deepseek-r1/transformers/deepseek-r1-distill-qwen-14b/2"  # base model directory
MODEL_TYPE = "qwen2"  # DeepSeek-R1 is based on Qwen2 architecture
EPOCHS = 3  # training setting; not read by the inference code in this file
MAX_LEN = 250  # maximum number of tokens per prompt; longer prompts are truncated

# Directory settings
OUTPUT_DIR = "/kaggle/input/deepseek-r1-distill-qwen-14b-cv0.9455-fulltrain/transformers/default/1/ver_2"

# Training parameters
TRAIN_BATCH_SIZE = 8  # training batch size (not read at inference)
EVAL_BATCH_SIZE = 8  # passed to the Trainer as per_device_eval_batch_size
GRADIENT_ACCUMULATION_STEPS = 8  # training setting (not read at inference)
LEARNING_RATE = 2e-4
LOGGING_STEPS = 50
SAVE_STEPS = 200
EVAL_STEPS = 200


# Data paths
TRAIN_DATA_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/train.csv'
TEST_DATA_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/test.csv'

# Model save paths
BEST_MODEL_PATH = f"{OUTPUT_DIR}/checkpoint-1722"  # LoRA adapter checkpoint loaded for inference
LABEL_ENCODER_PATH = f"{OUTPUT_DIR}/label_encoder.joblib"

# Other settings
RANDOM_SEED = 42
VALIDATION_SPLIT = 0.0000001

# GPU settings
# NOTE(review): this value is never exported to os.environ in this file, so it
# does not restrict GPU visibility here.
CUDA_VISIBLE_DEVICES = "0,1"

# Submission settings
SUBMISSION_OUTPUT_PATH = 'deepseek_r1_submission.csv'

# WandB settings (the inference Trainer is created with report_to="none")
USE_WANDB = True  # Set to False to disable WandB
WANDB_PROJECT = "deepseek-r1-14b-math-misconceptions"
WANDB_RUN_NAME = f"deepseek-r1-14b-ver{VER}"
WANDB_ENTITY = None  # Set your WandB entity (username or team name) if needed

# Early stopping settings
USE_EARLY_STOPPING = True
EARLY_STOPPING_PATIENCE = 10  # max evaluations without improvement (evaluation runs every EVAL_STEPS)
EARLY_STOPPING_THRESHOLD = 0.001  # minimum change that counts as an improvement

# LoRA configuration
LORA_RANK = 32  # LoRA rank
LORA_ALPHA = 64  # LoRA scaling parameter
LORA_TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]  # modules that receive adapters
LORA_DROPOUT = 0.1  # LoRA dropout rate
LORA_BIAS = "none"  # bias handling: "none", "all", "lora_only"

# Memory optimization settings
USE_GRADIENT_CHECKPOINTING = True  # Enable gradient checkpointing
USE_8BIT_ADAM = False  # Use 8-bit Adam optimizer for memory efficiency
MAX_GRAD_NORM = 1.0  # Gradient clipping value

def prepare_correct_answers(train_data):
    """Derive the accepted answer of every question from the training labels.

    Rows whose ``Category`` has ``True`` before the first underscore are the
    ones graded as a correct answer.  For each ``QuestionId`` the ``MC_Answer``
    that occurs most often among those rows is kept.

    Args:
        train_data: DataFrame with at least the columns ``QuestionId``,
            ``MC_Answer`` and ``Category``.

    Returns:
        DataFrame with one row per question and the columns ``QuestionId``,
        ``MC_Answer`` and ``is_correct`` (always 1), ready to be left-merged
        onto another frame.
    """
    # Vectorised replacement for a row-wise ``apply``; a missing Category
    # compares unequal and is simply treated as "not a correct answer".
    is_true = train_data['Category'].str.split('_').str[0] == 'True'
    correct = train_data.loc[is_true].copy()

    # Frequency of each (question, answer) pair among the correct rows
    correct['c'] = correct.groupby(['QuestionId', 'MC_Answer']).MC_Answer.transform('count')

    # Most frequent answer first, then keep the first row per question.
    # NOTE: the default sort algorithm is kept on purpose so that ties are
    # resolved exactly as before.
    correct = correct.sort_values('c', ascending=False)
    correct = correct.drop_duplicates(['QuestionId'])[['QuestionId', 'MC_Answer']]
    correct['is_correct'] = 1
    return correct


def format_input(row):
    """Render one sample as the prompt string fed to the classifier.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            ``QuestionText``, ``MC_Answer``, ``StudentExplanation`` and a
            truthy/falsy ``is_correct`` flag.

    Returns:
        The prompt text, which ends with an empty ``<think>`` block after the
        assistant tag.
    """
    status = "Yes" if row["is_correct"] else "No"

    # Simple "User: ... Assistant: ..." layout used for DeepSeek-R1
    pieces = [
        "User: [Mathematical Misconception Analysis Task]\n\n",
        f"Question: {row['QuestionText']}\n",
        f"Answer: {row['MC_Answer']}\n",
        f"Correct?: {status}\n",
        f"Explanation: {row['StudentExplanation']}\n\n",
        "Assistant: <think>\n\n</think>\n\n",
    ]
    return "".join(pieces)


def tokenize_dataset(dataset, tokenizer, max_len):
    """Tokenize the ``text`` column and switch the dataset to torch format.

    Args:
        dataset: Dataset object offering ``map``, ``column_names`` and
            ``set_format`` and containing a ``text`` column.
        tokenizer: Callable tokenizer applied to batches of texts.
        max_len: Maximum sequence length; longer inputs are truncated.

    Returns:
        The mapped dataset, formatted as torch tensors restricted to
        ``input_ids``, ``attention_mask`` and, when present, ``label``.
    """
    encode_kwargs = dict(
        padding=False,        # padding is left to the data collator
        truncation=True,
        max_length=max_len,
        return_tensors=None,  # use None when running inside ``map``
    )

    def _encode(batch):
        # Tokenize only; no padding happens at this stage
        return tokenizer(batch['text'], **encode_kwargs)

    dataset = dataset.map(_encode, batched=True, batch_size=100)

    # Keep the label column only if the dataset actually has one
    columns = ['input_ids', 'attention_mask']
    if 'label' in dataset.column_names:
        columns.append('label')
    dataset.set_format(type='torch', columns=columns)
    return dataset


def compute_map3(eval_pred):
    """Compute MAP@3 from logits and integer labels.

    Each sample contributes ``1``, ``1/2`` or ``1/3`` when its label is the
    first, second or third highest-scoring class, and ``0`` otherwise.

    Args:
        eval_pred: Pair ``(logits, labels)`` where ``logits`` holds one score
            vector per sample and ``labels`` the matching class indices.

    Returns:
        ``{"map@3": score}`` with the mean score as a Python float.  An empty
        evaluation set yields ``0.0`` instead of dividing by zero.
    """
    logits, labels = eval_pred
    labels = np.asarray(labels).reshape(-1)
    if labels.size == 0:
        return {"map@3": 0.0}

    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
    # With fewer than three classes the slice is simply shorter; the rank
    # weights below are sized to match, so no out-of-range rank is accessed.
    top3 = np.argsort(-probs, axis=1)[:, :3]

    # Ranks within a row are distinct, so at most one position can match.
    hits = top3 == labels.reshape(-1, 1)
    rank_weights = 1.0 / np.arange(1, top3.shape[1] + 1)
    score = float((hits * rank_weights).sum())
    return {"map@3": score / labels.size}


# Candidate labels allowed for each QuestionId.
# NOTE(review): presumably the labels seen for that question in the training
# data; confirm the origin of this table before extending it.
_QUESTION_LABEL_CHOICES = {
    31772: [
        'True_Correct:NA', 'True_Neither:NA',
        'True_Misconception:Incomplete', 'True_Misconception:WNB',
        'False_Neither:NA', 'False_Misconception:WNB',
        'False_Misconception:Incomplete', 'False_Correct:NA',
    ],
    31774: [
        'False_Neither:NA', 'False_Misconception:SwapDividend',
        'False_Misconception:Mult', 'False_Correct:NA',
        'False_Misconception:FlipChange',
        'True_Correct:NA', 'True_Neither:NA',
        'True_Misconception:SwapDividend', 'True_Misconception:Mult',
        'True_Misconception:FlipChange',
    ],
    31777: [
        'False_Correct:NA', 'False_Misconception:Incomplete',
        'False_Neither:NA', 'False_Misconception:Irrelevant',
        'False_Misconception:Wrong_Fraction',
        'True_Correct:NA', 'True_Neither:NA',
    ],
    31778: [
        'False_Neither:NA', 'False_Misconception:Additive',
        'False_Misconception:Irrelevant', 'False_Correct:NA',
        'False_Misconception:WNB',
        'True_Neither:NA', 'True_Correct:NA',
        'True_Misconception:Irrelevant', 'True_Misconception:Additive',
    ],
    32829: [
        'True_Correct:NA', 'True_Neither:NA',
        'True_Misconception:Not_variable',
        'False_Neither:NA', 'False_Misconception:Adding_terms',
        'False_Correct:NA', 'False_Misconception:Not_variable',
        'False_Misconception:Inverse_operation',
    ],
    32833: [
        'True_Correct:NA', 'True_Neither:NA',
        'True_Misconception:Inversion', 'True_Misconception:Duplication',
        'False_Misconception:Duplication', 'False_Correct:NA',
        'False_Neither:NA', 'False_Misconception:Inversion',
        'False_Misconception:Wrong_Operation',
    ],
    32835: [
        'False_Misconception:Whole_numbers_larger', 'False_Neither:NA',
        'False_Correct:NA', 'False_Misconception:Longer_is_bigger',
        'False_Misconception:Ignores_zeroes',
        'False_Misconception:Shorter_is_bigger',
        'True_Correct:NA', 'True_Neither:NA',
        'True_Misconception:Whole_numbers_larger',
        'True_Misconception:Shorter_is_bigger',
        'True_Misconception:Longer_is_bigger',
    ],
    33471: [
        'True_Neither:NA', 'True_Correct:NA',
        'True_Misconception:Wrong_fraction',
        'False_Correct:NA', 'False_Misconception:Incomplete',
        'False_Neither:NA', 'False_Misconception:Wrong_fraction',
    ],
    33472: [
        'True_Neither:NA', 'True_Correct:NA',
        'True_Misconception:Adding_across',
        'True_Misconception:Denominator-only_change',
        'True_Misconception:Incorrect_equivalent_fraction_addition',
        'False_Correct:NA', 'False_Neither:NA',
        'False_Misconception:Denominator-only_change',
        'False_Misconception:Incorrect_equivalent_fraction_addition',
        'False_Misconception:Adding_across',
    ],
    33474: [
        'True_Correct:NA', 'True_Neither:NA',
        'True_Misconception:Division', 'True_Misconception:Subtraction',
        'False_Neither:NA', 'False_Misconception:Subtraction',
        'False_Misconception:Division', 'False_Correct:NA',
    ],
    76870: [
        'False_Misconception:Unknowable', 'False_Correct:NA',
        'False_Neither:NA', 'False_Misconception:Definition',
        'False_Misconception:Interior',
        'True_Correct:NA', 'True_Neither:NA',
        'True_Misconception:Definition',
    ],
    89443: [
        'False_Neither:NA', 'False_Misconception:Positive',
        'False_Misconception:Tacking',
        'True_Correct:NA', 'True_Neither:NA',
        'True_Misconception:Tacking', 'True_Misconception:Positive',
        'False_Correct:NA',
    ],
    91695: [
        'False_Neither:NA', 'False_Misconception:Wrong_term',
        'False_Correct:NA', 'False_Misconception:Firstterm',
        'True_Correct:NA', 'True_Misconception:Wrong_term',
        'True_Neither:NA', 'True_Misconception:Firstterm',
    ],
    104665: [
        'False_Neither:NA', 'False_Misconception:Base_rate',
        'False_Correct:NA',
        'True_Correct:NA', 'True_Neither:NA',
        'True_Misconception:Base_rate',
        'True_Misconception:Multiplying_by_4',
        'False_Misconception:Multiplying_by_4',
    ],
    109465: [
        'False_Neither:NA', 'False_Correct:NA',
        'False_Misconception:Certainty', 'False_Misconception:Scale',
        'True_Correct:NA', 'True_Neither:NA',
    ],
}


def create_submission(predictions, test_data, label_encoder, filter_true_false=True):
    """Turn raw logits into a table of ranked label predictions.

    For every test row the candidate classes are limited to the labels listed
    for its ``QuestionId`` and, when ``filter_true_false`` is set, to the
    ``True_``/``False_`` half that matches ``is_correct``.  The logits of the
    remaining candidates are softmaxed and sorted in descending order.

    Fallbacks (instead of raising or returning an empty prediction):
      * a ``QuestionId`` missing from the table, or one whose labels are all
        unknown to the encoder, is ranked over every encoder class;
      * if the True/False filter would remove every candidate, the unfiltered
        candidates are used.

    Args:
        predictions: Object whose ``predictions`` attribute holds one logit
            vector per test row, in the same order as ``test_data``.
        test_data: DataFrame with the columns ``row_id``, ``QuestionId`` and
            ``is_correct``.
        label_encoder: Fitted encoder exposing ``classes_`` and
            ``inverse_transform``.
        filter_true_false: Whether to restrict candidates by ``is_correct``.

    Returns:
        DataFrame with the columns ``row_id``, ``QuestionId``, ``is_correct``,
        ``probs`` (sorted candidate probabilities), ``preds`` (all candidates,
        best first) and ``Category:Misconception`` (top three labels joined by
        a space).
    """
    classes = np.asarray(label_encoder.classes_)
    all_ids = list(range(len(classes)))

    # Split encoder indices by label prefix (a prefix test cannot be fooled by
    # "True" appearing elsewhere in a label name).
    true_ids = {idx for idx, name in enumerate(classes) if str(name).startswith('True_')}
    false_ids = set(all_ids) - true_ids

    # Translate the per-question label names into encoder indices
    question_ids = {
        qid: [int(i) for i in np.where(np.isin(classes, choices))[0]]
        for qid, choices in _QUESTION_LABEL_CHOICES.items()
    }

    test_probabilities = []
    test_predictions = []
    test_top3_predictions = []

    rows = zip(test_data.QuestionId.tolist(), test_data.is_correct.tolist(), predictions.predictions)
    for qid, correct, logits in rows:
        candidate_idx = question_ids.get(qid) or all_ids

        if filter_true_false:
            if correct == 1:
                narrowed = [c for c in candidate_idx if c in true_ids]
            elif correct == 0:
                narrowed = [c for c in candidate_idx if c in false_ids]
            else:
                narrowed = candidate_idx
            # Never let the filter leave a row without candidates
            candidate_idx = narrowed or candidate_idx

        candidate_logits = logits[candidate_idx]
        candidate_probs = torch.nn.functional.softmax(torch.tensor(candidate_logits), dim=-1).numpy()

        order = np.argsort(-candidate_probs)
        # Map positions within the candidate list back to encoder indices
        ranked_idx = np.array(candidate_idx)[order]
        ranked_labels = label_encoder.inverse_transform(ranked_idx).tolist()

        test_probabilities.append(candidate_probs[order].tolist())
        test_predictions.append(ranked_labels)
        test_top3_predictions.append(" ".join(ranked_labels[:3]))

    return pd.DataFrame({
        "row_id": test_data.row_id.tolist(),
        "QuestionId": test_data.QuestionId.tolist(),
        "is_correct": test_data.is_correct.tolist(),
        "probs": test_probabilities,
        "preds": test_predictions,
        'Category:Misconception': test_top3_predictions
    })


def main():
    """Run inference with the LoRA fine-tuned classifier and save the predictions.

    Steps, in order: load the label encoder, load the base model and apply the
    adapter stored at BEST_MODEL_PATH, make sure a padding token is configured,
    build prompts for the test set (``is_correct`` is derived from the training
    data), run ``Trainer.predict`` and write the table returned by
    ``create_submission`` to SUBMISSION_OUTPUT_PATH.

    Raises:
        ImportError: If ``peft`` could not be imported at module load time.
    """
    
    # Release cached GPU memory and run the garbage collector
    torch.cuda.empty_cache()
    gc.collect()
    
    # Configure the CUDA caching allocator
    # NOTE(review): ``os`` is already imported at module level, so this local
    # import is redundant.
    import os
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    
    # Report when more than one GPU is visible (nothing else is configured here)
    if torch.cuda.device_count() > 1:
        print(f"Found {torch.cuda.device_count()} GPUs")

    print("Loading label encoder...")
    # Load the label encoder; its class count sizes the classification head
    le = joblib.load(LABEL_ENCODER_PATH)
    n_classes = len(le.classes_)

    print("Loading trained model and tokenizer...")

    if PEFT_AVAILABLE:
        # Load the base model and attach the LoRA adapter
        print(f"Loading fine-tuned LoRA model from: {BEST_MODEL_PATH}")
        print(f"Loading base model from: {MODEL_NAME}")

        # Load the base model in float16
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=n_classes,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map="auto",  # distribute across the available GPUs automatically
            low_cpu_mem_usage=True  # reduce CPU memory usage while loading
        )

        # Apply the LoRA adapter
        model = PeftModel.from_pretrained(model, BEST_MODEL_PATH)
        
        # Switch to inference mode
        model.eval()
        # The model was already placed on the GPU(s) at load time, so no .to('cuda') is needed

        # The tokenizer is loaded from the base model directory
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
        print("Successfully loaded LoRA fine-tuned model")
    else:
        # Without PEFT the fine-tuned adapter cannot be loaded
        raise ImportError("PEFT is required to load the fine-tuned model. Please install peft: pip install peft")

    # Fall back to the EOS token when the tokenizer has no padding token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id
    
    # Propagate pad_token_id to the model config (reached through base_model for a PeftModel)
    if hasattr(model, 'base_model'):
        model.base_model.config.pad_token_id = tokenizer.pad_token_id
        # Also set it on the wrapped inner model
        if hasattr(model.base_model, 'model'):
            model.base_model.model.config.pad_token_id = tokenizer.pad_token_id
    else:
        model.config.pad_token_id = tokenizer.pad_token_id

    print("Loading test data...")
    # Load the test data
    test = pd.read_csv(TEST_DATA_PATH)

    print("Loading training data for correct answers...")
    # Derive the correct answer per question from the training data
    # NOTE(review): prepare_correct_answers reads only QuestionId, MC_Answer and
    # Category, so the Misconception fill below does not affect its result.
    train = pd.read_csv(TRAIN_DATA_PATH)
    train.Misconception = train.Misconception.fillna('NA')
    correct = prepare_correct_answers(train)

    print("Preprocessing test data...")
    # Flag test rows: (QuestionId, MC_Answer) pairs absent from ``correct`` get is_correct = 0
    test = test.merge(correct, on=['QuestionId','MC_Answer'], how='left')
    test.is_correct = test.is_correct.fillna(0)
    test['text'] = test.apply(format_input, axis=1)

    print("Tokenizing test data...")
    # Tokenize the prompts
    ds_test = Dataset.from_pandas(test[['text']])
    ds_test = tokenize_dataset(ds_test, tokenizer, MAX_LEN)

    # Collator that pads each batch
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    print("Running inference...")
    
    # Allow TF32 kernels (intended to speed up inference)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    
    # The Trainer is used only as a batched prediction loop
    trainer = Trainer(
        model=model,
        processing_class=tokenizer,  # replaces the former ``tokenizer`` argument
        data_collator=data_collator,  # pads automatically at batch time
        args=TrainingArguments(
            output_dir="./tmp",  # temporary directory (required parameter)
            report_to="none",    # disable wandb
            per_device_eval_batch_size=EVAL_BATCH_SIZE,  # taken from the config constants
            fp16=True,  # use float16
            dataloader_pin_memory=True,  # speed up the data loader
            dataloader_num_workers=2,  # load data in parallel
        )
    )
    # Run prediction inside a no_grad context
    with torch.no_grad():
        predictions = trainer.predict(ds_test)

    print("Creating submission file...")
    # Build the prediction table
    submission = create_submission(predictions, test, le)

    # Save it and print a short preview
    submission.to_csv(SUBMISSION_OUTPUT_PATH, index=False)
    print(f"Submission file saved to: {SUBMISSION_OUTPUT_PATH}")
    print("\nSubmission preview:")
    print(submission.head())
    print(f"\nSubmission shape: {submission.shape}")


if __name__ == "__main__":
    main()

In [None]:
%%writefile qwen3_14b_0_946.py

import os
import gc
import pandas as pd
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset
import joblib
import torch

try:
    from peft import PeftModel, PeftConfig
    PEFT_AVAILABLE = True
except ImportError:
    PEFT_AVAILABLE = False
    print("Warning: PEFT not available, will use base model only")

# Model configuration
# NOTE: the inference code in this file reads only MODEL_NAME, MAX_LEN,
# EVAL_BATCH_SIZE, the data paths, BEST_MODEL_PATH, LABEL_ENCODER_PATH and
# SUBMISSION_OUTPUT_PATH. The other constants look like training settings
# carried along with the checkpoint (presumably to mirror the training
# script; confirm before removing them).
VER = 2
MODEL_NAME = "/kaggle/input/qwen-3/transformers/14b/1"  # base model directory
# NOTE(review): the base model path points at Qwen3 while this says "qwen2";
# the value is not read anywhere in this file.
MODEL_TYPE = "qwen2"
EPOCHS = 3  # training setting; not read by the inference code in this file
MAX_LEN = 250  # maximum number of tokens per prompt; longer prompts are truncated

# Directory settings
OUTPUT_DIR = "/kaggle/input/qwen3-14b-lb0.945-fulltrain/transformers/default/1/ver_2"

# Training parameters
TRAIN_BATCH_SIZE = 4  # training batch size (not read at inference)
EVAL_BATCH_SIZE = 4  # passed to the Trainer as per_device_eval_batch_size
GRADIENT_ACCUMULATION_STEPS = 16  # training setting (not read at inference)
LEARNING_RATE = 2e-4
LOGGING_STEPS = 50
SAVE_STEPS = 200
EVAL_STEPS = 200


# Data paths
TRAIN_DATA_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/train.csv'
TEST_DATA_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/test.csv'

# Model save paths
BEST_MODEL_PATH = f"{OUTPUT_DIR}/checkpoint-1722"  # LoRA adapter checkpoint loaded for inference
LABEL_ENCODER_PATH = f"{OUTPUT_DIR}/label_encoder.joblib"

# Other settings
RANDOM_SEED = 42
VALIDATION_SPLIT = 0.00000001

# GPU settings
# NOTE(review): this value is never exported to os.environ in this file, so it
# does not restrict GPU visibility here.
CUDA_VISIBLE_DEVICES = "0,1"

# Submission settings
SUBMISSION_OUTPUT_PATH = 'qwen3_14b_submission.csv'

# WandB settings (the inference Trainer is created with report_to="none")
USE_WANDB = True  # Set to False to disable WandB
WANDB_PROJECT = "qwen3-14b-math-misconceptions"
WANDB_RUN_NAME = f"qwen3-14b-ver{VER}"
WANDB_ENTITY = None  # Set your WandB entity (username or team name) if needed

# Early stopping settings
USE_EARLY_STOPPING = True
EARLY_STOPPING_PATIENCE = 10  # max evaluations without improvement (evaluation runs every EVAL_STEPS)
EARLY_STOPPING_THRESHOLD = 0.001  # minimum change that counts as an improvement

# LoRA configuration
LORA_RANK = 128  # LoRA rank
LORA_ALPHA = 256  # LoRA scaling parameter
LORA_TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]  # modules that receive adapters
LORA_DROPOUT = 0.1  # LoRA dropout rate
LORA_BIAS = "none"  # bias handling: "none", "all", "lora_only"

# Memory optimization settings
USE_GRADIENT_CHECKPOINTING = True  # Enable gradient checkpointing
USE_8BIT_ADAM = False  # Use 8-bit Adam optimizer for memory efficiency
MAX_GRAD_NORM = 1.0  # Gradient clipping value

def prepare_correct_answers(train_data):
    """Derive the accepted answer of every question from the training labels.

    Rows whose ``Category`` has ``True`` before the first underscore are the
    ones graded as a correct answer.  For each ``QuestionId`` the ``MC_Answer``
    that occurs most often among those rows is kept.

    Args:
        train_data: DataFrame with at least the columns ``QuestionId``,
            ``MC_Answer`` and ``Category``.

    Returns:
        DataFrame with one row per question and the columns ``QuestionId``,
        ``MC_Answer`` and ``is_correct`` (always 1), ready to be left-merged
        onto another frame.
    """
    # Vectorised replacement for a row-wise ``apply``; a missing Category
    # compares unequal and is simply treated as "not a correct answer".
    is_true = train_data['Category'].str.split('_').str[0] == 'True'
    correct = train_data.loc[is_true].copy()

    # Frequency of each (question, answer) pair among the correct rows
    correct['c'] = correct.groupby(['QuestionId', 'MC_Answer']).MC_Answer.transform('count')

    # Most frequent answer first, then keep the first row per question.
    # NOTE: the default sort algorithm is kept on purpose so that ties are
    # resolved exactly as before.
    correct = correct.sort_values('c', ascending=False)
    correct = correct.drop_duplicates(['QuestionId'])[['QuestionId', 'MC_Answer']]
    correct['is_correct'] = 1
    return correct


def format_input(row):
    """Render one sample as the chat-style prompt fed to the classifier.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            ``QuestionText``, ``MC_Answer``, ``StudentExplanation`` and a
            truthy/falsy ``is_correct`` flag.

    Returns:
        The prompt text, wrapped in ``<|im_start|>``/``<|im_end|>`` markers and
        ending with an empty ``<think>`` block in the assistant turn.
    """
    status = "Yes" if row["is_correct"] else "No"

    # Chat-template style prompt for the math misconception task.
    # NOTE(review): there is no newline between "<|im_start|>user" and the
    # task header. It is kept exactly as is, since the fine-tuned adapter
    # presumably saw this format during training; confirm against the
    # training script before changing it.
    pieces = [
        "<|im_start|>user",
        "[Mathematical Misconception Analysis Task]\n\n",
        f"Question: {row['QuestionText']}\n",
        f"Answer: {row['MC_Answer']}\n",
        f"Correct?: {status}\n",
        f"Explanation: {row['StudentExplanation']}\n\n",
        "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n",
    ]
    return "".join(pieces)


def tokenize_dataset(dataset, tokenizer, max_len):
    """Tokenize the ``text`` column and switch the dataset to torch format.

    Args:
        dataset: Dataset object offering ``map``, ``column_names`` and
            ``set_format`` and containing a ``text`` column.
        tokenizer: Callable tokenizer applied to batches of texts.
        max_len: Maximum sequence length; longer inputs are truncated.

    Returns:
        The mapped dataset, formatted as torch tensors restricted to
        ``input_ids``, ``attention_mask`` and, when present, ``label``.
    """
    encode_kwargs = dict(
        padding=False,        # padding is left to the data collator
        truncation=True,
        max_length=max_len,
        return_tensors=None,  # use None when running inside ``map``
    )

    def _encode(batch):
        # Tokenize only; no padding happens at this stage
        return tokenizer(batch['text'], **encode_kwargs)

    dataset = dataset.map(_encode, batched=True, batch_size=100)

    # Keep the label column only if the dataset actually has one
    columns = ['input_ids', 'attention_mask']
    if 'label' in dataset.column_names:
        columns.append('label')
    dataset.set_format(type='torch', columns=columns)
    return dataset


def compute_map3(eval_pred):
    """Compute MAP@3 from logits and integer labels.

    Each sample contributes ``1``, ``1/2`` or ``1/3`` when its label is the
    first, second or third highest-scoring class, and ``0`` otherwise.

    Args:
        eval_pred: Pair ``(logits, labels)`` where ``logits`` holds one score
            vector per sample and ``labels`` the matching class indices.

    Returns:
        ``{"map@3": score}`` with the mean score as a Python float.  An empty
        evaluation set yields ``0.0`` instead of dividing by zero.
    """
    logits, labels = eval_pred
    labels = np.asarray(labels).reshape(-1)
    if labels.size == 0:
        return {"map@3": 0.0}

    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
    # With fewer than three classes the slice is simply shorter; the rank
    # weights below are sized to match, so no out-of-range rank is accessed.
    top3 = np.argsort(-probs, axis=1)[:, :3]

    # Ranks within a row are distinct, so at most one position can match.
    hits = top3 == labels.reshape(-1, 1)
    rank_weights = 1.0 / np.arange(1, top3.shape[1] + 1)
    score = float((hits * rank_weights).sum())
    return {"map@3": score / labels.size}


# Candidate labels allowed for each QuestionId.
# NOTE(review): presumably the labels seen for that question in the training
# data; confirm the origin of this table before extending it.
_QUESTION_LABEL_CHOICES = {
    31772: [
        'True_Correct:NA', 'True_Neither:NA',
        'True_Misconception:Incomplete', 'True_Misconception:WNB',
        'False_Neither:NA', 'False_Misconception:WNB',
        'False_Misconception:Incomplete', 'False_Correct:NA',
    ],
    31774: [
        'False_Neither:NA', 'False_Misconception:SwapDividend',
        'False_Misconception:Mult', 'False_Correct:NA',
        'False_Misconception:FlipChange',
        'True_Correct:NA', 'True_Neither:NA',
        'True_Misconception:SwapDividend', 'True_Misconception:Mult',
        'True_Misconception:FlipChange',
    ],
    31777: [
        'False_Correct:NA', 'False_Misconception:Incomplete',
        'False_Neither:NA', 'False_Misconception:Irrelevant',
        'False_Misconception:Wrong_Fraction',
        'True_Correct:NA', 'True_Neither:NA',
    ],
    31778: [
        'False_Neither:NA', 'False_Misconception:Additive',
        'False_Misconception:Irrelevant', 'False_Correct:NA',
        'False_Misconception:WNB',
        'True_Neither:NA', 'True_Correct:NA',
        'True_Misconception:Irrelevant', 'True_Misconception:Additive',
    ],
    32829: [
        'True_Correct:NA', 'True_Neither:NA',
        'True_Misconception:Not_variable',
        'False_Neither:NA', 'False_Misconception:Adding_terms',
        'False_Correct:NA', 'False_Misconception:Not_variable',
        'False_Misconception:Inverse_operation',
    ],
    32833: [
        'True_Correct:NA', 'True_Neither:NA',
        'True_Misconception:Inversion', 'True_Misconception:Duplication',
        'False_Misconception:Duplication', 'False_Correct:NA',
        'False_Neither:NA', 'False_Misconception:Inversion',
        'False_Misconception:Wrong_Operation',
    ],
    32835: [
        'False_Misconception:Whole_numbers_larger', 'False_Neither:NA',
        'False_Correct:NA', 'False_Misconception:Longer_is_bigger',
        'False_Misconception:Ignores_zeroes',
        'False_Misconception:Shorter_is_bigger',
        'True_Correct:NA', 'True_Neither:NA',
        'True_Misconception:Whole_numbers_larger',
        'True_Misconception:Shorter_is_bigger',
        'True_Misconception:Longer_is_bigger',
    ],
    33471: [
        'True_Neither:NA', 'True_Correct:NA',
        'True_Misconception:Wrong_fraction',
        'False_Correct:NA', 'False_Misconception:Incomplete',
        'False_Neither:NA', 'False_Misconception:Wrong_fraction',
    ],
    33472: [
        'True_Neither:NA', 'True_Correct:NA',
        'True_Misconception:Adding_across',
        'True_Misconception:Denominator-only_change',
        'True_Misconception:Incorrect_equivalent_fraction_addition',
        'False_Correct:NA', 'False_Neither:NA',
        'False_Misconception:Denominator-only_change',
        'False_Misconception:Incorrect_equivalent_fraction_addition',
        'False_Misconception:Adding_across',
    ],
    33474: [
        'True_Correct:NA', 'True_Neither:NA',
        'True_Misconception:Division', 'True_Misconception:Subtraction',
        'False_Neither:NA', 'False_Misconception:Subtraction',
        'False_Misconception:Division', 'False_Correct:NA',
    ],
    76870: [
        'False_Misconception:Unknowable', 'False_Correct:NA',
        'False_Neither:NA', 'False_Misconception:Definition',
        'False_Misconception:Interior',
        'True_Correct:NA', 'True_Neither:NA',
        'True_Misconception:Definition',
    ],
    89443: [
        'False_Neither:NA', 'False_Misconception:Positive',
        'False_Misconception:Tacking',
        'True_Correct:NA', 'True_Neither:NA',
        'True_Misconception:Tacking', 'True_Misconception:Positive',
        'False_Correct:NA',
    ],
    91695: [
        'False_Neither:NA', 'False_Misconception:Wrong_term',
        'False_Correct:NA', 'False_Misconception:Firstterm',
        'True_Correct:NA', 'True_Misconception:Wrong_term',
        'True_Neither:NA', 'True_Misconception:Firstterm',
    ],
    104665: [
        'False_Neither:NA', 'False_Misconception:Base_rate',
        'False_Correct:NA',
        'True_Correct:NA', 'True_Neither:NA',
        'True_Misconception:Base_rate',
        'True_Misconception:Multiplying_by_4',
        'False_Misconception:Multiplying_by_4',
    ],
    109465: [
        'False_Neither:NA', 'False_Correct:NA',
        'False_Misconception:Certainty', 'False_Misconception:Scale',
        'True_Correct:NA', 'True_Neither:NA',
    ],
}


def create_submission(predictions, test_data, label_encoder, filter_true_false=True):
    """Turn raw logits into a table of ranked label predictions.

    For every test row the candidate classes are limited to the labels listed
    for its ``QuestionId`` and, when ``filter_true_false`` is set, to the
    ``True_``/``False_`` half that matches ``is_correct``.  The logits of the
    remaining candidates are softmaxed and sorted in descending order.

    Fallbacks (instead of raising or returning an empty prediction):
      * a ``QuestionId`` missing from the table, or one whose labels are all
        unknown to the encoder, is ranked over every encoder class;
      * if the True/False filter would remove every candidate, the unfiltered
        candidates are used.

    Args:
        predictions: Object whose ``predictions`` attribute holds one logit
            vector per test row, in the same order as ``test_data``.
        test_data: DataFrame with the columns ``row_id``, ``QuestionId`` and
            ``is_correct``.
        label_encoder: Fitted encoder exposing ``classes_`` and
            ``inverse_transform``.
        filter_true_false: Whether to restrict candidates by ``is_correct``.

    Returns:
        DataFrame with the columns ``row_id``, ``QuestionId``, ``is_correct``,
        ``probs`` (sorted candidate probabilities), ``preds`` (all candidates,
        best first) and ``Category:Misconception`` (top three labels joined by
        a space).
    """
    classes = np.asarray(label_encoder.classes_)
    all_ids = list(range(len(classes)))

    # Split encoder indices by label prefix (a prefix test cannot be fooled by
    # "True" appearing elsewhere in a label name).
    true_ids = {idx for idx, name in enumerate(classes) if str(name).startswith('True_')}
    false_ids = set(all_ids) - true_ids

    # Translate the per-question label names into encoder indices
    question_ids = {
        qid: [int(i) for i in np.where(np.isin(classes, choices))[0]]
        for qid, choices in _QUESTION_LABEL_CHOICES.items()
    }

    test_probabilities = []
    test_predictions = []
    test_top3_predictions = []

    rows = zip(test_data.QuestionId.tolist(), test_data.is_correct.tolist(), predictions.predictions)
    for qid, correct, logits in rows:
        candidate_idx = question_ids.get(qid) or all_ids

        if filter_true_false:
            if correct == 1:
                narrowed = [c for c in candidate_idx if c in true_ids]
            elif correct == 0:
                narrowed = [c for c in candidate_idx if c in false_ids]
            else:
                narrowed = candidate_idx
            # Never let the filter leave a row without candidates
            candidate_idx = narrowed or candidate_idx

        candidate_logits = logits[candidate_idx]
        candidate_probs = torch.nn.functional.softmax(torch.tensor(candidate_logits), dim=-1).numpy()

        order = np.argsort(-candidate_probs)
        # Map positions within the candidate list back to encoder indices
        ranked_idx = np.array(candidate_idx)[order]
        ranked_labels = label_encoder.inverse_transform(ranked_idx).tolist()

        test_probabilities.append(candidate_probs[order].tolist())
        test_predictions.append(ranked_labels)
        test_top3_predictions.append(" ".join(ranked_labels[:3]))

    return pd.DataFrame({
        "row_id": test_data.row_id.tolist(),
        "QuestionId": test_data.QuestionId.tolist(),
        "is_correct": test_data.is_correct.tolist(),
        "probs": test_probabilities,
        "preds": test_predictions,
        'Category:Misconception': test_top3_predictions
    })


def main():
    """Run inference with the LoRA fine-tuned classifier and save the predictions.

    Steps, in order: load the label encoder, load the base model and apply the
    adapter stored at BEST_MODEL_PATH, make sure a padding token is configured,
    build prompts for the test set (``is_correct`` is derived from the training
    data), run ``Trainer.predict`` and write the table returned by
    ``create_submission`` to SUBMISSION_OUTPUT_PATH.

    Raises:
        ImportError: If ``peft`` could not be imported at module load time.
    """
    
    # Release cached GPU memory and run the garbage collector
    torch.cuda.empty_cache()
    gc.collect()
    
    # Configure the CUDA caching allocator
    # NOTE(review): ``os`` is already imported at module level, so this local
    # import is redundant.
    import os
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    
    # Report when more than one GPU is visible (nothing else is configured here)
    if torch.cuda.device_count() > 1:
        print(f"Found {torch.cuda.device_count()} GPUs")

    print("Loading label encoder...")
    # Load the label encoder; its class count sizes the classification head
    le = joblib.load(LABEL_ENCODER_PATH)
    n_classes = len(le.classes_)

    print("Loading trained model and tokenizer...")

    if PEFT_AVAILABLE:
        # Load the base model and attach the LoRA adapter
        print(f"Loading fine-tuned LoRA model from: {BEST_MODEL_PATH}")
        print(f"Loading base model from: {MODEL_NAME}")

        # Load the base model in float16
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=n_classes,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map="auto",  # distribute across the available GPUs automatically
            low_cpu_mem_usage=True  # reduce CPU memory usage while loading
        )

        # Apply the LoRA adapter
        model = PeftModel.from_pretrained(model, BEST_MODEL_PATH)
        
        # Switch to inference mode
        model.eval()
        # The model was already placed on the GPU(s) at load time, so no .to('cuda') is needed

        # The tokenizer is loaded from the base model directory
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
        print("Successfully loaded LoRA fine-tuned model")
    else:
        # Without PEFT the fine-tuned adapter cannot be loaded
        raise ImportError("PEFT is required to load the fine-tuned model. Please install peft: pip install peft")

    # Fall back to the EOS token when the tokenizer has no padding token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id
    
    # Propagate pad_token_id to the model config (reached through base_model for a PeftModel)
    if hasattr(model, 'base_model'):
        model.base_model.config.pad_token_id = tokenizer.pad_token_id
        # Also set it on the wrapped inner model
        if hasattr(model.base_model, 'model'):
            model.base_model.model.config.pad_token_id = tokenizer.pad_token_id
    else:
        model.config.pad_token_id = tokenizer.pad_token_id

    print("Loading test data...")
    # Load the test data
    test = pd.read_csv(TEST_DATA_PATH)

    print("Loading training data for correct answers...")
    # Derive the correct answer per question from the training data
    # NOTE(review): prepare_correct_answers reads only QuestionId, MC_Answer and
    # Category, so the Misconception fill below does not affect its result.
    train = pd.read_csv(TRAIN_DATA_PATH)
    train.Misconception = train.Misconception.fillna('NA')
    correct = prepare_correct_answers(train)

    print("Preprocessing test data...")
    # Flag test rows: (QuestionId, MC_Answer) pairs absent from ``correct`` get is_correct = 0
    test = test.merge(correct, on=['QuestionId','MC_Answer'], how='left')
    test.is_correct = test.is_correct.fillna(0)
    test['text'] = test.apply(format_input, axis=1)

    print("Tokenizing test data...")
    # Tokenize the prompts
    ds_test = Dataset.from_pandas(test[['text']])
    ds_test = tokenize_dataset(ds_test, tokenizer, MAX_LEN)

    # Collator that pads each batch
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    print("Running inference...")
    
    # Allow TF32 kernels (intended to speed up inference)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    
    # The Trainer is used only as a batched prediction loop
    trainer = Trainer(
        model=model,
        processing_class=tokenizer,  # replaces the former ``tokenizer`` argument
        data_collator=data_collator,  # pads automatically at batch time
        args=TrainingArguments(
            output_dir="./tmp",  # temporary directory (required parameter)
            report_to="none",    # disable wandb
            per_device_eval_batch_size=EVAL_BATCH_SIZE,  # taken from the config constants
            fp16=True,  # use float16
            dataloader_pin_memory=True,  # speed up the data loader
            dataloader_num_workers=2,  # load data in parallel
        )
    )
    # Run prediction inside a no_grad context
    with torch.no_grad():
        predictions = trainer.predict(ds_test)

    print("Creating submission file...")
    # Build the prediction table
    submission = create_submission(predictions, test, le)

    # Save it and print a short preview
    submission.to_csv(SUBMISSION_OUTPUT_PATH, index=False)
    print(f"Submission file saved to: {SUBMISSION_OUTPUT_PATH}")
    print("\nSubmission preview:")
    print(submission.head())
    print(f"\nSubmission shape: {submission.shape}")


if __name__ == "__main__":
    main()

In [None]:
%%writefile qwen3_32b_0_947.py

import os
import gc
import pandas as pd
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset
import joblib
import torch

try:
    from peft import PeftModel, PeftConfig
    PEFT_AVAILABLE = True
except ImportError:
    PEFT_AVAILABLE = False
    print("Warning: PEFT not available, will use base model only")

# Model configuration
VER = 2
MODEL_NAME = "/kaggle/input/qwen-3/transformers/32b/1"  # Base model directory loaded by main()
MODEL_TYPE = "qwen2"  # NOTE(review): MODEL_NAME points at a Qwen3 checkpoint; this value is not read anywhere in this script
EPOCHS = 3  # Training-only setting; not read by the inference code in this script
MAX_LEN = 300  # Truncation length (in tokens) passed to the tokenizer

# Directory settings
OUTPUT_DIR = f"/kaggle/input/qwen3-32b-9468/transformers/default/1/ver_2"

# Training parameters
TRAIN_BATCH_SIZE = 16  # Training-only setting; not read by this script
EVAL_BATCH_SIZE = 16  # Used as per_device_eval_batch_size during prediction
GRADIENT_ACCUMULATION_STEPS = 2  # Training-only setting; not read by this script
LEARNING_RATE = 2e-4
LOGGING_STEPS = 50
SAVE_STEPS = 200
EVAL_STEPS = 200


# Data paths
TRAIN_DATA_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/train.csv'
TEST_DATA_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/test.csv'

# Model save paths
BEST_MODEL_PATH = f"{OUTPUT_DIR}/best"  # LoRA adapter directory passed to PeftModel.from_pretrained
LABEL_ENCODER_PATH = f"{OUTPUT_DIR}/label_encoder.joblib"  # Label encoder loaded with joblib in main()

# Other settings
RANDOM_SEED = 42
VALIDATION_SPLIT = 0.2

# GPU settings
# NOTE(review): this constant is never exported to the environment in this
# script, so it does not restrict which GPUs are visible.
CUDA_VISIBLE_DEVICES = "0"

# Submission settings
SUBMISSION_OUTPUT_PATH = 'qwen3_32b_0_947_submission.csv'

# WandB settings (not read by this script; main() passes report_to="none")
USE_WANDB = True  # Set to False to disable WandB
WANDB_PROJECT = "qwen3-32b-math-misconceptions"
WANDB_RUN_NAME = f"qwen3-32b-ver{VER}"
WANDB_ENTITY = None  # Set your WandB entity (username or team name) if needed

# Early stopping settings (training-only; not read by this script)
USE_EARLY_STOPPING = True
EARLY_STOPPING_PATIENCE = 10  # Maximum number of evaluations without improvement (evaluation runs every EVAL_STEPS)
EARLY_STOPPING_THRESHOLD = 0.001  # Minimum change that counts as an improvement

# LoRA configuration (training-only; the adapter's own config is loaded from BEST_MODEL_PATH)
LORA_RANK = 16  # LoRA rank
LORA_ALPHA = 32  # LoRA scaling parameter (2 x rank)
LORA_TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]  # Modules targeted by LoRA
LORA_DROPOUT = 0.1  # LoRA dropout rate
LORA_BIAS = "none"  # Bias handling: "none", "all", "lora_only"

# Memory optimization settings (training-only; not read by this script)
USE_GRADIENT_CHECKPOINTING = True  # Enable gradient checkpointing
USE_8BIT_ADAM = True  # Use 8-bit Adam optimizer for memory efficiency
MAX_GRAD_NORM = 1.0  # Gradient clipping value

def prepare_correct_answers(train_data):
    """Derive the correct multiple-choice answer for each question.

    A training row counts as "answered correctly" when the part of its
    ``Category`` label before the first ``_`` is ``True``.  For every
    ``QuestionId`` the ``MC_Answer`` supported by the most such rows is kept.

    Args:
        train_data: DataFrame with ``QuestionId``, ``MC_Answer`` and
            ``Category`` columns.

    Returns:
        DataFrame with the columns ``QuestionId``, ``MC_Answer`` and
        ``is_correct`` (always 1), holding one row for every question that
        has at least one ``True`` row.
    """
    # Vectorized prefix test instead of a row-wise apply.  Rows whose
    # Category is missing are treated as "not correct" rather than raising.
    prefix = train_data['Category'].str.split('_').str[0]
    is_true = prefix.eq('True').fillna(False).astype(bool)
    correct = train_data.loc[is_true].copy()

    # Number of supporting rows per (question, answer) pair.
    correct['c'] = correct.groupby(['QuestionId', 'MC_Answer']).MC_Answer.transform('count')

    # Most frequent answer first, then keep the first row seen per question.
    correct = correct.sort_values('c', ascending=False)
    correct = correct.drop_duplicates(['QuestionId'])[['QuestionId', 'MC_Answer']]
    correct['is_correct'] = 1
    return correct


def format_input(row):
    """Build the chat-style prompt string for one example.

    ``row`` must provide ``is_correct``, ``QuestionText``, ``MC_Answer`` and
    ``StudentExplanation``; a truthy ``is_correct`` is rendered as "Yes",
    anything else as "No".

    NOTE(review): the template (including the absence of a newline after
    ``<|im_start|>user``) presumably mirrors the prompt used when the adapter
    was fine-tuned - confirm before changing a single character.
    """
    verdict = "Yes" if row["is_correct"] else "No"

    segments = [
        "<|im_start|>user",
        "[Mathematical Misconception Analysis Task]\n\n",
        f"Question: {row['QuestionText']}\n",
        f"Answer: {row['MC_Answer']}\n",
        f"Correct?: {verdict}\n",
        f"Explanation: {row['StudentExplanation']}\n\n",
        # Assistant turn opened with an empty think block.
        "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n",
    ]
    return "".join(segments)


def tokenize_dataset(dataset, tokenizer, max_len):
    """Tokenize the ``text`` column of ``dataset`` and expose torch tensors.

    Sequences are truncated to ``max_len`` tokens and left unpadded (padding
    is expected to happen later, per batch).  The returned dataset is
    formatted as torch with ``input_ids`` and ``attention_mask``, plus
    ``label`` when that column exists.
    """
    def _encode(examples):
        # No padding and no tensor conversion here: map() works on plain lists.
        return tokenizer(
            examples['text'],
            padding=False,
            truncation=True,
            max_length=max_len,
            return_tensors=None,
        )

    encoded = dataset.map(_encode, batched=True, batch_size=100)

    tensor_columns = ['input_ids', 'attention_mask']
    if 'label' in encoded.column_names:
        # Keep the label column when the dataset carries one.
        tensor_columns.append('label')

    encoded.set_format(type='torch', columns=tensor_columns)
    return encoded


def compute_map3(eval_pred):
    """Compute MAP@3 from the top-3 predictions.

    Args:
        eval_pred: ``(logits, labels)`` pair; ``logits`` is indexed as
            ``[example, class]`` and ``labels`` holds one class id per
            example.

    Returns:
        ``{"map@3": score}`` where each example contributes 1, 1/2 or 1/3
        when its label is ranked first, second or third, and 0 otherwise.
        An empty evaluation set scores 0.0 instead of dividing by zero.
    """
    logits, labels = eval_pred
    labels = np.asarray(labels)
    if labels.size == 0:
        return {"map@3": 0.0}

    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
    # Slicing keeps at most three columns, so models with fewer than three
    # classes are handled without indexing past the end.
    top3 = np.argsort(-probs, axis=1)[:, :3]

    # Each row has at most one hit because argsort yields distinct class ids.
    hits = top3 == labels[:, None]
    rank_weights = 1.0 / np.arange(1, top3.shape[1] + 1)
    score = float((hits * rank_weights).sum())
    return {"map@3": score / len(labels)}


def create_submission(predictions, test_data, label_encoder, filter_true_false=True):
    """Rank the admissible labels of every test row and build the submission table.

    For each row the logits are restricted to the labels listed for its
    ``QuestionId`` and, when ``filter_true_false`` is set, further to the
    ``True_*`` labels (``is_correct == 1``) or the ``False_*`` labels
    (``is_correct == 0``).  A softmax over the remaining logits gives the
    ranking.

    Args:
        predictions: Object whose ``predictions`` attribute holds one row of
            class logits per test row, in label-encoder order.
        test_data: DataFrame with ``row_id``, ``QuestionId`` and
            ``is_correct`` columns, aligned with ``predictions``.
        label_encoder: Fitted encoder exposing ``classes_`` and
            ``inverse_transform``.
        filter_true_false: Whether to narrow candidates by ``is_correct``.

    Returns:
        DataFrame with ``row_id``, ``QuestionId``, ``is_correct``, ``probs``
        (ranked probabilities), ``preds`` (ranked labels) and
        ``Category:Misconception`` (top-3 labels joined by spaces).
    """
    # Labels that are admissible for each known question.
    question_label_choices = {
        31772: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Incomplete', 'True_Misconception:WNB',
            'False_Neither:NA', 'False_Misconception:WNB',
            'False_Misconception:Incomplete', 'False_Correct:NA',
        ],
        31774: [
            'False_Neither:NA', 'False_Misconception:SwapDividend',
            'False_Misconception:Mult', 'False_Correct:NA',
            'False_Misconception:FlipChange', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:SwapDividend',
            'True_Misconception:Mult', 'True_Misconception:FlipChange',
        ],
        31777: [
            'False_Correct:NA', 'False_Misconception:Incomplete',
            'False_Neither:NA', 'False_Misconception:Irrelevant',
            'False_Misconception:Wrong_Fraction', 'True_Correct:NA',
            'True_Neither:NA',
        ],
        31778: [
            'False_Neither:NA', 'False_Misconception:Additive',
            'False_Misconception:Irrelevant', 'False_Correct:NA',
            'False_Misconception:WNB', 'True_Neither:NA',
            'True_Correct:NA', 'True_Misconception:Irrelevant',
            'True_Misconception:Additive',
        ],
        32829: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Not_variable', 'False_Neither:NA',
            'False_Misconception:Adding_terms', 'False_Correct:NA',
            'False_Misconception:Not_variable', 'False_Misconception:Inverse_operation',
        ],
        32833: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Inversion', 'True_Misconception:Duplication',
            'False_Misconception:Duplication', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Inversion',
            'False_Misconception:Wrong_Operation',
        ],
        32835: [
            'False_Misconception:Whole_numbers_larger', 'False_Neither:NA',
            'False_Correct:NA', 'False_Misconception:Longer_is_bigger',
            'False_Misconception:Ignores_zeroes', 'False_Misconception:Shorter_is_bigger',
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Whole_numbers_larger', 'True_Misconception:Shorter_is_bigger',
            'True_Misconception:Longer_is_bigger',
        ],
        33471: [
            'True_Neither:NA', 'True_Correct:NA',
            'True_Misconception:Wrong_fraction', 'False_Correct:NA',
            'False_Misconception:Incomplete', 'False_Neither:NA',
            'False_Misconception:Wrong_fraction',
        ],
        33472: [
            'True_Neither:NA', 'True_Correct:NA',
            'True_Misconception:Adding_across', 'True_Misconception:Denominator-only_change',
            'True_Misconception:Incorrect_equivalent_fraction_addition', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Denominator-only_change',
            'False_Misconception:Incorrect_equivalent_fraction_addition',
            'False_Misconception:Adding_across',
        ],
        33474: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Division', 'True_Misconception:Subtraction',
            'False_Neither:NA', 'False_Misconception:Subtraction',
            'False_Misconception:Division', 'False_Correct:NA',
        ],
        76870: [
            'False_Misconception:Unknowable', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Definition',
            'False_Misconception:Interior', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Definition',
        ],
        89443: [
            'False_Neither:NA', 'False_Misconception:Positive',
            'False_Misconception:Tacking', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Tacking',
            'True_Misconception:Positive', 'False_Correct:NA',
        ],
        91695: [
            'False_Neither:NA', 'False_Misconception:Wrong_term',
            'False_Correct:NA', 'False_Misconception:Firstterm',
            'True_Correct:NA', 'True_Misconception:Wrong_term',
            'True_Neither:NA', 'True_Misconception:Firstterm',
        ],
        104665: [
            'False_Neither:NA', 'False_Misconception:Base_rate',
            'False_Correct:NA', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Base_rate',
            'True_Misconception:Multiplying_by_4', 'False_Misconception:Multiplying_by_4',
        ],
        109465: [
            'False_Neither:NA', 'False_Correct:NA',
            'False_Misconception:Certainty', 'False_Misconception:Scale',
            'True_Correct:NA', 'True_Neither:NA',
        ],
    }

    classes = np.asarray(label_encoder.classes_)
    all_label_ids = list(range(len(classes)))

    # Split encoder ids by the True/False prefix of the label (the part before
    # the first "_"), not by a substring match anywhere in the label.
    true_classes = {
        idx for idx, name in enumerate(classes)
        if str(name).split('_')[0] == 'True'
    }
    false_classes = set(all_label_ids) - true_classes

    # Translate the per-question label names into label-encoder ids.
    question_label_choice_ids = {
        qid: [int(i) for i in np.where(np.isin(classes, choices))[0]]
        for qid, choices in question_label_choices.items()
    }

    test_probabilities = []
    test_predictions = []
    test_top3_predictions = []

    rows = zip(
        test_data.QuestionId.tolist(),
        test_data.is_correct.tolist(),
        predictions.predictions,
    )
    for qid, correct, row in rows:
        # Questions missing from the table (or whose labels are all unknown to
        # the encoder) are ranked over every label instead of raising KeyError.
        candidate_idx = question_label_choice_ids.get(qid) or all_label_ids

        if filter_true_false:
            if correct == 1:
                narrowed = [c for c in candidate_idx if c in true_classes]
            elif correct == 0:
                narrowed = [c for c in candidate_idx if c in false_classes]
            else:
                narrowed = candidate_idx
            # Keep the unfiltered candidates if the filter would leave nothing
            # to rank; an empty prediction is never useful.
            candidate_idx = narrowed or candidate_idx

        candidate_logits = row[candidate_idx]
        candidate_probs = torch.nn.functional.softmax(
            torch.tensor(candidate_logits), dim=-1
        ).numpy()

        # Positions within the candidate list, best first.
        top_k = np.argsort(-candidate_probs)

        # Map candidate positions back to label-encoder ids, then to names.
        topk_idx = np.array(candidate_idx)[top_k]
        topk_probs = candidate_probs[top_k].tolist()
        topk_preds = label_encoder.inverse_transform(topk_idx).tolist()

        test_probabilities.append(topk_probs)
        test_predictions.append(topk_preds)
        test_top3_predictions.append(" ".join(topk_preds[:3]))

    test_submission_data = pd.DataFrame({
        "row_id": test_data.row_id.tolist(),
        "QuestionId": test_data.QuestionId.tolist(),
        "is_correct": test_data.is_correct.tolist(),
        "probs": test_probabilities,
        "preds": test_predictions,
        'Category:Misconception': test_top3_predictions
    })

    return test_submission_data


def main() -> None:
    """Run test-set inference with the LoRA fine-tuned classifier and save a submission CSV.

    Steps: load the label encoder, load the 4-bit quantized base model and
    attach the LoRA adapter, build prompts for the test rows, predict logits
    through a Hugging Face ``Trainer`` and write the table returned by
    ``create_submission`` to ``SUBMISSION_OUTPUT_PATH``.

    Raises:
        ImportError: If ``peft`` could not be imported at module load time.
    """

    # Release cached GPU memory and run the garbage collector before loading
    torch.cuda.empty_cache()
    gc.collect()

    # Configure the CUDA caching allocator
    # NOTE(review): this is set after torch has been imported - confirm the
    # allocator still picks it up. The local import duplicates the module-level one.
    import os
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    # Report when more than one GPU is visible (placement is left to device_map below)
    if torch.cuda.device_count() > 1:
        print(f"Found {torch.cuda.device_count()} GPUs")

    print("Loading label encoder...")
    # Load the label encoder; its class count sizes the classification head
    le = joblib.load(LABEL_ENCODER_PATH)
    n_classes = len(le.classes_)

    print("Loading trained model and tokenizer...")

    if PEFT_AVAILABLE:
        # Load the base model and attach the LoRA adapter
        print(f"Loading fine-tuned LoRA model from: {BEST_MODEL_PATH}")
        print(f"Loading base model from: {MODEL_NAME}")

        # Load the base model with 4-bit (NF4, double-quantized) weights
        from transformers import BitsAndBytesConfig

        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )

        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=n_classes,
            trust_remote_code=True,
            quantization_config=quantization_config,
            device_map="auto",  # spread the model across the available GPUs automatically
            low_cpu_mem_usage=True  # reduce CPU memory usage while loading
        )

        # Apply the LoRA adapter
        model = PeftModel.from_pretrained(model, BEST_MODEL_PATH)

        # Switch to evaluation mode
        model.eval()
        # The 4-bit quantized model is already placed on GPU, so no .to('cuda') is needed

        # The tokenizer is loaded from the base model directory
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
        print("Successfully loaded LoRA fine-tuned model")
    else:
        # Without PEFT the adapter cannot be loaded, so fail loudly
        raise ImportError("PEFT is required to load the fine-tuned model. Please install peft: pip install peft")

    # Fall back to the EOS token when the tokenizer defines no padding token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    # Propagate the pad token id to the model config (reached through the PeftModel's base_model)
    if hasattr(model, 'base_model'):
        model.base_model.config.pad_token_id = tokenizer.pad_token_id
        # Also set it on the wrapped inner model
        if hasattr(model.base_model, 'model'):
            model.base_model.model.config.pad_token_id = tokenizer.pad_token_id
    else:
        model.config.pad_token_id = tokenizer.pad_token_id

    print("Loading test data...")
    # Load the test data
    test = pd.read_csv(TEST_DATA_PATH)

    print("Loading training data for correct answers...")
    # Derive the correct answer per question from the training data
    train = pd.read_csv(TRAIN_DATA_PATH)
    train.Misconception = train.Misconception.fillna('NA')
    correct = prepare_correct_answers(train)

    print("Preprocessing test data...")
    # Flag test rows whose answer matches the known correct answer (0 when unmatched),
    # then render the prompt text
    test = test.merge(correct, on=['QuestionId','MC_Answer'], how='left')
    test.is_correct = test.is_correct.fillna(0)
    test['text'] = test.apply(format_input, axis=1)

    print("Tokenizing test data...")
    # Tokenize the test prompts
    ds_test = Dataset.from_pandas(test[['text']])
    ds_test = tokenize_dataset(ds_test, tokenizer, MAX_LEN)

    # Data collator that pads each batch
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    print("Running inference...")

    # Allow TF32 matmul / cuDNN kernels (intended to speed up inference)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # Run inference through a Trainer
    trainer = Trainer(
        model=model,
        processing_class=tokenizer,  # replaces the older `tokenizer` argument
        data_collator=data_collator,  # pads automatically when batching
        args=TrainingArguments(
            output_dir="./tmp",  # scratch directory (required parameter)
            report_to="none",    # disable logging integrations such as wandb
            per_device_eval_batch_size=EVAL_BATCH_SIZE,  # taken from the module-level config
            fp16=True,  # use float16
            dataloader_pin_memory=True,  # pin host memory for the data loader
            dataloader_num_workers=2,  # load data with two worker processes
        )
    )
    # Predict inside a no_grad context so no autograd state is kept
    with torch.no_grad():
        predictions = trainer.predict(ds_test)

    print("Creating submission file...")
    # Build the submission table
    submission = create_submission(predictions, test, le)

    # Save it to disk
    submission.to_csv(SUBMISSION_OUTPUT_PATH, index=False)
    print(f"Submission file saved to: {SUBMISSION_OUTPUT_PATH}")
    print("\nSubmission preview:")
    print(submission.head())
    print(f"\nSubmission shape: {submission.shape}")


# Run the inference pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()

In [None]:
%%writefile phi_4_0_948_fulltrain.py

import os
import gc
import pandas as pd
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset
import joblib
import torch

try:
    from peft import PeftModel, PeftConfig
    PEFT_AVAILABLE = True
except ImportError:
    PEFT_AVAILABLE = False
    print("Warning: PEFT not available, will use base model only")

# Model configuration
VER = 2
MODEL_NAME = "/kaggle/input/ms-phi4/transformers/default/1/phi-4"  # Base model directory loaded by main()
MODEL_TYPE = "phi"  # Phi-4 model type (not read anywhere in this script)
EPOCHS = 3  # Training-only setting; not read by the inference code in this script
MAX_LEN = 250  # Truncation length (in tokens) passed to the tokenizer

# Directory settings
OUTPUT_DIR = f"/kaggle/input/phi-4-cv0965-fulltrain/transformers/default/1/ver_2_0965ft"

# Training parameters
TRAIN_BATCH_SIZE = 4  # Training-only setting; not read by this script
EVAL_BATCH_SIZE = 4  # Used as per_device_eval_batch_size during prediction
GRADIENT_ACCUMULATION_STEPS = 16  # Training-only setting; not read by this script
LEARNING_RATE = 2e-4
LOGGING_STEPS = 50
SAVE_STEPS = 200
EVAL_STEPS = 200


# Data paths
TRAIN_DATA_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/train.csv'
TEST_DATA_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/test.csv'

# Model save paths
BEST_MODEL_PATH = f"{OUTPUT_DIR}/checkpoint-1722"  # LoRA adapter directory passed to PeftModel.from_pretrained
LABEL_ENCODER_PATH = f"{OUTPUT_DIR}/label_encoder.joblib"  # Label encoder loaded with joblib in main()

# Other settings
RANDOM_SEED = 42
VALIDATION_SPLIT = 0.2

# GPU settings
# NOTE(review): this constant is never exported to the environment in this
# script, so it does not restrict which GPUs are visible.
CUDA_VISIBLE_DEVICES = "0"

# Submission settings
SUBMISSION_OUTPUT_PATH = 'phi_4_0_948_fulltrain_submission.csv'

# WandB settings (not read by this script; main() passes report_to="none")
USE_WANDB = True  # Set to False to disable WandB
WANDB_PROJECT = "phi-4-math-misconceptions"
WANDB_RUN_NAME = f"phi-4-ver{VER}"
WANDB_ENTITY = None  # Set your WandB entity (username or team name) if needed

# Early stopping settings (training-only; not read by this script)
USE_EARLY_STOPPING = True
EARLY_STOPPING_PATIENCE = 10  # Maximum number of evaluations without improvement (evaluation runs every EVAL_STEPS)
EARLY_STOPPING_THRESHOLD = 0.001  # Minimum change that counts as an improvement

# LoRA configuration for Phi-4 (training-only; the adapter's own config is loaded from BEST_MODEL_PATH)
LORA_RANK = 64  # LoRA rank
LORA_ALPHA = 128  # LoRA scaling parameter (2 x rank)
LORA_TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]  # Modules targeted by LoRA
LORA_DROPOUT = 0.1  # LoRA dropout rate
LORA_BIAS = "none"  # Bias handling: "none", "all", "lora_only"

# Memory optimization settings (training-only; not read by this script)
USE_GRADIENT_CHECKPOINTING = True  # Enable gradient checkpointing
USE_8BIT_ADAM = False  # 8-bit Adam optimizer is disabled
MAX_GRAD_NORM = 1.0  # Gradient clipping value

def prepare_correct_answers(train_data):
    """Derive the correct multiple-choice answer for each question.

    A training row counts as "answered correctly" when the part of its
    ``Category`` label before the first ``_`` is ``True``.  For every
    ``QuestionId`` the ``MC_Answer`` supported by the most such rows is kept.

    Args:
        train_data: DataFrame with ``QuestionId``, ``MC_Answer`` and
            ``Category`` columns.

    Returns:
        DataFrame with the columns ``QuestionId``, ``MC_Answer`` and
        ``is_correct`` (always 1), holding one row for every question that
        has at least one ``True`` row.
    """
    # Vectorized prefix test instead of a row-wise apply.  Rows whose
    # Category is missing are treated as "not correct" rather than raising.
    prefix = train_data['Category'].str.split('_').str[0]
    is_true = prefix.eq('True').fillna(False).astype(bool)
    correct = train_data.loc[is_true].copy()

    # Number of supporting rows per (question, answer) pair.
    correct['c'] = correct.groupby(['QuestionId', 'MC_Answer']).MC_Answer.transform('count')

    # Most frequent answer first, then keep the first row seen per question.
    correct = correct.sort_values('c', ascending=False)
    correct = correct.drop_duplicates(['QuestionId'])[['QuestionId', 'MC_Answer']]
    correct['is_correct'] = 1
    return correct


def format_input(row):
    """Build the Phi-4 style prompt string for one example.

    ``row`` must provide ``is_correct``, ``QuestionText``, ``MC_Answer`` and
    ``StudentExplanation``; a truthy ``is_correct`` is rendered as "Yes",
    anything else as "No".  The assistant turn is pre-filled with a fixed
    ``<think>`` block.

    NOTE(review): the template presumably mirrors the prompt used when the
    adapter was fine-tuned - confirm before changing a single character.
    """
    verdict = "Yes" if row["is_correct"] else "No"

    user_turn = "".join([
        "<|user|>\n",
        "[Mathematical Misconception Analysis Task]\n\n",
        f"Question: {row['QuestionText']}\n",
        f"Answer: {row['MC_Answer']}\n",
        f"Correct?: {verdict}\n",
        f"Explanation: {row['StudentExplanation']}\n",
        "<|end|>\n",
    ])
    assistant_prefix = "".join([
        "<|assistant|>\n",
        "<think>\n",
        "Let me analyze this mathematical misconception...\n",
        "</think>\n\n",
    ])
    return user_turn + assistant_prefix


def tokenize_dataset(dataset, tokenizer, max_len):
    """Tokenize the ``text`` column of ``dataset`` and expose torch tensors.

    Sequences are truncated to ``max_len`` tokens and left unpadded (padding
    is expected to happen later, per batch).  The returned dataset is
    formatted as torch with ``input_ids`` and ``attention_mask``, plus
    ``label`` when that column exists.
    """
    def _encode(examples):
        # No padding and no tensor conversion here: map() works on plain lists.
        return tokenizer(
            examples['text'],
            padding=False,
            truncation=True,
            max_length=max_len,
            return_tensors=None,
        )

    encoded = dataset.map(_encode, batched=True, batch_size=100)

    tensor_columns = ['input_ids', 'attention_mask']
    if 'label' in encoded.column_names:
        # Keep the label column when the dataset carries one.
        tensor_columns.append('label')

    encoded.set_format(type='torch', columns=tensor_columns)
    return encoded


def compute_map3(eval_pred):
    """Compute MAP@3 from the top-3 predictions.

    Args:
        eval_pred: ``(logits, labels)`` pair; ``logits`` is indexed as
            ``[example, class]`` and ``labels`` holds one class id per
            example.

    Returns:
        ``{"map@3": score}`` where each example contributes 1, 1/2 or 1/3
        when its label is ranked first, second or third, and 0 otherwise.
        An empty evaluation set scores 0.0 instead of dividing by zero.
    """
    logits, labels = eval_pred
    labels = np.asarray(labels)
    if labels.size == 0:
        return {"map@3": 0.0}

    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
    # Slicing keeps at most three columns, so models with fewer than three
    # classes are handled without indexing past the end.
    top3 = np.argsort(-probs, axis=1)[:, :3]

    # Each row has at most one hit because argsort yields distinct class ids.
    hits = top3 == labels[:, None]
    rank_weights = 1.0 / np.arange(1, top3.shape[1] + 1)
    score = float((hits * rank_weights).sum())
    return {"map@3": score / len(labels)}


def create_submission(predictions, test_data, label_encoder, filter_true_false=True):
    """Rank the admissible labels of every test row and build the submission table.

    For each row the logits are restricted to the labels listed for its
    ``QuestionId`` and, when ``filter_true_false`` is set, further to the
    ``True_*`` labels (``is_correct == 1``) or the ``False_*`` labels
    (``is_correct == 0``).  A softmax over the remaining logits gives the
    ranking.

    Args:
        predictions: Object whose ``predictions`` attribute holds one row of
            class logits per test row, in label-encoder order.
        test_data: DataFrame with ``row_id``, ``QuestionId`` and
            ``is_correct`` columns, aligned with ``predictions``.
        label_encoder: Fitted encoder exposing ``classes_`` and
            ``inverse_transform``.
        filter_true_false: Whether to narrow candidates by ``is_correct``.

    Returns:
        DataFrame with ``row_id``, ``QuestionId``, ``is_correct``, ``probs``
        (ranked probabilities), ``preds`` (ranked labels) and
        ``Category:Misconception`` (top-3 labels joined by spaces).
    """
    # Labels that are admissible for each known question.
    question_label_choices = {
        31772: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Incomplete', 'True_Misconception:WNB',
            'False_Neither:NA', 'False_Misconception:WNB',
            'False_Misconception:Incomplete', 'False_Correct:NA',
        ],
        31774: [
            'False_Neither:NA', 'False_Misconception:SwapDividend',
            'False_Misconception:Mult', 'False_Correct:NA',
            'False_Misconception:FlipChange', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:SwapDividend',
            'True_Misconception:Mult', 'True_Misconception:FlipChange',
        ],
        31777: [
            'False_Correct:NA', 'False_Misconception:Incomplete',
            'False_Neither:NA', 'False_Misconception:Irrelevant',
            'False_Misconception:Wrong_Fraction', 'True_Correct:NA',
            'True_Neither:NA',
        ],
        31778: [
            'False_Neither:NA', 'False_Misconception:Additive',
            'False_Misconception:Irrelevant', 'False_Correct:NA',
            'False_Misconception:WNB', 'True_Neither:NA',
            'True_Correct:NA', 'True_Misconception:Irrelevant',
            'True_Misconception:Additive',
        ],
        32829: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Not_variable', 'False_Neither:NA',
            'False_Misconception:Adding_terms', 'False_Correct:NA',
            'False_Misconception:Not_variable', 'False_Misconception:Inverse_operation',
        ],
        32833: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Inversion', 'True_Misconception:Duplication',
            'False_Misconception:Duplication', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Inversion',
            'False_Misconception:Wrong_Operation',
        ],
        32835: [
            'False_Misconception:Whole_numbers_larger', 'False_Neither:NA',
            'False_Correct:NA', 'False_Misconception:Longer_is_bigger',
            'False_Misconception:Ignores_zeroes', 'False_Misconception:Shorter_is_bigger',
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Whole_numbers_larger', 'True_Misconception:Shorter_is_bigger',
            'True_Misconception:Longer_is_bigger',
        ],
        33471: [
            'True_Neither:NA', 'True_Correct:NA',
            'True_Misconception:Wrong_fraction', 'False_Correct:NA',
            'False_Misconception:Incomplete', 'False_Neither:NA',
            'False_Misconception:Wrong_fraction',
        ],
        33472: [
            'True_Neither:NA', 'True_Correct:NA',
            'True_Misconception:Adding_across', 'True_Misconception:Denominator-only_change',
            'True_Misconception:Incorrect_equivalent_fraction_addition', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Denominator-only_change',
            'False_Misconception:Incorrect_equivalent_fraction_addition',
            'False_Misconception:Adding_across',
        ],
        33474: [
            'True_Correct:NA', 'True_Neither:NA',
            'True_Misconception:Division', 'True_Misconception:Subtraction',
            'False_Neither:NA', 'False_Misconception:Subtraction',
            'False_Misconception:Division', 'False_Correct:NA',
        ],
        76870: [
            'False_Misconception:Unknowable', 'False_Correct:NA',
            'False_Neither:NA', 'False_Misconception:Definition',
            'False_Misconception:Interior', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Definition',
        ],
        89443: [
            'False_Neither:NA', 'False_Misconception:Positive',
            'False_Misconception:Tacking', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Tacking',
            'True_Misconception:Positive', 'False_Correct:NA',
        ],
        91695: [
            'False_Neither:NA', 'False_Misconception:Wrong_term',
            'False_Correct:NA', 'False_Misconception:Firstterm',
            'True_Correct:NA', 'True_Misconception:Wrong_term',
            'True_Neither:NA', 'True_Misconception:Firstterm',
        ],
        104665: [
            'False_Neither:NA', 'False_Misconception:Base_rate',
            'False_Correct:NA', 'True_Correct:NA',
            'True_Neither:NA', 'True_Misconception:Base_rate',
            'True_Misconception:Multiplying_by_4', 'False_Misconception:Multiplying_by_4',
        ],
        109465: [
            'False_Neither:NA', 'False_Correct:NA',
            'False_Misconception:Certainty', 'False_Misconception:Scale',
            'True_Correct:NA', 'True_Neither:NA',
        ],
    }

    classes = np.asarray(label_encoder.classes_)
    all_label_ids = list(range(len(classes)))

    # Split encoder ids by the True/False prefix of the label (the part before
    # the first "_"), not by a substring match anywhere in the label.
    true_classes = {
        idx for idx, name in enumerate(classes)
        if str(name).split('_')[0] == 'True'
    }
    false_classes = set(all_label_ids) - true_classes

    # Translate the per-question label names into label-encoder ids.
    question_label_choice_ids = {
        qid: [int(i) for i in np.where(np.isin(classes, choices))[0]]
        for qid, choices in question_label_choices.items()
    }

    test_probabilities = []
    test_predictions = []
    test_top3_predictions = []

    rows = zip(
        test_data.QuestionId.tolist(),
        test_data.is_correct.tolist(),
        predictions.predictions,
    )
    for qid, correct, row in rows:
        # Questions missing from the table (or whose labels are all unknown to
        # the encoder) are ranked over every label instead of raising KeyError.
        candidate_idx = question_label_choice_ids.get(qid) or all_label_ids

        if filter_true_false:
            if correct == 1:
                narrowed = [c for c in candidate_idx if c in true_classes]
            elif correct == 0:
                narrowed = [c for c in candidate_idx if c in false_classes]
            else:
                narrowed = candidate_idx
            # Keep the unfiltered candidates if the filter would leave nothing
            # to rank; an empty prediction is never useful.
            candidate_idx = narrowed or candidate_idx

        candidate_logits = row[candidate_idx]
        candidate_probs = torch.nn.functional.softmax(
            torch.tensor(candidate_logits), dim=-1
        ).numpy()

        # Positions within the candidate list, best first.
        top_k = np.argsort(-candidate_probs)

        # Map candidate positions back to label-encoder ids, then to names.
        topk_idx = np.array(candidate_idx)[top_k]
        topk_probs = candidate_probs[top_k].tolist()
        topk_preds = label_encoder.inverse_transform(topk_idx).tolist()

        test_probabilities.append(topk_probs)
        test_predictions.append(topk_preds)
        test_top3_predictions.append(" ".join(topk_preds[:3]))

    test_submission_data = pd.DataFrame({
        "row_id": test_data.row_id.tolist(),
        "QuestionId": test_data.QuestionId.tolist(),
        "is_correct": test_data.is_correct.tolist(),
        "probs": test_probabilities,
        "preds": test_predictions,
        'Category:Misconception': test_top3_predictions
    })

    return test_submission_data


def main() -> None:
    """Run test-set inference with the LoRA fine-tuned Phi-4 classifier and save a submission CSV.

    Steps: load the label encoder, load the float16 base model and attach the
    LoRA adapter, build prompts for the test rows, predict logits through a
    Hugging Face ``Trainer`` and write the table returned by
    ``create_submission`` to ``SUBMISSION_OUTPUT_PATH``.

    Raises:
        ImportError: If ``peft`` could not be imported at module load time.
    """
    
    # Release cached GPU memory and run the garbage collector before loading
    torch.cuda.empty_cache()
    gc.collect()
    
    # Configure the CUDA caching allocator
    # NOTE(review): this is set after torch has been imported - confirm the
    # allocator still picks it up. The local import duplicates the module-level one.
    import os
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    
    # Report when more than one GPU is visible (placement is left to device_map below)
    if torch.cuda.device_count() > 1:
        print(f"Found {torch.cuda.device_count()} GPUs")

    print("Loading label encoder...")
    # Load the label encoder; its class count sizes the classification head
    le = joblib.load(LABEL_ENCODER_PATH)
    n_classes = len(le.classes_)

    print("Loading trained model and tokenizer...")

    if PEFT_AVAILABLE:
        # Load the base model and attach the LoRA adapter
        print(f"Loading fine-tuned LoRA model from: {BEST_MODEL_PATH}")
        print(f"Loading base model from: {MODEL_NAME}")

        # Load the base model without quantization, with float16 weights
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=n_classes,
            trust_remote_code=True,
            device_map="auto",  # spread the model across the available GPUs automatically
            torch_dtype=torch.float16,  # float16 weights (trade-off between memory and performance)
            low_cpu_mem_usage=True  # reduce CPU memory usage while loading
        )

        # Apply the LoRA adapter
        model = PeftModel.from_pretrained(model, BEST_MODEL_PATH)
        
        # Switch to evaluation mode
        model.eval()
        # The model is already placed on GPU by device_map, so no .to('cuda') is needed

        # The tokenizer is loaded from the base model directory
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
        print("Successfully loaded LoRA fine-tuned model")
    else:
        # Without PEFT the adapter cannot be loaded, so fail loudly
        raise ImportError("PEFT is required to load the fine-tuned model. Please install peft: pip install peft")

    # Fallback padding token, used only when the tokenizer defines none
    # NOTE(review): the token string and the hard-coded id 100257 are set
    # independently - confirm that they refer to the same vocabulary entry.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = "<|finetune_right_pad_id|>"
        tokenizer.pad_token_id = 100257
    
    # Propagate the pad token id to the model config (reached through the PeftModel's base_model)
    if hasattr(model, 'base_model'):
        model.base_model.config.pad_token_id = tokenizer.pad_token_id
        # Also set it on the wrapped inner model
        if hasattr(model.base_model, 'model'):
            model.base_model.model.config.pad_token_id = tokenizer.pad_token_id
    else:
        model.config.pad_token_id = tokenizer.pad_token_id

    print("Loading test data...")
    # Load the test data
    test = pd.read_csv(TEST_DATA_PATH)

    print("Loading training data for correct answers...")
    # Derive the correct answer per question from the training data
    train = pd.read_csv(TRAIN_DATA_PATH)
    train.Misconception = train.Misconception.fillna('NA')
    correct = prepare_correct_answers(train)

    print("Preprocessing test data...")
    # Flag test rows whose answer matches the known correct answer (0 when unmatched),
    # then render the prompt text
    test = test.merge(correct, on=['QuestionId','MC_Answer'], how='left')
    test.is_correct = test.is_correct.fillna(0)
    test['text'] = test.apply(format_input, axis=1)

    print("Tokenizing test data...")
    # Tokenize the test prompts
    ds_test = Dataset.from_pandas(test[['text']])
    ds_test = tokenize_dataset(ds_test, tokenizer, MAX_LEN)

    # Data collator that pads each batch
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    print("Running inference...")
    
    # Allow TF32 matmul / cuDNN kernels (intended to speed up inference)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    
    # Run inference through a Trainer
    trainer = Trainer(
        model=model,
        processing_class=tokenizer,  # replaces the older `tokenizer` argument
        data_collator=data_collator,  # pads automatically when batching
        args=TrainingArguments(
            output_dir="./tmp",  # scratch directory (required parameter)
            report_to="none",    # disable logging integrations such as wandb
            per_device_eval_batch_size=EVAL_BATCH_SIZE,  # taken from the module-level config
            fp16=True,  # use float16
            dataloader_pin_memory=True,  # pin host memory for the data loader
            dataloader_num_workers=2,  # load data with two worker processes
        )
    )
    # Predict inside a no_grad context so no autograd state is kept
    with torch.no_grad():
        predictions = trainer.predict(ds_test)

    print("Creating submission file...")
    # Build the submission table
    submission = create_submission(predictions, test, le)

    # Save it to disk
    submission.to_csv(SUBMISSION_OUTPUT_PATH, index=False)
    print(f"Submission file saved to: {SUBMISSION_OUTPUT_PATH}")
    print("\nSubmission preview:")
    print(submission.head())
    print(f"\nSubmission shape: {submission.shape}")


# Run the inference pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()

In [None]:
%%writefile phi_4_reasoning_0_948.py

import os
import gc
import pandas as pd
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset
import joblib
import torch

try:
    from peft import PeftModel, PeftConfig
    PEFT_AVAILABLE = True
except ImportError:
    PEFT_AVAILABLE = False
    print("Warning: PEFT not available, will use base model only")

# Model configuration
# NOTE: this file is the inference script; of the constants below, main() and
# its helpers only read MODEL_NAME, MAX_LEN, EVAL_BATCH_SIZE, the data paths,
# BEST_MODEL_PATH, LABEL_ENCODER_PATH and SUBMISSION_OUTPUT_PATH. The rest are
# training-time settings that are not referenced anywhere in this script.
VER = 2
MODEL_NAME = "/kaggle/input/phi4-reasoning-plus/transformers/default/1/Phi-4-reasoning-plus"
MODEL_TYPE = "phi"  # Phi-4 model type
EPOCHS = 3  # Training-time setting (unused here)
MAX_LEN = 250  # Passed to the tokenizer as max_length with truncation=True: longer prompts are cut at 250 tokens

# Directory settings
# Read-only Kaggle input directory that holds the fine-tuned checkpoint and label encoder.
OUTPUT_DIR = f"/kaggle/input/phi-4-reasoning-plus09476-ft/transformers/default/1/ver_2_9476ft"

# Training parameters
TRAIN_BATCH_SIZE = 4  # Training-time setting (unused here)
EVAL_BATCH_SIZE = 4  # Used as per_device_eval_batch_size for inference
GRADIENT_ACCUMULATION_STEPS = 16  # Training-time setting (unused here)
LEARNING_RATE = 2e-4
LOGGING_STEPS = 50
SAVE_STEPS = 200
EVAL_STEPS = 200


# Data paths
TRAIN_DATA_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/train.csv'
TEST_DATA_PATH = '/kaggle/input/map-charting-student-math-misunderstandings/test.csv'

# Model save paths
BEST_MODEL_PATH = f"{OUTPUT_DIR}/checkpoint-1722"  # LoRA adapter directory loaded with PeftModel.from_pretrained
LABEL_ENCODER_PATH = f"{OUTPUT_DIR}/label_encoder.joblib"  # Loaded with joblib; defines the class order of the logits

# Other settings
RANDOM_SEED = 42
VALIDATION_SPLIT = 0.0000001

# GPU settings
# NOTE(review): this value is never written to os.environ in this script, so it
# has no effect on which GPUs are visible.
CUDA_VISIBLE_DEVICES = "0,1"

# Submission settings
SUBMISSION_OUTPUT_PATH = 'phi_4_reasoning_0_948_submission.csv'

# WandB settings
# (unused here: the Trainer in main() is created with report_to="none")
USE_WANDB = True  # Set to False to disable WandB
WANDB_PROJECT = "phi-4-reasoning-math-misconceptions"
WANDB_RUN_NAME = f"phi-4-reasoning-ver{VER}"
WANDB_ENTITY = None  # Set your WandB entity (username or team name) if needed

# Early stopping settings
USE_EARLY_STOPPING = True
EARLY_STOPPING_PATIENCE = 10  # Intended as the max number of evaluations without improvement (training-time; unused here)
EARLY_STOPPING_THRESHOLD = 0.001  # Intended as the minimum change that counts as an improvement

# LoRA configuration for Phi-4
# (informational in this script: the adapter config is read from BEST_MODEL_PATH)
LORA_RANK = 64  # LoRA rank
LORA_ALPHA = 128  # LoRA scaling parameter; alpha / rank = 2
LORA_TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]  # NOTE(review): confirm against the checkpoint's adapter config
LORA_DROPOUT = 0.1  # LoRA dropout rate
LORA_BIAS = "none"  # Bias handling: "none", "all", "lora_only"

# Memory optimization settings
USE_GRADIENT_CHECKPOINTING = True  # Enable gradient checkpointing
USE_8BIT_ADAM = False  # 8-bit Adam optimizer toggle (disabled)
MAX_GRAD_NORM = 1.0  # Gradient clipping value

def prepare_correct_answers(train_data):
    """Derive one "correct" answer per question from the training data.

    Rows whose ``Category`` starts with the ``True`` prefix (the part before
    the first underscore) are kept. For every question, the answer that
    appears most often among those rows is selected.

    Args:
        train_data: DataFrame with ``QuestionId``, ``MC_Answer`` and
            ``Category`` columns. It is not modified.

    Returns:
        DataFrame with columns ``QuestionId``, ``MC_Answer`` and
        ``is_correct`` (always 1), one row per question.
    """
    # Row-wise mask: the category prefix tells whether the answer was right.
    true_mask = [category.split('_')[0] == 'True' for category in train_data.Category]
    true_rows = train_data.loc[true_mask].copy()

    # Frequency of each (question, answer) pair among the correct rows.
    pair_counts = true_rows.groupby(['QuestionId', 'MC_Answer'])['MC_Answer'].transform('count')
    true_rows['c'] = pair_counts

    # Most frequent pair first, so drop_duplicates keeps it for each question.
    ranked = true_rows.sort_values('c', ascending=False)
    best_per_question = ranked.drop_duplicates(['QuestionId'])
    return best_per_question[['QuestionId', 'MC_Answer']].assign(is_correct=1)


def format_input(row):
    """Build the Phi-4 style prompt for one row.

    Args:
        row: Mapping (e.g. a DataFrame row) with ``is_correct``,
            ``QuestionText``, ``MC_Answer`` and ``StudentExplanation``.

    Returns:
        The prompt string: a user turn describing the question, the chosen
        answer, whether it is correct and the student's explanation, followed
        by an assistant turn that already contains a short ``<think>`` block.
    """
    status = "Yes" if row["is_correct"] else "No"

    # Pieces are concatenated as-is; each one carries its own newline(s).
    pieces = [
        "<|user|>\n",
        f"[Mathematical Misconception Analysis Task]\n\n",
        f"Question: {row['QuestionText']}\n",
        f"Answer: {row['MC_Answer']}\n",
        f"Correct?: {status}\n",
        f"Explanation: {row['StudentExplanation']}\n",
        "<|end|>\n",
        "<|assistant|>\n",
        "<think>\n",
        "Let me analyze this mathematical misconception...\n",
        "</think>\n\n",
    ]
    return "".join(pieces)


def tokenize_dataset(dataset, tokenizer, max_len):
    """Tokenize the ``text`` column of a dataset and expose torch tensors.

    Args:
        dataset: Object providing ``map``, ``column_names`` and
            ``set_format`` (used here with a ``datasets.Dataset``).
        tokenizer: Callable tokenizer applied to each batch of texts.
        max_len: Maximum sequence length; longer inputs are truncated.

    Returns:
        The mapped dataset, formatted as torch tensors restricted to
        ``input_ids``, ``attention_mask`` and, when present, ``label``.
    """
    def _encode(batch):
        # No padding here: the data collator pads each batch at load time.
        return tokenizer(
            batch['text'],
            padding=False,
            truncation=True,
            max_length=max_len,
            return_tensors=None,  # plain lists while mapping
        )

    tokenized = dataset.map(_encode, batched=True, batch_size=100)

    # Keep the label column only when the dataset actually carries one.
    tensor_columns = ['input_ids', 'attention_mask']
    if 'label' in tokenized.column_names:
        tensor_columns.append('label')

    tokenized.set_format(type='torch', columns=tensor_columns)
    return tokenized


def compute_map3(eval_pred):
    """Compute MAP@3 from logits and integer labels.

    Each sample scores 1, 1/2 or 1/3 when its label is ranked first, second
    or third by predicted probability, and 0 otherwise. The metric is the
    mean of those scores.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is a 2-D array-like
            of shape (n_samples, n_classes); ``labels`` holds one class index
            per sample.

    Returns:
        ``{"map@3": score}`` with ``score`` a Python float. An empty
        evaluation set yields 0.0 instead of dividing by zero.
    """
    logits, labels = eval_pred
    labels = np.asarray(labels).reshape(-1)
    if labels.size == 0:
        return {"map@3": 0.0}

    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
    # Slicing keeps at most three columns, so models with fewer than three
    # classes are handled without indexing past the end.
    top3 = np.argsort(-probs, axis=1)[:, :3]

    # Class indices within a row are distinct, so each row has at most one hit.
    hits = top3 == labels[:, None]
    rank_weights = 1.0 / np.arange(1, top3.shape[1] + 1)
    score = float((hits * rank_weights).sum())
    return {"map@3": score / len(labels)}


def create_submission(predictions, test_data, label_encoder, filter_true_false = True):
    """Turn raw logits into ranked, per-question label predictions.

    For every test row the logits are restricted to the labels listed for
    that question, optionally narrowed to the ``True_*`` or ``False_*``
    labels according to ``is_correct``, softmaxed over the remaining
    candidates and ranked.

    Args:
        predictions: Object with a ``predictions`` attribute holding a
            (n_rows, n_classes) array of logits, in ``test_data`` row order.
        test_data: DataFrame with ``row_id``, ``QuestionId`` and
            ``is_correct`` columns.
        label_encoder: Object with ``classes_`` (label names in logit order)
            and ``inverse_transform`` (class indices -> label names).
        filter_true_false: When True, keep only ``True_*`` labels for rows
            with ``is_correct == 1`` and only ``False_*`` labels for rows
            with ``is_correct == 0``.

    Returns:
        DataFrame with ``row_id``, ``QuestionId``, ``is_correct``, ``probs``
        (candidate probabilities, best first), ``preds`` (matching labels)
        and ``Category:Misconception`` (top three labels joined by spaces).

    Notes:
        A ``QuestionId`` missing from the table below falls back to all
        classes instead of raising ``KeyError``. If the True/False filter
        would leave no candidate, the unfiltered candidates are kept.
    """

    question_label_choices = {
        31772: [
            'True_Correct:NA',
            'True_Neither:NA',
            'True_Misconception:Incomplete',
            'True_Misconception:WNB',
            'False_Neither:NA',
            'False_Misconception:WNB',
            'False_Misconception:Incomplete',
            'False_Correct:NA'
        ],
        31774: [
            'False_Neither:NA',
            'False_Misconception:SwapDividend',
            'False_Misconception:Mult',
            'False_Correct:NA',
            'False_Misconception:FlipChange',
            'True_Correct:NA',
            'True_Neither:NA',
            'True_Misconception:SwapDividend',
            'True_Misconception:Mult',
            'True_Misconception:FlipChange'
        ],
        31777: [
            'False_Correct:NA',
            'False_Misconception:Incomplete',
            'False_Neither:NA',
            'False_Misconception:Irrelevant',
            'False_Misconception:Wrong_Fraction',
            'True_Correct:NA',
            'True_Neither:NA'
        ],
        31778: [
            'False_Neither:NA',
            'False_Misconception:Additive',
            'False_Misconception:Irrelevant',
            'False_Correct:NA',
            'False_Misconception:WNB',
            'True_Neither:NA',
            'True_Correct:NA',
            'True_Misconception:Irrelevant',
            'True_Misconception:Additive'
        ],
        32829: [
            'True_Correct:NA',
            'True_Neither:NA',
            'True_Misconception:Not_variable',
            'False_Neither:NA',
            'False_Misconception:Adding_terms',
            'False_Correct:NA',
            'False_Misconception:Not_variable',
            'False_Misconception:Inverse_operation'
        ],
        32833: [
            'True_Correct:NA',
            'True_Neither:NA',
            'True_Misconception:Inversion',
            'True_Misconception:Duplication',
            'False_Misconception:Duplication',
            'False_Correct:NA',
            'False_Neither:NA',
            'False_Misconception:Inversion',
            'False_Misconception:Wrong_Operation'
        ],
        32835: [
            'False_Misconception:Whole_numbers_larger',
            'False_Neither:NA',
            'False_Correct:NA',
            'False_Misconception:Longer_is_bigger',
            'False_Misconception:Ignores_zeroes',
            'False_Misconception:Shorter_is_bigger',
            'True_Correct:NA',
            'True_Neither:NA',
            'True_Misconception:Whole_numbers_larger',
            'True_Misconception:Shorter_is_bigger',
            'True_Misconception:Longer_is_bigger'
        ],
        33471: [
            'True_Neither:NA',
            'True_Correct:NA',
            'True_Misconception:Wrong_fraction',
            'False_Correct:NA',
            'False_Misconception:Incomplete',
            'False_Neither:NA',
            'False_Misconception:Wrong_fraction'
        ],
        33472: [
            'True_Neither:NA',
            'True_Correct:NA',
            'True_Misconception:Adding_across',
            'True_Misconception:Denominator-only_change',
            'True_Misconception:Incorrect_equivalent_fraction_addition',
            'False_Correct:NA',
            'False_Neither:NA',
            'False_Misconception:Denominator-only_change',
            'False_Misconception:Incorrect_equivalent_fraction_addition',
            'False_Misconception:Adding_across'
        ],
        33474: [
            'True_Correct:NA',
            'True_Neither:NA',
            'True_Misconception:Division',
            'True_Misconception:Subtraction',
            'False_Neither:NA',
            'False_Misconception:Subtraction',
            'False_Misconception:Division',
            'False_Correct:NA'
        ],
        76870: [
            'False_Misconception:Unknowable',
            'False_Correct:NA',
            'False_Neither:NA',
            'False_Misconception:Definition',
            'False_Misconception:Interior',
            'True_Correct:NA',
            'True_Neither:NA',
            'True_Misconception:Definition'
        ],
        89443: [
            'False_Neither:NA',
            'False_Misconception:Positive',
            'False_Misconception:Tacking',
            'True_Correct:NA',
            'True_Neither:NA',
            'True_Misconception:Tacking',
            'True_Misconception:Positive',
            'False_Correct:NA'
        ],
        91695: [
            'False_Neither:NA',
            'False_Misconception:Wrong_term',
            'False_Correct:NA',
            'False_Misconception:Firstterm',
            'True_Correct:NA',
            'True_Misconception:Wrong_term',
            'True_Neither:NA',
            'True_Misconception:Firstterm'
        ],
        104665: [
            'False_Neither:NA',
            'False_Misconception:Base_rate',
            'False_Correct:NA',
            'True_Correct:NA',
            'True_Neither:NA',
            'True_Misconception:Base_rate',
            'True_Misconception:Multiplying_by_4',
            'False_Misconception:Multiplying_by_4'
        ],
        109465: [
            'False_Neither:NA',
            'False_Correct:NA',
            'False_Misconception:Certainty',
            'False_Misconception:Scale',
            'True_Correct:NA',
            'True_Neither:NA'
        ]
    }

    classes = label_encoder.classes_
    all_class_ids = list(range(len(classes)))

    # Split class indices by their True_/False_ prefix. A prefix test avoids
    # misfiling a False_* label whose misconception name contains "True".
    true_classes = {idx for idx, name in enumerate(classes) if str(name).startswith('True_')}
    false_classes = set(all_class_ids) - true_classes

    # Translate each question's label names into label-encoder indices.
    question_label_choice_ids = {
        qid: [int(x) for x in np.where(np.isin(classes, choices))[0]]
        for qid, choices in question_label_choices.items()
    }

    test_probabilities = []
    test_predictions = []
    test_top3_predictions = []

    for qid, correct, row in zip(test_data.QuestionId.tolist(), test_data.is_correct.tolist(), predictions.predictions):

        # Unknown question (or no label known to the encoder): consider every class.
        candidate_idx = question_label_choice_ids.get(qid) or all_class_ids

        # Narrow the candidates using the True/False information.
        if filter_true_false:
            if correct == 1:
                allowed = true_classes
            elif correct == 0:
                allowed = false_classes
            else:
                allowed = None
            if allowed is not None:
                narrowed = [c for c in candidate_idx if c in allowed]
                # Never filter down to nothing; an empty list would produce an empty prediction.
                if narrowed:
                    candidate_idx = narrowed

        candidate_logits = row[candidate_idx]

        # Probabilities are renormalised over the candidates only.
        candidate_probs = torch.nn.functional.softmax(torch.tensor(candidate_logits), dim=-1).numpy()

        top_k = np.argsort(-candidate_probs)

        # Convert positions within the candidate list back to label-encoder indices.
        topk_idx = np.array(candidate_idx)[top_k]

        # Keep the probabilities
        topk_probs = candidate_probs[top_k].tolist()

        # Get the predicted labels
        topk_preds = label_encoder.inverse_transform(topk_idx).tolist()

        test_probabilities.append(topk_probs)
        test_predictions.append(topk_preds)
        test_top3_predictions.append(" ".join(topk_preds[:3]))

    test_submission_data = pd.DataFrame({
        "row_id": test_data.row_id.tolist(),
        "QuestionId": test_data.QuestionId.tolist(),
        "is_correct": test_data.is_correct.tolist(),
        "probs": test_probabilities,
        "preds": test_predictions,
        'Category:Misconception': test_top3_predictions
    })

    return test_submission_data


def main() -> None:
    """Run inference with the fine-tuned LoRA model and write the per-model CSV.

    Steps: load the label encoder, load the base model in fp16 and attach the
    LoRA adapter from BEST_MODEL_PATH, build prompts for the test rows
    (correct answers are derived from the training CSV), run
    ``Trainer.predict`` and save the result of ``create_submission`` to
    SUBMISSION_OUTPUT_PATH.

    Raises:
        ImportError: if the ``peft`` package could not be imported.
    """
    
    # Clear cached GPU memory and collect garbage before loading the model
    torch.cuda.empty_cache()
    gc.collect()
    
    # CUDA allocator tuning
    # NOTE(review): this is set after torch has been imported — confirm the
    # allocator still picks it up at this point.
    import os
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    
    # Only reports the GPU count; placement is handled by device_map="auto" below
    if torch.cuda.device_count() > 1:
        print(f"Found {torch.cuda.device_count()} GPUs")

    print("Loading label encoder...")
    # Load the label encoder; its class count sizes the classification head
    le = joblib.load(LABEL_ENCODER_PATH)
    n_classes = len(le.classes_)

    print("Loading trained model and tokenizer...")

    if PEFT_AVAILABLE:
        # Load the base model, then attach the LoRA adapter
        print(f"Loading fine-tuned LoRA model from: {BEST_MODEL_PATH}")
        print(f"Loading base model from: {MODEL_NAME}")

        # Load the base model (no quantization)
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=n_classes,
            trust_remote_code=True,
            device_map="auto",  # let the library spread the model across available GPUs
            low_cpu_mem_usage=True,  # reduce CPU memory usage while loading
            torch_dtype=torch.float16  # load weights in FP16 to save memory
        )

        # Apply the LoRA adapter
        model = PeftModel.from_pretrained(model, BEST_MODEL_PATH)
        
        # Switch to inference mode
        model.eval()
        # No explicit .to('cuda'): device placement was requested via device_map="auto" above
        # (the model is loaded in fp16 here, not 8-bit quantized)

        # The tokenizer is loaded from the base model
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
        print("Successfully loaded LoRA fine-tuned model")
    else:
        # Without PEFT the adapter cannot be loaded, so fail
        raise ImportError("PEFT is required to load the fine-tuned model. Please install peft: pip install peft")

    # Padding token setup (only when the tokenizer does not define one)
    # NOTE(review): confirm that the token string and the hard-coded id below
    # refer to the same vocabulary entry for this tokenizer.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = "<|finetune_right_pad_id|>"
        tokenizer.pad_token_id = 100349  # hard-coded pad token id for Phi-4-reasoning-plus
    
    # Propagate the pad token id to the model config (reaching through PeftModel.base_model)
    if hasattr(model, 'base_model'):
        model.base_model.config.pad_token_id = tokenizer.pad_token_id
        # Also set it on the wrapped inner model
        if hasattr(model.base_model, 'model'):
            model.base_model.model.config.pad_token_id = tokenizer.pad_token_id
    else:
        model.config.pad_token_id = tokenizer.pad_token_id

    print("Loading test data...")
    # Load the test data
    test = pd.read_csv(TEST_DATA_PATH)

    print("Loading training data for correct answers...")
    # Derive the correct answer per question from the training data
    train = pd.read_csv(TRAIN_DATA_PATH)
    train.Misconception = train.Misconception.fillna('NA')
    correct = prepare_correct_answers(train)

    print("Preprocessing test data...")
    # Preprocess the test data: rows whose (question, answer) pair is not in
    # `correct` get is_correct = 0 after the left merge
    test = test.merge(correct, on=['QuestionId','MC_Answer'], how='left')
    test.is_correct = test.is_correct.fillna(0)
    test['text'] = test.apply(format_input, axis=1)

    print("Tokenizing test data...")
    # Tokenize the test data
    ds_test = Dataset.from_pandas(test[['text']])
    ds_test = tokenize_dataset(ds_test, tokenizer, MAX_LEN)

    # Data collator that pads each batch
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    print("Running inference...")
    
    # Enable TF32 (intended to speed up inference)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    
    # Run inference through a Trainer used only for prediction
    trainer = Trainer(
        model=model,
        processing_class=tokenizer,  # replaces the older `tokenizer` argument
        data_collator=data_collator,  # pads automatically when batching
        args=TrainingArguments(
            output_dir="./tmp",  # scratch directory (required argument)
            report_to="none",    # disable wandb reporting
            per_device_eval_batch_size=EVAL_BATCH_SIZE,  # from the config constants
            fp16=True,  # use float16
            dataloader_pin_memory=True,  # pinned memory for the data loader
            dataloader_num_workers=2,  # parallel data loading
        )
    )
    # Run prediction inside a no_grad context
    with torch.no_grad():
        predictions = trainer.predict(ds_test)

    print("Creating submission file...")
    # Build the submission frame
    submission = create_submission(predictions, test, le)

    # Save to disk
    submission.to_csv(SUBMISSION_OUTPUT_PATH, index=False)
    print(f"Submission file saved to: {SUBMISSION_OUTPUT_PATH}")
    print("\nSubmission preview:")
    print(submission.head())
    print(f"\nSubmission shape: {submission.shape}")


if __name__ == "__main__":
    main()

In [None]:
!python qwen3_32b_0_947.py

In [None]:
!python phi_4_reasoning_0_948.py

In [None]:
!python phi_4_0_948_fulltrain.py

In [None]:
!python deepseek_0_946.py

In [None]:
!python qwen3_14b_0_946.py

In [None]:
from collections import defaultdict

def get_top_k_ensemble(ll, k=3, weights=None):
    """Merge several ranked label lists by weighted rank voting.

    Each input is a whitespace-separated ranking, best label first. A label
    at position ``rank`` in a ranking of length ``n`` earns
    ``(n - rank) * weight`` points; points are summed across rankings and the
    ``k`` highest-scoring labels are returned. Ties keep first-seen order.

    Args:
        ll: Iterable of ranking strings, e.g. ``['a b c', 'b a']``.
        k: Number of labels to return.
        weights: Optional per-ranking weights, one per entry of ``ll``.
            Defaults to 4 for every ranking, which matches the previous
            hard-coded behaviour.

    Returns:
        The top ``k`` labels joined by single spaces.

    Raises:
        ValueError: if ``weights`` is given with a length different from ``ll``.
    """
    # split() without an argument ignores repeated/leading/trailing spaces,
    # so stray whitespace never turns into an empty-string "label".
    rankings = [ranking.split() for ranking in ll]

    if weights is None:
        weights = [4] * len(rankings)
    elif len(weights) != len(rankings):
        raise ValueError(
            f"expected {len(rankings)} weights, got {len(weights)}"
        )

    score = defaultdict(int)
    for weight, ranking in zip(weights, rankings):
        size = len(ranking)
        for rank, item in enumerate(ranking):
            score[item] += (size - rank) * weight

    # sorted() is stable, so equal scores keep insertion order.
    best_first = sorted(score.items(), key=lambda pair: -pair[1])
    return ' '.join(item for item, _ in best_first[:k])

# Quick manual check of get_top_k_ensemble on four toy rankings, two of which
# are shorter than the others.
list1 = 'a b d f'
list2 = 'b c a e'
list3 = 'c e b'
list4 = 'c e d'

print(get_top_k_ensemble([list1, list2, list3, list4], k=3))

In [None]:
import pandas as pd
import numpy as np

# Load every model's per-row predictions and order each frame by row_id with a
# fresh 0..n-1 index, so the frames can be iterated side by side.
df1 = pd.read_csv('/kaggle/working/deepseek_r1_submission.csv').sort_values('row_id').reset_index(drop=True)
df2 = pd.read_csv('/kaggle/working/qwen3_14b_submission.csv').sort_values('row_id').reset_index(drop=True)
df3 = pd.read_csv('/kaggle/working/phi_4_reasoning_0_948_submission.csv').sort_values('row_id').reset_index(drop=True)
df4 = pd.read_csv('/kaggle/working/phi_4_0_948_fulltrain_submission.csv').sort_values('row_id').reset_index(drop=True)
df5 = pd.read_csv('/kaggle/working/qwen3_32b_0_947_submission.csv').sort_values('row_id').reset_index(drop=True)

In [None]:
import ast

# Average the per-label probabilities of all models row by row and keep the
# three labels with the highest mean probability.
model_frames = [df1, df2, df3, df4, df5]

# zip() would silently stop at the shortest frame, so check lengths up front.
if len({len(frame) for frame in model_frames}) != 1:
    raise ValueError("Model prediction files do not all have the same number of rows")

ensemble_predictions = []
for model_rows in zip(*(frame.itertuples() for frame in model_frames)):
    row_id = model_rows[0].row_id
    if any(r.row_id != row_id for r in model_rows[1:]):
        raise ValueError(f"row_id mismatch between model prediction files at row_id {row_id}")

    # The probs/preds columns are stored in the CSV as Python list literals.
    # literal_eval parses them without executing arbitrary code.
    label_probs_per_model = [
        dict(zip(ast.literal_eval(r.preds), ast.literal_eval(r.probs)))
        for r in model_rows
    ]

    # Align by label name rather than by position, so a model that lists a
    # different label set cannot shift probabilities onto the wrong label.
    # A label a model did not score counts as probability 0 for that model.
    choices = sorted(set().union(*label_probs_per_model))

    mean_probs = np.mean(
        [[label_probs.get(label, 0.0) for label in choices] for label_probs in label_probs_per_model],
        axis=0,
    )

    # Stable sort: ties stay in alphabetical label order.
    final_prob_preds = sorted(zip(choices, mean_probs), key=lambda pair: -pair[1])

    ensemble_predictions.append({
        "row_id": row_id,
        "Category:Misconception": " ".join(label for label, _ in final_prob_preds[:3]),
    })

ensemble_predictions = pd.DataFrame(ensemble_predictions)

In [None]:
ensemble_predictions.to_csv('submission.csv', index = False)

In [None]:
ensemble_predictions