### MAP True Baseline - Qwen Math + QLoRA (Sequence Classification)

This notebook implements a **baseline end-to-end pipeline** for the
Misconception Annotation Project (MAP).

The approach:
- Treats the task as a **multi-class sequence classification problem**
- Uses a **Qwen Math instruction-tuned model**
- Applies **4-bit quantization + LoRA (QLoRA)** for memory-efficient fine-tuning
- Trains on student explanations to predict `Category:Misconception`
- Evaluates using **MAP@3**
- Produces a Kaggle-compatible submission file

This notebook serves as the main **starting point**: it is where accuracy starts taking off and I begin to see good results.

In [None]:
# Install dependencies and configure environment
import os

!pip install -q bitsandbytes accelerate transformers peft datasets scikit-learn pandas numpy torch

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
import pandas as pd
import numpy as np


# Global configuration
VER = 1
EPOCHS = 1

model_name = "Qwen/Qwen2.5-Math-1.5B-Instruct"

DIR = f"ver_{VER}"
os.makedirs(DIR, exist_ok=True)

print("Using model:", model_name)
print("Output dir:", DIR)
print("CUDA available:", torch.cuda.is_available())

In [None]:
# Load training data and encode labels
from sklearn.preprocessing import LabelEncoder

train = pd.read_csv("/kaggle/input/map-qd/train.csv")

# A missing misconception becomes the literal "NA", so every row yields a
# complete "Category:Misconception" target string.
train["Misconception"] = train["Misconception"].fillna("NA")
train["target"] = train["Category"] + ":" + train["Misconception"]

# Map each distinct target string to an integer class id.
le = LabelEncoder()
le.fit(train["target"])
train["label"] = le.transform(train["target"])

n_classes = len(le.classes_)
print(f"Train shape: {train.shape}, Num classes: {n_classes}")

train.head()

In [None]:
# Identify correct answers per question
# Rows whose Category starts with "True" (the part before the first "_") are
# the ones used to infer which MC_Answer is the correct one for a question.
idx = train["Category"].str.split("_").str[0] == "True"

correct = train.loc[idx].copy()
# How often each (question, answer) pair occurs among the "True" rows.
correct["count"] = correct.groupby(["QuestionId", "MC_Answer"]).MC_Answer.transform("count")
# Most frequent answer first; a stable sort keeps tie-breaking deterministic.
correct = correct.sort_values("count", ascending=False, kind="stable")
# Keep a single answer (the most frequent one) per question.
correct = correct.drop_duplicates(["QuestionId"])
correct = correct[["QuestionId", "MC_Answer"]].assign(is_correct=1)

# Drop any is_correct column left over from a previous run of this cell;
# otherwise the merge would emit is_correct_x / is_correct_y and the fillna
# below would fail with a KeyError.
train = train.drop(columns=["is_correct"], errors="ignore").merge(
    correct, on=["QuestionId", "MC_Answer"], how="left"
)
# Pairs without a match in the lookup table are treated as incorrect.
train["is_correct"] = train["is_correct"].fillna(0)

In [None]:
# Build text prompt for the model
def format_input(row):
    """Render one record as the text prompt given to the classifier.

    Args:
        row: Mapping-like record providing ``QuestionText``, ``MC_Answer``,
            ``StudentExplanation`` and a truthy/falsy ``is_correct`` flag.

    Returns:
        A four-line string: question, chosen answer, a sentence stating
        whether that answer is correct, and the student's explanation.
    """
    if row["is_correct"]:
        verdict = "This answer is correct."
    else:
        verdict = "This answer is incorrect."

    prompt_lines = [
        f"Question: {row['QuestionText']}",
        f"Answer: {row['MC_Answer']}",
        verdict,
        f"Student Explanation: {row['StudentExplanation']}",
    ]
    return "\n".join(prompt_lines)

# Build the prompt for every training row.
train["text"] = train.apply(format_input, axis=1)

# Show the first prompt as a sanity check (header, blank line, then the prompt).
print("Example prompt:\n", train["text"].iloc[0], sep="\n")

In [None]:
# Inspect token length distribution
from transformers import AutoTokenizer
import matplotlib.pyplot as plt

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# One batched tokenizer call instead of one call per row; no truncation so the
# true lengths are measured.
encoded = tokenizer(train["text"].tolist(), truncation=False)
lengths = [len(ids) for ids in encoded["input_ids"]]

fig, ax = plt.subplots()
ax.hist(lengths, bins=50)
ax.set_title("Token Length Distribution")
ax.set_xlabel("Tokens")
ax.set_ylabel("Frequency")
ax.grid(True)
plt.show()

# Keep this equal to the max_length used when tokenizing for training (256),
# so the count below reports the truncation that is actually applied.
MAX_LEN = 256
print(f"Samples exceeding {MAX_LEN} tokens:", (np.array(lengths) > MAX_LEN).sum())

In [None]:
# Train / validation split and HF Dataset conversion
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Only the prompt text and the integer class id are handed to the model.
COLS = ["text", "label"]

# 80/20 split with a fixed random_state so the partition is repeatable.
train_df, val_df = train_test_split(train, test_size=0.2, random_state=42)

train_ds, val_ds = (Dataset.from_pandas(part[COLS]) for part in (train_df, val_df))

In [None]:
# Tokenization
def tokenize(batch):
    """Tokenize the ``text`` field of a batch with the notebook's ``tokenizer``.

    Every sequence is padded to, and truncated at, 256 tokens.

    Args:
        batch: Mapping with a ``"text"`` entry holding the prompts to encode.

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    encode_options = dict(
        padding="max_length",
        truncation=True,
        max_length=256,
    )
    return tokenizer(batch["text"], **encode_options)

# Tokenize both splits in batches.
train_ds, val_ds = (
    split.map(tokenize, batched=True) for split in (train_ds, val_ds)
)

# Return torch tensors for just the columns listed here.
for _split in (train_ds, val_ds):
    _split.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
# Load quantized model and attach LoRA adapters
from transformers import AutoModelForSequenceClassification, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

# 4-bit NF4 weights with double quantization; computation runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# device_map={"": 0} places the whole model on device 0.
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=n_classes,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map={"": 0},
)

# The sequence-classification head pools the hidden state of the last
# non-padding token, so the model config must know the pad id. The checkpoint
# config may not define one, in which case batches larger than 1 raise
# "Cannot handle batch sizes > 1 if no padding token is defined".
base_model.config.pad_token_id = tokenizer.pad_token_id

base_model = prepare_model_for_kbit_training(base_model)

# Rank-16 adapters on the q/k/v/o and gate/up/down projection modules.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

model = get_peft_model(base_model, lora_config)
model.gradient_checkpointing_enable()
model.print_trainable_parameters()

In [None]:
# Training arguments and MAP@3 metric
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    # Where the run writes its outputs, and how long it trains.
    output_dir=DIR,
    num_train_epochs=EPOCHS,
    learning_rate=2e-5,
    # 2 samples per step x 4 accumulation steps = 8 samples per optimizer update.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=4,
    # bfloat16 mixed precision; fp16 explicitly off.
    bf16=True,
    fp16=False,
    # Evaluate once per epoch, never checkpoint, log every 500 steps,
    # and do not report to any external tracker.
    eval_strategy="epoch",
    save_strategy="no",
    logging_steps=500,
    report_to="none",
)

def compute_map3(eval_pred):
    """Compute MAP@3 for single-label classification.

    A sample scores 1, 1/2 or 1/3 when its true label is ranked first,
    second or third respectively, and 0 otherwise; the metric is the mean
    over all samples.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is a 2-D array-like
            indexed as (sample, class); ``labels`` holds one integer class id
            per sample.

    Returns:
        ``{"map@3": score}`` with ``score`` a Python float. An empty
        evaluation set yields ``0.0`` instead of raising.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)
    labels = np.asarray(labels).reshape(-1)

    n_samples = labels.size
    if n_samples == 0:
        return {"map@3": 0.0}

    # Softmax is monotone, so ranking the raw logits gives the same order
    # without a tensor round trip. With fewer than three classes only the
    # available ranks are scored.
    k = min(3, logits.shape[1])
    top_k = np.argsort(-logits, axis=1)[:, :k]

    # Reciprocal-rank weight per position: 1, 1/2, 1/3.
    rank_weights = 1.0 / np.arange(1, k + 1)
    # Each label matches at most one position per row.
    hits = top_k == labels[:, None]

    return {"map@3": float((hits * rank_weights).sum() / n_samples)}

In [None]:
# Train model
trainer = Trainer(
    # What to train and how.
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    # Data: fit on the training split, evaluate on the held-out split.
    train_dataset=train_ds,
    eval_dataset=val_ds,
    # Validation metric reported at each evaluation.
    compute_metrics=compute_map3,
)

trainer.train()

In [None]:
# Save model and label encoder
import joblib

# Both artifacts go under the run directory.
model_path = f"{DIR}/best"
encoder_path = f"{DIR}/label_encoder.joblib"

trainer.save_model(model_path)
# The label encoder is needed later to turn class ids back into target strings.
joblib.dump(le, encoder_path)

In [None]:
# Generate predictions and create submission
# Apply the same correctness lookup and prompt format that were used for training.
test = (
    pd.read_csv("/kaggle/input/map-qd/test.csv")
    .merge(correct, on=["QuestionId", "MC_Answer"], how="left")
    .assign(is_correct=lambda frame: frame["is_correct"].fillna(0))
)
test["text"] = test.apply(format_input, axis=1)

ds_test = Dataset.from_pandas(test[["text"]]).map(tokenize, batched=True)

preds = trainer.predict(ds_test)
probs = torch.softmax(torch.tensor(preds.predictions), dim=1).numpy()

# Three highest-probability class ids per row, best first.
top3 = np.argsort(-probs, axis=1)[:, :3]
# Turn class ids back into "Category:Misconception" strings, keeping the (rows, 3) layout.
decoded = le.inverse_transform(top3.ravel()).reshape(top3.shape)

# One row per test example: its id and the three predictions separated by spaces.
submission = test[["row_id"]].copy()
submission["Category:Misconception"] = [" ".join(labels) for labels in decoded]

submission.to_csv("submission.csv", index=False)
submission.head()