In [1]:
!pip install --upgrade --no-index --find-links=/kaggle/input/transformers-4-56-1-and-deps transformers -qq

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 3.6.0 requires fsspec[http]<=2025.3.0,>=2023.1.0, but you have fsspec 2025.5.1 which is incompatible.[0m[31m
[0m

## Hunyaun 7B 0.945

In [2]:
%%writefile Hunyaun_inference.py

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
import pandas as pd
import torch
import numpy as np
from argparse import Namespace
from sklearn.preprocessing import LabelEncoder

class DataProcessor:
    def __init__(self, args):
        self.args = args
        self.le = None
        self.isPreprocess = False
        self.correct_lookup = None

    def load_data(self):
        self.train_df = pd.read_csv(self.args.train_path)
        self.test_df = pd.read_csv(self.args.test_path)
        if self.args.use_extra_data:
            self.extra_df = pd.read_csv(self.args.extra_data_path)
            self.train_df = pd.concat([self.train_df, self.extra_df], ignore_index=True)

    def get_num_classes(self):
        if self.isPreprocess == False:
            return "please preprocess first"
        num_class = self.train_df['label'].nunique()
        return num_class

    def get_label_encoder(self):
        if self.le is None:
            raise ValueError("LabelEncoder not initialized. Please run preprocess first.")
        return self.le

    @staticmethod
    def format_input(row):
        correct_text = "Yes" if row['IsCorrect'] else "No"
        return (
            f"Question: {row['QuestionText']}\n"
            f"Answer: {row['MC_Answer']}\n"
            f"Correct? {correct_text}\n"
            f"Student Explanation: {row['StudentExplanation']}\n"
        )

    def preprocess(self):
        self.load_data()
        self.train_df['Misconception'] = self.train_df['Misconception'].fillna('NA')
        self.train_df['target'] = self.train_df['Category'] + ':' + self.train_df['Misconception']

        correct_samples = self.train_df[self.train_df['Category'].str.startswith('True', na=False)].copy()
        correct_samples['count'] = correct_samples.groupby(['QuestionId', 'MC_Answer'])['MC_Answer'].transform('count')
        most_popular_correct = correct_samples.sort_values('count', ascending=False).drop_duplicates(['QuestionId'])
        self.correct_lookup = most_popular_correct[['QuestionId', 'MC_Answer']].copy()
        self.correct_lookup['IsCorrect_flag'] = True

        self.train_df = self.train_df.merge(self.correct_lookup, on=['QuestionId', 'MC_Answer'], how='left')
        self.train_df['IsCorrect'] = self.train_df['IsCorrect_flag'].notna()
        self.train_df = self.train_df.drop(columns=['IsCorrect_flag'])

        self.le = LabelEncoder()
        self.train_df['label'] = self.le.fit_transform(self.train_df['target'])
        self.train_df['text'] = self.train_df.apply(self.format_input, axis=1)

        self.isPreprocess = True
        return self.train_df

    def inference_processor(self):
        if self.isPreprocess == False:
            return "Have you do the train? please preprocess first"
        self.test_df = self.test_df.merge(self.correct_lookup, on=['QuestionId', 'MC_Answer'], how='left')
        self.test_df['IsCorrect'] = self.test_df['IsCorrect_flag'].notna()
        self.test_df = self.test_df.drop(columns=['IsCorrect_flag'])
        self.test_df['text'] = self.test_df.apply(self.format_input, axis=1)
        return self.test_df

# inference
args = Namespace(
    train_path='/kaggle/input/map-charting-student-math-misunderstandings/train.csv',
    test_path='/kaggle/input/map-charting-student-math-misunderstandings/test.csv',
    use_extra_data=False,
    extra_data_path='no_datas.csv',
    model_dir="/kaggle/input/hunyuan-7b-instruct-map",      
    inference_model_dir="/kaggle/input/hunyuan-7b-instruct-map",
    mode='inference',
    model_name="/kaggle/input/hunyuan-7b-instruct-bf16"   
)

DP = DataProcessor(args)
_ = DP.preprocess()

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import PeftModel

tokenizer = AutoTokenizer.from_pretrained(args.inference_model_dir, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForSequenceClassification.from_pretrained(
    args.model_name,
    num_labels=DP.get_num_classes(),
    device_map="auto",
    torch_dtype=torch.float16
)
model = PeftModel.from_pretrained(base_model, args.inference_model_dir)
model.config.pad_token_id = tokenizer.pad_token_id

MAX_LEN = 256
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=MAX_LEN)

test_df = DP.inference_processor()
ds_test = Dataset.from_pandas(test_df[['text']])
ds_test = ds_test.map(tokenize_function, batched=True)

inference_args = TrainingArguments(
    do_train=False,
    do_eval=True,
    output_dir="./temp",
    per_device_eval_batch_size=16,
    fp16=True,
    bf16=False,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=inference_args,
    processing_class=tokenizer,
)

from scipy.special import softmax
import numpy as np
import pandas as pd

# Run prediction with Trainer
pred_output = trainer.predict(ds_test)
# pred_output.predictions -> shape (num_samples, num_classes)
logits = pred_output.predictions

# Convert to probabilities
probs = softmax(logits, axis=1)   # numpy array, same shape

# Sort classes by probability (descending)
top_indices = np.argsort(-probs, axis=1)

# Decode labels
le = DP.get_label_encoder()
flat_indices = top_indices.flatten()
decoded_labels = le.inverse_transform(flat_indices)
top_labels = decoded_labels.reshape(top_indices.shape)

# ---- Save Top-3 submission ----
joined_preds = [" ".join(row[:35]) for row in top_labels]
sub = pd.DataFrame({
    "row_id": test_df.row_id.values,
    "Category:Misconception": joined_preds
})
sub.to_csv("submission_trainer.csv", index=False)

# ---- Save full probability table (for ensembling) ----
prob_data = []
num_classes = logits.shape[1]
for i in range(len(logits)):
    # store probability for each class in descending order
    prob_dict = {f"prob_{j}": probs[i, top_indices[i, j]] for j in range(num_classes)}
    prob_dict["row_id"] = test_df.row_id.values[i]
    prob_dict["top_classes"] = " ".join(top_labels[i, :num_classes])
    prob_data.append(prob_dict)

prob_df = pd.DataFrame(prob_data)
prob_df.to_csv("submission_Hunyaun_prob.csv", index=False)

Writing Hunyaun_inference.py


## Qwen 3 4B Inf 0.945

In [3]:
%%writefile qwen3_4b_inference.py

import os
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding
from scipy.special import softmax
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/train.csv')
test  = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/test.csv')

model_name = "/kaggle/input/qwen3-4b-map-lora-training"

def format_input(row):
    x = "This answer is correct." if row['is_correct'] else "This answer is incorrect."
    return (
        f"Question: {row['QuestionText']}\n"
        f"Answer: {row['MC_Answer']}\n"
        f"{x}\n"
        f"Student Explanation: {row['StudentExplanation']}"
    )

# Encode targets
le = LabelEncoder()
train.Misconception = train.Misconception.fillna('NA')
train['target'] = train.Category + ':' + train.Misconception
train['label'] = le.fit_transform(train['target'])
n_classes = len(le.classes_)
print(f"Train shape: {train.shape} with {n_classes} target classes")

# Identify correct answers to mark test rows
idx = train.apply(lambda row: row.Category.split('_')[0], axis=1) == 'True'
correct = train.loc[idx].copy()
correct['c'] = correct.groupby(['QuestionId', 'MC_Answer']).MC_Answer.transform('count')
correct = correct.sort_values('c', ascending=False)
correct = correct.drop_duplicates(['QuestionId'])
correct = correct[['QuestionId', 'MC_Answer']]
correct['is_correct'] = 1

test = test.merge(correct, on=['QuestionId', 'MC_Answer'], how='left')
test.is_correct = test.is_correct.fillna(0)
test['text'] = test.apply(format_input, axis=1)

# Load model/tokenizer
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map="balanced",
    dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.config.pad_token_id = tokenizer.pad_token_id
model.eval()

# Tokenize
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, max_length=256)

ds_test = Dataset.from_pandas(test[['text']])
ds_test = ds_test.map(tokenize, batched=True, remove_columns=['text'])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

dataloader = DataLoader(
    ds_test,
    batch_size=4,
    shuffle=False,
    collate_fn=data_collator,
    pin_memory=True,
    num_workers=0
)

# Inference
all_logits = []
with torch.no_grad():
    for batch in tqdm(dataloader, desc="deepseek"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        all_logits.append(outputs.logits.float().cpu().numpy())

predictions = np.concatenate(all_logits, axis=0)
probs = softmax(predictions, axis=1)
top_indices = np.argsort(-probs, axis=1)
flat_indices = top_indices.flatten()
decoded_labels = le.inverse_transform(flat_indices)
top_labels = decoded_labels.reshape(top_indices.shape)

# Save top-3 submission
joined_preds = [" ".join(row[:3]) for row in top_labels]
sub = pd.DataFrame({
    "row_id": test.row_id.values,
    "Category:Misconception": joined_preds
})
sub.to_csv("submission_qwen3_4B_top3.csv", index=False)

# Save top-25 probabilities for ensembling
prob_data = []
for i in range(len(predictions)):
    prob_dict = {f"prob_{j}": probs[i, top_indices[i, j]] for j in range(25)}
    prob_dict['row_id'] = test.row_id.values[i]
    prob_dict['top_classes'] = " ".join(top_labels[i, :25])
    prob_data.append(prob_dict)

prob_df = pd.DataFrame(prob_data)
prob_df.to_csv("submission_qwen3_4B_probabilities.csv", index=False)

print("✅ Completed - saved submission and probabilities")

# Clean up GPU memory
del model, tokenizer
torch.cuda.empty_cache()

Writing qwen3_4b_inference.py


## EEDI Qwen 2 LoRA 0.945

In [4]:
%%writefile qwen2_8b.py

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

VER=1

EPOCHS = 2

DIR = f"ver_{VER}"
os.makedirs(DIR, exist_ok=True)

import pandas as pd, numpy as np
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/train.csv')
train.Misconception = train.Misconception.fillna('NA')
train['target'] = train.Category+":"+train.Misconception
train['label'] = le.fit_transform(train['target'])
target_classes = le.classes_
n_classes = len(target_classes)
print(f"Train shape: {train.shape} with {n_classes} target classes")
train.head()

idx = train.apply(lambda row: row.Category.split('_')[0],axis=1)=='True'
correct = train.loc[idx].copy()
correct['c'] = correct.groupby(['QuestionId','MC_Answer']).MC_Answer.transform('count')
correct = correct.sort_values('c',ascending=False)
correct = correct.drop_duplicates(['QuestionId'])
correct = correct[['QuestionId','MC_Answer']]
correct['is_correct'] = 1

train = train.merge(correct, on=['QuestionId','MC_Answer'], how='left')
train.is_correct = train.is_correct.fillna(0)

import torch
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from datasets import Dataset
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/edi-trained-map-qwen/best")
MAX_LEN = 256
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def format_input(row):
    x = "Yes"
    if not row['is_correct']:
        x = "No"
    return (
        f"Question: {row['QuestionText']}\n"
        f"Answer: {row['MC_Answer']}\n"
        f"Is Correct Answer: {x}\n"
        f"Student Explanation: {row['StudentExplanation']}"
    )

train['text'] = train.apply(format_input,axis=1)
print("Example prompt for our LLM:")
print()
print( train.text.values[0] )

# Split into train and validation sets
train_df, val_df = train_test_split(train, test_size=0.2, random_state=42)

# Convert to Hugging Face Dataset
COLS = ['text','label']
train_ds = Dataset.from_pandas(train_df[COLS])
val_ds = Dataset.from_pandas(val_df[COLS])

# Tokenization function
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=256)

train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)

# Set format for PyTorch
columns = ['input_ids', 'attention_mask', 'label']
train_ds.set_format(type='torch', columns=columns)
val_ds.set_format(type='torch', columns=columns)

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BitsAndBytesConfig
import torch

Model_Name = "/kaggle/input/qwen-2-5-map" # Main Model
model_name = "/kaggle/input/edi-trained-map-qwen/best" # LoRa adaptors
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

from transformers import AutoTokenizer

# Load tokenizer for the LoRA checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)


model = AutoModelForSequenceClassification.from_pretrained(
    Model_Name, 
    num_labels=n_classes,
    quantization_config=bnb_config,
    device_map="balanced",
    trust_remote_code=True
)

# Resize base model embeddings
model.resize_token_embeddings(len(tokenizer))


from peft import PeftModel
model = PeftModel.from_pretrained(model, model_name)

# Cast LoRA parameters to float16
model = model.to(dtype=torch.float16)

print(next(model.parameters()).dtype)

# Add a new padding token

tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Set the pad token id in the model's config

model.config.pad_token_id = tokenizer.pad_token_id
from transformers import  TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir = f"./{DIR}",
    do_train=True,
    do_eval=True,
    eval_strategy="steps",
    save_strategy="steps", #no for no saving 
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=200,
    eval_steps=200,
    save_total_limit=1,
    metric_for_best_model="map@3",
    greater_is_better=True,
    load_best_model_at_end=True,
    report_to="none",
    bf16=False, # TRAIN WITH BF16 IF LOCAL GPU IS NEWER GPU          
    fp16=True, # INFER WITH FP16 BECAUSE KAGGLE IS T4 GPU
)
# CUSTOM MAP@3 METRIC

from sklearn.metrics import average_precision_score

def compute_map3(eval_pred):
    logits, labels = eval_pred
    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
    
    top3 = np.argsort(-probs, axis=1)[:, :3]  # Top 3 predictions
    match = (top3 == labels[:, None])

    # Compute MAP@3 manually
    map3 = 0
    for i in range(len(labels)):
        if match[i, 0]:
            map3 += 1.0
        elif match[i, 1]:
            map3 += 1.0 / 2
        elif match[i, 2]:
            map3 += 1.0 / 3
    return {"map@3": map3 / len(labels)}
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    processing_class=tokenizer,
    compute_metrics=compute_map3,
)

# Load test set
test = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/test.csv')
test = test.merge(correct, on=['QuestionId','MC_Answer'], how='left')
test.is_correct = test.is_correct.fillna(0)
test['text'] = test.apply(format_input, axis=1)

# HuggingFace Dataset
ds_test = Dataset.from_pandas(test[['text']])
ds_test = ds_test.map(tokenize, batched=True)

# Run inference
pred_output = trainer.predict(ds_test)
logits = pred_output.predictions
probs = torch.nn.functional.softmax(torch.tensor(logits), dim=1).numpy()

# ---- Top-3 predictions ----
top3 = np.argsort(-probs, axis=1)[:, :3]   # shape: [num_samples, 3]

# Decode numeric class indices to original string labels
flat_top3 = top3.flatten()
decoded_labels = le.inverse_transform(flat_top3)
top_labels = decoded_labels.reshape(top3.shape)

# Join 3 labels per row with space
joined_preds = [" ".join(row) for row in top_labels]

# Save Top-3 submission
sub = pd.DataFrame({
    "row_id": test.row_id.values,
    "Category:Misconception": joined_preds
})
sub.to_csv("submission_qwen3_14b.csv", index=False)

# ---- Save Top-25 probability table (for ensembling) ----
prob_data = []
top_k = 25
top_indices = np.argsort(-probs, axis=1)

for i in range(len(logits)):
    prob_dict = {f"prob_{j}": probs[i, top_indices[i, j]] for j in range(top_k)}
    prob_dict["row_id"] = test.row_id.values[i]
    prob_dict["top_classes"] = " ".join(
        le.inverse_transform(top_indices[i, :top_k])
    )
    prob_data.append(prob_dict)

prob_df = pd.DataFrame(prob_data)
prob_df.to_csv("submission_qwen2_8b_prob.csv", index=False)

Writing qwen2_8b.py


## Qwen 3 14B LoRA 0.944

In [5]:
%%writefile qwen3_14B.py

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

VER=1
model_name = "/kaggle/input/qwen3-14b-lora-map/results (1)/best"
EPOCHS = 2

DIR = f"ver_{VER}"
os.makedirs(DIR, exist_ok=True)

import pandas as pd, numpy as np
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/train.csv')
train.Misconception = train.Misconception.fillna('NA')
train['target'] = train.Category+":"+train.Misconception
train['label'] = le.fit_transform(train['target'])
target_classes = le.classes_
n_classes = len(target_classes)
print(f"Train shape: {train.shape} with {n_classes} target classes")
train.head()

idx = train.apply(lambda row: row.Category.split('_')[0],axis=1)=='True'
correct = train.loc[idx].copy()
correct['c'] = correct.groupby(['QuestionId','MC_Answer']).MC_Answer.transform('count')
correct = correct.sort_values('c',ascending=False)
correct = correct.drop_duplicates(['QuestionId'])
correct = correct[['QuestionId','MC_Answer']]
correct['is_correct'] = 1

train = train.merge(correct, on=['QuestionId','MC_Answer'], how='left')
train.is_correct = train.is_correct.fillna(0)

import torch
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from datasets import Dataset
import numpy as np

tokenizer = AutoTokenizer.from_pretrained(model_name)
MAX_LEN = 256
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def format_input(row):
    x = "Yes"
    if not row['is_correct']:
        x = "No"
    return (
        f"Question: {row['QuestionText']}\n"
        f"Answer: {row['MC_Answer']}\n"
        f"Is Correct Answer: {x}\n"
        f"Student Explanation: {row['StudentExplanation']}"
    )

train['text'] = train.apply(format_input,axis=1)
print("Example prompt for our LLM:")
print()
print( train.text.values[0] )

# Split into train and validation sets
train_df, val_df = train_test_split(train, test_size=0.2, random_state=42)

# Convert to Hugging Face Dataset
COLS = ['text','label']
train_ds = Dataset.from_pandas(train_df[COLS])
val_ds = Dataset.from_pandas(val_df[COLS])

# Tokenization function
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=256)

train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)

# Set format for PyTorch
columns = ['input_ids', 'attention_mask', 'label']
train_ds.set_format(type='torch', columns=columns)
val_ds.set_format(type='torch', columns=columns)

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BitsAndBytesConfig
import torch

Model_Name = "/kaggle/input/qwen-3/transformers/14b/1" # Main Model
model_name = "/kaggle/input/qwen3-14b-lora-map/results (1)/best" # LoRa adaptors
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

from transformers import AutoTokenizer

# Load tokenizer for the LoRA checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)


model = AutoModelForSequenceClassification.from_pretrained(
    Model_Name, 
    num_labels=n_classes,
    quantization_config=bnb_config,
    device_map="balanced",
    trust_remote_code=True
)

# Resize base model embeddings
model.resize_token_embeddings(len(tokenizer))


from peft import PeftModel
model = PeftModel.from_pretrained(model, model_name)

# Cast LoRA parameters to float16
model = model.to(dtype=torch.float16)

print(next(model.parameters()).dtype)

# Add a new padding token

tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Set the pad token id in the model's config

model.config.pad_token_id = tokenizer.pad_token_id
from transformers import  TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir = f"./{DIR}",
    do_train=True,
    do_eval=True,
    eval_strategy="steps",
    save_strategy="steps", #no for no saving 
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=200,
    eval_steps=200,
    save_total_limit=1,
    metric_for_best_model="map@3",
    greater_is_better=True,
    load_best_model_at_end=True,
    report_to="none",
    bf16=False, # TRAIN WITH BF16 IF LOCAL GPU IS NEWER GPU          
    fp16=True, # INFER WITH FP16 BECAUSE KAGGLE IS T4 GPU
)
# CUSTOM MAP@3 METRIC

from sklearn.metrics import average_precision_score

def compute_map3(eval_pred):
    logits, labels = eval_pred
    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
    
    top3 = np.argsort(-probs, axis=1)[:, :3]  # Top 3 predictions
    match = (top3 == labels[:, None])

    # Compute MAP@3 manually
    map3 = 0
    for i in range(len(labels)):
        if match[i, 0]:
            map3 += 1.0
        elif match[i, 1]:
            map3 += 1.0 / 2
        elif match[i, 2]:
            map3 += 1.0 / 3
    return {"map@3": map3 / len(labels)}
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    processing_class=tokenizer,
    compute_metrics=compute_map3,
)

# Load test set
test = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/test.csv')
test = test.merge(correct, on=['QuestionId','MC_Answer'], how='left')
test.is_correct = test.is_correct.fillna(0)
test['text'] = test.apply(format_input, axis=1)

# HuggingFace Dataset
ds_test = Dataset.from_pandas(test[['text']])
ds_test = ds_test.map(tokenize, batched=True)

# Run inference
pred_output = trainer.predict(ds_test)
logits = pred_output.predictions
probs = torch.nn.functional.softmax(torch.tensor(logits), dim=1).numpy()

# ---- Top-3 predictions ----
top3 = np.argsort(-probs, axis=1)[:, :3]   # shape: [num_samples, 3]

# Decode numeric class indices to original string labels
flat_top3 = top3.flatten()
decoded_labels = le.inverse_transform(flat_top3)
top_labels = decoded_labels.reshape(top3.shape)

# Join 3 labels per row with space
joined_preds = [" ".join(row) for row in top_labels]

# Save Top-3 submission
sub = pd.DataFrame({
    "row_id": test.row_id.values,
    "Category:Misconception": joined_preds
})
sub.to_csv("submission_qwen3_14b.csv", index=False)

# ---- Save Top-25 probability table (for ensembling) ----
prob_data = []
top_k = 25
top_indices = np.argsort(-probs, axis=1)

for i in range(len(logits)):
    prob_dict = {f"prob_{j}": probs[i, top_indices[i, j]] for j in range(top_k)}
    prob_dict["row_id"] = test.row_id.values[i]
    prob_dict["top_classes"] = " ".join(
        le.inverse_transform(top_indices[i, :top_k])
    )
    prob_data.append(prob_dict)

prob_df = pd.DataFrame(prob_data)
prob_df.to_csv("submission_qwen3_14b_prob.csv", index=False)

Writing qwen3_14B.py


# qwen3 deepseek

In [6]:
%%writefile qwen3_deepseek_inference.py

# we do parallel inference, for deepseek and qwen3
import os
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
import threading
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding
from scipy.special import softmax
from tqdm import tqdm
import time

os.environ["TOKENIZERS_PARALLELISM"] = "false"


train = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/train.csv')
test  = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/test.csv')

model_paths = [
    "/kaggle/input/deekseepmath-7b-map-competition/MAP_EXP_09_FULL",
   "/kaggle/input/qwen3-8b-map-competition/MAP_EXP_16_FULL"]

def format_input(row):
    x = "This answer is correct."
    if not row['is_correct']:
        x = "This is answer is incorrect."
    return (
        f"Question: {row['QuestionText']}\n"
        f"Answer: {row['MC_Answer']}\n"
        f"{x}\n"
        f"Student Explanation: {row['StudentExplanation']}")


le = LabelEncoder()
train.Misconception  = train.Misconception.fillna('NA')
train['target']   = train.Category + ':' +train.Misconception
train['label']    = le.fit_transform(train['target'])

n_classes = len(le.classes_)
print(f"Train shape: {train.shape} with {n_classes} target classes")
idx = train.apply(lambda row: row.Category.split('_')[0],axis=1)=='True'
correct = train.loc[idx].copy()
correct['c'] = correct.groupby(['QuestionId','MC_Answer']).MC_Answer.transform('count')
correct = correct.sort_values('c',ascending=False)
correct = correct.drop_duplicates(['QuestionId'])
correct = correct[['QuestionId','MC_Answer']]
correct['is_correct'] = 1

test = test.merge(correct, on=['QuestionId','MC_Answer'], how='left')
test.is_correct = test.is_correct.fillna(0)
test['text'] = test.apply(format_input,axis=1)
ds_test = Dataset.from_pandas(test)


def run_inference_on_gpu(model_path, gpu_id, test_data, output_name):
    """Run inference for one model on one GPU"""
    
    device = f"cuda:{gpu_id}"
    print(f"Loading {output_name} on {device}...")
    
    # Load model
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path, 
        device_map=device, 
        dtype=torch.float16
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model.config.pad_token_id = tokenizer.pad_token_id
    model.eval()
    
    # Tokenize function
    def tokenize(batch):
        return tokenizer(batch["text"], 
                        truncation=True,
                        max_length=256)
    
    ds_test = Dataset.from_pandas(test_data[['text']])
    ds_test = ds_test.map(tokenize, batched=True, remove_columns=['text'])
    
    # Data collator
    data_collator = DataCollatorWithPadding(
        tokenizer=tokenizer,
        padding=True,
        return_tensors="pt"
    )
    
    # DataLoader
    dataloader = DataLoader(
        ds_test,
        batch_size=4,
        shuffle=False,
        collate_fn=data_collator,
        pin_memory=True,
        num_workers=0
    )
    
    # Inference
    all_logits = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc=f"{output_name}"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            all_logits.append(outputs.logits.float().cpu().numpy())
    
    predictions = np.concatenate(all_logits, axis=0)
    
    # Process results
    probs = softmax(predictions, axis=1)
    top_indices = np.argsort(-probs, axis=1)
    
    # Decode labels
    flat_indices = top_indices.flatten()
    decoded_labels = le.inverse_transform(flat_indices)
    top_labels = decoded_labels.reshape(top_indices.shape)
    
    # Save top-3 submission
    joined_preds = [" ".join(row[:3]) for row in top_labels]
    sub = pd.DataFrame({
        "row_id": test_data.row_id.values,
        "Category:Misconception": joined_preds
    })
    sub.to_csv(f"submission_{output_name}.csv", index=False)
    
    # Save probabilities for ensemble
    prob_data = []
    for i in range(len(predictions)):
        prob_dict = {f"prob_{j}": probs[i, top_indices[i, j]] for j in range(25)}
        prob_dict['row_id'] = test_data.row_id.values[i]
        prob_dict['top_classes'] = " ".join(top_labels[i, :25])
        prob_data.append(prob_dict)
    
    prob_df = pd.DataFrame(prob_data)
    prob_df.to_csv(f"submission_{output_name}_probabilities.csv", index=False)
    
    print(f" {output_name} completed - saved submission and probabilities")
    
    # Clean up GPU memory
    del model, tokenizer
    torch.cuda.empty_cache()

print(" Starting multi-GPU inference...")
start_time = time.time()

threads = []
gpu_assignments = [
    (model_paths[0], 0, "deepseek"),
    (model_paths[1], 1, "qwen3"),
]

# Start threads
for model_path, gpu_id, name in gpu_assignments:
    if gpu_id < torch.cuda.device_count():  
        thread = threading.Thread(
            target=run_inference_on_gpu,
            args=(model_path, gpu_id, test, name)
        )
        threads.append(thread)
        thread.start()
        time.sleep(10)  # Stagger starts to avoid memory issues

# Wait for completion
for thread in threads:
    thread.join()

end_time = time.time()
print(f" completed in {end_time - start_time:.2f} seconds!")

Writing qwen3_deepseek_inference.py


In [7]:
import time 
!python /kaggle/working/Hunyaun_inference.py
time.sleep(10)
!python /kaggle/working/qwen3_4b_inference.py
time.sleep(10)
!python /kaggle/working/qwen2_8b.py
time.sleep(10)
!python /kaggle/working/qwen3_14B.py
time.sleep(10)
!python /kaggle/working/qwen3_deepseek_inference.py
time.sleep(10)

2025-10-16 17:27:21.018775: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760635641.256103      61 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760635641.325937      61 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████████████| 4/4 [01:11<00:00, 17.80s/it]
Some weights of HunYuanDenseV1ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/hunyuan-7b-instruct-bf16 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

In [8]:
import pandas as pd
import numpy as np
from collections import defaultdict

# -------------------------
# Build family map
# -------------------------
train = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/train.csv')
test_df = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/test.csv')

train['is_true'] = train['Category'].str.startswith('True')
correct = (train[train.is_true]
           .assign(c=lambda df: df.groupby(['QuestionId','MC_Answer']).MC_Answer.transform('count'))
           .sort_values('c', ascending=False)
           .drop_duplicates(['QuestionId'])[['QuestionId','MC_Answer']])
correct['is_correct'] = 1

fam_map = (test_df.merge(correct, on=['QuestionId','MC_Answer'], how='left')
                  .assign(is_correct=lambda df: df.is_correct.fillna(0).astype(int))
                  .set_index('row_id')['is_correct']
                  .map({1: 'True_', 0: 'False_'}).to_dict())

# -------------------------
# Ensemble
# -------------------------
def extract_class_probabilities(row, model_suffix='', top_k=25):
    """Extract class names and probabilities from a row"""
    classes_col = f'top_classes{model_suffix}'
    if classes_col in row:
        classes = row[classes_col].split(' ')[:top_k]
    else:
        return {}
    class_probs = {}
    for i in range(min(top_k, len(classes))):
        prob_col = f'prob_{i}{model_suffix}'
        if prob_col in row:
            class_probs[classes[i]] = row[prob_col]
    return class_probs


def ensemble_with_disagreement_handling(prob_files, model_weights=None, top_k=3):
    n_models = len(prob_files)
    prob_dfs = []
    final_predictions = []
    
    for file_path in prob_files:
        df = pd.read_csv(file_path)
        prob_dfs.append(df)
    
    # Merge on row_id
    merged_df = prob_dfs[0]
    for i, df in enumerate(prob_dfs[1:], 1):
        merged_df = pd.merge(merged_df, df, on='row_id', suffixes=('', f'_model{i+1}'))
      
    for idx, row in merged_df.iterrows():
        pref = fam_map[row['row_id']]  # family for this row
        
        # Extract probabilities from each model
        all_class_probs = []
        for i in range(n_models):
            suffix = f'_model{i+1}' if i > 0 else ''
            class_probs = extract_class_probabilities(row, suffix, top_k=25)
            all_class_probs.append(class_probs)
        
        # Get all unique classes
        all_classes = set()
        for class_probs in all_class_probs:
            all_classes.update(class_probs.keys())
        
        # Calculate scores
        class_votes = defaultdict(int)
        class_total_prob = defaultdict(float)
        class_max_prob = defaultdict(float)
        
        for i, class_probs in enumerate(all_class_probs):
            weight = model_weights[i]
            for class_name, prob in class_probs.items():
                class_votes[class_name] += 1
                class_total_prob[class_name] += prob * weight
                class_max_prob[class_name] = max(class_max_prob[class_name], prob * weight)
        
        final_scores = {}
        for class_name in all_classes:
            base_score = class_total_prob[class_name]
            agreement_bonus = class_votes[class_name] / n_models
            confidence_bonus = class_max_prob[class_name]
            final_scores[class_name] = (
                base_score * 0.34 +
                agreement_bonus * 0.33 +
                confidence_bonus * 0.33
            )
        
        # -------------------------
        # Family filter
        # -------------------------
        final_scores = {k: v for k, v in final_scores.items() if k.startswith(pref)}
        
        # Sort and get top-k
        sorted_classes = sorted(final_scores.items(), key=lambda x: -x[1])
        top_classes = [class_name for class_name, _ in sorted_classes[:top_k]]
        
        # Backfill if < 3
        fillers = [f"{pref}Neither:NA"] + ([f"{pref}Correct:NA"] if pref == "True_" else [])
        for f in fillers:
            if len(top_classes) >= 3: break
            if f not in top_classes:
                top_classes.append(f)
        while len(top_classes) < 3:
            top_classes.append(fillers[0])
        
        final_predictions.append(' '.join(top_classes))
    
    return final_predictions


# -------------------------
# Run ensemble
# -------------------------
weights = [
    1,  # Qwen3 4B LoRA - 0.945 (best performer)
    1,  # Hunyuan - 0.945 (best performer)
    1,  # Qwen2 8B - 0.945 (BF)
    1,  # Qwen3 14B - 0.944 
    1,  # DeepseekMath 7B - 0.944
    1,  # Qwen 8B - 0.943
]

prob_files = [
    '/kaggle/working/submission_qwen3_4B_probabilities.csv',   # 0.945 (Qwen 4B LoRA)
    '/kaggle/working/submission_Hunyaun_prob.csv',             # 0.945 (Hunyuan)
    '/kaggle/working/submission_qwen2_8b_prob.csv',            # 0.945 (Qwen2 8B)
    '/kaggle/working/submission_qwen3_14b_prob.csv',           # 0.944 (Qwen3 14B)
    '/kaggle/working/submission_deepseek_probabilities.csv',   # 0.944 (DeepSeek Math 7B)
    '/kaggle/working/submission_qwen3_probabilities.csv'       # 0.943 (Qwen 3 8B)
]

predictions = ensemble_with_disagreement_handling(
    prob_files, 
    model_weights=weights,  
    top_k=8
)

submission = pd.DataFrame({
    'row_id': test_df.row_id.values,
    'Category:Misconception': predictions
})

submission.to_csv('submission.csv', index=False)
print(submission.head())

   row_id                             Category:Misconception
0   36696  True_Correct:NA True_Neither:NA True_Misconcep...
1   36697  False_Misconception:WNB False_Neither:NA False...
2   36698  True_Neither:NA True_Correct:NA True_Misconcep...
