
# MAP — Charting Student Math Misunderstandings: Baseline (TF‑IDF + Linear)
**Goal:** predict up to **3** `Category:Misconception` labels per row to maximize **MAP@3**.

**Approach:** concatenate problem text, student MC answer, and their explanation → TF‑IDF → linear classifier (SGD `log_loss`).  
We evaluate with **GroupKFold** split by `QuestionId`, so no question appears in both the training and validation folds (this reduces leakage), then train on the full data and create **`submission.csv`**.


In [1]:

import os, sys, gc, math
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.model_selection import GroupKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from scipy import sparse
np.random.seed(42)

# MAP@K metric (K=3 per competition)
def apk(actual, predicted, k=3):
    """Average precision at k for one row that has a single true label.

    Parameters
    ----------
    actual : sequence
        Holds the true label in position 0; an empty sequence scores 0.0.
    predicted : sequence
        Candidate labels ordered from most to least confident.
    k : int, default 3
        Only the first ``k`` predictions are considered.

    Returns
    -------
    float
        ``1 / rank`` of the first prediction equal to ``actual[0]`` (rank is
        1-based), or 0.0 when none of the considered predictions match.
    """
    if not actual:
        return 0.0
    # Slicing beyond the end of the sequence is harmless, so no length check is needed.
    for rank, label in enumerate(predicted[:k], start=1):
        if label == actual[0]:
            # Only one relevant label per row: the first hit decides the score.
            return 1.0 / rank
    return 0.0

def mapk(actual_list, predicted_list, k=3):
    """Mean of the per-row ``apk`` scores.

    ``actual_list`` holds one true label per row and ``predicted_list`` holds
    the matching ordered predictions; rows are paired positionally with
    ``zip``. Each true label is wrapped in a one-element list before being
    passed to ``apk``.
    """
    per_row_scores = []
    for truth, ranked in zip(actual_list, predicted_list):
        per_row_scores.append(apk([truth], ranked, k))
    return np.mean(per_row_scores)

def topk_from_proba(class_labels, proba_row, k=3):
    """Return the labels of the ``k`` highest-scoring classes, best first.

    ``proba_row`` is a numeric array with one score per entry of
    ``class_labels``; positions are ranked by descending score and mapped back
    to their labels.
    """
    ranked_positions = np.argsort(-proba_row)
    picked = []
    for pos in ranked_positions[:k]:
        picked.append(class_labels[pos])
    return picked

print("Libraries loaded.")

Libraries loaded.


In [None]:
# Resolve the dataset directory: the Kaggle mount is preferred, then the
# relative ../input layout, and finally a local 'data' folder.
KAGGLE_INPUT = '/kaggle/input/map-charting-student-math-misunderstandings'
LOCAL_INPUT  = '../input/map-charting-student-math-misunderstandings'
HERE = os.getcwd()

# First existing candidate wins; 'data' is used when neither path exists
# (for local testing with provided CSVs).
DATA_DIR = next(
    (candidate for candidate in (KAGGLE_INPUT, LOCAL_INPUT) if os.path.exists(candidate)),
    'data',
)

print('DATA_DIR =', DATA_DIR)

train_csv = os.path.join(DATA_DIR, "train.csv")
test_csv = os.path.join(DATA_DIR, "test.csv")
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)
print(train.shape, test.shape)
train.head(2)

(36696, 7) (3, 5)


Unnamed: 0,row_id,QuestionId,QuestionText,MC_Answer,StudentExplanation,Category,Misconception
0,0,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),0ne third is equal to tree nineth,True_Correct,
1,1,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 / 3 because 6 over 9 is 2 thirds and 1 third...,True_Correct,


In [3]:

# Collapse Category and Misconception into one "Category:Misconception" label.
# Missing misconceptions are replaced by the literal 'NA' before joining.
train['Misconception'] = train['Misconception'].fillna('NA')
category_str = train['Category'].astype(str)
misconception_str = train['Misconception'].astype(str)
train['target'] = category_str + ':' + misconception_str

# Text features: concatenate question, MC answer, and explanation
def combine_text(df):
    """Build one text string per row from question, MC answer and explanation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``QuestionText``, ``MC_Answer`` and
        ``StudentExplanation``.

    Returns
    -------
    pandas.Series
        ``"<QuestionText> [MC] <MC_Answer> [EXPL] <StudentExplanation>"`` for
        each row, with missing values rendered as empty strings.
    """
    def _as_text(column):
        # fillna first so missing values become '' rather than the string 'nan';
        # the cast keeps concatenation working when a column was parsed as numeric.
        return df[column].fillna('').astype(str)

    return (
        _as_text('QuestionText') + ' [MC] ' +
        _as_text('MC_Answer') + ' [EXPL] ' +
        _as_text('StudentExplanation')
    )

# Model input text for the training and test frames
X_text = combine_text(train)
X_test_text = combine_text(test)

# Labels and the grouping key used for the cross-validation split
y = train['target'].values
groups = train['QuestionId'].values

n_classes = len(np.unique(y))
first_text = X_text.iloc[0]
print(f'Unique classes: {n_classes}')
print(f'Example label: {y[0]}')
print(f'Text example: {first_text[:300]}')

Unique classes: 65
Example label: True_Correct:NA
Text example: What fraction of the shape is not shaded? Give your answer in its simplest form. [Image: A triangle split into 9 equal smaller triangles. 6 of them are shaded.] [MC] \( \frac{1}{3} \) [EXPL] 0ne third is equal to tree nineth


In [4]:

# Baseline model, built from two named steps:
# - 'tfidf': word-level TF-IDF over unigrams and bigrams
# - 'clf':   SGDClassifier with log_loss, which exposes predict_proba
tfidf_words = TfidfVectorizer(
    lowercase=True,
    strip_accents='unicode',
    ngram_range=(1, 2),
    max_features=250_000,
    min_df=2,
)
sgd_logreg = SGDClassifier(
    loss='log_loss',
    penalty='l2',
    alpha=1e-5,
    max_iter=10_000,
    tol=1e-4,
    random_state=42,
)
pipeline = Pipeline(steps=[('tfidf', tfidf_words), ('clf', sgd_logreg)])

pipeline

In [5]:

N_FOLDS = 5
gkf = GroupKFold(n_splits=N_FOLDS)

# Out-of-fold truths and top-3 predictions, appended in the same row order
oof_true = []
oof_pred_topk = []

for fold, (tr_idx, va_idx) in enumerate(gkf.split(X_text, y, groups), 1):
    X_tr, X_va = X_text.iloc[tr_idx], X_text.iloc[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]

    # Fit an unfitted copy per fold. Assigning `pipeline` directly would make
    # every fold (and any later fit) share and overwrite one estimator object.
    model = clone(pipeline)
    model.fit(X_tr, y_tr)

    # Predict probabilities for validation; columns follow clf.classes_
    proba = model.predict_proba(X_va)
    classes = model.named_steps['clf'].classes_
    pred_topk = [topk_from_proba(classes, row, k=3) for row in proba]

    oof_true.extend(list(y_va))
    oof_pred_topk.extend(pred_topk)

    fold_map3 = mapk(list(y_va), pred_topk, k=3)
    print(f"Fold {fold} MAP@3: {fold_map3:.5f}")
    gc.collect()

# Score over all out-of-fold rows pooled together (not the mean of fold scores)
cv_map3 = mapk(oof_true, oof_pred_topk, k=3)
print(f"\nCV MAP@3 (mean over out-of-fold): {cv_map3:.5f}")

Fold 1 MAP@3: 0.56883
Fold 2 MAP@3: 0.29268
Fold 3 MAP@3: 0.57124
Fold 4 MAP@3: 0.52021
Fold 5 MAP@3: 0.50664

CV MAP@3 (mean over out-of-fold): 0.49453


In [6]:

# Train on all labelled rows using an unfitted copy of the pipeline, so the
# final model is a separate object from `pipeline`.
final_model = clone(pipeline)
final_model.fit(X_text, y)

test_proba = final_model.predict_proba(X_test_text)
classes = final_model.named_steps['clf'].classes_

top3 = [topk_from_proba(classes, row, k=3) for row in test_proba]

# Take row_id from the test frame itself so the ids stay correct whatever rows
# the test file contains. The positional offset is kept only as a fallback for
# a test frame that has no row_id column.
if 'row_id' in test.columns:
    row_ids = test['row_id'].values
else:
    row_ids = test.index + 36696

# Build submission: one row per test row, top-3 labels separated by spaces
sub = pd.DataFrame({
    'row_id': row_ids,
    'Category:Misconception': [' '.join(t) for t in top3]
})
sub.head()

Unnamed: 0,row_id,Category:Misconception
0,36696,True_Correct:NA True_Neither:NA False_Neither:NA
1,36697,False_Misconception:Incomplete False_Misconcep...
2,36698,True_Neither:NA True_Correct:NA False_Neither:NA


In [7]:

# Use the provided sample_submission to ensure exact row_id ordering
sample_path = os.path.join(DATA_DIR, 'sample_submission.csv')
if os.path.exists(sample_path):
    sample = pd.read_csv(sample_path)
    if 'row_id' in sample.columns:
        # Left merge keeps the sample's row order; ids without a prediction come back as NaN
        sub = sample[['row_id']].merge(sub, on='row_id', how='left')
        n_unmatched = int(sub['Category:Misconception'].isna().sum())
        if n_unmatched:
            # Report the gap instead of hiding it behind the default label
            print(f'WARNING: {n_unmatched} row_id(s) had no prediction; filling with default label.')
        default_label = 'True_Correct:NA'
        sub['Category:Misconception'] = sub['Category:Misconception'].fillna(default_label)
        print('Aligned with sample_submission.')
    else:
        print("sample_submission.csv has no 'row_id' column; keeping submission as built.")
else:
    print('sample_submission.csv not found; using generated row_id sequence.')

sub.head(3)

Aligned with sample_submission.


Unnamed: 0,row_id,Category:Misconception
0,36696,True_Correct:NA True_Neither:NA False_Neither:NA
1,36697,False_Misconception:Incomplete False_Misconcep...
2,36698,True_Neither:NA True_Correct:NA False_Neither:NA


In [8]:

SUB_PATH = 'submission.csv'
# index=False: write only the frame's columns, not the pandas index
sub.to_csv(SUB_PATH, index=False)
saved_to = os.path.abspath(SUB_PATH)
print('Saved:', saved_to)
sub.head()

Saved: /home/martin/Workspace/map-charting-student-math-misunderstandings/submission.csv


Unnamed: 0,row_id,Category:Misconception
0,36696,True_Correct:NA True_Neither:NA False_Neither:NA
1,36697,False_Misconception:Incomplete False_Misconcep...
2,36698,True_Neither:NA True_Correct:NA False_Neither:NA



## Notes & Next Steps
- This is a **clean baseline**; strong gains often come from:
  - Adding **character-level TF‑IDF** features (e.g., char 3–5 grams) and stacking with word TF‑IDF.
  - Using **class-weighting** to handle long-tail misconception labels.
  - Ensembling multiple linear models or adding **lightweight neural** encoders (e.g., MiniLM) with pooling.
  - Normalizing math expressions / symbols; including `QuestionId`‑aware features.
- Keep runtime low and avoid leakage (we grouped by `QuestionId` for CV).
- Submit this notebook as **GPU/CPU (no internet)**. The output `submission.csv` is in the working directory.
