In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/map-charting-student-math-misunderstandings/sample_submission.csv
/kaggle/input/map-charting-student-math-misunderstandings/train.csv
/kaggle/input/map-charting-student-math-misunderstandings/test.csv


In [2]:
# Load the training data
train_df = pd.read_csv("/kaggle/input/map-charting-student-math-misunderstandings/train.csv")
train_df.head()

Unnamed: 0,row_id,QuestionId,QuestionText,MC_Answer,StudentExplanation,Category,Misconception
0,0,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),0ne third is equal to tree nineth,True_Correct,
1,1,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 / 3 because 6 over 9 is 2 thirds and 1 third...,True_Correct,
2,2,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),"1 3rd is half of 3 6th, so it is simplee to un...",True_Neither,
3,3,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 goes into everything and 3 goes into nine,True_Neither,
4,4,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 out of every 3 isn't coloured,True_Correct,


In [3]:
test_df = pd.read_csv("/kaggle/input/map-charting-student-math-misunderstandings/test.csv")
print(train_df.shape)
print(test_df.shape)

(36696, 7)
(3, 5)


In [9]:
train_texts = set(train_df['QuestionText'] + train_df['MC_Answer'] + train_df['StudentExplanation'])
val_texts = set(test_df['QuestionText'] + test_df['MC_Answer'] + test_df['StudentExplanation'])

overlap = train_texts.intersection(val_texts)
print(f"Overlap between train and val: {len(overlap)}")


Overlap between train and val: 0


In [20]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import label_ranking_average_precision_score

# Combine text fields
def combine_text(row):
    return f"{row['QuestionText']} [SEP] {row['MC_Answer']} [SEP] {row['StudentExplanation']}"

train_df['text'] = train_df.apply(combine_text, axis=1)
test_df['text'] = test_df.apply(combine_text, axis=1)

# Encode labels
le = LabelEncoder()
train_df['label_enc'] = le.fit_transform(train_df['Category'])
all_classes = le.classes_
n_classes = len(all_classes)

# Vectorize
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1, 2), stop_words='english')
X = tfidf.fit_transform(train_df['text'])
X_test = tfidf.transform(test_df['text'])
y = train_df['label_enc'].values

# Classifier
clf = OneVsRestClassifier(
    LogisticRegression(max_iter=1000, solver='saga'),
    n_jobs=-1
)

# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
map_scores = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    print(f"\nFold {fold + 1}")
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    clf.fit(X_train, y_train)
    
    y_proba_partial = clf.predict_proba(X_val)
    y_proba = np.zeros((X_val.shape[0], n_classes))
    
    # Align predictions with label encoder
    for i, class_label in enumerate(clf.classes_):
        y_proba[:, class_label] = y_proba_partial[:, i]

    y_val_bin = np.zeros_like(y_proba)
    y_val_bin[np.arange(len(y_val)), y_val] = 1

    score = label_ranking_average_precision_score(y_val_bin, y_proba)
    map_scores.append(score)
    print(f"MAP@3 score: {score:.4f}")

print(f"\nAverage CV MAP@3: {np.mean(map_scores):.4f}")

# Fit on all training data
clf.fit(X, y)

# Predict on test set
test_proba_partial = clf.predict_proba(X_test)
test_proba = np.zeros((X_test.shape[0], n_classes))
for i, class_label in enumerate(clf.classes_):
    test_proba[:, class_label] = test_proba_partial[:, i]

# Get top-3 predictions
top_3_indices = np.argsort(test_proba, axis=1)[:, -3:][:, ::-1]
top_3_labels = le.inverse_transform(top_3_indices.flatten()).reshape(top_3_indices.shape)

# Build submission
submission = pd.DataFrame({
    "ID": test_df["row_id"],
    "Predicted": [" ".join(row) for row in top_3_labels]
})

submission.to_csv("submission.csv", index=False)
print("Submission saved as submission.csv")



Fold 1
MAP@3 score: 0.8453

Fold 2
MAP@3 score: 0.8480

Fold 3
MAP@3 score: 0.8431

Fold 4
MAP@3 score: 0.8466

Fold 5
MAP@3 score: 0.8467

Average CV MAP@3: 0.8460
Submission saved as submission.csv
