In [1]:
import numpy as np
import polars as pl
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
import sklearn.metrics
import lightgbm as lgb
from scipy.sparse import hstack

# Load the competition data.
train = pl.read_csv("/kaggle/input/map-charting-student-math-misunderstandings/train.csv")
test = pl.read_csv("/kaggle/input/map-charting-student-math-misunderstandings/test.csv")

# Misconception preprocessing: a missing misconception becomes the literal label 'NA'.
train = train.with_columns([
    pl.col('Misconception').fill_null('NA').cast(pl.Utf8).alias('Misconception')
])

# target_cat is the combined "Category:Misconception" label.
train = train.with_columns([
    (pl.col('Category') + ":" + pl.col('Misconception')).alias('target_cat')
])

# Category -> integer id, most frequent first.
# value_counts() does not guarantee an output order, so ties on 'count' are
# broken by the label itself to keep the ids identical from run to run.
category_counts = train['Category'].value_counts().sort(
    ['count', 'Category'], descending=[True, False]
)
map_target1 = {row['Category']: idx for idx, row in enumerate(category_counts.iter_rows(named=True))}

# Misconception -> integer id, most frequent first (same deterministic tie-break).
misconception_counts = train['Misconception'].value_counts().sort(
    ['count', 'Misconception'], descending=[True, False]
)
map_target2 = {row['Misconception']: idx for idx, row in enumerate(misconception_counts.iter_rows(named=True))}

# Integer-encoded targets; labels absent from a mapping would be encoded as -1.
train = train.with_columns([
    pl.col('Category').map_elements(lambda x: map_target1.get(x, -1), return_dtype=pl.Int64).alias('target1'),
    pl.col('Misconception').map_elements(lambda x: map_target2.get(x, -1), return_dtype=pl.Int64).alias('target2')
])

# Sentence construction
def create_sentence(row):
    """Render one record as the three-line text fed to the vectorizers.

    Args:
        row: Mapping with the keys 'QuestionText', 'MC_Answer' and
            'StudentExplanation'.

    Returns:
        A string of the form
        "Question: ...\\nAnswer: ...\\nExplanation: ...".
    """
    question = row['QuestionText']
    answer = row['MC_Answer']
    explanation = row['StudentExplanation']
    return f"Question: {question}\nAnswer: {answer}\nExplanation: {explanation}"

# One shared expression builds the 'sentence' column, so train and test text
# are assembled in exactly the same way.
sentence_expr = (
    pl.struct(['QuestionText', 'MC_Answer', 'StudentExplanation'])
    .map_elements(create_sentence, return_dtype=pl.Utf8)
    .alias('sentence')
)

train = train.with_columns([sentence_expr])
test = test.with_columns([sentence_expr])

# Build several TF-IDF feature sets.
print("Creating TF-IDF features...")

# Convert the text columns to pandas once and reuse them for every transform.
train_sentences = train['sentence'].to_pandas()
test_sentences = test['sentence'].to_pandas()

# The vectorizers are fitted on train and test text together.
all_sentences = pd.concat([
    train.select('sentence').to_pandas(),
    test.select('sentence').to_pandas()
])

# TF-IDF 1: word n-grams, ngram_range=(1, 3)
tfidf1 = TfidfVectorizer(stop_words='english', ngram_range=(1, 3), analyzer='word',
                         max_df=0.95, min_df=2, max_features=10000)
tfidf1.fit(all_sentences['sentence'])
train_tfidf1 = tfidf1.transform(train_sentences)
test_tfidf1 = tfidf1.transform(test_sentences)

# TF-IDF 2: character n-grams, ngram_range=(4, 6)
# stop_words is deliberately not passed: scikit-learn only applies it when
# analyzer='word', so it had no effect on the character features.
tfidf2 = TfidfVectorizer(ngram_range=(4, 6), analyzer='char',
                         max_df=0.95, min_df=2, max_features=5000)
tfidf2.fit(all_sentences['sentence'])
train_tfidf2 = tfidf2.transform(train_sentences)
test_tfidf2 = tfidf2.transform(test_sentences)

# Concatenate the feature sets. CSR is requested explicitly because the
# cross-validation loops select rows by index, which COO output does not support.
train_embeddings = hstack([train_tfidf1, train_tfidf2], format='csr')
test_embeddings = hstack([test_tfidf1, test_tfidf2], format='csr')
print(f'Combined train sparse shape: {train_embeddings.shape}')
print(f'Combined test sparse shape: {test_embeddings.shape}')

# Target Category training with a Logistic Regression + LightGBM ensemble
print("\nTraining Category models...")
# ytrain1_* hold out-of-fold probabilities for the training rows;
# ytest1_* accumulate the fold-averaged probabilities for the test rows.
ytrain1_lr = np.zeros((len(train), len(map_target1)))
ytrain1_lgb = np.zeros((len(train), len(map_target1)))
ytest1_lr = np.zeros((len(test), len(map_target1)))
ytest1_lgb = np.zeros((len(test), len(map_target1)))

train_target1 = train['target1'].to_numpy()

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# Take the averaging divisor from the splitter so it cannot drift from n_splits.
n_splits = skf.get_n_splits()
for i, (train_index, valid_index) in enumerate(skf.split(train_embeddings, train_target1)):
    print(f"Category Fold {i}, {len(train_index)}, {len(valid_index)}:")

    # Slice the sparse matrix once per fold and share the slices between models.
    X_fit, y_fit = train_embeddings[train_index], train_target1[train_index]
    X_valid, y_valid = train_embeddings[valid_index], train_target1[valid_index]

    # Logistic Regression
    lr_model = LogisticRegression(max_iter=1000, C=1.0, random_state=42)
    lr_model.fit(X_fit, y_fit)
    ytrain1_lr[valid_index] = lr_model.predict_proba(X_valid)
    ytest1_lr += lr_model.predict_proba(test_embeddings) / n_splits

    # LightGBM, with early stopping monitored on the validation fold
    lgb_model = lgb.LGBMClassifier(
        n_estimators=100,
        learning_rate=0.1,
        num_leaves=31,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )
    lgb_model.fit(X_fit, y_fit,
                  eval_set=[(X_valid, y_valid)],
                  callbacks=[lgb.early_stopping(10), lgb.log_evaluation(0)])
    ytrain1_lgb[valid_index] = lgb_model.predict_proba(X_valid)
    ytest1_lgb += lgb_model.predict_proba(test_embeddings) / n_splits

# Ensemble (weighted average of the two models)
ytrain1 = 0.6 * ytrain1_lr + 0.4 * ytrain1_lgb
ytest1 = 0.6 * ytest1_lr + 0.4 * ytest1_lgb

print("Category ACC:", np.mean(train_target1 == np.argmax(ytrain1, 1)))
print("Category F1:", sklearn.metrics.f1_score(train_target1, np.argmax(ytrain1, 1), average='weighted'))

# Target Misconception training with a Logistic Regression + LightGBM ensemble
# Imported once here instead of on every fold iteration.
from sklearn.utils.class_weight import compute_sample_weight

print("\nTraining Misconception models...")

# Misconception features come from a separately configured TF-IDF.
tfidf_misc = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), analyzer='word',
                             max_df=0.90, min_df=2, max_features=15000)
tfidf_misc.fit(all_sentences['sentence'])
train_embeddings_misc = tfidf_misc.transform(train['sentence'].to_pandas())
test_embeddings_misc = tfidf_misc.transform(test['sentence'].to_pandas())

# ytrain2_* hold out-of-fold probabilities for the training rows;
# ytest2_* accumulate the fold-averaged probabilities for the test rows.
ytrain2_lr = np.zeros((len(train), len(map_target2)))
ytrain2_lgb = np.zeros((len(train), len(map_target2)))
ytest2_lr = np.zeros((len(test), len(map_target2)))
ytest2_lgb = np.zeros((len(test), len(map_target2)))

train_target2 = train['target2'].to_numpy()

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# Take the averaging divisor from the splitter so it cannot drift from n_splits.
n_splits = skf.get_n_splits()
for i, (train_index, valid_index) in enumerate(skf.split(train_embeddings_misc, train_target2)):
    print(f"Misconception Fold {i}, {len(train_index)}, {len(valid_index)}:")

    # Slice the sparse matrix once per fold and share the slices between models.
    X_fit, y_fit = train_embeddings_misc[train_index], train_target2[train_index]
    X_valid, y_valid = train_embeddings_misc[valid_index], train_target2[valid_index]

    # Logistic Regression with balanced class weights
    lr_model = LogisticRegression(class_weight='balanced', max_iter=1000, C=0.5, random_state=42)
    lr_model.fit(X_fit, y_fit)
    ytrain2_lr[valid_index] = lr_model.predict_proba(X_valid)
    ytest2_lr += lr_model.predict_proba(test_embeddings_misc) / n_splits

    # LightGBM
    # Per-sample weights that balance the classes of this fold's training split.
    sample_weights = compute_sample_weight('balanced', y_fit)

    lgb_model = lgb.LGBMClassifier(
        n_estimators=150,
        learning_rate=0.05,
        num_leaves=50,
        max_depth=8,
        min_child_samples=20,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )
    lgb_model.fit(X_fit, y_fit,
                  sample_weight=sample_weights,
                  eval_set=[(X_valid, y_valid)],
                  callbacks=[lgb.early_stopping(15), lgb.log_evaluation(0)])
    ytrain2_lgb[valid_index] = lgb_model.predict_proba(X_valid)
    ytest2_lgb += lgb_model.predict_proba(test_embeddings_misc) / n_splits

# Ensemble (weighted average of the two models)
ytrain2 = 0.7 * ytrain2_lr + 0.3 * ytrain2_lgb
ytest2 = 0.7 * ytest2_lr + 0.3 * ytest2_lgb

print("Misconception ACC:", np.mean(train_target2 == np.argmax(ytrain2, 1)))
print("Misconception F1:", sklearn.metrics.f1_score(train_target2, np.argmax(ytrain2, 1), average='weighted'))

# Inverse mappings: integer id -> label
map_inverse1 = {v: k for k, v in map_target1.items()}
map_inverse2 = {v: k for k, v in map_target2.items()}

# Build the top-3 predictions from the out-of-fold probabilities.
# Zero the probability of the 'NA' misconception so it is never ranked. Its
# column is looked up by label rather than assumed to be column 0, which only
# holds when 'NA' is the most frequent misconception.
na_index = map_target2.get('NA')
if na_index is not None:
    ytrain2[:, na_index] = 0
predicted1 = np.argsort(-ytrain1, 1)[:, :3]
predicted2 = np.argsort(-ytrain2, 1)[:, :3]

# NOTE(review): the j-th ranked category is paired with the j-th ranked
# misconception, so a Misconception category ranked third receives the
# third-best misconception. Confirm this pairing is intended.
predict = []
for cat_ids, misc_ids in zip(predicted1, predicted2):
    pred = []
    for cat_id, misc_id in zip(cat_ids, misc_ids):
        p1 = map_inverse1[cat_id]
        p2 = map_inverse2[misc_id]
        if 'Misconception' in p1:
            pred.append(p1 + ":" + p2)
        else:
            # Non-misconception categories always carry the 'NA' misconception.
            pred.append(p1 + ":NA")
    predict.append(pred)

# Accuracy evaluation: share of rows whose true combined label equals the
# prediction in each of the three ranked slots.
train_target_cat = train['target_cat'].to_list()
print("\nValidation Results:")
for slot, label in enumerate(("Acc@1:", "Acc@2:", "Acc@3:")):
    hits = [truth == ranked[slot] for truth, ranked in zip(train_target_cat, predict)]
    print(label, np.mean(hits))

def map3(target_list, pred_list):
    """Compute mean average precision at 3 (MAP@3).

    Each row scores 1, 1/2 or 1/3 when its target equals the first, second or
    third prediction respectively, and 0 otherwise. Only the first matching
    slot counts, and predictions beyond the third are ignored.

    Args:
        target_list: Sequence of true labels.
        pred_list: Sequence of ranked prediction sequences, aligned with
            ``target_list``. A row may hold fewer than three predictions.

    Returns:
        The mean row score as a float; 0.0 when ``target_list`` is empty.
    """
    # Guard against dividing by zero on empty input.
    if len(target_list) == 0:
        return 0.0

    score = 0.
    for t, p in zip(target_list, pred_list):
        # Slicing tolerates rows with fewer than three predictions.
        for rank, candidate in enumerate(p[:3]):
            if t == candidate:
                score += 1. / (rank + 1)
                break
    return score / len(target_list)

# Report MAP@3 of the out-of-fold predictions against the true combined labels.
oof_map3 = map3(train_target_cat, predict)
print(f"MAP@3: {oof_map3}")

# Predictions for the test data.
# Zero the probability of the 'NA' misconception so it is never ranked. Its
# column is looked up by label rather than assumed to be column 0, which only
# holds when 'NA' is the most frequent misconception.
na_index = map_target2.get('NA')
if na_index is not None:
    ytest2[:, na_index] = 0
predicted1 = np.argsort(-ytest1, 1)[:, :3]
predicted2 = np.argsort(-ytest2, 1)[:, :3]

# NOTE(review): the j-th ranked category is paired with the j-th ranked
# misconception, so a Misconception category ranked third receives the
# third-best misconception. Confirm this pairing is intended.
predict = []
for cat_ids, misc_ids in zip(predicted1, predicted2):
    pred = []
    for cat_id, misc_id in zip(cat_ids, misc_ids):
        p1 = map_inverse1[cat_id]
        p2 = map_inverse2[misc_id]
        if 'Misconception' in p1:
            pred.append(p1 + ":" + p2)
        else:
            # Non-misconception categories always carry the 'NA' misconception.
            pred.append(p1 + ":NA")
    # Each test row becomes one space-separated string of its labels.
    predict.append(" ".join(pred))

# Create the submission file: take the sample submission as the template,
# replace its prediction column and write the result to disk.
sample_submission = pl.read_csv("/kaggle/input/map-charting-student-math-misunderstandings/sample_submission.csv")
prediction_column = pl.Series('Category:Misconception', predict)
sub = sample_submission.with_columns(prediction_column)
sub.write_csv("submission.csv")

print("\nSubmission file created successfully!")

Creating TF-IDF features...




Combined train sparse shape: (36696, 15000)
Combined test sparse shape: (3, 15000)

Training Category models...
Category Fold 0, 33026, 3670:
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's multi_logloss: 0.439205
Category Fold 1, 33026, 3670:
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's multi_logloss: 0.445909
Category Fold 2, 33026, 3670:
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's multi_logloss: 0.413911
Category Fold 3, 33026, 3670:
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's multi_logloss: 0.428271
Category Fold 4, 33026, 3670:
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's multi_logloss: 0.



Misconception Fold 0, 33026, 3670:
Training until validation scores don't improve for 15 rounds
Did not meet early stopping. Best iteration is:
[150]	valid_0's multi_logloss: 0.736206
Misconception Fold 1, 33026, 3670:
Training until validation scores don't improve for 15 rounds
Did not meet early stopping. Best iteration is:
[150]	valid_0's multi_logloss: 0.726358
Misconception Fold 2, 33026, 3670:
Training until validation scores don't improve for 15 rounds
Did not meet early stopping. Best iteration is:
[150]	valid_0's multi_logloss: 0.735752
Misconception Fold 3, 33026, 3670:
Training until validation scores don't improve for 15 rounds
Did not meet early stopping. Best iteration is:
[150]	valid_0's multi_logloss: 0.719123
Misconception Fold 4, 33026, 3670:
Training until validation scores don't improve for 15 rounds
Did not meet early stopping. Best iteration is:
[150]	valid_0's multi_logloss: 0.746753
Misconception Fold 5, 33026, 3670:
Training until validation scores don't improv

In [2]:
# Display the submission frame as a final visual check of the written predictions.
sub

row_id,Category:Misconception
i64,str
36696,"""True_Correct:NA True_Neither:N…"
36697,"""False_Misconception:Incomplete…"
36698,"""True_Neither:NA True_Correct:N…"
