In [1]:
!pip install /kaggle/input/mathllm-packages/sentence_transformers-3.2.1-py3-none-any.whl
!pip install /kaggle/input/mathllm-packages/peft-0.13.2-py3-none-any.whl
!pip install /kaggle/input/mathllm-packages/FlagEmbedding-1.2.11-py3-none-any.whl

Processing /kaggle/input/mathllm-packages/sentence_transformers-3.2.1-py3-none-any.whl
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-3.2.1
Processing /kaggle/input/mathllm-packages/peft-0.13.2-py3-none-any.whl
Installing collected packages: peft
Successfully installed peft-0.13.2
Processing /kaggle/input/mathllm-packages/FlagEmbedding-1.2.11-py3-none-any.whl
Installing collected packages: FlagEmbedding
Successfully installed FlagEmbedding-1.2.11


In [2]:
import os, re, json
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.auto import tqdm
from transformers import AutoTokenizer

# Registers tqdm's progress-bar integration with pandas (the `progress_*` methods).
# NOTE(review): no `progress_*` call appears in the cells shown here — confirm this is still needed.
tqdm.pandas()
# Sets the TOKENIZERS_PARALLELISM environment variable to 'false' for this process.
# Presumably this silences the Hugging Face tokenizers fork/parallelism warning — TODO confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [None]:
# Location of the embedding model that is handed to FlagModel further down,
# and the directory holding the competition CSV files.
model_id = '/kaggle/input/eedi-train-finetune-bge-embedding-model/eedi_model'
comp_dir = '/kaggle/input/eedi-mining-misconceptions-in-mathematics'

test = pd.read_csv(f'{comp_dir}/test.csv')
misconceptions = pd.read_csv(f'{comp_dir}/misconception_mapping.csv')

# Question context shared by every answer option of a question:
# subject, construct and question text joined by the " ### " separator.
field_separator = " ### "
test["AllQuestionText"] = (
    test["SubjectName"]
    + field_separator
    + test["ConstructName"]
    + field_separator
    + test["QuestionText"]
)

# Columns carried through the reshape unchanged.
keep_cols = ["QuestionId", "AllQuestionText", "CorrectAnswer"]
# One column per answer option in the wide layout.
answer_cols = ["AnswerAText", "AnswerBText", "AnswerCText", "AnswerDText"]
misconception_cols = ["MisconceptionAId", "MisconceptionBId", "MisconceptionCId", "MisconceptionDId"]


def wide_to_long(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape the questions from one row per question to one row per answer option.

    Only ``keep_cols`` and ``answer_cols`` are read from ``df``; any other
    column is ignored.

    Args:
        df: Wide frame containing every column in ``keep_cols`` and ``answer_cols``.

    Returns:
        A frame with the ``keep_cols`` columns plus ``Answer`` (the name of the
        source answer column) and ``Value`` (that column's content), ordered by
        ``QuestionId`` then ``Answer`` and carrying a fresh 0..n-1 index.
    """
    wide = df[keep_cols + answer_cols]
    long_df = pd.melt(wide, id_vars=keep_cols, var_name='Answer', value_name='Value')
    long_df = long_df.sort_values(["QuestionId", "Answer"])
    return long_df.reset_index(drop=True)

# Rebinds `test` to the long layout (one row per question / answer option).
# Not idempotent: the melted frame no longer has the Answer*Text columns that
# wide_to_long selects, so running this line again without reloading test.csv
# raises a KeyError.
test = wide_to_long(test)

# https://www.kaggle.com/code/pshikk/similarity-preprocessing

def preprocess_text(x):
    """Normalise a question or misconception string before it is embedded.

    Steps, applied in this order:
      1. lowercase the text;
      2. remove ``@word`` tokens;
      3. remove digit runs (including runs preceded by an apostrophe);
      4. remove ``http`` together with the word characters directly attached to it;
      5. collapse whitespace runs to a single space;
      6. collapse repeated periods, and repeated commas, to a single character;
      7. strip leading and trailing whitespace.

    Args:
        x: Text to clean. Must be a ``str`` (``.lower()`` is called on it).

    Returns:
        The cleaned string.
    """
    # All patterns are raw strings: written as plain literals, "\w" and "\d" are
    # invalid escape sequences, which Python warns about (SyntaxWarning on 3.12+).
    x = x.lower()
    x = re.sub(r"@\w+", "", x)
    x = re.sub(r"'\d+", "", x)
    x = re.sub(r"\d+", "", x)
    # Only "http" plus adjoining word characters is removed, so for a URL such as
    # "https://host" the "://host" part remains.
    # NOTE(review): presumably this must stay in step with the preprocessing used
    # when the embedding model was trained — confirm before tightening the pattern.
    x = re.sub(r"http\w+", "", x)
    x = re.sub(r"\s+", " ", x)
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    return x.strip()

# Query text for one answer option: shared question context plus the answer itself.
test["AllText"] = test["AllQuestionText"] + " ### " + test["Value"]
# "AnswerAText" -> "A": remove the fixed prefix and suffix of the melted column name.
test['AnswerId'] = test["Answer"].str.replace('Answer', '').str.replace('Text', '')

# Keep only the incorrect answer options, then discard the helper columns.
test = test[test["AnswerId"] != test["CorrectAnswer"]].reset_index(drop=True)
test = test.drop(['AllQuestionText', 'Answer'], axis=1)

# Apply the same normalisation to the queries and to the misconception names.
test["AllText"] = test["AllText"].apply(preprocess_text)
misconceptions['MisconceptionName'] = misconceptions['MisconceptionName'].apply(preprocess_text)

# Plain Python lists handed to the embedding model.
test_texts = test["AllText"].tolist()
MisconceptionName = misconceptions['MisconceptionName'].tolist()

In [4]:
from FlagEmbedding import FlagModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the embedding model from `model_id`. An empty string is passed as the
# query instruction — presumably so that queries are encoded without a prefix;
# TODO confirm this matches how the model was trained.
model = FlagModel(model_id, 
                  query_instruction_for_retrieval="",
                  use_fp16=True)
# Queries are the question+answer texts; passages are the misconception names.
q_embeddings = model.encode_queries(test_texts)
p_embeddings = model.encode(MisconceptionName)
# Unused alternative kept from the original author (raw dot-product scores):
# scores = q_embeddings @ p_embeddings.T
# Similarity matrix: one row per query text, one column per misconception name.
test_cos_sim_arr = cosine_similarity(q_embeddings, p_embeddings)
# Negating makes argsort's ascending order put the most similar misconception first.
# Each value is a column position, i.e. a position in the `MisconceptionName` list.
test_sorted_indices = np.argsort(-test_cos_sim_arr, axis=1)
test_sorted_indices

----------using 2*GPUs----------


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Inference Embeddings: 100%|██████████| 6/6 [00:02<00:00,  2.79it/s]


array([[2306, 2488, 1507, ...,  780,  961,  237],
       [2306, 2488, 1507, ..., 2232,  961,  237],
       [2306, 2488, 1507, ..., 1635, 1530,  237],
       ...,
       [1287, 1073,  365, ...,  125, 2520, 1890],
       [1287, 1073,  397, ...,  620,   88, 1890],
       [1287, 1073, 2439, ...,   88,  620, 1890]])

In [5]:
# Number of ranked misconception ids written per row (previously a bare literal).
TOP_K = 25

# Fail with a clear message if the ranking matrix and the frame have drifted
# apart, e.g. because cells were re-run out of order.
assert len(test) == len(test_sorted_indices), "ranking rows must match test rows"

test["QuestionId_Answer"] = test["QuestionId"].astype("str") + "_" + test["AnswerId"]

# argsort yields row positions within `misconceptions`, not misconception ids.
# Translate through the MisconceptionId column so the output stays correct even
# if the mapping file is not ordered 0..N-1.
misconception_ids = misconceptions["MisconceptionId"].values
top_ids = misconception_ids[test_sorted_indices[:, :TOP_K]]
test["MisconceptionId"] = [" ".join(map(str, row)) for row in top_ids]

# Correct-answer rows were already dropped before the texts were encoded, so
# this filter is a no-op; it is kept as a guard.
test = test[test["CorrectAnswer"] != test["AnswerId"]]
submission = test[["QuestionId_Answer", "MisconceptionId"]].reset_index(drop=True)
submission

Unnamed: 0,QuestionId_Answer,MisconceptionId
0,1869_B,2306 2488 1507 706 987 2532 328 1672 1963 2181...
1,1869_C,2306 2488 1507 706 328 2532 987 1672 1005 2181...
2,1869_D,2306 2488 1507 706 2532 1005 987 1672 1392 328...
3,1870_A,2398 1593 2142 2068 2307 143 891 363 1256 1755...
4,1870_B,2398 1593 2142 2068 2307 143 891 363 1256 1755...
5,1870_C,2398 1593 143 2142 2068 891 2307 1755 363 979 ...
6,1871_A,1287 1073 365 397 2439 1923 1349 1177 2319 255...
7,1871_C,1287 1073 397 2439 365 1923 1349 1177 2319 215...
8,1871_D,1287 1073 2439 365 397 1349 1923 2151 2551 167...


In [6]:
# Write the submission frame to submission.csv (relative path), without the index column.
submission.to_csv("submission.csv", index=False)