In [1]:
!pip install -U FlagEmbedding
!pip install peft

Collecting FlagEmbedding
  Downloading FlagEmbedding-1.2.11.tar.gz (147 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.1/147.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting sentence_transformers (from FlagEmbedding)
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: FlagEmbedding
  Building wheel for FlagEmbedding (setup.py) ... [?25ldone
[?25h  Created wheel for FlagEmbedding: filename=FlagEmbedding-1.2.11-py3-none-any.whl size=171152 sha256=cdbda048a8726245c0468cf42ef251d53ad4f47a1f8f05203ed7b83c295eb378
  Stored in directory: /root/.cache/pip/wheels/70/2a/6e/89f48d04306d3981effeeab965819ea7fe29a7c94983d29024
Successfully built FlagEm

In [2]:
import os, re, json
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.auto import tqdm
from transformers import AutoTokenizer

# Set TOKENIZERS_PARALLELISM to 'false' before any tokenizer is used
# (turns off parallelism inside the HuggingFace tokenizers library).
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Register tqdm's pandas integration (adds the progress_* methods to pandas objects).
tqdm.pandas()

In [3]:
# --- Input locations -------------------------------------------------------
comp_dir = '/kaggle/input/eedi-mining-misconceptions-in-mathematics'
model_id = '/kaggle/input/eedi-train-finetune-bge-embedding-model/eedi_model/checkpoint-822'

# --- Column groups used when reshaping the test frame ----------------------
# (misconception_cols is defined for completeness; no visible cell reads it.)
keep_cols = ["QuestionId", "AllQuestionText", "CorrectAnswer"]
answer_cols = ["AnswerAText", "AnswerBText", "AnswerCText", "AnswerDText"]
misconception_cols = ["MisconceptionAId", "MisconceptionBId", "MisconceptionCId", "MisconceptionDId"]

# --- Load the two input tables ---------------------------------------------
misconceptions = pd.read_csv(f'{comp_dir}/misconception_mapping.csv')
test = pd.read_csv(f'{comp_dir}/test.csv')

# One text field per question: subject, construct and question text,
# joined with the " ### " separator.
test["AllQuestionText"] = (
    test["SubjectName"]
    + " ### " + test["ConstructName"]
    + " ### " + test["QuestionText"]
)

def wide_to_long(df: pd.DataFrame) -> pd.DataFrame:
    """Unpivot the per-answer text columns into one row per (question, answer).

    Uses the module-level ``keep_cols`` as identifier columns and
    ``answer_cols`` as the columns to melt.

    Args:
        df: Wide frame containing every column in ``keep_cols`` and ``answer_cols``.

    Returns:
        A frame with ``keep_cols`` plus ``Answer`` (the original column name)
        and ``Value`` (that column's cell), sorted by ``QuestionId`` then
        ``Answer``, with a fresh 0..n-1 index.
    """
    selected = df[keep_cols + answer_cols]
    melted = selected.melt(id_vars=keep_cols, var_name='Answer', value_name='Value')
    ordered = melted.sort_values(["QuestionId", "Answer"])
    return ordered.reset_index(drop=True)

# Reshape to long format: one row per (question, answer option) instead of one row per question.
test = wide_to_long(test)

# Reference for the text preprocessing below: https://www.kaggle.com/code/pshikk/similarity-preprocessing

def preprocess_text(x):
    """Normalise a text string before it is embedded.

    The steps run in this order:
      1. lowercase;
      2. remove ``@`` followed by word characters;
      3. remove an apostrophe followed by digits, then every remaining digit run;
      4. remove ``http`` together with the word characters that directly follow it;
      5. collapse whitespace runs to one space, period runs to one period and
         comma runs to one comma;
      6. strip leading/trailing whitespace.

    Args:
        x: The input string.

    Returns:
        The normalised string.

    Note:
        The matching behaviour is intentionally unchanged; only the regex
        literals were made raw strings, because ``"\\w"`` / ``"\\d"`` in a
        normal string literal are invalid escape sequences that newer Python
        versions warn about.
    """
    x = x.lower()
    x = re.sub(r"@\w+", '', x)       # "@name"-style tokens
    x = re.sub(r"'\d+", '', x)       # apostrophe + digits, e.g. "'99"
    x = re.sub(r"\d+", '', x)        # every remaining run of digits
    # NOTE(review): this only removes "http" plus directly following word
    # characters (e.g. "https"); "http://..." is left untouched because ":" is
    # not a word character. Kept as-is so the text normalisation does not change.
    x = re.sub(r"http\w+", '', x)
    x = re.sub(r"\s+", " ", x)       # whitespace runs -> single space
    x = re.sub(r"\.+", ".", x)       # "..." -> "."
    x = re.sub(r"\,+", ",", x)       # ",," -> ","
    return x.strip()

# Full query text: question context followed by the candidate answer's text.
test["AllText"] = test["AllQuestionText"] + " ### " + test["Value"]

# 'AnswerAText' -> 'A': remove the fixed prefix and suffix of the melted column name.
test['AnswerId'] = test["Answer"].str.replace('Answer', '').str.replace('Text', '')

# Keep only rows whose answer letter differs from the correct answer, then
# discard the helper columns that are no longer needed.
test = (
    test[test["AnswerId"] != test["CorrectAnswer"]]
    .reset_index(drop=True)
    .drop(columns=['AllQuestionText', 'Answer'])
)

# Apply the same normalisation to the misconception names and the query texts.
misconceptions['MisconceptionName'] = misconceptions['MisconceptionName'].apply(preprocess_text)
test["AllText"] = test["AllText"].apply(preprocess_text)

# Plain Python lists of strings, as handed to the embedding model.
MisconceptionName = misconceptions['MisconceptionName'].tolist()
test_texts = test["AllText"].tolist()

In [4]:
from FlagEmbedding import FlagModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the fine-tuned checkpoint with fp16 enabled and an empty query instruction.
model = FlagModel(
    model_id,
    use_fp16=True,
    query_instruction_for_retrieval="",
)

# Queries are the question/answer texts; passages are the misconception names.
q_embeddings = model.encode_queries(test_texts)
p_embeddings = model.encode(MisconceptionName)

# Pairwise cosine similarity: one row per query, one column per misconception.
test_cos_sim_arr = cosine_similarity(q_embeddings, p_embeddings)

# argsort is ascending, so sort the negated scores to rank the most similar first.
test_sorted_indices = (-test_cos_sim_arr).argsort(axis=1)
test_sorted_indices

----------using 2*GPUs----------


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Inference Embeddings: 100%|██████████| 6/6 [00:02<00:00,  2.74it/s]


array([[2306, 1507,  706, ..., 2089,  237, 2232],
       [2306, 1507,  706, ...,  182,  237, 2232],
       [2306, 2488, 1507, ...,  124,  925,  182],
       ...,
       [1287,  397, 1073, ..., 1857, 1239,  620],
       [1287,  397, 1073, ..., 1239, 1890,  620],
       [1287,  397, 2439, ..., 1890, 1239, 1857]])

In [5]:
# Number of ranked misconceptions written per row. The original hard-coded 25;
# presumably the competition's MAP@25 cut-off — confirm against the rules.
TOP_K = 25

# Rows whose AnswerId equals CorrectAnswer were already removed when `test` was
# built, so check that invariant instead of silently filtering a second time.
assert (test["CorrectAnswer"] != test["AnswerId"]).all(), "correct-answer rows still present in test"
# Every ranking row must correspond to exactly one row of `test`.
assert test_sorted_indices.shape[0] == len(test), "ranking rows do not match test rows"

# The ranking holds column positions of the similarity matrix, i.e. row
# positions in `misconceptions`. Translate them to the actual ids instead of
# assuming that MisconceptionId always equals the row position.
misconception_ids = misconceptions["MisconceptionId"].to_numpy()
top_k_ids = misconception_ids[test_sorted_indices[:, :TOP_K]]

test["QuestionId_Answer"] = test["QuestionId"].astype("str") + "_" + test["AnswerId"]
# Space-separated ids, best match first.
test["MisconceptionId"] = [" ".join(map(str, row)) for row in top_k_ids.tolist()]

submission = test[["QuestionId_Answer", "MisconceptionId"]].reset_index(drop=True)
submission

Unnamed: 0,QuestionId_Answer,MisconceptionId
0,1869_B,2306 1507 706 2488 1005 987 1672 1516 2532 328...
1,1869_C,2306 1507 706 2488 1005 987 1672 1516 328 2532...
2,1869_D,2306 2488 1507 706 1005 2532 987 1672 1392 134...
3,1870_A,1593 2398 1256 891 979 363 1540 547 2307 859 1...
4,1870_B,1593 2398 1256 891 979 363 1540 547 2307 859 1...
5,1870_C,1593 2398 891 363 547 979 1540 2307 1256 859 7...
6,1871_A,1287 397 1073 2439 365 1349 2319 1923 2551 167...
7,1871_C,1287 397 1073 2439 365 1349 2319 1923 2551 167...
8,1871_D,1287 397 2439 365 1349 1073 2551 1923 2319 105...


In [None]:
# Write the predictions to submission.csv in the working directory, without the index column.
submission.to_csv("submission.csv", index=False)