In [1]:
# Offline dependency install: --no-index disables package-index lookups and
# --find-links makes pip resolve wheels from the attached dataset directory.
!pip install transformers peft accelerate bitsandbytes \
    -U --no-index --find-links /kaggle/input/lmsys-wheel-files

Looking in links: /kaggle/input/lmsys-wheel-files
Processing /kaggle/input/lmsys-wheel-files/peft-0.11.1-py3-none-any.whl
Processing /kaggle/input/lmsys-wheel-files/bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl
Installing collected packages: bitsandbytes, peft
Successfully installed bitsandbytes-0.43.1 peft-0.11.1


In [2]:
import ast
import numpy as np
import pandas as pd

In [3]:
# Per-question columns that are repeated on every long-format row.
keep_cols = ["QuestionId", "ConstructName", "SubjectName", "CorrectAnswer", "QuestionText"]
# Wide columns holding the text of each answer option.
answer_cols = ["AnswerAText", "AnswerBText", "AnswerCText", "AnswerDText"]
# Wide columns holding the misconception id of each answer option (absent in the test set).
misconception_cols = ["MisconceptionAId", "MisconceptionBId", "MisconceptionCId", "MisconceptionDId"]


def wide_to_long(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape a one-row-per-question frame into one row per (question, answer option).

    Args:
        df: Frame containing ``keep_cols`` and ``answer_cols``; it may also
            contain ``misconception_cols``.

    Returns:
        A frame with ``keep_cols`` plus ``Answer`` (source column name) and
        ``Value`` (answer text), ordered by ``QuestionId`` then ``Answer``.
        When the first misconception column is present in ``df``, the columns
        ``Misconception`` and ``MisconceptionId`` are added as well.
    """

    def _melt_sorted(value_columns, var_name, value_name):
        # Unpivot the requested columns, then order rows deterministically so
        # the answer frame and misconception frame line up row by row.
        long_df = df[keep_cols + value_columns].melt(
            id_vars=keep_cols,
            var_name=var_name,
            value_name=value_name,
        )
        return long_df.sort_values(["QuestionId", var_name]).reset_index(drop=True)

    answers_df = _melt_sorted(answer_cols, "Answer", "Value")

    # Test data carries no misconception labels: nothing more to attach.
    if misconception_cols[0] not in df.columns:
        return answers_df

    misconceptions_df = _melt_sorted(misconception_cols, "Misconception", "MisconceptionId")
    for column in ("Misconception", "MisconceptionId"):
        answers_df[column] = misconceptions_df[column]

    return answers_df


# Reshape the competition test set to one row per answer option and save it as
# test_after_process.csv, the file later passed to inference.py.
test_data = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv")
wide_to_long(test_data).to_csv("test_after_process.csv", index=False)

In [4]:
%%writefile processor.py

from collections import defaultdict
from typing import Dict, List, Optional, Union

from transformers import AutoTokenizer


class plain_processor:
    """Turn batches of question rows into tokenized prompts.

    Each row is rendered through ``template`` (placeholders: ``SubjectName``,
    ``ConstructName``, ``QuestionText``, ``Answer``) and tokenized with
    truncation. When ``add_eos_token`` is true, one position is reserved and
    the tokenizer's EOS id is appended to every sequence.

    Args:
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` lists; must expose ``eos_token_id``.
        max_length: Maximum sequence length including the optional EOS token.
        template: ``str.format_map`` template for the prompt text.
        add_eos_token: Whether to append EOS to every tokenized sequence.
    """

    def __init__(
        self,
        tokenizer: AutoTokenizer,
        max_length: int,
        template: Optional[str] = "{ConstructName} {QuestionText} {Answer}",
        add_eos_token: bool = True,
    ):
        self.tokenizer = tokenizer
        self.template = template
        self.add_eos_token = add_eos_token
        # Reserve one slot for the EOS token that __call__ appends after truncation.
        self.max_length = max_length - 1 if add_eos_token else max_length

    def preprocess_batch(self, batch_data: Dict[str, List[str]]):
        """Pull the four text columns used by the template out of a column-wise batch."""
        subject_name = batch_data["SubjectName"]
        contruct_name = batch_data["ConstructName"]
        question_text = batch_data["QuestionText"]
        answer_text = batch_data["Value"]
        return subject_name, contruct_name, question_text, answer_text

    def format_texts(self, subject_name, contruct_name, question_text, answer_text):
        """Render one prompt string per row by filling ``self.template``."""
        return [
            self.template.format_map(
                {
                    "SubjectName": subj,
                    "ConstructName": cont,
                    "QuestionText": ques,
                    "Answer": ans,
                }
            )
            for subj, cont, ques, ans in zip(subject_name, contruct_name, question_text, answer_text)
        ]

    def __call__(self, batch_data):
        """Tokenize a batch and return ``input_ids`` / ``attention_mask`` lists."""
        batch = self.preprocess_batch(batch_data)
        texts = self.format_texts(*batch)
        outputs = self.tokenizer(
            texts,
            max_length=self.max_length,
            truncation=True,
        )

        # Only append EOS when requested. Previously EOS was appended even with
        # add_eos_token=False, which also pushed sequences past max_length.
        if self.add_eos_token:
            for input_ids, attention_mask in zip(outputs['input_ids'], outputs['attention_mask']):
                input_ids.append(self.tokenizer.eos_token_id)
                attention_mask.append(1)

        results = defaultdict(list)
        results["input_ids"] = outputs["input_ids"]
        results["attention_mask"] = outputs["attention_mask"]

        return results
    

class misconception_processor:
    """Tokenize the ``MisconceptionName`` column of a column-wise batch.

    When ``add_eos_token`` is true, one position is reserved and the
    tokenizer's EOS id is appended to every sequence.

    Args:
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` lists; must expose ``eos_token_id``.
        max_length: Maximum sequence length including the optional EOS token.
        add_eos_token: Whether to append EOS to every tokenized sequence.
    """

    def __init__(
        self,
        tokenizer: AutoTokenizer,
        max_length: int,
        add_eos_token: bool = True,
    ):
        self.tokenizer = tokenizer
        self.add_eos_token = add_eos_token
        # Reserve one slot for the EOS token that __call__ appends after truncation.
        self.max_length = max_length - 1 if add_eos_token else max_length

    def __call__(self, batch_data):
        """Tokenize the misconception names and return ``input_ids`` / ``attention_mask``."""
        names = batch_data["MisconceptionName"]
        outputs = self.tokenizer(
            names,
            max_length=self.max_length,
            truncation=True,
        )

        # Only append EOS when requested. Previously EOS was appended even with
        # add_eos_token=False, which also pushed sequences past max_length.
        if self.add_eos_token:
            for input_ids, attention_mask in zip(outputs['input_ids'], outputs['attention_mask']):
                input_ids.append(self.tokenizer.eos_token_id)
                attention_mask.append(1)

        results = defaultdict(list)
        results["input_ids"] = outputs["input_ids"]
        results["attention_mask"] = outputs["attention_mask"]

        return results

Writing processor.py


In [5]:
%%writefile inference.py

from dataclasses import dataclass, field
from typing import Dict, List, Optional, Union

import torch
from torch import Tensor
from torch.utils.data import DataLoader
import torch.nn.functional as F

import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from transformers import (
    HfArgumentParser,
    AutoTokenizer,
    AutoModel,
    DataCollatorWithPadding,
)
from peft import PeftModel
from processor import plain_processor, misconception_processor

from sklearn.metrics.pairwise import cosine_similarity

@dataclass
class ModelArguments:
    # Command-line options describing the embedding model; parsed by HfArgumentParser in main().

    # Path or hub id handed to AutoTokenizer/AutoModel.from_pretrained.
    model_name_or_path: Optional[str] = field(default="BAAI/bge-base-en-v1.5")
    # Passed to both text processors as their max_length.
    model_max_length: int = field(
        default=1024,
        metadata={
            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    # True loads the weights as torch.float16, False as torch.float32.
    half_precision: bool = field(default=True, metadata={"help": "Whether to use half precision."})
    # NOTE(review): main() in this script never reads this flag; the processors
    # are built with their own default (add_eos_token=True).
    add_eos_token: bool = field(default=False)
    # Adapter directory passed to PeftModel.from_pretrained.
    lora_dir: Optional[str] = field(default="/kaggle/input/qwen2-5-1-5b-retrieval/checkpoint-525")


@dataclass
class DataArguments:
    # Command-line options describing the input files and retrieval settings.

    # CSV loaded with Dataset.from_csv and embedded as the query side
    # (the notebook passes the processed test CSV here).
    train_data_path: str = field(
        default="data/train_after_process.csv", metadata={"help": "Path to the training data."}
    )
    # CSV of misconceptions; its MisconceptionName column is embedded as the document side.
    misconception_mapping: str = field(
        default="data/misconception_mapping.csv", metadata={"help": "Path to the misconception mapping."}
    )
    # str.format_map template used by plain_processor to build each query text.
    template: str = field(
        default="{ConstructName} {QuestionText} {Answer}", metadata={"help": "Template for the input text."}
    )
    # Number of highest-similarity misconceptions kept per query row.
    top_k_for_recall: int =  field(default=25, metadata={"help": "Remain top k in recall stage."})


@dataclass
class TrainingArguments:
    # Runtime options; batch_size is forwarded to inference() as the DataLoader batch size.
    batch_size: int = field(default=8, metadata={"help": "Batch size per GPU for inference."})


def last_token_pool(
    last_hidden_states: Tensor,
    attention_mask: Tensor
) -> Tensor:
    """Select, for each sequence, the hidden state of its last attended token.

    If every row of ``attention_mask`` has a 1 in the final column, the final
    position is returned for all rows. Otherwise the position
    ``attention_mask.sum(dim=1) - 1`` is gathered per row.

    Args:
        last_hidden_states: Tensor indexed as ``[row, position, ...]``.
        attention_mask: Tensor indexed as ``[row, position]`` with 1 marking
            attended positions.

    Returns:
        One hidden-state vector per row.
    """
    row_count = attention_mask.shape[0]
    # All rows end on a real token, so the last column is the last token.
    if attention_mask[:, -1].sum() == row_count:
        return last_hidden_states[:, -1]

    last_positions = attention_mask.sum(dim=1) - 1
    row_index = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[row_index, last_positions]
    

@torch.no_grad()
@torch.amp.autocast('cuda')
def inference(model, dataset, data_collator, batch_size: int = 8):
    """Embed every example of ``dataset`` and return the stacked embeddings.

    Batches are produced in dataset order (no shuffling), run through
    ``model``, pooled with ``last_token_pool`` and L2-normalized along dim 1.

    Args:
        model: Model whose output exposes ``last_hidden_state`` and which has a ``device``.
        dataset: Dataset to iterate over.
        data_collator: Collate function building a batch mapping of tensors.
        batch_size: Number of examples per batch.

    Returns:
        A NumPy array with the per-batch embeddings concatenated along axis 0.
    """
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=data_collator)
    pooled_batches = []
    for batch in tqdm(loader):
        # Drop any "labels" entry and move the remaining tensors to the model's device.
        model_inputs = {
            name: tensor.to(model.device)
            for name, tensor in batch.items()
            if name != "labels"
        }
        hidden_states = model(**model_inputs).last_hidden_state
        pooled = last_token_pool(hidden_states, model_inputs['attention_mask'])
        unit_vectors = F.normalize(pooled, p=2, dim=1)
        pooled_batches.append(unit_vectors.detach().cpu().numpy())
    return np.concatenate(pooled_batches, axis=0)


def main():
    """Retrieve the top-k misconceptions for every query row.

    Parses ``ModelArguments``, ``DataArguments`` and ``TrainingArguments`` from
    the command line, loads the base model plus its adapter from ``lora_dir``,
    embeds the query CSV and the misconception mapping CSV, and ranks
    misconceptions by cosine similarity.

    Note: ``model_args.add_eos_token`` is not consulted here; both processors
    are built with their own default (``add_eos_token=True``).

    Returns:
        A list with one entry per query row, each a list of the
        ``top_k_for_recall`` best misconception ids, most similar first. Ids
        come from the mapping file's ``MisconceptionId`` column when present;
        otherwise the row positions in the mapping file are returned.
    """
    parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # prepare tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        padding_side="right",
        use_fast=True,
    )
    model = AutoModel.from_pretrained(
        model_args.model_name_or_path,
        device_map="auto",
        torch_dtype=torch.float16 if model_args.half_precision else torch.float32,
    )

    model = PeftModel.from_pretrained(model, model_args.lora_dir)
    model.eval()

    # prepare data
    # TODO: Inference for test data.
    train_dataset = Dataset.from_csv(data_args.train_data_path)
    preprocess = plain_processor(tokenizer, model_args.model_max_length, template=data_args.template)
    train_dataset = train_dataset.map(preprocess, batched=True, remove_columns=train_dataset.column_names)

    misconception_mapping = Dataset.from_csv(data_args.misconception_mapping)
    # Capture the real ids before map() drops the original columns, so that
    # ranking positions can be translated back even if ids are not 0..N-1 in row order.
    misconception_ids = None
    if "MisconceptionId" in misconception_mapping.column_names:
        misconception_ids = np.asarray(list(misconception_mapping["MisconceptionId"]))
    mis_preprocess = misconception_processor(tokenizer, model_args.model_max_length)
    misconception_mapping = misconception_mapping.map(mis_preprocess, batched=True, remove_columns=misconception_mapping.column_names)

    # inference
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    train_embeddings = inference(
        model, train_dataset, batch_size=training_args.batch_size, data_collator=data_collator
    )
    misconception_embeddings = inference(
        model, misconception_mapping, batch_size=training_args.batch_size, data_collator=data_collator
    )

    # calculate cosine similarity; negate so argsort yields most-similar first
    cos_sim_arr = cosine_similarity(train_embeddings, misconception_embeddings)
    sorted_indices = np.argsort(-cos_sim_arr, axis=1)
    sorted_indices = sorted_indices[:, :data_args.top_k_for_recall]

    # Translate row positions in the mapping file into misconception ids.
    if misconception_ids is not None:
        sorted_indices = misconception_ids[sorted_indices]

    return sorted_indices.tolist()

if __name__ == "__main__":
    test_sorted_indices = main()

    # One row per (question, answer option); rows are expected to be in the
    # same order as the CSV that main() embedded.
    candidates = pd.read_csv("test_after_process.csv")
    candidates["Answer_alphabet"] = candidates["Answer"].str.extract(r'Answer([A-Z])Text$')
    candidates["QuestionId_Answer"] = (
        candidates["QuestionId"].astype("str") + "_" + candidates["Answer_alphabet"]
    )
    candidates["MisconceptionId"] = test_sorted_indices

    # Keep only the incorrect answer options, ordered by question and option letter.
    is_wrong_option = candidates["CorrectAnswer"] != candidates["Answer_alphabet"]
    candidates = candidates[is_wrong_option].sort_values(by=["QuestionId", "Answer_alphabet"])

    # Only the candidate table is saved here; no submission file is written by this script.
    candidates.to_csv("retrieve_result.csv", index=False)

Writing inference.py


In [6]:
# Retrieval stage: embed test_after_process.csv and the misconception mapping,
# keeping the 50 most similar misconceptions per row (written to retrieve_result.csv).
# --half_precision False makes the script load the weights as float32.
!python inference.py \
    --model_name_or_path /kaggle/input/qwen2-math-7b-instruct \
    --model_max_length 1024 \
    --half_precision False \
    --train_data_path test_after_process.csv \
    --misconception_mapping /kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv \
    --batch_size 8 \
    --template "{ConstructName} {QuestionText} {Answer}" \
    --lora_dir "/kaggle/input/qwen2-5-1-5b-retrieval/checkpoint-474" \
    --top_k_for_recall 50

Loading checkpoint shards: 100%|██████████████████| 4/4 [01:08<00:00, 17.00s/it]
Generating train split: 12 examples [00:00, 487.43 examples/s]
Map: 100%|██████████████████████████████| 12/12 [00:00<00:00, 683.90 examples/s]
Generating train split: 2587 examples [00:00, 271083.63 examples/s]
Map: 100%|████████████████████████| 2587/2587 [00:00<00:00, 24064.41 examples/s]
100%|█████████████████████████████████████████████| 2/2 [00:01<00:00,  1.15it/s]
100%|█████████████████████████████████████████| 324/324 [01:52<00:00,  2.87it/s]


In [7]:
# sub = pd.read_csv("submission.csv")
# sub.head()

In [8]:
# Expand the retrieval output to one row per (answer option, candidate misconception)
# and attach the mapping file's columns, overwriting retrieve_result.csv in place.
misconception_mapping = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv")
(
    pd.read_csv("retrieve_result.csv")
    # The candidate lists were serialized as their Python repr; parse them back into lists.
    .assign(MisconceptionId=lambda frame: frame["MisconceptionId"].apply(ast.literal_eval))
    .explode("MisconceptionId")
    .merge(misconception_mapping, how="left", left_on="MisconceptionId", right_on="MisconceptionId")
    .to_csv("retrieve_result.csv", index=False)
)

In [9]:
%%writefile reranker_processor.py

from collections import defaultdict
from typing import Dict, List, Optional, Union

from transformers import AutoTokenizer


class reranker_processor:
    """Build and tokenize (question, answer, misconception) prompts for the reranker.

    Each row is rendered through ``template`` (placeholders: ``SubjectName``,
    ``ConstructName``, ``QuestionText``, ``Answer``, ``DOC``) and tokenized with
    truncation to ``max_length``. Note that the default template contains a
    literal backslash followed by ``n`` between the query and ``{DOC}``.
    """

    def __init__(
        self,
        tokenizer: AutoTokenizer,
        max_length: int,
        template: Optional[str] = "<|im_start|>{ConstructName} {QuestionText} {Answer}\\n{DOC}<|im_end|>",
    ):
        self.template = template
        self.max_length = max_length
        self.tokenizer = tokenizer

    def preprocess_batch(self, batch_data: Dict[str, List[str]]):
        """Return the five text columns used by the template, in a fixed order."""
        wanted = ("SubjectName", "ConstructName", "QuestionText", "Value", "MisconceptionName")
        return tuple(batch_data[column] for column in wanted)

    def format_texts(self, subject_name, contruct_name, question_text, answer_text, misconception):
        """Render one prompt string per row by filling ``self.template``."""
        rows = zip(subject_name, contruct_name, question_text, answer_text, misconception)
        return [
            self.template.format_map(
                {
                    "SubjectName": subject,
                    "ConstructName": construct,
                    "QuestionText": question,
                    "Answer": answer,
                    "DOC": document,
                }
            )
            for subject, construct, question, answer, document in rows
        ]

    def __call__(self, batch_data):
        """Tokenize a batch and return ``input_ids`` / ``attention_mask`` lists."""
        prompts = self.format_texts(*self.preprocess_batch(batch_data))
        encoded = self.tokenizer(
            prompts,
            max_length=self.max_length,
            truncation=True,
        )

        results = defaultdict(list)
        for key in ("input_ids", "attention_mask"):
            results[key] = encoded[key]
        return results

Writing reranker_processor.py


In [10]:
%%writefile reranker_inference.py

from dataclasses import dataclass, field
from typing import Dict, List, Optional, Union

import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F

import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from peft import PeftModel
from transformers import (
    HfArgumentParser,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
)

from reranker_processor import reranker_processor

@dataclass
class ModelArguments:
    # Command-line options describing the reranker model; parsed by HfArgumentParser in main().

    # Path or hub id handed to AutoTokenizer/AutoModelForSequenceClassification.from_pretrained;
    # main() also inspects it for "llama"/"qwen" to choose the pad token.
    model_name_or_path: Optional[str] = field(default="BAAI/bge-base-en-v1.5")
    # Passed to reranker_processor as its max_length.
    model_max_length: int = field(
        default=1024,
        metadata={
            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    # True loads the weights as torch.float16, False as torch.float32.
    half_precision: bool = field(default=True, metadata={"help": "Whether to use half precision."})
    # Forwarded to AutoTokenizer.from_pretrained as add_eos_token.
    add_eos_token: bool = field(default=False)
    # Adapter directory passed to PeftModel.from_pretrained.
    lora_dir: str = field(default="/kaggle/input/qwen2-math-1-5b-it/checkpoint-1887")


@dataclass
class DataArguments:
    # Command-line options describing the reranker input.

    # CSV loaded with Dataset.from_csv and scored row by row
    # (the notebook passes retrieve_result.csv here).
    train_data_path: str = field(
        default="data/train_after_process.csv", metadata={"help": "Path to the training data."}
    )
    # str.format_map template handed to reranker_processor.
    # NOTE(review): this default has no {DOC} placeholder, unlike the template
    # the notebook passes on the command line — confirm the default is intended.
    template: str = field(
        default="{ConstructName} {QuestionText} {Answer}", metadata={"help": "Template for the input text."}
    )


@dataclass
class TrainingArguments:
    # Runtime options; batch_size is forwarded to inference() as the DataLoader batch size.
    batch_size: int = field(default=8, metadata={"help": "Batch size per GPU for inference."})


@torch.no_grad()
@torch.amp.autocast('cuda')
def inference(model, dataset, data_collator, batch_size: int = 8):
    """Score every example of ``dataset`` with the classifier.

    Batches are produced in dataset order (no shuffling). For each batch the
    sigmoid of the logits is taken and column 0 is kept as the score.

    Args:
        model: Model whose output exposes ``logits`` and which has a ``device``.
        dataset: Dataset to iterate over.
        data_collator: Collate function building a batch mapping of tensors.
        batch_size: Number of examples per batch.

    Returns:
        A flat list of ``np.float32`` scores, one per example.
    """
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=data_collator)
    scores = []
    for batch in tqdm(loader):
        on_device = {name: tensor.to(model.device) for name, tensor in batch.items()}
        logits = model(**on_device).logits
        batch_scores = logits.sigmoid().detach().cpu().numpy()
        scores.extend(batch_scores[:, 0].astype(np.float32))
    return scores


def main():
    """Score every candidate row of the input CSV with the reranker.

    Parses ``ModelArguments``, ``DataArguments`` and ``TrainingArguments`` from
    the command line, loads a single-label sequence-classification model plus
    the adapter in ``lora_dir``, renders each CSV row through the template and
    returns the per-row scores produced by ``inference``.

    NOTE(review): the captured run reports ``score.weight`` as newly
    initialized when the base model is loaded. Presumably the adapter
    checkpoint restores the classification head — confirm, otherwise the
    scores come from a random head.
    """
    arg_parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = arg_parser.parse_args_into_dataclasses()
    weight_dtype = torch.float16 if model_args.half_precision else torch.float32

    # prepare tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        padding_side="right",
        use_fast=True,
        add_eos_token=model_args.add_eos_token,
    )
    base_model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        num_labels=1,
        device_map="auto",
        torch_dtype=weight_dtype,
    )
    model = PeftModel.from_pretrained(base_model, model_args.lora_dir)
    model.eval()

    # Pick a pad token based on the model family named in the path; the two
    # checks are independent, so a path matching both ends with the qwen setting.
    lowered_path = model_args.model_name_or_path.lower()
    if "llama" in lowered_path:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.pad_token_id

    if "qwen" in lowered_path:
        tokenizer.pad_token = "<|endoftext|>"
        model.config.pad_token_id = tokenizer.pad_token_id

    # prepare data
    # TODO: Inference for test data.
    raw_dataset = Dataset.from_csv(data_args.train_data_path)
    preprocess = reranker_processor(tokenizer, model_args.model_max_length, template=data_args.template)
    tokenized_dataset = raw_dataset.map(preprocess, batched=True, remove_columns=raw_dataset.column_names)

    # inference
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    return inference(
        model, tokenized_dataset, batch_size=training_args.batch_size, data_collator=data_collator
    )

if __name__ == "__main__":
    # Maximum number of misconception ids written per submission row.
    MAX_IDS_PER_ROW = 25

    probabilities = main()

    # Scores are attached positionally: rows must be in the same order as the
    # CSV that main() scored.
    test_data = pd.read_csv("retrieve_result.csv")
    test_data["pred_prob"] = probabilities

    # Best-scoring candidates first within each answer option; groupby keeps
    # that within-group order when collecting the ids into lists.
    test_data = test_data.sort_values(by=["QuestionId_Answer", "pred_prob"], ascending=[True, False])
    submission = test_data.groupby("QuestionId_Answer", as_index=False).agg(
        {"MisconceptionId": lambda x: list(x)}
    )

    # The aggregated frame already holds exactly the two submission columns,
    # so it is modified directly instead of through a column slice (which
    # triggered pandas' SettingWithCopyWarning).
    submission["MisconceptionId"] = submission["MisconceptionId"].apply(
        lambda ids: ' '.join(map(str, ids[:MAX_IDS_PER_ROW]))
    )
    submission = submission.sort_values("QuestionId_Answer")
    submission.to_csv('submission.csv', index=False)

Writing reranker_inference.py


In [11]:
# Rerank stage: score every (answer option, candidate misconception) row of
# retrieve_result.csv and write submission.csv.
# NOTE(review): the "\\n" in the template presumably reaches the script as a literal
# backslash-n rather than a newline — confirm this matches the prompt used in training.
!python reranker_inference.py \
    --model_name_or_path /kaggle/input/qwen2-math-1-5b-instruct/Qwen2-Math-1.5B-Instruct \
    --model_max_length 1024 \
    --half_precision False \
    --train_data_path retrieve_result.csv \
    --batch_size 16 \
    --template "<|im_start|>{ConstructName} {QuestionText} {Answer}\\n{DOC}<|im_end|>" \
    --lora_dir /kaggle/input/qwen2-math-1-5b-it/checkpoint-378

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/qwen2-math-1-5b-instruct/Qwen2-Math-1.5B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Generating train split: 450 examples [00:00, 37242.98 examples/s]
Map: 100%|███████████████████████████| 450/450 [00:00<00:00, 5252.51 examples/s]
100%|███████████████████████████████████████████| 29/29 [00:11<00:00,  2.56it/s]
