In [None]:
# Install the fine-tuning stack. peft, accelerate and transformers are pinned to exact versions.
# NOTE(review): bitsandbytes and trl are NOT pinned, so a fresh run may resolve different
# versions than the ones this notebook was executed with — consider pinning them as well.
!pip install -Uq peft==0.12.0 accelerate==0.33.0 bitsandbytes trl
!pip install -Uq transformers["sentencepiece"]==4.44.0

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/296.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.1/315.1 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.6/316.6 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.1/112.1 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import drive, userdata
# Mount Google Drive at /content/drive (Colab only).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import gc
import re

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import random
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from scipy.special import softmax
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GroupKFold

from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForCausalLM,
    Trainer, TrainingArguments,
    BitsAndBytesConfig,
)

from transformers.tokenization_utils import PreTrainedTokenizerBase
from transformers.trainer_utils import set_seed
from transformers.utils import is_torch_bf16_gpu_available

from peft import LoraConfig, TaskType, get_peft_model, PeftModel, PeftConfig

from trl import DataCollatorForCompletionOnlyLM

tqdm.pandas()

def create_random_id(length=7):
    """Return a random identifier made of ``length`` lowercase ASCII letters.

    The letters are drawn from a private ``random.Random`` instance (seeded from
    OS entropy), so every call yields a fresh ID without reseeding -- and thereby
    clobbering -- the global ``random`` state used elsewhere in the notebook.

    Args:
        length: Number of characters in the returned ID.

    Returns:
        A string of ``length`` characters taken from ``a``-``z``.
    """
    rng = random.Random()  # independent generator; the module-level RNG is left untouched
    return "".join(rng.choices("abcdefghijklmnopqrstuvwxyz", k=length))

In [None]:
# Run-level configuration read by the cells below.
RUN_NAME = f"{create_random_id()}"  # random ID of this run
DEBUG = False  # when True, the training frame is subsampled to SAMPLING_SIZE rows
SAMPLING_SIZE = 30
PER_DEVICE_BATCH_SIZE = 4
N_SPLIT = 40
EPOCHS = 2
L_RATE = 7e-5
MODEL_NAME = "Qwen/Qwen2.5-32B-Instruct"
print(RUN_NAME)

ecueaqb


In [None]:
# Misconception lookup: MisconceptionId -> MisconceptionName.
misconception_df = pd.read_csv("/content/drive/MyDrive/eedi/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv")
mapping_dict = misconception_df.set_index("MisconceptionId")["MisconceptionName"].to_dict()
# Retrieval output and the training table, joined below on "QuestionId_Answer".
df_retrieval_result = pd.read_parquet("/content/drive/MyDrive/eedi/input/baseline/df_retrieval_result.parquet")
df_train = pd.read_csv("/content/drive/MyDrive/eedi/input/baseline/train_df.csv")

# Join key of the form "<QuestionId>_<answer_name>".
df_train["QuestionId_Answer"] = df_train["QuestionId"].astype(str) + "_" + df_train["answer_name"]
# train_df's own MisconceptionId column is dropped first, so after the inner merge the
# MisconceptionId column is the one coming from the retrieval result.
df_train = df_train.drop(["MisconceptionId"], axis=1).merge(df_retrieval_result, on="QuestionId_Answer", how="inner")

if DEBUG:
    # NOTE(review): no random_state is passed, so the debug subsample differs between runs.
    df_train = df_train.sample(SAMPLING_SIZE).reset_index(drop=True)

print(df_train.columns)
print(df_train.shape)

Index(['QuestionId', 'ConstructId', 'ConstructName', 'SubjectId',
       'SubjectName', 'CorrectAnswer', 'QuestionText', 'AnswerAText',
       'AnswerBText', 'AnswerCText', 'AnswerDText', 'MisconceptionAId',
       'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId',
       'query_text', 'answer_name', 'order_index', 'QuestionId_Answer',
       'Prompt', 'MisconceptionId'],
      dtype='object')
(4370, 21)


In [None]:
# Tokenizer of MODEL_NAME, configured to pad on the left.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.padding_side = "left"

def get_retrieval_rank(row):
    """Return the 1-based position of the true misconception among the retrieved ids.

    ``row.MisconceptionId`` is parsed as a whitespace-separated string of integer ids
    and ``row.true_original`` is looked up in that list. When it is absent, the
    sentinel rank 26 is returned.
    """
    retrieved = [int(token) for token in row.MisconceptionId.split()]
    if row.true_original not in retrieved:
        return 26
    return retrieved.index(row.true_original) + 1

def get_candidates(row):
    """Build the shuffled, numbered candidate list that is shown in the prompt.

    The ground-truth id (``row.true_original``) is placed in front of the ids parsed
    from ``row.MisconceptionId``; duplicates are dropped while keeping first-seen
    order; the first 9 ids are kept and then shuffled with ``random.shuffle``.

    Returns:
        A tuple ``(text, ids)``: ``text`` has one newline-terminated "<n>. <name>"
        line per candidate (numbered from 1, names looked up in ``mapping_dict``)
        and ``ids`` is the shuffled id list in the same order.
    """
    retrieved = [int(token) for token in row.MisconceptionId.split()]
    # dict.fromkeys de-duplicates while preserving insertion order.
    target_ids = list(dict.fromkeys([row.true_original, *retrieved]))[:9]
    random.shuffle(target_ids)

    numbered_lines = [
        f"{position}. {mapping_dict[misconception_id]}\n"
        for position, misconception_id in enumerate(target_ids, start=1)
    ]
    return "".join(numbered_lines), target_ids

def get_true_label(row):
    """Return the 1-based position of ``row.true_original`` in ``row.target_ids``.

    Returns -1 when the true id is not among the candidates.
    """
    candidates = row.target_ids
    try:
        position = candidates.index(row.true_original)
    except ValueError:
        return -1
    return position + 1

# User-turn prompt template; apply_template fills it with str.format.
# The placeholder name "Retrival" (sic) must stay in sync with the keyword passed to PROMPT.format.
PROMPT = """
Here is a question about {ConstructName}({SubjectName}).
Question: {Question}
Correct Answer: {CorrectAnswer}
Incorrect Answer: {IncorrectAnswer}

You are a Mathematics teacher. Your task is to reason and identify the misconception behind the Incorrect Answer with the Question.
Answer concisely what misconception it is to lead to getting the incorrect answer.
Pick the correct misconception number from the below:

{Retrival}
"""

def preprocess_text(x):
    r"""Normalize a text before it is put into the chat template.

    Steps, in order:
      1. remove URLs (``http://`` / ``https://`` up to the next whitespace);
      2. collapse runs of periods into one period and runs of commas into one comma;
      3. replace the LaTeX inline-math delimiters ``\(`` and ``\)`` with a space;
      4. collapse runs of spaces (newlines are kept) and strip both ends.

    Args:
        x: Input text.

    Returns:
        The cleaned text.
    """
    # The earlier pattern "http\w+" (non-raw string, invalid escape) stopped at the first
    # non-word character and left "://host/path" behind; this one consumes the whole URL.
    x = re.sub(r"https?://\S+", "", x)
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r",+", ",", x)
    x = re.sub(r"\\\(", " ", x)
    x = re.sub(r"\\\)", " ", x)
    x = re.sub(r" +", " ", x)
    return x.strip()

def _answer_text(row, letter):
    """Return the text of answer option ``letter`` taken from ``row["Answer<letter>Text"]``.

    Falls back to ``letter`` itself when the row has no such column (for example
    when the value already is the answer text rather than an option letter).
    """
    column = f"Answer{letter}Text"
    return row[column] if column in row else letter


def apply_template(row):
    """Render one training example as chat text and tokenize it.

    The user turn is ``PROMPT`` filled from the row and cleaned with
    ``preprocess_text``; the assistant turn is ``"Answer:<row.true>"``.

    ``answer_name`` is an option letter (it is used to build the
    ``Misconception<letter>Id`` column name elsewhere in this notebook), so the
    prompt is filled with the corresponding ``Answer<letter>Text`` value instead of
    the bare letter; the same lookup is applied to ``CorrectAnswer``.
    NOTE(review): this changes the prompt text -- any inference code must build its
    prompts the same way.

    Args:
        row: A row providing ConstructName, SubjectName, QuestionText, answer_name,
            CorrectAnswer, retrieval, true and (normally) the Answer[A-D]Text columns.

    Returns:
        ``(input_ids, attention_mask)`` as produced by the module-level ``tokenizer``.
    """
    input_text = preprocess_text(PROMPT.format(
        ConstructName=row["ConstructName"],
        SubjectName=row["SubjectName"],
        Question=row["QuestionText"],
        IncorrectAnswer=_answer_text(row, row["answer_name"]),
        CorrectAnswer=_answer_text(row, row["CorrectAnswer"]),
        Retrival=row["retrieval"],
    ))

    messages = [
        {"role": "user", "content": input_text},
        {"role": "assistant", "content": f"Answer:{row.true}"},
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    # The chat template already contains the special tokens, so none are added here.
    item = tokenizer(text, add_special_tokens=False, truncation=False)
    return item["input_ids"], item["attention_mask"]

# Seed the Python / NumPy / torch RNGs before any candidate list is shuffled, so the
# candidate order (and therefore the "true" labels and token ids) is reproducible.
# 42 matches the seed used for the DataFrame shuffle and the TrainingArguments.
set_seed(42)

# Ground-truth misconception id of the selected answer option.
df_train["true_original"] = df_train.apply(lambda x: int(x[f"Misconception{x.answer_name}Id"]), axis=1)
df_train["retrieval_rank"] = df_train.apply(get_retrieval_rank, axis=1)
df_train[["retrieval", "target_ids"]] = df_train.apply(get_candidates, axis=1, result_type="expand")
df_train["true"] = df_train.apply(get_true_label, axis=1)
# Drop rows whose true id is missing from the candidate list (label -1).
df_train = df_train[df_train["true"] != -1].reset_index(drop=True)
print(df_train.shape)
df_train[["input_ids", "attention_mask"]] = df_train.apply(apply_template, axis=1, result_type="expand")

# Group by QuestionId so that all rows of one question land in the same fold;
# only fold `fold_idx` is used as the validation set.
cv = list(GroupKFold(n_splits=N_SPLIT).split(df_train, y=df_train["true"], groups=df_train["QuestionId"]))
fold_idx = 0
trn_idx, val_idx = cv[fold_idx]

print(f"fold: {fold_idx}")
print(f"train size: {len(trn_idx)}, eval size: {len(val_idx)}")
print("val_ids: ", val_idx[:10])

# val_idx is positional; it can be used with .loc because the index was just reset.
df_train["valid_flag"] = 0
df_train.loc[val_idx, "valid_flag"] = 1
df_train["category"] = "original"

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

(4370, 26)
fold: 0
train size: 4260, eval size: 110
val_ids:  [  1   2   3  79  80  81 179 180 181 346]


In [None]:
# Count how often each misconception id is the ground truth and keep the ids seen at most 4 times.
df_label_count = df_train["true_original"].value_counts().reset_index()
df_label_count = df_label_count[df_label_count["count"]<=4]

print(df_label_count["count"].sum()) # add 2439 rows of data

# Copy of every row whose ground-truth misconception is one of the rare ids.
rare_misconception_list = df_label_count["true_original"].values
df_train_rare = df_train[df_train["true_original"].isin(rare_misconception_list)].copy().reset_index(drop=True)

def get_candidates_v2(row):
    """Candidate builder that skips the retrieved ids at positions 5-9.

    The pool is the ground-truth id (``row.true_original``), the first 4 retrieved
    ids and every retrieved id from the 10th onward; duplicates are dropped in
    first-seen order, the first 9 ids are kept and shuffled with ``random.shuffle``.

    Returns:
        A tuple ``(text, ids)``: ``text`` has one newline-terminated "<n>. <name>"
        line per candidate (numbered from 1, names looked up in ``mapping_dict``)
        and ``ids`` is the shuffled id list in the same order.
    """
    retrieved = [int(token) for token in row.MisconceptionId.split()]
    pool = [row.true_original, *retrieved[:4], *retrieved[9:]]
    target_ids = list(dict.fromkeys(pool))[:9]
    random.shuffle(target_ids)

    numbered_lines = [
        f"{position}. {mapping_dict[misconception_id]}\n"
        for position, misconception_id in enumerate(target_ids, start=1)
    ]
    return "".join(numbered_lines), target_ids

# Rebuild the candidate lists of the rare-label copies with get_candidates_v2, so the
# add-on rows show a different candidate set than their originals. Previously this called
# get_candidates, which left get_candidates_v2 unused and only re-shuffled the same candidates.
df_train_rare[["retrieval", "target_ids"]] = df_train_rare.apply(get_candidates_v2, axis=1, result_type="expand")
df_train_rare["true"] = df_train_rare.apply(get_true_label, axis=1)
df_train_rare = df_train_rare[df_train_rare["true"] != -1].reset_index(drop=True)
print(df_train_rare.shape)
df_train_rare[["input_ids", "attention_mask"]] = df_train_rare.apply(apply_template, axis=1, result_type="expand")
df_train_rare["category"] = "addon"

# The copies keep the valid_flag of the rows they were copied from.
df_train = pd.concat([df_train, df_train_rare]).reset_index(drop=True)
print(df_train.shape)

2439
(2439, 30)
(6809, 30)


In [None]:
class EediDataset(Dataset):
    """Map-style dataset that serves the ``input_ids`` column of a DataFrame."""

    def __init__(self, df: pd.DataFrame):
        # Stored by reference; rows are addressed positionally in __getitem__.
        self.df = df

    def __len__(self) -> int:
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index) -> dict:
        """Return ``{"input_ids": ...}`` for the row at position ``index``."""
        record = self.df.iloc[index]
        return {"input_ids": record["input_ids"]}


# Shuffle all rows with a fixed seed, then split on valid_flag.
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
trn_df = df_train[df_train["valid_flag"] == 0]
val_df = df_train[df_train["valid_flag"] == 1]

print(trn_df.shape)
print(val_df.shape)

# NOTE(review): `ds` is not referenced again in the visible cells — the Trainer is given
# its own EediDataset(trn_df) / EediDataset(val_df) objects.
ds = EediDataset(df_train)
# "Answer:" is the prefix that apply_template puts at the start of the assistant turn.
# NOTE(review): presumably used as the response template so that only the tokens after it
# contribute to the loss — confirm against the TRL documentation.
data_collator = DataCollatorForCompletionOnlyLM("Answer:", tokenizer=tokenizer)

(6642, 30)
(167, 30)


In [None]:
# 4-bit NF4 quantization with double quantization and bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the base model with the quantization config above; the Hugging Face token is read
# from the Colab user secrets rather than hard-coded.
# NOTE(review): trust_remote_code=True permits code shipped in the model repo to run —
# confirm it is actually required for MODEL_NAME.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    token=userdata.get('HF_TOKEN'),
    quantization_config=bnb_config
)

# LoRA adapter (rank 8, alpha 16, dropout 0.05) on the four attention projections only;
# the MLP projections are listed but commented out.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    task_type=TaskType.CAUSAL_LM,
    bias='none',
    target_modules=(
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        # "gate_proj",
        # "up_proj",
        # "down_proj",
    )
)

# Wrap the loaded model with the adapter and report the trainable-parameter count.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Trainer configuration. Checkpoints go to a per-run directory on Google Drive.
training_args = TrainingArguments(
    output_dir=f"/content/drive/MyDrive/eedi/output/{RUN_NAME}/",
    overwrite_output_dir=False,

    log_level="error",

    # Log and evaluate every 40 optimizer steps.
    logging_steps=40,
    logging_strategy="steps",

    eval_strategy="steps",
    eval_steps=40,
    # NOTE(review): load_best_model_at_end is not set, so this setting appears to have
    # no effect on which checkpoint is kept — confirm the intent.
    metric_for_best_model="loss",

    # One checkpoint per epoch, keeping only the most recent one.
    save_strategy="epoch",
    save_total_limit=1,

    num_train_epochs=EPOCHS,

    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    learning_rate=L_RATE,
    weight_decay=0.01,

    # Exactly one of bf16 / fp16 is enabled, depending on GPU bf16 support.
    bf16=is_torch_bf16_gpu_available(),
    fp16=not is_torch_bf16_gpu_available(),

    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_BATCH_SIZE,

    # Accumulate so that batch size x accumulation steps is 16 per device
    # (when PER_DEVICE_BATCH_SIZE divides 16).
    gradient_accumulation_steps=16 // PER_DEVICE_BATCH_SIZE,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    group_by_length=False,
    report_to='none',
    seed = 42,
    # The dataset items are plain dicts, so no columns should be stripped by the Trainer.
    remove_unused_columns=False,
)

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/63.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/17 [00:00<?, ?it/s]

model-00001-of-00017.safetensors:   0%|          | 0.00/3.92G [00:00<?, ?B/s]

model-00002-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00003-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00004-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00005-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00006-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00007-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00008-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00009-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00010-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00011-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00012-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00013-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00014-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00015-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00016-of-00017.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00017-of-00017.safetensors:   0%|          | 0.00/3.10G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/17 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

trainable params: 16,777,216 || all params: 32,780,653,568 || trainable%: 0.0512


In [None]:
# Train on the rows with valid_flag == 0 and evaluate on the rows with valid_flag == 1,
# batching with the completion-only collator built above.
trainer = Trainer(
    model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=EediDataset(trn_df),
    eval_dataset=EediDataset(val_df),
    data_collator=data_collator,
)

trainer_output = trainer.train()


Step,Training Loss,Validation Loss
40,12.8949,3.111187
80,0.9986,0.667333
120,0.6596,0.65445
160,0.6406,0.630155
200,0.6262,0.613718
240,0.6233,0.609361
280,0.6239,0.603147
320,0.5739,0.601838
360,0.5579,0.598889
400,0.5898,0.58581


Step,Training Loss,Validation Loss
40,12.8949,3.111187
80,0.9986,0.667333
120,0.6596,0.65445
160,0.6406,0.630155
200,0.6262,0.613718
240,0.6233,0.609361
280,0.6239,0.603147
320,0.5739,0.601838
360,0.5579,0.598889
400,0.5898,0.58581


In [None]:
# Release the Colab runtime; under "Run all" this executes once the previous cell
# (training) has returned.
# NOTE(review): anything not already written to Drive is presumably lost at this point —
# confirm the per-epoch checkpoint is all that is needed.
from google.colab import runtime
runtime.unassign()