# Installing packages

In [1]:
%%capture
# Environment setup for the notebook. %%capture hides the pip output of this cell.
!pip install pip3-autoremove
# Remove the preinstalled PyTorch stack before installing the pinned versions below.
!pip-autoremove torch torchvision torchaudio -y
# Pinned torch/xformers pair (the model-loading banner reports Torch 2.4.0+cu121 and Xformers 0.0.27.post2).
!pip install "torch==2.4.0" "xformers==0.0.27.post2" triton torchvision torchaudio "trl==0.14.0"
# Unsloth is installed from the repository head, i.e. it is not pinned to a version.
!pip install "unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git"
# Replace the released trl with a build from one specific commit.
# NOTE(review): "trl==0.14.0" above is uninstalled right here, so that pin looks redundant — confirm.
!pip uninstall -y trl
!pip install git+https://github.com/huggingface/trl.git@e95f9fb74a3c3647b86f251b7e230ec51c64b72b

# Import libraries

In [2]:
import numpy as np
import pandas as pd

import torch
import torch.nn.functional as F
from unsloth import FastLanguageModel
from datasets import Dataset

import os
import ast
import string
import random
from tqdm.auto import tqdm
from IPython.display import clear_output

# Turn off Weights & Biases logging for this run.
os.environ["WANDB_DISABLED"] = "true"
# Presumably works around the "duplicate OpenMP runtime" abort some builds hit — TODO confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

Unsloth: Patching Xformers to fix some performance issues.
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


# Downloading model

In [3]:
# Model id of the base checkpoint that gets fine-tuned below.
model_name = "unsloth/Qwen2.5-0.5B"

# Download the model and its tokenizer through Unsloth's loader.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = 8192,  # sequence length the model is set up for
    dtype = None,  # no explicit dtype requested; presumably auto-detected by Unsloth — verify
    load_in_4bit = True,  # request 4-bit loading
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

# Inference mode is left disabled at this point; the model is fine-tuned further down.
# FastLanguageModel.for_inference(model)
# print("Model is ready to inference")

==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.50.2.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.4.0+cu121. CUDA: 6.0. CUDA Toolkit: 12.1. Triton: 3.0.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/521M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.71k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

In [4]:
# Wrap the base model with LoRA adapters on the attention and MLP projections.
# The training banner later reports 8,798,208 trainable parameters for this setup.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.3.19 patched 24 layers with 24 QKV layers, 24 O layers and 24 MLP layers.


# Load & process dataset

In [5]:
# Labelled training split with columns task_id, question, choices and answer.
# `choices` holds a Python-list literal stored as text (parsed later with ast.literal_eval).
df_train = pd.read_csv("/kaggle/input/fpt-ai-residency-batch-6-entry-test/b6_train_data.csv")
df_train.head()

Unnamed: 0,task_id,question,choices,answer
0,k10168,Question: What will be output of the following...,"['8 4 2', '8 4 2', '8 4 4', '8 4 3']",C
1,k10173,Question: What will be output of the following...,"['-4', '-5', '10', '11']",A
2,k10174,Question: Match the following.\n Group 1 ...,"['P-4. Q-1, R-2, S-3', 'P-3, Q-1, R-4, S-2', '...",B
3,k10175,Question: Match the following.\nP. Regular exp...,"['P-4. Q-1, R-2, S-3', 'P-3, Q-1, R-4, S-2', '...",B
4,k10176,Question: Which grammar rules violate the requ...,"['1 only', '1 and 3 only', '2 and 3 only', '3 ...",B


In [6]:
# Unlabelled test split: same columns as the training data minus `answer`.
df_test = pd.read_csv("/kaggle/input/fpt-ai-residency-batch-6-entry-test/b6_test_data.csv")
df_test.head()

Unnamed: 0,task_id,question,choices
0,k10171,Question: What will be output of the following...,"['10', '9', '8', 'Error']"
1,k10182,Question: Consider line 3. Identify the compil...,"['No compilation error', 'Only a lexical error..."
2,k10184,Question: Assume the conflicts part (a) of thi...,['Equal precedence and left associativity; exp...
3,k10206,Question: What will be output if you will exec...,"['2.00000', '4.00000', '6.00000', 'Compilation..."
4,k10215,Question: Select the output for code :\nstatic...,"['amish', 'ANKIT', 'harsh', 'Compile time error']"


In [7]:
# Profile the raw training frame: `answer` has fewer non-null values than there are rows,
# and both `question` and `choices` contain repeated values.
df_train.describe()

Unnamed: 0,task_id,question,choices,answer
count,3963,3963,3963,3949
unique,3963,3826,3451,11
top,k00701,Question: What is Conditional Rendering?,"['False', 'True']",A
freq,1,7,22,988


In [8]:
# Drop every row that has a missing field (only `answer` has missing values in the profile above).
df_train = df_train.dropna()
# NOTE(review): drop_duplicates() compares whole rows and task_id is unique per row,
# so this call removes nothing (the row count stays the same and repeated questions remain).
# Consider subset=["question", "choices"] if content-level de-duplication is intended.
df_train = df_train.drop_duplicates()
df_train.describe()

Unnamed: 0,task_id,question,choices,answer
count,3949,3949,3949,3949
unique,3949,3814,3450,11
top,k00701,Question: What is Conditional Rendering?,"['False', 'True']",A
freq,1,7,22,988


In [9]:
# Inspect the raw label vocabulary before normalising it.
df_train["answer"].unique()

array(['C', 'A', 'B', 'D', 'ANSWER: C', 'ANSWER: B', 'ANSWER: D',
       'ANSWER: A', 'E', 'G', 'ANSWER:  D'], dtype=object)

In [10]:
# Normalise labels to a single letter: some rows carry an "ANSWER: X" prefix,
# so keep only the last character after trimming surrounding whitespace.
df_train['answer'] = df_train['answer'].astype(str).str.strip().str[-1]
df_train["answer"].unique()

array(['C', 'A', 'B', 'D', 'E', 'G'], dtype=object)

In [11]:
# Profile the test frame (row count, unique values, most frequent value per column).
df_test.describe()

Unnamed: 0,task_id,question,choices
count,1253,1253,1253
unique,1253,1242,1163
top,k00700,Question: Match the following:,"['(1)', '(2)', '(3)', '(4)']"
freq,1,2,7


In [12]:
# Two-way lookup between option positions and letter labels: 0 <-> "A", 1 <-> "B", ... 25 <-> "Z".
index2choice = dict(enumerate(string.ascii_uppercase))
choice2index = {label: position for position, label in index2choice.items()}

def choices2str(choices):
    """Render a stringified list of options as lettered lines.

    Args:
        choices: Text holding a Python list literal, e.g. "['10', '9']".

    Returns:
        One "<letter>. <option>" entry per option, separated by blank lines,
        with surrounding whitespace stripped. An empty list yields "".

    Raises:
        KeyError: If there are more options than letters in the alphabet.
    """
    options = ast.literal_eval(choices)
    entries = [f"{index2choice[position]}. {option}" for position, option in enumerate(options)]
    return "\n\n".join(entries).strip()

In [13]:
# Training prompt. The three "{}" slots are filled, in order, with the question text,
# the lettered choices, and the gold answer letter.
# NOTE(review): the instructions mention only A-D, but the cleaned labels also contain E and G.
prompt_template = """You are a programming expert and will answer multiple-choice questions about code.  
Read the following code snippet carefully and select the **best** answer.  

### Response Format:
- Reply with **only** the letter of the correct choice (A, B, C, or D).  
- Do **not** provide explanations.  

### Question:
{}

### Choices:
{}

### Response:
{}"""

def preprocessing(df):
    """Build the supervised fine-tuning dataset from a labelled DataFrame.

    Each row is rendered through ``prompt_template`` using its ``question``,
    its ``choices`` (formatted by ``choices2str``) and its ``answer``.

    Args:
        df: DataFrame with ``question``, ``choices`` and ``answer`` columns.
            It is left unmodified.

    Returns:
        A ``Dataset`` with a single ``text`` column, one row per input row.
    """
    texts = [
        prompt_template.format(question, choices2str(choices), answer)
        for question, choices, answer in zip(df["question"], df["choices"], df["answer"])
    ]
    # Build from a fresh frame instead of adding a column to the caller's
    # DataFrame, and do not carry the pandas index over: after dropna() it is
    # no longer contiguous and would otherwise show up as an extra column.
    text_frame = pd.DataFrame({"text": texts})
    return Dataset.from_pandas(text_frame, preserve_index=False)  # convert DataFrame to Dataset

In [14]:
# Render every cleaned training row into one prompt-plus-answer text and wrap the result as a Dataset.
dataset = preprocessing(df_train)

# Train model

In [15]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning setup. Effective batch size is
# per_device_train_batch_size * gradient_accumulation_steps = 8 * 4 = 32.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text", # Column that holds the fully rendered prompt + answer.
    max_seq_length = 8192, # Same limit as passed to from_pretrained (was 8092, presumably a typo).
    dataset_num_proc = 2,
    packing = False, # Packing can make training faster for short sequences; disabled here.
    args = TrainingArguments(
        per_device_train_batch_size = 8,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 3, # Three full passes over the training set.
        # max_steps = 60,
        learning_rate = 2e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 30,

        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Do not report to experiment trackers such as WandB.
    ),
)

Map (num_proc=2):   0%|          | 0/3949 [00:00<?, ? examples/s]

In [16]:
# Run the fine-tuning; trainer_stats keeps whatever train() returns.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,949 | Num Epochs = 3 | Total steps = 369
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 8,798,208/5,000,000,000 (0.18% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
30,0.9995
60,0.6905
90,0.6824
120,0.6631
150,0.6385
180,0.6071
210,0.6109
240,0.5885
270,0.5982
300,0.5356


# Save model

In [17]:
# Write the fine-tuned model and the tokenizer files into the local "lora_model" directory.
# NOTE(review): `model` is the LoRA-wrapped model, so presumably only adapter weights are saved — verify.
model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/vocab.json',
 'lora_model/merges.txt',
 'lora_model/added_tokens.json',
 'lora_model/tokenizer.json')

# Inference on the test set

In [18]:
# Inference prompt: same wording as the training prompt, but it stops right after
# "### Response:" so the model has to produce the answer letter itself.
# Only two "{}" slots remain: the question text and the lettered choices.
# NOTE(review): the variable name is misspelled ("tempalte"); it is kept as-is because
# apply_template() refers to it by this name.
prompt_tempalte = """You are a programming expert and will answer multiple-choice questions about code.  
Read the following code snippet carefully and select the **best** answer.  

### Response Format:
- Reply with **only** the letter of the correct choice (A, B, C, or D).  
- Do **not** provide explanations.  

### Question:
{}

### Choices:
{}

### Response:
"""

# Position <-> letter lookups (0 <-> "A" ... 25 <-> "Z"), redefined in this cell for the inference helpers.
index2choice = dict(enumerate(string.ascii_uppercase))
choice2index = {label: position for position, label in index2choice.items()}

def choices2str(choices):
    """Format a stringified list of options as "<letter>. <option>" entries.

    Args:
        choices: Text holding a Python list literal, e.g. "['10', '9']".

    Returns:
        The entries joined by blank lines, with surrounding whitespace
        stripped. An empty list yields "".

    Raises:
        KeyError: If there are more options than letters in the alphabet.
    """
    options = ast.literal_eval(choices)
    entries = [f"{index2choice[position]}. {option}" for position, option in enumerate(options)]
    return "\n\n".join(entries).strip()

def apply_template(question, choices):
    """Fill the inference prompt with one question and its lettered choices.

    Args:
        question: Question text.
        choices: Stringified list of options, as accepted by choices2str().

    Returns:
        The prompt text, ending right after the "### Response:" header.
    """
    lettered_choices = choices2str(choices)
    return prompt_tempalte.format(question, lettered_choices)

def response_parser(response, num_choices=None):
    """Extract the answer letter from raw model output.

    Args:
        response: Text generated by the model.
        num_choices: Optional number of options the question actually has.
            When given (and positive), only the first ``num_choices`` letters
            are accepted and the random fallback is drawn from them. The
            default ``None`` accepts any letter A-Z, as before.

    Returns:
        A single upper-case letter. If no valid letter is found, a random
        valid letter is returned so that every question still gets an answer.
    """
    # Same letters as the keys of choice2index, taken from the stdlib directly.
    letters = string.ascii_uppercase
    if num_choices is not None and num_choices > 0:
        valid = list(letters[:num_choices])
    else:
        valid = list(letters)

    # A bare letter is accepted in either case but always reported in upper
    # case, so "b" and "B" yield the same submission value.
    bare = response.strip().upper()
    if len(bare) == 1 and bare in valid:
        return bare

    # Otherwise look for a standalone capital letter among the words. This scan
    # stays case-sensitive so that the article "a" is not mistaken for choice A.
    cleaned = response.translate(str.maketrans('', '', string.punctuation))
    for token in cleaned.split():
        if token in valid:
            return token

    # Nothing usable in the output: guess among the valid letters.
    return random.choice(valid)

def generate(prompt:str, max_new_tokens=1):
    """Generate a continuation for one prompt and return only the new text.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        prompt: Full prompt text.
        max_new_tokens: Maximum number of tokens to generate after the prompt.

    Returns:
        The decoded newly generated tokens (special tokens skipped), with
        surrounding whitespace stripped.
    """
    encoded = tokenizer([prompt], return_tensors="pt").to("cuda")
    # Remember the prompt length so the echoed prompt can be cut off the output.
    prompt_length = encoded.input_ids.shape[1]

    generated = model.generate(**encoded, max_new_tokens=max_new_tokens, use_cache=True)
    continuation = generated[0, prompt_length:]
    return tokenizer.decode(continuation, skip_special_tokens=True).strip()

In [19]:
# Predictions are collected column-wise and turned into the submission frame afterwards.
data_dict = {"task_id": [], "answer": []}

# NOTE(review): FastLanguageModel.for_inference(model) is only present as commented-out code
# in the model-loading cell; enabling it before this loop may speed generation up — confirm.
test_rows = zip(df_test["task_id"], df_test["question"], df_test["choices"])
for task_id, question, choices in tqdm(test_rows, total=df_test.shape[0], desc="Inference on Test set"):
    prompt = apply_template(question, choices)
    data_dict["task_id"].append(task_id)
    # Two new tokens are requested; response_parser() reduces the text to one letter.
    response = generate(prompt, 2)
    data_dict["answer"].append(response_parser(response))

Inference on Test set:   0%|          | 0/1253 [00:00<?, ?it/s]

# Submission

In [20]:
# Assemble the collected predictions into a two-column frame (task_id, answer).
submission = pd.DataFrame(data_dict)
submission.head()

Unnamed: 0,task_id,answer
0,k10171,B
1,k10182,B
2,k10184,C
3,k10206,D
4,k10215,C


In [21]:
# Sanity check: expect one prediction per test task and a small set of distinct answer letters.
submission.describe()

Unnamed: 0,task_id,answer
count,1253,1253
unique,1253,4
top,k00700,B
freq,1,409


In [22]:
# Write the predictions to submission.csv without the pandas index column.
submission.to_csv("submission.csv", index=False)