In [16]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Tiny‐scale MCQA fine‐tuning on Qwen3: pick A–E over 50 examples.
"""
import os                            # 0. Interact with the operating system
from datasets import load_dataset   #  Load dataset utilities from Hugging Face
from transformers import (
    AutoTokenizer,                  #  Tokenizer for converting text to model inputs
    AutoModelForSequenceClassification,  #  Model class for sequence classification tasks
    Trainer,                        #  Trainer API for training/evaluation
    TrainingArguments,              #  Argument class for training configuration
    default_data_collator,          #  Data collator for batching inputs
)

# 0. Auth
HF_TOKEN = "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"
os.environ["HF_TOKEN"] = HF_TOKEN  #  Set HF API token in environment

# 1. Configuration
MODEL_NAME    = "Qwen/Qwen3-0.6B-Base"      #  Pretrained model identifier
DATASET_NAME  = "NicoHelemon/MNLP_M2_mcqa_dataset"  #  MCQA dataset identifier
OUTPUT_DIR    = "tmp_small_mcqa"           #  Directory for saving outputs
MAX_LENGTH    = 256                        #  Max token length per input
BATCH_SIZE    = 4                          #  Training batch size per device
NUM_EPOCHS    = 50                         #  Number of training epochs
LEARNING_RATE = 5e-5                       #  Learning rate for optimizer
SMALL_SIZE    = 50                         #  Subsample size from dataset
NUM_LABELS    = 5    # A–E                 #  Number of answer choices (A–E)

# 2. Load & subsample
raw = load_dataset(DATASET_NAME, split="train")  #  Load training split of dataset
small_raw = raw.shuffle(seed=42).select(range(SMALL_SIZE))  #  Shuffle and select first SMALL_SIZE examples

# 3. Tokenizer & model for classification
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, 
    trust_remote_code=True,  #  Allow running custom code from the model repo
    token=HF_TOKEN           #  Use HF token for authentication
)

# <— ADD THESE TWO LINES:
tokenizer.pad_token     = tokenizer.eos_token           #  Set padding token to end-of-sequence token
tokenizer.pad_token_id  = tokenizer.eos_token_id        #  Set padding token ID accordingly

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,  #  Allow custom model code
    token=HF_TOKEN,          #  Authenticate
    num_labels=NUM_LABELS,   #  Define number of classification labels
)

# Ensure the model knows the pad token ID
model.config.pad_token_id = tokenizer.pad_token_id

# 4. Preprocessing: one prompt string → one label 0–4
def preprocess(ex):
    q    = ex["question"]                       #  Extract question text
    opts = ex["options"]                        #  Extract list of 5 option strings
    # Build a single prompt with all options
    prompt_lines = [f"Question: {q}", "Options:"]  #  Start prompt with question and header
    labels = ["A","B","C","D","E"]         #  List of answer labels
    for L, o in zip(labels, opts):
        prompt_lines.append(f"{L}. {o}")           #  Append each label-option pair
    prompt_lines.append("Answer (letter only):")    #  Prompt for answer letter
    prompt = "\n".join(prompt_lines)              #  Join lines into a single string

    enc = tokenizer(
        prompt,
        truncation=True,          #  Truncate inputs longer than MAX_LENGTH
        padding="max_length",   #  Pad inputs to MAX_LENGTH
        max_length=MAX_LENGTH,    #  Maximum sequence length
    )
    return {
        "input_ids":      enc["input_ids"],          #  Token IDs
        "attention_mask": enc["attention_mask"],     #  Attention mask for padding
        # map "A"→0, …, "E"→4
        "labels":         ord(ex["label"]) - ord("A"),  #  Convert label letter to index
    }

# Apply preprocessing to dataset, removing original columns
tokenized = small_raw.map(preprocess, remove_columns=small_raw.column_names)

# 5. Trainer setup
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,                       #  Where to save checkpoints and logs
    num_train_epochs=NUM_EPOCHS,                 #  Total number of training epochs
    per_device_train_batch_size=BATCH_SIZE,      #  Batch size per GPU/CPU
    learning_rate=LEARNING_RATE,                 #  Learning rate
    logging_steps=5,                            #  Log every 10 steps
    save_steps=50,                               #  Save checkpoint every 50 steps
    push_to_hub=False,                           #  Disable pushing to Hugging Face Hub
)

trainer = Trainer(
    model=model,                                 #  Model to train
    args=training_args,                         #  Training configuration
    train_dataset=tokenized,                    #  Prepared training dataset
    data_collator=default_data_collator,        #  Function to batch examples    
)

# 6. Train & save
trainer.train()                #  Run the training loop
model.save_pretrained(OUTPUT_DIR)  #  Save final model weights
tokenizer.save_pretrained(OUTPUT_DIR)  #  Save tokenizer configuration

Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen3-0.6B-Base and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 594.00 MiB. GPU 0 has a total capacity of 19.50 GiB of which 6.81 MiB is free. Process 1080787 has 1.89 GiB memory in use. Process 1207723 has 1.89 GiB memory in use. Process 1797640 has 1.12 GiB memory in use. Process 1853060 has 1.48 GiB memory in use. Process 1649597 has 348.00 MiB memory in use. Process 1916197 has 10.61 GiB memory in use. Process 1978994 has 1.98 GiB memory in use. Process 2096171 has 744.00 MiB memory in use. Process 2128560 has 1.89 GiB memory in use. Process 2181713 has 1.89 GiB memory in use. Process 3032836 has 17.48 GiB memory in use. Process 3072102 has 4.32 GiB memory in use. Process 3094191 has 13.77 GiB memory in use. Of the allocated memory 17.12 GiB is allocated by PyTorch, and 259.14 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [3]:
import os
import multiprocessing

# CPU
print("CPU cores (logical):", os.cpu_count())
print("CPU cores (physical):", multiprocessing.cpu_count() // 2)  # roughly, on hyperthreaded machines

# PyTorch GPU check
import torch
if torch.cuda.is_available():
    n_gpu = torch.cuda.device_count()
    print("GPUs available:", n_gpu)
    for i in range(n_gpu):
        print(f" GPU {i}:", torch.cuda.get_device_name(i))
else:
    print("No CUDA GPUs detected")

# Hugging Face / Accelerate
from accelerate import notebook_launcher, infer_auto_device_map
print("Accelerate reports:", infer_auto_device_map(torch.nn.Linear(1,1)))


CPU cores (logical): 128
CPU cores (physical): 64
GPUs available: 1
 GPU 0: NVIDIA A100-SXM4-80GB MIG 1g.20gb
Accelerate reports: OrderedDict({'': 0})
