In [1]:
### Loading the Dataset

try: from datasets import load_dataset
except:
    !pip install datasets
    from datasets import load_dataset


dataset = load_dataset("nvidia/OpenMathInstruct-2", split = "train_1M")
dataset_small = dataset.select(range(10000))
dataset_split = dataset_small.train_test_split(test_size=0.1, seed=42)
train_data, eval_data = dataset_split["train"], dataset_split["test"]

print(dataset)

Using the latest cached version of the dataset since nvidia/OpenMathInstruct-2 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\vishg\.cache\huggingface\datasets\nvidia___open_math_instruct-2\default\0.0.0\469216e3f46f4dacf476b382e192485ea51a143e (last modified on Mon Mar 31 09:49:03 2025).


Dataset({
    features: ['problem', 'generated_solution', 'expected_answer', 'problem_source'],
    num_rows: 1000000
})


In [2]:
### Logging into HuggingFace

try:
  from dotenv import load_dotenv
except:
  !pip install python-dotenv
  from dotenv import load_dotenv

from huggingface_hub import login, whoami
import os

load_dotenv()
login(os.getenv("hugging_face_key2"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\vishg\.cache\huggingface\token
Login successful


In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import get_peft_model, PrefixTuningConfig, TaskType

# Base checkpoint shared by the tokenizer and the model.
model_name = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)

# Prefix-tuning setup: 10 virtual tokens, with the prefix projection enabled,
# configured for training (inference_mode=False) on a causal-LM task.
peft_config = PrefixTuningConfig(
    num_virtual_tokens=10,
    prefix_projection=True,
    inference_mode=False,
    task_type=TaskType.CAUSAL_LM,
)

# Wrap the base model with the PEFT config and report the trainable-parameter count.
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()


trainable params: 14,772,480 || all params: 139,212,288 || trainable%: 10.6115




In [22]:
def format_prompt(example):
    """Tokenize one problem/answer row into a fixed-length causal-LM example.

    Args:
        example: A dataset row; its ``problem`` and ``expected_answer``
            fields are interpolated into the prompt text.

    Returns:
        The tokenizer output for the prompt, padded/truncated to 512 tokens,
        with an added ``labels`` entry: a copy of ``input_ids`` in which every
        padded position (attention mask 0) is replaced by -100 so that padding
        does not contribute to the language-modeling loss.
    """
    text = f"[INST] Problem: {example['problem']} [/INST] Solution: {example['expected_answer']}"
    tokenized = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    # Every example is padded to max_length, and the pad token is the same id
    # as EOS, so padding cannot be recognised from the ids themselves. Use the
    # attention mask to find padded positions and mark them with the ignore
    # index (-100); otherwise the loss is computed mostly over padding.
    tokenized["labels"] = [
        token_id if mask == 1 else -100
        for token_id, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

def _tokenize_split(split):
    """Apply ``format_prompt`` to every row of ``split`` and expose the result as torch tensors."""
    # Row-by-row mapping; the original text columns are dropped from the result.
    encoded = split.map(format_prompt, batched=False, remove_columns=split.column_names)
    encoded.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return encoded


tokenized_train = _tokenize_split(train_data)
tokenized_eval = _tokenize_split(eval_data)

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [23]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling



# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Output locations and reporting
    output_dir="./prefix_gpt2",
    logging_dir="./logs",
    report_to="none",
    # Run length and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Step-based cadence: log every 100, evaluate every 500, save every 1000
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    save_steps=1000,
)

# Train the PEFT-wrapped model on the tokenized train split and evaluate on the eval split.
trainer = Trainer(
    args=training_args,
    model=peft_model,
    eval_dataset=tokenized_eval,
    train_dataset=tokenized_train,
)

trainer.train()





  0%|          | 0/3375 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


KeyboardInterrupt: 