In [None]:
#pip install datasets

In [1]:
from datasets import load_dataset

# --- Digit-level dataset: read the JSONL file, then hold out 10% for validation ---
ds_digit = load_dataset("json", data_files="dataset_gruver_digit.jsonl")
digit_ds = ds_digit["train"].train_test_split(test_size=0.1, shuffle=True, seed=42)

# --- Numeric-token dataset: same procedure and same seed as above ---
ds_numeric = load_dataset("json", data_files="dataset_delphyne_numeric.jsonl")
numeric_ds = ds_numeric["train"].train_test_split(test_size=0.1, shuffle=True, seed=42)

# Persist both DatasetDicts so later cells can reload them without re-splitting
for split_dict, out_dir in (
    (digit_ds, "hf_digit_dataset"),
    (numeric_ds, "hf_numeric_dataset"),
):
    split_dict.save_to_disk(out_dir)

# Show the resulting train/test structure of each dataset
for split_dict in (digit_ds, numeric_ds):
    print(split_dict)


  from .autonotebook import tqdm as notebook_tqdm
Saving the dataset (1/1 shards): 100%|██████████| 4557/4557 [00:00<00:00, 140853.84 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 507/507 [00:00<00:00, 86184.33 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 4557/4557 [00:00<00:00, 166990.89 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 507/507 [00:00<00:00, 84078.45 examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 4557
    })
    test: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 507
    })
})
DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 4557
    })
    test: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 507
    })
})





In [None]:
#!pip install transformers datasets accelerate bitsandbytes peft

In [2]:
from datasets import load_from_disk

# Reload the train/test splits that were written to disk by the preparation cell.
digit_ds = load_from_disk("hf_digit_dataset")      # will use later
numeric_ds = load_from_disk("hf_numeric_dataset")  # used for the formatting/tokenization below


In [3]:
def format_sample(example):
    """Attach a training prompt to ``example`` under the "text" key.

    The prompt is three newline-separated lines: a fixed instruction, the
    input sequence, and the target value. The mapping is modified in place
    and also returned.

    Args:
        example: Mapping with "input_text" and "target_text" entries.

    Returns:
        The same ``example`` object with "text" added.
    """
    prompt_lines = (
        "Predict the next return.",
        f"Sequence: {example['input_text']}",
        f"Next: {example['target_text']}",
    )
    example["text"] = "\n".join(prompt_lines)
    return example

# Add the "text" prompt column to the train split first, then to the held-out split.
train_ds, val_ds = (
    numeric_ds[split_name].map(format_sample)
    for split_name in ("train", "test")
)


In [4]:
# Display the formatted prompt of the first training example as a sanity check.
train_ds[0]["text"]


'Predict the next return.\nSequence: 0.011956 0.011265 0.01152 0.011877 0.005572 0.006475 0.004829 0.00628 0.004783 0.006473 0.005965 0.023039 0.010506 0.016359 0.010665 0.017357 0.01001 0.01093 0.006556 0.018859 0.017817 0.019164 0.018153 0.01987 0.019268 0.084112 0.009394 0.013847 0.009123 0.011749 0.011267 0.008967 0.056426 0.001753 0.003005 0.0023 0.004079 0.000418 0.002405 -0.016568 0.00284 0.008956 0.002846 0.009453 0.00297 0.00265 0.028665 0.001005 0.000773 0.000732 0.000644 -0.000139 0.00082 -0.013933 -0.008084 -0.009447 -0.007799 -0.007696 -0.008514 -0.007537\nNext: -0.030987'

In [None]:
#!pip install --upgrade transformers accelerate


In [None]:
#pip install --upgrade --force-reinstall git+https://github.com/huggingface/transformers.git


In [7]:
# Print the transformers version active in this kernel.
import transformers
print(transformers.__version__)


4.57.3


In [5]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is used to encode the prompts.
model_name = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the end-of-sequence token as the padding token for padded encodings.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_fn(example, max_length=512):
    """Tokenize prompt text and build causal-LM labels with padding masked out.

    Accepts either a single example or a batch (as passed by
    ``Dataset.map(..., batched=True)``).

    Args:
        example: Mapping whose "text" entry is one string or a list of strings.
        max_length: Fixed length every sequence is truncated/padded to.

    Returns:
        The tokenizer output with an added "labels" entry: a copy of
        "input_ids" in which every position whose attention mask is 0
        (i.e. padding) is replaced by -100.
    """
    # NOTE(review): truncation drops tokens beyond `max_length`; if a prompt is
    # longer than that, the trailing "Next: ..." target may be cut off. Verify
    # the tokenized lengths of the dataset and raise `max_length` if needed.
    tok = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    def _mask_padding(ids, mask):
        # -100 is the index ignored by PyTorch's cross-entropy loss, so padded
        # positions do not contribute to the training objective.
        return [token if keep else -100 for token, keep in zip(ids, mask)]

    ids, mask = tok["input_ids"], tok["attention_mask"]
    if ids and isinstance(ids[0], (list, tuple)):
        # Batched call: one row of ids / mask per input string.
        tok["labels"] = [_mask_padding(row, row_mask) for row, row_mask in zip(ids, mask)]
    else:
        # Single-example call: flat lists.
        tok["labels"] = _mask_padding(ids, mask)
    return tok

# Tokenize each split in batches and drop the original columns, so only the
# tokenizer outputs and labels remain.
train_tok, val_tok = (
    split.map(tokenize_fn, batched=True, remove_columns=split.column_names)
    for split in (train_ds, val_ds)
)

train_tok, val_tok


(Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 4557
 }),
 Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 507
 }))

In [10]:
# List every sub-module whose name mentions a projection or attention layer;
# useful for choosing LoRA target modules.
# NOTE: `model` is created in the model-loading cell, which appears further
# down in this notebook; run that cell first.
for name, module in model.named_modules():
    lowered_name = name.lower()
    if any(tag in lowered_name for tag in ("proj", "attn")):
        print(name)


model.layers.0.self_attn
model.layers.0.self_attn.o_proj
model.layers.0.self_attn.qkv_proj
model.layers.0.mlp.gate_up_proj
model.layers.0.mlp.down_proj
model.layers.0.resid_attn_dropout
model.layers.1.self_attn
model.layers.1.self_attn.o_proj
model.layers.1.self_attn.qkv_proj
model.layers.1.mlp.gate_up_proj
model.layers.1.mlp.down_proj
model.layers.1.resid_attn_dropout
model.layers.2.self_attn
model.layers.2.self_attn.o_proj
model.layers.2.self_attn.qkv_proj
model.layers.2.mlp.gate_up_proj
model.layers.2.mlp.down_proj
model.layers.2.resid_attn_dropout
model.layers.3.self_attn
model.layers.3.self_attn.o_proj
model.layers.3.self_attn.qkv_proj
model.layers.3.mlp.gate_up_proj
model.layers.3.mlp.down_proj
model.layers.3.resid_attn_dropout
model.layers.4.self_attn
model.layers.4.self_attn.o_proj
model.layers.4.self_attn.qkv_proj
model.layers.4.mlp.gate_up_proj
model.layers.4.mlp.down_proj
model.layers.4.resid_attn_dropout
model.layers.5.self_attn
model.layers.5.self_attn.o_proj
model.layers.

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

model_name = "microsoft/Phi-3-mini-4k-instruct"

# Load the tokenizer and use the EOS token for padding
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load the base model in float32 on CPU
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={"": "cpu"},   # force CPU
    dtype="float32",          # `torch_dtype` is deprecated in favour of `dtype`
)

print("Model loaded on CPU.")

# LoRA adapters on the attention projections only; all base weights stay frozen.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=[
        "self_attn.qkv_proj",
        "self_attn.o_proj"
    ],
    task_type="CAUSAL_LM"
)

# Wrap the base model with the adapters and report how many parameters will train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()



`torch_dtype` is deprecated! Use `dtype` instead!
Unrecognized keys in `rope_parameters` for 'rope_type'='default': {'partial_rotary_factor'}
Unrecognized keys in `rope_parameters` for 'rope_type'='default': {'partial_rotary_factor'}
Loading weights: 100%|██████████| 195/195 [00:25<00:00,  7.78it/s, Materializing param=model.norm.weight]                              


Model loaded on CPU.
trainable params: 4,718,592 || all params: 3,825,798,144 || trainable%: 0.1233


In [1]:
import numpy as np, torch, transformers
# Report the versions of the core libraries active in this kernel.
for label, library in (
    ("NumPy", np),
    ("Torch", torch),
    ("Transformers", transformers),
):
    print(f"{label}:", library.__version__)


  from .autonotebook import tqdm as notebook_tqdm


NumPy: 1.26.4
Torch: 2.9.1
Transformers: 5.0.0.dev0


In [7]:
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer

# Collator for causal LM (mlm=False: next-token prediction, no token masking).
# The tokenization step already pads every sequence to a fixed length, so the
# collator only has to stack examples into tensors.
collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

training_args = TrainingArguments(
    output_dir="./phi3_finetuned",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,   # effective batch = 16
    num_train_epochs=3,

    learning_rate=2e-4,
    warmup_steps=50,

    logging_steps=10,
    # The evaluation strategy defaults to "no", in which case `eval_steps` is
    # ignored and the validation set is never used during training.
    eval_strategy="steps",
    eval_steps=50,

    save_steps=200,
    save_total_limit=2,

    # The model is pinned to CPU when it is loaded; keep the Trainer (and the
    # batches it moves to its device) on CPU as well.
    use_cpu=True,
    bf16=False,
    fp16=False,
    optim="adamw_torch",
    report_to="none"
)


In [8]:
# Bundle the hyper-parameters, the LoRA-wrapped model, the collator and both
# tokenized splits into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=collator,
    train_dataset=train_tok,
    eval_dataset=val_tok,
)


The model is already on multiple devices. Skipping the move to device specified in `args`.


In [9]:
# Start fine-tuning with the Trainer configured above; the call's return value
# is displayed as the cell output.
trainer.train()