### Imports

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model


  from .autonotebook import tqdm as notebook_tqdm


### Load the Dataset

In [2]:
# Fetch the Alpaca instruction dataset by its Hugging Face Hub id
ALPACA_DATASET_ID = "tatsu-lab/alpaca"
dataset = load_dataset(ALPACA_DATASET_ID)

# Show the splits, column names, and row counts that were loaded
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 52002
    })
})


### Format, Scale, and Tokenize the Dataset

In [3]:

# Build the single training string for one dataset record
def format_example(example):
    """Render one record as an "Instruction / Input / Output" prompt string.

    Args:
        example: Mapping with 'instruction', 'input' and 'output' keys.

    Returns:
        The three fields joined by newlines, each prefixed with its label.
        The "Input:" line is left out when example['input'] is falsy
        (for instance an empty string).
    """
    instruction = example['instruction']
    input_text = example['input']
    output = example['output']

    lines = [f"Instruction: {instruction}"]
    if input_text:
        lines.append(f"Input: {input_text}")
    lines.append(f"Output: {output}")
    return "\n".join(lines)

# Write the prompt built by format_example into the "text" column of every row
formatted_dataset = dataset.map(lambda row: {"text": format_example(row)})

# Tokenizer for the SmolLM2-135M checkpoint
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M")

# Use the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Alternative: register a dedicated padding token instead of reusing EOS
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize dataset
def tokenize_function(examples):
    """Tokenize prompts and build labels that exclude padding from the loss.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)`` whose
            "text" entry is a list of strings. A single example whose
            "text" is one string is also accepted.

    Returns:
        The tokenizer output (padded/truncated to 512 tokens) with an added
        "labels" entry. Labels equal ``input_ids`` where ``attention_mask``
        is 1 and -100 where it is 0.
    """
    # -100 is the label value the Hugging Face causal-LM loss skips.
    IGNORE_INDEX = -100

    tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

    def mask_padding(ids, mask):
        # The pad token shares its id with EOS, so padding cannot be found by
        # token id; the attention mask is the reliable marker.
        return [tok if keep else IGNORE_INDEX for tok, keep in zip(ids, mask)]

    ids, mask = tokenized["input_ids"], tokenized["attention_mask"]
    if isinstance(examples["text"], str):
        # Single, un-batched example: input_ids is one flat list.
        tokenized["labels"] = mask_padding(ids, mask)
    else:
        tokenized["labels"] = [mask_padding(i, m) for i, m in zip(ids, mask)]
    return tokenized


tokenized_dataset = formatted_dataset.map(tokenize_function, batched=True)


In [4]:
# Show the dataset schema after tokenization
print(tokenized_dataset)

# Show the first training row, including its token ids, mask and labels
train_split = tokenized_dataset["train"]
print(train_split[0])




DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 52002
    })
})
{'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'text': 'Instruction: Give three tips for staying healthy.\nOutput: 1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'input_ids': [25464, 42, 13843, 1296, 5608, 327, 9286, 2458, 30, 198, 17597, 42, 216, 33, 30, 36693, 253, 8609, 2714, 284, 919, 2090, 288, 1453, 7568, 282, 5574, 284, 5136, 30, 3717, 34, 30, 15382, 5578, 288, 1446, 469, 1248, 3212, 284, 1837, 30, 3717, 35, 30, 5399,

In [5]:
def scale_dataset(dataset, max_samples=1000):
    """Cap a dataset at ``max_samples`` rows.

    Args:
        dataset: Object supporting ``len()`` and ``select(indices)``.
        max_samples: Largest number of rows to keep.

    Returns:
        ``dataset`` itself when it has at most ``max_samples`` rows, otherwise
        ``dataset.select`` applied to the indices ``0 .. max_samples - 1``.
    """
    # Already small enough: hand back the same object untouched
    if len(dataset) <= max_samples:
        return dataset

    head_indices = list(range(max_samples))
    return dataset.select(head_indices)


# Upper bound on the number of examples kept (train + eval combined)
MAX_SAMPLES = 50000  # Adjust this number as needed

# Report the row count before capping
full_train = tokenized_dataset["train"]
print(f"Original sizes - Total dataset: {len(full_train)}")

# Cap the dataset at MAX_SAMPLES rows
scaled_full = scale_dataset(full_train, MAX_SAMPLES)

# Shuffled 90/10 train/eval split with a fixed seed
splits = scaled_full.train_test_split(
    seed=42,
    shuffle=True,
    test_size=0.1,  # 10% for eval
)

tokenized_train, tokenized_eval = splits['train'], splits['test']

# Report the row counts of both resulting splits
print(f"Final sizes - Train: {len(tokenized_train)}, Eval: {len(tokenized_eval)}")

Original sizes - Total dataset: 52002
Final sizes - Train: 45000, Eval: 5000


### Fine-tuning setup

In [6]:
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from transformers import EarlyStoppingCallback

# Base causal-LM checkpoint that will be adapted
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")

# LoRA settings: rank-8 adapters on the query and value projections
lora_config = LoraConfig(
    task_type="CAUSAL_LM",  # Causal language modelling
    target_modules=["q_proj", "v_proj"],  # Modules that receive adapters
    r=8,  # Rank of the low-rank update
    lora_alpha=32,
    lora_dropout=0.05,
)

# Wrap the base model with the LoRA adapters
model = get_peft_model(model, lora_config)

# Report how many parameters are trainable versus the total
model.print_trainable_parameters()


trainable params: 460,800 || all params: 134,975,808 || trainable%: 0.3414


In [None]:
from transformers import TrainingArguments

# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. Use whichever field the
# installed TrainingArguments dataclass declares so this cell runs on both.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    eval_steps=5000,              # Evaluate every 5000 steps
    save_steps=5000,              # Save model every 5000 steps
    logging_steps=2500,           # Log progress every 2500 steps
    load_best_model_at_end=True,  # Load the best model after training
    metric_for_best_model="eval_loss",  # Use evaluation loss as the metric
    greater_is_better=False,      # Lower eval_loss is better
    learning_rate=5e-4,
    per_device_train_batch_size=4,
    num_train_epochs=10,
    save_total_limit=2,
    fp16=False,
    **{eval_strategy_key: "steps"},  # Evaluate periodically, by step count
)

### Train

In [None]:
# Early stopping with a patience of 3 evaluation rounds
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Wire the LoRA model, training arguments, and the train/eval splits together
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    callbacks=[early_stopping],
)

trainer.train()


In [None]:
# Save the fine-tuned model and tokenizer.
# `model` is the PEFT-wrapped model returned by get_peft_model.
# The directory name must not end in whitespace: a trailing space produces a
# confusing path that some filesystems and tools handle poorly.
output_dir = "./smollm2_finetuned/02"

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")

