### Imports

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model


### Load the Dataset

In [None]:
# Fetch the Alpaca instruction-tuning dataset from the Hugging Face Hub
ALPACA_DATASET_ID = "tatsu-lab/alpaca"
dataset = load_dataset(ALPACA_DATASET_ID)

# Show what was loaded before any processing is applied
print(dataset)


### Format, Scale, and Tokenize the Dataset

In [None]:

# Combine instruction, input, and output into a single text field
def format_example(example):
    """Render one record as a single prompt/response string.

    Args:
        example: Mapping with 'instruction', 'input' and 'output' entries.

    Returns:
        The sections joined by newlines, in the order "Instruction: ...",
        "Input: ..." and "Output: ...". The "Input" section is left out
        when the record's input is empty (falsy).
    """
    sections = [f"Instruction: {example['instruction']}"]

    # Only records that carry extra context get an "Input" section
    context = example['input']
    if context:
        sections.append(f"Input: {context}")

    sections.append(f"Output: {example['output']}")
    return "\n".join(sections)

def _with_text_field(example):
    """Return the new "text" column holding the formatted prompt for one record."""
    return {"text": format_example(example)}


formatted_dataset = dataset.map(_with_text_field)

# Tokenizer of the base checkpoint that is fine-tuned later in the notebook
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M")

# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Alternative: register a dedicated padding token instead
# NOTE(review): a newly added token presumably requires resizing the model's
# embedding matrix as well — confirm before enabling.
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Label value that the loss skips; used to exclude padding from training
IGNORE_INDEX = -100


def _mask_padding(input_ids, attention_mask):
    """Copy `input_ids`, replacing every padded position with IGNORE_INDEX."""
    return [
        token_id if is_real else IGNORE_INDEX
        for token_id, is_real in zip(input_ids, attention_mask)
    ]


def tokenize_function(examples):
    """Tokenize the "text" column and build causal-LM labels.

    Every text is terminated with the EOS token, padded/truncated to 512
    tokens, and given a `labels` entry equal to `input_ids` except that padded
    positions are set to IGNORE_INDEX.

    Why: the pad token is the EOS token and sequences are padded to a fixed
    length, so copying `input_ids` verbatim makes the loss (and `eval_loss`)
    dominated by padding. Masking the padding fixes that; the explicit EOS
    keeps one real end-of-sequence target per example so the model still
    learns where to stop.

    Args:
        examples: A batch (``examples["text"]`` is a list of strings, as
            produced by ``map(..., batched=True)``) or a single record
            (``examples["text"]`` is a string).

    Returns:
        The tokenizer output with an added "labels" entry.
    """
    raw_text = examples["text"]
    is_batch = not isinstance(raw_text, str)

    if is_batch:
        texts = [text + tokenizer.eos_token for text in raw_text]
    else:
        texts = raw_text + tokenizer.eos_token

    tokenized = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_attention_mask=True,  # needed below to tell padding from real tokens
    )

    if is_batch:
        tokenized["labels"] = [
            _mask_padding(ids, mask)
            for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
        ]
    else:
        tokenized["labels"] = _mask_padding(
            tokenized["input_ids"], tokenized["attention_mask"]
        )
    return tokenized


tokenized_dataset = formatted_dataset.map(tokenize_function, batched=True)


In [None]:
# Dataset summary
print(tokenized_dataset)

# Preview the first training example. The token columns are padded to a fixed
# length, so print each list's length and first few items instead of dumping
# hundreds of integers per column.
PREVIEW_ITEMS = 10
first_example = tokenized_dataset["train"][0]
for column, value in first_example.items():
    if isinstance(value, list):
        print(f"{column}: len={len(value)}, head={value[:PREVIEW_ITEMS]}")
    else:
        print(f"{column}: {value!r}")




In [None]:
def scale_dataset(dataset, max_samples=1000):
    """Cap a dataset at `max_samples` rows.

    Args:
        dataset: Object supporting `len()` and `.select(indices)`.
        max_samples: Largest number of rows to keep.

    Returns:
        `dataset` itself when it already has at most `max_samples` rows,
        otherwise the result of selecting its first `max_samples` rows.
    """
    # Nothing to trim: hand back the very same object
    if len(dataset) <= max_samples:
        return dataset

    leading_indices = list(range(max_samples))
    return dataset.select(leading_indices)


# Upper bound on how many examples to keep (tune as needed)
MAX_SAMPLES = 50000

# Report how many examples are available before trimming
print(f"Original sizes - Total dataset: {len(tokenized_dataset['train'])}")

# Trim the full training split to at most MAX_SAMPLES examples
scaled_full = scale_dataset(tokenized_dataset["train"], MAX_SAMPLES)

# Hold out 10% of the trimmed data for evaluation; the fixed seed keeps the
# shuffled split reproducible across runs
EVAL_FRACTION = 0.1
SPLIT_SEED = 42
splits = scaled_full.train_test_split(test_size=EVAL_FRACTION, shuffle=True, seed=SPLIT_SEED)

tokenized_train, tokenized_eval = splits['train'], splits['test']

# Report the resulting split sizes
print(f"Final sizes - Train: {len(tokenized_train)}, Eval: {len(tokenized_eval)}")

### Fine tuning setup

In [None]:
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from transformers import EarlyStoppingCallback

# Base causal language model that the LoRA adapters are attached to
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")

# Names of the projection modules that receive LoRA adapters
LORA_TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=LORA_TARGET_MODULES,
    r=16,              # adapter rank
    lora_alpha=32,     # LoRA scaling factor
    lora_dropout=0.1,  # dropout applied inside the adapter layers
)

# Wrap the base model with the LoRA adapters described above
model = get_peft_model(model, lora_config)

# Report how many parameters are trainable after wrapping
model.print_trainable_parameters()


In [None]:
from transformers import TrainingArguments

# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="steps",  # Evaluate periodically
#     eval_steps=5000,              # Evaluate every 5000 steps
#     save_steps=5000,              # Save model every 5000 steps
#     logging_steps=2500,          # Log progress every 2500 steps
#     load_best_model_at_end=True,  # Load the best model after training
#     metric_for_best_model="eval_loss",  # Use evaluation loss as the metric
#     greater_is_better=False,     # Lower eval_loss is better
#     learning_rate=5e-4,
#     per_device_train_batch_size=4,
#     num_train_epochs=10,
#     save_total_limit=2,
#     fp16=False,
# )

training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate and checkpoint on the same 1000-step schedule so that
    # load_best_model_at_end always has a checkpoint for the best evaluation.
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy`; switch if the installed version warns about or rejects it.
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    logging_steps=500,
    # Restore the checkpoint with the lowest eval_loss once training ends
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    learning_rate=1e-4,
    # 8 samples per device x 4 accumulation steps = 32 samples per optimizer step
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    save_total_limit=2,            # keep at most two checkpoints on disk
    fp16=False,                    # mixed precision is disabled (full-precision training)
    warmup_ratio=0.03,             # learning-rate warmup over the first 3% of steps
    weight_decay=0.01,
)

### Train

In [None]:
# Stop early when the tracked metric has not improved for 3 evaluations
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,  # held-out 10% split produced by train_test_split
    callbacks=[early_stopping],
)

trainer.train()


In [None]:
# Persist the fine-tuned model and its tokenizer side by side.
# NOTE(review): `model` is the object returned by get_peft_model, so this
# presumably writes the LoRA adapter rather than a merged checkpoint — confirm.
output_dir = "./smollm2_finetuned/04"

for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")

