### Imports

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model


  from .autonotebook import tqdm as notebook_tqdm


### Load the Dataset

In [2]:
# Load Alpaca dataset from Hugging Face or local JSON
dataset = load_dataset("tatsu-lab/alpaca")

# Preview the dataset structure
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 52002
    })
})


### Format, Scale, and Tokenize the Dataset

In [3]:

# Combine instruction, input, and output into a single text field
def format_example(example):
    instruction = example['instruction']
    input_text = example['input']
    output = example['output']
    if input_text:
        return f"Instruction: {instruction}\nInput: {input_text}\nOutput: {output}"
    else:
        return f"Instruction: {instruction}\nOutput: {output}"

formatted_dataset = dataset.map(lambda x: {"text": format_example(x)})

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M")

# Set the padding token to the <eos_token> (end of sentence token)
tokenizer.pad_token = tokenizer.eos_token

# If you want to use a new token for padding
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize dataset
def tokenize_function(examples):
    tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
    tokenized["labels"] = tokenized["input_ids"].copy()  # Copy input_ids to labels
    return tokenized


tokenized_dataset = formatted_dataset.map(tokenize_function, batched=True)


In [4]:
print(tokenized_dataset)
print(tokenized_dataset["train"][0])


DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 52002
    })
})
{'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'text': 'Instruction: Give three tips for staying healthy.\nOutput: 1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'input_ids': [25464, 42, 13843, 1296, 5608, 327, 9286, 2458, 30, 198, 17597, 42, 216, 33, 30, 36693, 253, 8609, 2714, 284, 919, 2090, 288, 1453, 7568, 282, 5574, 284, 5136, 30, 3717, 34, 30, 15382, 5578, 288, 1446, 469, 1248, 3212, 284, 1837, 30, 3717, 35, 30, 5399,

In [5]:
# Scale datasets for testing
def scale_dataset(dataset, max_samples=1000):
    """Scale down a dataset to a maximum number of samples"""
    if len(dataset) > max_samples:
        scaled_indices = list(range(max_samples))
        return dataset.select(scaled_indices)
    return dataset

# Set your desired size
MAX_SAMPLES = 50000  # Adjust this number as needed

# Print original sizes
print(f"Original sizes - Train: {len(tokenized_dataset['train'])}")

# Scale the train dataset
tokenized_train = scale_dataset(tokenized_dataset["train"], MAX_SAMPLES)

# Optionally scale the eval dataset (if present or needed)
tokenized_eval = scale_dataset(
    tokenized_dataset["train"],  # Replace with tokenized_dataset["eval"] if you have a separate eval set
    max(50, int(MAX_SAMPLES * 0.1))  # Keep eval set ~10% of train, minimum 50 samples
)

# Print scaled sizes
print(f"Scaled sizes - Train: {len(tokenized_train)}, Eval: {len(tokenized_eval)}")


Original sizes - Train: 52002
Scaled sizes - Train: 50000, Eval: 5000


### Fine tuning setup

In [6]:
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from transformers import EarlyStoppingCallback

# Load the pretrained model
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")

# Configure LoRA
lora_config = LoraConfig(
    r=8,  # Rank for low-rank adaptation
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],  # Target query and value projections
    task_type="CAUSAL_LM"  # This is a causal language model
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Print trainable parameters
model.print_trainable_parameters()


trainable params: 460,800 || all params: 134,975,808 || trainable%: 0.3414


In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",  # Evaluate periodically
    eval_steps=5000,              # Evaluate every 5000 steps
    save_steps=5000,              # Save model every 5000 steps
    logging_steps=2500,          # Log progress every 2b 500 steps
    load_best_model_at_end=True,  # Load the best model after training
    metric_for_best_model="eval_loss",  # Use evaluation loss as the metric
    greater_is_better=False,     # Lower eval_loss is better
    learning_rate=5e-4,
    per_device_train_batch_size=4,
    num_train_epochs=10,
    save_total_limit=2,
    fp16=False,
)



### Train

In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset = tokenized_train,
    eval_dataset = tokenized_eval,  # Replace with validation set if available
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer.train()


  2%|▏         | 2501/125000 [05:50<4:46:51,  7.12it/s]

{'loss': 0.3308, 'grad_norm': 0.09157832711935043, 'learning_rate': 0.00049, 'epoch': 0.2}


  4%|▍         | 5000/125000 [11:41<4:40:27,  7.13it/s]

{'loss': 0.2803, 'grad_norm': 0.10301974415779114, 'learning_rate': 0.00048, 'epoch': 0.4}


                                                       
  4%|▍         | 5000/125000 [12:41<4:40:27,  7.13it/s]

{'eval_loss': 0.25781163573265076, 'eval_runtime': 59.9306, 'eval_samples_per_second': 83.43, 'eval_steps_per_second': 10.429, 'epoch': 0.4}


  6%|▌         | 7501/125000 [18:31<4:36:43,  7.08it/s]  

{'loss': 0.2705, 'grad_norm': 0.11429934203624725, 'learning_rate': 0.00047, 'epoch': 0.6}


  8%|▊         | 10000/125000 [24:22<4:28:45,  7.13it/s]

{'loss': 0.2755, 'grad_norm': 0.10968934744596481, 'learning_rate': 0.00046, 'epoch': 0.8}


                                                        
  8%|▊         | 10000/125000 [25:22<4:28:45,  7.13it/s]

{'eval_loss': 0.25354939699172974, 'eval_runtime': 60.0248, 'eval_samples_per_second': 83.299, 'eval_steps_per_second': 10.412, 'epoch': 0.8}


 10%|█         | 12501/125000 [31:13<4:28:31,  6.98it/s]  

{'loss': 0.2712, 'grad_norm': 0.10120628029108047, 'learning_rate': 0.00045000000000000004, 'epoch': 1.0}


 12%|█▏        | 15000/125000 [37:03<4:17:09,  7.13it/s]

{'loss': 0.2658, 'grad_norm': 0.13964098691940308, 'learning_rate': 0.00044, 'epoch': 1.2}


                                                        
 12%|█▏        | 15000/125000 [38:03<4:17:09,  7.13it/s]

{'eval_loss': 0.2508505880832672, 'eval_runtime': 60.1214, 'eval_samples_per_second': 83.165, 'eval_steps_per_second': 10.396, 'epoch': 1.2}


 14%|█▍        | 17501/125000 [43:54<4:19:00,  6.92it/s]  

{'loss': 0.2735, 'grad_norm': 0.1440533697605133, 'learning_rate': 0.00043, 'epoch': 1.4}


 16%|█▌        | 20000/125000 [49:44<4:04:36,  7.15it/s]

{'loss': 0.2726, 'grad_norm': 0.14911669492721558, 'learning_rate': 0.00042, 'epoch': 1.6}


                                                        
 16%|█▌        | 20000/125000 [50:44<4:04:36,  7.15it/s]

{'eval_loss': 0.24882744252681732, 'eval_runtime': 60.1001, 'eval_samples_per_second': 83.195, 'eval_steps_per_second': 10.399, 'epoch': 1.6}


 18%|█▊        | 22501/125000 [56:35<4:00:33,  7.10it/s]  

{'loss': 0.2682, 'grad_norm': 0.11374334990978241, 'learning_rate': 0.00041, 'epoch': 1.8}


 20%|██        | 25000/125000 [1:02:25<3:55:03,  7.09it/s]

{'loss': 0.267, 'grad_norm': 0.12591692805290222, 'learning_rate': 0.0004, 'epoch': 2.0}


                                                          
 20%|██        | 25000/125000 [1:03:25<3:55:03,  7.09it/s]

{'eval_loss': 0.24776333570480347, 'eval_runtime': 60.0389, 'eval_samples_per_second': 83.279, 'eval_steps_per_second': 10.41, 'epoch': 2.0}


 22%|██▏       | 27501/125000 [1:09:16<3:48:56,  7.10it/s]  

{'loss': 0.2644, 'grad_norm': 0.21246428787708282, 'learning_rate': 0.00039000000000000005, 'epoch': 2.2}


 24%|██▍       | 30000/125000 [1:15:06<3:41:39,  7.14it/s]

{'loss': 0.2684, 'grad_norm': 0.2245933711528778, 'learning_rate': 0.00038, 'epoch': 2.4}


                                                          
 24%|██▍       | 30000/125000 [1:16:06<3:41:39,  7.14it/s]

{'eval_loss': 0.24630649387836456, 'eval_runtime': 59.9781, 'eval_samples_per_second': 83.364, 'eval_steps_per_second': 10.42, 'epoch': 2.4}


 26%|██▌       | 32501/125000 [1:21:57<3:36:53,  7.11it/s]  

{'loss': 0.2664, 'grad_norm': 0.19400089979171753, 'learning_rate': 0.00037, 'epoch': 2.6}


 28%|██▊       | 35000/125000 [1:27:47<3:29:15,  7.17it/s]

{'loss': 0.2662, 'grad_norm': 0.10088231414556503, 'learning_rate': 0.00035999999999999997, 'epoch': 2.8}


                                                          
 28%|██▊       | 35000/125000 [1:28:47<3:29:15,  7.17it/s]

{'eval_loss': 0.24538134038448334, 'eval_runtime': 60.057, 'eval_samples_per_second': 83.254, 'eval_steps_per_second': 10.407, 'epoch': 2.8}


 30%|███       | 37501/125000 [1:34:38<3:29:48,  6.95it/s]  

{'loss': 0.265, 'grad_norm': 0.14780114591121674, 'learning_rate': 0.00035, 'epoch': 3.0}


 32%|███▏      | 40000/125000 [1:40:28<3:18:05,  7.15it/s]

{'loss': 0.2628, 'grad_norm': 0.15553152561187744, 'learning_rate': 0.00034, 'epoch': 3.2}


                                                          
 32%|███▏      | 40000/125000 [1:41:28<3:18:05,  7.15it/s]

{'eval_loss': 0.2445150911808014, 'eval_runtime': 60.0419, 'eval_samples_per_second': 83.275, 'eval_steps_per_second': 10.409, 'epoch': 3.2}


 34%|███▍      | 42501/125000 [1:47:19<3:13:55,  7.09it/s]  

{'loss': 0.2669, 'grad_norm': 0.12662068009376526, 'learning_rate': 0.00033, 'epoch': 3.4}


 36%|███▌      | 45000/125000 [1:53:10<3:06:46,  7.14it/s]

{'loss': 0.2624, 'grad_norm': 0.11920592188835144, 'learning_rate': 0.00032, 'epoch': 3.6}


                                                          
 36%|███▌      | 45000/125000 [1:54:10<3:06:46,  7.14it/s]

{'eval_loss': 0.2442670464515686, 'eval_runtime': 59.9707, 'eval_samples_per_second': 83.374, 'eval_steps_per_second': 10.422, 'epoch': 3.6}


 38%|███▊      | 47501/125000 [2:00:01<3:01:52,  7.10it/s]  

{'loss': 0.2656, 'grad_norm': 0.17698004841804504, 'learning_rate': 0.00031, 'epoch': 3.8}


 40%|████      | 50000/125000 [2:05:51<2:54:48,  7.15it/s]

{'loss': 0.2625, 'grad_norm': 0.1879633069038391, 'learning_rate': 0.0003, 'epoch': 4.0}


                                                          
 40%|████      | 50000/125000 [2:06:51<2:54:48,  7.15it/s]

{'eval_loss': 0.24347595870494843, 'eval_runtime': 59.7101, 'eval_samples_per_second': 83.738, 'eval_steps_per_second': 10.467, 'epoch': 4.0}


 42%|████▏     | 52501/125000 [2:12:42<2:50:13,  7.10it/s]  

{'loss': 0.2625, 'grad_norm': 0.13814032077789307, 'learning_rate': 0.00029, 'epoch': 4.2}


 44%|████▍     | 55000/125000 [2:18:32<2:43:19,  7.14it/s]

{'loss': 0.261, 'grad_norm': 0.12811067700386047, 'learning_rate': 0.00028000000000000003, 'epoch': 4.4}


                                                          
 44%|████▍     | 55000/125000 [2:19:32<2:43:19,  7.14it/s]

{'eval_loss': 0.24261967837810516, 'eval_runtime': 59.6974, 'eval_samples_per_second': 83.756, 'eval_steps_per_second': 10.469, 'epoch': 4.4}


 46%|████▌     | 57501/125000 [2:25:22<2:38:50,  7.08it/s]  

{'loss': 0.2607, 'grad_norm': 0.15033771097660065, 'learning_rate': 0.00027, 'epoch': 4.6}


 48%|████▊     | 60000/125000 [2:31:13<2:31:36,  7.15it/s]

{'loss': 0.2607, 'grad_norm': 0.15351438522338867, 'learning_rate': 0.00026000000000000003, 'epoch': 4.8}


                                                          
 48%|████▊     | 60000/125000 [2:32:12<2:31:36,  7.15it/s]

{'eval_loss': 0.24191713333129883, 'eval_runtime': 59.7533, 'eval_samples_per_second': 83.677, 'eval_steps_per_second': 10.46, 'epoch': 4.8}


 50%|█████     | 62501/125000 [2:38:03<2:28:58,  6.99it/s]  

{'loss': 0.2649, 'grad_norm': 0.11650613695383072, 'learning_rate': 0.00025, 'epoch': 5.0}


 52%|█████▏    | 65000/125000 [2:43:53<2:19:44,  7.16it/s]

{'loss': 0.259, 'grad_norm': 0.12301569432020187, 'learning_rate': 0.00024, 'epoch': 5.2}


                                                          
 52%|█████▏    | 65000/125000 [2:44:52<2:19:44,  7.16it/s]

{'eval_loss': 0.24130375683307648, 'eval_runtime': 59.7502, 'eval_samples_per_second': 83.682, 'eval_steps_per_second': 10.46, 'epoch': 5.2}


 54%|█████▍    | 67501/125000 [2:50:43<2:14:47,  7.11it/s]  

{'loss': 0.2601, 'grad_norm': 0.21444889903068542, 'learning_rate': 0.00023, 'epoch': 5.4}


 56%|█████▌    | 70000/125000 [2:56:33<2:08:21,  7.14it/s]

{'loss': 0.2584, 'grad_norm': 0.1976388841867447, 'learning_rate': 0.00022, 'epoch': 5.6}


                                                          
 56%|█████▌    | 70000/125000 [2:57:32<2:08:21,  7.14it/s]

{'eval_loss': 0.24063168466091156, 'eval_runtime': 59.5888, 'eval_samples_per_second': 83.908, 'eval_steps_per_second': 10.489, 'epoch': 5.6}


 58%|█████▊    | 72501/125000 [3:03:23<2:03:18,  7.10it/s]  

{'loss': 0.2628, 'grad_norm': 0.21828527748584747, 'learning_rate': 0.00021, 'epoch': 5.8}


 60%|██████    | 75000/125000 [3:09:12<1:57:27,  7.09it/s]

{'loss': 0.2605, 'grad_norm': 0.18847200274467468, 'learning_rate': 0.0002, 'epoch': 6.0}


                                                          
 60%|██████    | 75000/125000 [3:10:12<1:57:27,  7.09it/s]

{'eval_loss': 0.24021972715854645, 'eval_runtime': 59.6183, 'eval_samples_per_second': 83.867, 'eval_steps_per_second': 10.483, 'epoch': 6.0}


 62%|██████▏   | 77501/125000 [3:16:02<1:51:08,  7.12it/s]  

{'loss': 0.2584, 'grad_norm': 0.15500354766845703, 'learning_rate': 0.00019, 'epoch': 6.2}


 64%|██████▍   | 80000/125000 [3:21:53<1:45:05,  7.14it/s]

{'loss': 0.2575, 'grad_norm': 0.20276202261447906, 'learning_rate': 0.00017999999999999998, 'epoch': 6.4}


                                                          
 64%|██████▍   | 80000/125000 [3:22:53<1:45:05,  7.14it/s]

{'eval_loss': 0.23969973623752594, 'eval_runtime': 60.0932, 'eval_samples_per_second': 83.204, 'eval_steps_per_second': 10.401, 'epoch': 6.4}


 66%|██████▌   | 82501/125000 [3:28:45<1:39:34,  7.11it/s]  

{'loss': 0.2625, 'grad_norm': 0.12340927124023438, 'learning_rate': 0.00017, 'epoch': 6.6}


 68%|██████▊   | 85000/125000 [3:34:36<1:33:16,  7.15it/s]

{'loss': 0.2565, 'grad_norm': 0.16358733177185059, 'learning_rate': 0.00016, 'epoch': 6.8}


                                                          
 68%|██████▊   | 85000/125000 [3:35:35<1:33:16,  7.15it/s]

{'eval_loss': 0.23922793567180634, 'eval_runtime': 59.5458, 'eval_samples_per_second': 83.969, 'eval_steps_per_second': 10.496, 'epoch': 6.8}


 70%|███████   | 87501/125000 [3:41:26<1:28:59,  7.02it/s]  

{'loss': 0.2579, 'grad_norm': 0.10809537768363953, 'learning_rate': 0.00015, 'epoch': 7.0}


 72%|███████▏  | 90000/125000 [3:47:17<1:21:42,  7.14it/s]

{'loss': 0.256, 'grad_norm': 0.17504940927028656, 'learning_rate': 0.00014000000000000001, 'epoch': 7.2}


                                                          
 72%|███████▏  | 90000/125000 [3:48:17<1:21:42,  7.14it/s]

{'eval_loss': 0.23878104984760284, 'eval_runtime': 59.9079, 'eval_samples_per_second': 83.461, 'eval_steps_per_second': 10.433, 'epoch': 7.2}


 74%|███████▍  | 92501/125000 [3:54:08<1:16:52,  7.05it/s]  

{'loss': 0.2575, 'grad_norm': 0.18385185301303864, 'learning_rate': 0.00013000000000000002, 'epoch': 7.4}


 76%|███████▌  | 95000/125000 [3:59:58<1:09:40,  7.18it/s]

{'loss': 0.2595, 'grad_norm': 0.19152520596981049, 'learning_rate': 0.00012, 'epoch': 7.6}


                                                          
 76%|███████▌  | 95000/125000 [4:00:58<1:09:40,  7.18it/s]

{'eval_loss': 0.23839731514453888, 'eval_runtime': 59.698, 'eval_samples_per_second': 83.755, 'eval_steps_per_second': 10.469, 'epoch': 7.6}


 78%|███████▊  | 97501/125000 [4:06:50<1:04:54,  7.06it/s]  

{'loss': 0.2561, 'grad_norm': 0.12052283436059952, 'learning_rate': 0.00011, 'epoch': 7.8}


 80%|████████  | 100000/125000 [4:12:41<58:41,  7.10it/s] 

{'loss': 0.2563, 'grad_norm': 0.1750742495059967, 'learning_rate': 0.0001, 'epoch': 8.0}


                                                         
 80%|████████  | 100000/125000 [4:13:41<58:41,  7.10it/s]

{'eval_loss': 0.2379692941904068, 'eval_runtime': 59.9018, 'eval_samples_per_second': 83.47, 'eval_steps_per_second': 10.434, 'epoch': 8.0}


 82%|████████▏ | 102501/125000 [4:19:33<53:15,  7.04it/s]    

{'loss': 0.2555, 'grad_norm': 0.14341241121292114, 'learning_rate': 8.999999999999999e-05, 'epoch': 8.2}


 84%|████████▍ | 105000/125000 [4:25:22<46:39,  7.14it/s]

{'loss': 0.256, 'grad_norm': 0.1271553635597229, 'learning_rate': 8e-05, 'epoch': 8.4}


                                                         
 84%|████████▍ | 105000/125000 [4:26:22<46:39,  7.14it/s]

{'eval_loss': 0.23770879209041595, 'eval_runtime': 59.7303, 'eval_samples_per_second': 83.71, 'eval_steps_per_second': 10.464, 'epoch': 8.4}


 86%|████████▌ | 107501/125000 [4:32:14<41:18,  7.06it/s]    

{'loss': 0.2581, 'grad_norm': 0.18592609465122223, 'learning_rate': 7.000000000000001e-05, 'epoch': 8.6}


 88%|████████▊ | 110000/125000 [4:38:05<34:55,  7.16it/s]

{'loss': 0.2528, 'grad_norm': 0.1592438668012619, 'learning_rate': 6e-05, 'epoch': 8.8}


                                                         
 88%|████████▊ | 110000/125000 [4:39:05<34:55,  7.16it/s]

{'eval_loss': 0.237434521317482, 'eval_runtime': 60.0808, 'eval_samples_per_second': 83.221, 'eval_steps_per_second': 10.403, 'epoch': 8.8}


 90%|█████████ | 112501/125000 [4:44:56<29:45,  7.00it/s]   

{'loss': 0.2561, 'grad_norm': 0.19938038289546967, 'learning_rate': 5e-05, 'epoch': 9.0}


 92%|█████████▏| 115000/125000 [4:50:45<23:28,  7.10it/s]

{'loss': 0.2533, 'grad_norm': 0.18060067296028137, 'learning_rate': 4e-05, 'epoch': 9.2}


                                                         
 92%|█████████▏| 115000/125000 [4:51:45<23:28,  7.10it/s]

{'eval_loss': 0.2371135801076889, 'eval_runtime': 59.6078, 'eval_samples_per_second': 83.882, 'eval_steps_per_second': 10.485, 'epoch': 9.2}


 94%|█████████▍| 117501/125000 [4:57:35<17:38,  7.09it/s]   

{'loss': 0.254, 'grad_norm': 0.2145540714263916, 'learning_rate': 3e-05, 'epoch': 9.4}


 96%|█████████▌| 120000/125000 [5:03:25<11:39,  7.15it/s]

{'loss': 0.2536, 'grad_norm': 0.15555639564990997, 'learning_rate': 2e-05, 'epoch': 9.6}


                                                         
 96%|█████████▌| 120000/125000 [5:04:24<11:39,  7.15it/s]

{'eval_loss': 0.23696206510066986, 'eval_runtime': 59.6197, 'eval_samples_per_second': 83.865, 'eval_steps_per_second': 10.483, 'epoch': 9.6}


 98%|█████████▊| 122501/125000 [5:10:14<05:50,  7.12it/s]   

{'loss': 0.2551, 'grad_norm': 0.19592002034187317, 'learning_rate': 1e-05, 'epoch': 9.8}


100%|██████████| 125000/125000 [5:16:04<00:00,  7.09it/s]

{'loss': 0.2562, 'grad_norm': 0.19774676859378815, 'learning_rate': 0.0, 'epoch': 10.0}


                                                         
100%|██████████| 125000/125000 [5:17:04<00:00,  6.57it/s]

{'eval_loss': 0.23684878647327423, 'eval_runtime': 59.6356, 'eval_samples_per_second': 83.843, 'eval_steps_per_second': 10.48, 'epoch': 10.0}
{'train_runtime': 19024.0226, 'train_samples_per_second': 26.283, 'train_steps_per_second': 6.571, 'train_loss': 0.2633160737304687, 'epoch': 10.0}





TrainOutput(global_step=125000, training_loss=0.2633160737304687, metrics={'train_runtime': 19024.0226, 'train_samples_per_second': 26.283, 'train_steps_per_second': 6.571, 'total_flos': 1.63836297216e+17, 'train_loss': 0.2633160737304687, 'epoch': 10.0})

In [9]:
# Save the fine-tuned model and tokenizer
output_dir = "./smollm2_finetuned/01"

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")



Model and tokenizer saved to ./smollm2_finetuned/01
