In [None]:
!pip install datasets

In [None]:
import os
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset

In [None]:
# Checkpoint to continue pre-training from (1.8B model trained on fineweb)
model_name_or_path = "HuggingFaceFW/ablation-model-fineweb-v1"

# Tokenizer for that checkpoint; the end-of-sequence token is reused for padding
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

# Pretrained causal language model weights
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)

# Training split of the corpus used for continual pre-training
dataset = load_dataset("dmariko/init_data", split="train")


In [None]:
# Ask PyTorch to release unused memory held by its CUDA caching allocator.
# NOTE(review): presumably here to free GPU memory left over from earlier runs
# in the same kernel before tokenization/training — confirm it is still needed.
torch.cuda.empty_cache()

In [None]:
# Batch tokenizer applied to the dataset via `map`
def tokenize_function(examples):
    """Tokenize the "paragraph" field of a batch.

    Args:
        examples: Mapping that holds the texts to tokenize under the
            "paragraph" key.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts, with
        truncation enabled and sequences capped at 1024 tokens.
    """
    texts = examples["paragraph"]
    # Adjust max_length as needed.
    return tokenizer(texts, truncation=True, max_length=1024)

# Run the tokenizer over the dataset in batches and drop the original
# "paragraph" column from the result
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["paragraph"],
)

# Return the model inputs as torch tensors
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])


def _has_tokens(example):
    """Return True when the example contains at least one token id."""
    return len(example["input_ids"]) > 0


# Discard examples whose tokenization produced an empty sequence
tokenized_dataset = tokenized_dataset.filter(_has_tokens)

class CustomDataCollator(DataCollatorForLanguageModeling):
    """Language-modeling collator that casts token-id tensors to ``torch.long``."""

    def __call__(self, examples):
        """Collate ``examples`` with the parent collator, then cast dtypes.

        ``input_ids`` is always cast; ``labels`` is cast only when the parent
        collator included it in the batch.
        """
        collated = super().__call__(examples)
        collated["input_ids"] = collated["input_ids"].long()
        if "labels" not in collated:
            return collated
        collated["labels"] = collated["labels"].long()
        return collated

# Collator configured for causal language modeling (mlm=False)
data_collator = CustomDataCollator(mlm=False, tokenizer=tokenizer)

# Training configuration; adjust these hyperparameters based on your needs
training_args = TrainingArguments(
    # Output location and checkpointing
    output_dir="./SLM_Pretraining_01",
    overwrite_output_dir=True,
    save_steps=5000,                 # Save a checkpoint every 5000 steps
    save_total_limit=2,              # Keep only the 2 most recent checkpoints
    # Optimisation schedule
    num_train_epochs=1,              # Number of passes over the training data
    per_device_train_batch_size=2,   # Adjust based on your hardware
    learning_rate=5e-5,
    weight_decay=0.01,
    # Evaluation/prediction output
    prediction_loss_only=True,
)

# Wire the model, data, collator and configuration into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)



In [None]:
# Run continual pre-training
trainer.train()

# Write the updated model and its tokenizer to the same directory for later use
save_dir = "./SLM_Pretraining_01"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)