# Setup & Imports

In [1]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import json

# CUDA settings
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = True

# Check CUDA availability and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")
    # Set default CUDA device
    torch.cuda.set_device(0)

  from .autonotebook import tqdm as notebook_tqdm


CUDA available: True
CUDA device: NVIDIA GeForce RTX 4070 Ti SUPER
CUDA version: 11.8


# Tokenizing and shit

In [2]:
from tqdm.auto import tqdm

model_name = "HuggingFaceTB/SmolLM2-135M"

# Load tokenizer and set padding token
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
print(f"Tokenizer pad token: {tokenizer.pad_token}")
print(f"Tokenizer pad token ID: {tokenizer.pad_token_id}")

# Load model with progress bar
print("\nDownloading and loading model...")
with tqdm(total=1, desc="Model Loading", position=0, leave=True) as pbar:
    try:
        print("Attempting to load model directly to device...")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto" if torch.cuda.is_available() else None,
            torch_dtype=torch.float32
        )
        print("\nModel loaded successfully")
        pbar.update(1)
    except Exception as e:
        print(f"\nError loading model directly to device: {e}")
        print("Attempting alternate loading method...")
        model = AutoModelForCausalLM.from_pretrained(model_name)
        if torch.cuda.is_available():
            print("Moving model to CUDA...")
            model = model.to('cuda')
        print("Model loading complete")
        pbar.update(1)

print(f"\nModel device check:")
print(f"Model is on CUDA: {next(model.parameters()).is_cuda}")

Tokenizer pad token: <|endoftext|>
Tokenizer pad token ID: 0

Downloading and loading model...


Model Loading:   0%|          | 0/1 [00:00<?, ?it/s]

Attempting to load model directly to device...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Model Loading: 100%|██████████| 1/1 [01:21<00:00, 81.52s/it]


Model loaded successfully

Model device check:
Model is on CUDA: True





In [5]:
# Define max length for the sequences
MAX_LENGTH = 512

def tokenize_function(examples):
    """
    Tokenize texts and prepare them for training with SmolLM2.
    Properly handles attention masks and padding.
    
    Args:
        examples: Dataset examples containing 'text' field
    Returns:
        dict: Tokenized examples with input_ids, attention_mask, and labels
    """
    # Tokenize with explicit attention mask and padding
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
        return_tensors=None,  # Return lists instead of tensors
        return_attention_mask=True  # Explicitly request attention mask
    )
    
    # For causal language modeling, labels are the input_ids
    labels = tokenized["input_ids"].copy()
    
    # Convert to tensors
    tokenized = {
        "input_ids": torch.tensor(tokenized["input_ids"], dtype=torch.long),
        "attention_mask": torch.tensor(tokenized["attention_mask"], dtype=torch.long),
        "labels": torch.tensor(labels, dtype=torch.long)
    }

    return tokenized

# Test the tokenization function
test_data = Dataset.from_dict({
    "text": [
        "Here is a sample text.",
        "Here is another, longer piece of text that might need padding."
    ]
})

print("Testing tokenization function:")
tokenized_output = tokenize_function(test_data)
for key, value in tokenized_output.items():
    print(f"\n{key}:")
    print(f"Shape: {value.shape}")
    print(f"Type: {value.dtype}")
    if key == "input_ids":
        print("Example decoded:", tokenizer.decode(value[0]))

Testing tokenization function:

input_ids:
Shape: torch.Size([2, 512])
Type: torch.int64
Example decoded: Here is a sample text.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><

accept dataset

In [6]:
def prepare_dataset_for_training(
    data_source,
    text_field="text",
    train_size=0.9,
    max_samples=None,
    seed=42
):
    """
    Prepare any dataset for training SmolLM2.
    
    Args:
        data_source: Can be:
            - path to .txt file (one sample per line)
            - path to .json file
            - list of texts
            - HuggingFace dataset
        text_field: Name of the text field if using structured data
        train_size: Proportion to use for training (0 to 1)
        max_samples: Optional limit on dataset size
        seed: Random seed for reproducibility
    """
    print("Loading dataset...")
    
    # Handle different input types
    if isinstance(data_source, str):
        # File path provided
        if data_source.endswith('.json'):
            with open(data_source, 'r', encoding='utf-8') as f:
                data = json.load(f)
            dataset = Dataset.from_list(data)
        elif data_source.endswith('.txt'):
            with open(data_source, 'r', encoding='utf-8') as f:
                texts = [line.strip() for line in f if line.strip()]
            dataset = Dataset.from_dict({"text": texts})
        else:
            raise ValueError("Unsupported file format")
    elif isinstance(data_source, list):
        # List of texts
        dataset = Dataset.from_dict({"text": data_source})
    else:
        # Assume it's already a dataset
        dataset = data_source
    
    # Limit dataset size if specified
    if max_samples and len(dataset) > max_samples:
        dataset = dataset.select(range(max_samples))
    
    print(f"Total samples: {len(dataset)}")
    
    # Split into train/validation
    train_dataset, eval_dataset = dataset.train_test_split(
        train_size=train_size,
        seed=seed
    )
    
    print("Tokenizing datasets...")
    
    # Tokenize both splits
    tokenized_train = train_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=train_dataset.column_names
    )
    tokenized_eval = eval_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=eval_dataset.column_names
    )
    
    # Set format for PyTorch
    tokenized_train.set_format("torch")
    tokenized_eval.set_format("torch")
    
    print(f"\nFinal dataset sizes:")
    print(f"Training: {len(tokenized_train)}")
    print(f"Evaluation: {len(tokenized_eval)}")
    
    return tokenized_train, tokenized_eval

# fine-tuning setup

In [7]:
from dataclasses import dataclass
from transformers import Trainer, TrainingArguments
from typing import Dict, List

@dataclass
class SmolDataCollator:
    """
    Custom data collator for SmolLM2 training
    """
    def __call__(self, examples: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
        # Stack all the input tensors together
        input_ids = torch.stack([example["input_ids"] for example in examples])
        attention_mask = torch.stack([example["attention_mask"] for example in examples])
        labels = torch.stack([example["labels"] for example in examples])

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=100,
    logging_steps=100,
    save_total_limit=2,
    logging_dir="./logs",
    max_grad_norm=0.5,
    gradient_accumulation_steps=8,
    fp16=False,  # Set to True if you have GPU with CUDA capability
    dataloader_pin_memory=False if not torch.cuda.is_available() else True,
    remove_unused_columns=False
)

# Initialize trainer
data_collator = SmolDataCollator()



In [None]:
# First prepare test datasets
dummy_dataset = Dataset.from_dict({
    "text": ["This is a test sentence.", "Another test sentence."]
})
tokenized_test = dummy_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dummy_dataset.column_names
)
tokenized_test.set_format("torch")

# Split into train/eval
train_test, eval_test = tokenized_test.train_test_split(test_size=0.5, seed=42)

# Initialize trainer with both train and eval datasets
test_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_test,
    eval_dataset=eval_test,
    data_collator=data_collator,
)

# Create and inspect a test batch
test_batch = data_collator([tokenized_test[i] for i in range(len(tokenized_test))])
print("\nTest batch shapes:")
for k, v in test_batch.items():
    print(f"{k}: {v.shape}, dtype: {v.dtype}, range: [{v.min()}, {v.max()}]")

print("\nSetup validation complete. Ready for training.")

Map: 100%|██████████| 2/2 [00:00<00:00, 138.25 examples/s]


Test batch shapes:
input_ids: torch.Size([2, 512]), dtype: torch.int64, range: [0, 6330]
attention_mask: torch.Size([2, 512]), dtype: torch.int64, range: [0, 1]
labels: torch.Size([2, 512]), dtype: torch.int64, range: [0, 6330]

Setup validation complete. Ready for training.





# Fine tuning

In [10]:
# Clear CUDA cache before training if using GPU
if torch.cuda.is_available():
    torch.cuda.empty_cache()

def scale_dataset(dataset, max_samples=1000):
    """Scale down a dataset to a maximum number of samples"""
    if len(dataset) > max_samples:
        scaled_indices = list(range(max_samples))
        return dataset.select(scaled_indices)
    return dataset

def train_model(train_dataset, eval_dataset, max_train_samples=None, max_eval_samples=None):
    # Scale datasets if specified
    print(f"Original sizes - Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")
    
    if max_train_samples:
        train_dataset = scale_dataset(train_dataset, max_train_samples)
        # Scale eval set proportionally (usually ~10% of train size)
        if not max_eval_samples:
            max_eval_samples = max(50, int(max_train_samples * 0.1))
    
    if max_eval_samples:
        eval_dataset = scale_dataset(eval_dataset, max_eval_samples)
    
    print(f"Scaled sizes - Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")

    # Initialize trainer with scaled data
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )
    
    # Start training
    try:
        print("Starting training...")
        trainer_output = trainer.train()
        
        # Save the model
        print("Saving model...")
        trainer.save_model("./smollm2_finetuned")
        tokenizer.save_pretrained("./smollm2_finetuned")
        
        return trainer_output
    
    except Exception as e:
        print(f"Training error: {e}")
        if torch.cuda.is_available():
            print("\nCUDA Memory Summary:")
            print(torch.cuda.memory_summary())
        raise

# Usage example:
# train_output = train_model(
#     train_dataset, 
#     eval_dataset, 
#     max_train_samples=40000,  # Adjust these numbers as needed
#     max_eval_samples=4000
# )