In [1]:
!pip install torch==2.5.1 torchaudio==2.5.1 torchvision==0.20.1 transformers==4.46.1 wandb==0.17.3 datasets==3.1.0 accelerate==1.0.1 vllm==0.6.4.post1 ipykernel==6.28.0 ipython==8.20.0 gradio==4.44.0 trl==0.12.0 --force-reinstall

Collecting openai==1.56.1
  Downloading openai-1.56.1-py3-none-any.whl.metadata (24 kB)
Collecting torch==2.5.1
  Downloading torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting torchaudio==2.5.1
  Downloading torchaudio-2.5.1-cp310-cp310-manylinux1_x86_64.whl.metadata (6.4 kB)
Collecting torchvision==0.20.1
  Downloading torchvision-0.20.1-cp310-cp310-manylinux1_x86_64.whl.metadata (6.1 kB)
Collecting transformers==4.46.1
  Downloading transformers-4.46.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wandb==0.17.3
  Downloading wandb-0.17.3-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting datasets==3.1.0
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting accelerate==1.0.1
  Downloading accelerate-1.0.1-py3-none-any.whl.metadata (19 kB)
Collectin

In [2]:
# Sanity check before training: list the GPUs visible on this machine together
# with driver/CUDA versions, memory usage and utilisation.
!nvidia-smi

Mon Mar 17 12:43:50 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.144.03             Driver Version: 550.144.03     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA H200                    On  |   00000000:18:00.0 Off |                    0 |
| N/A   24C    P0             72W /  700W |       1MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA H200                    On  |   00

In [None]:
import os
import logging
import warnings
from dataclasses import dataclass, field, asdict
from typing import Optional
from datetime import datetime

import torch
import transformers
from datasets import load_dataset
import trl
from huggingface_hub import HfApi, login


# Silence FutureWarning noise so the training log stays readable.
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Root logger: INFO and above, formatted as "timestamp - LEVEL - message".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)


@dataclass
class TrainingConfig:
    """Hyperparameters and run settings for a supervised fine-tuning run.

    Every field has a default, so ``TrainingConfig()`` is a complete
    configuration; override individual values by keyword, e.g.
    ``TrainingConfig(epochs=2)``.
    """

    # Hugging Face Hub id of the base model to fine-tune.
    model_name: str = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
    # Dataset identifier handed to ``datasets.load_dataset``.
    train_file_path: str = "simplescaling/s1K-1.1_tokenized"
    # Sequence-length budget in tokens (by its name; confirm how the training
    # entry point consumes it).
    block_size: int = 32768
    # Number of passes over the training split.
    epochs: int = 5
    # Per-device batch size, used for both training and evaluation.
    micro_batch_size: int = 1
    gradient_accumulation_steps: int = 1
    learning_rate: float = 1e-5
    weight_decay: float = 1e-4
    # Fraction of training used for learning-rate warmup.
    warmup_ratio: float = 0.05
    #use_fsdp: bool = True  # Enable Fully Sharded Data Parallel
    use_bf16: bool = True  # Mixed precision training
    # Evaluate / checkpoint every N optimizer steps.
    eval_steps: int = 50
    save_steps: int = 100
    # Whether checkpoints should contain model weights only (by its name;
    # confirm it is forwarded to the trainer arguments).
    save_only_model: bool = True
    # Publish the result to the Hugging Face Hub under ``hf_repo_name``.
    push_to_hub: bool = True
    # A plain default suffices: the value is an immutable constant, so no
    # ``default_factory`` (or f-string) is needed.
    hf_repo_name: str = "Nika-7b"
    # Evaluated once per instance, so each config gets its own timestamped
    # checkpoint directory.
    output_dir: str = field(default_factory=lambda: f"ckpts/{datetime.now().strftime('%Y%m%d_%H%M%S')}")

    def setup_env(self) -> None:
        """Export the Weights & Biases project and entity for this run.

        Sets ``WANDB_PROJECT`` and ``WANDB_ENTITY`` in ``os.environ``,
        overwriting any values already present in the process environment.
        """
        os.environ["WANDB_PROJECT"] = "nika-7b"
        os.environ["WANDB_ENTITY"] = "beastgokul4-"


def train(config: TrainingConfig) -> None:
    """Fine-tune ``config.model_name`` with TRL's ``SFTTrainer`` and optionally publish it.

    The run exports the W&B settings, logs in to the Hugging Face Hub, loads the
    dataset, model and tokenizer, trains with a completion-only collator, saves
    the model and tokenizer to ``config.output_dir`` and, when
    ``config.push_to_hub`` is set, uploads that directory to the Hub.

    Args:
        config: Hyperparameters and run settings for this training run.

    Raises:
        ValueError: If the ``HF_TOKEN`` environment variable is unset or empty.
    """
    config.setup_env()
    logging.info(f"Training config: {asdict(config)}")

    token = os.getenv("HF_TOKEN")
    if not token:
        raise ValueError("Please set HF_TOKEN in your environment variables.")
    login(token=token)

    dataset = load_dataset(config.train_file_path)

    # Only model ids containing "70B" get automatic device placement and dtype
    # selection; everything else is loaded with the library defaults.
    model_kwargs = {"device_map": "auto", "torch_dtype": "auto"} if "70B" in config.model_name else {}
    model = transformers.AutoModelForCausalLM.from_pretrained(config.model_name, **model_kwargs)

    tokenizer = transformers.AutoTokenizer.from_pretrained(config.model_name, use_fast=True)
    # The templates mark where a user turn and an assistant turn start; the
    # collator uses them so that only assistant responses contribute to the loss.
    # NOTE(review): this is a substring match, so any model id containing "Qwen"
    # (including the default DeepSeek distill) takes the first branch -- confirm
    # the dataset text really uses these markers for that tokenizer.
    if "Qwen" in config.model_name:
        instruction_template = "<|im_start|>user"
        response_template = "<|im_start|>assistant\n"
        tokenizer.pad_token = "<|fim_pad|>"
    else:
        instruction_template = "<|start_header_id|>user<|end_header_id|>"
        response_template = "<|start_header_id|>assistant<|end_header_id|>\n\n"
        tokenizer.pad_token = "<|reserved_special_token_5|>"

    collator = trl.DataCollatorForCompletionOnlyLM(
        instruction_template=instruction_template,
        response_template=response_template,
        tokenizer=tokenizer,
        mlm=False
    )

    # SFTConfig is TRL's TrainingArguments subclass. Building it directly lets
    # us pass `max_seq_length`: when it is left unset, the pinned TRL release
    # falls back to a short default and truncates longer examples, which would
    # silently ignore `config.block_size`.
    training_args = trl.SFTConfig(
        output_dir=config.output_dir,
        per_device_train_batch_size=config.micro_batch_size,
        per_device_eval_batch_size=config.micro_batch_size,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        num_train_epochs=config.epochs,
        logging_steps=1,
        save_steps=config.save_steps,
        # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
        eval_strategy="steps",
        eval_steps=config.eval_steps,
        learning_rate=config.learning_rate,
        weight_decay=config.weight_decay,
        warmup_ratio=config.warmup_ratio,
        save_total_limit=1,
        # Previously declared on the config but never forwarded.
        save_only_model=config.save_only_model,
        push_to_hub=config.push_to_hub,
        hub_model_id=config.hf_repo_name,
        hub_private_repo=True,
        bf16=config.use_bf16,
        max_seq_length=config.block_size,
        #fsdp="full_shard auto_wrap" if config.use_fsdp else None
    )

    trainer = trl.SFTTrainer(
        model=model,
        train_dataset=dataset["train"],
        # Without a "test" split, evaluation reuses the training split.
        eval_dataset=dataset["test"] if "test" in dataset else dataset["train"],
        args=training_args,
        data_collator=collator
    )

    trainer.train()

    trainer.save_model(config.output_dir)
    tokenizer.save_pretrained(config.output_dir)
    # Make sure every process has finished writing before anything is uploaded.
    trainer.accelerator.wait_for_everyone()

    # Upload from the main process only, so a multi-process launch does not
    # push the same folder once per rank.
    if config.push_to_hub and trainer.is_world_process_zero():
        api = HfApi()
        logging.info(f"Uploading model to Hugging Face Hub: {config.hf_repo_name}")
        # `private=True` matches `hub_private_repo` above. `create_repo` returns
        # the fully-qualified "<namespace>/<name>" id, which is what the upload
        # should target when `hf_repo_name` has no namespace.
        repo_id = api.create_repo(config.hf_repo_name, private=True, exist_ok=True).repo_id
        api.upload_folder(
            folder_path=config.output_dir,
            repo_id=repo_id
        )
        logging.info("Model successfully uploaded to Hugging Face!")


# Launch a run with the default configuration when this code is executed as the
# main module (which includes running this cell in a notebook kernel).
if __name__ == "__main__":
    default_config = TrainingConfig()
    train(default_config)


2025-03-17 12:43:54,838 - INFO - Training config: {'model_name': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', 'train_file_path': 'simplescaling/s1K-1.1_tokenized', 'block_size': 32768, 'epochs': 5, 'micro_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 1e-05, 'weight_decay': 0.0001, 'warmup_ratio': 0.05, 'use_bf16': True, 'eval_steps': 50, 'save_steps': 100, 'save_only_model': True, 'push_to_hub': True, 'hf_repo_name': 'Nika-7b', 'output_dir': 'ckpts/20250317_124354'}
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


README.md:   0%|          | 0.00/665 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/35.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/680 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.61G [00:00<?, ?B/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/6.62G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

2025-03-17 12:48:44,795 - ERROR - Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mbeastgokul4[0m ([33mbeastgokul4-[0m). Use [1m`wandb login --relogin`[0m to force relogin
