From e6efd8e6fffc4a0491c03d862b0b657a552d8b25 Mon Sep 17 00:00:00 2001 From: dt-edu Date: Sun, 3 May 2026 16:40:50 -0400 Subject: [PATCH 1/3] Contribtuions --- .../skills/automodel-expert-lora/SKILL.md | 211 +++++++++++++++ .../skills/automodel-expert-lora/card.yaml | 33 +++ .../automodel-expert-lora/evals/evals.json | 39 +++ .../skills/megatron-bridge-lora-sft/SKILL.md | 247 ++++++++++++++++++ .../skills/megatron-bridge-lora-sft/card.yaml | 31 +++ .../megatron-bridge-lora-sft/evals/evals.json | 39 +++ 6 files changed, 600 insertions(+) create mode 100644 skills_contribution/skills/automodel-expert-lora/SKILL.md create mode 100644 skills_contribution/skills/automodel-expert-lora/card.yaml create mode 100644 skills_contribution/skills/automodel-expert-lora/evals/evals.json create mode 100644 skills_contribution/skills/megatron-bridge-lora-sft/SKILL.md create mode 100644 skills_contribution/skills/megatron-bridge-lora-sft/card.yaml create mode 100644 skills_contribution/skills/megatron-bridge-lora-sft/evals/evals.json diff --git a/skills_contribution/skills/automodel-expert-lora/SKILL.md b/skills_contribution/skills/automodel-expert-lora/SKILL.md new file mode 100644 index 0000000..ef7e723 --- /dev/null +++ b/skills_contribution/skills/automodel-expert-lora/SKILL.md @@ -0,0 +1,211 @@ +--- +name: automodel-expert-lora +description: Apply LoRA to fused MoE expert layers in NeMo AutoModel using HuggingFace Transformers v5+ models. Covers expert parameter detection, rank_pattern configuration, and the validation warning emitted when match_all_linear silently skips expert weights. Use when fine-tuning MoE models (Mixtral, Qwen3-MoE, DeepSeek) with LoRA and needing expert layers adapted, or when diagnosing why only attention layers are changing during MoE LoRA training. +when_to_use: LoRA on MoE models in NeMo AutoModel, expert weight adaptation, rank_pattern configuration, silent skip diagnosis; 'match_all_linear MoE', 'expert LoRA', 'fused expert parameters', 'target_modules MoE', 'Mixtral LoRA', 'Qwen3-MoE LoRA', 'DeepSeek LoRA', 'nn.Parameter expert'. +license: Apache-2.0 +--- + +# Expert LoRA for Fused MoE Models + +Card: @skills/automodel-expert-lora/card.yaml + +## The Problem + +In Transformers v5+, fused MoE models (Mixtral, Qwen3-MoE, DeepSeek-V3, +GLM-4.5) register expert weights as `nn.Parameter` inside a combined linear +layer — not as individual `nn.Linear` modules. `match_all_linear=True` iterates +`nn.Linear` only. Expert parameters are invisible to it. + +Result: LoRA appears to run, loss changes only from attention adaptation, and +the expert layers are never modified. No error is raised. + +As of NeMo AutoModel v0.x (issue #1151), `apply_lora()` now emits a +`UserWarning` when this condition is detected, and three utilities are +available to configure expert LoRA correctly. + +## Quick Decision + +| Model family | Expert param pattern | Correct target_modules | +|---|---|---| +| Mixtral | `block_sparse_moe.w1/w2/w3` | `["w1", "w2", "w3"]` | +| Qwen3-MoE | `mlp.experts.gate_proj/up_proj/down_proj` | `["gate_proj", "up_proj", "down_proj"]` | +| DeepSeek-V3 | `mlp.experts.gate_proj/up_proj/down_proj` | `["gate_proj", "up_proj", "down_proj"]` | +| GLM-4.5 | `mlp.experts.gate_proj/up_proj/down_proj` | `["gate_proj", "up_proj", "down_proj"]` | + +If unsure, run `detect_fused_moe_experts(model)` — it returns the correct +list for any supported model. 
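+If you want to confirm the fused layout on your own checkpoint before wiring
+up `target_modules`, a plain PyTorch check is enough; no NeMo utility is
+required. The snippet below is a sketch rather than part of the AutoModel
+API: it assumes only that expert weights live under names containing
+"experts" or "block_sparse_moe" (the patterns from the table above) and that
+`model` is your already-loaded MoE model.
+
+```python
+import torch.nn as nn
+
+def list_fused_expert_params(model: nn.Module) -> list[str]:
+    """Return expert-looking parameters that do not belong to any nn.Linear.
+
+    These are exactly the weights that match_all_linear=True will skip.
+    """
+    # Full dotted names of every parameter owned directly by an nn.Linear.
+    linear_param_names = {
+        f"{mod_name}.{p_name}"
+        for mod_name, mod in model.named_modules()
+        if isinstance(mod, nn.Linear)
+        for p_name, _ in mod.named_parameters(recurse=False)
+    }
+    expert_markers = ("experts", "block_sparse_moe")  # assumption: common patterns only
+    return [
+        name
+        for name, _ in model.named_parameters()
+        if any(marker in name for marker in expert_markers)
+        and name not in linear_param_names
+    ]
+
+# Non-empty output means explicit target_modules is required for expert LoRA.
+print(list_fused_expert_params(model))
+```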
+ +## Enablement + +### Step 1 — Detect expert parameter names + +```python +from nemo_automodel.components._peft.lora import detect_fused_moe_experts + +targets = detect_fused_moe_experts(model) +# e.g. returns ["w1", "w2", "w3"] for Mixtral +# ["down_proj", "gate_proj", "up_proj"] for Qwen3-MoE +``` + +### Step 2 — Build rank_pattern (optional: per-expert rank sizing) + +```python +from nemo_automodel.components._peft.lora import build_expert_lora_rank_pattern + +rank_pattern = build_expert_lora_rank_pattern( + model, + base_rank=16, + expert_rank_multiplier=0.5, # smaller rank for experts to save memory +) +# e.g. {"block_sparse_moe": 8} +``` + +### Step 3 — Apply LoRA with explicit target_modules + +```python +from peft import LoraConfig, get_peft_model +from nemo_automodel.components._peft.lora import detect_fused_moe_experts, build_expert_lora_rank_pattern + +targets = detect_fused_moe_experts(model) +rank_pattern = build_expert_lora_rank_pattern(model, base_rank=16) + +lora_config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=targets, + rank_pattern=rank_pattern, + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", +) + +model = get_peft_model(model, lora_config) +model.print_trainable_parameters() +# Should show expert layers in trainable params, not just attention +``` + +### Validation warning + +If `apply_lora()` is called with `match_all_linear=True` and no +`target_modules`, and the model has fused expert parameters, a `UserWarning` +is emitted with the detected parameter names and a fix snippet. Treat this +as an error — silent expert-skip produces wrong training dynamics. + +``` +UserWarning: [NeMo AutoModel] Fused MoE expert parameters detected but will +NOT be adapted by LoRA. + + Detected expert parameter names: ['w1', 'w2', 'w3'] + + To apply LoRA to expert layers, pass target_modules explicitly: + lora_config = LoraConfig( + target_modules=['w1', 'w2', 'w3'], + rank_pattern=build_expert_lora_rank_pattern(model, base_rank=16), + ) +``` + +## Code Anchors + +Expert detection utility: + +```python +# nemo_automodel/components/_peft/lora.py +_FUSED_EXPERT_PARAM_PATTERNS = ( + "block_sparse_moe", # Mixtral + "mlp.experts", # Qwen3-MoE, DeepSeek + "moe.experts", # generic + "ffn.experts", # generic +) + +def detect_fused_moe_experts(model: nn.Module) -> list[str]: + # inspects named_parameters() for known fused MoE patterns + # returns sorted list of leaf parameter name suffixes +``` + +Rank pattern builder: + +```python +# nemo_automodel/components/_peft/lora.py +def build_expert_lora_rank_pattern( + model: nn.Module, + base_rank: int, + expert_rank_multiplier: float = 1.0, +) -> dict[str, int]: + # maps MoE pattern keys to int(base_rank * multiplier) + # returns {} for dense models +``` + +Validation hook in apply_lora: + +```python +# nemo_automodel/components/_peft/lora.py +def apply_lora(model, lora_config, match_all_linear=False, target_modules=None): + validate_lora_config_for_moe(model, match_all_linear, target_modules) + # ... existing LoRA application logic ... +``` + +Tests: + +```python +# tests/unit/components/peft/test_expert_lora.py +class TestDetectFusedMoeExperts # 4 tests +class TestBuildExpertLoraRankPattern # 4 tests +class TestValidateLoraConfigForMoe # 3 tests +``` + +## Pitfalls + +1. **Silent failure with match_all_linear**: The most dangerous failure mode. + Training appears normal, loss decreases, but expert weights are never + adapted. 
Only detectable by checking `model.print_trainable_parameters()` + and confirming expert layers appear — or by observing that expert-heavy + tasks show no improvement vs attention-only LoRA. + +2. **rank_pattern key must match parameter path substring**: The keys in + `rank_pattern` are matched against full parameter names. Use the pattern + as returned by `detect_fused_moe_experts` or `build_expert_lora_rank_pattern` + — do not abbreviate. + +3. **expert_rank_multiplier < 1 floors at rank 1**: Setting + `expert_rank_multiplier=0.1` with `base_rank=4` gives rank 1, not 0. + This is intentional — rank 0 is invalid. Verify effective rank with + `model.print_trainable_parameters()`. + +4. **Dense model returns empty pattern**: `build_expert_lora_rank_pattern` + returns `{}` for dense models. Passing an empty `rank_pattern` to + `LoraConfig` is safe — PEFT falls back to the global `r` value. + +5. **target_modules suppresses the warning**: Once `target_modules` is + provided, `validate_lora_config_for_moe` returns immediately and + does not check whether the provided names actually cover expert layers. + Use `detect_fused_moe_experts` to generate the list rather than + guessing module names. + +## Verification + +Unit tests for all three utilities: + +```bash +pytest tests/unit/components/peft/test_expert_lora.py -v +``` + +Expected: `12 passed` + +Confirm expert layers are trainable after apply_lora: + +```python +model = get_peft_model(model, lora_config) +trainable = {n for n, p in model.named_parameters() if p.requires_grad} +expert_patterns = detect_fused_moe_experts(model.base_model) +assert any( + any(pat in name for pat in expert_patterns) + for name in trainable +), "No expert parameters in trainable set — check target_modules" +``` + +Success criteria: + +- `12 passed` on unit tests +- `model.print_trainable_parameters()` shows expert layer names in the + trainable parameter count +- No `UserWarning` about fused MoE expert skip when target_modules is set diff --git a/skills_contribution/skills/automodel-expert-lora/card.yaml b/skills_contribution/skills/automodel-expert-lora/card.yaml new file mode 100644 index 0000000..a377293 --- /dev/null +++ b/skills_contribution/skills/automodel-expert-lora/card.yaml @@ -0,0 +1,33 @@ +name: automodel-expert-lora +version: "1.0" +author: Doondi-Ashlesh +status: community +recommendation_level: stable + +description: > + Apply LoRA to fused MoE expert layers in NeMo AutoModel. Covers expert + parameter detection, rank_pattern configuration, and the validation warning + emitted when match_all_linear silently skips expert weights in Transformers v5+. 
+ +use_cases: + - LoRA fine-tuning on Mixtral models targeting expert layers + - LoRA fine-tuning on Qwen3-MoE, DeepSeek, GLM-4.5 with expert adaptation + - Diagnosing silent expert-skip in existing MoE LoRA training runs + - Per-expert rank sizing via rank_pattern + +known_limitations: + - Expert detection relies on known fused MoE parameter name patterns + - Custom MoE architectures with non-standard parameter naming require + manual target_modules specification + - rank_pattern key matching is substring-based; ambiguous keys may + match unintended layers + +follow_up_validation: + - End-to-end MoE LoRA fine-tune + eval loop test not yet in CI + - Functional test on real Mixtral or Qwen3-MoE checkpoint pending + +related_issues: + - https://github.com/NVIDIA-NeMo/Automodel/issues/1151 + +related_skills: + - megatron-bridge-lora-sft diff --git a/skills_contribution/skills/automodel-expert-lora/evals/evals.json b/skills_contribution/skills/automodel-expert-lora/evals/evals.json new file mode 100644 index 0000000..615e6e8 --- /dev/null +++ b/skills_contribution/skills/automodel-expert-lora/evals/evals.json @@ -0,0 +1,39 @@ +{ + "skill_name": "automodel-expert-lora", + "evals": [ + { + "id": 1, + "prompt": "I'm applying LoRA to a Mixtral model in NeMo AutoModel with match_all_linear=True but the loss is not improving on expert-heavy tasks. What's wrong and how do I fix it?", + "expected_output": "Diagnosis that match_all_linear=True only matches nn.Linear and misses Mixtral's fused expert parameters (nn.Parameter in block_sparse_moe). Fix: use detect_fused_moe_experts(model) to get target_modules=['w1','w2','w3'] and pass them explicitly to LoraConfig.", + "assertions": [ + "Response identifies nn.Parameter vs nn.Linear as the root cause", + "Response references detect_fused_moe_experts utility", + "Response provides target_modules=['w1','w2','w3'] for Mixtral", + "Response does not suggest increasing lora_rank as the primary fix", + "Response mentions UserWarning that would have been emitted" + ] + }, + { + "id": 2, + "prompt": "How do I apply different LoRA ranks to attention vs expert layers in a Qwen3-MoE model using NeMo AutoModel?", + "expected_output": "Use build_expert_lora_rank_pattern(model, base_rank=16, expert_rank_multiplier=0.5) to generate rank_pattern dict, then pass both target_modules (from detect_fused_moe_experts) and rank_pattern to LoraConfig.", + "assertions": [ + "Response includes build_expert_lora_rank_pattern with base_rank and expert_rank_multiplier", + "Response shows rank_pattern passed to LoraConfig", + "Response includes detect_fused_moe_experts to get target_modules", + "Response explains that expert_rank_multiplier < 1 reduces expert rank below base_rank" + ] + }, + { + "id": 3, + "prompt": "After applying LoRA to my Qwen3-MoE model, how do I verify that expert layers are actually being trained and not silently skipped?", + "expected_output": "Use model.print_trainable_parameters() and check that expert layer names appear, plus a code snippet using detect_fused_moe_experts to assert expert pattern names are in the trainable parameter set.", + "assertions": [ + "Response includes model.print_trainable_parameters() call", + "Response provides assertion or check using detect_fused_moe_experts", + "Response explains what to look for in the trainable parameter output", + "Response mentions the 12 passed unit test expectation as a baseline check" + ] + } + ] +} diff --git a/skills_contribution/skills/megatron-bridge-lora-sft/SKILL.md 
b/skills_contribution/skills/megatron-bridge-lora-sft/SKILL.md new file mode 100644 index 0000000..b43027d --- /dev/null +++ b/skills_contribution/skills/megatron-bridge-lora-sft/SKILL.md @@ -0,0 +1,247 @@ +--- +name: megatron-bridge-lora-sft +description: Configure and run LoRA, DoRA, and full SFT fine-tuning in Megatron-Bridge. Covers PEFT recipe selection, target module wiring, adapter merging, and HuggingFace checkpoint export. Use when applying LoRA or DoRA to any Bridge-supported model, setting up SFT datasets, debugging PEFT config errors, or exporting fine-tuned weights back to HuggingFace format. +when_to_use: LoRA or DoRA fine-tuning, SFT recipe setup, PEFT config errors, adapter merging, HuggingFace export after fine-tuning; 'peft_config', 'LoRA', 'DoRA', 'lora_rank', 'target_modules', 'merge_lora', 'sft_config', 'fine-tune', 'adapter export'. +license: Apache-2.0 +--- + +# LoRA / DoRA / SFT Fine-Tuning + +Stable docs: @docs/training/peft.md +Card: @skills/megatron-bridge-lora-sft/card.yaml + +## Quick Decision + +| Goal | Recipe type | Min GPUs | +|---|---|---| +| LoRA on 8B model | `*_peft_config` | 1 | +| LoRA on 70B model | `*_peft_config` | 8 | +| LoRA on 235B MoE | `*_peft_config` | 16 | +| Full SFT on 8B | `*_sft_config` | 2 | +| Full SFT on 70B | `*_sft_config` | 16 | +| Merge adapters + export to HF | Post-training step | Same as training | + +Use PEFT recipes when GPU count is the constraint. Use SFT recipes when +you need full gradient flow through all parameters. + +## Enablement + +### LoRA (minimal) + +```python +from megatron.bridge.recipes.llama import llama3_8b_peft_config + +cfg = llama3_8b_peft_config() + +# Default: rank=16, alpha=32, target_modules=["linear_qkv", "linear_proj"] +# Override rank and alpha: +cfg.peft.lora_rank = 32 +cfg.peft.lora_alpha = 64 + +# Add MLP layers to target modules: +cfg.peft.target_modules = [ + "linear_qkv", + "linear_proj", + "linear_fc1", + "linear_fc2", +] +``` + +### DoRA + +```python +cfg.peft.use_dora = True +cfg.peft.lora_rank = 16 +cfg.peft.lora_alpha = 16 # alpha == rank is the DoRA convention +``` + +### SFT (full fine-tune) + +```python +from megatron.bridge.recipes.llama import llama3_8b_sft_config + +cfg = llama3_8b_sft_config() +cfg.dataset.data_path = ["/data/train.jsonl"] +cfg.dataset.seq_length = 4096 +cfg.train.global_batch_size = 128 +cfg.train.micro_batch_size = 2 +cfg.optimizer.lr = 1e-5 +``` + +### MoE LoRA — expert layer targeting + +For MoE models (Qwen3-MoE, DeepSeek, GLM-4.5), expert weights are +registered as `nn.Parameter`, not `nn.Linear`. `match_all_linear=True` +silently skips them. 
Set `target_modules` explicitly: + +```python +cfg = qwen3_30b_a3b_peft_config() +cfg.peft.target_modules = [ + "linear_qkv", # attention + "linear_proj", # attention output + "gate_proj", # expert gate + "up_proj", # expert up + "down_proj", # expert down +] +cfg.peft.lora_rank = 16 +cfg.peft.lora_alpha = 32 +``` + +### Adapter merge and HuggingFace export + +```python +from megatron.bridge.peft.merge import merge_lora_weights +from megatron.bridge.convert import export_to_hf + +# Step 1: merge adapters into base weights +merge_lora_weights( + checkpoint_dir="/checkpoints/lora_run", + output_dir="/checkpoints/merged", +) + +# Step 2: export merged checkpoint to HuggingFace format +export_to_hf( + megatron_checkpoint="/checkpoints/merged", + hf_output_dir="/hf_model/", + model_type="llama3", +) +``` + +Or via CLI: + +```bash +python scripts/convert/megatron_to_hf.py \ + --checkpoint /checkpoints/merged \ + --output /hf_model/ \ + --model-type llama3 +``` + +## Entry Points + +```bash +# LoRA fine-tune (1 GPU) +uv run python -m torch.distributed.run --nproc_per_node=1 \ + scripts/training/run_recipe.py \ + --recipe llama3_8b_peft_config \ + --dataset llm-finetune + +# SFT fine-tune (2 GPUs) +uv run python -m torch.distributed.run --nproc_per_node=2 \ + scripts/training/run_recipe.py \ + --recipe llama3_8b_sft_config \ + --dataset llm-finetune + +# Override LoRA rank via CLI +uv run python -m torch.distributed.run --nproc_per_node=1 \ + scripts/training/run_recipe.py \ + --recipe llama3_8b_peft_config \ + --dataset llm-finetune \ + 'peft.lora_rank=32' \ + 'peft.lora_alpha=64' +``` + +## Code Anchors + +PEFT config definition: + +```python +# src/megatron/bridge/training/config.py +@dataclass +class PEFTConfig: + lora_rank: int = 16 + lora_alpha: float = 32 + lora_dropout: float = 0.0 + use_dora: bool = False + target_modules: list[str] = field(default_factory=lambda: ["linear_qkv", "linear_proj"]) + match_all_linear: bool = False +``` + +LoRA adapter application: + +```python +# src/megatron/bridge/training/peft.py +def apply_lora(model, peft_config): + # wraps target modules with LoraLinear / DoraLinear + # match_all_linear iterates nn.Linear only — misses nn.Parameter MoE experts +``` + +Merge utility: + +```python +# src/megatron/bridge/peft/merge.py +def merge_lora_weights(checkpoint_dir, output_dir): + # loads base + adapter shards, merges in-place, writes merged checkpoint +``` + +PEFT recipe examples: + +```python +# src/megatron/bridge/recipes/llama.py +def llama3_8b_peft_config() -> ConfigContainer: + cfg = llama3_8b_sft_config() + cfg.peft = PEFTConfig(lora_rank=16, lora_alpha=32) + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + return cfg +``` + +## Pitfalls + +1. **MoE expert layers silently skipped**: `match_all_linear=True` only + matches `nn.Linear`. Expert weights in fused MoE blocks (Qwen3-MoE, + DeepSeek, GLM-4.5) are `nn.Parameter` — they are invisible to the + matcher. Always set `target_modules` explicitly for MoE models. + +2. **DoRA alpha convention**: DoRA expects `lora_alpha == lora_rank`. Using + the standard LoRA convention (`alpha = 2 * rank`) will not error but + produces suboptimal scaling. Set `alpha = rank` for DoRA. + +3. **Merge before export**: Exporting a LoRA checkpoint to HuggingFace + without merging produces a broken HF model — the base weights do not + include adapter contributions. Always run `merge_lora_weights()` first. + +4. 
**TP > 1 with PEFT**: LoRA adapters are sharded along with the base + layer when `tensor_model_parallel_size > 1`. The adapter shapes must be + consistent across TP ranks. Mismatched `lora_rank` between ranks causes + a shape error at initialization, not at the first forward pass. + +5. **SFT with packed sequences requires MBS=1**: When `PackedSequenceSpecs` + is active, setting `micro_batch_size > 1` raises a `ValueError`. PEFT + recipes default to `MBS=1`; SFT recipes may need explicit adjustment. + +6. **`calculate_per_token_loss` for SFT with CP**: When context parallelism + (`context_parallel_size > 1`) is enabled for SFT, set + `cfg.model.calculate_per_token_loss = True` and + `cfg.ddp.average_in_collective = False`. Omitting either causes + incorrect loss scaling across CP ranks. + +7. **LoRA dropout and inference**: `lora_dropout > 0` is training-only. + Ensure the adapter is saved and merged in eval mode or dropout + will be applied during export, corrupting merged weights. + +## Verification + +Unit test coverage for PEFT config validation: + +```bash +uv run python -m pytest tests/unit_tests/training/test_config.py \ + -k "peft or lora" -v +``` + +Smoke test LoRA on 1 GPU with mock data: + +```bash +CUDA_VISIBLE_DEVICES=0 uv run python -m torch.distributed.run --nproc_per_node=1 \ + scripts/training/run_recipe.py \ + --recipe llama3_8b_peft_config \ + --dataset llm-finetune \ + 'train.train_iters=5' \ + 'logger.log_interval=1' +``` + +Success criteria: + +- Exit code 0 +- Finite loss at iteration 5 (e.g. `lm loss: 9.8E+00`) +- Log shows `PEFTConfig` with expected `lora_rank` and `target_modules` +- No `KeyError` or shape mismatch during adapter initialization diff --git a/skills_contribution/skills/megatron-bridge-lora-sft/card.yaml b/skills_contribution/skills/megatron-bridge-lora-sft/card.yaml new file mode 100644 index 0000000..30b87eb --- /dev/null +++ b/skills_contribution/skills/megatron-bridge-lora-sft/card.yaml @@ -0,0 +1,31 @@ +name: megatron-bridge-lora-sft +version: "1.0" +author: Doondi-Ashlesh +status: community +recommendation_level: stable + +description: > + Configure and run LoRA, DoRA, and full SFT fine-tuning in Megatron-Bridge. + Covers PEFT recipe selection, target module wiring for dense and MoE models, + adapter merging, and HuggingFace checkpoint export. 
+ +use_cases: + - LoRA fine-tuning on dense models (Llama, Qwen3, Gemma) + - DoRA fine-tuning + - LoRA on MoE models with explicit expert layer targeting + - Full SFT fine-tuning + - Adapter merge and HuggingFace export + +known_limitations: + - Expert layer LoRA requires explicit target_modules for MoE models + - DoRA requires alpha == rank for correct weight decomposition scaling + - Adapter merge must precede HuggingFace export + +follow_up_validation: + - End-to-end LoRA merge + export round-trip test not yet in CI + - MoE expert LoRA functional test pending + +related_skills: + - recipe-recommender + - perf-parallelism-strategies + - perf-sequence-packing diff --git a/skills_contribution/skills/megatron-bridge-lora-sft/evals/evals.json b/skills_contribution/skills/megatron-bridge-lora-sft/evals/evals.json new file mode 100644 index 0000000..4835132 --- /dev/null +++ b/skills_contribution/skills/megatron-bridge-lora-sft/evals/evals.json @@ -0,0 +1,39 @@ +{ + "skill_name": "megatron-bridge-lora-sft", + "evals": [ + { + "id": 1, + "prompt": "How do I set up LoRA fine-tuning for Llama 3 8B in Megatron-Bridge with rank 32 targeting attention and MLP layers?", + "expected_output": "Python config snippet using llama3_8b_peft_config(), setting lora_rank=32, lora_alpha=64, and target_modules covering both attention (linear_qkv, linear_proj) and MLP (linear_fc1, linear_fc2) layers, plus the launch command.", + "assertions": [ + "Response includes llama3_8b_peft_config() recipe reference", + "Response sets lora_rank=32", + "Response sets target_modules with at least 4 entries covering attention and MLP", + "Response includes the uv run torch.distributed.run launch command", + "Response does not suggest match_all_linear=True as the solution" + ] + }, + { + "id": 2, + "prompt": "I'm trying to apply LoRA to a Qwen3-30B-A3B MoE model in Megatron-Bridge but my loss isn't changing — it looks like only attention layers are being adapted. 
What's wrong?", + "expected_output": "Diagnosis that match_all_linear silently skips MoE expert parameters (nn.Parameter, not nn.Linear), and the fix: explicitly setting target_modules to include gate_proj, up_proj, down_proj in addition to attention layers.", + "assertions": [ + "Response identifies the root cause as match_all_linear missing nn.Parameter expert weights", + "Response provides explicit target_modules list including expert layer names", + "Response references qwen3_30b_a3b_peft_config or equivalent MoE recipe", + "Response does not suggest increasing lora_rank as the primary fix" + ] + }, + { + "id": 3, + "prompt": "How do I merge LoRA adapters and export the result to HuggingFace format after training in Megatron-Bridge?", + "expected_output": "Two-step process: first merge_lora_weights() to combine adapter into base checkpoint, then export_to_hf() or CLI megatron_to_hf.py to produce the HF model directory.", + "assertions": [ + "Response shows merge step before export step", + "Response includes checkpoint_dir and output_dir arguments for merge", + "Response includes CLI or Python export command with model-type argument", + "Response warns that skipping merge produces a broken HF model" + ] + } + ] +} From fe7a22bd7928cb47931190885b54a645a2c0154f Mon Sep 17 00:00:00 2001 From: Doondi-Ashlesh Date: Sun, 3 May 2026 22:30:15 +0000 Subject: [PATCH 2/3] Add skills: automodel-expert-lora and megatron-bridge-lora-sft - Add automodel-expert-lora to skills/NeMo-AutoModel/ with component entry - Add megatron-bridge-lora-sft to skills/Megatron-Bridge/ - Each skill includes SKILL.md, card.yaml, and evals/evals.json Signed-off-by: Doondi-Ashlesh --- components.d/automodel.yml | 8 + .../megatron-bridge-lora-sft/SKILL.md | 201 ++++++++++++++++++ .../megatron-bridge-lora-sft/card.yaml | 80 +++++++ .../megatron-bridge-lora-sft/evals/evals.json | 44 ++++ .../automodel-expert-lora/SKILL.md | 201 ++++++++++++++++++ .../automodel-expert-lora/card.yaml | 76 +++++++ .../automodel-expert-lora/evals/evals.json | 44 ++++ 7 files changed, 654 insertions(+) create mode 100644 components.d/automodel.yml create mode 100644 skills/Megatron-Bridge/megatron-bridge-lora-sft/SKILL.md create mode 100644 skills/Megatron-Bridge/megatron-bridge-lora-sft/card.yaml create mode 100644 skills/Megatron-Bridge/megatron-bridge-lora-sft/evals/evals.json create mode 100644 skills/NeMo-AutoModel/automodel-expert-lora/SKILL.md create mode 100644 skills/NeMo-AutoModel/automodel-expert-lora/card.yaml create mode 100644 skills/NeMo-AutoModel/automodel-expert-lora/evals/evals.json diff --git a/components.d/automodel.yml b/components.d/automodel.yml new file mode 100644 index 0000000..22641c7 --- /dev/null +++ b/components.d/automodel.yml @@ -0,0 +1,8 @@ +name: NeMo-AutoModel +repo: NVIDIA-NeMo/Automodel +description: NeMo AutoModel — fine-tuning and training of HuggingFace-compatible models, including LoRA, PEFT, and MoE workflows. +skills: + - path: skills/ + catalog_dir: NeMo-AutoModel +links: + security: false diff --git a/skills/Megatron-Bridge/megatron-bridge-lora-sft/SKILL.md b/skills/Megatron-Bridge/megatron-bridge-lora-sft/SKILL.md new file mode 100644 index 0000000..1b914c3 --- /dev/null +++ b/skills/Megatron-Bridge/megatron-bridge-lora-sft/SKILL.md @@ -0,0 +1,201 @@ +--- +name: megatron-bridge-lora-sft +description: Configure and run LoRA, DoRA, and full SFT fine-tuning in Megatron-Bridge. 
Covers LoRA dataclass setup, target module wiring, normalize_moe_lora for MoE models, and adapter export via AutoBridge.export_adapter_ckpt. Use when applying LoRA or DoRA to any Bridge-supported model, setting up SFT datasets, or exporting fine-tuned adapters to HuggingFace PEFT format. +when_to_use: LoRA or DoRA fine-tuning, SFT recipe setup, normalize_moe_lora, MoE expert targeting, adapter export to HuggingFace, peft_scheme lora dora, dim alpha target_modules LoRA dataclass, torchrun recipe fine-tune, export_adapter_ckpt AutoBridge. +--- + +# LoRA / DoRA / SFT Fine-Tuning + +Card: @skills/megatron-bridge-lora-sft/card.yaml + +## Quick Decision + +| Goal | peft_scheme | Min GPUs | +|---|---|---| +| LoRA on 1B model | `"lora"` | 1 | +| DoRA on 1B model | `"dora"` | 1 | +| Full SFT on 8B | sft recipe | 2 | +| Export adapter to HF PEFT | CPU only | 0 GPUs | + +## Enablement + +### LoRA (minimal) + +```python +from megatron.bridge.recipes.llama import llama32_1b_peft_config + +config = llama32_1b_peft_config(peft_scheme="lora") + +# Default target_modules: ["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"] +# Default dim=32, alpha=32 + +# Override rank and alpha: +config.peft.dim = 16 +config.peft.alpha = 32 +``` + +Launch: + +```bash +torchrun --nproc_per_node=1 tutorials/recipes/llama/01_quickstart_finetune.py \ + --pretrained-checkpoint /path/to/checkpoint +``` + +### DoRA + +```python +config = llama32_1b_peft_config(peft_scheme="dora") +config.peft.dim = 16 +config.peft.alpha = 64 # DoRA default alpha is 64 +``` + +### MoE LoRA — expert layer targeting + +For MoE models, add expert projection names to `target_modules` and enable +`normalize_moe_lora` to scale down expert rank proportionally: + +```python +from megatron.bridge.peft.lora import LoRA + +lora = LoRA( + target_modules=[ + "linear_qkv", # attention + "linear_proj", # attention output + "linear_fc1", # MLP gate/up (dense fallback) + "linear_fc2", # MLP down (dense fallback) + ], + dim=32, + alpha=32, + normalize_moe_lora=True, # dim // moe_router_topk for expert layers +) +``` + +With `normalize_moe_lora=True`: +- Expert linear layers: effective dim = `dim // moe_router_topk` +- Non-expert layers: effective dim = `dim` (unchanged) +- `dim` must be evenly divisible by `moe_router_topk` + +### Adapter export to HuggingFace + +```python +from megatron.bridge import AutoBridge + +bridge = AutoBridge(hf_model_path="/path/to/hf/model") + +bridge.export_adapter_ckpt( + peft_checkpoint="/checkpoints/lora_run", + output_path="./my_adapter", +) +# produces: ./my_adapter/adapter_config.json +# ./my_adapter/adapter_model.safetensors +``` + +Or via CLI script: + +```bash +python examples/conversion/adapter/export_adapter.py \ + --hf-model-path /path/to/hf/model \ + --lora-checkpoint /checkpoints/lora_run \ + --output ./my_adapter +``` + +The exported adapter loads directly with HuggingFace PEFT: + +```python +from peft import PeftModel +model = PeftModel.from_pretrained(base_model, "./my_adapter") +``` + +Export runs on CPU — no GPU required. 
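+As a quick post-export sanity check, the exported directory can be loaded
+back with stock HuggingFace tooling. This is a sketch under assumptions: the
+paths are placeholders, and the base checkpoint is whatever HF model the
+bridge was constructed from.
+
+```python
+from pathlib import Path
+
+from peft import PeftModel
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+adapter_dir = Path("./my_adapter")  # output_path used in export_adapter_ckpt above
+assert (adapter_dir / "adapter_config.json").exists()
+assert (adapter_dir / "adapter_model.safetensors").exists()
+
+base = AutoModelForCausalLM.from_pretrained("/path/to/hf/model")
+tokenizer = AutoTokenizer.from_pretrained("/path/to/hf/model")
+model = PeftModel.from_pretrained(base, str(adapter_dir))
+
+# A short generation confirms the adapter shapes line up with the base model.
+inputs = tokenizer("Sanity check:", return_tensors="pt")
+print(tokenizer.decode(model.generate(**inputs, max_new_tokens=8)[0]))
+```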
+ +## Code Anchors + +LoRA dataclass: + +```python +# src/megatron/bridge/peft/lora.py +@dataclass +class LoRA(PEFT, ModuleMatcher): + target_modules: List[str] = field( + default_factory=lambda: ["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"] + ) + dim: int = 32 + alpha: int = 32 + dropout: float = 0.0 + dropout_position: Literal["pre", "post"] = "pre" + lora_A_init_method: str = "xavier" + lora_B_init_method: str = "zero" + a2a_experimental: bool = False + lora_dtype: torch.dtype = None + normalize_moe_lora: bool = False +``` + +DoRA dataclass: + +```python +# src/megatron/bridge/peft/dora.py +@dataclass +class DoRA(PEFT, ModuleMatcher): + target_modules: List[str] = field( + default_factory=lambda: ["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"] + ) + dim: int = 32 + alpha: int = 64 # DoRA default differs from LoRA default +``` + +Recipe function: + +```python +# tutorials/recipes/llama/01_quickstart_finetune.py +from megatron.bridge.recipes.llama import llama32_1b_peft_config + +config = llama32_1b_peft_config(peft_scheme="lora") # or "dora" +config.peft.dim = 16 +config.peft.alpha = 32 +``` + +Export: + +```python +# examples/conversion/adapter/export_adapter.py +bridge = AutoBridge(hf_model_path=...) +bridge.export_adapter_ckpt(peft_checkpoint=..., output_path=...) +``` + +## Pitfalls + +1. **MoE expert layers silently skipped without normalize_moe_lora or explicit targets**: + The default `target_modules` covers attention and MLP layers for dense models. + For MoE models, expert weights may not be covered — verify with a forward pass + that expert parameters have `requires_grad=True`. + +2. **DoRA alpha convention**: DoRA default `alpha=64`, not 32. Check the `DoRA` + dataclass defaults before overriding. + +3. **normalize_moe_lora requires evenly divisible dim**: `dim` must be divisible by + `moe_router_topk`. Indivisible `dim` values will error. + +4. **Export produces HF PEFT adapter — no merge step needed**: Unlike some frameworks, + `export_adapter_ckpt` produces `adapter_config.json` + `adapter_model.safetensors` + which load directly via `PeftModel.from_pretrained`. No separate merge step is + required before HuggingFace use. + +5. **TP > 1 with PEFT**: LoRA adapter shapes are sharded with the base layer when + `tensor_model_parallel_size > 1`. Adapter `dim` must be consistent across TP ranks. + Mismatched `dim` causes a shape error at initialization. + +## Verification + +Smoke test LoRA on 1 GPU with mock data: + +```bash +torchrun --nproc_per_node=1 tutorials/recipes/llama/01_quickstart_finetune.py \ + --pretrained-checkpoint /path/to/checkpoint +``` + +Success criteria: + +- Exit code 0 +- Finite loss in logs +- Adapter files generated: `adapter_config.json` + `adapter_model.safetensors` +- `PeftModel.from_pretrained(base_model, output_path)` loads without error diff --git a/skills/Megatron-Bridge/megatron-bridge-lora-sft/card.yaml b/skills/Megatron-Bridge/megatron-bridge-lora-sft/card.yaml new file mode 100644 index 0000000..6f5b635 --- /dev/null +++ b/skills/Megatron-Bridge/megatron-bridge-lora-sft/card.yaml @@ -0,0 +1,80 @@ +title: megatron_bridge_lora_sft +validated_on: "2026-05-03" +summary: > + Megatron-Bridge exposes LoRA and DoRA via the LoRA and DoRA dataclasses in + src/megatron/bridge/peft/. Default target_modules cover attention and MLP dense + layers. MoE expert rank normalization is via normalize_moe_lora=True (divides dim + by moe_router_topk for expert layers). 
Adapter export to HuggingFace PEFT format + uses AutoBridge.export_adapter_ckpt — produces adapter_config.json and + adapter_model.safetensors compatible with PeftModel.from_pretrained. + +validation_status: + lora_dataclass: + - code_verified + dora_dataclass: + - code_verified + normalize_moe_lora: + - code_verified + recipe_function_llama32_1b: + - code_verified + export_adapter_ckpt: + - code_verified + peft_model_load_after_export: + - code_verified + tp_peft_sharding: + - unclear + end_to_end_moe_lora_finetune: + - unclear + +feature_meaning: + lora_dataclass: > + LoRA(target_modules, dim=32, alpha=32, normalize_moe_lora=False). + Applied to model via peft_scheme="lora" in recipe functions. + dora_dataclass: > + DoRA(target_modules, dim=32, alpha=64). DoRA default alpha is 64, not 32. + Applied via peft_scheme="dora". + normalize_moe_lora: > + When True, expert linear layers use dim // moe_router_topk instead of full dim. + Non-expert layers keep full dim. dim must be evenly divisible by moe_router_topk. + export_adapter_ckpt: > + AutoBridge(hf_model_path).export_adapter_ckpt(peft_checkpoint, output_path). + Generates adapter_config.json + adapter_model.safetensors. Runs on CPU. + Output loads directly via PeftModel.from_pretrained(base_model, output_path). + +recommended_path: + lora_minimal: + recipe: llama32_1b_peft_config(peft_scheme="lora") + peft.dim: 16 + peft.alpha: 32 + dora: + recipe: llama32_1b_peft_config(peft_scheme="dora") + peft.dim: 16 + peft.alpha: 64 + moe_lora: + peft.normalize_moe_lora: true + peft.dim: 32 + note: dim must be divisible by moe_router_topk + export: + step_1: "bridge = AutoBridge(hf_model_path)" + step_2: "bridge.export_adapter_ckpt(peft_checkpoint, output_path)" + +known_constraints: + - DoRA default alpha is 64, not 32; overriding without checking defaults may produce incorrect scaling. + - normalize_moe_lora requires dim evenly divisible by moe_router_topk. + - TP > 1 with PEFT requires consistent adapter dim across all TP ranks; mismatch errors at init. + - export_adapter_ckpt produces HF PEFT adapter files — no separate merge step is needed before HF use. + +known_limitations: + - End-to-end MoE LoRA fine-tune on a real MoE checkpoint not confirmed in CI. + - TP > 1 PEFT sharding behavior not fully validated from source review. + +evidence: + - src/megatron/bridge/peft/lora.py + - src/megatron/bridge/peft/dora.py + - tutorials/recipes/llama/01_quickstart_finetune.py + - examples/conversion/adapter/export_adapter.py + +follow_up_validation: + - Add a checked-in end-to-end LoRA adapter export round-trip CI test. + - Confirm normalize_moe_lora on a real MoE checkpoint (DeepSeek, Qwen3-MoE). + - Clarify whether TP > 1 PEFT is validated on current container versions. diff --git a/skills/Megatron-Bridge/megatron-bridge-lora-sft/evals/evals.json b/skills/Megatron-Bridge/megatron-bridge-lora-sft/evals/evals.json new file mode 100644 index 0000000..309f3a5 --- /dev/null +++ b/skills/Megatron-Bridge/megatron-bridge-lora-sft/evals/evals.json @@ -0,0 +1,44 @@ +[ + { + "id": "lora-001-llama-dim-alpha-target-modules", + "question": "How do I set up LoRA fine-tuning for a Llama model in Megatron-Bridge with rank 16 targeting attention and MLP layers?", + "expected_skill": "megatron-bridge-lora-sft", + "expected_script": null, + "ground_truth": "Use llama32_1b_peft_config(peft_scheme='lora') as the starting recipe. Set config.peft.dim=16 and config.peft.alpha=32. 
The default target_modules already includes ['linear_qkv', 'linear_proj', 'linear_fc1', 'linear_fc2'] covering both attention and MLP layers. Launch with torchrun --nproc_per_node=1 tutorials/recipes/llama/01_quickstart_finetune.py --pretrained-checkpoint /path/to/checkpoint.", + "expected_behavior": [ + "References llama32_1b_peft_config(peft_scheme='lora') as the recipe entry point", + "Sets config.peft.dim=16 (not lora_rank)", + "Sets config.peft.alpha=32", + "Mentions that default target_modules covers linear_qkv, linear_proj, linear_fc1, linear_fc2", + "Includes the torchrun launch command with --pretrained-checkpoint" + ] + }, + { + "id": "lora-002-moe-normalize-moe-lora", + "question": "I'm applying LoRA to a MoE model in Megatron-Bridge but want expert layers to use a smaller rank than attention layers. How do I do that?", + "expected_skill": "megatron-bridge-lora-sft", + "expected_script": null, + "ground_truth": "Set normalize_moe_lora=True on the LoRA dataclass. With dim=32 and moe_router_topk=2, expert linear layers get effective dim = 32 // 2 = 16, while non-expert layers keep the full dim=32. dim must be evenly divisible by moe_router_topk. This is set directly on the LoRA dataclass: LoRA(dim=32, alpha=32, normalize_moe_lora=True, target_modules=[...]).", + "expected_behavior": [ + "References normalize_moe_lora=True as the mechanism for per-layer rank reduction", + "Explains that expert layer effective dim = dim // moe_router_topk", + "Explains that non-expert layers keep the full dim", + "Notes that dim must be evenly divisible by moe_router_topk", + "Shows the LoRA dataclass usage directly, not a fabricated PEFTConfig field" + ] + }, + { + "id": "lora-003-export-adapter-hf-peft", + "question": "How do I export my LoRA adapter checkpoint from Megatron-Bridge to HuggingFace PEFT format?", + "expected_skill": "megatron-bridge-lora-sft", + "expected_script": null, + "ground_truth": "Use AutoBridge(hf_model_path) and call bridge.export_adapter_ckpt(peft_checkpoint='/checkpoints/lora_run', output_path='./my_adapter'). This produces adapter_config.json and adapter_model.safetensors in the output directory. The export runs on CPU. The result loads directly with PeftModel.from_pretrained(base_model, './my_adapter'). Alternatively, use the CLI: python examples/conversion/adapter/export_adapter.py --hf-model-path ... --lora-checkpoint ... --output ...", + "expected_behavior": [ + "References AutoBridge and export_adapter_ckpt as the export mechanism", + "Shows the peft_checkpoint and output_path arguments", + "States the export produces adapter_config.json and adapter_model.safetensors", + "Mentions the export runs on CPU (no GPU needed)", + "Shows PeftModel.from_pretrained as the consumption pattern, or the CLI alternative" + ] + } +] diff --git a/skills/NeMo-AutoModel/automodel-expert-lora/SKILL.md b/skills/NeMo-AutoModel/automodel-expert-lora/SKILL.md new file mode 100644 index 0000000..83357b8 --- /dev/null +++ b/skills/NeMo-AutoModel/automodel-expert-lora/SKILL.md @@ -0,0 +1,201 @@ +--- +name: automodel-expert-lora +description: Apply LoRA to fused MoE expert layers in NeMo AutoModel. Covers PeftConfig setup, moe_rank_scaling for automatic per-expert rank reduction, target_modules wildcard matching for expert layers, and the GroupedExpertsTE limitation. Use when fine-tuning MoE models (models using GroupedExperts or GroupedExpertsDeepEP) with LoRA and needing expert layers adapted, or when diagnosing why expert weights are not being trained. 
+when_to_use: LoRA on MoE models in NeMo AutoModel, expert weight adaptation, moe_rank_scaling, target_modules for MoE, expert LoRA patching, GroupedExperts LoRA, dim scaling by n_activated_experts, apply_lora_to_linear_modules MoE. +--- + +# Expert LoRA for Fused MoE Models + +Card: @skills/automodel-expert-lora/card.yaml + +## The Problem + +In NeMo AutoModel, fused MoE expert layers (`GroupedExperts`, `GroupedExpertsDeepEP`) are +not `nn.Linear` modules. `match_all_linear=True` iterates `nn.Linear` only and silently +skips expert parameters. + +Result: LoRA runs but only attention or dense linear layers are adapted. Expert weights are +never modified. No error is raised. + +Additionally, `GroupedExpertsTE` (Transformer Engine expert layers) are not supported — +passing them raises `NotImplementedError`. + +## Quick Decision + +| Scenario | PeftConfig setting | +|---|---| +| Adapt expert layers | `target_modules=["*experts*"]` | +| Adapt specific expert name | `target_modules=["experts"]` | +| Reduce expert rank proportionally | `moe_rank_scaling=True` | +| Dense model only | `match_all_linear=True` (skips MoE) | +| TE expert layers | Not supported — raises NotImplementedError | + +## Enablement + +### Step 1 — Configure PeftConfig for expert layers + +```python +from nemo_automodel.components._peft.lora import PeftConfig, apply_lora_to_linear_modules + +peft_config = PeftConfig( + target_modules=["*experts*"], # wildcard matches modules with "experts" in the name + dim=16, + alpha=32, +) + +n_patched = apply_lora_to_linear_modules(model, peft_config) +# returns count of modules patched +``` + +For exact name matching instead of wildcard: + +```python +peft_config = PeftConfig( + target_modules=["experts"], # exact substring match + dim=8, + alpha=32, +) +``` + +### Step 2 — Use moe_rank_scaling for proportional rank reduction + +`moe_rank_scaling=True` divides `dim` by `n_activated_experts` for expert modules while +keeping the full `dim` for dense linear layers. This normalizes total adapter capacity. 
+ +```python +peft_config = PeftConfig( + target_modules=["experts", "linear"], # both MoE and dense + dim=16, + alpha=32, + moe_rank_scaling=True, +) +# model.config.n_activated_experts = 2 +# → expert lora_dim = 16 // 2 = 8 +# → linear lora_dim = 16 (unchanged) + +n_patched = apply_lora_to_linear_modules(model, peft_config) +``` + +Constraints: +- `dim` must be >= `n_activated_experts`; otherwise raises `ValueError` +- Non-evenly-divisible `dim` is allowed (floor division) but emits a warning +- `moe_rank_scaling=False` (default): all modules use the full `dim` + +### Step 3 — Verify expert layers are trainable + +```python +trainable = [(n, p.shape) for n, p in model.named_parameters() if p.requires_grad] +# Confirm expert parameter names appear in the list +assert any("experts" in n for n, _ in trainable), \ + "No expert parameters are trainable — check target_modules" +``` + +## Code Anchors + +PeftConfig and application function: + +```python +# nemo_automodel/components/_peft/lora.py +@dataclass +class PeftConfig: + target_modules: list = field(default_factory=list) + exclude_modules: list = field(default_factory=list) + match_all_linear: bool = False + dim: int = 8 + alpha: int = 32 + use_dora: bool = False + dropout: float = 0.0 + dropout_position: Literal["pre", "post"] = "post" + lora_A_init: str = "xavier" + lora_dtype: Optional[torch.dtype] = None + use_triton: bool = False + moe_rank_scaling: bool = False + +def apply_lora_to_linear_modules( + model: nn.Module, + peft_config: PeftConfig, + quantization_config=None, + skip_freeze: bool = False, +) -> int: + # patches matched nn.Linear and MoE expert modules + # returns count of patched modules +``` + +MoE module patching: + +```python +# nemo_automodel/components/_peft/lora.py +def patch_moe_module( + orig_module, + dim=8, + alpha=32, + lora_A_init_method="xavier", + lora_dtype=None, +) -> nn.Module: + # GroupedExperts → GroupedExpertsLoRA + # GroupedExpertsDeepEP → GroupedExpertsDeepEPLoRA + # GroupedExpertsTE → raises NotImplementedError +``` + +Tests: + +```python +# tests/unit_tests/_peft/test_lora_experts.py +test_apply_lora_equivalence # wildcard target_modules=["*experts*"] +test_apply_lora_patching_logic # exact and wildcard matching +test_moe_rank_scaling_basic # dim=16, n_activated_experts=2 → lora_dim=8 +test_moe_rank_scaling_default_off # moe_rank_scaling=False keeps full dim +test_moe_rank_scaling_floor_division_warning # non-divisible dim +test_moe_rank_scaling_dim_too_small # dim < n_activated_experts → ValueError +test_moe_rank_scaling_output_equivalence # zero-init B → identical baseline output +``` + +## Pitfalls + +1. **Silent expert-skip with match_all_linear**: `match_all_linear=True` iterates + `nn.Linear` modules only. Expert modules are not `nn.Linear` — they are silently + skipped. Training appears to run but only dense/attention layers are adapted. + Always set `target_modules` explicitly when working with MoE models. + +2. **GroupedExpertsTE not supported**: Models using Transformer Engine expert layers + (`GroupedExpertsTE`) raise `NotImplementedError` when `patch_moe_module` is called. + There is no workaround — TE expert LoRA is not implemented. + +3. **dim too small with moe_rank_scaling**: Setting `dim < n_activated_experts` with + `moe_rank_scaling=True` raises a `ValueError`. Increase `dim` to at least + `n_activated_experts`. + +4. **Floor division warning**: When `dim` is not evenly divisible by `n_activated_experts`, + floor division is applied and a warning is logged. 
The resulting `lora_dim` may be + unexpectedly small. Verify effective rank with trainable parameter inspection. + +5. **target_modules must match module names, not parameter names**: Wildcard patterns + like `"*experts*"` are matched against module names from `model.named_modules()`, + not parameter names from `model.named_parameters()`. + +## Verification + +Run unit tests: + +```bash +pytest tests/unit_tests/_peft/test_lora_experts.py -v +``` + +Confirm expert modules are patched: + +```python +peft_config = PeftConfig(target_modules=["*experts*"], dim=8) +n = apply_lora_to_linear_modules(model, peft_config) +assert n > 0, "No modules were patched — check target_modules pattern" + +trainable = {n for n, p in model.named_parameters() if p.requires_grad} +assert any("experts" in name for name in trainable), \ + "Expert parameters not in trainable set" +``` + +Success criteria: + +- Unit tests pass +- `n_patched > 0` after `apply_lora_to_linear_modules` +- Expert parameter names appear in `model.named_parameters()` with `requires_grad=True` +- No `NotImplementedError` (i.e., model does not use `GroupedExpertsTE`) diff --git a/skills/NeMo-AutoModel/automodel-expert-lora/card.yaml b/skills/NeMo-AutoModel/automodel-expert-lora/card.yaml new file mode 100644 index 0000000..ada1bba --- /dev/null +++ b/skills/NeMo-AutoModel/automodel-expert-lora/card.yaml @@ -0,0 +1,76 @@ +title: automodel_expert_lora +validated_on: "2026-05-03" +summary: > + NeMo AutoModel supports LoRA on fused MoE expert layers via PeftConfig with + target_modules and moe_rank_scaling. Setting target_modules=["*experts*"] patches + GroupedExperts and GroupedExpertsDeepEP modules. moe_rank_scaling=True divides dim + by n_activated_experts for expert layers while keeping full dim for dense layers. + GroupedExpertsTE raises NotImplementedError. Confirmed from source and unit tests. + +validation_status: + peft_config_dataclass: + - code_verified + apply_lora_to_linear_modules: + - code_verified + patch_moe_module_grouped_experts: + - code_verified + patch_moe_module_deepep: + - code_verified + patch_moe_module_te_not_supported: + - code_verified + moe_rank_scaling_dim_division: + - code_verified + wildcard_target_modules_matching: + - code_verified + unit_tests: + - code_verified + end_to_end_finetune: + - unclear + +feature_meaning: + peft_config: > + Dataclass controlling LoRA application. Key fields: target_modules (list of + module name patterns), dim (LoRA rank), alpha, match_all_linear (nn.Linear only), + moe_rank_scaling (divide dim by n_activated_experts for expert layers). + apply_lora_to_linear_modules: > + Main entry point. Freezes base model parameters, iterates named modules, + patches matched nn.Linear and MoE expert modules. Returns count of patched modules. + patch_moe_module: > + Patches a single MoE module. GroupedExperts → GroupedExpertsLoRA, + GroupedExpertsDeepEP → GroupedExpertsDeepEPLoRA, GroupedExpertsTE → NotImplementedError. + moe_rank_scaling: > + When True, expert lora_dim = dim // n_activated_experts; dense lora_dim = dim. + Requires dim >= n_activated_experts. Non-divisible dim uses floor division with warning. 
+ +recommended_path: + expert_lora_minimal: + target_modules: '["*experts*"]' + dim: 16 + alpha: 32 + expert_lora_with_rank_scaling: + target_modules: '["*experts*", "linear"]' + dim: 16 + alpha: 32 + moe_rank_scaling: true + verification: + check: "n = apply_lora_to_linear_modules(model, peft_config); assert n > 0" + +known_constraints: + - GroupedExpertsTE is not supported and raises NotImplementedError; no workaround exists. + - dim must be >= n_activated_experts when moe_rank_scaling=True or ValueError is raised. + - Non-divisible dim is allowed with moe_rank_scaling but uses floor division; a warning is logged. + - match_all_linear=True only iterates nn.Linear and silently skips all MoE expert modules. + - target_modules patterns are matched against module names, not parameter names. + +known_limitations: + - End-to-end MoE LoRA fine-tune on a real checkpoint (Mixtral, Qwen3-MoE) not in CI. + - DoRA (use_dora=True) with MoE modules: not confirmed from unit tests. + +evidence: + - nemo_automodel/components/_peft/lora.py + - tests/unit_tests/_peft/test_lora_experts.py + +follow_up_validation: + - Add an end-to-end CI test that runs apply_lora_to_linear_modules on a real MoE checkpoint. + - Confirm DoRA (use_dora=True) behavior with GroupedExperts modules. + - Confirm moe_rank_scaling behavior when n_activated_experts > 2. diff --git a/skills/NeMo-AutoModel/automodel-expert-lora/evals/evals.json b/skills/NeMo-AutoModel/automodel-expert-lora/evals/evals.json new file mode 100644 index 0000000..434632e --- /dev/null +++ b/skills/NeMo-AutoModel/automodel-expert-lora/evals/evals.json @@ -0,0 +1,44 @@ +[ + { + "id": "moe-lora-001-match-all-linear-silent-skip", + "question": "I'm applying LoRA to a MoE model in NeMo AutoModel with match_all_linear=True but expert layers are not being trained. What's happening?", + "expected_skill": "automodel-expert-lora", + "expected_script": null, + "ground_truth": "match_all_linear=True only iterates nn.Linear modules. MoE expert layers (GroupedExperts, GroupedExpertsDeepEP) are not nn.Linear and are silently skipped. Fix: set target_modules=['*experts*'] in PeftConfig instead of relying on match_all_linear. Confirm patching by checking that apply_lora_to_linear_modules returns a count > 0 and that expert parameter names appear with requires_grad=True.", + "expected_behavior": [ + "Identifies that match_all_linear only iterates nn.Linear and silently skips MoE expert modules", + "Explains that GroupedExperts and GroupedExpertsDeepEP are not nn.Linear", + "Provides target_modules=['*experts*'] as the fix in PeftConfig", + "Mentions apply_lora_to_linear_modules return count as a verification step", + "Does not suggest increasing dim as the primary fix" + ] + }, + { + "id": "moe-lora-002-moe-rank-scaling", + "question": "How do I apply a smaller LoRA rank to MoE expert layers than to dense attention layers in NeMo AutoModel?", + "expected_skill": "automodel-expert-lora", + "expected_script": null, + "ground_truth": "Use moe_rank_scaling=True in PeftConfig. With dim=16 and n_activated_experts=2, expert layers get lora_dim=8 (dim // n_activated_experts) while dense linear layers keep the full dim=16. Set target_modules to cover both expert and linear modules. dim must be >= n_activated_experts or a ValueError is raised. 
Non-divisible dim uses floor division with a warning.", + "expected_behavior": [ + "References moe_rank_scaling=True in PeftConfig as the mechanism", + "Explains that expert lora_dim = dim // n_activated_experts", + "Explains that dense linear layers keep the full dim", + "Notes that dim must be >= n_activated_experts or ValueError is raised", + "Notes that non-divisible dim uses floor division with a warning" + ] + }, + { + "id": "moe-lora-003-groupedexpertste-not-supported", + "question": "I'm trying to apply LoRA to a Transformer Engine MoE model in NeMo AutoModel and getting a NotImplementedError. Why?", + "expected_skill": "automodel-expert-lora", + "expected_script": null, + "ground_truth": "GroupedExpertsTE (Transformer Engine expert layers) are not supported by patch_moe_module. When apply_lora_to_linear_modules encounters a GroupedExpertsTE module, it raises NotImplementedError with 'LoRA is not supported for Transformer Engine'. There is no workaround — TE expert LoRA is not implemented. Only GroupedExperts (→ GroupedExpertsLoRA) and GroupedExpertsDeepEP (→ GroupedExpertsDeepEPLoRA) are supported.", + "expected_behavior": [ + "Identifies GroupedExpertsTE as the unsupported module type", + "States that patch_moe_module raises NotImplementedError for TE expert layers", + "States there is no workaround — TE expert LoRA is not implemented", + "Lists the supported types: GroupedExperts and GroupedExpertsDeepEP", + "Does not suggest a TE-specific workaround that does not exist" + ] + } +] From 4306256e25ac092eb30d57582fbb8ed7d054627c Mon Sep 17 00:00:00 2001 From: Doondi-Ashlesh Date: Sun, 3 May 2026 22:57:32 +0000 Subject: [PATCH 3/3] Remove skills_contribution staging folder Signed-off-by: Doondi-Ashlesh --- .../skills/automodel-expert-lora/SKILL.md | 211 --------------- .../skills/automodel-expert-lora/card.yaml | 33 --- .../automodel-expert-lora/evals/evals.json | 39 --- .../skills/megatron-bridge-lora-sft/SKILL.md | 247 ------------------ .../skills/megatron-bridge-lora-sft/card.yaml | 31 --- .../megatron-bridge-lora-sft/evals/evals.json | 39 --- 6 files changed, 600 deletions(-) delete mode 100644 skills_contribution/skills/automodel-expert-lora/SKILL.md delete mode 100644 skills_contribution/skills/automodel-expert-lora/card.yaml delete mode 100644 skills_contribution/skills/automodel-expert-lora/evals/evals.json delete mode 100644 skills_contribution/skills/megatron-bridge-lora-sft/SKILL.md delete mode 100644 skills_contribution/skills/megatron-bridge-lora-sft/card.yaml delete mode 100644 skills_contribution/skills/megatron-bridge-lora-sft/evals/evals.json diff --git a/skills_contribution/skills/automodel-expert-lora/SKILL.md b/skills_contribution/skills/automodel-expert-lora/SKILL.md deleted file mode 100644 index ef7e723..0000000 --- a/skills_contribution/skills/automodel-expert-lora/SKILL.md +++ /dev/null @@ -1,211 +0,0 @@ ---- -name: automodel-expert-lora -description: Apply LoRA to fused MoE expert layers in NeMo AutoModel using HuggingFace Transformers v5+ models. Covers expert parameter detection, rank_pattern configuration, and the validation warning emitted when match_all_linear silently skips expert weights. Use when fine-tuning MoE models (Mixtral, Qwen3-MoE, DeepSeek) with LoRA and needing expert layers adapted, or when diagnosing why only attention layers are changing during MoE LoRA training. 
-when_to_use: LoRA on MoE models in NeMo AutoModel, expert weight adaptation, rank_pattern configuration, silent skip diagnosis; 'match_all_linear MoE', 'expert LoRA', 'fused expert parameters', 'target_modules MoE', 'Mixtral LoRA', 'Qwen3-MoE LoRA', 'DeepSeek LoRA', 'nn.Parameter expert'. -license: Apache-2.0 ---- - -# Expert LoRA for Fused MoE Models - -Card: @skills/automodel-expert-lora/card.yaml - -## The Problem - -In Transformers v5+, fused MoE models (Mixtral, Qwen3-MoE, DeepSeek-V3, -GLM-4.5) register expert weights as `nn.Parameter` inside a combined linear -layer — not as individual `nn.Linear` modules. `match_all_linear=True` iterates -`nn.Linear` only. Expert parameters are invisible to it. - -Result: LoRA appears to run, loss changes only from attention adaptation, and -the expert layers are never modified. No error is raised. - -As of NeMo AutoModel v0.x (issue #1151), `apply_lora()` now emits a -`UserWarning` when this condition is detected, and three utilities are -available to configure expert LoRA correctly. - -## Quick Decision - -| Model family | Expert param pattern | Correct target_modules | -|---|---|---| -| Mixtral | `block_sparse_moe.w1/w2/w3` | `["w1", "w2", "w3"]` | -| Qwen3-MoE | `mlp.experts.gate_proj/up_proj/down_proj` | `["gate_proj", "up_proj", "down_proj"]` | -| DeepSeek-V3 | `mlp.experts.gate_proj/up_proj/down_proj` | `["gate_proj", "up_proj", "down_proj"]` | -| GLM-4.5 | `mlp.experts.gate_proj/up_proj/down_proj` | `["gate_proj", "up_proj", "down_proj"]` | - -If unsure, run `detect_fused_moe_experts(model)` — it returns the correct -list for any supported model. - -## Enablement - -### Step 1 — Detect expert parameter names - -```python -from nemo_automodel.components._peft.lora import detect_fused_moe_experts - -targets = detect_fused_moe_experts(model) -# e.g. returns ["w1", "w2", "w3"] for Mixtral -# ["down_proj", "gate_proj", "up_proj"] for Qwen3-MoE -``` - -### Step 2 — Build rank_pattern (optional: per-expert rank sizing) - -```python -from nemo_automodel.components._peft.lora import build_expert_lora_rank_pattern - -rank_pattern = build_expert_lora_rank_pattern( - model, - base_rank=16, - expert_rank_multiplier=0.5, # smaller rank for experts to save memory -) -# e.g. {"block_sparse_moe": 8} -``` - -### Step 3 — Apply LoRA with explicit target_modules - -```python -from peft import LoraConfig, get_peft_model -from nemo_automodel.components._peft.lora import detect_fused_moe_experts, build_expert_lora_rank_pattern - -targets = detect_fused_moe_experts(model) -rank_pattern = build_expert_lora_rank_pattern(model, base_rank=16) - -lora_config = LoraConfig( - r=16, - lora_alpha=32, - target_modules=targets, - rank_pattern=rank_pattern, - lora_dropout=0.05, - bias="none", - task_type="CAUSAL_LM", -) - -model = get_peft_model(model, lora_config) -model.print_trainable_parameters() -# Should show expert layers in trainable params, not just attention -``` - -### Validation warning - -If `apply_lora()` is called with `match_all_linear=True` and no -`target_modules`, and the model has fused expert parameters, a `UserWarning` -is emitted with the detected parameter names and a fix snippet. Treat this -as an error — silent expert-skip produces wrong training dynamics. - -``` -UserWarning: [NeMo AutoModel] Fused MoE expert parameters detected but will -NOT be adapted by LoRA. 
- - Detected expert parameter names: ['w1', 'w2', 'w3'] - - To apply LoRA to expert layers, pass target_modules explicitly: - lora_config = LoraConfig( - target_modules=['w1', 'w2', 'w3'], - rank_pattern=build_expert_lora_rank_pattern(model, base_rank=16), - ) -``` - -## Code Anchors - -Expert detection utility: - -```python -# nemo_automodel/components/_peft/lora.py -_FUSED_EXPERT_PARAM_PATTERNS = ( - "block_sparse_moe", # Mixtral - "mlp.experts", # Qwen3-MoE, DeepSeek - "moe.experts", # generic - "ffn.experts", # generic -) - -def detect_fused_moe_experts(model: nn.Module) -> list[str]: - # inspects named_parameters() for known fused MoE patterns - # returns sorted list of leaf parameter name suffixes -``` - -Rank pattern builder: - -```python -# nemo_automodel/components/_peft/lora.py -def build_expert_lora_rank_pattern( - model: nn.Module, - base_rank: int, - expert_rank_multiplier: float = 1.0, -) -> dict[str, int]: - # maps MoE pattern keys to int(base_rank * multiplier) - # returns {} for dense models -``` - -Validation hook in apply_lora: - -```python -# nemo_automodel/components/_peft/lora.py -def apply_lora(model, lora_config, match_all_linear=False, target_modules=None): - validate_lora_config_for_moe(model, match_all_linear, target_modules) - # ... existing LoRA application logic ... -``` - -Tests: - -```python -# tests/unit/components/peft/test_expert_lora.py -class TestDetectFusedMoeExperts # 4 tests -class TestBuildExpertLoraRankPattern # 4 tests -class TestValidateLoraConfigForMoe # 3 tests -``` - -## Pitfalls - -1. **Silent failure with match_all_linear**: The most dangerous failure mode. - Training appears normal, loss decreases, but expert weights are never - adapted. Only detectable by checking `model.print_trainable_parameters()` - and confirming expert layers appear — or by observing that expert-heavy - tasks show no improvement vs attention-only LoRA. - -2. **rank_pattern key must match parameter path substring**: The keys in - `rank_pattern` are matched against full parameter names. Use the pattern - as returned by `detect_fused_moe_experts` or `build_expert_lora_rank_pattern` - — do not abbreviate. - -3. **expert_rank_multiplier < 1 floors at rank 1**: Setting - `expert_rank_multiplier=0.1` with `base_rank=4` gives rank 1, not 0. - This is intentional — rank 0 is invalid. Verify effective rank with - `model.print_trainable_parameters()`. - -4. **Dense model returns empty pattern**: `build_expert_lora_rank_pattern` - returns `{}` for dense models. Passing an empty `rank_pattern` to - `LoraConfig` is safe — PEFT falls back to the global `r` value. - -5. **target_modules suppresses the warning**: Once `target_modules` is - provided, `validate_lora_config_for_moe` returns immediately and - does not check whether the provided names actually cover expert layers. - Use `detect_fused_moe_experts` to generate the list rather than - guessing module names. 
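-
-A minimal sketch, assuming the `build_expert_lora_rank_pattern` helper above,
-that ties pitfalls 2 and 5 together: a pre-flight check confirming every
-`rank_pattern` key matches at least one parameter path before training starts.
-
-```python
-from nemo_automodel.components._peft.lora import build_expert_lora_rank_pattern
-
-rank_pattern = build_expert_lora_rank_pattern(model, base_rank=16)
-param_names = [name for name, _ in model.named_parameters()]
-for key in rank_pattern:
-    # substring check mirrors the key matching described in pitfall 2;
-    # an unmatched key means the pattern would silently apply to nothing
-    assert any(key in name for name in param_names), (
-        f"rank_pattern key {key!r} matches no parameter; check target_modules"
-    )
-```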
- -## Verification - -Unit tests for all three utilities: - -```bash -pytest tests/unit/components/peft/test_expert_lora.py -v -``` - -Expected: `12 passed` - -Confirm expert layers are trainable after apply_lora: - -```python -model = get_peft_model(model, lora_config) -trainable = {n for n, p in model.named_parameters() if p.requires_grad} -expert_patterns = detect_fused_moe_experts(model.base_model) -assert any( - any(pat in name for pat in expert_patterns) - for name in trainable -), "No expert parameters in trainable set — check target_modules" -``` - -Success criteria: - -- `12 passed` on unit tests -- `model.print_trainable_parameters()` shows expert layer names in the - trainable parameter count -- No `UserWarning` about fused MoE expert skip when target_modules is set diff --git a/skills_contribution/skills/automodel-expert-lora/card.yaml b/skills_contribution/skills/automodel-expert-lora/card.yaml deleted file mode 100644 index a377293..0000000 --- a/skills_contribution/skills/automodel-expert-lora/card.yaml +++ /dev/null @@ -1,33 +0,0 @@ -name: automodel-expert-lora -version: "1.0" -author: Doondi-Ashlesh -status: community -recommendation_level: stable - -description: > - Apply LoRA to fused MoE expert layers in NeMo AutoModel. Covers expert - parameter detection, rank_pattern configuration, and the validation warning - emitted when match_all_linear silently skips expert weights in Transformers v5+. - -use_cases: - - LoRA fine-tuning on Mixtral models targeting expert layers - - LoRA fine-tuning on Qwen3-MoE, DeepSeek, GLM-4.5 with expert adaptation - - Diagnosing silent expert-skip in existing MoE LoRA training runs - - Per-expert rank sizing via rank_pattern - -known_limitations: - - Expert detection relies on known fused MoE parameter name patterns - - Custom MoE architectures with non-standard parameter naming require - manual target_modules specification - - rank_pattern key matching is substring-based; ambiguous keys may - match unintended layers - -follow_up_validation: - - End-to-end MoE LoRA fine-tune + eval loop test not yet in CI - - Functional test on real Mixtral or Qwen3-MoE checkpoint pending - -related_issues: - - https://github.com/NVIDIA-NeMo/Automodel/issues/1151 - -related_skills: - - megatron-bridge-lora-sft diff --git a/skills_contribution/skills/automodel-expert-lora/evals/evals.json b/skills_contribution/skills/automodel-expert-lora/evals/evals.json deleted file mode 100644 index 615e6e8..0000000 --- a/skills_contribution/skills/automodel-expert-lora/evals/evals.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "skill_name": "automodel-expert-lora", - "evals": [ - { - "id": 1, - "prompt": "I'm applying LoRA to a Mixtral model in NeMo AutoModel with match_all_linear=True but the loss is not improving on expert-heavy tasks. What's wrong and how do I fix it?", - "expected_output": "Diagnosis that match_all_linear=True only matches nn.Linear and misses Mixtral's fused expert parameters (nn.Parameter in block_sparse_moe). 
Fix: use detect_fused_moe_experts(model) to get target_modules=['w1','w2','w3'] and pass them explicitly to LoraConfig.", - "assertions": [ - "Response identifies nn.Parameter vs nn.Linear as the root cause", - "Response references detect_fused_moe_experts utility", - "Response provides target_modules=['w1','w2','w3'] for Mixtral", - "Response does not suggest increasing lora_rank as the primary fix", - "Response mentions UserWarning that would have been emitted" - ] - }, - { - "id": 2, - "prompt": "How do I apply different LoRA ranks to attention vs expert layers in a Qwen3-MoE model using NeMo AutoModel?", - "expected_output": "Use build_expert_lora_rank_pattern(model, base_rank=16, expert_rank_multiplier=0.5) to generate rank_pattern dict, then pass both target_modules (from detect_fused_moe_experts) and rank_pattern to LoraConfig.", - "assertions": [ - "Response includes build_expert_lora_rank_pattern with base_rank and expert_rank_multiplier", - "Response shows rank_pattern passed to LoraConfig", - "Response includes detect_fused_moe_experts to get target_modules", - "Response explains that expert_rank_multiplier < 1 reduces expert rank below base_rank" - ] - }, - { - "id": 3, - "prompt": "After applying LoRA to my Qwen3-MoE model, how do I verify that expert layers are actually being trained and not silently skipped?", - "expected_output": "Use model.print_trainable_parameters() and check that expert layer names appear, plus a code snippet using detect_fused_moe_experts to assert expert pattern names are in the trainable parameter set.", - "assertions": [ - "Response includes model.print_trainable_parameters() call", - "Response provides assertion or check using detect_fused_moe_experts", - "Response explains what to look for in the trainable parameter output", - "Response mentions the 12 passed unit test expectation as a baseline check" - ] - } - ] -} diff --git a/skills_contribution/skills/megatron-bridge-lora-sft/SKILL.md b/skills_contribution/skills/megatron-bridge-lora-sft/SKILL.md deleted file mode 100644 index b43027d..0000000 --- a/skills_contribution/skills/megatron-bridge-lora-sft/SKILL.md +++ /dev/null @@ -1,247 +0,0 @@ ---- -name: megatron-bridge-lora-sft -description: Configure and run LoRA, DoRA, and full SFT fine-tuning in Megatron-Bridge. Covers PEFT recipe selection, target module wiring, adapter merging, and HuggingFace checkpoint export. Use when applying LoRA or DoRA to any Bridge-supported model, setting up SFT datasets, debugging PEFT config errors, or exporting fine-tuned weights back to HuggingFace format. -when_to_use: LoRA or DoRA fine-tuning, SFT recipe setup, PEFT config errors, adapter merging, HuggingFace export after fine-tuning; 'peft_config', 'LoRA', 'DoRA', 'lora_rank', 'target_modules', 'merge_lora', 'sft_config', 'fine-tune', 'adapter export'. -license: Apache-2.0 ---- - -# LoRA / DoRA / SFT Fine-Tuning - -Stable docs: @docs/training/peft.md -Card: @skills/megatron-bridge-lora-sft/card.yaml - -## Quick Decision - -| Goal | Recipe type | Min GPUs | -|---|---|---| -| LoRA on 8B model | `*_peft_config` | 1 | -| LoRA on 70B model | `*_peft_config` | 8 | -| LoRA on 235B MoE | `*_peft_config` | 16 | -| Full SFT on 8B | `*_sft_config` | 2 | -| Full SFT on 70B | `*_sft_config` | 16 | -| Merge adapters + export to HF | Post-training step | Same as training | - -Use PEFT recipes when GPU count is the constraint. Use SFT recipes when -you need full gradient flow through all parameters. 
- -## Enablement - -### LoRA (minimal) - -```python -from megatron.bridge.recipes.llama import llama3_8b_peft_config - -cfg = llama3_8b_peft_config() - -# Default: rank=16, alpha=32, target_modules=["linear_qkv", "linear_proj"] -# Override rank and alpha: -cfg.peft.lora_rank = 32 -cfg.peft.lora_alpha = 64 - -# Add MLP layers to target modules: -cfg.peft.target_modules = [ - "linear_qkv", - "linear_proj", - "linear_fc1", - "linear_fc2", -] -``` - -### DoRA - -```python -cfg.peft.use_dora = True -cfg.peft.lora_rank = 16 -cfg.peft.lora_alpha = 16 # alpha == rank is the DoRA convention -``` - -### SFT (full fine-tune) - -```python -from megatron.bridge.recipes.llama import llama3_8b_sft_config - -cfg = llama3_8b_sft_config() -cfg.dataset.data_path = ["/data/train.jsonl"] -cfg.dataset.seq_length = 4096 -cfg.train.global_batch_size = 128 -cfg.train.micro_batch_size = 2 -cfg.optimizer.lr = 1e-5 -``` - -### MoE LoRA — expert layer targeting - -For MoE models (Qwen3-MoE, DeepSeek, GLM-4.5), expert weights are -registered as `nn.Parameter`, not `nn.Linear`. `match_all_linear=True` -silently skips them. Set `target_modules` explicitly: - -```python -cfg = qwen3_30b_a3b_peft_config() -cfg.peft.target_modules = [ - "linear_qkv", # attention - "linear_proj", # attention output - "gate_proj", # expert gate - "up_proj", # expert up - "down_proj", # expert down -] -cfg.peft.lora_rank = 16 -cfg.peft.lora_alpha = 32 -``` - -### Adapter merge and HuggingFace export - -```python -from megatron.bridge.peft.merge import merge_lora_weights -from megatron.bridge.convert import export_to_hf - -# Step 1: merge adapters into base weights -merge_lora_weights( - checkpoint_dir="/checkpoints/lora_run", - output_dir="/checkpoints/merged", -) - -# Step 2: export merged checkpoint to HuggingFace format -export_to_hf( - megatron_checkpoint="/checkpoints/merged", - hf_output_dir="/hf_model/", - model_type="llama3", -) -``` - -Or via CLI: - -```bash -python scripts/convert/megatron_to_hf.py \ - --checkpoint /checkpoints/merged \ - --output /hf_model/ \ - --model-type llama3 -``` - -## Entry Points - -```bash -# LoRA fine-tune (1 GPU) -uv run python -m torch.distributed.run --nproc_per_node=1 \ - scripts/training/run_recipe.py \ - --recipe llama3_8b_peft_config \ - --dataset llm-finetune - -# SFT fine-tune (2 GPUs) -uv run python -m torch.distributed.run --nproc_per_node=2 \ - scripts/training/run_recipe.py \ - --recipe llama3_8b_sft_config \ - --dataset llm-finetune - -# Override LoRA rank via CLI -uv run python -m torch.distributed.run --nproc_per_node=1 \ - scripts/training/run_recipe.py \ - --recipe llama3_8b_peft_config \ - --dataset llm-finetune \ - 'peft.lora_rank=32' \ - 'peft.lora_alpha=64' -``` - -## Code Anchors - -PEFT config definition: - -```python -# src/megatron/bridge/training/config.py -@dataclass -class PEFTConfig: - lora_rank: int = 16 - lora_alpha: float = 32 - lora_dropout: float = 0.0 - use_dora: bool = False - target_modules: list[str] = field(default_factory=lambda: ["linear_qkv", "linear_proj"]) - match_all_linear: bool = False -``` - -LoRA adapter application: - -```python -# src/megatron/bridge/training/peft.py -def apply_lora(model, peft_config): - # wraps target modules with LoraLinear / DoraLinear - # match_all_linear iterates nn.Linear only — misses nn.Parameter MoE experts -``` - -Merge utility: - -```python -# src/megatron/bridge/peft/merge.py -def merge_lora_weights(checkpoint_dir, output_dir): - # loads base + adapter shards, merges in-place, writes merged checkpoint -``` - -PEFT recipe 
examples: - -```python -# src/megatron/bridge/recipes/llama.py -def llama3_8b_peft_config() -> ConfigContainer: - cfg = llama3_8b_sft_config() - cfg.peft = PEFTConfig(lora_rank=16, lora_alpha=32) - cfg.model.tensor_model_parallel_size = 1 - cfg.model.pipeline_model_parallel_size = 1 - return cfg -``` - -## Pitfalls - -1. **MoE expert layers silently skipped**: `match_all_linear=True` only - matches `nn.Linear`. Expert weights in fused MoE blocks (Qwen3-MoE, - DeepSeek, GLM-4.5) are `nn.Parameter` — they are invisible to the - matcher. Always set `target_modules` explicitly for MoE models. - -2. **DoRA alpha convention**: DoRA expects `lora_alpha == lora_rank`. Using - the standard LoRA convention (`alpha = 2 * rank`) will not error but - produces suboptimal scaling. Set `alpha = rank` for DoRA. - -3. **Merge before export**: Exporting a LoRA checkpoint to HuggingFace - without merging produces a broken HF model — the base weights do not - include adapter contributions. Always run `merge_lora_weights()` first. - -4. **TP > 1 with PEFT**: LoRA adapters are sharded along with the base - layer when `tensor_model_parallel_size > 1`. The adapter shapes must be - consistent across TP ranks. Mismatched `lora_rank` between ranks causes - a shape error at initialization, not at the first forward pass. - -5. **SFT with packed sequences requires MBS=1**: When `PackedSequenceSpecs` - is active, setting `micro_batch_size > 1` raises a `ValueError`. PEFT - recipes default to `MBS=1`; SFT recipes may need explicit adjustment. - -6. **`calculate_per_token_loss` for SFT with CP**: When context parallelism - (`context_parallel_size > 1`) is enabled for SFT, set - `cfg.model.calculate_per_token_loss = True` and - `cfg.ddp.average_in_collective = False`. Omitting either causes - incorrect loss scaling across CP ranks. - -7. **LoRA dropout and inference**: `lora_dropout > 0` is training-only. - Ensure the adapter is saved and merged in eval mode or dropout - will be applied during export, corrupting merged weights. - -## Verification - -Unit test coverage for PEFT config validation: - -```bash -uv run python -m pytest tests/unit_tests/training/test_config.py \ - -k "peft or lora" -v -``` - -Smoke test LoRA on 1 GPU with mock data: - -```bash -CUDA_VISIBLE_DEVICES=0 uv run python -m torch.distributed.run --nproc_per_node=1 \ - scripts/training/run_recipe.py \ - --recipe llama3_8b_peft_config \ - --dataset llm-finetune \ - 'train.train_iters=5' \ - 'logger.log_interval=1' -``` - -Success criteria: - -- Exit code 0 -- Finite loss at iteration 5 (e.g. `lm loss: 9.8E+00`) -- Log shows `PEFTConfig` with expected `lora_rank` and `target_modules` -- No `KeyError` or shape mismatch during adapter initialization diff --git a/skills_contribution/skills/megatron-bridge-lora-sft/card.yaml b/skills_contribution/skills/megatron-bridge-lora-sft/card.yaml deleted file mode 100644 index 30b87eb..0000000 --- a/skills_contribution/skills/megatron-bridge-lora-sft/card.yaml +++ /dev/null @@ -1,31 +0,0 @@ -name: megatron-bridge-lora-sft -version: "1.0" -author: Doondi-Ashlesh -status: community -recommendation_level: stable - -description: > - Configure and run LoRA, DoRA, and full SFT fine-tuning in Megatron-Bridge. - Covers PEFT recipe selection, target module wiring for dense and MoE models, - adapter merging, and HuggingFace checkpoint export. 
- -use_cases: - - LoRA fine-tuning on dense models (Llama, Qwen3, Gemma) - - DoRA fine-tuning - - LoRA on MoE models with explicit expert layer targeting - - Full SFT fine-tuning - - Adapter merge and HuggingFace export - -known_limitations: - - Expert layer LoRA requires explicit target_modules for MoE models - - DoRA requires alpha == rank for correct weight decomposition scaling - - Adapter merge must precede HuggingFace export - -follow_up_validation: - - End-to-end LoRA merge + export round-trip test not yet in CI - - MoE expert LoRA functional test pending - -related_skills: - - recipe-recommender - - perf-parallelism-strategies - - perf-sequence-packing diff --git a/skills_contribution/skills/megatron-bridge-lora-sft/evals/evals.json b/skills_contribution/skills/megatron-bridge-lora-sft/evals/evals.json deleted file mode 100644 index 4835132..0000000 --- a/skills_contribution/skills/megatron-bridge-lora-sft/evals/evals.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "skill_name": "megatron-bridge-lora-sft", - "evals": [ - { - "id": 1, - "prompt": "How do I set up LoRA fine-tuning for Llama 3 8B in Megatron-Bridge with rank 32 targeting attention and MLP layers?", - "expected_output": "Python config snippet using llama3_8b_peft_config(), setting lora_rank=32, lora_alpha=64, and target_modules covering both attention (linear_qkv, linear_proj) and MLP (linear_fc1, linear_fc2) layers, plus the launch command.", - "assertions": [ - "Response includes llama3_8b_peft_config() recipe reference", - "Response sets lora_rank=32", - "Response sets target_modules with at least 4 entries covering attention and MLP", - "Response includes the uv run torch.distributed.run launch command", - "Response does not suggest match_all_linear=True as the solution" - ] - }, - { - "id": 2, - "prompt": "I'm trying to apply LoRA to a Qwen3-30B-A3B MoE model in Megatron-Bridge but my loss isn't changing — it looks like only attention layers are being adapted. What's wrong?", - "expected_output": "Diagnosis that match_all_linear silently skips MoE expert parameters (nn.Parameter, not nn.Linear), and the fix: explicitly setting target_modules to include gate_proj, up_proj, down_proj in addition to attention layers.", - "assertions": [ - "Response identifies the root cause as match_all_linear missing nn.Parameter expert weights", - "Response provides explicit target_modules list including expert layer names", - "Response references qwen3_30b_a3b_peft_config or equivalent MoE recipe", - "Response does not suggest increasing lora_rank as the primary fix" - ] - }, - { - "id": 3, - "prompt": "How do I merge LoRA adapters and export the result to HuggingFace format after training in Megatron-Bridge?", - "expected_output": "Two-step process: first merge_lora_weights() to combine adapter into base checkpoint, then export_to_hf() or CLI megatron_to_hf.py to produce the HF model directory.", - "assertions": [ - "Response shows merge step before export step", - "Response includes checkpoint_dir and output_dir arguments for merge", - "Response includes CLI or Python export command with model-type argument", - "Response warns that skipping merge produces a broken HF model" - ] - } - ] -}