From e6efd8e6fffc4a0491c03d862b0b657a552d8b25 Mon Sep 17 00:00:00 2001 From: dt-edu Date: Sun, 3 May 2026 16:40:50 -0400 Subject: [PATCH 1/3] Contribtuions --- .../skills/automodel-expert-lora/SKILL.md | 211 +++++++++++++++ .../skills/automodel-expert-lora/card.yaml | 33 +++ .../automodel-expert-lora/evals/evals.json | 39 +++ .../skills/megatron-bridge-lora-sft/SKILL.md | 247 ++++++++++++++++++ .../skills/megatron-bridge-lora-sft/card.yaml | 31 +++ .../megatron-bridge-lora-sft/evals/evals.json | 39 +++ 6 files changed, 600 insertions(+) create mode 100644 skills_contribution/skills/automodel-expert-lora/SKILL.md create mode 100644 skills_contribution/skills/automodel-expert-lora/card.yaml create mode 100644 skills_contribution/skills/automodel-expert-lora/evals/evals.json create mode 100644 skills_contribution/skills/megatron-bridge-lora-sft/SKILL.md create mode 100644 skills_contribution/skills/megatron-bridge-lora-sft/card.yaml create mode 100644 skills_contribution/skills/megatron-bridge-lora-sft/evals/evals.json diff --git a/skills_contribution/skills/automodel-expert-lora/SKILL.md b/skills_contribution/skills/automodel-expert-lora/SKILL.md new file mode 100644 index 0000000..ef7e723 --- /dev/null +++ b/skills_contribution/skills/automodel-expert-lora/SKILL.md @@ -0,0 +1,211 @@ +--- +name: automodel-expert-lora +description: Apply LoRA to fused MoE expert layers in NeMo AutoModel using HuggingFace Transformers v5+ models. Covers expert parameter detection, rank_pattern configuration, and the validation warning emitted when match_all_linear silently skips expert weights. Use when fine-tuning MoE models (Mixtral, Qwen3-MoE, DeepSeek) with LoRA and needing expert layers adapted, or when diagnosing why only attention layers are changing during MoE LoRA training. +when_to_use: LoRA on MoE models in NeMo AutoModel, expert weight adaptation, rank_pattern configuration, silent skip diagnosis; 'match_all_linear MoE', 'expert LoRA', 'fused expert parameters', 'target_modules MoE', 'Mixtral LoRA', 'Qwen3-MoE LoRA', 'DeepSeek LoRA', 'nn.Parameter expert'. +license: Apache-2.0 +--- + +# Expert LoRA for Fused MoE Models + +Card: @skills/automodel-expert-lora/card.yaml + +## The Problem + +In Transformers v5+, fused MoE models (Mixtral, Qwen3-MoE, DeepSeek-V3, +GLM-4.5) register expert weights as `nn.Parameter` inside a combined linear +layer — not as individual `nn.Linear` modules. `match_all_linear=True` iterates +`nn.Linear` only. Expert parameters are invisible to it. + +Result: LoRA appears to run, loss changes only from attention adaptation, and +the expert layers are never modified. No error is raised. + +As of NeMo AutoModel v0.x (issue #1151), `apply_lora()` now emits a +`UserWarning` when this condition is detected, and three utilities are +available to configure expert LoRA correctly. + +## Quick Decision + +| Model family | Expert param pattern | Correct target_modules | +|---|---|---| +| Mixtral | `block_sparse_moe.w1/w2/w3` | `["w1", "w2", "w3"]` | +| Qwen3-MoE | `mlp.experts.gate_proj/up_proj/down_proj` | `["gate_proj", "up_proj", "down_proj"]` | +| DeepSeek-V3 | `mlp.experts.gate_proj/up_proj/down_proj` | `["gate_proj", "up_proj", "down_proj"]` | +| GLM-4.5 | `mlp.experts.gate_proj/up_proj/down_proj` | `["gate_proj", "up_proj", "down_proj"]` | + +If unsure, run `detect_fused_moe_experts(model)` — it returns the correct +list for any supported model. 
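+If you want to confirm the fused layout on your own checkpoint before wiring
+up `target_modules`, a plain PyTorch check is enough; no NeMo utility is
+required. The snippet below is a sketch rather than part of the AutoModel
+API: it assumes only that expert weights live under names containing
+"experts" or "block_sparse_moe" (the patterns from the table above) and that
+`model` is your already-loaded MoE model.
+
+```python
+import torch.nn as nn
+
+def list_fused_expert_params(model: nn.Module) -> list[str]:
+    """Return expert-looking parameters that do not belong to any nn.Linear.
+
+    These are exactly the weights that match_all_linear=True will skip.
+    """
+    # Full dotted names of every parameter owned directly by an nn.Linear.
+    linear_param_names = {
+        f"{mod_name}.{p_name}"
+        for mod_name, mod in model.named_modules()
+        if isinstance(mod, nn.Linear)
+        for p_name, _ in mod.named_parameters(recurse=False)
+    }
+    expert_markers = ("experts", "block_sparse_moe")  # assumption: common patterns only
+    return [
+        name
+        for name, _ in model.named_parameters()
+        if any(marker in name for marker in expert_markers)
+        and name not in linear_param_names
+    ]
+
+# Non-empty output means explicit target_modules is required for expert LoRA.
+print(list_fused_expert_params(model))
+```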
+ +## Enablement + +### Step 1 — Detect expert parameter names + +```python +from nemo_automodel.components._peft.lora import detect_fused_moe_experts + +targets = detect_fused_moe_experts(model) +# e.g. returns ["w1", "w2", "w3"] for Mixtral +# ["down_proj", "gate_proj", "up_proj"] for Qwen3-MoE +``` + +### Step 2 — Build rank_pattern (optional: per-expert rank sizing) + +```python +from nemo_automodel.components._peft.lora import build_expert_lora_rank_pattern + +rank_pattern = build_expert_lora_rank_pattern( + model, + base_rank=16, + expert_rank_multiplier=0.5, # smaller rank for experts to save memory +) +# e.g. {"block_sparse_moe": 8} +``` + +### Step 3 — Apply LoRA with explicit target_modules + +```python +from peft import LoraConfig, get_peft_model +from nemo_automodel.components._peft.lora import detect_fused_moe_experts, build_expert_lora_rank_pattern + +targets = detect_fused_moe_experts(model) +rank_pattern = build_expert_lora_rank_pattern(model, base_rank=16) + +lora_config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=targets, + rank_pattern=rank_pattern, + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", +) + +model = get_peft_model(model, lora_config) +model.print_trainable_parameters() +# Should show expert layers in trainable params, not just attention +``` + +### Validation warning + +If `apply_lora()` is called with `match_all_linear=True` and no +`target_modules`, and the model has fused expert parameters, a `UserWarning` +is emitted with the detected parameter names and a fix snippet. Treat this +as an error — silent expert-skip produces wrong training dynamics. + +``` +UserWarning: [NeMo AutoModel] Fused MoE expert parameters detected but will +NOT be adapted by LoRA. + + Detected expert parameter names: ['w1', 'w2', 'w3'] + + To apply LoRA to expert layers, pass target_modules explicitly: + lora_config = LoraConfig( + target_modules=['w1', 'w2', 'w3'], + rank_pattern=build_expert_lora_rank_pattern(model, base_rank=16), + ) +``` + +## Code Anchors + +Expert detection utility: + +```python +# nemo_automodel/components/_peft/lora.py +_FUSED_EXPERT_PARAM_PATTERNS = ( + "block_sparse_moe", # Mixtral + "mlp.experts", # Qwen3-MoE, DeepSeek + "moe.experts", # generic + "ffn.experts", # generic +) + +def detect_fused_moe_experts(model: nn.Module) -> list[str]: + # inspects named_parameters() for known fused MoE patterns + # returns sorted list of leaf parameter name suffixes +``` + +Rank pattern builder: + +```python +# nemo_automodel/components/_peft/lora.py +def build_expert_lora_rank_pattern( + model: nn.Module, + base_rank: int, + expert_rank_multiplier: float = 1.0, +) -> dict[str, int]: + # maps MoE pattern keys to int(base_rank * multiplier) + # returns {} for dense models +``` + +Validation hook in apply_lora: + +```python +# nemo_automodel/components/_peft/lora.py +def apply_lora(model, lora_config, match_all_linear=False, target_modules=None): + validate_lora_config_for_moe(model, match_all_linear, target_modules) + # ... existing LoRA application logic ... +``` + +Tests: + +```python +# tests/unit/components/peft/test_expert_lora.py +class TestDetectFusedMoeExperts # 4 tests +class TestBuildExpertLoraRankPattern # 4 tests +class TestValidateLoraConfigForMoe # 3 tests +``` + +## Pitfalls + +1. **Silent failure with match_all_linear**: The most dangerous failure mode. + Training appears normal, loss decreases, but expert weights are never + adapted. 
Only detectable by checking `model.print_trainable_parameters()` + and confirming expert layers appear — or by observing that expert-heavy + tasks show no improvement vs attention-only LoRA. + +2. **rank_pattern key must match parameter path substring**: The keys in + `rank_pattern` are matched against full parameter names. Use the pattern + as returned by `detect_fused_moe_experts` or `build_expert_lora_rank_pattern` + — do not abbreviate. + +3. **expert_rank_multiplier < 1 floors at rank 1**: Setting + `expert_rank_multiplier=0.1` with `base_rank=4` gives rank 1, not 0. + This is intentional — rank 0 is invalid. Verify effective rank with + `model.print_trainable_parameters()`. + +4. **Dense model returns empty pattern**: `build_expert_lora_rank_pattern` + returns `{}` for dense models. Passing an empty `rank_pattern` to + `LoraConfig` is safe — PEFT falls back to the global `r` value. + +5. **target_modules suppresses the warning**: Once `target_modules` is + provided, `validate_lora_config_for_moe` returns immediately and + does not check whether the provided names actually cover expert layers. + Use `detect_fused_moe_experts` to generate the list rather than + guessing module names. + +## Verification + +Unit tests for all three utilities: + +```bash +pytest tests/unit/components/peft/test_expert_lora.py -v +``` + +Expected: `12 passed` + +Confirm expert layers are trainable after apply_lora: + +```python +model = get_peft_model(model, lora_config) +trainable = {n for n, p in model.named_parameters() if p.requires_grad} +expert_patterns = detect_fused_moe_experts(model.base_model) +assert any( + any(pat in name for pat in expert_patterns) + for name in trainable +), "No expert parameters in trainable set — check target_modules" +``` + +Success criteria: + +- `12 passed` on unit tests +- `model.print_trainable_parameters()` shows expert layer names in the + trainable parameter count +- No `UserWarning` about fused MoE expert skip when target_modules is set diff --git a/skills_contribution/skills/automodel-expert-lora/card.yaml b/skills_contribution/skills/automodel-expert-lora/card.yaml new file mode 100644 index 0000000..a377293 --- /dev/null +++ b/skills_contribution/skills/automodel-expert-lora/card.yaml @@ -0,0 +1,33 @@ +name: automodel-expert-lora +version: "1.0" +author: Doondi-Ashlesh +status: community +recommendation_level: stable + +description: > + Apply LoRA to fused MoE expert layers in NeMo AutoModel. Covers expert + parameter detection, rank_pattern configuration, and the validation warning + emitted when match_all_linear silently skips expert weights in Transformers v5+. 
+ +use_cases: + - LoRA fine-tuning on Mixtral models targeting expert layers + - LoRA fine-tuning on Qwen3-MoE, DeepSeek, GLM-4.5 with expert adaptation + - Diagnosing silent expert-skip in existing MoE LoRA training runs + - Per-expert rank sizing via rank_pattern + +known_limitations: + - Expert detection relies on known fused MoE parameter name patterns + - Custom MoE architectures with non-standard parameter naming require + manual target_modules specification + - rank_pattern key matching is substring-based; ambiguous keys may + match unintended layers + +follow_up_validation: + - End-to-end MoE LoRA fine-tune + eval loop test not yet in CI + - Functional test on real Mixtral or Qwen3-MoE checkpoint pending + +related_issues: + - https://github.com/NVIDIA-NeMo/Automodel/issues/1151 + +related_skills: + - megatron-bridge-lora-sft diff --git a/skills_contribution/skills/automodel-expert-lora/evals/evals.json b/skills_contribution/skills/automodel-expert-lora/evals/evals.json new file mode 100644 index 0000000..615e6e8 --- /dev/null +++ b/skills_contribution/skills/automodel-expert-lora/evals/evals.json @@ -0,0 +1,39 @@ +{ + "skill_name": "automodel-expert-lora", + "evals": [ + { + "id": 1, + "prompt": "I'm applying LoRA to a Mixtral model in NeMo AutoModel with match_all_linear=True but the loss is not improving on expert-heavy tasks. What's wrong and how do I fix it?", + "expected_output": "Diagnosis that match_all_linear=True only matches nn.Linear and misses Mixtral's fused expert parameters (nn.Parameter in block_sparse_moe). Fix: use detect_fused_moe_experts(model) to get target_modules=['w1','w2','w3'] and pass them explicitly to LoraConfig.", + "assertions": [ + "Response identifies nn.Parameter vs nn.Linear as the root cause", + "Response references detect_fused_moe_experts utility", + "Response provides target_modules=['w1','w2','w3'] for Mixtral", + "Response does not suggest increasing lora_rank as the primary fix", + "Response mentions UserWarning that would have been emitted" + ] + }, + { + "id": 2, + "prompt": "How do I apply different LoRA ranks to attention vs expert layers in a Qwen3-MoE model using NeMo AutoModel?", + "expected_output": "Use build_expert_lora_rank_pattern(model, base_rank=16, expert_rank_multiplier=0.5) to generate rank_pattern dict, then pass both target_modules (from detect_fused_moe_experts) and rank_pattern to LoraConfig.", + "assertions": [ + "Response includes build_expert_lora_rank_pattern with base_rank and expert_rank_multiplier", + "Response shows rank_pattern passed to LoraConfig", + "Response includes detect_fused_moe_experts to get target_modules", + "Response explains that expert_rank_multiplier < 1 reduces expert rank below base_rank" + ] + }, + { + "id": 3, + "prompt": "After applying LoRA to my Qwen3-MoE model, how do I verify that expert layers are actually being trained and not silently skipped?", + "expected_output": "Use model.print_trainable_parameters() and check that expert layer names appear, plus a code snippet using detect_fused_moe_experts to assert expert pattern names are in the trainable parameter set.", + "assertions": [ + "Response includes model.print_trainable_parameters() call", + "Response provides assertion or check using detect_fused_moe_experts", + "Response explains what to look for in the trainable parameter output", + "Response mentions the 12 passed unit test expectation as a baseline check" + ] + } + ] +} diff --git a/skills_contribution/skills/megatron-bridge-lora-sft/SKILL.md 
b/skills_contribution/skills/megatron-bridge-lora-sft/SKILL.md new file mode 100644 index 0000000..b43027d --- /dev/null +++ b/skills_contribution/skills/megatron-bridge-lora-sft/SKILL.md @@ -0,0 +1,247 @@ +--- +name: megatron-bridge-lora-sft +description: Configure and run LoRA, DoRA, and full SFT fine-tuning in Megatron-Bridge. Covers PEFT recipe selection, target module wiring, adapter merging, and HuggingFace checkpoint export. Use when applying LoRA or DoRA to any Bridge-supported model, setting up SFT datasets, debugging PEFT config errors, or exporting fine-tuned weights back to HuggingFace format. +when_to_use: LoRA or DoRA fine-tuning, SFT recipe setup, PEFT config errors, adapter merging, HuggingFace export after fine-tuning; 'peft_config', 'LoRA', 'DoRA', 'lora_rank', 'target_modules', 'merge_lora', 'sft_config', 'fine-tune', 'adapter export'. +license: Apache-2.0 +--- + +# LoRA / DoRA / SFT Fine-Tuning + +Stable docs: @docs/training/peft.md +Card: @skills/megatron-bridge-lora-sft/card.yaml + +## Quick Decision + +| Goal | Recipe type | Min GPUs | +|---|---|---| +| LoRA on 8B model | `*_peft_config` | 1 | +| LoRA on 70B model | `*_peft_config` | 8 | +| LoRA on 235B MoE | `*_peft_config` | 16 | +| Full SFT on 8B | `*_sft_config` | 2 | +| Full SFT on 70B | `*_sft_config` | 16 | +| Merge adapters + export to HF | Post-training step | Same as training | + +Use PEFT recipes when GPU count is the constraint. Use SFT recipes when +you need full gradient flow through all parameters. + +## Enablement + +### LoRA (minimal) + +```python +from megatron.bridge.recipes.llama import llama3_8b_peft_config + +cfg = llama3_8b_peft_config() + +# Default: rank=16, alpha=32, target_modules=["linear_qkv", "linear_proj"] +# Override rank and alpha: +cfg.peft.lora_rank = 32 +cfg.peft.lora_alpha = 64 + +# Add MLP layers to target modules: +cfg.peft.target_modules = [ + "linear_qkv", + "linear_proj", + "linear_fc1", + "linear_fc2", +] +``` + +### DoRA + +```python +cfg.peft.use_dora = True +cfg.peft.lora_rank = 16 +cfg.peft.lora_alpha = 16 # alpha == rank is the DoRA convention +``` + +### SFT (full fine-tune) + +```python +from megatron.bridge.recipes.llama import llama3_8b_sft_config + +cfg = llama3_8b_sft_config() +cfg.dataset.data_path = ["/data/train.jsonl"] +cfg.dataset.seq_length = 4096 +cfg.train.global_batch_size = 128 +cfg.train.micro_batch_size = 2 +cfg.optimizer.lr = 1e-5 +``` + +### MoE LoRA — expert layer targeting + +For MoE models (Qwen3-MoE, DeepSeek, GLM-4.5), expert weights are +registered as `nn.Parameter`, not `nn.Linear`. `match_all_linear=True` +silently skips them. 
Set `target_modules` explicitly: + +```python +cfg = qwen3_30b_a3b_peft_config() +cfg.peft.target_modules = [ + "linear_qkv", # attention + "linear_proj", # attention output + "gate_proj", # expert gate + "up_proj", # expert up + "down_proj", # expert down +] +cfg.peft.lora_rank = 16 +cfg.peft.lora_alpha = 32 +``` + +### Adapter merge and HuggingFace export + +```python +from megatron.bridge.peft.merge import merge_lora_weights +from megatron.bridge.convert import export_to_hf + +# Step 1: merge adapters into base weights +merge_lora_weights( + checkpoint_dir="/checkpoints/lora_run", + output_dir="/checkpoints/merged", +) + +# Step 2: export merged checkpoint to HuggingFace format +export_to_hf( + megatron_checkpoint="/checkpoints/merged", + hf_output_dir="/hf_model/", + model_type="llama3", +) +``` + +Or via CLI: + +```bash +python scripts/convert/megatron_to_hf.py \ + --checkpoint /checkpoints/merged \ + --output /hf_model/ \ + --model-type llama3 +``` + +## Entry Points + +```bash +# LoRA fine-tune (1 GPU) +uv run python -m torch.distributed.run --nproc_per_node=1 \ + scripts/training/run_recipe.py \ + --recipe llama3_8b_peft_config \ + --dataset llm-finetune + +# SFT fine-tune (2 GPUs) +uv run python -m torch.distributed.run --nproc_per_node=2 \ + scripts/training/run_recipe.py \ + --recipe llama3_8b_sft_config \ + --dataset llm-finetune + +# Override LoRA rank via CLI +uv run python -m torch.distributed.run --nproc_per_node=1 \ + scripts/training/run_recipe.py \ + --recipe llama3_8b_peft_config \ + --dataset llm-finetune \ + 'peft.lora_rank=32' \ + 'peft.lora_alpha=64' +``` + +## Code Anchors + +PEFT config definition: + +```python +# src/megatron/bridge/training/config.py +@dataclass +class PEFTConfig: + lora_rank: int = 16 + lora_alpha: float = 32 + lora_dropout: float = 0.0 + use_dora: bool = False + target_modules: list[str] = field(default_factory=lambda: ["linear_qkv", "linear_proj"]) + match_all_linear: bool = False +``` + +LoRA adapter application: + +```python +# src/megatron/bridge/training/peft.py +def apply_lora(model, peft_config): + # wraps target modules with LoraLinear / DoraLinear + # match_all_linear iterates nn.Linear only — misses nn.Parameter MoE experts +``` + +Merge utility: + +```python +# src/megatron/bridge/peft/merge.py +def merge_lora_weights(checkpoint_dir, output_dir): + # loads base + adapter shards, merges in-place, writes merged checkpoint +``` + +PEFT recipe examples: + +```python +# src/megatron/bridge/recipes/llama.py +def llama3_8b_peft_config() -> ConfigContainer: + cfg = llama3_8b_sft_config() + cfg.peft = PEFTConfig(lora_rank=16, lora_alpha=32) + cfg.model.tensor_model_parallel_size = 1 + cfg.model.pipeline_model_parallel_size = 1 + return cfg +``` + +## Pitfalls + +1. **MoE expert layers silently skipped**: `match_all_linear=True` only + matches `nn.Linear`. Expert weights in fused MoE blocks (Qwen3-MoE, + DeepSeek, GLM-4.5) are `nn.Parameter` — they are invisible to the + matcher. Always set `target_modules` explicitly for MoE models. + +2. **DoRA alpha convention**: DoRA expects `lora_alpha == lora_rank`. Using + the standard LoRA convention (`alpha = 2 * rank`) will not error but + produces suboptimal scaling. Set `alpha = rank` for DoRA. + +3. **Merge before export**: Exporting a LoRA checkpoint to HuggingFace + without merging produces a broken HF model — the base weights do not + include adapter contributions. Always run `merge_lora_weights()` first. + +4. 
**TP > 1 with PEFT**: LoRA adapters are sharded along with the base + layer when `tensor_model_parallel_size > 1`. The adapter shapes must be + consistent across TP ranks. Mismatched `lora_rank` between ranks causes + a shape error at initialization, not at the first forward pass. + +5. **SFT with packed sequences requires MBS=1**: When `PackedSequenceSpecs` + is active, setting `micro_batch_size > 1` raises a `ValueError`. PEFT + recipes default to `MBS=1`; SFT recipes may need explicit adjustment. + +6. **`calculate_per_token_loss` for SFT with CP**: When context parallelism + (`context_parallel_size > 1`) is enabled for SFT, set + `cfg.model.calculate_per_token_loss = True` and + `cfg.ddp.average_in_collective = False`. Omitting either causes + incorrect loss scaling across CP ranks. + +7. **LoRA dropout and inference**: `lora_dropout > 0` is training-only. + Ensure the adapter is saved and merged in eval mode or dropout + will be applied during export, corrupting merged weights. + +## Verification + +Unit test coverage for PEFT config validation: + +```bash +uv run python -m pytest tests/unit_tests/training/test_config.py \ + -k "peft or lora" -v +``` + +Smoke test LoRA on 1 GPU with mock data: + +```bash +CUDA_VISIBLE_DEVICES=0 uv run python -m torch.distributed.run --nproc_per_node=1 \ + scripts/training/run_recipe.py \ + --recipe llama3_8b_peft_config \ + --dataset llm-finetune \ + 'train.train_iters=5' \ + 'logger.log_interval=1' +``` + +Success criteria: + +- Exit code 0 +- Finite loss at iteration 5 (e.g. `lm loss: 9.8E+00`) +- Log shows `PEFTConfig` with expected `lora_rank` and `target_modules` +- No `KeyError` or shape mismatch during adapter initialization diff --git a/skills_contribution/skills/megatron-bridge-lora-sft/card.yaml b/skills_contribution/skills/megatron-bridge-lora-sft/card.yaml new file mode 100644 index 0000000..30b87eb --- /dev/null +++ b/skills_contribution/skills/megatron-bridge-lora-sft/card.yaml @@ -0,0 +1,31 @@ +name: megatron-bridge-lora-sft +version: "1.0" +author: Doondi-Ashlesh +status: community +recommendation_level: stable + +description: > + Configure and run LoRA, DoRA, and full SFT fine-tuning in Megatron-Bridge. + Covers PEFT recipe selection, target module wiring for dense and MoE models, + adapter merging, and HuggingFace checkpoint export. 
+ +use_cases: + - LoRA fine-tuning on dense models (Llama, Qwen3, Gemma) + - DoRA fine-tuning + - LoRA on MoE models with explicit expert layer targeting + - Full SFT fine-tuning + - Adapter merge and HuggingFace export + +known_limitations: + - Expert layer LoRA requires explicit target_modules for MoE models + - DoRA requires alpha == rank for correct weight decomposition scaling + - Adapter merge must precede HuggingFace export + +follow_up_validation: + - End-to-end LoRA merge + export round-trip test not yet in CI + - MoE expert LoRA functional test pending + +related_skills: + - recipe-recommender + - perf-parallelism-strategies + - perf-sequence-packing diff --git a/skills_contribution/skills/megatron-bridge-lora-sft/evals/evals.json b/skills_contribution/skills/megatron-bridge-lora-sft/evals/evals.json new file mode 100644 index 0000000..4835132 --- /dev/null +++ b/skills_contribution/skills/megatron-bridge-lora-sft/evals/evals.json @@ -0,0 +1,39 @@ +{ + "skill_name": "megatron-bridge-lora-sft", + "evals": [ + { + "id": 1, + "prompt": "How do I set up LoRA fine-tuning for Llama 3 8B in Megatron-Bridge with rank 32 targeting attention and MLP layers?", + "expected_output": "Python config snippet using llama3_8b_peft_config(), setting lora_rank=32, lora_alpha=64, and target_modules covering both attention (linear_qkv, linear_proj) and MLP (linear_fc1, linear_fc2) layers, plus the launch command.", + "assertions": [ + "Response includes llama3_8b_peft_config() recipe reference", + "Response sets lora_rank=32", + "Response sets target_modules with at least 4 entries covering attention and MLP", + "Response includes the uv run torch.distributed.run launch command", + "Response does not suggest match_all_linear=True as the solution" + ] + }, + { + "id": 2, + "prompt": "I'm trying to apply LoRA to a Qwen3-30B-A3B MoE model in Megatron-Bridge but my loss isn't changing — it looks like only attention layers are being adapted. 
What's wrong?", + "expected_output": "Diagnosis that match_all_linear silently skips MoE expert parameters (nn.Parameter, not nn.Linear), and the fix: explicitly setting target_modules to include gate_proj, up_proj, down_proj in addition to attention layers.", + "assertions": [ + "Response identifies the root cause as match_all_linear missing nn.Parameter expert weights", + "Response provides explicit target_modules list including expert layer names", + "Response references qwen3_30b_a3b_peft_config or equivalent MoE recipe", + "Response does not suggest increasing lora_rank as the primary fix" + ] + }, + { + "id": 3, + "prompt": "How do I merge LoRA adapters and export the result to HuggingFace format after training in Megatron-Bridge?", + "expected_output": "Two-step process: first merge_lora_weights() to combine adapter into base checkpoint, then export_to_hf() or CLI megatron_to_hf.py to produce the HF model directory.", + "assertions": [ + "Response shows merge step before export step", + "Response includes checkpoint_dir and output_dir arguments for merge", + "Response includes CLI or Python export command with model-type argument", + "Response warns that skipping merge produces a broken HF model" + ] + } + ] +} From fe7a22bd7928cb47931190885b54a645a2c0154f Mon Sep 17 00:00:00 2001 From: Doondi-Ashlesh Date: Sun, 3 May 2026 22:30:15 +0000 Subject: [PATCH 2/3] Add skills: automodel-expert-lora and megatron-bridge-lora-sft - Add automodel-expert-lora to skills/NeMo-AutoModel/ with component entry - Add megatron-bridge-lora-sft to skills/Megatron-Bridge/ - Each skill includes SKILL.md, card.yaml, and evals/evals.json Signed-off-by: Doondi-Ashlesh --- components.d/automodel.yml | 8 + .../megatron-bridge-lora-sft/SKILL.md | 201 ++++++++++++++++++ .../megatron-bridge-lora-sft/card.yaml | 80 +++++++ .../megatron-bridge-lora-sft/evals/evals.json | 44 ++++ .../automodel-expert-lora/SKILL.md | 201 ++++++++++++++++++ .../automodel-expert-lora/card.yaml | 76 +++++++ .../automodel-expert-lora/evals/evals.json | 44 ++++ 7 files changed, 654 insertions(+) create mode 100644 components.d/automodel.yml create mode 100644 skills/Megatron-Bridge/megatron-bridge-lora-sft/SKILL.md create mode 100644 skills/Megatron-Bridge/megatron-bridge-lora-sft/card.yaml create mode 100644 skills/Megatron-Bridge/megatron-bridge-lora-sft/evals/evals.json create mode 100644 skills/NeMo-AutoModel/automodel-expert-lora/SKILL.md create mode 100644 skills/NeMo-AutoModel/automodel-expert-lora/card.yaml create mode 100644 skills/NeMo-AutoModel/automodel-expert-lora/evals/evals.json diff --git a/components.d/automodel.yml b/components.d/automodel.yml new file mode 100644 index 0000000..22641c7 --- /dev/null +++ b/components.d/automodel.yml @@ -0,0 +1,8 @@ +name: NeMo-AutoModel +repo: NVIDIA-NeMo/Automodel +description: NeMo AutoModel — fine-tuning and training of HuggingFace-compatible models, including LoRA, PEFT, and MoE workflows. +skills: + - path: skills/ + catalog_dir: NeMo-AutoModel +links: + security: false diff --git a/skills/Megatron-Bridge/megatron-bridge-lora-sft/SKILL.md b/skills/Megatron-Bridge/megatron-bridge-lora-sft/SKILL.md new file mode 100644 index 0000000..1b914c3 --- /dev/null +++ b/skills/Megatron-Bridge/megatron-bridge-lora-sft/SKILL.md @@ -0,0 +1,201 @@ +--- +name: megatron-bridge-lora-sft +description: Configure and run LoRA, DoRA, and full SFT fine-tuning in Megatron-Bridge. 
Covers LoRA dataclass setup, target module wiring, normalize_moe_lora for MoE models, and adapter export via AutoBridge.export_adapter_ckpt. Use when applying LoRA or DoRA to any Bridge-supported model, setting up SFT datasets, or exporting fine-tuned adapters to HuggingFace PEFT format. +when_to_use: LoRA or DoRA fine-tuning, SFT recipe setup, normalize_moe_lora, MoE expert targeting, adapter export to HuggingFace, peft_scheme lora dora, dim alpha target_modules LoRA dataclass, torchrun recipe fine-tune, export_adapter_ckpt AutoBridge. +--- + +# LoRA / DoRA / SFT Fine-Tuning + +Card: @skills/megatron-bridge-lora-sft/card.yaml + +## Quick Decision + +| Goal | peft_scheme | Min GPUs | +|---|---|---| +| LoRA on 1B model | `"lora"` | 1 | +| DoRA on 1B model | `"dora"` | 1 | +| Full SFT on 8B | sft recipe | 2 | +| Export adapter to HF PEFT | CPU only | 0 GPUs | + +## Enablement + +### LoRA (minimal) + +```python +from megatron.bridge.recipes.llama import llama32_1b_peft_config + +config = llama32_1b_peft_config(peft_scheme="lora") + +# Default target_modules: ["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"] +# Default dim=32, alpha=32 + +# Override rank and alpha: +config.peft.dim = 16 +config.peft.alpha = 32 +``` + +Launch: + +```bash +torchrun --nproc_per_node=1 tutorials/recipes/llama/01_quickstart_finetune.py \ + --pretrained-checkpoint /path/to/checkpoint +``` + +### DoRA + +```python +config = llama32_1b_peft_config(peft_scheme="dora") +config.peft.dim = 16 +config.peft.alpha = 64 # DoRA default alpha is 64 +``` + +### MoE LoRA — expert layer targeting + +For MoE models, add expert projection names to `target_modules` and enable +`normalize_moe_lora` to scale down expert rank proportionally: + +```python +from megatron.bridge.peft.lora import LoRA + +lora = LoRA( + target_modules=[ + "linear_qkv", # attention + "linear_proj", # attention output + "linear_fc1", # MLP gate/up (dense fallback) + "linear_fc2", # MLP down (dense fallback) + ], + dim=32, + alpha=32, + normalize_moe_lora=True, # dim // moe_router_topk for expert layers +) +``` + +With `normalize_moe_lora=True`: +- Expert linear layers: effective dim = `dim // moe_router_topk` +- Non-expert layers: effective dim = `dim` (unchanged) +- `dim` must be evenly divisible by `moe_router_topk` + +### Adapter export to HuggingFace + +```python +from megatron.bridge import AutoBridge + +bridge = AutoBridge(hf_model_path="/path/to/hf/model") + +bridge.export_adapter_ckpt( + peft_checkpoint="/checkpoints/lora_run", + output_path="./my_adapter", +) +# produces: ./my_adapter/adapter_config.json +# ./my_adapter/adapter_model.safetensors +``` + +Or via CLI script: + +```bash +python examples/conversion/adapter/export_adapter.py \ + --hf-model-path /path/to/hf/model \ + --lora-checkpoint /checkpoints/lora_run \ + --output ./my_adapter +``` + +The exported adapter loads directly with HuggingFace PEFT: + +```python +from peft import PeftModel +model = PeftModel.from_pretrained(base_model, "./my_adapter") +``` + +Export runs on CPU — no GPU required. 
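+As a quick post-export sanity check, the exported directory can be loaded
+back with stock HuggingFace tooling. This is a sketch under assumptions: the
+paths are placeholders, and the base checkpoint is whatever HF model the
+bridge was constructed from.
+
+```python
+from pathlib import Path
+
+from peft import PeftModel
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+adapter_dir = Path("./my_adapter")  # output_path used in export_adapter_ckpt above
+assert (adapter_dir / "adapter_config.json").exists()
+assert (adapter_dir / "adapter_model.safetensors").exists()
+
+base = AutoModelForCausalLM.from_pretrained("/path/to/hf/model")
+tokenizer = AutoTokenizer.from_pretrained("/path/to/hf/model")
+model = PeftModel.from_pretrained(base, str(adapter_dir))
+
+# A short generation confirms the adapter shapes line up with the base model.
+inputs = tokenizer("Sanity check:", return_tensors="pt")
+print(tokenizer.decode(model.generate(**inputs, max_new_tokens=8)[0]))
+```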
+ +## Code Anchors + +LoRA dataclass: + +```python +# src/megatron/bridge/peft/lora.py +@dataclass +class LoRA(PEFT, ModuleMatcher): + target_modules: List[str] = field( + default_factory=lambda: ["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"] + ) + dim: int = 32 + alpha: int = 32 + dropout: float = 0.0 + dropout_position: Literal["pre", "post"] = "pre" + lora_A_init_method: str = "xavier" + lora_B_init_method: str = "zero" + a2a_experimental: bool = False + lora_dtype: torch.dtype = None + normalize_moe_lora: bool = False +``` + +DoRA dataclass: + +```python +# src/megatron/bridge/peft/dora.py +@dataclass +class DoRA(PEFT, ModuleMatcher): + target_modules: List[str] = field( + default_factory=lambda: ["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"] + ) + dim: int = 32 + alpha: int = 64 # DoRA default differs from LoRA default +``` + +Recipe function: + +```python +# tutorials/recipes/llama/01_quickstart_finetune.py +from megatron.bridge.recipes.llama import llama32_1b_peft_config + +config = llama32_1b_peft_config(peft_scheme="lora") # or "dora" +config.peft.dim = 16 +config.peft.alpha = 32 +``` + +Export: + +```python +# examples/conversion/adapter/export_adapter.py +bridge = AutoBridge(hf_model_path=...) +bridge.export_adapter_ckpt(peft_checkpoint=..., output_path=...) +``` + +## Pitfalls + +1. **MoE expert layers silently skipped without normalize_moe_lora or explicit targets**: + The default `target_modules` covers attention and MLP layers for dense models. + For MoE models, expert weights may not be covered — verify with a forward pass + that expert parameters have `requires_grad=True`. + +2. **DoRA alpha convention**: DoRA default `alpha=64`, not 32. Check the `DoRA` + dataclass defaults before overriding. + +3. **normalize_moe_lora requires evenly divisible dim**: `dim` must be divisible by + `moe_router_topk`. Indivisible `dim` values will error. + +4. **Export produces HF PEFT adapter — no merge step needed**: Unlike some frameworks, + `export_adapter_ckpt` produces `adapter_config.json` + `adapter_model.safetensors` + which load directly via `PeftModel.from_pretrained`. No separate merge step is + required before HuggingFace use. + +5. **TP > 1 with PEFT**: LoRA adapter shapes are sharded with the base layer when + `tensor_model_parallel_size > 1`. Adapter `dim` must be consistent across TP ranks. + Mismatched `dim` causes a shape error at initialization. + +## Verification + +Smoke test LoRA on 1 GPU with mock data: + +```bash +torchrun --nproc_per_node=1 tutorials/recipes/llama/01_quickstart_finetune.py \ + --pretrained-checkpoint /path/to/checkpoint +``` + +Success criteria: + +- Exit code 0 +- Finite loss in logs +- Adapter files generated: `adapter_config.json` + `adapter_model.safetensors` +- `PeftModel.from_pretrained(base_model, output_path)` loads without error diff --git a/skills/Megatron-Bridge/megatron-bridge-lora-sft/card.yaml b/skills/Megatron-Bridge/megatron-bridge-lora-sft/card.yaml new file mode 100644 index 0000000..6f5b635 --- /dev/null +++ b/skills/Megatron-Bridge/megatron-bridge-lora-sft/card.yaml @@ -0,0 +1,80 @@ +title: megatron_bridge_lora_sft +validated_on: "2026-05-03" +summary: > + Megatron-Bridge exposes LoRA and DoRA via the LoRA and DoRA dataclasses in + src/megatron/bridge/peft/. Default target_modules cover attention and MLP dense + layers. MoE expert rank normalization is via normalize_moe_lora=True (divides dim + by moe_router_topk for expert layers). 
Adapter export to HuggingFace PEFT format + uses AutoBridge.export_adapter_ckpt — produces adapter_config.json and + adapter_model.safetensors compatible with PeftModel.from_pretrained. + +validation_status: + lora_dataclass: + - code_verified + dora_dataclass: + - code_verified + normalize_moe_lora: + - code_verified + recipe_function_llama32_1b: + - code_verified + export_adapter_ckpt: + - code_verified + peft_model_load_after_export: + - code_verified + tp_peft_sharding: + - unclear + end_to_end_moe_lora_finetune: + - unclear + +feature_meaning: + lora_dataclass: > + LoRA(target_modules, dim=32, alpha=32, normalize_moe_lora=False). + Applied to model via peft_scheme="lora" in recipe functions. + dora_dataclass: > + DoRA(target_modules, dim=32, alpha=64). DoRA default alpha is 64, not 32. + Applied via peft_scheme="dora". + normalize_moe_lora: > + When True, expert linear layers use dim // moe_router_topk instead of full dim. + Non-expert layers keep full dim. dim must be evenly divisible by moe_router_topk. + export_adapter_ckpt: > + AutoBridge(hf_model_path).export_adapter_ckpt(peft_checkpoint, output_path). + Generates adapter_config.json + adapter_model.safetensors. Runs on CPU. + Output loads directly via PeftModel.from_pretrained(base_model, output_path). + +recommended_path: + lora_minimal: + recipe: llama32_1b_peft_config(peft_scheme="lora") + peft.dim: 16 + peft.alpha: 32 + dora: + recipe: llama32_1b_peft_config(peft_scheme="dora") + peft.dim: 16 + peft.alpha: 64 + moe_lora: + peft.normalize_moe_lora: true + peft.dim: 32 + note: dim must be divisible by moe_router_topk + export: + step_1: "bridge = AutoBridge(hf_model_path)" + step_2: "bridge.export_adapter_ckpt(peft_checkpoint, output_path)" + +known_constraints: + - DoRA default alpha is 64, not 32; overriding without checking defaults may produce incorrect scaling. + - normalize_moe_lora requires dim evenly divisible by moe_router_topk. + - TP > 1 with PEFT requires consistent adapter dim across all TP ranks; mismatch errors at init. + - export_adapter_ckpt produces HF PEFT adapter files — no separate merge step is needed before HF use. + +known_limitations: + - End-to-end MoE LoRA fine-tune on a real MoE checkpoint not confirmed in CI. + - TP > 1 PEFT sharding behavior not fully validated from source review. + +evidence: + - src/megatron/bridge/peft/lora.py + - src/megatron/bridge/peft/dora.py + - tutorials/recipes/llama/01_quickstart_finetune.py + - examples/conversion/adapter/export_adapter.py + +follow_up_validation: + - Add a checked-in end-to-end LoRA adapter export round-trip CI test. + - Confirm normalize_moe_lora on a real MoE checkpoint (DeepSeek, Qwen3-MoE). + - Clarify whether TP > 1 PEFT is validated on current container versions. diff --git a/skills/Megatron-Bridge/megatron-bridge-lora-sft/evals/evals.json b/skills/Megatron-Bridge/megatron-bridge-lora-sft/evals/evals.json new file mode 100644 index 0000000..309f3a5 --- /dev/null +++ b/skills/Megatron-Bridge/megatron-bridge-lora-sft/evals/evals.json @@ -0,0 +1,44 @@ +[ + { + "id": "lora-001-llama-dim-alpha-target-modules", + "question": "How do I set up LoRA fine-tuning for a Llama model in Megatron-Bridge with rank 16 targeting attention and MLP layers?", + "expected_skill": "megatron-bridge-lora-sft", + "expected_script": null, + "ground_truth": "Use llama32_1b_peft_config(peft_scheme='lora') as the starting recipe. Set config.peft.dim=16 and config.peft.alpha=32. 
The default target_modules already includes ['linear_qkv', 'linear_proj', 'linear_fc1', 'linear_fc2'] covering both attention and MLP layers. Launch with torchrun --nproc_per_node=1 tutorials/recipes/llama/01_quickstart_finetune.py --pretrained-checkpoint /path/to/checkpoint.", + "expected_behavior": [ + "References llama32_1b_peft_config(peft_scheme='lora') as the recipe entry point", + "Sets config.peft.dim=16 (not lora_rank)", + "Sets config.peft.alpha=32", + "Mentions that default target_modules covers linear_qkv, linear_proj, linear_fc1, linear_fc2", + "Includes the torchrun launch command with --pretrained-checkpoint" + ] + }, + { + "id": "lora-002-moe-normalize-moe-lora", + "question": "I'm applying LoRA to a MoE model in Megatron-Bridge but want expert layers to use a smaller rank than attention layers. How do I do that?", + "expected_skill": "megatron-bridge-lora-sft", + "expected_script": null, + "ground_truth": "Set normalize_moe_lora=True on the LoRA dataclass. With dim=32 and moe_router_topk=2, expert linear layers get effective dim = 32 // 2 = 16, while non-expert layers keep the full dim=32. dim must be evenly divisible by moe_router_topk. This is set directly on the LoRA dataclass: LoRA(dim=32, alpha=32, normalize_moe_lora=True, target_modules=[...]).", + "expected_behavior": [ + "References normalize_moe_lora=True as the mechanism for per-layer rank reduction", + "Explains that expert layer effective dim = dim // moe_router_topk", + "Explains that non-expert layers keep the full dim", + "Notes that dim must be evenly divisible by moe_router_topk", + "Shows the LoRA dataclass usage directly, not a fabricated PEFTConfig field" + ] + }, + { + "id": "lora-003-export-adapter-hf-peft", + "question": "How do I export my LoRA adapter checkpoint from Megatron-Bridge to HuggingFace PEFT format?", + "expected_skill": "megatron-bridge-lora-sft", + "expected_script": null, + "ground_truth": "Use AutoBridge(hf_model_path) and call bridge.export_adapter_ckpt(peft_checkpoint='/checkpoints/lora_run', output_path='./my_adapter'). This produces adapter_config.json and adapter_model.safetensors in the output directory. The export runs on CPU. The result loads directly with PeftModel.from_pretrained(base_model, './my_adapter'). Alternatively, use the CLI: python examples/conversion/adapter/export_adapter.py --hf-model-path ... --lora-checkpoint ... --output ...", + "expected_behavior": [ + "References AutoBridge and export_adapter_ckpt as the export mechanism", + "Shows the peft_checkpoint and output_path arguments", + "States the export produces adapter_config.json and adapter_model.safetensors", + "Mentions the export runs on CPU (no GPU needed)", + "Shows PeftModel.from_pretrained as the consumption pattern, or the CLI alternative" + ] + } +] diff --git a/skills/NeMo-AutoModel/automodel-expert-lora/SKILL.md b/skills/NeMo-AutoModel/automodel-expert-lora/SKILL.md new file mode 100644 index 0000000..83357b8 --- /dev/null +++ b/skills/NeMo-AutoModel/automodel-expert-lora/SKILL.md @@ -0,0 +1,201 @@ +--- +name: automodel-expert-lora +description: Apply LoRA to fused MoE expert layers in NeMo AutoModel. Covers PeftConfig setup, moe_rank_scaling for automatic per-expert rank reduction, target_modules wildcard matching for expert layers, and the GroupedExpertsTE limitation. Use when fine-tuning MoE models (models using GroupedExperts or GroupedExpertsDeepEP) with LoRA and needing expert layers adapted, or when diagnosing why expert weights are not being trained. 
+when_to_use: LoRA on MoE models in NeMo AutoModel, expert weight adaptation, moe_rank_scaling, target_modules for MoE, expert LoRA patching, GroupedExperts LoRA, dim scaling by n_activated_experts, apply_lora_to_linear_modules MoE. +--- + +# Expert LoRA for Fused MoE Models + +Card: @skills/automodel-expert-lora/card.yaml + +## The Problem + +In NeMo AutoModel, fused MoE expert layers (`GroupedExperts`, `GroupedExpertsDeepEP`) are +not `nn.Linear` modules. `match_all_linear=True` iterates `nn.Linear` only and silently +skips expert parameters. + +Result: LoRA runs but only attention or dense linear layers are adapted. Expert weights are +never modified. No error is raised. + +Additionally, `GroupedExpertsTE` (Transformer Engine expert layers) are not supported — +passing them raises `NotImplementedError`. + +## Quick Decision + +| Scenario | PeftConfig setting | +|---|---| +| Adapt expert layers | `target_modules=["*experts*"]` | +| Adapt specific expert name | `target_modules=["experts"]` | +| Reduce expert rank proportionally | `moe_rank_scaling=True` | +| Dense model only | `match_all_linear=True` (skips MoE) | +| TE expert layers | Not supported — raises NotImplementedError | + +## Enablement + +### Step 1 — Configure PeftConfig for expert layers + +```python +from nemo_automodel.components._peft.lora import PeftConfig, apply_lora_to_linear_modules + +peft_config = PeftConfig( + target_modules=["*experts*"], # wildcard matches modules with "experts" in the name + dim=16, + alpha=32, +) + +n_patched = apply_lora_to_linear_modules(model, peft_config) +# returns count of modules patched +``` + +For exact name matching instead of wildcard: + +```python +peft_config = PeftConfig( + target_modules=["experts"], # exact substring match + dim=8, + alpha=32, +) +``` + +### Step 2 — Use moe_rank_scaling for proportional rank reduction + +`moe_rank_scaling=True` divides `dim` by `n_activated_experts` for expert modules while +keeping the full `dim` for dense linear layers. This normalizes total adapter capacity. 
+ +```python +peft_config = PeftConfig( + target_modules=["experts", "linear"], # both MoE and dense + dim=16, + alpha=32, + moe_rank_scaling=True, +) +# model.config.n_activated_experts = 2 +# → expert lora_dim = 16 // 2 = 8 +# → linear lora_dim = 16 (unchanged) + +n_patched = apply_lora_to_linear_modules(model, peft_config) +``` + +Constraints: +- `dim` must be >= `n_activated_experts`; otherwise raises `ValueError` +- Non-evenly-divisible `dim` is allowed (floor division) but emits a warning +- `moe_rank_scaling=False` (default): all modules use the full `dim` + +### Step 3 — Verify expert layers are trainable + +```python +trainable = [(n, p.shape) for n, p in model.named_parameters() if p.requires_grad] +# Confirm expert parameter names appear in the list +assert any("experts" in n for n, _ in trainable), \ + "No expert parameters are trainable — check target_modules" +``` + +## Code Anchors + +PeftConfig and application function: + +```python +# nemo_automodel/components/_peft/lora.py +@dataclass +class PeftConfig: + target_modules: list = field(default_factory=list) + exclude_modules: list = field(default_factory=list) + match_all_linear: bool = False + dim: int = 8 + alpha: int = 32 + use_dora: bool = False + dropout: float = 0.0 + dropout_position: Literal["pre", "post"] = "post" + lora_A_init: str = "xavier" + lora_dtype: Optional[torch.dtype] = None + use_triton: bool = False + moe_rank_scaling: bool = False + +def apply_lora_to_linear_modules( + model: nn.Module, + peft_config: PeftConfig, + quantization_config=None, + skip_freeze: bool = False, +) -> int: + # patches matched nn.Linear and MoE expert modules + # returns count of patched modules +``` + +MoE module patching: + +```python +# nemo_automodel/components/_peft/lora.py +def patch_moe_module( + orig_module, + dim=8, + alpha=32, + lora_A_init_method="xavier", + lora_dtype=None, +) -> nn.Module: + # GroupedExperts → GroupedExpertsLoRA + # GroupedExpertsDeepEP → GroupedExpertsDeepEPLoRA + # GroupedExpertsTE → raises NotImplementedError +``` + +Tests: + +```python +# tests/unit_tests/_peft/test_lora_experts.py +test_apply_lora_equivalence # wildcard target_modules=["*experts*"] +test_apply_lora_patching_logic # exact and wildcard matching +test_moe_rank_scaling_basic # dim=16, n_activated_experts=2 → lora_dim=8 +test_moe_rank_scaling_default_off # moe_rank_scaling=False keeps full dim +test_moe_rank_scaling_floor_division_warning # non-divisible dim +test_moe_rank_scaling_dim_too_small # dim < n_activated_experts → ValueError +test_moe_rank_scaling_output_equivalence # zero-init B → identical baseline output +``` + +## Pitfalls + +1. **Silent expert-skip with match_all_linear**: `match_all_linear=True` iterates + `nn.Linear` modules only. Expert modules are not `nn.Linear` — they are silently + skipped. Training appears to run but only dense/attention layers are adapted. + Always set `target_modules` explicitly when working with MoE models. + +2. **GroupedExpertsTE not supported**: Models using Transformer Engine expert layers + (`GroupedExpertsTE`) raise `NotImplementedError` when `patch_moe_module` is called. + There is no workaround — TE expert LoRA is not implemented. + +3. **dim too small with moe_rank_scaling**: Setting `dim < n_activated_experts` with + `moe_rank_scaling=True` raises a `ValueError`. Increase `dim` to at least + `n_activated_experts`. + +4. **Floor division warning**: When `dim` is not evenly divisible by `n_activated_experts`, + floor division is applied and a warning is logged. 
The resulting `lora_dim` may be + unexpectedly small. Verify effective rank with trainable parameter inspection. + +5. **target_modules must match module names, not parameter names**: Wildcard patterns + like `"*experts*"` are matched against module names from `model.named_modules()`, + not parameter names from `model.named_parameters()`. + +## Verification + +Run unit tests: + +```bash +pytest tests/unit_tests/_peft/test_lora_experts.py -v +``` + +Confirm expert modules are patched: + +```python +peft_config = PeftConfig(target_modules=["*experts*"], dim=8) +n = apply_lora_to_linear_modules(model, peft_config) +assert n > 0, "No modules were patched — check target_modules pattern" + +trainable = {n for n, p in model.named_parameters() if p.requires_grad} +assert any("experts" in name for name in trainable), \ + "Expert parameters not in trainable set" +``` + +Success criteria: + +- Unit tests pass +- `n_patched > 0` after `apply_lora_to_linear_modules` +- Expert parameter names appear in `model.named_parameters()` with `requires_grad=True` +- No `NotImplementedError` (i.e., model does not use `GroupedExpertsTE`) diff --git a/skills/NeMo-AutoModel/automodel-expert-lora/card.yaml b/skills/NeMo-AutoModel/automodel-expert-lora/card.yaml new file mode 100644 index 0000000..ada1bba --- /dev/null +++ b/skills/NeMo-AutoModel/automodel-expert-lora/card.yaml @@ -0,0 +1,76 @@ +title: automodel_expert_lora +validated_on: "2026-05-03" +summary: > + NeMo AutoModel supports LoRA on fused MoE expert layers via PeftConfig with + target_modules and moe_rank_scaling. Setting target_modules=["*experts*"] patches + GroupedExperts and GroupedExpertsDeepEP modules. moe_rank_scaling=True divides dim + by n_activated_experts for expert layers while keeping full dim for dense layers. + GroupedExpertsTE raises NotImplementedError. Confirmed from source and unit tests. + +validation_status: + peft_config_dataclass: + - code_verified + apply_lora_to_linear_modules: + - code_verified + patch_moe_module_grouped_experts: + - code_verified + patch_moe_module_deepep: + - code_verified + patch_moe_module_te_not_supported: + - code_verified + moe_rank_scaling_dim_division: + - code_verified + wildcard_target_modules_matching: + - code_verified + unit_tests: + - code_verified + end_to_end_finetune: + - unclear + +feature_meaning: + peft_config: > + Dataclass controlling LoRA application. Key fields: target_modules (list of + module name patterns), dim (LoRA rank), alpha, match_all_linear (nn.Linear only), + moe_rank_scaling (divide dim by n_activated_experts for expert layers). + apply_lora_to_linear_modules: > + Main entry point. Freezes base model parameters, iterates named modules, + patches matched nn.Linear and MoE expert modules. Returns count of patched modules. + patch_moe_module: > + Patches a single MoE module. GroupedExperts → GroupedExpertsLoRA, + GroupedExpertsDeepEP → GroupedExpertsDeepEPLoRA, GroupedExpertsTE → NotImplementedError. + moe_rank_scaling: > + When True, expert lora_dim = dim // n_activated_experts; dense lora_dim = dim. + Requires dim >= n_activated_experts. Non-divisible dim uses floor division with warning. 
+ +recommended_path: + expert_lora_minimal: + target_modules: '["*experts*"]' + dim: 16 + alpha: 32 + expert_lora_with_rank_scaling: + target_modules: '["*experts*", "linear"]' + dim: 16 + alpha: 32 + moe_rank_scaling: true + verification: + check: "n = apply_lora_to_linear_modules(model, peft_config); assert n > 0" + +known_constraints: + - GroupedExpertsTE is not supported and raises NotImplementedError; no workaround exists. + - dim must be >= n_activated_experts when moe_rank_scaling=True or ValueError is raised. + - Non-divisible dim is allowed with moe_rank_scaling but uses floor division; a warning is logged. + - match_all_linear=True only iterates nn.Linear and silently skips all MoE expert modules. + - target_modules patterns are matched against module names, not parameter names. + +known_limitations: + - End-to-end MoE LoRA fine-tune on a real checkpoint (Mixtral, Qwen3-MoE) not in CI. + - DoRA (use_dora=True) with MoE modules: not confirmed from unit tests. + +evidence: + - nemo_automodel/components/_peft/lora.py + - tests/unit_tests/_peft/test_lora_experts.py + +follow_up_validation: + - Add an end-to-end CI test that runs apply_lora_to_linear_modules on a real MoE checkpoint. + - Confirm DoRA (use_dora=True) behavior with GroupedExperts modules. + - Confirm moe_rank_scaling behavior when n_activated_experts > 2. diff --git a/skills/NeMo-AutoModel/automodel-expert-lora/evals/evals.json b/skills/NeMo-AutoModel/automodel-expert-lora/evals/evals.json new file mode 100644 index 0000000..434632e --- /dev/null +++ b/skills/NeMo-AutoModel/automodel-expert-lora/evals/evals.json @@ -0,0 +1,44 @@ +[ + { + "id": "moe-lora-001-match-all-linear-silent-skip", + "question": "I'm applying LoRA to a MoE model in NeMo AutoModel with match_all_linear=True but expert layers are not being trained. What's happening?", + "expected_skill": "automodel-expert-lora", + "expected_script": null, + "ground_truth": "match_all_linear=True only iterates nn.Linear modules. MoE expert layers (GroupedExperts, GroupedExpertsDeepEP) are not nn.Linear and are silently skipped. Fix: set target_modules=['*experts*'] in PeftConfig instead of relying on match_all_linear. Confirm patching by checking that apply_lora_to_linear_modules returns a count > 0 and that expert parameter names appear with requires_grad=True.", + "expected_behavior": [ + "Identifies that match_all_linear only iterates nn.Linear and silently skips MoE expert modules", + "Explains that GroupedExperts and GroupedExpertsDeepEP are not nn.Linear", + "Provides target_modules=['*experts*'] as the fix in PeftConfig", + "Mentions apply_lora_to_linear_modules return count as a verification step", + "Does not suggest increasing dim as the primary fix" + ] + }, + { + "id": "moe-lora-002-moe-rank-scaling", + "question": "How do I apply a smaller LoRA rank to MoE expert layers than to dense attention layers in NeMo AutoModel?", + "expected_skill": "automodel-expert-lora", + "expected_script": null, + "ground_truth": "Use moe_rank_scaling=True in PeftConfig. With dim=16 and n_activated_experts=2, expert layers get lora_dim=8 (dim // n_activated_experts) while dense linear layers keep the full dim=16. Set target_modules to cover both expert and linear modules. dim must be >= n_activated_experts or a ValueError is raised. 
Non-divisible dim uses floor division with a warning.", + "expected_behavior": [ + "References moe_rank_scaling=True in PeftConfig as the mechanism", + "Explains that expert lora_dim = dim // n_activated_experts", + "Explains that dense linear layers keep the full dim", + "Notes that dim must be >= n_activated_experts or ValueError is raised", + "Notes that non-divisible dim uses floor division with a warning" + ] + }, + { + "id": "moe-lora-003-groupedexpertste-not-supported", + "question": "I'm trying to apply LoRA to a Transformer Engine MoE model in NeMo AutoModel and getting a NotImplementedError. Why?", + "expected_skill": "automodel-expert-lora", + "expected_script": null, + "ground_truth": "GroupedExpertsTE (Transformer Engine expert layers) are not supported by patch_moe_module. When apply_lora_to_linear_modules encounters a GroupedExpertsTE module, it raises NotImplementedError with 'LoRA is not supported for Transformer Engine'. There is no workaround — TE expert LoRA is not implemented. Only GroupedExperts (→ GroupedExpertsLoRA) and GroupedExpertsDeepEP (→ GroupedExpertsDeepEPLoRA) are supported.", + "expected_behavior": [ + "Identifies GroupedExpertsTE as the unsupported module type", + "States that patch_moe_module raises NotImplementedError for TE expert layers", + "States there is no workaround — TE expert LoRA is not implemented", + "Lists the supported types: GroupedExperts and GroupedExpertsDeepEP", + "Does not suggest a TE-specific workaround that does not exist" + ] + } +] From 4306256e25ac092eb30d57582fbb8ed7d054627c Mon Sep 17 00:00:00 2001 From: Doondi-Ashlesh Date: Sun, 3 May 2026 22:57:32 +0000 Subject: [PATCH 3/3] Remove skills_contribution staging folder Signed-off-by: Doondi-Ashlesh --- .../skills/automodel-expert-lora/SKILL.md | 211 --------------- .../skills/automodel-expert-lora/card.yaml | 33 --- .../automodel-expert-lora/evals/evals.json | 39 --- .../skills/megatron-bridge-lora-sft/SKILL.md | 247 ------------------ .../skills/megatron-bridge-lora-sft/card.yaml | 31 --- .../megatron-bridge-lora-sft/evals/evals.json | 39 --- 6 files changed, 600 deletions(-) delete mode 100644 skills_contribution/skills/automodel-expert-lora/SKILL.md delete mode 100644 skills_contribution/skills/automodel-expert-lora/card.yaml delete mode 100644 skills_contribution/skills/automodel-expert-lora/evals/evals.json delete mode 100644 skills_contribution/skills/megatron-bridge-lora-sft/SKILL.md delete mode 100644 skills_contribution/skills/megatron-bridge-lora-sft/card.yaml delete mode 100644 skills_contribution/skills/megatron-bridge-lora-sft/evals/evals.json diff --git a/skills_contribution/skills/automodel-expert-lora/SKILL.md b/skills_contribution/skills/automodel-expert-lora/SKILL.md deleted file mode 100644 index ef7e723..0000000 --- a/skills_contribution/skills/automodel-expert-lora/SKILL.md +++ /dev/null @@ -1,211 +0,0 @@ ---- -name: automodel-expert-lora -description: Apply LoRA to fused MoE expert layers in NeMo AutoModel using HuggingFace Transformers v5+ models. Covers expert parameter detection, rank_pattern configuration, and the validation warning emitted when match_all_linear silently skips expert weights. Use when fine-tuning MoE models (Mixtral, Qwen3-MoE, DeepSeek) with LoRA and needing expert layers adapted, or when diagnosing why only attention layers are changing during MoE LoRA training. 
-when_to_use: LoRA on MoE models in NeMo AutoModel, expert weight adaptation, rank_pattern configuration, silent skip diagnosis; 'match_all_linear MoE', 'expert LoRA', 'fused expert parameters', 'target_modules MoE', 'Mixtral LoRA', 'Qwen3-MoE LoRA', 'DeepSeek LoRA', 'nn.Parameter expert'. -license: Apache-2.0 ---- - -# Expert LoRA for Fused MoE Models - -Card: @skills/automodel-expert-lora/card.yaml - -## The Problem - -In Transformers v5+, fused MoE models (Mixtral, Qwen3-MoE, DeepSeek-V3, -GLM-4.5) register expert weights as `nn.Parameter` inside a combined linear -layer — not as individual `nn.Linear` modules. `match_all_linear=True` iterates -`nn.Linear` only. Expert parameters are invisible to it. - -Result: LoRA appears to run, loss changes only from attention adaptation, and -the expert layers are never modified. No error is raised. - -As of NeMo AutoModel v0.x (issue #1151), `apply_lora()` now emits a -`UserWarning` when this condition is detected, and three utilities are -available to configure expert LoRA correctly. - -## Quick Decision - -| Model family | Expert param pattern | Correct target_modules | -|---|---|---| -| Mixtral | `block_sparse_moe.w1/w2/w3` | `["w1", "w2", "w3"]` | -| Qwen3-MoE | `mlp.experts.gate_proj/up_proj/down_proj` | `["gate_proj", "up_proj", "down_proj"]` | -| DeepSeek-V3 | `mlp.experts.gate_proj/up_proj/down_proj` | `["gate_proj", "up_proj", "down_proj"]` | -| GLM-4.5 | `mlp.experts.gate_proj/up_proj/down_proj` | `["gate_proj", "up_proj", "down_proj"]` | - -If unsure, run `detect_fused_moe_experts(model)` — it returns the correct -list for any supported model. - -## Enablement - -### Step 1 — Detect expert parameter names - -```python -from nemo_automodel.components._peft.lora import detect_fused_moe_experts - -targets = detect_fused_moe_experts(model) -# e.g. returns ["w1", "w2", "w3"] for Mixtral -# ["down_proj", "gate_proj", "up_proj"] for Qwen3-MoE -``` - -### Step 2 — Build rank_pattern (optional: per-expert rank sizing) - -```python -from nemo_automodel.components._peft.lora import build_expert_lora_rank_pattern - -rank_pattern = build_expert_lora_rank_pattern( - model, - base_rank=16, - expert_rank_multiplier=0.5, # smaller rank for experts to save memory -) -# e.g. {"block_sparse_moe": 8} -``` - -### Step 3 — Apply LoRA with explicit target_modules - -```python -from peft import LoraConfig, get_peft_model -from nemo_automodel.components._peft.lora import detect_fused_moe_experts, build_expert_lora_rank_pattern - -targets = detect_fused_moe_experts(model) -rank_pattern = build_expert_lora_rank_pattern(model, base_rank=16) - -lora_config = LoraConfig( - r=16, - lora_alpha=32, - target_modules=targets, - rank_pattern=rank_pattern, - lora_dropout=0.05, - bias="none", - task_type="CAUSAL_LM", -) - -model = get_peft_model(model, lora_config) -model.print_trainable_parameters() -# Should show expert layers in trainable params, not just attention -``` - -### Validation warning - -If `apply_lora()` is called with `match_all_linear=True` and no -`target_modules`, and the model has fused expert parameters, a `UserWarning` -is emitted with the detected parameter names and a fix snippet. Treat this -as an error — silent expert-skip produces wrong training dynamics. - -``` -UserWarning: [NeMo AutoModel] Fused MoE expert parameters detected but will -NOT be adapted by LoRA. 
- - Detected expert parameter names: ['w1', 'w2', 'w3'] - - To apply LoRA to expert layers, pass target_modules explicitly: - lora_config = LoraConfig( - target_modules=['w1', 'w2', 'w3'], - rank_pattern=build_expert_lora_rank_pattern(model, base_rank=16), - ) -``` - -## Code Anchors - -Expert detection utility: - -```python -# nemo_automodel/components/_peft/lora.py -_FUSED_EXPERT_PARAM_PATTERNS = ( - "block_sparse_moe", # Mixtral - "mlp.experts", # Qwen3-MoE, DeepSeek - "moe.experts", # generic - "ffn.experts", # generic -) - -def detect_fused_moe_experts(model: nn.Module) -> list[str]: - # inspects named_parameters() for known fused MoE patterns - # returns sorted list of leaf parameter name suffixes -``` - -Rank pattern builder: - -```python -# nemo_automodel/components/_peft/lora.py -def build_expert_lora_rank_pattern( - model: nn.Module, - base_rank: int, - expert_rank_multiplier: float = 1.0, -) -> dict[str, int]: - # maps MoE pattern keys to int(base_rank * multiplier) - # returns {} for dense models -``` - -Validation hook in apply_lora: - -```python -# nemo_automodel/components/_peft/lora.py -def apply_lora(model, lora_config, match_all_linear=False, target_modules=None): - validate_lora_config_for_moe(model, match_all_linear, target_modules) - # ... existing LoRA application logic ... -``` - -Tests: - -```python -# tests/unit/components/peft/test_expert_lora.py -class TestDetectFusedMoeExperts # 4 tests -class TestBuildExpertLoraRankPattern # 4 tests -class TestValidateLoraConfigForMoe # 3 tests -``` - -## Pitfalls - -1. **Silent failure with match_all_linear**: The most dangerous failure mode. - Training appears normal, loss decreases, but expert weights are never - adapted. Only detectable by checking `model.print_trainable_parameters()` - and confirming expert layers appear — or by observing that expert-heavy - tasks show no improvement vs attention-only LoRA. - -2. **rank_pattern key must match parameter path substring**: The keys in - `rank_pattern` are matched against full parameter names. Use the pattern - as returned by `detect_fused_moe_experts` or `build_expert_lora_rank_pattern` - — do not abbreviate. - -3. **expert_rank_multiplier < 1 floors at rank 1**: Setting - `expert_rank_multiplier=0.1` with `base_rank=4` gives rank 1, not 0. - This is intentional — rank 0 is invalid. Verify effective rank with - `model.print_trainable_parameters()`. - -4. **Dense model returns empty pattern**: `build_expert_lora_rank_pattern` - returns `{}` for dense models. Passing an empty `rank_pattern` to - `LoraConfig` is safe — PEFT falls back to the global `r` value. - -5. **target_modules suppresses the warning**: Once `target_modules` is - provided, `validate_lora_config_for_moe` returns immediately and - does not check whether the provided names actually cover expert layers. - Use `detect_fused_moe_experts` to generate the list rather than - guessing module names. 
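-
-A minimal sketch, assuming the `build_expert_lora_rank_pattern` helper above,
-that ties pitfalls 2 and 5 together: a pre-flight check confirming every
-`rank_pattern` key matches at least one parameter path before training starts.
-
-```python
-from nemo_automodel.components._peft.lora import build_expert_lora_rank_pattern
-
-rank_pattern = build_expert_lora_rank_pattern(model, base_rank=16)
-param_names = [name for name, _ in model.named_parameters()]
-for key in rank_pattern:
-    # substring check mirrors the key matching described in pitfall 2;
-    # an unmatched key means the pattern would silently apply to nothing
-    assert any(key in name for name in param_names), (
-        f"rank_pattern key {key!r} matches no parameter; check target_modules"
-    )
-```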
- -## Verification - -Unit tests for all three utilities: - -```bash -pytest tests/unit/components/peft/test_expert_lora.py -v -``` - -Expected: `12 passed` - -Confirm expert layers are trainable after apply_lora: - -```python -model = get_peft_model(model, lora_config) -trainable = {n for n, p in model.named_parameters() if p.requires_grad} -expert_patterns = detect_fused_moe_experts(model.base_model) -assert any( - any(pat in name for pat in expert_patterns) - for name in trainable -), "No expert parameters in trainable set — check target_modules" -``` - -Success criteria: - -- `12 passed` on unit tests -- `model.print_trainable_parameters()` shows expert layer names in the - trainable parameter count -- No `UserWarning` about fused MoE expert skip when target_modules is set diff --git a/skills_contribution/skills/automodel-expert-lora/card.yaml b/skills_contribution/skills/automodel-expert-lora/card.yaml deleted file mode 100644 index a377293..0000000 --- a/skills_contribution/skills/automodel-expert-lora/card.yaml +++ /dev/null @@ -1,33 +0,0 @@ -name: automodel-expert-lora -version: "1.0" -author: Doondi-Ashlesh -status: community -recommendation_level: stable - -description: > - Apply LoRA to fused MoE expert layers in NeMo AutoModel. Covers expert - parameter detection, rank_pattern configuration, and the validation warning - emitted when match_all_linear silently skips expert weights in Transformers v5+. - -use_cases: - - LoRA fine-tuning on Mixtral models targeting expert layers - - LoRA fine-tuning on Qwen3-MoE, DeepSeek, GLM-4.5 with expert adaptation - - Diagnosing silent expert-skip in existing MoE LoRA training runs - - Per-expert rank sizing via rank_pattern - -known_limitations: - - Expert detection relies on known fused MoE parameter name patterns - - Custom MoE architectures with non-standard parameter naming require - manual target_modules specification - - rank_pattern key matching is substring-based; ambiguous keys may - match unintended layers - -follow_up_validation: - - End-to-end MoE LoRA fine-tune + eval loop test not yet in CI - - Functional test on real Mixtral or Qwen3-MoE checkpoint pending - -related_issues: - - https://github.com/NVIDIA-NeMo/Automodel/issues/1151 - -related_skills: - - megatron-bridge-lora-sft diff --git a/skills_contribution/skills/automodel-expert-lora/evals/evals.json b/skills_contribution/skills/automodel-expert-lora/evals/evals.json deleted file mode 100644 index 615e6e8..0000000 --- a/skills_contribution/skills/automodel-expert-lora/evals/evals.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "skill_name": "automodel-expert-lora", - "evals": [ - { - "id": 1, - "prompt": "I'm applying LoRA to a Mixtral model in NeMo AutoModel with match_all_linear=True but the loss is not improving on expert-heavy tasks. What's wrong and how do I fix it?", - "expected_output": "Diagnosis that match_all_linear=True only matches nn.Linear and misses Mixtral's fused expert parameters (nn.Parameter in block_sparse_moe). 
Fix: use detect_fused_moe_experts(model) to get target_modules=['w1','w2','w3'] and pass them explicitly to LoraConfig.", - "assertions": [ - "Response identifies nn.Parameter vs nn.Linear as the root cause", - "Response references detect_fused_moe_experts utility", - "Response provides target_modules=['w1','w2','w3'] for Mixtral", - "Response does not suggest increasing lora_rank as the primary fix", - "Response mentions UserWarning that would have been emitted" - ] - }, - { - "id": 2, - "prompt": "How do I apply different LoRA ranks to attention vs expert layers in a Qwen3-MoE model using NeMo AutoModel?", - "expected_output": "Use build_expert_lora_rank_pattern(model, base_rank=16, expert_rank_multiplier=0.5) to generate rank_pattern dict, then pass both target_modules (from detect_fused_moe_experts) and rank_pattern to LoraConfig.", - "assertions": [ - "Response includes build_expert_lora_rank_pattern with base_rank and expert_rank_multiplier", - "Response shows rank_pattern passed to LoraConfig", - "Response includes detect_fused_moe_experts to get target_modules", - "Response explains that expert_rank_multiplier < 1 reduces expert rank below base_rank" - ] - }, - { - "id": 3, - "prompt": "After applying LoRA to my Qwen3-MoE model, how do I verify that expert layers are actually being trained and not silently skipped?", - "expected_output": "Use model.print_trainable_parameters() and check that expert layer names appear, plus a code snippet using detect_fused_moe_experts to assert expert pattern names are in the trainable parameter set.", - "assertions": [ - "Response includes model.print_trainable_parameters() call", - "Response provides assertion or check using detect_fused_moe_experts", - "Response explains what to look for in the trainable parameter output", - "Response mentions the 12 passed unit test expectation as a baseline check" - ] - } - ] -} diff --git a/skills_contribution/skills/megatron-bridge-lora-sft/SKILL.md b/skills_contribution/skills/megatron-bridge-lora-sft/SKILL.md deleted file mode 100644 index b43027d..0000000 --- a/skills_contribution/skills/megatron-bridge-lora-sft/SKILL.md +++ /dev/null @@ -1,247 +0,0 @@ ---- -name: megatron-bridge-lora-sft -description: Configure and run LoRA, DoRA, and full SFT fine-tuning in Megatron-Bridge. Covers PEFT recipe selection, target module wiring, adapter merging, and HuggingFace checkpoint export. Use when applying LoRA or DoRA to any Bridge-supported model, setting up SFT datasets, debugging PEFT config errors, or exporting fine-tuned weights back to HuggingFace format. -when_to_use: LoRA or DoRA fine-tuning, SFT recipe setup, PEFT config errors, adapter merging, HuggingFace export after fine-tuning; 'peft_config', 'LoRA', 'DoRA', 'lora_rank', 'target_modules', 'merge_lora', 'sft_config', 'fine-tune', 'adapter export'. -license: Apache-2.0 ---- - -# LoRA / DoRA / SFT Fine-Tuning - -Stable docs: @docs/training/peft.md -Card: @skills/megatron-bridge-lora-sft/card.yaml - -## Quick Decision - -| Goal | Recipe type | Min GPUs | -|---|---|---| -| LoRA on 8B model | `*_peft_config` | 1 | -| LoRA on 70B model | `*_peft_config` | 8 | -| LoRA on 235B MoE | `*_peft_config` | 16 | -| Full SFT on 8B | `*_sft_config` | 2 | -| Full SFT on 70B | `*_sft_config` | 16 | -| Merge adapters + export to HF | Post-training step | Same as training | - -Use PEFT recipes when GPU count is the constraint. Use SFT recipes when -you need full gradient flow through all parameters. 
- -## Enablement - -### LoRA (minimal) - -```python -from megatron.bridge.recipes.llama import llama3_8b_peft_config - -cfg = llama3_8b_peft_config() - -# Default: rank=16, alpha=32, target_modules=["linear_qkv", "linear_proj"] -# Override rank and alpha: -cfg.peft.lora_rank = 32 -cfg.peft.lora_alpha = 64 - -# Add MLP layers to target modules: -cfg.peft.target_modules = [ - "linear_qkv", - "linear_proj", - "linear_fc1", - "linear_fc2", -] -``` - -### DoRA - -```python -cfg.peft.use_dora = True -cfg.peft.lora_rank = 16 -cfg.peft.lora_alpha = 16 # alpha == rank is the DoRA convention -``` - -### SFT (full fine-tune) - -```python -from megatron.bridge.recipes.llama import llama3_8b_sft_config - -cfg = llama3_8b_sft_config() -cfg.dataset.data_path = ["/data/train.jsonl"] -cfg.dataset.seq_length = 4096 -cfg.train.global_batch_size = 128 -cfg.train.micro_batch_size = 2 -cfg.optimizer.lr = 1e-5 -``` - -### MoE LoRA — expert layer targeting - -For MoE models (Qwen3-MoE, DeepSeek, GLM-4.5), expert weights are -registered as `nn.Parameter`, not `nn.Linear`. `match_all_linear=True` -silently skips them. Set `target_modules` explicitly: - -```python -cfg = qwen3_30b_a3b_peft_config() -cfg.peft.target_modules = [ - "linear_qkv", # attention - "linear_proj", # attention output - "gate_proj", # expert gate - "up_proj", # expert up - "down_proj", # expert down -] -cfg.peft.lora_rank = 16 -cfg.peft.lora_alpha = 32 -``` - -### Adapter merge and HuggingFace export - -```python -from megatron.bridge.peft.merge import merge_lora_weights -from megatron.bridge.convert import export_to_hf - -# Step 1: merge adapters into base weights -merge_lora_weights( - checkpoint_dir="/checkpoints/lora_run", - output_dir="/checkpoints/merged", -) - -# Step 2: export merged checkpoint to HuggingFace format -export_to_hf( - megatron_checkpoint="/checkpoints/merged", - hf_output_dir="/hf_model/", - model_type="llama3", -) -``` - -Or via CLI: - -```bash -python scripts/convert/megatron_to_hf.py \ - --checkpoint /checkpoints/merged \ - --output /hf_model/ \ - --model-type llama3 -``` - -## Entry Points - -```bash -# LoRA fine-tune (1 GPU) -uv run python -m torch.distributed.run --nproc_per_node=1 \ - scripts/training/run_recipe.py \ - --recipe llama3_8b_peft_config \ - --dataset llm-finetune - -# SFT fine-tune (2 GPUs) -uv run python -m torch.distributed.run --nproc_per_node=2 \ - scripts/training/run_recipe.py \ - --recipe llama3_8b_sft_config \ - --dataset llm-finetune - -# Override LoRA rank via CLI -uv run python -m torch.distributed.run --nproc_per_node=1 \ - scripts/training/run_recipe.py \ - --recipe llama3_8b_peft_config \ - --dataset llm-finetune \ - 'peft.lora_rank=32' \ - 'peft.lora_alpha=64' -``` - -## Code Anchors - -PEFT config definition: - -```python -# src/megatron/bridge/training/config.py -@dataclass -class PEFTConfig: - lora_rank: int = 16 - lora_alpha: float = 32 - lora_dropout: float = 0.0 - use_dora: bool = False - target_modules: list[str] = field(default_factory=lambda: ["linear_qkv", "linear_proj"]) - match_all_linear: bool = False -``` - -LoRA adapter application: - -```python -# src/megatron/bridge/training/peft.py -def apply_lora(model, peft_config): - # wraps target modules with LoraLinear / DoraLinear - # match_all_linear iterates nn.Linear only — misses nn.Parameter MoE experts -``` - -Merge utility: - -```python -# src/megatron/bridge/peft/merge.py -def merge_lora_weights(checkpoint_dir, output_dir): - # loads base + adapter shards, merges in-place, writes merged checkpoint -``` - -PEFT recipe 
examples: - -```python -# src/megatron/bridge/recipes/llama.py -def llama3_8b_peft_config() -> ConfigContainer: - cfg = llama3_8b_sft_config() - cfg.peft = PEFTConfig(lora_rank=16, lora_alpha=32) - cfg.model.tensor_model_parallel_size = 1 - cfg.model.pipeline_model_parallel_size = 1 - return cfg -``` - -## Pitfalls - -1. **MoE expert layers silently skipped**: `match_all_linear=True` only - matches `nn.Linear`. Expert weights in fused MoE blocks (Qwen3-MoE, - DeepSeek, GLM-4.5) are `nn.Parameter` — they are invisible to the - matcher. Always set `target_modules` explicitly for MoE models. - -2. **DoRA alpha convention**: DoRA expects `lora_alpha == lora_rank`. Using - the standard LoRA convention (`alpha = 2 * rank`) will not error but - produces suboptimal scaling. Set `alpha = rank` for DoRA. - -3. **Merge before export**: Exporting a LoRA checkpoint to HuggingFace - without merging produces a broken HF model — the base weights do not - include adapter contributions. Always run `merge_lora_weights()` first. - -4. **TP > 1 with PEFT**: LoRA adapters are sharded along with the base - layer when `tensor_model_parallel_size > 1`. The adapter shapes must be - consistent across TP ranks. Mismatched `lora_rank` between ranks causes - a shape error at initialization, not at the first forward pass. - -5. **SFT with packed sequences requires MBS=1**: When `PackedSequenceSpecs` - is active, setting `micro_batch_size > 1` raises a `ValueError`. PEFT - recipes default to `MBS=1`; SFT recipes may need explicit adjustment. - -6. **`calculate_per_token_loss` for SFT with CP**: When context parallelism - (`context_parallel_size > 1`) is enabled for SFT, set - `cfg.model.calculate_per_token_loss = True` and - `cfg.ddp.average_in_collective = False`. Omitting either causes - incorrect loss scaling across CP ranks. - -7. **LoRA dropout and inference**: `lora_dropout > 0` is training-only. - Ensure the adapter is saved and merged in eval mode or dropout - will be applied during export, corrupting merged weights. - -## Verification - -Unit test coverage for PEFT config validation: - -```bash -uv run python -m pytest tests/unit_tests/training/test_config.py \ - -k "peft or lora" -v -``` - -Smoke test LoRA on 1 GPU with mock data: - -```bash -CUDA_VISIBLE_DEVICES=0 uv run python -m torch.distributed.run --nproc_per_node=1 \ - scripts/training/run_recipe.py \ - --recipe llama3_8b_peft_config \ - --dataset llm-finetune \ - 'train.train_iters=5' \ - 'logger.log_interval=1' -``` - -Success criteria: - -- Exit code 0 -- Finite loss at iteration 5 (e.g. `lm loss: 9.8E+00`) -- Log shows `PEFTConfig` with expected `lora_rank` and `target_modules` -- No `KeyError` or shape mismatch during adapter initialization diff --git a/skills_contribution/skills/megatron-bridge-lora-sft/card.yaml b/skills_contribution/skills/megatron-bridge-lora-sft/card.yaml deleted file mode 100644 index 30b87eb..0000000 --- a/skills_contribution/skills/megatron-bridge-lora-sft/card.yaml +++ /dev/null @@ -1,31 +0,0 @@ -name: megatron-bridge-lora-sft -version: "1.0" -author: Doondi-Ashlesh -status: community -recommendation_level: stable - -description: > - Configure and run LoRA, DoRA, and full SFT fine-tuning in Megatron-Bridge. - Covers PEFT recipe selection, target module wiring for dense and MoE models, - adapter merging, and HuggingFace checkpoint export. 
- -use_cases: - - LoRA fine-tuning on dense models (Llama, Qwen3, Gemma) - - DoRA fine-tuning - - LoRA on MoE models with explicit expert layer targeting - - Full SFT fine-tuning - - Adapter merge and HuggingFace export - -known_limitations: - - Expert layer LoRA requires explicit target_modules for MoE models - - DoRA requires alpha == rank for correct weight decomposition scaling - - Adapter merge must precede HuggingFace export - -follow_up_validation: - - End-to-end LoRA merge + export round-trip test not yet in CI - - MoE expert LoRA functional test pending - -related_skills: - - recipe-recommender - - perf-parallelism-strategies - - perf-sequence-packing diff --git a/skills_contribution/skills/megatron-bridge-lora-sft/evals/evals.json b/skills_contribution/skills/megatron-bridge-lora-sft/evals/evals.json deleted file mode 100644 index 4835132..0000000 --- a/skills_contribution/skills/megatron-bridge-lora-sft/evals/evals.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "skill_name": "megatron-bridge-lora-sft", - "evals": [ - { - "id": 1, - "prompt": "How do I set up LoRA fine-tuning for Llama 3 8B in Megatron-Bridge with rank 32 targeting attention and MLP layers?", - "expected_output": "Python config snippet using llama3_8b_peft_config(), setting lora_rank=32, lora_alpha=64, and target_modules covering both attention (linear_qkv, linear_proj) and MLP (linear_fc1, linear_fc2) layers, plus the launch command.", - "assertions": [ - "Response includes llama3_8b_peft_config() recipe reference", - "Response sets lora_rank=32", - "Response sets target_modules with at least 4 entries covering attention and MLP", - "Response includes the uv run torch.distributed.run launch command", - "Response does not suggest match_all_linear=True as the solution" - ] - }, - { - "id": 2, - "prompt": "I'm trying to apply LoRA to a Qwen3-30B-A3B MoE model in Megatron-Bridge but my loss isn't changing — it looks like only attention layers are being adapted. What's wrong?", - "expected_output": "Diagnosis that match_all_linear silently skips MoE expert parameters (nn.Parameter, not nn.Linear), and the fix: explicitly setting target_modules to include gate_proj, up_proj, down_proj in addition to attention layers.", - "assertions": [ - "Response identifies the root cause as match_all_linear missing nn.Parameter expert weights", - "Response provides explicit target_modules list including expert layer names", - "Response references qwen3_30b_a3b_peft_config or equivalent MoE recipe", - "Response does not suggest increasing lora_rank as the primary fix" - ] - }, - { - "id": 3, - "prompt": "How do I merge LoRA adapters and export the result to HuggingFace format after training in Megatron-Bridge?", - "expected_output": "Two-step process: first merge_lora_weights() to combine adapter into base checkpoint, then export_to_hf() or CLI megatron_to_hf.py to produce the HF model directory.", - "assertions": [ - "Response shows merge step before export step", - "Response includes checkpoint_dir and output_dir arguments for merge", - "Response includes CLI or Python export command with model-type argument", - "Response warns that skipping merge produces a broken HF model" - ] - } - ] -}