# ERNIE-4.5 Fine-tuning

This notebook demonstrates fine-tuning ERNIE-4.5-21B on the Nemotron-RL Math dataset
using Modal infrastructure.

- Author: Created for ERNIE AI Developer Challenge
- Dataset: nvidia/Nemotron-RL-math-OpenMathReasoning
- Model: unsloth/ERNIE-4.5-21B-A3B-PT

In [None]:
# Install Dependencies
%uv pip install unsloth[cu128-torch270]==2025.7.8
%uv pip install transformers==4.56.2
%uv pip install datasets==3.6.0
%uv pip install trl==0.22.2
%uv pip install wandb==0.21.0

# Imports used by the cells below.
# Unsloth is imported before transformers/trl, as Unsloth recommends.
from datetime import datetime
from pathlib import Path

from unsloth import FastModel
from unsloth.chat_templates import train_on_responses_only

import datasets
import torch
from datasets import load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer

In [None]:
# Configuration
# Every tunable value for the run lives in this cell; later cells only read these names.

# --- Model ---
MODEL_NAME = "unsloth/ERNIE-4.5-21B-A3B-PT"
MAX_SEQ_LENGTH = 2048
LOAD_IN_4BIT = True

# --- Dataset ---
DATASET_NAME = "nvidia/Nemotron-RL-math-OpenMathReasoning"
MAX_TRAINING_SAMPLES = 8_000
EVAL_SPLIT_RATIO = 0.05

# --- LoRA ---
LORA_R = 16
LORA_ALPHA = 16
LORA_DROPOUT = 0.0

# --- Training hyperparameters ---
BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 2
MAX_STEPS = 900
SAVE_STEPS = 100
EVAL_STEPS = 100
LOGGING_STEPS = 10
LEARNING_RATE = 2e-4

# --- Experiment ---
SEED = 42
# The timestamp suffix gives every run its own name.
EXPERIMENT_NAME = f"ernie45-math-{datetime.now():%Y%m%d-%H%M%S}"

print(
    f"Experiment: {EXPERIMENT_NAME}",
    f"Training samples: {MAX_TRAINING_SAMPLES:,}",
    f"Max steps: {MAX_STEPS}",
    f"Effective batch size: {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}",
    sep="\n",
)

In [None]:
# Load Model and Tokenizer
print("\n" + "=" * 70, "Loading ERNIE-4.5-21B Model", "=" * 70, sep="\n")

# full_finetuning stays off: LoRA adapters are attached to this model instead.
model, tokenizer = FastModel.from_pretrained(
    model_name=MODEL_NAME,
    load_in_4bit=LOAD_IN_4BIT,
    max_seq_length=MAX_SEQ_LENGTH,
    full_finetuning=False,
)

print("‚úì Model loaded successfully!")

In [None]:
# Setup LoRA Adapters

print("\n" + "=" * 70, "Configuring LoRA Adapters", "=" * 70, sep="\n")

# Wrap the loaded model with LoRA adapters on the listed projection modules.
model = FastModel.get_peft_model(
    model,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    use_rslora=False,
    loftq_config=None,
    use_gradient_checkpointing="unsloth",
    random_state=SEED,
)

# Parameter counts; these three names are reused later in the model card.
total_params = sum(map(lambda p: p.numel(), model.parameters()))
trainable_params = sum(
    p.numel() for p in filter(lambda p: p.requires_grad, model.parameters())
)
trainable_percent = 100 * trainable_params / total_params

print(
    f"‚úì Total parameters: {total_params:,}",
    f"‚úì Trainable parameters: {trainable_params:,} ({trainable_percent:.2f}%)",
    sep="\n",
)

In [None]:
# Dataset Conversion Functions

def _extract_question(example):
    """Return the question text of one record, or raise ValueError if there is none."""
    params = example.get('responses_create_params')
    if isinstance(params, dict) and params.get('input'):
        prompt = params['input']
        if isinstance(prompt, str):
            # `input` given as a bare string instead of a message list.
            question = prompt
        else:
            messages = list(prompt)
            # Prefer the first user turn; the first message may be a system prompt.
            user_messages = [m for m in messages if m.get('role') == 'user']
            question = (user_messages or messages)[0]['content']
    elif example.get('question') is not None:
        question = example['question']
    else:
        raise ValueError("No question found")

    if not isinstance(question, str):
        raise ValueError(f"Question is not a string: {type(question).__name__}")
    return question


def convert_to_ernie_format(example):
    """Convert one Nemotron record into a two-turn user/assistant conversation.

    The question is read from ``example['responses_create_params']['input']``
    (a plain string, or a list of ``{'role', 'content'}`` messages from which the
    first ``user`` message is preferred), falling back to ``example['question']``.
    The answer is ``example['expected_answer']`` converted to ``str``
    (``None``/missing becomes ``''``).

    Args:
        example: Mapping holding a single dataset row.

    Returns:
        ``{'conversations': [user_message, assistant_message]}``. If no question
        can be extracted, both messages have empty content so that every row keeps
        the same schema; callers are expected to drop such rows before training.
    """
    try:
        question = _extract_question(example)
    except (KeyError, IndexError, TypeError, AttributeError, ValueError):
        # Best effort: a malformed row must not abort the whole conversion,
        # so it is turned into an (easily filterable) empty conversation.
        return {'conversations': [
            {'role': 'user', 'content': ''},
            {'role': 'assistant', 'content': ''}
        ]}

    answer = example.get('expected_answer')
    answer = '' if answer is None else str(answer)

    return {'conversations': [
        {'role': 'user', 'content': question},
        {'role': 'assistant', 'content': answer}
    ]}

def format_with_chat_template(examples, tokenizer):
    """Render a batch of conversations into plain training strings.

    Args:
        examples: Batch mapping whose 'conversations' entry is a list of
            conversations (each a list of role/content messages).
        tokenizer: Object exposing ``apply_chat_template`` and ``eos_token``.

    Returns:
        ``{'text': [...]}`` with one string per conversation: the chat-template
        rendering (no generation prompt) followed by ``tokenizer.eos_token``.
    """
    def render(conversation):
        rendered = tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False,
        )
        return rendered + tokenizer.eos_token

    return {'text': [render(conversation) for conversation in examples['conversations']]}

print("‚úì Dataset conversion functions defined")

In [None]:
# Load and Prepare Dataset
# Pipeline: stream -> shuffle/sample -> materialize -> convert -> drop empty rows
# -> train/eval split -> chat-template formatting. Each stage gets its own name so
# the cell can be inspected step by step; `dataset` still ends up holding the split.

print("\n" + "="*70)
print("Loading and Preparing Dataset")
print("="*70)

# Load dataset with streaming
print(f"‚Üí Loading {DATASET_NAME}...")
dataset_stream = load_dataset(
    DATASET_NAME,
    split="train",
    streaming=True
)

# Sample and shuffle (seeded, buffer-based shuffle of the stream)
print(f"‚Üí Sampling {MAX_TRAINING_SAMPLES} examples with shuffling...")
dataset_stream = dataset_stream.shuffle(seed=SEED, buffer_size=10000)
dataset_stream = dataset_stream.take(MAX_TRAINING_SAMPLES)

# Convert to regular dataset
print("‚Üí Materializing dataset...")
dataset_raw = datasets.Dataset.from_list(list(dataset_stream))
print(f"‚úì Loaded {len(dataset_raw)} samples")

# Convert to ERNIE format
print("‚Üí Converting to ERNIE conversation format...")
dataset_converted = dataset_raw.map(
    convert_to_ernie_format,
    num_proc=4,
    desc="Converting format"
)

# convert_to_ernie_format emits empty contents for rows it could not parse.
# Training on an empty question or an empty answer teaches nothing, so drop them
# and report how many were lost instead of silently keeping them.
dataset_valid = dataset_converted.filter(
    lambda example: all(
        str(turn['content'] or '').strip() for turn in example['conversations']
    ),
    desc="Dropping empty conversations"
)
dropped_samples = len(dataset_converted) - len(dataset_valid)
if dropped_samples:
    print(f"Dropped {dropped_samples} samples with an empty question or answer")

# Split into train/eval
print(f"‚Üí Splitting dataset (eval ratio: {EVAL_SPLIT_RATIO})...")
dataset = dataset_valid.train_test_split(
    test_size=EVAL_SPLIT_RATIO,
    seed=SEED
)
train_dataset = dataset['train']
eval_dataset = dataset['test']

print(f"‚úì Train samples: {len(train_dataset)}")
print(f"‚úì Eval samples: {len(eval_dataset)}")

# Apply chat template (all original columns are replaced by a single 'text' column)
print("‚Üí Applying ERNIE chat template...")
train_dataset = train_dataset.map(
    lambda examples: format_with_chat_template(examples, tokenizer),
    batched=True,
    num_proc=4,
    remove_columns=train_dataset.column_names,
    desc="Formatting train"
)

eval_dataset = eval_dataset.map(
    lambda examples: format_with_chat_template(examples, tokenizer),
    batched=True,
    num_proc=4,
    remove_columns=eval_dataset.column_names,
    desc="Formatting eval"
)

print("‚úì Dataset preparation complete!")

# Check a sample (label and slice share one constant so they cannot drift apart)
preview_chars = 1000
print(f"\nSample formatted text (first {preview_chars} chars):")
print("-"*70)
print(train_dataset[0]['text'][:preview_chars])
print("-"*70)

In [None]:
# Create Output Directory

# Checkpoints, logs and the final model are written below this directory,
# which lives in the notebook filesystem.
output_dir = "/root/" + EXPERIMENT_NAME
Path(output_dir).mkdir(exist_ok=True, parents=True)
print("‚úì Output directory: " + output_dir)

In [None]:
# Setup Training Arguments

print("\n" + "=" * 70, "Training Configuration", "=" * 70, sep="\n")

effective_batch_size = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
print(
    f"‚Üí Per-device batch size: {BATCH_SIZE}",
    f"‚Üí Gradient accumulation: {GRADIENT_ACCUMULATION_STEPS}",
    f"‚Üí Effective batch size: {effective_batch_size}",
    f"‚Üí Max steps: {MAX_STEPS}",
    f"‚Üí Learning rate: {LEARNING_RATE}",
    sep="\n",
)

training_args = TrainingArguments(
    # Where results and logs go
    output_dir=output_dir,
    logging_dir=output_dir + "/logs",
    logging_steps=LOGGING_STEPS,
    report_to="none",  # Set to "wandb" if you want W&B tracking

    # Run length and batch shape
    max_steps=MAX_STEPS,
    num_train_epochs=100,  # Limited by max_steps
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,

    # Evaluate and checkpoint on the same step schedule; keep the best by eval loss
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=3,
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,

    # Optimizer and schedule
    optim="adamw_8bit",
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,

    # Mixed precision: bf16 when the GPU supports it, never fp16
    bf16=torch.cuda.is_bf16_supported(),
    fp16=False,

    # Memory
    gradient_checkpointing=True,

    # Reproducibility
    seed=SEED,
    data_seed=SEED,

    # Data loading
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
)

print("‚úì Training arguments configured")

In [None]:
# Initialize Trainer

print("\n" + "=" * 70, "Initializing SFTTrainer", "=" * 70, sep="\n")

# Supervised fine-tuning on the pre-rendered 'text' column, without packing.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    dataset_num_proc=4,
    max_seq_length=MAX_SEQ_LENGTH,
    packing=False,
)

# Configure to train only on assistant responses; the two markers are the
# strings that introduce the user and assistant turns in the rendered text.
print("‚Üí Configuring response-only training...")
trainer = train_on_responses_only(
    trainer,
    response_part="Assistant:",
    instruction_part="User:",
)

print("‚úì Trainer initialized")

In [None]:
# Display Memory Stats
# Records the pre-training baseline (gpu_stats, start_gpu_memory, max_memory)
# that the post-training statistics cells compare against.

print("\n" + "="*70)
print("GPU Memory Statistics")
print("="*70)

gpu_stats = torch.cuda.get_device_properties(0)
# Peak memory reserved by this process so far, in GiB
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
# Round the difference too: subtracting two rounded floats can otherwise print
# artifacts such as 27.194000000000003.
available_memory = round(max_memory - start_gpu_memory, 3)

print(f"‚Üí GPU: {gpu_stats.name}")
print(f"‚Üí Total memory: {max_memory} GB")
print(f"‚Üí Reserved memory: {start_gpu_memory} GB")
print(f"‚Üí Available for training: ~{available_memory} GB")

In [None]:
# START TRAINING! 

print("\n" + "=" * 70, "STARTING TRAINING", "=" * 70, sep="\n")
print("Experiment: " + EXPERIMENT_NAME)
print(f"Start time: {datetime.now():%Y-%m-%d %H:%M:%S}")
print("=" * 70 + "\n")

# The long-running step: runs the fine-tuning loop and keeps its return value
# for the statistics cell below.
trainer_stats = trainer.train()

print("\n" + "=" * 70, "TRAINING COMPLETE!", "=" * 70, sep="\n")

In [None]:
# Display Training Results full run
# Uses the metrics returned by trainer.train(), so it needs a run that finished.

# Peak reserved GPU memory (GiB), its growth since the pre-training baseline,
# and its share of the card's total memory.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)

print("\n" + "=" * 70, "Training Statistics", "=" * 70, sep="\n")
print(
    f"‚Üí Training time: {trainer_stats.metrics['train_runtime']:.2f} seconds",
    f"‚Üí Training time: {trainer_stats.metrics['train_runtime']/60:.2f} minutes",
    f"‚Üí Training time: {trainer_stats.metrics['train_runtime']/3600:.2f} hours",
    f"‚Üí Peak GPU memory: {used_memory} GB ({used_percentage}% of {max_memory} GB)",
    f"‚Üí Memory for training: {used_memory_for_lora} GB",
    f"‚Üí Final train loss: {trainer_stats.metrics.get('train_loss', 'N/A')}",
    "=" * 70,
    sep="\n",
)

In [None]:
# Display Training stats - Early stop
# Rebuilds the run statistics from trainer.state so the later cells also work
# when training was interrupted and trainer.train() never returned.

# Calculate final memory usage
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)

print("\n" + "="*70)
print("Training Statistics")
print("="*70)

# Get training metrics from trainer state (works for early stop)
log_history = trainer.state.log_history

# Last logged training loss
train_losses = [entry['loss'] for entry in log_history if 'loss' in entry]
final_train_loss = train_losses[-1] if train_losses else None

# All evaluation losses, in logging order
eval_losses = [entry['eval_loss'] for entry in log_history if 'eval_loss' in entry]
final_eval_loss = eval_losses[-1] if eval_losses else None
best_eval_loss = min(eval_losses) if eval_losses else None

# Training time. A run that finished has a log entry carrying the measured
# 'train_runtime'; use it whenever it exists. An interrupted run has no timing
# information in the log history, so only then fall back to a rough estimate
# based on a reference run (708 steps in 4:38:44 on the same setup).
FALLBACK_SECONDS_PER_STEP = (4 * 3600 + 38 * 60 + 44) / 708
logged_runtimes = [entry['train_runtime'] for entry in log_history if 'train_runtime' in entry]
runtime_is_estimated = not logged_runtimes
if logged_runtimes:
    runtime = logged_runtimes[-1]
else:
    runtime = trainer.state.global_step * FALLBACK_SECONDS_PER_STEP

# Create trainer_stats object for later cells
class TrainerStats:
    """Minimal stand-in exposing `.metrics` and `.log_history` for later cells."""

    def __init__(self):
        self.metrics = {
            'train_loss': final_train_loss,
            'train_runtime': runtime,
            'eval_loss': final_eval_loss
        }
        self.log_history = log_history

trainer_stats = TrainerStats()


def _format_loss(value):
    """Format a loss with four decimals, or 'N/A' when it is not available."""
    return f"{value:.4f}" if isinstance(value, float) else "N/A"


# Display statistics
runtime_note = " (estimated)" if runtime_is_estimated else ""
print(f"‚Üí Total steps completed: {trainer.state.global_step}")
print(f"‚Üí Training time: {runtime:.2f} seconds{runtime_note}")
print(f"‚Üí Training time: {runtime/60:.2f} minutes{runtime_note}")
print(f"‚Üí Training time: {runtime/3600:.2f} hours{runtime_note}")
print(f"‚Üí Peak GPU memory: {used_memory} GB ({used_percentage}% of {max_memory} GB)")
print(f"‚Üí Memory for training: {used_memory_for_lora} GB")

print(f"‚Üí Final train loss: {_format_loss(final_train_loss)}")
print(f"‚Üí Final eval loss: {_format_loss(final_eval_loss)}")
print(f"‚Üí Best eval loss: {_format_loss(best_eval_loss)}")

# Loss improvement: relative drop from the first to the best evaluation loss
# (skipped when there is no evaluation yet or the first loss is zero).
if eval_losses and eval_losses[0]:
    improvement = ((eval_losses[0] - best_eval_loss) / eval_losses[0] * 100)
    print(f"‚Üí Loss improvement: {improvement:.1f}%")
else:
    print(f"‚Üí Loss improvement: N/A")

print("="*70)

In [None]:
# Save Final Model

print("\n" + "=" * 70, "Saving Final Model", "=" * 70, sep="\n")

# Model and tokenizer are saved side by side in one folder.
final_model_path = output_dir + "/final_model"
model.save_pretrained(final_model_path)
tokenizer.save_pretrained(final_model_path)

print(
    f"‚úì Model saved to: {final_model_path}",
    "\nTo download from Modal Notebook:",
    "  Use the file browser on the left to navigate to:",
    f"  {final_model_path}",
    "=" * 70,
    sep="\n",
)

In [None]:
# Test Inference - Single problem 
print("\n" + "="*70)
print("Testing Inference")
print("="*70)

# Prepare model for inference (Unsloth optimized inference mode)
FastModel.for_inference(model)

# Test problem
test_problem = "Solve the equation: x² + 5x + 6 = 0"

# The instruction and the problem are separated by two real newlines. The
# backslash before "boxed" is escaped on purpose; the newlines must not be,
# otherwise the model receives the literal characters "\n\n".
instruction = "Solve the following math problem. Make sure to put the answer inside \\boxed{}."
messages = [{"role": "user", "content": f"{instruction}\n\n{test_problem}"}]

# Apply chat template
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

# Tokenize with proper attention mask
inputs = tokenizer(
    prompt,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=MAX_SEQ_LENGTH
).to("cuda")

print(f"Problem: {test_problem}\n")
print("Generating solution...\n")

# Sampled generation
outputs = model.generate(
    **inputs,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
    pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
)

# Full decoded sequence (prompt + completion), kept for inspection
full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Extract only the assistant's response by dropping the prompt tokens. This does
# not depend on any marker text, so it stays correct even if the answer itself
# contains "Assistant:" or repeats the question.
prompt_length = inputs["input_ids"].shape[1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

print("Solution:")
print("-"*70)
print(response)
print("-"*70)

print("\n‚úÖ Training complete! Now let's upload to HuggingFace and W&B...")

In [None]:
# Test Inference - Multiple problems
print("\n" + "="*70)
print("Testing Multiple Problems")
print("="*70)

test_problems = [
    "Solve: 2x + 5 = 13",
    "Factor: x² - 9",
    "Find derivative of: f(x) = x³ + 2x"
]

for i, problem in enumerate(test_problems, 1):
    print(f"\n{i}. Problem: {problem}")

    # Each problem already names its own task, so it is not prefixed with another
    # "Solve:". The instruction is the one used by the single-problem test cell.
    messages = [{
        "role": "user",
        "content": (
            "Solve the following math problem. "
            "Make sure to put the answer inside \\boxed{}."
            f"\n\n{problem}"
        ),
    }]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to("cuda")

    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens (everything after the prompt)
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    print(f"   Answer: {response}")
    print()

print("="*70)

In [None]:
# Upload to HuggingFace Hub with Model Card
# Builds a README.md from the metrics of THIS run (nothing is filled in with
# placeholder numbers) and uploads the saved model folder.
print("\n" + "="*70)
print("Uploading to HuggingFace Hub")
print("="*70)

import os
from huggingface_hub import HfApi, create_repo

# Configuration
HF_USERNAME = "your_user_name"  # replace with your Hugging Face account name
HF_REPO_NAME = f"{HF_USERNAME}/ernie-45-math-finetuned"
HF_TOKEN = os.getenv("HF_TOKEN")  # read from the environment, never hard-coded

if not HF_TOKEN:
    print("‚ùå Error: HF_TOKEN not found in secrets!")
    print("Please add 'huggingface-secret' in Modal dashboard with HF_TOKEN")
else:
    print(f"‚Üí Creating repository: {HF_REPO_NAME}")
    
    try:
        create_repo(
            repo_id=HF_REPO_NAME,
            token=HF_TOKEN,
            private=False,
            exist_ok=True
        )
        print("‚úì Repository created/verified")
    except Exception as e:
        print(f"Repository creation: {e}")
    
    # Create detailed model card with ACTUAL metrics
    actual_steps = trainer.state.global_step
    actual_train_time = runtime / 3600  # in hours
    
    # Format metrics for display
    train_loss_display = f"{final_train_loss:.4f}" if isinstance(final_train_loss, float) else "N/A"
    eval_loss_display = f"{final_eval_loss:.4f}" if isinstance(final_eval_loss, float) else "N/A"
    best_loss_display = f"{best_eval_loss:.4f}" if isinstance(best_eval_loss, float) else "N/A"
    
    # Calculate loss improvement (skipped when the first eval loss is missing or zero)
    if eval_losses and eval_losses[0] and isinstance(best_eval_loss, float):
        loss_improvement = ((eval_losses[0] - best_eval_loss) / eval_losses[0] * 100)
        loss_improvement_text = f"{loss_improvement:.1f}% (from {eval_losses[0]:.4f} to {best_eval_loss:.4f})"
    else:
        loss_improvement_text = "N/A"
    
    # Metric values for the YAML header. A missing metric stays None and is left
    # out of the card; numbers from an unrelated run are never substituted.
    train_loss_value = final_train_loss if isinstance(final_train_loss, float) else None
    eval_loss_value = final_eval_loss if isinstance(final_eval_loss, float) else None
    best_loss_value = best_eval_loss if isinstance(best_eval_loss, float) else None
    
    metric_yaml = "".join(
        f"    - type: loss\n      value: {metric_value}\n      name: {metric_name}\n"
        for metric_name, metric_value in (
            ("Final Training Loss", train_loss_value),
            ("Final Validation Loss", eval_loss_value),
            ("Best Validation Loss", best_loss_value),
        )
        if metric_value is not None
    )
    # Without any measured metric the whole model-index section is omitted.
    if metric_yaml:
        model_index_yaml = (
            "model-index:\n"
            f"- name: {HF_REPO_NAME}\n"
            "  results:\n"
            "  - task:\n"
            "      type: text-generation\n"
            "      name: Mathematical Reasoning\n"
            "    dataset:\n"
            "      name: Nemotron-RL-math-OpenMathReasoning\n"
            "      type: nvidia/Nemotron-RL-math-OpenMathReasoning\n"
            "    metrics:\n"
            f"{metric_yaml}"
        )
    else:
        model_index_yaml = ""
    
    # Training-progress table: one row per evaluation, paired with the training
    # loss logged at the same step (when there is one).
    train_loss_by_step = {
        entry['step']: entry['loss']
        for entry in trainer.state.log_history
        if 'loss' in entry and 'step' in entry
    }
    progress_rows = []
    for entry in trainer.state.log_history:
        if 'eval_loss' not in entry:
            continue
        step = entry.get('step', 'N/A')
        step_train_loss = train_loss_by_step.get(step)
        step_train_cell = f"{step_train_loss:.3f}" if isinstance(step_train_loss, float) else "N/A"
        progress_rows.append(f"| {step} | {step_train_cell} | {entry['eval_loss']:.3f} |")
    progress_table = "\n".join(progress_rows) if progress_rows else "| N/A | N/A | N/A |"
    
    # Describe how the run ended from the actual step count.
    if actual_steps < MAX_STEPS:
        steps_suffix = f" (stopped early; {MAX_STEPS} planned)"
        stop_note = f"**Training stopped at step {actual_steps}** of {MAX_STEPS} planned steps."
    else:
        steps_suffix = ""
        stop_note = f"**Training completed all {actual_steps} steps.**"
    
    # Values taken from the run instead of being written by hand
    precision_label = "BF16" if training_args.bf16 else "FP32"
    gpu_label = f"{gpu_stats.name} ({max_memory} GB)"
    
    model_card = f"""---
language:
- en
license: mit
tags:
- ernie
- ernie-4.5
- math
- reasoning
- unsloth
- lora
- fine-tuned
datasets:
- nvidia/Nemotron-RL-math-OpenMathReasoning
base_model: unsloth/ERNIE-4.5-21B-A3B-PT
metrics:
- loss
{model_index_yaml}---

# ERNIE-4.5 Fine-tuned for Mathematical Reasoning

This model is a fine-tuned version of [unsloth/ERNIE-4.5-21B-A3B-PT](https://huggingface.co/unsloth/ERNIE-4.5-21B-A3B-PT) on the [nvidia/Nemotron-RL-math-OpenMathReasoning](https://huggingface.co/datasets/nvidia/Nemotron-RL-math-OpenMathReasoning) dataset.

## Model Description

This model specializes in solving complex mathematical problems including:
- Algebra (equations, factoring, systems)
- Calculus (derivatives, integrals)
- Geometry and trigonometry
- Word problems requiring multi-step reasoning
- Competition-level mathematics

## Training Details

### Training Data
- **Dataset**: nvidia/Nemotron-RL-math-OpenMathReasoning
- **Training Samples**: {len(train_dataset):,}
- **Evaluation Samples**: {len(eval_dataset):,}
- **Format**: Conversational (ERNIE-4.5 format)

### Training Configuration
- **Base Model**: unsloth/ERNIE-4.5-21B-A3B-PT (21B parameters)
- **Method**: QLoRA (4-bit quantization + LoRA)
- **LoRA Rank**: {LORA_R}
- **LoRA Alpha**: {LORA_ALPHA}
- **Trainable Parameters**: {trainable_params:,} ({trainable_percent:.2f}% of total)

### Hyperparameters
- **Batch Size**: {BATCH_SIZE} (per device)
- **Gradient Accumulation**: {GRADIENT_ACCUMULATION_STEPS}
- **Effective Batch Size**: {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}
- **Learning Rate**: {LEARNING_RATE}
- **LR Scheduler**: Cosine with warmup
- **Warmup Ratio**: 0.05
- **Training Steps**: {actual_steps}{steps_suffix}
- **Optimizer**: AdamW 8-bit
- **Precision**: {precision_label}

### Training Results
- **Final Training Loss**: {train_loss_display}
- **Final Validation Loss**: {eval_loss_display}
- **Best Validation Loss**: {best_loss_display}
- **Loss Improvement**: {loss_improvement_text}
- **Training Time**: {actual_train_time:.2f} hours
- **GPU**: {gpu_stats.name}
- **Peak Memory**: {used_memory} GB / {max_memory} GB ({used_percentage}%)

### Framework
- **Unsloth**: 2x faster training, 70% less memory
- **Modal**: Serverless GPU infrastructure ({gpu_label})
- **Transformers**: 4.56.2
- **TRL**: 0.22.2

## Usage
```python
from unsloth import FastModel

# Load the fine-tuned model
model, tokenizer = FastModel.from_pretrained(
    model_name="{HF_REPO_NAME}",
    max_seq_length={MAX_SEQ_LENGTH},
    load_in_4bit={LOAD_IN_4BIT},
    full_finetuning=False,
)

# Prepare for inference
FastModel.for_inference(model)

# Solve a math problem
messages = [{{
    "role": "user",
    "content": "Solve the equation: 2x² + 5x - 3 = 0"
}}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt", padding=True).to("cuda")

outputs = model.generate(
    **inputs,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
    pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
)

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
```

## Example Output

**Input:**
```
Solve the equation: x² + 5x + 6 = 0
```

**Output:**
```
To solve x² + 5x + 6 = 0, we can factor:

Find two numbers that multiply to 6 and add to 5:
2 and 3 work because 2 × 3 = 6 and 2 + 3 = 5

Factored form:
(x + 2)(x + 3) = 0

Setting each factor to zero:
x + 2 = 0  →  x = -2
x + 3 = 0  →  x = -3

Therefore: \\boxed{{x = -2, -3}}
```

## Training Progress

| Step | Training Loss | Validation Loss |
|------|---------------|-----------------|
{progress_table}

{stop_note}

## Training Infrastructure

- **Platform**: Modal (modal.com)
- **GPU**: {gpu_label}
- **Training Duration**: ~{actual_train_time:.1f} hours
- **Checkpointing**: Every {SAVE_STEPS} steps
- **Evaluation**: Every {EVAL_STEPS} steps

## Limitations

- Optimized for mathematical reasoning; may not perform as well on other domains
- Trained on English language problems only
- Best results with problems similar to training data format
- Requires GPU for inference (4-bit quantization)

## Citation
```bibtex
@misc{{ernie45-math-2025,
  title={{ERNIE-4.5 Fine-tuned for Mathematical Reasoning}},
  author={{{HF_USERNAME}}},
  year={{2025}},
  publisher={{HuggingFace}},
  howpublished={{\\url{{https://huggingface.co/{HF_REPO_NAME}}}}}
}}
```

## Acknowledgments

- **ERNIE Team** for the base model
- **Unsloth** for optimization framework
- **NVIDIA** for the Nemotron-RL dataset
- **Modal** for GPU infrastructure
- **ERNIE AI Developer Challenge** for the opportunity

## License

MIT License - See repository for details

---

**Trained with ❤️ using Unsloth and Modal**
"""
    
    # Save model card next to the model files so it is uploaded with them
    model_card_path = f"{final_model_path}/README.md"
    with open(model_card_path, "w", encoding="utf-8") as f:
        f.write(model_card)
    print(f"‚úì Model card created: {model_card_path}")
    
    # Upload to HuggingFace
    print(f"‚Üí Uploading model files to {HF_REPO_NAME}...")
    print("   (This may take several minutes for a 21B model...)")
    api = HfApi()
    
    try:
        # Format best loss for commit message
        best_loss_for_commit = f"{best_eval_loss:.4f}" if isinstance(best_eval_loss, float) else "N/A"
        
        api.upload_folder(
            folder_path=final_model_path,
            repo_id=HF_REPO_NAME,
            token=HF_TOKEN,
            commit_message=f"Upload ERNIE-4.5 math fine-tuned model - {actual_steps} steps, val_loss={best_loss_for_commit}"
        )
        print(f"‚úÖ Model uploaded successfully!")
        print(f"üîó View at: https://huggingface.co/{HF_REPO_NAME}")
    except Exception as e:
        # Best effort: the model is already saved locally, so report and move on.
        print(f"‚ùå Upload error: {e}")
        print("You can manually upload later using the HuggingFace Hub UI")