In [5]:
import sys
import torch
from model.model import Config, MoEModel, optimize


In [6]:


# Set device and seed
# Fix the global torch RNG seed before any model is built or trained so that
# Restart Kernel -> Run All yields the same run each time.
# NOTE(review): assumes MoEModel/optimize draw from torch's global RNG — confirm.
SEED = 0
torch.manual_seed(SEED)

# Use the GPU when torch can see one; otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hyperparameters handed to Config: 3 input features, 2 hidden units,
# 2 experts with 1 active at a time, load-balancing loss switched on.
config = Config(
    n_features=3,
    n_hidden=2,
    n_experts=2,
    n_active_experts=1,
    load_balancing_loss=True,
)

# Configure importance and feature probability (sparsity).
# NOTE(review): integer literals make this an int64 tensor — confirm that
# MoEModel is happy with a non-float importance tensor.
uniform_importance = torch.tensor([1, 1, 1])
sparse_feature_probability = torch.tensor(0.01)

# Build the model on the selected device with the settings above.
model = MoEModel(
    config=config,
    device=DEVICE,
    importance=uniform_importance,
    feature_probability=sparse_feature_probability,
)

# Train the model: collect the optimisation settings in one place, then
# forward them to optimize() as keyword arguments.
training_kwargs = dict(
    n_batch=512,
    steps=5000,
    print_freq=500,
    lr=1e-3,
)
print("Training model...")
optimize(model, **training_kwargs)

# Show the trained gate matrix and expert weights, each under its own heading.
for heading, attr_name in (
    ("Gate matrix:", "gate"),
    ("Expert weights:", "W_experts"),
):
    print(heading)
    print(getattr(model, attr_name))

# Print final model parameters (the feature probability and importance
# values stored on the model).
print("\nFinal model parameters:")
print(f"Feature probability: {model.feature_probability.item()}")
print(f"Importance weights: {model.importance}")

Training model...
Step 0: loss=0.020687, lr=0.001000
Step 500: loss=0.010072, lr=0.001000
Step 1000: loss=0.009897, lr=0.001000
Step 1500: loss=0.009899, lr=0.001000
Step 2000: loss=0.009887, lr=0.001000
Step 2500: loss=0.009858, lr=0.001000
Step 3000: loss=0.009750, lr=0.001000
Step 3500: loss=0.009842, lr=0.001000
Step 4000: loss=0.009763, lr=0.001000
Step 4500: loss=0.009741, lr=0.001000
Step 4999: loss=0.009819, lr=0.001000
Gate matrix:
Parameter containing:
tensor([[-3.0735, -3.0686, -3.0480],
        [ 3.0735,  3.0686,  3.0480]], requires_grad=True)
Expert weights:
Parameter containing:
tensor([[[-0.3367,  0.5204],
         [ 0.1288,  0.1393],
         [-0.8435,  0.3104]],

        [[-0.7333, -0.6771],
         [ 0.7157, -0.6990],
         [ 0.6587,  0.7481]]], requires_grad=True)

Final model parameters:
Feature probability: 0.009999999776482582
Importance weights: tensor([1, 1, 1])
