In [4]:
# Name of the merged model; also used as the Hub repository name when uploading.
# NOTE(review): the name ends in "MoE-4" while the config below lists 8 experts
# — confirm which of the two is intended.
MODEL_NAME = "Llama-3-8B-Instruct-MoE-4"

# Every expert (and the base model) is the same checkpoint, so the config is
# assembled from one stanza repeated NUM_EXPERTS times instead of hand-copied.
BASE_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"
NUM_EXPERTS = 8

_config_header = (
    f"base_model: {BASE_MODEL}\n"
    "gate_mode: random\n"
    "dtype: bfloat16\n"
    "experts_per_token: 2\n"
    "experts:\n"
)
_expert_stanza = (
    f"  - source_model: {BASE_MODEL}\n"
    "    positive_prompts: []\n"
)

# The leading newline mirrors a triple-quoted literal that opens on its own line.
yaml_config = "\n" + _config_header + _expert_stanza * NUM_EXPERTS

In [5]:
import torch

# @param ["CPU", "CPU + High-RAM", "GPU"]
if torch.cuda.is_available():
    runtime = "GPU"
else:
    runtime = "CPU"
    print("GPU not available, switching to CPU.\n")

# @markdown Use the `main` branch by default, [`moe`](https://github.com/cg123/mergekit/blob/mixtral/moe.md) if you want to create a Mixture of Experts.
branch = "moe" # @param ["main", "moe"]
trust_remote_code = False # @param {type:"boolean"}


# Install mergekit
if branch == "main":
    !git clone https://github.com/arcee-ai/mergekit.git
    !cd mergekit && pip install -qqq -e . --progress-bar off
elif branch == "moe":
    !git clone -b mixtral https://github.com/arcee-ai/mergekit.git
    !cd mergekit && pip install -qqq -e . --progress-bar off
    %pip install -qqq -U transformers --progress-bar off



# Save config as yaml file
with open('config.yaml', 'w', encoding="utf-8") as f:
    f.write(yaml_config)

# Base CLI
if branch == "main":
    cli = "mergekit-yaml config.yaml merge --copy-tokenizer"
elif branch == "moe":
    cli = "mergekit-moe config.yaml merge --copy-tokenizer"

# Additional arguments
if runtime == "CPU":
    cli += " --allow-crimes --out-shard-size 1B --lazy-unpickle"
elif runtime == "GPU":
    cli += " --device cuda --low-cpu-memory"
if trust_remote_code:
    cli += " --trust-remote-code"
#cli += " --i-understand-this-is-not-useful-without-training"

print(cli)

# Merge models
!{cli}

fatal: destination path 'mergekit' already exists and is not an empty directory.


Note: you may need to restart the kernel to use updated packages.
mergekit-moe config.yaml merge --copy-tokenizer --device cuda --low-cpu-memory
config.json: 100%|█████████████████████████████| 654/654 [00:00<00:00, 6.03MB/s]
Warm up loaders:   0%|                                    | 0/9 [00:00<?, ?it/s]
Fetching 11 files:   0%|                                 | 0/11 [00:00<?, ?it/s][A

model-00002-of-00004.safetensors:   0%|             | 0.00/5.00G [00:00<?, ?B/s][A[A


model-00004-of-00004.safetensors:   0%|             | 0.00/1.17G [00:00<?, ?B/s][A[A[A



model-00003-of-00004.safetensors:   0%|             | 0.00/4.92G [00:00<?, ?B/s][A[A[A[A




model-00001-of-00004.safetensors:   0%|             | 0.00/4.98G [00:00<?, ?B/s][A[A[A[A[A





generation_config.json: 100%|██████████████████| 187/187 [00:00<00:00, 1.45MB/s][A[A[A[A[A[A

Fetching 11 files:  18%|████▌                    | 2/11 [00:00<00:00,  9.18it/s][A





special_tokens_map.json: 100%|█████████

In [7]:
import os
from getpass import getpass

from huggingface_hub import HfApi
from transformers import AutoTokenizer

hf_username = "Put Hugging Face username here"

# Never keep the access token in the notebook source: read it from the
# HF_TOKEN environment variable, or prompt for it without echoing.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

api = HfApi(token=hf_token)

# Built once so the repo creation, folder upload and tokenizer push all
# target the same repository.
repo_id = f"{hf_username}/{MODEL_NAME}"

# Upload merge folder
api.create_repo(
    repo_id=repo_id,
    repo_type="model",
    exist_ok=True,
)
api.upload_folder(
    repo_id=repo_id,
    folder_path="merge",
)

# add tokenizer to the repo
# The token is passed explicitly so both calls authenticate with the same
# credentials as `api` instead of depending on a previously cached login.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    token=hf_token,
)
tokenizer.push_to_hub(repo_id, token=hf_token)

model-00001-of-00010.safetensors:   0%|          | 0.00/9.97G [00:00<?, ?B/s]
[A

[A[A


[A[A[A



[A[A[A[A
[A



model-00001-of-00010.safetensors:   0%|          | 1.67M/9.97G [00:00<15:04, 11.0MB/s]

[A[A



[A[A[A[A

model-00001-of-00010.safetensors:   0%|          | 2.79M/9.97G [00:00<25:17, 6.57MB/s]



[A[A[A[A

model-00001-of-00010.safetensors:   0%|          | 6.55M/9.97G [00:00<11:29, 14.4MB/s]



[A[A[A[A
model-00001-of-00010.safetensors:   0%|          | 16.0M/9.97G [00:00<08:39, 19.2MB/s]



[A[A[A[A

[A[A
[A



model-00001-of-00010.safetensors:   0%|          | 22.8M/9.97G [00:01<06:45, 24.5MB/s]

[A[A



[A[A[A[A

model-00001-of-00010.safetensors:   0%|          | 27.8M/9.97G [00:01<06:36, 25.1MB/s]
[A



model-00001-of-00010.safetensors:   0%|          | 31.2M/9.97G [00:01<06:26, 25.7MB/s]

[A[A



[A[A[A[A
model-00001-of-00010.safetensors:   0%|          | 33.9M/9.97G [00:01<10:23, 15.9MB/s]



[A[A[A[A
[A

[A[A



mod

CommitInfo(commit_url='https://huggingface.co/VictorDCh/Llama-3-8B-Instruct-MoE-4/commit/21d36d4459e41cd52b966d4dad1e8f821172c0e2', commit_message='Upload tokenizer', commit_description='', oid='21d36d4459e41cd52b966d4dad1e8f821172c0e2', pr_url=None, pr_revision=None, pr_num=None)