In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
torch.set_default_device("mps")
repo = "SweatyCrayfish/llama-3-8b-quantized"

model = AutoModelForCausalLM.from_pretrained(repo, torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)

In [None]:
inputs = tokenizer('''def print_prime(n):
   """
   Print all primes between 1 and n
   """''', return_tensors="pt", return_attention_mask=False)

outputs = model.generate(**inputs, max_length=200)
text = tokenizer.batch_decode(outputs)[0]
print(text)

In [None]:
from mlx_lm import load, generate
from mlx_lm import convert

out = load(repo)

# convert(repo, quantize=True, mlx_path="model")

# TinyLLama

In [8]:
from llama_cpp import Llama

model_path = "./tinyllama-1.1b-chat-v1.0.Q5_K_S.gguf"

# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
llm = Llama(
  model_path=model_path,  # Download the model file first
  n_ctx=2048,  # The max sequence length to use - note that longer sequence lengths require much more resources
  n_threads=4,            # The number of CPU threads to use, tailor to your system and the resulting performance
  n_gpu_layers=35         # The number of layers to offload to GPU, if you have GPU acceleration available
)

# Simple inference example
output = llm(
  "<|system|>\n{system_message}</s>\n<|user|>\n{prompt}</s>\n<|assistant|>", # Prompt
  max_tokens=1024,  # Generate up to 512 tokens
  stop=["</s>"],   # Example stop token - not necessarily correct for this specific model! Please check before using.
  echo=True        # Whether to echo the prompt
)

# Chat Completion API

llm = Llama(model_path=model_path, chat_format="llama-2")  # Set chat_format according to the model you are using
llm.create_chat_completion(
    messages = [
        {"role": "system", "content": "You are a story writing assistant."},
        {
            "role": "user",
            "content": "Write a story about llamas."
        }
    ]
)


llama_model_loader: loaded meta data with 23 key-value pairs and 201 tensors from ./tinyllama-1.1b-chat-v1.0.Q5_K_S.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = tinyllama_tinyllama-1.1b-chat-v1.0
llama_model_loader: - kv   2:                       llama.context_length u32              = 2048
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 2048
llama_model_loader: - kv   4:                          llama.block_count u32              = 22
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 5632
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 64
llama_model_loader: - kv   7:                 ll

{'id': 'chatcmpl-5d0a2428-1c7e-4c42-b345-5acdd360fc9a',
 'object': 'chat.completion',
 'created': 1728401374,
 'model': './tinyllama-1.1b-chat-v1.0.Q5_K_S.gguf',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
   'logprobs': None,
   'finish_reason': 'length'}],
 'usage': {'prompt_tokens': 33, 'completion_tokens': 479, 'total_tokens': 512}}

In [16]:
# Chat Completion API

llm = Llama(model_path=model_path,
            chat_format="llama-2",
        n_ctx=2048,  # The max sequence length to use - note that longer sequence lengths require much more resources
        n_threads=4,            # The number of CPU threads to use, tailor to your system and the resulting performance
        n_gpu_layers=35         # The number of layers to offload to GPU, if you have GPU acceleration available
  )  # Set chat_format according to the model you are using

res = llm.create_chat_completion(
    messages = [
        {"role": "system", "content": "You are a helpful teacher for a second grade class."},
        {
            "role": "user",
            "content": "Who was George Washington?"
        }
    ]
)

llama_model_loader: loaded meta data with 23 key-value pairs and 201 tensors from ./tinyllama-1.1b-chat-v1.0.Q5_K_S.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = tinyllama_tinyllama-1.1b-chat-v1.0
llama_model_loader: - kv   2:                       llama.context_length u32              = 2048
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 2048
llama_model_loader: - kv   4:                          llama.block_count u32              = 22
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 5632
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 64
llama_model_loader: - kv   7:                 ll

In [17]:
res["choices"][0]["message"]["content"]

'\nGeorge Washington was an American statesman, commander-in-chief of the Continental Army during the American Revolution, and the first president of the United States.'

# Llama 3.2 3B 8-bit

In [1]:
from mlx_lm import load, generate

model, tokenizer = load("mlx-community/Llama-3.2-3B-8bit")
response = generate(model, tokenizer, prompt="hello", verbose=True)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 6 files: 100%|██████████| 6/6 [01:28<00:00, 14.75s/it]


Prompt: hello
, i am new to this forum and i have a question about the 2nd law of thermodynamics. i have a question about the 2nd law of thermodynamics. i have a question about the 2nd law of thermodynamics. i have a question about the 2nd law of thermodynamics. i have a question about the 2nd law of thermodynamics. i have a question about the 2nd law of thermodynamics. i have a question about the 2
Prompt: 2 tokens, 0.835 tokens-per-sec
Generation: 100 tokens, 11.266 tokens-per-sec
Peak memory: 3.208 GB


# Llama 3.2 1B

In [2]:
from mlx_lm import load, generate

model, tokenizer = load("mlx-community/Llama-3.2-1B-Instruct-bf16")
response = generate(model, tokenizer, prompt="hello", verbose=True)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 77912.77it/s]


Prompt: hello
, i'm a new user of this platform. I'm looking for a reliable and efficient way to manage my finances. I've heard about budgeting apps, but I'm not sure which one to choose. There are so many options out there, and I'm not sure which one is the best for me.

I'm looking for a budgeting app that allows me to track my expenses, create a budget, and set financial goals. I'd like to be able to set up automatic transfers from
Prompt: 2 tokens, 2.194 tokens-per-sec
Generation: 100 tokens, 30.408 tokens-per-sec
Peak memory: 2.331 GB


In [5]:
response

str

# BitNet

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "mps"
model = AutoModelForCausalLM.from_pretrained("HF1BitLLM/Llama3-8B-1.58-100B-tokens",
                                             device_map=device,
                                              torch_dtype=torch.bfloat16)    
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

input_text = "Daniel went back to the the the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?\nAnswer:"


input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
output = model.generate(input_ids, max_length=10, do_sample=False)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

ValueError: Unknown quantization type, got bitnet - supported types are: ['awq', 'bitsandbytes_4bit', 'bitsandbytes_8bit', 'gptq', 'aqlm', 'quanto', 'eetq', 'hqq', 'compressed-tensors', 'fbgemm_fp8', 'torchao']

# Try quantization

In [4]:
from torchao.quantization import quantize_, int8_weight_only
from mlx_lm import load, generate
import torch

model, tokenizer = load("mlx-community/Llama-3.2-1B-Instruct-bf16")

quantized_model = quantize_(torch.compile(model), int8_weight_only)

Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 139810.13it/s]


AttributeError: 'function' object has no attribute 'named_children'