In [None]:
# @title # 🌊 AutoBitnet

# @markdown ---

# @markdown ### ✨ Model Parameters

# Architecture settings for the small Llama-style model built later in this cell.
# - MODEL_CONFIG: Hub repo whose tokenizer and base config are loaded.
# - HEADS / DIMENSIONS / LAYERS / INTERMEDIATE_SIZE: values written over the
#   loaded config (attention + key/value heads, hidden size, number of layers,
#   MLP intermediate size).
# - CONTEXT_LENGTH: number of tokens in each packed training sequence.
# - NEW_MODEL / HUGGINGFACE_ID: combined as "<HUGGINGFACE_ID>/<NEW_MODEL>" to
#   form the Hub repo the trained model is uploaded to.
MODEL_CONFIG = "NousResearch/Nous-Hermes-llama-2-7b" # @param {type:"string"}
HEADS = 6 # @param {type: "number"}
DIMENSIONS = 768 # @param {type: "number"}
LAYERS = 6 # @param {type: "number"}
INTERMEDIATE_SIZE= 1024 # @param {type: "number"}
CONTEXT_LENGTH = 256 # @param {type: "number"}
NEW_MODEL = "Bitnet-Nous-Llama2-70M" # @param {type:"string"}
HUGGINGFACE_ID = "Put your Hugging Face Id Name" # @param {type:"string"} {example:"Navdeet"}


# @markdown ---

# @markdown ### 💥 Training Parameters

# - HF_TOKEN / WANDB_TOKEN: credentials handed to the Hugging Face and
#   Weights & Biases logins. Never save real tokens into a shared copy of
#   this notebook.
# - DATASET: Hub dataset id; the code below reads its "train" split and
#   "text" column.
# - BATCH_SIZE: per-device batch size (gradient accumulation is configured
#   separately in the TrainingArguments).
# - LEARNING_RATE / EPOCHS: peak learning rate and number of training epochs.
HF_TOKEN = "" # @param {type:"string"}
WANDB_TOKEN = "" # @param {type:"string"}
DATASET = "abideen/Cosmopedia-100k-pretrain" # @param {type:"string"}
BATCH_SIZE = 16 # @param {type:"number"}
LEARNING_RATE = 1.5e-4 # @param {type:"number"}
EPOCHS = 2 # @param {type:"number"}
!pip install datasets wandb accelerate
from torch import nn
from transformers.models.llama.modeling_llama import *
from transformers import (AutoTokenizer, AutoConfig, LlamaForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments)
from datasets import load_dataset
from huggingface_hub import login
import wandb
from huggingface_hub import create_repo, HfApi

def activation_quant(x):
    """Fake-quantize activations to the signed 8-bit range, one scale per row.

    For every slice along the last dimension, the largest absolute value
    (floored at 1e-5 to avoid division by zero) is mapped to 127. Values are
    rounded, clamped to [-128, 127] and then divided by the same scale, so
    the result has the shape of ``x`` and lives on the original value range.

    Args:
        x: Tensor to quantize; the scale is computed over ``dim=-1``.

    Returns:
        A new tensor with the de-quantized values. ``x`` is not modified.
    """
    row_peak, _ = x.abs().max(dim=-1, keepdim=True)
    scale = 127.0 / row_peak.clamp(min=1e-5)
    quantized = (x * scale).round().clamp(-128, 127)
    return quantized / scale
def weight_quant(w):
    """Fake-quantize a weight tensor to three levels using one global scale.

    The scale is the reciprocal of the mean absolute value of ``w`` (floored
    at 1e-5). Scaled weights are rounded and clamped to {-1, 0, 1}, then
    divided by the scale again, so every output element is one of
    ``-mean|w|``, ``0`` or ``+mean|w|``.

    Args:
        w: Weight tensor of any shape.

    Returns:
        A new tensor shaped like ``w``. ``w`` is not modified.
    """
    mean_magnitude = w.abs().mean().clamp(min=1e-5)
    scale = 1.0 / mean_magnitude
    ternary = (w * scale).round().clamp(-1, 1)
    return ternary / scale

class BitLinear(nn.Linear):
    """``nn.Linear`` variant that fake-quantizes its input and weight on the fly.

    The layer keeps ordinary full-precision parameters. On every forward pass
    the input is RMS-normalized and passed through ``activation_quant``, the
    weight is passed through ``weight_quant``, and both use the
    straight-through estimator so gradients reach the unquantized tensors.
    """

    def forward(self, x):
        """Apply the quantized linear map to ``x``.

        Args:
            x: Input tensor whose last dimension equals ``in_features``. It is
                moved to the weight's device if necessary.

        Returns:
            The output of ``F.linear`` on the quantized input and weight,
            including the layer's bias when it has one.
        """
        w = self.weight  # nn.Linear stores the weight as [out_features, in_features]
        x = x.to(w.device)
        # The norm is built per call and never attached to ``self``, so whatever
        # parameters it holds are not part of this layer's parameters/state_dict.
        rms_norm = LlamaRMSNorm(x.shape[-1]).to(w.device)
        x_norm = rms_norm(x)
        # Straight-through estimator: the forward value is the quantized tensor,
        # while the detached difference lets the gradient flow as if the
        # quantization were the identity.
        x_quant = x_norm + (activation_quant(x_norm) - x_norm).detach()
        w_quant = w + (weight_quant(w) - w).detach()
        # Pass the bias through. It is None for bias-free layers, which keeps the
        # previous behaviour there; previously a configured bias was silently dropped.
        return F.linear(x_quant, w_quant, self.bias)

def convert_to_bitnet(model, copy_weights):
    """Replace Llama attention/MLP projections with ``BitLinear`` layers, in place.

    Every ``nn.Linear`` that is a direct child of a Llama attention block (any
    ``LlamaAttention`` subclass, so eager, SDPA and flash variants are all
    covered) or of a ``LlamaMLP`` is swapped for a ``BitLinear`` of the same
    shape, created on the same device and with the same dtype as the layer it
    replaces. Each decoder layer's ``input_layernorm`` is replaced by
    ``nn.Identity`` because ``BitLinear`` normalizes its own input.

    Args:
        model: Module tree to convert; it is modified in place.
        copy_weights: If true, the new layers reuse the existing weight (and
            bias) parameters; otherwise they keep their fresh initialization.

    Returns:
        None.
    """
    # Snapshot the traversal first: the loop swaps children while walking the tree.
    for _, module in list(model.named_modules()):
        # Replace linear layers with BitNet
        if isinstance(module, (LlamaAttention, LlamaMLP)):
            for child_name, child_module in list(module.named_children()):
                if not isinstance(child_module, nn.Linear):
                    continue
                has_bias = child_module.bias is not None
                bitlinear = BitLinear(
                    child_module.in_features,
                    child_module.out_features,
                    has_bias,
                    # Follow the layer being replaced instead of assuming "cuda:0".
                    device=child_module.weight.device,
                    dtype=child_module.weight.dtype,
                )
                if copy_weights:
                    bitlinear.weight = child_module.weight
                    if has_bias:
                        bitlinear.bias = child_module.bias
                setattr(module, child_name, bitlinear)
        # Remove redundant input_layernorms
        elif isinstance(module, LlamaDecoderLayer):
            if isinstance(getattr(module, "input_layernorm", None), LlamaRMSNorm):
                module.input_layernorm = nn.Identity()


import os

# Use the form values when they are filled in. When a field is left blank, fall
# back to the conventional environment variables so real tokens do not have to
# be saved inside the notebook. If neither is set, None is passed and each
# library applies its own default login flow (presumably a stored credential or
# an interactive prompt; confirm against the installed versions).
wandb.login(key=WANDB_TOKEN or os.environ.get("WANDB_API_KEY"))
login(token=HF_TOKEN or os.environ.get("HF_TOKEN"))

# Raw dataset and the tokenizer of the reference model.
data = load_dataset(DATASET)
tokenizer = AutoTokenizer.from_pretrained(MODEL_CONFIG)

def tokenize(element):
    """Tokenize a batch of documents and pack them into fixed-length sequences.

    All documents in the batch are tokenized, joined into one token stream with
    ``tokenizer.eos_token_id`` appended after each document, and the stream is
    cut into consecutive chunks of exactly ``CONTEXT_LENGTH`` tokens. A trailing
    remainder shorter than ``CONTEXT_LENGTH`` is dropped.

    Args:
        element: Batch mapping with a ``"text"`` entry holding a list of strings
            (the shape ``datasets.map(..., batched=True)`` passes in).

    Returns:
        ``{"input_ids": chunks}`` where every chunk is a list of
        ``CONTEXT_LENGTH`` token ids. The number of chunks is generally
        different from the number of input documents.
    """
    # NOTE(review): with truncation disabled, max_length / overflow options are
    # presumably inert; they are kept unchanged here. Only "input_ids" is used.
    outputs = tokenizer(
        element["text"],
        truncation=False,
        max_length=CONTEXT_LENGTH,
        return_overflowing_tokens=True,
        return_length=True,
    )
    # Combine all tokens, separating documents with EOS.
    combined = []
    for tokenized_doc in outputs["input_ids"]:
        combined.extend(tokenized_doc)
        combined.append(tokenizer.eos_token_id)
    # Chunk. The "+ 1" keeps the final chunk when the stream length is an exact
    # multiple of CONTEXT_LENGTH (it used to be dropped).
    last_start = len(combined) - CONTEXT_LENGTH
    input_batch = [
        combined[start:start + CONTEXT_LENGTH]
        for start in range(0, last_start + 1, CONTEXT_LENGTH)
    ]
    return {"input_ids": input_batch}

# Tokenize and pack every split. The original columns are removed so that only
# the "input_ids" produced by tokenize() remain.
tokenized_data = data.map(
    tokenize, batched=True, remove_columns=data["train"].column_names,
)

# Every row is exactly CONTEXT_LENGTH tokens long, so this is the exact count.
total_tokens = tokenized_data['train'].num_rows * CONTEXT_LENGTH
print(f"Training on {total_tokens:_} tokens")

# Start from the reference model's config and shrink it to the form settings.
config = AutoConfig.from_pretrained(
    MODEL_CONFIG,
    vocab_size=len(tokenizer),
    n_ctx=CONTEXT_LENGTH,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# Fail early with a clear message instead of deep inside the model constructor.
assert DIMENSIONS % HEADS == 0, "DIMENSIONS must be divisible by HEADS"

config.hidden_size = DIMENSIONS
# The position limit follows the training sequence length. It was previously
# set to DIMENSIONS (the hidden size), which is an unrelated quantity.
config.max_position_embeddings = CONTEXT_LENGTH
config.num_attention_heads = HEADS
config.num_hidden_layers = LAYERS
config.num_key_value_heads = HEADS
config.intermediate_size = INTERMEDIATE_SIZE

### Create the llama model with our custom config. Convert it to bitnet.
model = LlamaForCausalLM(config)
convert_to_bitnet(model, copy_weights=False)
model_size = sum(t.numel() for t in model.parameters())
print(f"Model size: {model_size/1000**2:.1f}M parameters")
# The collator needs a pad token; reuse EOS for it.
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Directory for checkpoints; the final model is later saved underneath it.
output_path = "./Bitnet-Nous-Llama2-70M"
args = TrainingArguments(
    output_dir=output_path,
    per_device_train_batch_size=BATCH_SIZE,
    logging_steps=100,
    gradient_accumulation_steps=2,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    # 0.1 is a fraction of the run, so it belongs in warmup_ratio. Passed as
    # warmup_steps it was treated as a step count below one, i.e. no warmup.
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    learning_rate=LEARNING_RATE,
    # max_steps=5000,
    # A float below 1 is a fraction of total training steps: save every 25%.
    save_steps=0.25,
    fp16=True,
    report_to="wandb"
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_data["train"],
)

trainer.train()




[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Downloading data:   0%|          | 0.00/343M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/342M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Training on 122_035_712 tokens


config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

Model size: 77.5M parameters


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Currently logged in as: [33mbeastsoul[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,7.665
200,6.1328
300,5.4195
400,5.1057
500,4.8765
600,4.7208
700,4.568
800,4.4862
900,4.3867
1000,4.3031


ValueError: Provided path: '/content/out/final_model' is not a directory

In [None]:
# Save the final model once and upload exactly that directory. The folder is
# derived from output_path, so the save location and the upload location cannot
# drift apart (the previous hard-coded "/content/..." path only worked on Colab
# with the default output directory).
folder = f"{output_path}/final_model"
trainer.save_model(folder)

repo_id = f"{HUGGINGFACE_ID}/{NEW_MODEL}"
# A blank form field becomes None so huggingface_hub can fall back to the token
# stored by the earlier login() call instead of sending an empty string.
hub_token = HF_TOKEN or None

api = HfApi()
create_repo(
    repo_id=repo_id,
    repo_type="model",
    exist_ok=True,
    token=hub_token,
)

api.upload_folder(
    folder_path=folder,
    repo_type="model",
    repo_id=repo_id,
    token=hub_token,
)

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/310M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Vasanth/Bitnet-Nous-Llama2-70M/commit/4dd6d237d29404bfbe837429d3170b0fe798ed7d', commit_message='Upload folder using huggingface_hub', commit_description='', oid='4dd6d237d29404bfbe837429d3170b0fe798ed7d', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.models.llama.modeling_llama import *
# Load a pretrained BitNet model
# The repo id gets its own name so that `model` only ever refers to the model
# object (it used to hold the id string first and the model afterwards).
model_id = "abideen/Bitnet-Llama-70M"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Loading warns that the input_layernorm weights are newly initialized: the
# checkpoint does not contain them. convert_to_bitnet() below replaces those
# layers with nn.Identity, so the fresh weights are never used.
model = AutoModelForCausalLM.from_pretrained(model_id)


def activation_quant(x):
    """Round activations onto a signed 8-bit grid and map them back.

    The scale is chosen per slice of the last dimension so that the largest
    absolute value (never treated as smaller than 1e-5) lands on 127. After
    scaling, values are rounded and limited to [-128, 127], then divided by
    the scale to return to the input's value range.

    Args:
        x: Tensor to quantize along ``dim=-1``.

    Returns:
        A new tensor with the same shape as ``x``; ``x`` itself is untouched.
    """
    abs_max = x.abs().max(dim=-1, keepdim=True)[0]
    scale = 127.0 / abs_max.clamp(min=1e-5)
    levels = (x * scale).round().clamp(-128, 127)
    return levels / scale
def weight_quant(w):
    """Snap weights to three levels that share a single tensor-wide scale.

    The scale is ``1 / mean(|w|)`` with the mean floored at 1e-5. Scaled
    weights are rounded, limited to the range [-1, 1] and divided by the
    scale, which leaves only ``-mean|w|``, ``0`` and ``+mean|w|`` as outputs.

    Args:
        w: Weight tensor of any shape.

    Returns:
        A new tensor with the shape of ``w``; ``w`` itself is untouched.
    """
    avg_abs = w.abs().mean().clamp(min=1e-5)
    scale = 1.0 / avg_abs
    signs = (w * scale).round().clamp(-1, 1)
    return signs / scale

class BitLinear(nn.Linear):
    """``nn.Linear`` variant that fake-quantizes its input and weight on the fly.

    The layer keeps ordinary full-precision parameters. On every forward pass
    the input is RMS-normalized and passed through ``activation_quant``, the
    weight is passed through ``weight_quant``, and both use the
    straight-through estimator so gradients reach the unquantized tensors.
    """

    def forward(self, x):
        """Apply the quantized linear map to ``x``.

        Args:
            x: Input tensor whose last dimension equals ``in_features``. It is
                moved to the weight's device if necessary.

        Returns:
            The output of ``F.linear`` on the quantized input and weight,
            including the layer's bias when it has one.
        """
        w = self.weight  # nn.Linear stores the weight as [out_features, in_features]
        x = x.to(w.device)
        # The norm is built per call and never attached to ``self``, so whatever
        # parameters it holds are not part of this layer's parameters/state_dict.
        rms_norm = LlamaRMSNorm(x.shape[-1]).to(w.device)
        x_norm = rms_norm(x)
        # Straight-through estimator: the forward value is the quantized tensor,
        # while the detached difference lets the gradient flow as if the
        # quantization were the identity.
        x_quant = x_norm + (activation_quant(x_norm) - x_norm).detach()
        w_quant = w + (weight_quant(w) - w).detach()
        # Pass the bias through. It is None for bias-free layers, which keeps the
        # previous behaviour there; previously a configured bias was silently dropped.
        return F.linear(x_quant, w_quant, self.bias)

def convert_to_bitnet(model, copy_weights):
    """Replace Llama attention/MLP projections with ``BitLinear`` layers, in place.

    Every ``nn.Linear`` that is a direct child of a Llama attention block (any
    ``LlamaAttention`` subclass, so eager, SDPA and flash variants are all
    covered) or of a ``LlamaMLP`` is swapped for a ``BitLinear`` of the same
    shape, created on the same device and with the same dtype as the layer it
    replaces. Each decoder layer's ``input_layernorm`` is replaced by
    ``nn.Identity`` because ``BitLinear`` normalizes its own input.

    Args:
        model: Module tree to convert; it is modified in place.
        copy_weights: If true, the new layers reuse the existing weight (and
            bias) parameters; otherwise they keep their fresh initialization.

    Returns:
        None.
    """
    # Snapshot the traversal first: the loop swaps children while walking the tree.
    for _, module in list(model.named_modules()):
        # Replace linear layers with BitNet
        if isinstance(module, (LlamaAttention, LlamaMLP)):
            for child_name, child_module in list(module.named_children()):
                if not isinstance(child_module, nn.Linear):
                    continue
                has_bias = child_module.bias is not None
                bitlinear = BitLinear(
                    child_module.in_features,
                    child_module.out_features,
                    has_bias,
                    # Follow the layer being replaced instead of assuming "cuda:0".
                    device=child_module.weight.device,
                    dtype=child_module.weight.dtype,
                )
                if copy_weights:
                    bitlinear.weight = child_module.weight
                    if has_bias:
                        bitlinear.bias = child_module.bias
                setattr(module, child_name, bitlinear)
        # Remove redundant input_layernorms
        elif isinstance(module, LlamaDecoderLayer):
            if isinstance(getattr(module, "input_layernorm", None), LlamaRMSNorm):
                module.input_layernorm = nn.Identity()


# Swap in the BitLinear layers (reusing the loaded weights) and move to the GPU.
convert_to_bitnet(model, copy_weights=True)
model.to(device="cuda:0")

prompt = "What is Machine Learning?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# Pass the whole encoding so the attention mask reaches generate() together
# with the input ids, instead of handing over input_ids alone.
generate_ids = model.generate(**inputs, max_length=50)
# Last expression of the cell: the decoded continuation is shown as the output.
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at abideen/Bitnet-Llama-70M and are newly initialized: ['model.layers.0.input_layernorm.weight', 'model.layers.1.input_layernorm.weight', 'model.layers.2.input_layernorm.weight', 'model.layers.3.input_layernorm.weight', 'model.layers.4.input_layernorm.weight', 'model.layers.5.input_layernorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


'What is Machine Learning?\n\nIn today’s digital age, machine learning has become a crucial aspect of our lives. With the increasing popularity of machine learning, machine learning has become a powerful tool for learning and learning. With the'