In [1]:
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
import torch.nn as nn
import torch.nn.functional as F
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_token():
    """Return the Hugging Face access token from the environment.

    Reads the ``HF_TOKEN`` environment variable.

    Returns:
        The variable's value as a string, or ``None`` when it is not set.
    """
    hf_token = os.environ.get("HF_TOKEN")
    return hf_token

In [None]:
class QuantisedLinearLayer(nn.Module):
    """Linear layer whose weight is stored as int8 plus one scale per output row.

    The layer holds three buffers:

    * ``int8_weights`` -- ``(out_features, in_features)`` int8 tensor.
    * ``scales`` -- ``(out_features,)`` tensor in ``dtype``.
    * ``bias`` -- ``(1, out_features)`` tensor in ``dtype``, or ``None`` when
      ``bias=False``.

    The buffers are created with random placeholder contents; call
    :meth:`quantize` to fill ``int8_weights`` and ``scales`` from real weights.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
        bias: Whether to allocate a bias buffer.
        dtype: Floating-point dtype used for ``scales`` and ``bias``.
    """

    def __init__(self, in_features, out_features, bias=True, dtype=torch.float32):
        super().__init__()

        # Placeholder values only; quantize() overwrites both buffers.
        self.register_buffer(
            "int8_weights", torch.randint(-128, 127, (out_features, in_features), dtype=torch.int8)
        )

        self.register_buffer("scales", torch.randn((out_features), dtype=dtype))

        if bias:
            self.register_buffer("bias", torch.randn((1, out_features), dtype=dtype))

        else:
            self.bias = None

    def quantize(self, weights):
        """Quantise ``weights`` row-wise (absmax / 127) into the layer's buffers.

        Args:
            weights: 2-D floating-point tensor of shape
                ``(out_features, in_features)``. May be an ``nn.Parameter``;
                it is not modified.

        Side effects:
            Replaces ``self.int8_weights`` (int8) and ``self.scales`` (same
            dtype as ``weights``). Both are plain tensors with no autograd
            history.
        """
        # Run outside autograd: otherwise, when `weights` is a Parameter, the
        # stored scales would carry a grad_fn that keeps a float32 copy of the
        # original weights alive for as long as the layer exists.
        with torch.no_grad():
            w_fp32 = weights.detach().to(torch.float32)

            # Per-row absmax scale, stored in the weights' own dtype.
            scales = (w_fp32.abs().max(dim=-1).values / 127).to(weights.dtype)
            # An all-zero row (or a scale that underflows in the storage dtype)
            # would give 0/0 = NaN below; any non-zero scale represents such a
            # row exactly as zeros, so use 1.
            scales = torch.where(scales == 0, torch.ones_like(scales), scales)

            # Divide in float32 by the scale *as stored*, so rounding happens
            # once and matches what dequantisation will multiply by. Clamp as a
            # guard so the int8 cast can never wrap.
            quotient = w_fp32 / scales.to(torch.float32).unsqueeze(1)
            int8_weights = torch.round(quotient).clamp_(-128, 127).to(torch.int8)

        self.int8_weights = int8_weights
        self.scales = scales

    def forward(self, input):
        """Apply the linear map using the dequantised weights and the bias."""
        return computeLinearOutput(self.int8_weights,
                              input, self.scales, self.bias)


def computeLinearOutput(int8_weights, input, scales, bias):
    """Dequantise int8 weights and apply them as a linear layer.

    Args:
        int8_weights: Quantised weight matrix; row ``i`` is multiplied by
            ``scales[i]``.
        input: Activations passed to ``F.linear``; their dtype determines the
            dtype the dequantised weights are cast to.
        scales: 1-D tensor with one scale per weight row.
        bias: Bias forwarded to ``F.linear`` (may be ``None``).

    Returns:
        The result of ``F.linear(input, dequantised_weights, bias)``.
    """
    # Rebuild the floating-point weights in float32 first, then match the
    # activation dtype so F.linear receives consistent operands.
    row_scales = scales.unsqueeze(1)
    dequantised = int8_weights.to(torch.float32) * row_scales
    dequantised = dequantised.to(input.dtype)
    return F.linear(input, dequantised, bias)

def transformLinearToTarget(module, target_class, module_name_to_exclude):
    """Recursively swap ``nn.Linear`` children of ``module`` for ``target_class``.

    Args:
        module: Root module; it is modified in place.
        target_class: Class constructed as
            ``target_class(in_features, out_features, has_bias, weight_dtype)``
            and expected to provide a ``quantize(weight)`` method.
        module_name_to_exclude: Iterable of child names (the immediate
            attribute name, not a dotted path) that must not be replaced.

    Returns:
        None.
    """
    for name, child in module.named_children():
        # Anything that is not a Linear, or is an excluded Linear, is left in
        # place and simply descended into.
        if not isinstance(child, nn.Linear) or any(
            excluded == name for excluded in module_name_to_exclude
        ):
            transformLinearToTarget(child, target_class, module_name_to_exclude)
            continue

        original_bias = child.bias
        original_weight = child.weight

        replacement = target_class(
            child.in_features,
            child.out_features,
            original_bias is not None,
            original_weight.dtype,
        )
        # Install the replacement first, then fill it from the old weights.
        setattr(module, name, replacement)
        replacement.quantize(original_weight)

        # The original bias object is carried over unchanged.
        if original_bias is not None:
            replacement.bias = original_bias

In [5]:
# Hugging Face Hub id of the checkpoint used for both the model and the tokenizer.
model_id = "unsloth/Llama-3.2-1B-Instruct"
# Load the causal-LM weights in bfloat16 (torch_dtype) with low_cpu_mem_usage enabled.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True)
# Tokenizer is loaded from the same checkpoint id so it matches the model.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [6]:
# Text-generation pipeline wrapping the already-loaded model and tokenizer.
# `pipe` holds a reference to `model`, so later in-place changes to the model's
# submodules are what the pipeline will run.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Print the module tree before any layers are replaced, for comparison.
print("Model before:", model)

# Explicitly move the model to the GPU when one is available.
# NOTE(review): the recorded output shows the pipeline already selected cuda:0,
# so this move may be redundant in this environment -- confirm.
if torch.cuda.is_available():
    device = torch.device("cuda")
    model.to(device)

Device set to use cuda:0


Model before: LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((20

In [25]:
# Baseline generation: deterministic decoding (do_sample=False), at most 120 new
# tokens; prints the generated text of the first returned sequence.
print(pipe("def addTwoNumbers():", max_new_tokens=120, do_sample=False)[0]["generated_text"])



def addTwoNumbers(): 
    # Get the two numbers from the user
    num1 = int(input("Enter the first number: "))
    num2 = int(input("Enter the second number: "))

    # Add the two numbers
    sum = num1 + num2

    # Print the result
    print("The sum of the two numbers is:", sum)

    # Return the result
    return sum

# Call the function
result = addTwoNumbers()  # This will print the result of the function

# Print the result
print("The final result is:", result) 


In [26]:
# Replace nn.Linear submodules of `model` in place with QuantisedLinearLayer,
# skipping any child whose name is exactly "lm_head".
transformLinearToTarget(model, QuantisedLinearLayer, ["lm_head"])
# Print the module tree again to show which layers were swapped.
print("Model after:", model)

Model after: LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): QuantisedLinearLayer()
          (k_proj): QuantisedLinearLayer()
          (v_proj): QuantisedLinearLayer()
          (o_proj): QuantisedLinearLayer()
        )
        (mlp): LlamaMLP(
          (gate_proj): QuantisedLinearLayer()
          (up_proj): QuantisedLinearLayer()
          (down_proj): QuantisedLinearLayer()
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb): LlamaRotaryEmbedding()
  )
  (lm_head): Linear(in_features=2048, out_features=128256, bias=False)
)


In [27]:
# Same prompt and decoding settings as the earlier generation cell, run again so
# the output can be compared after the Linear layers were replaced.
print(pipe("def addTwoNumbers():", max_new_tokens=120, do_sample=False)[0]["generated_text"])

def addTwoNumbers(): 
    # Get the two numbers from the user
    num1 = int(input("Enter the first number: "))
    num2 = int(input("Enter the second number: "))

    # Add the two numbers
    sum = num1 + num2

    # Print the result
    print("The sum of the two numbers is:", sum)

    # Return the result
    return sum

# Call the function
result = addTwoNumbers()  # This will print the result of the function

# Print the result
print("The final result is:", result) 
