In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time
torch.manual_seed(1230)




<torch._C.Generator at 0x7b2b4454bc30>

In [2]:
def time_it(start,end):
    nano = end-start
    return nano/1e9

In [3]:
device = "cuda"
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
max_token = 200

## Full Precision Model

In [4]:
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Print model size
print(f"Model size: {model.get_memory_footprint():,} bytes")



Model size: 4,423,265,024 bytes


In [5]:
text = "Hello my name is"
inputs = tokenizer(text, return_tensors="pt").to(device)
start = time.time_ns()
outputs = model.generate(**inputs, max_new_tokens=max_token)
end = time.time_ns()
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
t = time_it(start,end)
print("Seconds:",t)
print("Token/s",len(outputs[0])/t)

Hello my name is John Smith and I am a software engineer. I have been working in the software industry for the past 5 years and have experience in developing web applications using various technologies such as Java, JavaScript, and HTML. I am proficient in using tools such as Git, JIRA, and Slack to manage projects and communicate with team members. I am also skilled in designing and implementing user-friendly interfaces using CSS and HTML. In my free time, I enjoy playing video games, reading books, and spending time with my family and friends. I am passionate about learning new technologies and staying up-to-date with the latest trends in the industry. I am looking forward to working with you and contributing to the development of the project. Thank you for considering my application. Best regards,

[Your Name]
Seconds: 7.625865433
Token/s 23.735011008290893


In [6]:
del model

## INT 8 Quantization

In [7]:
import bitsandbytes as bnb
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM
import torch

In [8]:
quantization_config = BitsAndBytesConfig(
   load_in_8bit=True,
   bnb_8bit_compute_dtype=torch.bfloat16
)

model_8bit = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
print(f"Model size: {model_8bit.get_memory_footprint():,} bytes")
torch.cuda.empty_cache()

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Model size: 1,242,749,696 bytes


In [9]:
%%time
text = "Hello my name is"
inputs = tokenizer(text, return_tensors="pt").to(device)
start = time.time_ns()
outputs = model_8bit.generate(**inputs, max_new_tokens=max_token)
end = time.time_ns()
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
t = time_it(start,end)
print("Seconds:",t)
print("Token/s",len(outputs[0])/t)

Hello my name is John and I am a software engineer. I have been working in the software industry for the past 10 years. I have worked on various projects, including web development, mobile app development, and data analysis. I have a Bachelor's degree in Computer Science from the University of California, Berkeley. In my free time, I enjoy playing video games, reading books, and spending time with my family and friends. I am passionate about learning new technologies and staying up-to-date with the latest trends in the industry. I am also interested in entrepreneurship and have started my own company, which I am currently working on. I am looking forward to working with you and helping you achieve your goals. Thank you for considering my application. I hope to hear from you soon. Best regards,

[Your Name]
Seconds: 29.420471193
Token/s 6.152178828565643
CPU times: user 27.7 s, sys: 1.38 s, total: 29.1 s
Wall time: 29.4 s


In [10]:
del model_8bit

## INT4 Quantization FP4

In [11]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_4bit = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
print(f"Model size: {model_4bit.get_memory_footprint():,} bytes")

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Model size: 758,307,584 bytes


In [12]:
%%time
text = "Hello my name is"
inputs = tokenizer(text, return_tensors="pt").to(device)
start = time.time_ns()
outputs = model_4bit.generate(**inputs, max_new_tokens=max_token)
end = time.time_ns()
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
t = time_it(start,end)
print("Seconds:",t)
print("Token/s",len(outputs[0])/t)

Hello my name is John and I am 25 years old. I am a student and I am studying in the University of London. I am a very enthusiastic and motivated person. I am very interested in sports and fitness. I have a passion for fitness and I love to work out. I am a very hardworking person and I am always looking for ways to improve my fitness. I am a very active person and I love to go out and exercise. I am very passionate about fitness and I love to share my knowledge and experience with others. I am a very friendly and outgoing person and I love to meet new people and make new friends. I am a very positive and optimistic person and I always see the best in people. I am a very hardworking person and I always put in the effort to achieve my goals. I am a very determined person and I always have a positive attitude towards life. I am a very creative and artistic person and
Seconds: 10.990910984
Token/s 18.651775116587554
CPU times: user 11.1 s, sys: 281 ms, total: 11.3 s
Wall time: 11 s


In [13]:
del model_4bit

## INT4 Quantization NF4

In [14]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
)

model_4bit = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
print(f"Model size: {model_4bit.get_memory_footprint():,} bytes")

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Model size: 758,307,584 bytes


In [15]:
%%time
text = "Hello my name is"
inputs = tokenizer(text, return_tensors="pt").to(device)
start = time.time_ns()
outputs = model_4bit.generate(**inputs, max_new_tokens=max_token)
end = time.time_ns()
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
t = time_it(start,end)
print("Seconds:",t)
print("Token/s",len(outputs[0])/t)

Hello my name is John Smith and I am a student at the University of XYZ. I am currently enrolled in the Bachelor of Science in Computer Science program. I am currently taking 12 credits and have completed 10 credits. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project
Seconds: 10.868202905
Token/s 18.862364071771992
CP

In [16]:
del model_4bit

## Nested 4Bit Quantization

In [17]:
double_quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
)

model_4bit = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
print(f"Model size: {model_4bit.get_memory_footprint():,} bytes")

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Model size: 758,307,584 bytes


In [18]:
%%time
text = "Hello my name is"
inputs = tokenizer(text, return_tensors="pt").to(device)
start = time.time_ns()
outputs = model_4bit.generate(**inputs, max_new_tokens=max_token)
end = time.time_ns()
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
t = time_it(start,end)
print("Seconds:",t)
print("Token/s",len(outputs[0])/t)

Hello my name is John Smith and I am a student at the University of XYZ. I am currently enrolled in the Bachelor of Science in Computer Science program. I am currently taking 12 credits and have completed 10 credits. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project
Seconds: 11.767528937
Token/s 17.42081970628767
CPU

In [19]:
del model_4bit

## All Quantization feature together

In [20]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_4bit = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
print(f"Model size: {model_4bit.get_memory_footprint():,} bytes")

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Model size: 758,307,584 bytes


In [21]:
%%time
text = "Hello my name is"
inputs = tokenizer(text, return_tensors="pt").to(device)
start = time.time_ns()
outputs = model_4bit.generate(**inputs, max_new_tokens=max_token)
end = time.time_ns()
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
t = time_it(start,end)
print("Seconds:",t)
print("Token/s",len(outputs[0])/t)

Hello my name is John Smith and I am a student at the University of XYZ. I am currently enrolled in the Bachelor of Science in Computer Science program. I am currently taking 12 credits and have completed 10 credits. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project
Seconds: 10.655001702
Token/s 19.23979045085656
CPU

| Quant | Memory (GB) | Inference (Tokens/s) |
| ------ | -------- | ------- |
| Full Precision | 4,4 | 40.91 |
| 8bit | 1,2 | 8.2 |
| 4 bit FP4 | 0.750 | 20.72 | 
| 4 bit Normal Float 4 | 0.750 |19.77 |
| Nested 4 bit | 0.758 | 20.21 | 
| All together | 0.750 | 21.98 | 
