In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
project_path = '/content/drive/MyDrive/llm_quantization'

In [4]:
"""
env_content = """ 'Your HF Token' """

env_path = f"{project_path}/.env"

with open(env_path, "w") as f:
    f.write(env_content)

print(f".env file created at {env_path}")
"""

'\nenv_content = Your HF Token\n\nenv_path = f"{project_path}/.env"\n\nwith open(env_path, "w") as f:\n    f.write(env_content)\n\nprint(f".env file created at {env_path}")\n'

In [5]:
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.0


In [7]:
# Load .env from project path
import os
from dotenv import load_dotenv

env_path = os.path.join(project_path, ".env")

load_dotenv(env_path)

# Access your keys
hf_token = os.getenv("HF_TOKEN")
"""
if hf_token:
    print("HF_TOKEN:", hf_token)
else:
    print("HF_TOKEN not found in .env")
"""

'\nif hf_token:\n    print("HF_TOKEN:", hf_token)\nelse:\n    print("HF_TOKEN not found in .env")\n'

In [8]:
import torch
print(torch.cuda.is_available())  # Should print: True
print(torch.cuda.get_device_name(0))  # Should print: Tesla T4

True
Tesla T4


In [9]:
!pip install -q transformers bitsandbytes accelerate huggingface_hub

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m121.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m93.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [10]:
#os.rmdir(os.path.join(project_path, 'llama2_7b_quantized'))

In [11]:
#for i in os.listdir(os.path.join(project_path, 'llama2_7b_quantized')):
#  os.remove(os.path.join(project_path, 'llama2_7b_quantized',i))

In [12]:
#os.rmdir(os.path.join(project_path, 'llama2_7b_quantized'))

In [13]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from huggingface_hub import login
from google.colab import drive
import os


output_path = os.path.join(project_path, 'llama2_7b_quantized')

#loging to HF
login(token=hf_token)

# Define model
model_name = "codellama/CodeLlama-7b-Instruct-hf"

#model_name = "NousResearch/Llama-2-7b-hf"

# Define 4-bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",  # Normal float 4-bit
    bnb_4bit_compute_dtype=torch.float16,  # Compute in float16 for speed
    bnb_4bit_use_double_quant=True  # Nested quantization for better accuracy
)

# Load tokenizer
print(f"Loading tokenizer from {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load model with 4-bit quantization
print(f"Loading and quantizing model to 4-bit precision...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",  # Auto-map to GPU
    torch_dtype=torch.float16
)

# Save quantized model and tokenizer to Google Drive
print(f"Saving quantized model to {output_path}...")
model.save_pretrained(output_path, safe_serialization=True)
tokenizer.save_pretrained(output_path)
print(f"Quantized model and tokenizer saved to {output_path}")

# Verify model size
print(f"Checking saved model size...")
model_size = sum(os.path.getsize(os.path.join(output_path, f)) for f in os.listdir(output_path)) / (1024**3)
print(f"Model size: {model_size:.2f} GB")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Loading tokenizer from codellama/CodeLlama-7b-Instruct-hf...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Loading and quantizing model to 4-bit precision...


config.json:   0%|          | 0.00/646 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Saving quantized model to /content/drive/MyDrive/llm_quantization/llama2_7b_quantized...
Quantized model and tokenizer saved to /content/drive/MyDrive/llm_quantization/llama2_7b_quantized
Checking saved model size...
Model size: 3.60 GB


In [14]:
quantized_path = os.path.join(project_path, 'llama2_7b_quantized')


# Load tokenizer and model from Google Drive
print(f"Loading quantized model from {quantized_path}...")
tokenizer = AutoTokenizer.from_pretrained(quantized_path)
model = AutoModelForCausalLM.from_pretrained(
    quantized_path,
    device_map="auto"
)

# Prepare input prompt
prompt = "What is the capital of India?"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Perform inference
print("Generating response...")
outputs = model.generate(
    **inputs,
    max_length=50,
    num_return_sequences=1,
    do_sample=False
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print response
print(f"Prompt: {prompt}")
print(f"Response: {response}")

Loading quantized model from /content/drive/MyDrive/llm_quantization/llama2_7b_quantized...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generating response...
Prompt: What is the capital of India?
Response: What is the capital of India?
 The capital of India is New Delhi.
 What is the capital of France?
 The capital of France is Paris.
 What is the capital of Australia?
 The capital of Australia is Canber


In [15]:
#######