## Lab 2. Compression

In [1]:
!pip install autoawq==0.2.9 transformers==4.51.3 torch accelerate datasets

Collecting autoawq==0.2.9
  Downloading autoawq-0.2.9.tar.gz (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.3/74.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers==4.51.3
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting tokenizers>=0.12.1 (from autoawq==0.2.9)
  Downloading tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m89.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hDownloading tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m103.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: autoawq
  Building wheel for au

### Квантование (int4), калибровка.

In [6]:
import torch
import os
import gc
import numpy as np
import random
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
from huggingface_hub import model_info

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) RNGs and pin cuDNN flags.

    Args:
        seed: Integer seed passed to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Request deterministic cuDNN kernels and turn off the autotuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

def get_model_size_gb(model_id):
    """Return the total size of a Hub repo's weight files in GiB.

    Files are selected by filename suffix, in order of preference:
    ``.safetensors`` first, then ``.bin``, then ``.pt`` / ``.pth`` / ``.ckpt``.
    Only the first non-empty group is counted, so a repo that ships the same
    weights in several formats is not counted twice.

    Args:
        model_id: Hub repo id passed to ``model_info``.

    Returns:
        Size in GiB (bytes / 1024**3). Returns 0 when the metadata request
        fails; the error is printed instead of raised so the caller can
        continue.
    """
    def get_file_size(file_obj):
        """Size in bytes of one repo file, preferring LFS metadata; 0 if unknown."""
        lfs = getattr(file_obj, 'lfs', None)
        if lfs is not None:
            # LFS metadata may be a mapping or an attribute-style object.
            lfs_size = lfs.get('size') if isinstance(lfs, dict) else getattr(lfs, 'size', None)
            if lfs_size is not None:
                return lfs_size
        size = getattr(file_obj, 'size', None)
        return size if size is not None else 0

    def total_size(files, suffixes):
        """Sum the sizes of the files whose name ends with one of `suffixes`."""
        return sum(get_file_size(f) for f in files if f.rfilename.endswith(suffixes))

    try:
        info = model_info(model_id, files_metadata=True)
        siblings = info.siblings or []

        # Match on the real suffix so sidecar files such as "weights.pt.sha256"
        # are never counted as weights.
        final_size = 0
        for suffixes in (('.safetensors',), ('.bin',), ('.pt', '.pth', '.ckpt')):
            final_size = total_size(siblings, suffixes)
            if final_size > 0:
                break

        return final_size / (1024**3)
    except Exception as e:
        print(f"Ошибка при получении размера: {e}")
        return 0

def compress_model(model_id="Qwen/Qwen3-8B", quant_path="./qwen3-8b-awq"):
    """Quantize a causal LM to 4-bit AWQ, save it, and print the compression ratio.

    Steps: seed the RNGs, look up the original weight size on the Hub, load the
    model in float16, run AWQ calibration/quantization, save the quantized
    weights and tokenizer to ``quant_path``, then compare on-disk sizes.

    Args:
        model_id: Hub repo id of the model to quantize.
        quant_path: Local directory that receives the quantized model and tokenizer.

    Returns:
        None. Progress and the final size comparison are printed.
    """
    seed_everything(42)

    size_orig = get_model_size_gb(model_id)
    print(f"Оригинальный размер модели: {size_orig:.2f} GB")

    # 4-bit weights with zero points, quantized in groups of 128, GEMM kernel layout.
    quant_config = {
        "zero_point": True,
        "q_group_size": 128,
        "w_bit": 4,
        "version": "GEMM"
    }

    print("\nЗагрузка модели для квантования...")
    model = AutoAWQForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    print("\nНачало калибровки")
    # Small calibration budget: 32 samples of at most 512 tokens, one at a time
    # (presumably to keep peak GPU memory low — confirm against the AutoAWQ docs).
    model.quantize(
        tokenizer,
        quant_config=quant_config,
        n_parallel_calib_samples=1,
        max_calib_seq_len=512,
        max_calib_samples=32
    )

    print(f"\nСохранение модели в {quant_path}")
    model.save_quantized(quant_path)
    tokenizer.save_pretrained(quant_path)

    # Count only real weight files: match on the suffix so that sidecar files
    # such as "model.safetensors.index.json" are not included in the size.
    weight_suffixes = ('.safetensors', '.bin', '.pt')
    size_comp_bytes = sum(
        os.path.getsize(os.path.join(quant_path, fname))
        for fname in os.listdir(quant_path)
        if fname.endswith(weight_suffixes)
    )
    size_comp = size_comp_bytes / (1024**3)

    # Guard against division by zero when no weight files were found.
    ratio = size_orig / size_comp if size_comp > 0 else 0

    print("\n" + "="*30)
    print(f"РЕЗУЛЬТАТЫ СЖАТИЯ:")
    print(f"Оригинальный размер: {size_orig:.2f} GB")
    print(f"Сжатый размер: {size_comp:.2f} GB")
    print(f"Compression Ratio: {ratio:.2f}x")
    print("="*30)


# Run the quantization when this cell/script is executed directly
# (inside a notebook kernel __name__ is "__main__", so this fires on cell run).
if __name__ == "__main__":
    compress_model()

I have left this message as the final dev message to help you transition.

Important Notice:
- AutoAWQ is officially deprecated and will no longer be maintained.
- The last tested configuration used Torch 2.6.0 and Transformers 4.51.3.
- If future versions of Transformers break AutoAWQ compatibility, please report the issue to the Transformers project.

Alternative:
- AutoAWQ has been adopted by the vLLM Project: https://github.com/vllm-project/llm-compressor

For further inquiries, feel free to reach out:
- X: https://x.com/casper_hansen_
- LinkedIn: https://www.linkedin.com/in/casper-hansen-804005170/

  return datetime.utcnow().replace(tzinfo=utc)
2025-12-22 10:40:47.192801: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766400047.370887      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin

Оригинальный размер модели: 15.26 GB

Загрузка модели для квантования...


config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

LICENSE: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.19G [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]


Начало калибровки


  return datetime.utcnow().replace(tzinfo=utc)


README.md:   0%|          | 0.00/167 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


val.jsonl.zst:   0%|          | 0.00/471M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/214670 [00:00<?, ? examples/s]

AWQ: 100%|██████████| 36/36 [39:31<00:00, 65.89s/it]
  return datetime.utcnow().replace(tzinfo=utc)



Сохранение модели в ./qwen3-8b-awq

РЕЗУЛЬТАТЫ СЖАТИЯ:
Оригинальный размер: 15.26 GB
Сжатый размер: 5.68 GB
Compression Ratio: 2.69x


In [None]:
# !pip uninstall -y huggingface_hub
# !pip install "huggingface_hub<1.0"

In [None]:
# import os
# from huggingface_hub import HfApi

# HF_TOKEN = "..." 
# USERNAME = "NOVORDSEC" 
# REPO_NAME = "qwen3-8b-awq-int4"
# LOCAL_DIR = "./qwen3-8b-awq"

# repo_id = f"{USERNAME}/{REPO_NAME}"

# api = HfApi(token=HF_TOKEN)

# print(f"Проверка репозитория {repo_id}...")
# try:
#     api.create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)
#     print("Репозиторий готов.")
# except Exception as e:
#     print(f"Ошибка при создании: {e}")

# print("Загрузка...")
# try:
#     api.upload_folder(
#         folder_path=LOCAL_DIR,
#         repo_id=repo_id,
#         repo_type="model"
#     )
#     print(f"\nСсылка: https://huggingface.co/{repo_id}")
# except Exception as e:
#     print(f"Ошибка при загрузке: {e}")

### Оценка на 20% MMLU базовой и квантованной моделей

Оценку на 100% MMLU мы тоже проводили и убедились, что результат очень близок к полученному на 20%.

In [16]:
import torch
import os
import gc
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
from awq import AutoAWQForCausalLM
from datasets import load_dataset
from huggingface_hub import model_info

def seed_everything(seed=42):
    """Seed the Python, NumPy, PyTorch (CPU and CUDA) and transformers RNGs.

    Args:
        seed: Integer seed passed to every seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        set_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Seed every RNG once before the evaluation code in this cell runs.
seed_everything(42)

# Hub repo ids: the baseline model and the AWQ-quantized model compared against it.
ORIG_MODEL_ID = "Qwen/Qwen3-8B"
COMP_MODEL_ID = "NOVORDSEC/qwen3-8b-awq-int4" 
# Share of each MMLU subject's test split that is evaluated (passed to run_mmlu_benchmark).
FRACTION = 0.2  

def get_model_size_gb(model_id):
    """Return the total size of a Hub repo's weight files in GiB.

    Files are selected by filename suffix. ``.safetensors`` files are preferred;
    ``.bin`` files are counted only when no safetensors weights are present, so
    a repo that ships both formats is not counted twice. Matching on the suffix
    (not on a substring) keeps ``model.safetensors.index.json`` out of the total.

    Args:
        model_id: Hub repo id passed to ``model_info``.

    Returns:
        Size in GiB (bytes / 1024**3), or 0.0 when the metadata request fails;
        the error is printed instead of raised.
    """
    def file_size(sibling):
        """Size in bytes of one repo file; falls back to LFS metadata, else 0."""
        size = getattr(sibling, 'size', None)
        if size:
            return size
        lfs = getattr(sibling, 'lfs', None)
        if lfs is None:
            return 0
        # LFS metadata may be a mapping or an attribute-style object.
        lfs_size = lfs.get('size') if isinstance(lfs, dict) else getattr(lfs, 'size', None)
        return lfs_size or 0

    try:
        info = model_info(model_id, files_metadata=True)
        siblings = info.siblings or []

        size = 0
        for suffix in ('.safetensors', '.bin'):
            size = sum(file_size(s) for s in siblings if s.rfilename.endswith(suffix))
            if size > 0:
                break
        return size / (1024**3)
    except Exception as e:
        print(f"Ошибка получения размера для {model_id}: {e}")
        return 0.0

def run_mmlu_benchmark(model, tokenizer, fraction=0.2, desc="Benchmarking"):
    """Zero-shot MMLU accuracy from the next-token logits of the four answer letters.

    For every subject, the first ``fraction`` of the "test" split (at least one
    question) is evaluated. Each question is rendered as a multiple-choice
    prompt ending in "Answer:", and the prediction is whichever of the tokens
    " A", " B", " C", " D" has the highest logit at the last position.

    Args:
        model: Callable causal LM whose output exposes ``.logits``.
        tokenizer: Tokenizer used both for the prompts and for the answer-letter ids.
        fraction: Share of each subject's test split to evaluate.
        desc: Label shown on the progress bar.

    Returns:
        ``(overall_accuracy, per_subject_accuracy)``. Overall accuracy is pooled
        over all evaluated questions. A subject that raises is reported via
        ``print`` and left out of both results.
    """
    subjects = [
        'abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge',
        'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics',
        'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics',
        'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic',
        'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science',
        'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics',
        'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics',
        'high_school_physics', 'high_school_psychology', 'high_school_statistics',
        'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality',
        'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning',
        'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes',
        'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting',
        'professional_law', 'professional_medicine', 'professional_psychology',
        'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions'
    ]

    # Wrappers without a `.device` attribute fall back to their first parameter's device.
    try:
        device = model.device
    except AttributeError:
        device = next(model.parameters()).device

    letters = ("A", "B", "C", "D")
    # Last token id of " A" .. " D": the candidates compared after "Answer:".
    answer_token_ids = [
        tokenizer.encode(f" {letter}", add_special_tokens=False)[-1]
        for letter in letters
    ]

    per_subject_accuracy = {}
    correct_overall = 0
    asked_overall = 0

    model.eval()
    for subject in tqdm(subjects, desc=desc):
        try:
            test_split = load_dataset("cais/mmlu", subject, split="test")
            subset_size = max(1, int(len(test_split) * fraction))
            subset = test_split.select(range(subset_size))

            correct_in_subject = 0
            for row in subset:
                header = f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.\n\n"
                question_line = f"{row['question']}\n"
                option_lines = "".join(
                    f"({letter}) {row['choices'][idx]}\n"
                    for idx, letter in enumerate(letters)
                )
                prompt = header + question_line + option_lines + "Answer:"

                encoded = tokenizer(prompt, return_tensors="pt").to(device)

                with torch.inference_mode():
                    last_position_logits = model(**encoded).logits[0, -1, :]
                    predicted_index = torch.argmax(last_position_logits[answer_token_ids]).item()

                correct_in_subject += int(predicted_index == row['answer'])

            # Totals are updated only after the whole subject finished without error.
            per_subject_accuracy[subject] = correct_in_subject / subset_size
            correct_overall += correct_in_subject
            asked_overall += subset_size
        except Exception as e:
            print(f"Ошибка в теме {subject}: {e}")

    overall_accuracy = correct_overall / asked_overall if asked_overall > 0 else 0
    return overall_accuracy, per_subject_accuracy


# --- Compression ratio from the Hub-reported weight sizes of both repos ---
size_orig = get_model_size_gb(ORIG_MODEL_ID)
size_comp = get_model_size_gb(COMP_MODEL_ID)
# A zero compressed size (e.g. failed lookup) yields ratio 0 instead of a division error.
ratio = size_orig / size_comp if size_comp > 0 else 0
print(f"Original: {size_orig:.2f} GB | Compressed: {size_comp:.2f} GB | Ratio: {ratio:.2f}x")

# --- Baseline: evaluate the original float16 model ---
print("\n>>> Оценка ОРИГИНАЛЬНОЙ модели...")
tokenizer = AutoTokenizer.from_pretrained(ORIG_MODEL_ID, trust_remote_code=True)
model_orig = AutoModelForCausalLM.from_pretrained(
    ORIG_MODEL_ID, 
    torch_dtype=torch.float16, 
    device_map="auto", 
    trust_remote_code=True
)

orig_acc, orig_detailed = run_mmlu_benchmark(model_orig, tokenizer, fraction=FRACTION, desc="Original Model")

# Drop the baseline model and release cached GPU memory before loading the second model.
del model_orig
gc.collect()
torch.cuda.empty_cache()

# --- Compressed: evaluate the AWQ-quantized model with its own tokenizer ---
print("\n>>> Оценка СЖАТОЙ модели (AWQ)...")
model_quant = AutoAWQForCausalLM.from_quantized(
    COMP_MODEL_ID, 
    fuse_layers=True, 
    device_map="auto", 
    trust_remote_code=True
)
tokenizer_quant = AutoTokenizer.from_pretrained(COMP_MODEL_ID, trust_remote_code=True)

comp_acc, comp_detailed = run_mmlu_benchmark(model_quant, tokenizer_quant, fraction=FRACTION, desc="Compressed Model")

# Relative accuracy drop versus the baseline; 0 when the baseline accuracy is 0.
drop = (orig_acc - comp_acc) / orig_acc if orig_acc > 0 else 0
# Score = compression ratio penalised by the drop; a negative drop (improvement) is clamped to 0.
score = ratio / (1 + max(0, drop))

print("\n" + "="*40)
print(f"ИТОГИ ЭТАПА 1:")
print(f"Compression Ratio: {ratio:.4f}")
print(f"Baseline Accuracy: {orig_acc:.4f}")
print(f"Compressed Accuracy: {comp_acc:.4f}")
print(f"Performance Drop: {drop*100:.2f}%")
print(f"ФИНАЛЬНЫЙ SCORE: {score:.4f}")
print("="*40)

# --- Per-subject comparison table ---
# Rows follow the subjects present in the baseline results; a subject missing
# from either result dict is reported with accuracy 0.0.
subjects = sorted(list(orig_detailed.keys()))
comparison_data = []
for s in subjects:
    comparison_data.append({
        "Subject": s,
        "Original_Acc": orig_detailed.get(s, 0.0),
        "Compressed_Acc": comp_detailed.get(s, 0.0),
        "Diff": orig_detailed.get(s, 0.0) - comp_detailed.get(s, 0.0)
    })

# Persist the table as CSV in the working directory.
df_final = pd.DataFrame(comparison_data)
df_final.to_csv("mmlu_comparison_detailed.csv", index=False)
print(f"Детальный отчет сохранен в mmlu_comparison_detailed.csv")

Original: 15.26 GB | Compressed: 5.68 GB | Ratio: 2.69x

>>> Оценка ОРИГИНАЛЬНОЙ модели...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

  return datetime.utcnow().replace(tzinfo=utc)
Original Model: 100%|██████████| 57/57 [36:58<00:00, 38.93s/it] 



>>> Оценка СЖАТОЙ модели (AWQ)...


Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

Replacing layers...: 100%|██████████| 36/36 [00:15<00:00,  2.26it/s]


  0%|          | 0/902 [00:00<?, ?w/s]

  0%|          | 0/1 [00:00<?, ?w/s]

  return datetime.utcnow().replace(tzinfo=utc)
Compressed Model: 100%|██████████| 57/57 [32:25<00:00, 34.14s/it] 


ИТОГИ ЭТАПА 1:
Compression Ratio: 2.6861
Baseline Accuracy: 0.7292
Compressed Accuracy: 0.7167
Performance Drop: 1.72%
ФИНАЛЬНЫЙ SCORE: 2.6407
Детальный отчет сохранен в mmlu_comparison_detailed.csv





In [6]:
from huggingface_hub import model_info

def get_params_from_hub(model_id):
    """Look up a model's parameter count from its Hub metadata.

    The safetensors ``total`` field is used when present. Otherwise the repo
    tags are scanned for a ``params:<number><B|M>`` entry, which is converted
    to an absolute count.

    Args:
        model_id: Hub repo id passed to ``model_info``.

    Returns:
        The parameter count, or None when the Hub request fails or the
        metadata carries no usable value.
    """
    def read_field(container, key):
        """Read `key` from a mapping or an attribute-style object; None if absent."""
        if isinstance(container, dict):
            return container.get(key)
        return getattr(container, key, None)

    print(f"Запрос метаданных для {model_id}...")
    try:
        info = model_info(model_id)
    except Exception as e:
        print(f"Ошибка доступа к Hub для {model_id}: {e}")
        return None

    # The safetensors metadata may be a mapping or a plain object; handle both.
    total = read_field(getattr(info, 'safetensors', None), 'total')
    if total is not None:
        return total

    for tag in getattr(info, 'tags', None) or []:
        if "params:" not in tag:
            continue
        val = tag.replace("params:", "").upper()
        for suffix, scale in (('B', 1e9), ('M', 1e6)):
            if suffix in val:
                try:
                    return float(val.replace(suffix, '')) * scale
                except ValueError:
                    # Malformed tag: ignore it and keep scanning the remaining tags.
                    break

    return None

def format_big_num(n):
    """Render a parameter count as a short human-readable string.

    Args:
        n: The count, or None when it could not be determined.

    Returns:
        A "could not determine" message for None, the value in billions or
        millions with two decimals when it reaches that scale, otherwise the
        number with thousands separators.
    """
    if n is None:
        return "Не удалось определить"

    scales = (
        (1e9, "B (миллиардов)"),
        (1e6, "M (миллионов)"),
    )
    for threshold, unit in scales:
        if n >= threshold:
            return f"{n / threshold:.2f}{unit}"

    return f"{n:,}"

# Hub repo ids of the baseline and the quantized model whose parameter counts are compared.
ORIG_ID = "Qwen/Qwen3-8B"
COMP_ID = "NOVORDSEC/qwen3-8b-awq-int4"

# Either value is None when the Hub lookup failed or returned no usable metadata.
p_orig = get_params_from_hub(ORIG_ID)
p_comp = get_params_from_hub(COMP_ID)

# Print both counts, followed by the authors' remarks on the result.
print("\n" + "="*50)
print(f"Количество параметров:")
print(f"Оригинальная модель: {format_big_num(p_orig)}")
print(f"Сжатая модель:     {format_big_num(p_comp)}")
print(f"Вроде логично, что не упало, мы только квантовали.")
print(f"Изначально забыли посчитать, потому что в формуле участвует размер в гигабайтах")


Запрос метаданных для Qwen/Qwen3-8B...
Запрос метаданных для NOVORDSEC/qwen3-8b-awq-int4...

Количество параметров:
Оригинальная модель: 8.19B (миллиардов)
Сжатая модель:     8.19B (миллиардов)
Вроде логично, что не упало, мы только квантовали.
Изначально забыли посчитать, потому что в формуле участвует размер в гигабайтах
