## 1. Hugging Face 라이브러리 및 기타 도구 설치

In [1]:
!pip install transformers datasets bitsandbytes accelerate
!pip install llama-cpp-python

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

## 2. Gemma-2 모델 다운로드

### 2.1. GPU 사용하기

In [1]:
import torch

if torch.cuda.is_available():
    print("GPU is available:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available.")

GPU is available: Tesla T4


### 2.2. Hugging Face 로그인 및 모델 불러오기

In [8]:
from huggingface_hub import login

login() # login with HF_Token

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "google/gemma-2-2b"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

## 3. 양자화(Quantization)

### 3.1. GGUF - 4bit

In [4]:
!pip install huggingface_hub



In [10]:
# llama.cpp 설치
!git clone https://github.com/ggerganov/llama.cpp

In [11]:
# %cd llama.cpp

In [7]:
!ls

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# GEMMA-2 모델 로드
model_name = "google/gemma-2-2b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# 모델 저장 경로
model.save_pretrained("./gemma-2-2b")
tokenizer.save_pretrained("./gemma-2-2b")

config.json:   0%|          | 0.00/818 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/481M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

('./gemma-2-2b/tokenizer_config.json',
 './gemma-2-2b/special_tokens_map.json',
 './gemma-2-2b/tokenizer.model',
 './gemma-2-2b/added_tokens.json',
 './gemma-2-2b/tokenizer.json')

In [14]:
!pwd

In [15]:
!python3 convert_hf_to_gguf.py \
    --input_dir ./gemma-2-2b \
    --output_dir ./gguf_gemma2 \
    --quantization 4

In [33]:
from huggingface_hub import hf_hub_download

# 모델 가중치 다운로드
model_bin = hf_hub_download(repo_id="google/gemma-2-2b", filename="pytorch_model.bin")

# 모델 구성 다운로드
config_json = hf_hub_download(repo_id="google/gemma-2-2b", filename="config.json")

EntryNotFoundError: 404 Client Error. (Request ID: Root=1-675fdd9f-7c6b661d612a381b36aa9164;5cd15c8b-de5b-4acf-bfc9-6f1c3ec7bb93)

Entry Not Found for url: https://huggingface.co/google/gemma-2-9b/resolve/main/pytorch_model.bin.

### 3.2. Bitsandbytes - 4bit

In [11]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

bitsandbytes_4bit = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
)

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00006-of-00008.safetensors:   0%|          | 21.0M/4.96G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/2.38G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

### 3.3. AWQ - 4bit

In [1]:
# llm-awq 저장소 클론
!git clone https://github.com/mit-han-lab/llm-awq.git

# 설치 디렉토리로 이동
%cd llm-awq

# 필요한 패키지 설치
!pip install .

fatal: destination path 'llm-awq' already exists and is not an empty directory.
/content/llm-awq
[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'[0m[31m
[0mObtaining file:///content/llm-awq
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: awq
  Building editable for awq (pyproject.toml) ... [?25l[?25hdone
  Created wheel for awq: filename=awq-0.1.0-0.editable-py3-none-any.whl size=9886 sha256=155eb652b4e7f6bcf5983b3ddf1aadd63596263b4dd48600d8e59d4ca66ce6f8
  Stored in directory: /tmp/pip-ephem-wheel-cache-605dwwas/wheels/26/98/ac/1026637af772b6744fe73f3517805cd22f60eb8963bb93ef31
Successfully built awq
Installing collected packages: awq
  Attempting uninstall: awq
    Found existing ins

In [8]:
import awq
print("AWQ success!")

AWQ success!


In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# 모델 이름 설정
model_name = "google/gemma-2-2b"  # 원하는 모델로 변경 가능

# 토크나이저와 모델 로드
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).cuda()

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/818 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/481M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [10]:
# 로컬 경로에 모델 저장
model_path = "./gemma-2-2b"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

print(f"GEMMA-2 모델이 {model_path}에 저장되었습니다!")

GEMMA-2 모델이 ./gemma-2-2b에 저장되었습니다!


In [13]:
!python -m awq.entry \
    --model_path ./gemma-2-2b \
    --w_bit 4 \
    --q_group_size 128 \
    --run_awq \
    --dump_awq ./awq_cache/gemma-2-2b-w4-g128.pt

2024-12-16 08:56:50.392851: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-16 08:56:50.426404: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-16 08:56:50.436779: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Traceback (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/content/llm-awq/awq/entry.py", line 15, in <module>
    from awq.quantize.pre_quant import run_awq, apply_awq
  F

In [15]:
from awq.utils.quantization import load_awq

ModuleNotFoundError: No module named 'awq.utils.quantization'

In [14]:
from awq import AutoAWQForCausalLM

AWQ_4bit = AutoAWQForCausalLM.from_pretrained(
    model_name,
    quantize="awq",  # AWQ 양자화 옵션
    w_bit=4,         # 가중치 4비트
    a_bit=4,         # 활성화 값 4비트
    device_map="auto",
)

ImportError: cannot import name 'AutoAWQForCausalLM' from 'awq' (unknown location)

### 3.4. GPTQ - 4bit


In [19]:
!pip install auto-gptq

Collecting auto-gptq
  Downloading auto_gptq-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting rouge (from auto-gptq)
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting gekko (from auto-gptq)
  Downloading gekko-1.2.1-py3-none-any.whl.metadata (3.0 kB)
Downloading auto_gptq-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.5/23.5 MB[0m [31m57.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gekko-1.2.1-py3-none-any.whl (13.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.2/13.2 MB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge, gekko, auto-gptq
Successfully installed auto-gptq-0.7.1 gekko-1.2.1 rouge-1.0.1


In [20]:
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

quantize_config = BaseQuantizeConfig(
    bits=4,  # 4비트 양자화
    group_size=128,  # 그룹 크기
)

GPTQ_4bit = AutoGPTQForCausalLM.from_pretrained(
    model_name,
    quantize_config=quantize_config,
    trust_remote_code=True,
    device_map="auto"
)




TypeError: gemma2 isn't supported yet.

## 4. 데이터셋 평가

### 4.0. 데이터셋 로드 및 모델 평가 함수 작성

In [12]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=c7888d0f102637df8fa98692882ff5284b64a7404b5d66decbab958df649f772
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [13]:
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

# 1. 데이터셋 로드 - 질의응답(QA) 평가에 널리 사용되는 데이터셋 사용
dataset = load_dataset("squad", split="validation[:10%]")

# 2. 모델 평가 함수
def evaluate_model_with_bleu_rouge(model, tokenizer, dataset):
    bleu_scores = []
    rouge_scores = {"rouge1": [], "rouge2": [], "rougeL": []}
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

    for sample in dataset.select(range(5)):  # 상위 10개 샘플만 평가
        question = sample["question"]
        context = sample["context"]
        reference = sample["answers"]["text"][0]  # 정답 텍스트
        inputs = tokenizer(question, context, return_tensors="pt").to("cuda")
        outputs = model.generate(**inputs, max_new_tokens=50)
        hypothesis = tokenizer.decode(outputs[0], skip_special_tokens=True)  # 모델의 답변

        # BLEU 점수 계산
        bleu_score = sentence_bleu([reference.split()], hypothesis.split())
        bleu_scores.append(bleu_score)

        # ROUGE 점수 계산
        rouge = scorer.score(reference, hypothesis)
        rouge_scores["rouge1"].append(rouge["rouge1"].fmeasure)
        rouge_scores["rouge2"].append(rouge["rouge2"].fmeasure)
        rouge_scores["rougeL"].append(rouge["rougeL"].fmeasure)

        # 출력 결과
        print(f"Q: {question}")
        print(f"Model Answer: {hypothesis}")
        print(f"Reference Answer: {reference}")
        print(f"BLEU Score: {bleu_score:.4f}")
        print(f"ROUGE Scores: R1={rouge['rouge1'].fmeasure:.4f}, R2={rouge['rouge2'].fmeasure:.4f}, RL={rouge['rougeL'].fmeasure:.4f}\n")

    # 평균 BLEU 및 ROUGE 점수 반환
    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    avg_rouge = {key: sum(values) / len(values) for key, values in rouge_scores.items()}

    return avg_bleu, avg_rouge


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

### 4.2. Bitsandbytes-4bit 평가

In [14]:
average_bleu, average_rouge = evaluate_model_with_bleu_rouge(bitsandbytes_4bit, tokenizer, dataset)

print(f"Average BLEU Score: {average_bleu:.4f}")
print(f"Average ROUGE Scores: R1={average_rouge['rouge1']:.4f}, R2={average_rouge['rouge2']:.4f}, RL={average_rouge['rougeL']:.4f}")

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Q: Which NFL team represented the AFC at Super Bowl 50?
Model Answer: Which NFL team represented the AFC at Super Bowl 50?Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50. The game was the first Super Bowl to be played in the San Francisco Bay Area, and the first to be play

## 5. 과제 수행과 시간 분배

- Hugging Face 토큰발급 및 인증 (약 5분)  
- GPT 사용한 양자화 코드 작성_실행시간 포함
  - GGUF (약 30분)
  - Bitsandbytes (약 20분)
  - AWQ (약 5분)
  - GPTQ (약 5분)
- 제출을 위한 양식 정리 (약 10분)