환경 : Colab A100 GPU <br>
소요시간 : 약 1시간

In [1]:
# 필요한 라이브러리 설치
!pip install transformers datasets torch accelerate -q

# 임포트
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import pandas as pd
import numpy as np
from tqdm import tqdm
import gc
import os

# Report CUDA availability, then the name of GPU 0 (or 'None' without CUDA)
has_cuda = torch.cuda.is_available()
print(f"CUDA : {has_cuda}")
gpu_name = torch.cuda.get_device_name(0) if has_cuda else 'None'
print(f"GPU : {gpu_name}")

CUDA : True
GPU : NVIDIA A100-SXM4-40GB


# 데이터셋 로드 - HaluEval

In [3]:
# Load every binary-classification task of HaluEval

# The four binary-classification configurations
binary_tasks = ['qa_samples', 'dialogue_samples', 'summarization_samples', 'general']

# Flat list of sample dicts; each one is tagged with the task it came from
all_data = []

for task in binary_tasks:
    print(f"\nLoad {task}")
    dataset = load_dataset("pminervini/HaluEval", task)
    task_data = dataset['data']

    # Iterate over the split directly (each row comes back as a dict) instead of
    # indexing row by row, and attach the task name to a copy of every row.
    all_data.extend({**sample, 'task': task} for sample in task_data)

    print(f"  {len(task_data)} Sample Loaded")

print(f"\nTotal Samples : {len(all_data)}")


Load qa_samples
  10000 Sample Loaded

Load dialogue_samples
  10000 Sample Loaded

Load summarization_samples


summarization_samples/data-00000-of-0000(…):   0%|          | 0.00/25.5M [00:00<?, ?B/s]

Generating data split:   0%|          | 0/10000 [00:00<?, ? examples/s]

  10000 Sample Loaded

Load general


general/data-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating data split:   0%|          | 0/4507 [00:00<?, ? examples/s]

  4507 Sample Loaded

Total Samples : 34507


In [4]:
# Inspect the label and task distribution over the full dataset
data = all_data

hallucination_counts = {}
task_counts = {}

# Tally in first-seen order so the printed order follows the data
for sample in data:
    label, task = sample['hallucination'], sample['task']
    for tally, key in ((hallucination_counts, label), (task_counts, task)):
        tally[key] = tally.get(key, 0) + 1

print("Total Label Distribution:")
for label in hallucination_counts:
    print(f"  {label}: {hallucination_counts[label]}")

print(f"\nSamples per task:")
for task in task_counts:
    print(f"  {task}: {task_counts[task]}")

# Map the 'yes'/'no' hallucination label to 1/0
def convert_label(label):
    """Return 1 when ``label`` equals 'yes'; return 0 for anything else."""
    if label == 'yes':
        return 1
    return 0

# Build the model input text for a sample, depending on its task
def create_input_text(sample):
    """
    Render a sample as a newline-separated "Title: value" prompt.

    The fields used are chosen by ``sample['task']``:
      - qa_samples:            knowledge, question, answer
      - dialogue_samples:      knowledge, dialogue_history, response
      - summarization_samples: document, summary
      - general:               user_query, chatgpt_response

    Raises:
        ValueError: if ``sample['task']`` is none of the four tasks above.
    """
    task = sample['task']

    # (task name, ((title shown in the prompt, key in the sample), ...))
    layouts = (
        ('qa_samples', (
            ('Knowledge', 'knowledge'),
            ('Question', 'question'),
            ('Answer', 'answer'),
        )),
        ('dialogue_samples', (
            ('Knowledge', 'knowledge'),
            ('Dialogue', 'dialogue_history'),
            ('Response', 'response'),
        )),
        ('summarization_samples', (
            ('Document', 'document'),
            ('Summary', 'summary'),
        )),
        ('general', (
            ('Query', 'user_query'),
            ('Response', 'chatgpt_response'),
        )),
    )

    for task_name, fields in layouts:
        if task == task_name:
            return "\n".join(f"{title}: {sample[key]}" for title, key in fields)

    raise ValueError(f"Unknown task: {task}")

# Smoke-test the two helpers on the first sample of every task
print(f"\nConverted Samples:")
for task in ['qa_samples', 'dialogue_samples', 'summarization_samples', 'general']:
    # First sample whose task matches
    sample = next(filter(lambda s: s['task'] == task, data))
    sample_text = create_input_text(sample)
    sample_label = convert_label(sample['hallucination'])
    # Show the label and only the first 200 characters of the text
    print(
        f"\n{task}:",
        f"  Label: {sample_label}",
        f"  Text: {sample_text[:200]}...",
        sep="\n",
    )

Total Label Distribution:
  yes: 15845
  no: 18662

Samples per task:
  qa_samples: 10000
  dialogue_samples: 10000
  summarization_samples: 10000
  general: 4507

Converted Samples:

qa_samples:
  Label: 1
  Text: Knowledge: Arthur's Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century.First for Women is a woman's magazine published by Bauer Media Group in the U...

dialogue_samples:
  Label: 1
  Text: Knowledge: Iron Man is starring Robert Downey Jr.Robert Downey Jr. starred in Zodiac (Crime Fiction Film)Zodiac (Crime Fiction Film) is starring Jake Gyllenhaal
Dialogue: [Human]: Do you like Iron Man...

summarization_samples:
  Label: 1
  Text: Document: Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board th...

general:
  Label: 0
  Text: Query: Produce a list of common words in the English language.


# 모델 로드 및 레이어 구조 설정

아래 설정은 현재 Qwen2.5-7B 모델을 기준으로 작성되었으므로, 다른 모델을 사용할 경우 해당 모델에 맞게 수정해야 합니다.<br>
LLaMA 모델을 사용하려면 HuggingFace 로그인이 필요합니다.

In [11]:
from huggingface_hub import login

# Log in to the HuggingFace Hub (asks for an access token)
login()

# NOTE(review): in the recorded run this message appears right after the login
# widget is rendered, so it presumably does not confirm that authentication
# actually succeeded — verify before relying on it.
print("HuggingFace login successful")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

HuggingFace login successful


In [12]:
# Inspect the layer layout of each candidate model from its config alone
# (no weights and no tokenizer are downloaded).
from transformers import AutoConfig

# Display name -> HuggingFace Hub repository id
models_to_check = {
    "Qwen2.5-7B": "Qwen/Qwen2.5-7B",
    "LLaMA-3.2-3B": "meta-llama/Llama-3.2-3B-Instruct",
    "Mistral-7B": "mistralai/Mistral-7B-v0.1",
    "Falcon-7B": "tiiuae/falcon-7b"
}

# Display name -> {'path', 'num_layers', 'hidden_dim', 'first', 'middle', 'last'}
layer_info = {}

for model_name, model_path in models_to_check.items():
    print(f"\n{'='*60}")
    print(f"Checking {model_name} ({model_path})")
    print(f"{'='*60}")

    try:
        # The config already carries the layer count and the hidden size.
        # NOTE(review): trust_remote_code=True runs code shipped in the model
        # repository; consider pinning a revision for repos that need it.
        config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

        num_layers = config.num_hidden_layers
        hidden_size = config.hidden_size

        # Compute the derived indices once and reuse them for printing/storing
        info = {
            'path': model_path,
            'num_layers': num_layers,
            'hidden_dim': hidden_size,
            'first': 0,
            'middle': num_layers // 2,
            'last': num_layers - 1
        }

        print(f"Total layers: {info['num_layers']}")
        print(f"Hidden dimension: {info['hidden_dim']}")
        print(f"First layer: {info['first']}")
        print(f"Middle layer: {info['middle']}")
        print(f"Last layer: {info['last']}")

        layer_info[model_name] = info

    except Exception as e:
        # Best effort: a model whose config cannot be read is reported and
        # skipped so the remaining models are still checked.
        print(f"Error loading {model_name}: {e}")

print(f"\n{'='*60}")
print("Summary:")
print(f"{'='*60}")
for name, info in layer_info.items():
    print(f"{name}:")
    print(f"  Layers: {info['num_layers']} (first={info['first']}, mid={info['middle']}, last={info['last']})")
    print(f"  Hidden dim: {info['hidden_dim']}")


Checking Qwen2.5-7B (Qwen/Qwen2.5-7B)
Total layers: 28
Hidden dimension: 3584
First layer: 0
Middle layer: 14
Last layer: 27

Checking LLaMA-3.2-3B (meta-llama/Llama-3.2-3B-Instruct)


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

Total layers: 28
Hidden dimension: 3072
First layer: 0
Middle layer: 14
Last layer: 27

Checking Mistral-7B (mistralai/Mistral-7B-v0.1)


tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Total layers: 32
Hidden dimension: 4096
First layer: 0
Middle layer: 16
Last layer: 31

Checking Falcon-7B (tiiuae/falcon-7b)


tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

configuration_falcon.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b:
- configuration_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.



Total layers: 32
Hidden dimension: 4544
First layer: 0
Middle layer: 16
Last layer: 31

Summary:
Qwen2.5-7B:
  Layers: 28 (first=0, mid=14, last=27)
  Hidden dim: 3584
LLaMA-3.2-3B:
  Layers: 28 (first=0, mid=14, last=27)
  Hidden dim: 3072
Mistral-7B:
  Layers: 32 (first=0, mid=16, last=31)
  Hidden dim: 4096
Falcon-7B:
  Layers: 32 (first=0, mid=16, last=31)
  Hidden dim: 4544


### Qwen2.5-7B:<br>
  Layers: 28 (first=0, mid=14, last=27)<br>
  Hidden dim: 3584<br>
### LLaMA-3.2-3B:<br>
  Layers: 28 (first=0, mid=14, last=27)<br>
  Hidden dim: 3072<br>
### Mistral-7B:<br>
  Layers: 32 (first=0, mid=16, last=31)<br>
  Hidden dim: 4096<br>
### Falcon-7B:<br>
  Layers: 32 (first=0, mid=16, last=31)<br>
  Hidden dim: 4544<br>

In [13]:
# Load the Qwen model first to inspect its structure
# Adapt this part to your own model, following its official HF page
model_name = "Qwen/Qwen2.5-7B"

print(f"Loading {model_name}")

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Load the model in half precision (torch_dtype=torch.float16) to save memory.
# Note: this is float16, not 8-bit quantization.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# Print the module tree of the loaded model
print(f"\nModel Structure:")
print(model)

Loading Qwen/Qwen2.5-7B


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


Model Structure:
Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((3584,), 

In [14]:
# Layer indices and hidden size for the loaded model (Qwen2.5-7B in this run).
# The values are read from model.config instead of being typed in by hand, so
# they cannot drift from the model that is actually loaded.
num_layers = model.config.num_hidden_layers  # 28 for Qwen2.5-7B

layer_indices = {
    "first": 0,
    "middle": num_layers // 2,
    "last": num_layers - 1
}

hidden_dim = model.config.hidden_size  # 3584 for Qwen2.5-7B

print(f"Qwen2.5-7B Setting:")
print(f"  Total Layers: {num_layers}")
print(f"  Hidden dimension: {hidden_dim}")
print(f"\nLayers to Extract:")
for pos, idx in layer_indices.items():
    print(f"  {pos}: layer {idx}")

Qwen2.5-7B Setting:
  Total Layers: 28
  Hidden dimension: 3584

Layers to Extract:
  first: layer 0
  middle: layer 14
  last: layer 27


In [7]:
def extract_hidden_states(model, tokenizer, text, layer_indices):
    """
    Run ``text`` through ``model`` and return the mean-pooled hidden state of
    each requested layer.

    Args:
        model: callable model exposing ``.device``; called with the tokenizer
            output plus ``output_hidden_states=True`` and expected to return an
            object with a ``hidden_states`` sequence.
        tokenizer: callable returning a mapping of tensors for ``text``
            (invoked with ``return_tensors="pt"``, truncation at 512 tokens).
        text: the input string (a single example).
        layer_indices: mapping ``name -> 0-based layer index``.

    Returns:
        dict mapping each name in ``layer_indices`` to a 1-D numpy array
        (the hidden state averaged over all tokens).

    Raises:
        IndexError: if a layer index is outside ``[0, number of layers)``.
    """
    # Tokenize and move every tensor to the model's device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Forward pass without gradients, asking for all hidden states
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # hidden_states has (number of layers + 1) entries: index 0 is the embedding
    # output, so layer i lives at hidden_states[i + 1].
    hidden_states = outputs.hidden_states
    num_layers = len(hidden_states) - 1

    results = {}
    for pos_name, layer_idx in layer_indices.items():
        # Reject out-of-range indices explicitly; a negative index would
        # otherwise silently select the embedding output or wrap around.
        if not 0 <= layer_idx < num_layers:
            raise IndexError(
                f"Layer index {layer_idx} for '{pos_name}' is out of range "
                f"(model has {num_layers} layers)"
            )

        layer_hidden = hidden_states[layer_idx + 1]  # [batch_size, seq_len, hidden_dim]

        # Mean pooling over the token axis. Only the batch axis is squeezed, so
        # the result stays 1-D even when another axis happens to have size 1.
        pooled = layer_hidden.mean(dim=1).squeeze(0)  # [hidden_dim] for one text

        results[pos_name] = pooled.cpu().numpy()

    return results

In [15]:
# Process the whole dataset: one output row per (sample, layer position)
num_samples = len(data)  # 34507 in the recorded run

print(f"Processing {num_samples} samples")

results_list = []

# The dim_* column names do not depend on the sample; build them once instead
# of formatting hidden_dim strings for every single row.
dim_columns = [f'dim_{dim_idx}' for dim_idx in range(hidden_dim)]

for idx in tqdm(range(num_samples)):
    sample = data[idx]

    # Prepare text, label and task name
    text = create_input_text(sample)
    label = convert_label(sample['hallucination'])
    task = sample['task']

    # Mean-pooled hidden state per requested layer
    hidden_states = extract_hidden_states(model, tokenizer, text, layer_indices)

    # Store one row per layer position
    for pos_name, hidden in hidden_states.items():
        # Fail loudly if hidden_dim does not match the model output; otherwise
        # the vector would be truncated silently.
        if len(hidden) != hidden_dim:
            raise ValueError(
                f"Expected {hidden_dim} hidden dimensions, got {len(hidden)} "
                f"for layer position '{pos_name}'"
            )

        row = {
            'model': 'Qwen2.5-7B',
            'task': task,
            'layer_position': pos_name,
            'text': text,
            'label': label
        }

        # Append dim_0 ... dim_{hidden_dim-1} in order
        row.update(zip(dim_columns, hidden))

        results_list.append(row)

    # Free memory every 100 samples
    if (idx + 1) % 100 == 0:
        gc.collect()
        torch.cuda.empty_cache()

print(f"총 {len(results_list)} Data points Extracted")

Processing 34507 samples


100%|██████████| 34507/34507 [27:48<00:00, 20.68it/s]

총 103521 Data points Extracted





In [None]:
# Build one DataFrame from the collected row dicts (one row per sample and layer position)
df = pd.DataFrame(results_list)

# Summarization

In [None]:
import os

# Convert the extracted rows to a DataFrame.
# It must be built from results_list: an empty pd.DataFrame() has no 'label',
# 'task' or 'layer_position' columns, so every summary below would fail.
print("Creating DataFrame...")
df = pd.DataFrame(results_list)

print(f"\nDataFrame Info:")
print(f"  Shape: {df.shape}")
print(f"  Number of columns: {len(df.columns)}")
print(f"  Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Sanity-check the class, task and layer balance of the extracted rows
print(f"\nLabel distribution:")
print(df['label'].value_counts().sort_index())

print(f"\nTask distribution:")
print(df['task'].value_counts())

print(f"\nLayer distribution:")
print(df['layer_position'].value_counts())

print(f"\nTask × Label distribution:")
print(df[['task', 'label']].value_counts().sort_index())

# Write everything to a single CSV file in the working directory
csv_filename = "qwen2.5_7b_halueval_representations.csv"
print(f"\nSaving CSV file: {csv_filename}")

df.to_csv(csv_filename, index=False)

print(f"Save completed! File size: {os.path.getsize(csv_filename) / 1024**2:.2f} MB")

# Colab-only helper: triggers a browser download of the saved file
from google.colab import files
print("\nStarting file download...")
files.download(csv_filename)

print("Done!")

Creating DataFrame...

DataFrame Info:
  Shape: (103521, 3589)
  Number of columns: 3589
  Memory usage: 955.62 MB

Label distribution:
label
0    55986
1    47535
Name: count, dtype: int64

Task distribution:
task
qa_samples               30000
dialogue_samples         30000
summarization_samples    30000
general                  13521
Name: count, dtype: int64

Layer distribution:
layer_position
first     34507
middle    34507
last      34507
Name: count, dtype: int64

Task × Label distribution:
task                   label
dialogue_samples       0        14970
                       1        15030
general                0        11076
                       1         2445
qa_samples             0        14970
                       1        15030
summarization_samples  0        14970
                       1        15030
Name: count, dtype: int64

Saving CSV file: qwen2.5_7b_halueval_representations.csv


In [None]:
# Preview the first rows of the representation table
df.head()