## Few-shot 평가 지표 및 Fine-tuning 모델 로드

In [1]:
# Install the runtime dependencies for this notebook: transformers, accelerate,
# and a package installed directly from a GitHub repository
# (presumably the source of the `peft` module imported later — TODO confirm).
# NOTE(review): no versions are pinned; the recorded output of this cell shows
# transformers 4.31.0 being installed.
!pip install transformers
!pip install accelerate
!pip install -q -U git+https://github.com/dopeornope-Lee/peft_modifier.git

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m47.2 MB/s[0m eta [36m0:00:0

## Data

- 데이터는 NSMC(Naver Sentiment Movie Corpus) 데이터입니다.
- Few-shot 평가 역시 beomi님의 절차(procedure)를 따라 벤치마크 성능을 도출합니다.

In [2]:
# Create the local data directory (no error if it already exists), then download
# the movie-review train and test files into it. `-O` sets the destination
# path; the trailing backslash continues the wget command onto the next line.
!mkdir -p data_in/KOR/naver_movie
!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/naver_movie/ratings_train.txt \
              -O data_in/KOR/naver_movie/ratings_train.txt
!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/naver_movie/ratings_test.txt \
              -O data_in/KOR/naver_movie/ratings_test.txt

--2023-08-19 05:01:54--  https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/naver_movie/ratings_train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14628807 (14M) [text/plain]
Saving to: ‘data_in/KOR/naver_movie/ratings_train.txt’


2023-08-19 05:01:55 (114 MB/s) - ‘data_in/KOR/naver_movie/ratings_train.txt’ saved [14628807/14628807]

--2023-08-19 05:01:55--  https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/naver_movie/ratings_test.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connecte

## Import Module

In [3]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from tqdm import tqdm
import pandas as pd
from peft import PeftModel, PeftConfig
import numpy as np
import re
import random
from random import sample



In [5]:
# One seed shared by every random number generator this notebook relies on,
# so that Restart & Run All reproduces the same sampling and model behaviour.
SEED_NUM = 1234

random.seed(SEED_NUM)        # stdlib RNG: drives `random.sample` for few-shot example selection
np.random.seed(SEED_NUM)     # NumPy global RNG
torch.manual_seed(SEED_NUM)  # PyTorch RNG; previously left unseeded although the model runs on torch

## K(G)OAT nsmc 테스트

### 모델 로드

- 만약 fine-tuning한 모델을 허깅페이스에 업로드하였다면, adapter만 업로드되었을 것입니다.

- 이럴 때 모델을 로드하는 방법을 따라 할 수 있도록 준비하였습니다.

In [None]:
# Load the tokenizer published with the EleutherAI/polyglot-ko-5.8b checkpoint.
tokenizer_name = "EleutherAI/polyglot-ko-5.8b"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [9]:
# Hugging Face repository id of the uploaded fine-tuned adapter.
peft_model_id = "DopeorNope/KOAT-5.8b"

# Fetch the adapter config. (If the repository has none, inspect `model.config`
# right after training and upload it.)
config = PeftConfig.from_pretrained(peft_model_id)

# The adapter config names the base checkpoint; load it in float16 with
# low_cpu_mem_usage enabled and move it to the GPU.
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
base_model = base_model.cuda()

# Attach the adapter weights to the base model to obtain the final model.
cls_model = PeftModel.from_pretrained(base_model, peft_model_id)

Downloading (…)/adapter_config.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/36.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/13 [00:00<?, ?it/s]

Downloading (…)of-00013.safetensors:   0%|          | 0.00/926M [00:00<?, ?B/s]

Downloading (…)of-00013.safetensors:   0%|          | 0.00/952M [00:00<?, ?B/s]

Downloading (…)of-00013.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

Downloading (…)of-00013.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

Downloading (…)of-00013.safetensors:   0%|          | 0.00/952M [00:00<?, ?B/s]

Downloading (…)of-00013.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

Downloading (…)of-00013.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

Downloading (…)of-00013.safetensors:   0%|          | 0.00/952M [00:00<?, ?B/s]

Downloading (…)of-00013.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

Downloading (…)of-00013.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

Downloading (…)of-00013.safetensors:   0%|          | 0.00/952M [00:00<?, ?B/s]

Downloading (…)of-00013.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

Downloading (…)of-00013.safetensors:   0%|          | 0.00/515M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/13 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading adapter_model.bin:   0%|          | 0.00/3.23M [00:00<?, ?B/s]

In [10]:
# The original cell assigned `cls_model = model`, but `model` is never defined in
# this notebook: on a fresh kernel that raises NameError, and in a stale kernel it
# silently replaces the adapter-wrapped model with an unrelated object.
# Keep the model produced by PeftModel.from_pretrained and switch it to
# evaluation mode for inference. The trailing `;` suppresses the large model repr.
cls_model.eval();

In [11]:
# Set two defaults on the model's config object: a max_length of 2048 and
# token id 0 as the padding id.
model_config = cls_model.config
model_config.max_length = 2048
model_config.pad_token_id = 0

### 퓨샷 러닝을 위한 네이버 영화 리뷰 데이터 EDA 및 구성

In [12]:
# Data preprocessing setup: input/output directory roots.
DATA_IN_PATH = './data_in/KOR'
DATA_OUT_PATH = './data_out/KOR'

# Both rating files live in the same sub-directory of the input root.
nsmc_dir = os.path.join(DATA_IN_PATH, 'naver_movie')
DATA_TRAIN_PATH = os.path.join(nsmc_dir, 'ratings_train.txt')
DATA_TEST_PATH = os.path.join(nsmc_dir, 'ratings_test.txt')

# Tab-separated file with a header row; quoting=3 is csv.QUOTE_NONE, so quote
# characters inside reviews are kept as plain text. Rows with missing values are dropped.
train_data = pd.read_csv(DATA_TRAIN_PATH, sep='\t', header=0, quoting=3).dropna()

In [13]:
# Show the label words used for the positive and negative classes in the prompts.
label_captions = (
    ('데이터 positive 라벨: ', '긍정'),
    ('데이터 negative 라벨: ', '부정'),
)
for caption, label_word in label_captions:
    print(caption, label_word)

데이터 positive 라벨:  긍정
데이터 negative 라벨:  부정


In [14]:
# Show the layout of a single few-shot example: a sentence line followed by a label line.
example_case = '문장: 오늘 기분이 좋아\n감정: 긍정\n'
print('학습 예시 케이스 구조: ', example_case)

학습 예시 케이스 구조:  문장: 오늘 기분이 좋아
감정: 긍정



In [15]:
# Report the model's max_position_embeddings value from its config.
max_positions = cls_model.config.max_position_embeddings
print('gpt 최대 토큰 길이: ', max_positions)

gpt 최대 토큰 길이:  2048


In [16]:
# Token count of every training review, measured with the model's tokenizer.
sent_lens = []
for review in tqdm(train_data['document']):
    sent_lens.append(len(tokenizer(review).input_ids))

# Summary statistics of the token-length distribution.
length_stats = (
    ('Few shot 케이스 토큰 평균 길이: ', np.mean(sent_lens)),
    ('Few shot 케이스 토큰 최대 길이: ', np.max(sent_lens)),
    ('Few shot 케이스 토큰 길이 표준편차: ', np.std(sent_lens)),
    ('Few shot 케이스 토큰 길이 80 퍼센타일: ', np.percentile(sent_lens, 80)),
)
for caption, value in length_stats:
    print(caption, value)

100%|██████████| 149995/149995 [00:13<00:00, 10756.48it/s]

Few shot 케이스 토큰 평균 길이:  20.22912763758792
Few shot 케이스 토큰 최대 길이:  280
Few shot 케이스 토큰 길이 표준편차:  16.48828728915166
Few shot 케이스 토큰 길이 80 퍼센타일:  27.0





In [17]:
# Pool of candidate few-shot examples: (review, label) pairs from the training
# data whose review tokenizes to at most 25 tokens.
train_fewshot_data = [
    (review, label)
    for review, label in tqdm(train_data[['document', 'label']].values)
    if len(tokenizer(review).input_ids) <= 25
]

100%|██████████| 149995/149995 [00:14<00:00, 10172.98it/s]


### 데이터를 통한 평가 지표 도출

In [18]:
# Read the test file (tab-separated, header row, quoting=3 i.e. csv.QUOTE_NONE),
# drop rows with missing values, and preview the first rows.
test_data = pd.read_csv(DATA_TEST_PATH, sep='\t', header=0, quoting=3).dropna()
test_data.head()

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0


In [19]:
# Full Dataset
# sample_size = len(test_data)

# Sampled Dataset
sample_size = 500

# One independent draw of 10 few-shot examples for each evaluation item.
train_fewshot_samples = [sample(train_fewshot_data, 10) for _ in range(sample_size)]

# Down-sample the test set only when it holds more rows than requested.
if len(test_data) > sample_size:
    test_data = test_data.sample(sample_size, random_state=SEED_NUM)

### K(G)OAT 프롬프트 1 방식 실험

In [20]:
def build_prompt_text(sent):
    """Wrap a sentence in the prompt-1 template, ending at the open label slot.

    Returns "문장: <sent>" followed by a newline and "감정:", with nothing after
    the colon so a label can be appended or generated.
    """
    return ''.join(("문장: ", sent, '\n감정:'))

def clean_text(sent):
    """Strip every character that is not Hangul (syllables or jamo) or whitespace.

    Digits, Latin letters and punctuation are removed; the remaining characters
    keep their original order and spacing.
    """
    return re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]", "", sent)

# Gold label words and the model's one-token predictions, aligned by position.
real_labels = []
pred_tokens = []

eval_rows = test_data[['document','label']].values
total_len = len(eval_rows)

for i, (test_sent, test_label) in tqdm(enumerate(eval_rows), total=total_len):
    # Few-shot demonstrations: each cleaned example rendered with the prompt
    # template and followed by its label word (label 1 -> 긍정, otherwise 부정).
    demonstrations = [
        build_prompt_text(clean_text(shot_text)) + (' 긍정\n' if shot_label == 1 else ' 부정\n')
        for shot_text, shot_label in train_fewshot_samples[i]
    ]

    # The query sentence comes last, with its label slot left open.
    prompt_text = ''.join(demonstrations) + build_prompt_text(clean_text(test_sent))

    encoded = tokenizer(prompt_text, return_tensors="pt")
    gen_tokens = cls_model.generate(
        input_ids=encoded.input_ids.cuda(),
        attention_mask=encoded.attention_mask.cuda(),
        max_new_tokens=1,
        pad_token_id=0,
    )
    # Exactly one new token is generated; decode that last position as the prediction.
    last_token_ids = gen_tokens[:, -1]
    pred = tokenizer.batch_decode(last_token_ids)[0].strip()

    pred_tokens.append(pred)
    real_labels.append('긍정' if test_label == 1 else '부정')

100%|██████████| 500/500 [01:42<00:00,  4.86it/s]


In [21]:
# Exact-match accuracy: a prediction counts only if the decoded token equals the label word.
accuracy_match = []
for predicted, expected in zip(pred_tokens, real_labels):
    accuracy_match.append(predicted == expected)

accuracy = sum(accuracy_match) / len(real_labels)

print(accuracy)

0.712


### K(G)OAT 프롬프트 2 방식 실험

In [22]:
def build_prompt_text(sent):
    """Wrap a sentence in the prompt-2 (question style) template.

    Note: this redefines `build_prompt_text` from the prompt-1 experiment, so
    every later call uses this template.

    Returns the question line, the sentence, and an open "정답:" slot, each
    separated by a newline.
    """
    return ''.join(('다음 문장은 긍정일까요 부정일까요?\n', sent, '\n정답:'))

# Reset the result buffers for the prompt-2 run. (The original cell initialised
# both lists twice; the duplicate has been removed.)
real_labels = []
pred_tokens = []

eval_rows = test_data[['document','label']].values
total_len = len(eval_rows)

for i, (test_sent, test_label) in tqdm(enumerate(eval_rows), total=total_len):
    # Few-shot demonstrations: each cleaned example rendered with the current
    # prompt template and followed by its label word (label 1 -> 긍정, otherwise 부정).
    demonstrations = [
        build_prompt_text(clean_text(shot_text)) + (' 긍정\n' if shot_label == 1 else ' 부정\n')
        for shot_text, shot_label in train_fewshot_samples[i]
    ]

    # The query sentence comes last, with its answer slot left open.
    prompt_text = ''.join(demonstrations) + build_prompt_text(clean_text(test_sent))

    tokens = tokenizer(prompt_text, return_tensors="pt")
    token_ids, attn_mask = tokens.input_ids.cuda(), tokens.attention_mask.cuda()
    gen_tokens = cls_model.generate(input_ids=token_ids, attention_mask=attn_mask,
                                    max_new_tokens=1, pad_token_id=0)
    # Exactly one new token is generated; decode that last position as the prediction.
    pred = tokenizer.batch_decode(gen_tokens[:, -1])[0].strip()

    pred_tokens.append(pred)
    real_labels.append('긍정' if test_label == 1 else '부정')

100%|██████████| 500/500 [02:23<00:00,  3.48it/s]


In [23]:
# Exact-match accuracy for the prompt-2 run: decoded token must equal the label word.
accuracy_match = []
for predicted, expected in zip(pred_tokens, real_labels):
    accuracy_match.append(predicted == expected)

accuracy = sum(accuracy_match) / len(real_labels)

print(accuracy)

0.81
