#### 주의!!

이 실습은 가급적 NVIDIA GPU가 설치된 컴퓨터 환경이나 Google Colab에서 진행해주세요.

## 환경 준비 
(Google Colab 환경에서 사용하세요)

In [None]:
# Download the repository's requirements.txt, install everything it lists,
# then install TensorFlow pinned to 2.2.0.
!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/requirements.txt -O requirements.txt
!pip install -r requirements.txt
!pip install tensorflow==2.2.0

## 데이터 다운로드
(Google Colab 환경에서 사용하세요)

In [None]:
# Create the input directory and download the train/test rating files into it.
!mkdir -p data_in/KOR/naver_movie
!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/naver_movie/ratings_train.txt \
              -O data_in/KOR/naver_movie/ratings_train.txt
!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/naver_movie/ratings_test.txt \
              -O data_in/KOR/naver_movie/ratings_test.txt

In [None]:
import os
import tensorflow as tf
from transformers import TFGPT2LMHeadModel

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import gluonnlp as nlp
from gluonnlp.data import SentencepieceTokenizer

import pandas as pd
import matplotlib.pyplot as plt

import numpy as np
import re

import random
from random import sample

아래 실행 커맨드는 gpt_ckpt 폴더가 없는 경우에만 실행해주세요.

In [None]:
import wget
import zipfile

# Download the checkpoint archive from the release page into the working directory.
wget.download('https://github.com/NLP-kr/tensorflow-ml-nlp-tf2/releases/download/v1.0/gpt_ckpt.zip')

# Unpack the archive into the current directory; later cells read files
# from ./gpt_ckpt.
with zipfile.ZipFile('gpt_ckpt.zip') as z:
    z.extractall()

In [None]:
# 시각화

def plot_graphs(history, string):
    """Plot one metric and its ``val_``-prefixed counterpart, then show the figure.

    Args:
        history: Object exposing a ``history`` mapping from metric name to a
            sequence of values (presumably the Keras ``History`` returned by
            ``model.fit`` -- confirm with the caller).
        string: Metric key to plot; the key ``'val_' + string`` must exist too.
    """
    val_key = 'val_' + string
    curves = history.history

    plt.plot(curves[string])
    plt.plot(curves[val_key], '')
    plt.xlabel('Epochs')
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()

In [None]:
# Seed TensorFlow, NumPy and the stdlib RNG with the same value so that the
# sampling done further down is repeatable.
SEED_NUM = 1234
for seed_fn in (tf.random.set_seed, np.random.seed, random.seed):
    seed_fn(SEED_NUM)

## 퓨샷 러닝을 위한 네이버 영화 리뷰 모델 구성


In [None]:
# SentencePiece model file used both for tokenizing text and for building the
# token <-> id vocabulary.
TOKENIZER_PATH = './gpt_ckpt/gpt2_kor_tokenizer.spiece'

# Special-token configuration passed to the vocabulary; mask/cls are disabled.
special_tokens = dict(
    mask_token=None,
    sep_token='<unused0>',
    cls_token=None,
    unknown_token='<unk>',
    padding_token='<pad>',
    bos_token='<s>',
    eos_token='</s>',
)

tokenizer = SentencepieceTokenizer(TOKENIZER_PATH)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(TOKENIZER_PATH, **special_tokens)

In [None]:
class TFGPT2FewshotClassifier(tf.keras.Model):
    """Keras model wrapping a pretrained ``TFGPT2LMHeadModel`` for prompting.

    The forward pass returns only the last sequence position of the wrapped
    model's first output, i.e. what the model emits after the whole prompt.
    """

    def __init__(self, dir_path):
        """Load the pretrained GPT-2 LM-head model from ``dir_path``."""
        super().__init__()
        self.gpt2 = TFGPT2LMHeadModel.from_pretrained(dir_path)

    def call(self, inputs):
        """Run GPT-2 on ``inputs`` (passed as ``input_ids``).

        Returns the slice ``[:, -1, :]`` of the first model output
        (presumably the LM logits, shaped (batch, seq, vocab) -- confirm
        against the transformers version in use).
        """
        lm_outputs = self.gpt2({'input_ids': inputs})
        first_output = lm_outputs[0]
        return first_output[:, -1, :]

In [None]:
# Directory handed to from_pretrained() when the classifier loads GPT-2.
BASE_MODEL_PATH = './gpt_ckpt'
cls_model = TFGPT2FewshotClassifier(BASE_MODEL_PATH)

## 퓨샷 러닝을 위한 네이버 영화 리뷰 데이터 구성

In [None]:
# Prepare the data paths and load the training set.
DATA_IN_PATH = './data_in/KOR'
DATA_OUT_PATH = './data_out/KOR'

naver_movie_dir = os.path.join(DATA_IN_PATH, 'naver_movie')
DATA_TRAIN_PATH = os.path.join(naver_movie_dir, 'ratings_train.txt')
DATA_TEST_PATH = os.path.join(naver_movie_dir, 'ratings_test.txt')

# Tab-separated file with a header row; quoting=3 is csv.QUOTE_NONE, so quote
# characters inside reviews are kept as plain text. Rows with missing values
# are dropped right away.
train_data = pd.read_csv(
    DATA_TRAIN_PATH,
    header=0,
    delimiter='\t',
    quoting=3,
).dropna()

In [None]:
# Show how the tokenizer splits the two label words used in the prompts.
print('데이터 positive 라벨: ', tokenizer('긍정'))
print('데이터 negative 라벨: ', tokenizer('부정'))

In [None]:
# Show the tokenization of one complete prompt example (sentence + label).
print('학습 예시 케이스 구조: ', tokenizer('문장: 오늘 기분이 좋아\n감정: 긍정\n'))

In [None]:
# Print the n_ctx value from the loaded GPT-2 config (labelled here as the
# maximum token length).
print('gpt2 최대 토큰 길이: ', cls_model.gpt2.config.n_ctx)

In [None]:
# Token count of every training review, used to pick a length cut-off for the
# few-shot examples.
sent_lens = []
for review in train_data['document']:
    sent_lens.append(len(tokenizer(review)))

# (caption, statistic) pairs, evaluated and printed one after another.
length_reports = (
    ('Few shot 케이스 토큰 평균 길이: ', np.mean),
    ('Few shot 케이스 토큰 최대 길이: ', np.max),
    ('Few shot 케이스 토큰 길이 표준편차: ', np.std),
    ('Few shot 케이스 토큰 길이 80 퍼센타일: ', lambda lens: np.percentile(lens, 80)),
)
for caption, stat_fn in length_reports:
    print(caption, stat_fn(sent_lens))

In [None]:
# Pool of (sentence, label) pairs eligible as few-shot examples: only reviews
# that encode to at most 25 token ids are kept.
train_fewshot_data = [
    (sentence, sentiment)
    for sentence, sentiment in train_data[['document', 'label']].values
    if len(vocab[tokenizer(sentence)]) <= 25
]

## 네이버 영화 리뷰 데이터를 활용한 퓨샷 러닝 및 평가

In [None]:
# Load the tab-separated test set (header row, quoting disabled) and drop
# rows with missing values; the trailing expression previews the first rows.
test_data = pd.read_csv(
    DATA_TEST_PATH,
    header=0,
    delimiter='\t',
    quoting=3,
).dropna()
test_data.head()

In [None]:
sample_size = 5000

# Draw sample_size independent sets of 30 labelled examples from the pool.
train_fewshot_samples = [
    sample(train_fewshot_data, 30) for _ in range(sample_size)
]

# Shrink the test set to sample_size rows (seeded) when it is larger.
if len(test_data['id']) > sample_size:
    test_data = test_data.sample(sample_size, random_state=SEED_NUM)

In [None]:
def build_prompt_text(sent):
    """Wrap ``sent`` in the prompt template, ending where the label is expected."""
    return ''.join(("문장: ", sent, '\n감정: '))

def clean_text(sent):
    """Return ``sent`` with everything except Hangul and whitespace removed."""
    # Negated class: Hangul syllables, consonant/vowel jamo and whitespace stay.
    disallowed_chars = "[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]"
    return re.sub(disallowed_chars, "", sent)

# Few-shot evaluation: for each test review, concatenate the sampled labelled
# examples and the unlabelled test prompt into one token sequence, take the
# model's arg-max token at the last position, and record it next to the first
# token id of the true label word.
real_labels = []
pred_tokens = []

# Loop-invariant: first token id of each label word.
positive_label_id = vocab[tokenizer('긍정')][0]
negative_label_id = vocab[tokenizer('부정')][0]

# NOTE(review): the total prompt length is never checked against
# cls_model.gpt2.config.n_ctx -- confirm that 30 examples plus the test
# sentence always fit.
for i, (test_sent, test_label) in enumerate(test_data[['document','label']].values):
    tokens = [vocab[vocab.bos_token]]

    for example_text, example_label in train_fewshot_samples[i]:
        example_prompt = build_prompt_text(clean_text(example_text))
        # The parentheses matter: without them `'부정' + '\n'` binds tighter
        # than the conditional expression, so only negative examples were
        # terminated with a newline.
        example_prompt += ('긍정' if example_label == 1 else '부정') + '\n'

        tokens += vocab[tokenizer(example_prompt)]

    # The test prompt ends right after the label cue, so the next token the
    # model emits is its sentiment prediction.
    tokens += vocab[tokenizer(build_prompt_text(clean_text(test_sent)))]

    pred = tf.argmax(cls_model(np.array([tokens], dtype=np.int64)), axis=-1).numpy()

    pred_tokens.append(pred[0])
    real_labels.append(positive_label_id if test_label == 1 else negative_label_id)

In [None]:
# Fraction of test reviews whose predicted token id equals the label token id.
accuracy_match = [
    predicted == expected
    for predicted, expected in zip(pred_tokens, real_labels)
]
correct_count = sum(1 for matched in accuracy_match if matched)
accuracy = correct_count / len(real_labels)

print(accuracy)

In [None]:
def build_prompt_text(sent):
    """Wrap ``sent`` in the alternative prompt template, ending at the result cue."""
    return ''.join(('감정 분석 문장: ', sent, '\n결과: '))

# Few-shot evaluation with the currently defined build_prompt_text template:
# for each test review, concatenate the sampled labelled examples and the
# unlabelled test prompt, take the model's arg-max token at the last position,
# and record it next to the first token id of the true label word.
real_labels = []
pred_tokens = []

# Loop-invariant: first token id of each label word.
positive_label_id = vocab[tokenizer('긍정')][0]
negative_label_id = vocab[tokenizer('부정')][0]

# NOTE(review): the total prompt length is never checked against
# cls_model.gpt2.config.n_ctx -- confirm that 30 examples plus the test
# sentence always fit.
for i, (test_sent, test_label) in enumerate(test_data[['document','label']].values):
    tokens = [vocab[vocab.bos_token]]

    for example_text, example_label in train_fewshot_samples[i]:
        example_prompt = build_prompt_text(clean_text(example_text))
        # The parentheses matter: without them `'부정' + '\n'` binds tighter
        # than the conditional expression, so only negative examples were
        # terminated with a newline.
        example_prompt += ('긍정' if example_label == 1 else '부정') + '\n'

        tokens += vocab[tokenizer(example_prompt)]

    # The test prompt ends right after the result cue, so the next token the
    # model emits is its sentiment prediction.
    tokens += vocab[tokenizer(build_prompt_text(clean_text(test_sent)))]

    pred = tf.argmax(cls_model(np.array([tokens], dtype=np.int64)), axis=-1).numpy()

    pred_tokens.append(pred[0])
    real_labels.append(positive_label_id if test_label == 1 else negative_label_id)

In [None]:
# Fraction of test reviews whose predicted token id equals the label token id.
accuracy_match = [
    predicted == expected
    for predicted, expected in zip(pred_tokens, real_labels)
]
correct_count = sum(1 for matched in accuracy_match if matched)
accuracy = correct_count / len(real_labels)

print(accuracy)