In [2]:
import pandas as pd
import numpy as np
import torch

## GPT-2 학습을 위한 데이터 생성
긍정적인 리뷰는 `긍정: 내용`으로 변경, 부정적인 리뷰는 `부정: 내용`으로 변경

In [3]:
dir_path = "data/"
input_file = dir_path + "poi_review_corpus.txt"
train_src_file = dir_path + "poi_review_gpt2_train.txt"
test_src_file = dir_path + "poi_review_gpt2_test.txt"

In [4]:
df = pd.read_csv(input_file, sep="\t", encoding='utf-8')
df.label = df.label.map({0: "부정 : ", 1: "긍정 : "})
text = df.label.str.cat(df.content)

train_size = len(text) // 10 * 8

def write_txt(filename, series):
  with open(filename, "w", encoding='utf-8') as f:
    f.write("\n".join(series.values))

write_txt(train_src_file, text[:train_size])
write_txt(test_src_file, text[train_size:])

In [5]:
!head -n 3 /content/drive/MyDrive/data/poi-review/poi_review_gpt2_train.txt
!head -n 3 /content/drive/MyDrive/data/poi-review/poi_review_gpt2_test.txt

'head'��(��) ���� �Ǵ� �ܺ� ����, ������ �� �ִ� ���α׷�, �Ǵ�
��ġ ������ �ƴմϴ�.
'head'��(��) ���� �Ǵ� �ܺ� ����, ������ �� �ִ� ���α׷�, �Ǵ�
��ġ ������ �ƴմϴ�.


In [6]:
from transformers import (
    AutoConfig, AutoTokenizer, TextDataset, DataCollatorForLanguageModeling,
    PreTrainedTokenizer,
    Trainer, TrainingArguments,AutoModelWithLMHead
)
from typing import Dict, List, Optional
from torch.utils.data import Dataset


class LineByLineTextDataset(Dataset):
    """
    This will be superseded by a framework-agnostic approach soon.
    Source from https://github.com/huggingface/transformers/blob/db7d6a80e82d66127b2a44b6e3382969fdc8b207/src/transformers/data/datasets/language_modeling.py#L115
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int):
        with open(file_path, encoding="utf-8") as f:
            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

        batch_encoding = tokenizer(lines, add_special_tokens=True, truncation=True, max_length=block_size, padding="max_length")
        self.examples = batch_encoding["input_ids"]
        self.examples = [{"input_ids": torch.tensor(e, dtype=torch.long)} for e in self.examples]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i) -> Dict[str, torch.tensor]:
        return self.examples[i]

def load_dataset(train_path,test_path,tokenizer):
    train_dataset = LineByLineTextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)
     
    test_dataset = LineByLineTextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)   
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator


model_name = "beomi/kcgpt2"
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelWithLMHead.from_pretrained(model_name)
model.config.pad_token_id = model.config.eos_token_id
train_dataset,test_dataset,data_collator = load_dataset(train_src_file, test_src_file, tokenizer)

Downloading: 100%|██████████| 872/872 [00:00<00:00, 871kB/s]
Downloading: 100%|██████████| 1.28M/1.28M [00:01<00:00, 823kB/s] 
Downloading: 100%|██████████| 948k/948k [00:01<00:00, 674kB/s] 
Downloading: 100%|██████████| 501M/501M [00:45<00:00, 11.5MB/s] 


In [7]:
model_path = "checkpoint/"
training_args = TrainingArguments(
    output_dir=model_path, #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=2, # number of training epochs
    per_device_train_batch_size=16, # batch size for training
    per_device_eval_batch_size=16,  # batch size for evaluation
    eval_steps = 400, # Number of update steps between two evaluations.
    save_steps=800, # after # steps model is saved 
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    prediction_loss_only=True,
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [8]:
trainer.train()

***** Running training *****
  Num examples = 80000
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 10000
  5%|▌         | 500/10000 [05:28<1:41:43,  1.56it/s]

{'loss': 4.7492, 'learning_rate': 5e-05, 'epoch': 0.1}


  8%|▊         | 800/10000 [08:40<1:37:52,  1.57it/s]Saving model checkpoint to checkpoint/checkpoint-800
Configuration saved in checkpoint/checkpoint-800\config.json
Model weights saved in checkpoint/checkpoint-800\pytorch_model.bin
 10%|█         | 1000/10000 [10:51<1:37:10,  1.54it/s]

{'loss': 4.5092, 'learning_rate': 4.736842105263158e-05, 'epoch': 0.2}


 15%|█▌        | 1500/10000 [16:15<1:32:44,  1.53it/s]

{'loss': 4.4226, 'learning_rate': 4.473684210526316e-05, 'epoch': 0.3}


 16%|█▌        | 1600/10000 [17:21<1:28:36,  1.58it/s]Saving model checkpoint to checkpoint/checkpoint-1600
Configuration saved in checkpoint/checkpoint-1600\config.json
Model weights saved in checkpoint/checkpoint-1600\pytorch_model.bin
 20%|██        | 2000/10000 [21:45<1:31:52,  1.45it/s]

{'loss': 4.3754, 'learning_rate': 4.210526315789474e-05, 'epoch': 0.4}


 24%|██▍       | 2400/10000 [26:19<1:26:19,  1.47it/s]Saving model checkpoint to checkpoint/checkpoint-2400
Configuration saved in checkpoint/checkpoint-2400\config.json
Model weights saved in checkpoint/checkpoint-2400\pytorch_model.bin
 25%|██▌       | 2500/10000 [27:30<1:25:48,  1.46it/s]

{'loss': 4.349, 'learning_rate': 3.9473684210526316e-05, 'epoch': 0.5}


 30%|███       | 3000/10000 [33:09<1:14:17,  1.57it/s]

{'loss': 4.3147, 'learning_rate': 3.6842105263157895e-05, 'epoch': 0.6}


 32%|███▏      | 3200/10000 [34:58<55:43,  2.03it/s]  Saving model checkpoint to checkpoint/checkpoint-3200
Configuration saved in checkpoint/checkpoint-3200\config.json
Model weights saved in checkpoint/checkpoint-3200\pytorch_model.bin
 35%|███▌      | 3500/10000 [37:37<54:35,  1.98it/s]  

{'loss': 4.2855, 'learning_rate': 3.421052631578947e-05, 'epoch': 0.7}


 40%|████      | 4000/10000 [41:35<28:12,  3.54it/s]  Saving model checkpoint to checkpoint/checkpoint-4000
Configuration saved in checkpoint/checkpoint-4000\config.json


{'loss': 4.2606, 'learning_rate': 3.157894736842105e-05, 'epoch': 0.8}


Model weights saved in checkpoint/checkpoint-4000\pytorch_model.bin
 45%|████▌     | 4500/10000 [45:14<45:06,  2.03it/s]  

{'loss': 4.24, 'learning_rate': 2.8947368421052634e-05, 'epoch': 0.9}


 48%|████▊     | 4800/10000 [47:47<54:20,  1.59it/s]Saving model checkpoint to checkpoint/checkpoint-4800
Configuration saved in checkpoint/checkpoint-4800\config.json
Model weights saved in checkpoint/checkpoint-4800\pytorch_model.bin
 50%|█████     | 5000/10000 [49:33<40:55,  2.04it/s]  

{'loss': 4.2289, 'learning_rate': 2.6315789473684212e-05, 'epoch': 1.0}


 55%|█████▌    | 5500/10000 [53:47<37:33,  2.00it/s]

{'loss': 3.957, 'learning_rate': 2.368421052631579e-05, 'epoch': 1.1}


 56%|█████▌    | 5600/10000 [54:37<36:44,  2.00it/s]Saving model checkpoint to checkpoint/checkpoint-5600
Configuration saved in checkpoint/checkpoint-5600\config.json
Model weights saved in checkpoint/checkpoint-5600\pytorch_model.bin
 60%|██████    | 6000/10000 [58:30<33:13,  2.01it/s]  

{'loss': 3.9345, 'learning_rate': 2.105263157894737e-05, 'epoch': 1.2}


 64%|██████▍   | 6400/10000 [1:01:49<29:45,  2.02it/s]Saving model checkpoint to checkpoint/checkpoint-6400
Configuration saved in checkpoint/checkpoint-6400\config.json
Model weights saved in checkpoint/checkpoint-6400\pytorch_model.bin
 65%|██████▌   | 6500/10000 [1:02:41<28:54,  2.02it/s]  

{'loss': 3.9479, 'learning_rate': 1.8421052631578947e-05, 'epoch': 1.3}


 70%|███████   | 7000/10000 [1:06:50<24:39,  2.03it/s]

{'loss': 3.941, 'learning_rate': 1.5789473684210526e-05, 'epoch': 1.4}


 72%|███████▏  | 7200/10000 [1:08:30<23:00,  2.03it/s]Saving model checkpoint to checkpoint/checkpoint-7200
Configuration saved in checkpoint/checkpoint-7200\config.json
Model weights saved in checkpoint/checkpoint-7200\pytorch_model.bin
 75%|███████▌  | 7500/10000 [1:10:59<20:26,  2.04it/s]

{'loss': 3.9406, 'learning_rate': 1.3157894736842106e-05, 'epoch': 1.5}


 80%|████████  | 8000/10000 [1:15:05<16:20,  2.04it/s]Saving model checkpoint to checkpoint/checkpoint-8000
Configuration saved in checkpoint/checkpoint-8000\config.json


{'loss': 3.9336, 'learning_rate': 1.0526315789473684e-05, 'epoch': 1.6}


Model weights saved in checkpoint/checkpoint-8000\pytorch_model.bin
 85%|████████▌ | 8500/10000 [1:19:13<12:15,  2.04it/s]

{'loss': 3.9352, 'learning_rate': 7.894736842105263e-06, 'epoch': 1.7}


 88%|████████▊ | 8800/10000 [1:21:40<09:49,  2.04it/s]Saving model checkpoint to checkpoint/checkpoint-8800
Configuration saved in checkpoint/checkpoint-8800\config.json
Model weights saved in checkpoint/checkpoint-8800\pytorch_model.bin
 90%|█████████ | 9000/10000 [1:23:26<08:52,  1.88it/s]

{'loss': 3.9254, 'learning_rate': 5.263157894736842e-06, 'epoch': 1.8}


 95%|█████████▌| 9500/10000 [1:28:33<05:41,  1.46it/s]

{'loss': 3.9254, 'learning_rate': 2.631578947368421e-06, 'epoch': 1.9}


 96%|█████████▌| 9600/10000 [1:29:41<04:34,  1.46it/s]Saving model checkpoint to checkpoint/checkpoint-9600
Configuration saved in checkpoint/checkpoint-9600\config.json
Model weights saved in checkpoint/checkpoint-9600\pytorch_model.bin
100%|██████████| 10000/10000 [1:34:16<00:00,  1.45it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 10000/10000 [1:34:16<00:00,  1.77it/s]

{'loss': 3.9158, 'learning_rate': 0.0, 'epoch': 2.0}
{'train_runtime': 5657.0009, 'train_samples_per_second': 28.284, 'train_steps_per_second': 1.768, 'train_loss': 4.154575244140625, 'epoch': 2.0}





TrainOutput(global_step=10000, training_loss=4.154575244140625, metrics={'train_runtime': 5657.0009, 'train_samples_per_second': 28.284, 'train_steps_per_second': 1.768, 'train_loss': 4.154575244140625, 'epoch': 2.0})

In [9]:
trainer.save_model()

Saving model checkpoint to checkpoint/
Configuration saved in checkpoint/config.json
Model weights saved in checkpoint/pytorch_model.bin


# pipeline을 이용한 모델 테스트

In [10]:
from transformers import pipeline

reviewer = pipeline('text-generation',model=model_path, tokenizer=model_name)

loading configuration file checkpoint/config.json
Model config GPT2Config {
  "_name_or_path": "checkpoint/",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 0,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 0,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "use_cache": true,
  "vocab_size": 55000
}

In [11]:
def generate_review(reviewer, prefix, positive):
  text = "긍정 : " if positive else "부정 : "
  text += prefix
  return reviewer(text, num_return_sequences =5)

In [12]:
generate_review(reviewer, "가성비 좋아요. 커피도 ", True)

[{'generated_text': '긍정 : 가성비 좋아요. 커피도  맛나구요.  맛있고 좋아요. 사장님들이 다 친절해요.  커피맛이  좋네요. 또 재방문하겠습니다. 사장님 감사합니다.  화이팅~~~!!  ☎☎�'},
 {'generated_text': '긍정 : 가성비 좋아요. 커피도 (스콘 맛있어요!) 마싯네요^^* 번창하세요 강추합니다 강추합니다 사장님 번창하세요~!!!♡♡♡♡♡ 강추해요~^^* 또'},
 {'generated_text': '긍정 : 가성비 좋아요. 커피도  굿!!  특히 아메리카노는 완전 맛있어요 ^^:)))))))))))))))))))  커피도 진짜 맛있고  맛도 미쳤어요 ㅎ'},
 {'generated_text': '긍정 : 가성비 좋아요. 커피도  맛있고  분위기도  최고 입니다~^^  사장님 오래오래 근무해주시길!!~^^  강추~!^^ (1/10~)/  (2/10~'},
 {'generated_text': '긍정 : 가성비 좋아요. 커피도  짱맛!~~*^^*♡**&amp;&amp;♡* 사장님 넘 친절하시네욤~~~~*^^*^*♡*&amp;♡.*♡*%'}]

In [13]:
generate_review(reviewer, "바깥보다 못한 실내에 이마를 탁 치고 갑니다 손님에 대한 ", False)

[{'generated_text': '부정 : 바깥보다 못한 실내에 이마를 탁 치고 갑니다 손님에 대한  반말은 기본 ;; 직원들 응대부터 고치시길  진짜 이렇게 일처리 하면서 일하면 잘되나요? 아무리 코로나여도 다시는 안감..  직원'},
 {'generated_text': '부정 : 바깥보다 못한 실내에 이마를 탁 치고 갑니다 손님에 대한  배려가 없어서 불편합니다.  아 참고로 제가 점장인데요 제가 점주이신지는 모르겠지만 아무튼 제가 남자라서 여깁니다.  저는 남자라서 점장'},
 {'generated_text': '부정 : 바깥보다 못한 실내에 이마를 탁 치고 갑니다 손님에 대한  배려가 없는 가게입니다 다신 안갑니다  1층 직원도 불친절하고  2층 직원이나 여자직원들도 불친절하고 3층 직원이나 여자직원'},
 {'generated_text': '부정 : 바깥보다 못한 실내에 이마를 탁 치고 갑니다 손님에 대한  신경은 1도 안씀. 사장님들 너무 불친절  주문해도 먹고 안갈수도..  매장 이용중인데  여긴 안갈래요. 직원분'},
 {'generated_text': '부정 : 바깥보다 못한 실내에 이마를 탁 치고 갑니다 손님에 대한  예의는 1도 없고, 음식에 나오는  잡내나는 음식도 서비스로 제공되지않는 최악의 환경입니다 맛은  보통인데.. 비린내 너무 심합니다'}]

# 디스크에서 불러오기
https://huggingface.co/blog/how-to-generate

In [14]:
trained_path = "trained_models/"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelWithLMHead.from_pretrained(trained_path)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/beomi/kcgpt2/resolve/main/config.json from cache at C:\Users\heegyukim/.cache\huggingface\transformers\6d23b85cd80b100c1e93dd47cd63fc842920dda4b9bdfc4fd6d4f505c4e6c14a.c8d24433cc735a046e6a9b6443e51d5276f508163ff7a7c6cd812df282bc4381
Model config GPT2Config {
  "_name_or_path": "beomi/kcgpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 0,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summ

In [15]:
input_text = "긍정: 가성비 좋아요. 커피도 "
input_ids = tokenizer.encode(input_text, return_tensors="pt")
outputs = model.generate(input_ids, max_length=50,
    num_beams=10, 
    do_sample=True,
    top_k=10,
    temperature=2.0,
    top_p=0.92,
    early_stopping=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

긍정: 가성비 좋아요. 커피도  맛있어요. 직원분도 친절하셔서  좋네요. 번창하세요~^^^^^^^^^^^^^^^^^^^^^^^^^


In [30]:
# model quantization using
# https://pytorch.org/docs/stable/quantization.html
# https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html

from transformers.modeling_utils import Conv1D
from inspect import getmembers

q_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear, Conv1D}, dtype=torch.qint8
)

In [31]:
input_text = "긍정: 가성비 좋아요. 커피도 "
input_ids = tokenizer.encode(input_text, return_tensors="pt")
outputs = q_model.generate(input_ids, max_length=50,
    num_beams=10, 
    do_sample=True,
    top_k=10,
    temperature=2.0,
    top_p=0.92,
    early_stopping=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

긍정: 가성비 좋아요. 커피도  맛있구요. 사장님도  친절하십니다.^^ 번창하세요~~~^^ 번창하세요~~~~^^^^^^^^^^^^^^^^
