In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import torch
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast, AdamW
from tqdm import tqdm

In [2]:
import os
os.chdir("Y:\Python\script\Study\Dacon_0201")
os.getcwd()

'Y:\\Python\\script\\Study\\Dacon_0201'

In [3]:
# CUDA 사용 가능 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
# 데이터 로드
data = pd.read_csv('./data/train_2.csv')

# 토크나이저 로드
tokenizer = PreTrainedTokenizerFast.from_pretrained('skt/ko-gpt-trinity-1.2B-v0.5', eos_token='</s>')

# 데이터 포맷팅 및 토크나이징
formatted_data = []
for _, row in tqdm(data.iterrows()):
    for q_col in ['질문_1', '질문_2']:
        for a_col in ['답변_1', '답변_2', '답변_3', '답변_4', '답변_5']:
            # 질문과 답변 쌍을 </s> token으로 연결
            input_text = row[q_col] + tokenizer.eos_token + row[a_col]
            input_ids = tokenizer.encode(input_text, return_tensors='pt')
            formatted_data.append(input_ids)
print('Done.')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
644it [00:00, 728.04it/s]

Done.





In [5]:
# 모델 로드
model = GPT2LMHeadModel.from_pretrained('skt/ko-gpt-trinity-1.2B-v0.5')
model.to(device) # 모델을 GPU단으로 이동

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 1920)
    (wpe): Embedding(1024, 1920)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1920,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1920,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1920,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1920, out_features=51200, bias=False)
)

In [6]:
CFG = {
    'LR' : 1e-5, # Learning Rate
    'EPOCHS' : 25, # 학습 Epoch
    'eps' : 5e-06
}

In [7]:
# 모델 학습 설정
optimizer = AdamW(model.parameters(), lr=CFG['LR'], eps = CFG['eps'])
model.train()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 1920)
    (wpe): Embedding(1024, 1920)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1920,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1920,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1920,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1920, out_features=51200, bias=False)
)

In [8]:
# 모델 학습
for epoch in range(CFG['EPOCHS']):
    total_loss = 0
    progress_bar = tqdm(enumerate(formatted_data), total=len(formatted_data))
    for batch_idx, batch in progress_bar:
        batch = batch.to(device)
        outputs = model(batch, labels=batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

        progress_bar.set_description(f"Epoch {epoch+1} - Avg Loss: {total_loss / (batch_idx+1):.4f}")

    print(f"Epoch {epoch+1}/{CFG['EPOCHS']}, Average Loss: {total_loss / len(formatted_data)}")

# 모델 저장
model.save_pretrained("./hansoldeco-kogpt2_4_2")
tokenizer.save_pretrained("./hansoldeco-kogpt2_4_2")

Epoch 1 - Avg Loss: 2.0017: 100%|██████████████████████████████████████████████████| 6440/6440 [13:29<00:00,  7.96it/s]


Epoch 1/25, Average Loss: 2.0017297715041207


Epoch 2 - Avg Loss: 0.8596: 100%|██████████████████████████████████████████████████| 6440/6440 [13:26<00:00,  7.98it/s]


Epoch 2/25, Average Loss: 0.8595979163909088


Epoch 3 - Avg Loss: 0.4038: 100%|██████████████████████████████████████████████████| 6440/6440 [12:47<00:00,  8.39it/s]


Epoch 3/25, Average Loss: 0.4037878365441


Epoch 4 - Avg Loss: 0.2435: 100%|██████████████████████████████████████████████████| 6440/6440 [13:17<00:00,  8.08it/s]


Epoch 4/25, Average Loss: 0.24348242043948484


Epoch 5 - Avg Loss: 0.1878: 100%|██████████████████████████████████████████████████| 6440/6440 [12:58<00:00,  8.27it/s]


Epoch 5/25, Average Loss: 0.18784358548665686


Epoch 6 - Avg Loss: 0.1572: 100%|██████████████████████████████████████████████████| 6440/6440 [13:25<00:00,  7.99it/s]


Epoch 6/25, Average Loss: 0.15715380592289038


Epoch 7 - Avg Loss: 0.1375: 100%|██████████████████████████████████████████████████| 6440/6440 [12:35<00:00,  8.53it/s]


Epoch 7/25, Average Loss: 0.137523602293544


Epoch 8 - Avg Loss: 0.1255: 100%|██████████████████████████████████████████████████| 6440/6440 [12:34<00:00,  8.53it/s]


Epoch 8/25, Average Loss: 0.12554364556241318


Epoch 9 - Avg Loss: 0.1184: 100%|██████████████████████████████████████████████████| 6440/6440 [12:36<00:00,  8.51it/s]


Epoch 9/25, Average Loss: 0.11837184501968047


Epoch 10 - Avg Loss: 0.1065:  12%|█████▊                                            | 747/6440 [01:27<11:16,  8.42it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Epoch 10 - Avg Loss: 0.1098: 100%|█████████████████████████████████████████████████| 6440/6440 [12:34<00:00,  8.53it/s]


Epoch 10/25, Average Loss: 0.10982159897164435


Epoch 11 - Avg Loss: 0.1058: 100%|█████████████████████████████████████████████████| 6440/6440 [12:36<00:00,  8.52it/s]


Epoch 11/25, Average Loss: 0.10577518163243112


Epoch 12 - Avg Loss: 0.1001: 100%|█████████████████████████████████████████████████| 6440/6440 [12:34<00:00,  8.54it/s]


Epoch 12/25, Average Loss: 0.10013770355600053


Epoch 13 - Avg Loss: 0.0975: 100%|█████████████████████████████████████████████████| 6440/6440 [12:34<00:00,  8.54it/s]


Epoch 13/25, Average Loss: 0.09752837403023742


Epoch 14 - Avg Loss: 0.0940: 100%|█████████████████████████████████████████████████| 6440/6440 [12:49<00:00,  8.37it/s]


Epoch 14/25, Average Loss: 0.09400121633698065


Epoch 15 - Avg Loss: 0.0918: 100%|█████████████████████████████████████████████████| 6440/6440 [13:26<00:00,  7.98it/s]


Epoch 15/25, Average Loss: 0.09178142798037749


Epoch 16 - Avg Loss: 0.0901: 100%|█████████████████████████████████████████████████| 6440/6440 [12:35<00:00,  8.52it/s]


Epoch 16/25, Average Loss: 0.09010479301836063


Epoch 17 - Avg Loss: 0.0884: 100%|█████████████████████████████████████████████████| 6440/6440 [12:33<00:00,  8.55it/s]


Epoch 17/25, Average Loss: 0.0884027251905779


Epoch 18 - Avg Loss: 0.0870: 100%|█████████████████████████████████████████████████| 6440/6440 [12:34<00:00,  8.53it/s]


Epoch 18/25, Average Loss: 0.08697277119148938


Epoch 19 - Avg Loss: 0.0869: 100%|█████████████████████████████████████████████████| 6440/6440 [12:33<00:00,  8.54it/s]


Epoch 19/25, Average Loss: 0.0868845847373111


Epoch 20 - Avg Loss: 0.0852: 100%|█████████████████████████████████████████████████| 6440/6440 [12:35<00:00,  8.53it/s]


Epoch 20/25, Average Loss: 0.08518911476032982


Epoch 21 - Avg Loss: 0.0824:  59%|████████████████████████████▉                    | 3810/6440 [07:27<05:43,  7.67it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Epoch 21 - Avg Loss: 0.0834: 100%|█████████████████████████████████████████████████| 6440/6440 [12:37<00:00,  8.50it/s]


Epoch 21/25, Average Loss: 0.08342342945401447


Epoch 22 - Avg Loss: 0.0826: 100%|█████████████████████████████████████████████████| 6440/6440 [12:35<00:00,  8.52it/s]


Epoch 22/25, Average Loss: 0.08264309931040296


Epoch 23 - Avg Loss: 0.0805: 100%|█████████████████████████████████████████████████| 6440/6440 [12:36<00:00,  8.51it/s]


Epoch 23/25, Average Loss: 0.08052040838046427


Epoch 24 - Avg Loss: 0.0797: 100%|█████████████████████████████████████████████████| 6440/6440 [12:33<00:00,  8.55it/s]


Epoch 24/25, Average Loss: 0.0797043372326681


Epoch 25 - Avg Loss: 0.0795: 100%|█████████████████████████████████████████████████| 6440/6440 [12:35<00:00,  8.53it/s]


Epoch 25/25, Average Loss: 0.07947264220182326


('./hansoldeco-kogpt2_4_2\\tokenizer_config.json',
 './hansoldeco-kogpt2_4_2\\special_tokens_map.json',
 './hansoldeco-kogpt2_4_2\\tokenizer.json')

In [9]:
model_dir = "./hansoldeco-kogpt2_4_2"
model = GPT2LMHeadModel.from_pretrained(model_dir)
model.to(device)
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_dir)

test = pd.read_csv('./data/test.csv')
preds = []

# '질문' 컬럼의 각 질문에 대해 답변 생성
for test_question in tqdm(test['질문']):
    input_ids = tokenizer.encode(test_question + tokenizer.eos_token, return_tensors='pt')

    # 답변 생성
    output_sequences = model.generate(
        input_ids=input_ids.to(device),
        max_length=300,
        temperature=0.9,
        top_k=1,
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True,
        num_return_sequences=1
    )

    # 생성된 텍스트(답변) 저장
    for generated_sequence in output_sequences:
        full_text = tokenizer.decode(generated_sequence, skip_special_tokens=False)
        answer_start = full_text.find(tokenizer.eos_token) + len(tokenizer.eos_token)
        answer_only = full_text[answer_start:].strip()
        answer_only = answer_only.replace('\n', ' ')
        preds.append(answer_only)

100%|████████████████████████████████████████████████████████████████████████████████| 130/130 [06:55<00:00,  3.20s/it]


In [10]:
# Test 데이터셋의 모든 질의에 대한 답변으로부터 512 차원의 Embedding Vector 추출
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

pred_embeddings = model.encode(preds)
pred_embeddings.shape

(130, 512)

In [11]:
submit = pd.read_csv('./data/sample_submission.csv')
submit.iloc[:,1:] = pred_embeddings
submit.head()

Unnamed: 0,id,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,...,vec_502,vec_503,vec_504,vec_505,vec_506,vec_507,vec_508,vec_509,vec_510,vec_511
0,TEST_000,0.015341,0.019901,0.001819,0.013254,0.128906,-0.038372,0.035526,0.003914,0.012441,...,-0.015642,-0.065815,0.011862,-0.020828,-0.025905,0.047072,0.006964,-0.008943,0.021947,0.004232
1,TEST_001,-0.019322,0.003211,-0.013877,0.033974,0.097455,-0.017306,0.015203,-0.038989,-0.016788,...,-0.023184,-0.03024,0.024912,-0.067202,0.001899,0.023498,-0.004316,-0.058046,0.030207,0.031503
2,TEST_002,0.022715,-0.082353,-0.059012,0.023319,0.088409,-0.03224,0.020548,0.02771,0.018195,...,-0.029399,-0.040173,0.028294,-0.017319,0.005047,0.023821,-0.010594,-0.02364,0.003697,0.087255
3,TEST_003,0.014597,0.010334,0.044874,0.021705,0.022972,-0.021076,-0.076503,-0.035291,0.002525,...,-0.018954,0.023068,0.019846,-0.026382,-0.025583,0.002549,-0.017229,0.019436,-0.025017,0.001964
4,TEST_004,-0.003408,-0.019128,-0.011795,0.001567,0.11545,-0.022965,0.047793,0.049827,-0.03038,...,0.010959,-0.054748,0.051292,-0.017034,-0.030259,0.010181,0.02129,0.010175,-0.020196,0.073795


In [12]:
submit.to_csv('./baseline_submit_4_2.csv', index=False)