In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import torch
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast, AdamW
from tqdm import tqdm

In [3]:
import os
# Switch to the project directory so the relative ./data paths below resolve.
# Raw string: the Windows path contains backslashes (\P, \s, \S, \D) that are
# invalid escape sequences in a normal string literal and trigger a
# SyntaxWarning on recent Python versions.
os.chdir(r"Y:\Python\script\Study\Dacon_0201")
os.getcwd()

'Y:\\Python\\script\\Study\\Dacon_0201'

In [4]:
# Pick the compute device: CUDA when it is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [5]:
# Load the training data
data = pd.read_csv('./data/train.csv')

# Load the tokenizer
tokenizer = PreTrainedTokenizerFast.from_pretrained('skt/ko-gpt-trinity-1.2B-v0.5', eos_token='</s>')

# Question / answer columns that are paired up into training examples
QUESTION_COLUMNS = ['질문_1', '질문_2']
ANSWER_COLUMNS = ['답변_1', '답변_2', '답변_3', '답변_4', '답변_5']

# The model has 1024 position embeddings; longer sequences would fail in the
# forward pass, so cap every example at that length.
MAX_SEQUENCE_LENGTH = 1024

# Format and tokenize: one example per (question, answer) combination of a row
formatted_data = []
for _, row in tqdm(data.iterrows(), total=len(data)):  # total lets tqdm show progress/ETA
    for q_col in QUESTION_COLUMNS:
        for a_col in ANSWER_COLUMNS:
            # Join question and answer with the </s> token
            input_text = row[q_col] + tokenizer.eos_token + row[a_col]
            input_ids = tokenizer.encode(
                input_text,
                return_tensors='pt',
                truncation=True,
                max_length=MAX_SEQUENCE_LENGTH,
            )
            formatted_data.append(input_ids)
print('Done.')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
644it [00:01, 470.24it/s]


Done.


In [6]:
# Load the pretrained causal language model
model = GPT2LMHeadModel.from_pretrained('skt/ko-gpt-trinity-1.2B-v0.5')
model.to(device) # Move the model to the selected device (GPU when CUDA is available)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 1920)
    (wpe): Embedding(1024, 1920)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1920,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1920,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1920,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1920, out_features=51200, bias=False)
)

In [7]:
# Training hyperparameters
CFG = dict(
    LR=1e-5,      # learning rate
    EPOCHS=15,    # number of training epochs
    eps=5e-06,    # epsilon value handed to the optimizer
)

In [8]:
# Training setup
# Use PyTorch's AdamW: the implementation shipped with transformers is
# deprecated in favour of torch.optim.AdamW. weight_decay=0.0 is passed
# explicitly because the transformers version defaulted to 0.0 while the
# PyTorch default is 0.01.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=CFG['LR'],
    eps=CFG['eps'],
    weight_decay=0.0,
)
model.train()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 1920)
    (wpe): Embedding(1024, 1920)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1920,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1920,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1920,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1920, out_features=51200, bias=False)
)

In [None]:
# Fine-tune the model: every tokenized sequence is its own optimizer step
num_examples = len(formatted_data)
for epoch in range(CFG['EPOCHS']):
    total_loss = 0
    progress_bar = tqdm(enumerate(formatted_data), total=num_examples)
    for batch_idx, batch in progress_bar:
        batch = batch.to(device)
        # The input ids double as the labels for the language-modelling loss
        loss = model(batch, labels=batch).loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

        # Show the running mean loss of the current epoch on the progress bar
        running_avg = total_loss / (batch_idx+1)
        progress_bar.set_description(f"Epoch {epoch+1} - Avg Loss: {running_avg:.4f}")

    epoch_avg = total_loss / num_examples
    print(f"Epoch {epoch+1}/{CFG['EPOCHS']}, Average Loss: {epoch_avg}")

# Persist the fine-tuned model together with its tokenizer
model.save_pretrained("./hansoldeco-kogpt2_T")
tokenizer.save_pretrained("./hansoldeco-kogpt2_T")

Epoch 1 - Avg Loss: 1.9983:   3%|█▌                                                 | 201/6440 [00:36<17:37,  5.90it/s]

In [13]:
# Load a fine-tuned checkpoint and its tokenizer for inference.
# NOTE(review): the training cell saves to "./hansoldeco-kogpt2_T" while this
# cell loads "./hansoldeco-kogpt2_3" — confirm which checkpoint is intended.
model_dir = "./hansoldeco-kogpt2_3"
model = GPT2LMHeadModel.from_pretrained(model_dir)
model.to(device)
model.eval()  # make inference mode explicit (no dropout)
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_dir)

test = pd.read_csv('./data/test.csv')
preds = []

# Generate an answer for every question in the '질문' column
for test_question in tqdm(test['질문']):
    # Same prompt layout as in training: question followed by the </s> separator
    input_ids = tokenizer.encode(test_question + tokenizer.eos_token, return_tensors='pt').to(device)
    prompt_length = input_ids.shape[-1]

    # Generate the answer.
    # The previous settings (do_sample=True with top_k=1) always picked the
    # single most likely token, i.e. greedy decoding; temperature/top_p had no
    # effect. Greedy decoding is now requested explicitly, which also makes the
    # cell deterministic.
    output_sequences = model.generate(
        input_ids=input_ids,
        # Single unpadded prompt: every position is a real token.
        attention_mask=torch.ones_like(input_ids),
        max_length=300,  # counts prompt tokens as well as generated tokens
        repetition_penalty=1.2,
        do_sample=False,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Store the generated text (answer only)
    for generated_sequence in output_sequences:
        # Slice off the prompt by token position instead of searching the
        # decoded string for '</s>', and drop special tokens so a generated
        # end-of-sequence marker does not end up inside the answer text.
        answer_ids = generated_sequence[prompt_length:]
        answer_only = tokenizer.decode(answer_ids, skip_special_tokens=True).strip()
        answer_only = answer_only.replace('\n', ' ')
        preds.append(answer_only)

100%|████████████████████████████████████████████████████████████████████████████████| 130/130 [07:30<00:00,  3.46s/it]


In [14]:
# Extract a 512-dimensional embedding vector from the answer generated for
# each question of the test dataset.
from sentence_transformers import SentenceTransformer

# NOTE: this rebinds `model`, replacing the GPT-2 model loaded earlier in the notebook.
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# One embedding row per entry in `preds`; the trailing expression displays the array shape.
pred_embeddings = model.encode(preds)
pred_embeddings.shape

(130, 512)

In [15]:
# Load the sample submission and overwrite every column after the first one
# with the answer embeddings, then preview the first rows.
submit = pd.read_csv('./data/sample_submission.csv')
submit.iloc[:,1:] = pred_embeddings
submit.head()

Unnamed: 0,id,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,...,vec_502,vec_503,vec_504,vec_505,vec_506,vec_507,vec_508,vec_509,vec_510,vec_511
0,TEST_000,0.02086,0.013327,-0.024121,0.017506,0.111002,0.014007,-0.030595,0.058331,-0.02107,...,-0.009531,-0.059186,0.009969,-0.041596,-0.012163,0.081612,0.012497,-0.017939,-0.023321,0.037912
1,TEST_001,-0.018419,-0.017479,-0.006128,-0.010644,0.066636,-0.001671,0.024308,0.014272,-0.001419,...,-0.014589,-2.4e-05,-0.014986,-0.019787,-0.03395,0.060124,-0.012853,0.001638,-0.005587,0.023062
2,TEST_002,-0.015238,-0.037657,-0.014859,-0.006163,0.118507,-0.020814,0.001047,-0.005111,0.06064,...,-0.008534,-0.025196,0.057599,-0.018881,-0.027957,0.019416,-0.020695,-0.010508,-0.050147,0.074059
3,TEST_003,0.007679,0.048548,0.008126,0.03063,0.064025,-0.047766,-0.083127,-0.026413,-0.02073,...,-0.034084,-0.00333,0.040001,-0.035059,0.022275,0.023752,-0.018037,-0.007049,-0.055945,0.031932
4,TEST_004,-0.014829,-0.01874,-0.02344,0.000925,0.108329,-0.022401,0.039849,0.036982,-0.037178,...,0.018803,-0.065277,0.047362,-0.011628,-0.039066,0.005705,0.007572,0.004773,-0.012496,0.072949


In [16]:
# Write the submission file without the DataFrame index column
submit.to_csv('./baseline_submit_3.csv', index=False)