In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import torch
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast, AdamW
from tqdm import tqdm

In [2]:
import os

# Raw string literal: the Windows path is full of backslashes, and sequences
# such as "\P", "\s", "\S" and "\D" are invalid escapes in a normal string
# (deprecated, and a SyntaxWarning on newer Python versions). The resulting
# path value is exactly the same as before.
# NOTE(review): this is a machine-specific absolute path; the notebook only
# runs where this drive/directory exists.
os.chdir(r"Y:\Python\script\Study\Dacon_0201")
os.getcwd()

'Y:\\Python\\script\\Study\\Dacon_0201'

In [3]:
# Select the compute device: the GPU when CUDA is available, the CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
# Load the training data
data = pd.read_csv('./data/train.csv')

# Load the tokenizer, declaring '</s>' as the end-of-sequence token
tokenizer = PreTrainedTokenizerFast.from_pretrained('skt/ko-gpt-trinity-1.2B-v0.5', eos_token='</s>')

# Every row contributes each (question, answer) combination of these columns
question_columns = ['질문_1', '질문_2']
answer_columns = ['답변_1', '답변_2', '답변_3', '답변_4', '답변_5']

# Format and tokenize: one "question</s>answer" token-id tensor per pair,
# in row order, then question order, then answer order.
# NOTE(review): no EOS token is appended after the answer, so the training
# sequences carry no end-of-answer marker — confirm whether that is intended,
# since generation later has nothing that tells it where an answer stops.
formatted_data = []
for _, row in tqdm(data.iterrows()):
    formatted_data.extend(
        tokenizer.encode(row[q_col] + tokenizer.eos_token + row[a_col], return_tensors='pt')
        for q_col in question_columns
        for a_col in answer_columns
    )
print('Done.')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
644it [00:01, 456.92it/s]

Done.





In [5]:
# Load the pretrained language model and move it onto the selected device
model = GPT2LMHeadModel.from_pretrained('skt/ko-gpt-trinity-1.2B-v0.5').to(device)
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 1920)
    (wpe): Embedding(1024, 1920)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1920,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1920,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1920,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1920, out_features=51200, bias=False)
)

In [6]:
# Training hyperparameters
CFG = dict(
    LR=1e-5,     # learning rate
    EPOCHS=25,   # number of training epochs
    eps=5e-06,   # epsilon value handed to the optimizer
)

In [7]:
# Training setup
# Use the PyTorch AdamW implementation: the AdamW class exported by
# `transformers` is deprecated in favour of torch.optim.AdamW (the deprecation
# warning is hidden in this notebook by the global warnings filter).
# weight_decay is pinned to 0.0 because torch.optim.AdamW defaults to 0.01,
# whereas the previously used implementation defaulted to 0.0.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=CFG['LR'],
    eps=CFG['eps'],
    weight_decay=0.0,
)
# Switch the model to training mode (enables dropout); the returned module is
# the cell's displayed value.
model.train()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 1920)
    (wpe): Embedding(1024, 1920)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1920,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1920,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1920,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1920, out_features=51200, bias=False)
)

In [8]:
# Fine-tune the model: one optimizer step per tokenized sequence
for epoch in range(CFG['EPOCHS']):
    total_loss = 0
    # tqdm reads the total from the list length; enumerate supplies the index
    step_bar = tqdm(formatted_data)
    for batch_idx, sequence_ids in enumerate(step_bar):
        sequence_ids = sequence_ids.to(device)

        # Causal-LM objective: the input ids double as the labels
        loss = model(sequence_ids, labels=sequence_ids).loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

        # Show the running mean loss of the current epoch on the progress bar
        step_bar.set_description(f"Epoch {epoch+1} - Avg Loss: {total_loss / (batch_idx+1):.4f}")

    print(f"Epoch {epoch+1}/{CFG['EPOCHS']}, Average Loss: {total_loss / len(formatted_data)}")

# Persist the fine-tuned model and its tokenizer side by side
output_dir = "./hansoldeco-kogpt2_4"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Epoch 1 - Avg Loss: 2.0007: 100%|██████████████████████████████████████████████████| 6440/6440 [19:09<00:00,  5.60it/s]


Epoch 1/25, Average Loss: 2.000742217181077


Epoch 2 - Avg Loss: 0.8603: 100%|██████████████████████████████████████████████████| 6440/6440 [19:08<00:00,  5.61it/s]


Epoch 2/25, Average Loss: 0.8602887085945525


Epoch 3 - Avg Loss: 0.4000: 100%|██████████████████████████████████████████████████| 6440/6440 [19:09<00:00,  5.60it/s]


Epoch 3/25, Average Loss: 0.40001681030826264


Epoch 4 - Avg Loss: 0.2437: 100%|██████████████████████████████████████████████████| 6440/6440 [19:07<00:00,  5.61it/s]


Epoch 4/25, Average Loss: 0.2437190512681137


Epoch 5 - Avg Loss: 0.1892: 100%|██████████████████████████████████████████████████| 6440/6440 [19:08<00:00,  5.61it/s]


Epoch 5/25, Average Loss: 0.1892161076613673


Epoch 6 - Avg Loss: 0.1556: 100%|██████████████████████████████████████████████████| 6440/6440 [19:08<00:00,  5.61it/s]


Epoch 6/25, Average Loss: 0.1556065561521729


Epoch 7 - Avg Loss: 0.1386: 100%|██████████████████████████████████████████████████| 6440/6440 [18:55<00:00,  5.67it/s]


Epoch 7/25, Average Loss: 0.13859518382854483


Epoch 8 - Avg Loss: 0.1235: 100%|██████████████████████████████████████████████████| 6440/6440 [18:56<00:00,  5.67it/s]


Epoch 8/25, Average Loss: 0.12352965870996509


Epoch 9 - Avg Loss: 0.1155: 100%|██████████████████████████████████████████████████| 6440/6440 [18:59<00:00,  5.65it/s]


Epoch 9/25, Average Loss: 0.11553031926546593


Epoch 10 - Avg Loss: 0.1106: 100%|█████████████████████████████████████████████████| 6440/6440 [18:57<00:00,  5.66it/s]


Epoch 10/25, Average Loss: 0.1105510388288404


Epoch 11 - Avg Loss: 0.1053: 100%|█████████████████████████████████████████████████| 6440/6440 [19:10<00:00,  5.60it/s]


Epoch 11/25, Average Loss: 0.10532773598863726


Epoch 12 - Avg Loss: 0.1010: 100%|█████████████████████████████████████████████████| 6440/6440 [19:09<00:00,  5.60it/s]


Epoch 12/25, Average Loss: 0.10096306730964169


Epoch 13 - Avg Loss: 0.0972: 100%|█████████████████████████████████████████████████| 6440/6440 [19:10<00:00,  5.60it/s]


Epoch 13/25, Average Loss: 0.09722426460075383


Epoch 14 - Avg Loss: 0.0934: 100%|█████████████████████████████████████████████████| 6440/6440 [19:00<00:00,  5.64it/s]


Epoch 14/25, Average Loss: 0.0934310611344654


Epoch 15 - Avg Loss: 0.0933: 100%|█████████████████████████████████████████████████| 6440/6440 [18:59<00:00,  5.65it/s]


Epoch 15/25, Average Loss: 0.09328241282452175


Epoch 16 - Avg Loss: 0.0885: 100%|█████████████████████████████████████████████████| 6440/6440 [19:08<00:00,  5.61it/s]


Epoch 16/25, Average Loss: 0.08845142558338526


Epoch 17 - Avg Loss: 0.0876: 100%|█████████████████████████████████████████████████| 6440/6440 [19:17<00:00,  5.56it/s]


Epoch 17/25, Average Loss: 0.08758627292237343


Epoch 18 - Avg Loss: 0.0879: 100%|█████████████████████████████████████████████████| 6440/6440 [18:59<00:00,  5.65it/s]


Epoch 18/25, Average Loss: 0.08788300434862095


Epoch 19 - Avg Loss: 0.0858: 100%|█████████████████████████████████████████████████| 6440/6440 [18:59<00:00,  5.65it/s]


Epoch 19/25, Average Loss: 0.08584743301016828


Epoch 20 - Avg Loss: 0.0843: 100%|█████████████████████████████████████████████████| 6440/6440 [18:59<00:00,  5.65it/s]


Epoch 20/25, Average Loss: 0.08430626231667973


Epoch 21 - Avg Loss: 0.0841: 100%|█████████████████████████████████████████████████| 6440/6440 [19:00<00:00,  5.65it/s]


Epoch 21/25, Average Loss: 0.08411903629738907


Epoch 22 - Avg Loss: 0.0811: 100%|█████████████████████████████████████████████████| 6440/6440 [19:01<00:00,  5.64it/s]


Epoch 22/25, Average Loss: 0.0810525272134113


Epoch 23 - Avg Loss: 0.0811: 100%|█████████████████████████████████████████████████| 6440/6440 [19:07<00:00,  5.61it/s]


Epoch 23/25, Average Loss: 0.08105548086477825


Epoch 24 - Avg Loss: 0.0802: 100%|█████████████████████████████████████████████████| 6440/6440 [19:12<00:00,  5.59it/s]


Epoch 24/25, Average Loss: 0.08019427226332844


Epoch 25 - Avg Loss: 0.0794: 100%|█████████████████████████████████████████████████| 6440/6440 [19:03<00:00,  5.63it/s]


Epoch 25/25, Average Loss: 0.07937391069312782


('./hansoldeco-kogpt2_4\\tokenizer_config.json',
 './hansoldeco-kogpt2_4\\special_tokens_map.json',
 './hansoldeco-kogpt2_4\\tokenizer.json')

In [9]:
# Reload the fine-tuned model and tokenizer from the directory they were saved to
model_dir = "./hansoldeco-kogpt2_4"
model = GPT2LMHeadModel.from_pretrained(model_dir)
model.to(device)
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_dir)

test = pd.read_csv('./data/test.csv')
preds = []

# Generate one answer for every question in the '질문' column
for test_question in tqdm(test['질문']):
    # Prompt format mirrors training: the question followed by the EOS token
    input_ids = tokenizer.encode(test_question + tokenizer.eos_token, return_tensors='pt')
    # Number of prompt tokens, used below to cut the prompt off the output
    prompt_length = input_ids.shape[-1]

    # Generate the answer.
    # NOTE(review): with top_k=1 only the single most likely token survives
    # filtering, so do_sample/temperature/top_p should have no practical
    # effect and decoding is effectively greedy — confirm this is intended.
    output_sequences = model.generate(
        input_ids=input_ids.to(device),
        max_length=300,
        temperature=0.9,
        top_k=1,
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True,
        num_return_sequences=1
    )

    # Store the generated text (answer)
    for generated_sequence in output_sequences:
        # The returned sequence starts with the prompt tokens, so drop them by
        # token count. The previous approach searched the decoded text for the
        # EOS string: str.find returns -1 when nothing matches (silently
        # yielding a wrong offset), and it would cut at the wrong place if the
        # question text itself contained the EOS literal.
        answer_ids = generated_sequence[prompt_length:]
        answer_only = tokenizer.decode(answer_ids, skip_special_tokens=False).strip()
        answer_only = answer_only.replace('\n', ' ')
        preds.append(answer_only)

100%|████████████████████████████████████████████████████████████████████████████████| 130/130 [14:02<00:00,  6.48s/it]


In [11]:
# Extract a 512-dimensional embedding vector from the generated answer to every
# question in the test dataset
from sentence_transformers import SentenceTransformer

# NOTE: this rebinds `model`, which until this cell referred to the fine-tuned
# GPT2LMHeadModel; from here on `model` is the sentence-embedding model.
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# Encode the list of generated answers; one embedding row per answer
pred_embeddings = model.encode(preds)
# Displayed as the cell output to check the (num_answers, embedding_dim) shape
pred_embeddings.shape

Downloading modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/539M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/452 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

Downloading 2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

(130, 512)

In [12]:
# Load the sample submission file as the template for the output
submit = pd.read_csv('./data/sample_submission.csv')
# Overwrite every column after the first one with the answer embeddings
# (the first column is kept as-is; presumably it holds the sample id —
# the row/column counts must match the embedding matrix)
submit.iloc[:,1:] = pred_embeddings
# Preview the first rows to sanity-check the filled-in values
submit.head()

Unnamed: 0,id,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,...,vec_502,vec_503,vec_504,vec_505,vec_506,vec_507,vec_508,vec_509,vec_510,vec_511
0,TEST_000,-0.00174,0.028238,0.034884,-0.014568,0.094486,-0.041205,0.019016,0.033069,-0.026172,...,-0.037115,-0.040449,0.005084,-0.024867,0.00301,0.080024,0.023238,-0.013583,-0.008512,0.053287
1,TEST_001,-0.034669,-0.012421,0.004309,0.006007,0.075005,0.010479,0.018293,0.014101,0.020187,...,0.015067,0.013309,-0.008057,-0.024717,-0.028334,0.063792,-0.01845,-0.020972,0.024904,0.027523
2,TEST_002,0.006782,-0.03481,-0.020411,0.01283,0.124574,-0.042875,0.007455,-0.017053,0.031871,...,-0.011219,-0.045216,0.056968,-0.024782,-0.015799,0.009405,-0.01449,-0.042023,-0.040934,0.067516
3,TEST_003,0.028911,0.046362,-0.001881,0.028639,0.056609,-0.053389,-0.076164,-0.025218,0.000975,...,-0.030978,0.001686,0.021035,-0.048269,0.013208,0.01907,0.000262,-0.000105,-0.035628,0.031741
4,TEST_004,-0.009312,-0.0117,-0.023611,-0.001638,0.110857,-0.017399,0.04542,0.04297,-0.033358,...,0.014104,-0.052094,0.037248,-0.010126,-0.031945,0.006107,0.007673,0.009926,-0.012819,0.072118


In [13]:
# Write the filled-in submission table to CSV without the DataFrame index column
submit.to_csv('./baseline_submit_4.csv', index=False)