In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import torch
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast, AdamW
from tqdm import tqdm

In [2]:
import os
os.chdir("Y:\Python\script\Study\Dacon_0201")
os.getcwd()

'Y:\\Python\\script\\Study\\Dacon_0201'

In [3]:
# CUDA 사용 가능 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
# 데이터 로드
data = pd.read_csv('./data/train.csv')

# 토크나이저 로드
tokenizer = PreTrainedTokenizerFast.from_pretrained('skt/kogpt2-base-v2', eos_token='</s>')

# 데이터 포맷팅 및 토크나이징
formatted_data = []
for _, row in tqdm(data.iterrows()):
    for q_col in ['질문_1', '질문_2']:
        for a_col in ['답변_1', '답변_2', '답변_3', '답변_4', '답변_5']:
            # 질문과 답변 쌍을 </s> token으로 연결
            input_text = row[q_col] + tokenizer.eos_token + row[a_col]
            input_ids = tokenizer.encode(input_text, return_tensors='pt')
            formatted_data.append(input_ids)
print('Done.')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
644it [00:00, 736.27it/s]

Done.





In [6]:
input_text

'AD, PD에 면한 벽체 결로에 대한 대책은 어떤 것이 있나요?</s>AD, PD에 면한 벽체의 결로에 대한 대책으로는 단열재를 미실하게 시공하여 결로가 생기는 벽체의 표면 온도를 노점온도 이상으로 유지해주는 것이 중요합니다. 결로가 발생하는 것을 방지하기 위해서는 적절한 수온조절과 단열재 시공이 필요합니다. 또한, 실내 습도를 적정 수준으로 유지하고 외부에서의 열이나 수분이 벽체에 스며들지 않도록 주의해야 합니다. AD, PD에 면한 벽체 결로에 대한 대책으로는 건물의 외부와 내부 온도 대조를 잘 조절하여 결로 발생을 최소화하는 것이 필요합니다.'

In [5]:
# 모델 로드
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
model.to(device) # 모델을 GPU단으로 이동

# 모델 학습 하이퍼파라미터(Hyperparameter) 세팅
# 실제 필요에 따라 조정하세요.
CFG = {
    'LR' : 2e-5, # Learning Rate
    'EPOCHS' : 30, # 학습 Epoch
}

In [11]:
# 모델 학습 설정
optimizer = AdamW(model.parameters(), lr=CFG['LR'])
model.train()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=51200, bias=False)
)

In [13]:
# 모델 학습
for epoch in range(CFG['EPOCHS']):
    total_loss = 0
    progress_bar = tqdm(enumerate(formatted_data), total=len(formatted_data))
    for batch_idx, batch in progress_bar:
        # 데이터를 GPU단으로 이동
        batch = batch.to(device)
        outputs = model(batch, labels=batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

        # 진행률 표시줄에 평균 손실 업데이트
        progress_bar.set_description(f"Epoch {epoch+1} - Avg Loss: {total_loss / (batch_idx+1):.4f}")

    # 에폭의 평균 손실을 출력
    print(f"Epoch {epoch+1}/{CFG['EPOCHS']}, Average Loss: {total_loss / len(formatted_data)}")

# 모델 저장
model.save_pretrained("./hansoldeco-kogpt2")
tokenizer.save_pretrained("./hansoldeco-kogpt2")

Epoch 1 - Avg Loss: 2.4962: 100%|██████████████████████████████████████████████████| 6440/6440 [03:00<00:00, 35.64it/s]


Epoch 1/30, Average Loss: 2.496171522362632


Epoch 2 - Avg Loss: 1.5838: 100%|██████████████████████████████████████████████████| 6440/6440 [03:03<00:00, 35.04it/s]


Epoch 2/30, Average Loss: 1.5837613562341804


Epoch 3 - Avg Loss: 1.0257: 100%|██████████████████████████████████████████████████| 6440/6440 [03:01<00:00, 35.56it/s]


Epoch 3/30, Average Loss: 1.0256689539607267


Epoch 4 - Avg Loss: 0.7001: 100%|██████████████████████████████████████████████████| 6440/6440 [03:01<00:00, 35.53it/s]


Epoch 4/30, Average Loss: 0.7000670270462122


Epoch 5 - Avg Loss: 0.4996: 100%|██████████████████████████████████████████████████| 6440/6440 [03:02<00:00, 35.35it/s]


Epoch 5/30, Average Loss: 0.49964165994608256


Epoch 6 - Avg Loss: 0.3794: 100%|██████████████████████████████████████████████████| 6440/6440 [02:59<00:00, 35.90it/s]


Epoch 6/30, Average Loss: 0.37943205651624695


Epoch 7 - Avg Loss: 0.3030: 100%|██████████████████████████████████████████████████| 6440/6440 [03:03<00:00, 35.14it/s]


Epoch 7/30, Average Loss: 0.30304837670316054


Epoch 8 - Avg Loss: 0.2563: 100%|██████████████████████████████████████████████████| 6440/6440 [03:01<00:00, 35.52it/s]


Epoch 8/30, Average Loss: 0.25629206324887016


Epoch 9 - Avg Loss: 0.2236: 100%|██████████████████████████████████████████████████| 6440/6440 [03:01<00:00, 35.56it/s]


Epoch 9/30, Average Loss: 0.22361377248719844


Epoch 10 - Avg Loss: 0.1994: 100%|█████████████████████████████████████████████████| 6440/6440 [03:03<00:00, 35.06it/s]


Epoch 10/30, Average Loss: 0.1994489519891578


Epoch 11 - Avg Loss: 0.1838: 100%|█████████████████████████████████████████████████| 6440/6440 [02:59<00:00, 35.79it/s]


Epoch 11/30, Average Loss: 0.18378214666168696


Epoch 12 - Avg Loss: 0.1721: 100%|█████████████████████████████████████████████████| 6440/6440 [03:01<00:00, 35.56it/s]


Epoch 12/30, Average Loss: 0.17212172106533133


Epoch 13 - Avg Loss: 0.1605: 100%|█████████████████████████████████████████████████| 6440/6440 [03:02<00:00, 35.38it/s]


Epoch 13/30, Average Loss: 0.16054477259771238


Epoch 14 - Avg Loss: 0.1533: 100%|█████████████████████████████████████████████████| 6440/6440 [03:02<00:00, 35.23it/s]


Epoch 14/30, Average Loss: 0.15332059762672007


Epoch 15 - Avg Loss: 0.1430: 100%|█████████████████████████████████████████████████| 6440/6440 [03:02<00:00, 35.37it/s]


Epoch 15/30, Average Loss: 0.1430431112159703


Epoch 16 - Avg Loss: 0.1387: 100%|█████████████████████████████████████████████████| 6440/6440 [02:58<00:00, 36.04it/s]


Epoch 16/30, Average Loss: 0.13868047184374485


Epoch 17 - Avg Loss: 0.1324: 100%|█████████████████████████████████████████████████| 6440/6440 [03:02<00:00, 35.34it/s]


Epoch 17/30, Average Loss: 0.13238797507520694


Epoch 18 - Avg Loss: 0.1284: 100%|█████████████████████████████████████████████████| 6440/6440 [03:04<00:00, 35.00it/s]


Epoch 18/30, Average Loss: 0.12836335927930587


Epoch 19 - Avg Loss: 0.1229: 100%|█████████████████████████████████████████████████| 6440/6440 [03:01<00:00, 35.48it/s]


Epoch 19/30, Average Loss: 0.1228549020982388


Epoch 20 - Avg Loss: 0.1206: 100%|█████████████████████████████████████████████████| 6440/6440 [03:02<00:00, 35.36it/s]


Epoch 20/30, Average Loss: 0.1205995986181722


Epoch 21 - Avg Loss: 0.1176: 100%|█████████████████████████████████████████████████| 6440/6440 [03:00<00:00, 35.68it/s]


Epoch 21/30, Average Loss: 0.11755058687000958


Epoch 22 - Avg Loss: 0.1160: 100%|█████████████████████████████████████████████████| 6440/6440 [03:00<00:00, 35.72it/s]


Epoch 22/30, Average Loss: 0.11602828534239013


Epoch 23 - Avg Loss: 0.1113: 100%|█████████████████████████████████████████████████| 6440/6440 [03:02<00:00, 35.35it/s]


Epoch 23/30, Average Loss: 0.11132220676275814


Epoch 24 - Avg Loss: 0.1097: 100%|█████████████████████████████████████████████████| 6440/6440 [03:00<00:00, 35.67it/s]


Epoch 24/30, Average Loss: 0.10967308039435689


Epoch 25 - Avg Loss: 0.1086: 100%|█████████████████████████████████████████████████| 6440/6440 [03:00<00:00, 35.64it/s]


Epoch 25/30, Average Loss: 0.10862577930859273


Epoch 26 - Avg Loss: 0.1056: 100%|█████████████████████████████████████████████████| 6440/6440 [03:02<00:00, 35.33it/s]


Epoch 26/30, Average Loss: 0.10564550579459007


Epoch 27 - Avg Loss: 0.1033: 100%|█████████████████████████████████████████████████| 6440/6440 [03:00<00:00, 35.62it/s]


Epoch 27/30, Average Loss: 0.10332218335524943


Epoch 28 - Avg Loss: 0.1032: 100%|█████████████████████████████████████████████████| 6440/6440 [03:02<00:00, 35.30it/s]


Epoch 28/30, Average Loss: 0.1031849570031515


Epoch 29 - Avg Loss: 0.1012: 100%|█████████████████████████████████████████████████| 6440/6440 [03:00<00:00, 35.70it/s]


Epoch 29/30, Average Loss: 0.1011894282546496


Epoch 30 - Avg Loss: 0.1001: 100%|█████████████████████████████████████████████████| 6440/6440 [02:59<00:00, 35.85it/s]


Epoch 30/30, Average Loss: 0.10013015973352775


('./hansoldeco-kogpt2\\tokenizer_config.json',
 './hansoldeco-kogpt2\\special_tokens_map.json',
 './hansoldeco-kogpt2\\tokenizer.json')

In [21]:
# 저장된 Fine-tuned 모델과 토크나이저 불러오기
model_dir = "./hansoldeco-kogpt2"
model = GPT2LMHeadModel.from_pretrained(model_dir)
model.to(device)
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_dir)

# Inference를 위한 test.csv 파일 로드
test = pd.read_csv('./data/test.csv')

# test.csv의 '질문'에 대한 '답변'을 저장할 리스트
preds = []

# '질문' 컬럼의 각 질문에 대해 답변 생성
for test_question in tqdm(test['질문']):
    # 입력 텍스트를 토큰화하고 모델 입력 형태로 변환
    input_ids = tokenizer.encode(test_question + tokenizer.eos_token, return_tensors='pt')

    # 답변 생성
    output_sequences = model.generate(
        input_ids=input_ids.to(device),
        max_length=300,
        temperature=0.9,
        top_k=1,
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True,
        num_return_sequences=1
    )

    # 생성된 텍스트(답변) 저장
    for generated_sequence in output_sequences:
        full_text = tokenizer.decode(generated_sequence, skip_special_tokens=False)
        # 질문과 답변의 사이를 나타내는 eos_token (</s>)를 찾아, 이후부터 출력
        answer_start = full_text.find(tokenizer.eos_token) + len(tokenizer.eos_token)
        answer_only = full_text[answer_start:].strip()
        answer_only = answer_only.replace('\n', ' ')
        preds.append(answer_only)

100%|████████████████████████████████████████████████████████████████████████████████| 130/130 [04:33<00:00,  2.11s/it]


In [15]:
# Test 데이터셋의 모든 질의에 대한 답변으로부터 512 차원의 Embedding Vector 추출
# 평가를 위한 Embedding Vector 추출에 활용하는 모델은 'distiluse-base-multilingual-cased-v1' 이므로 반드시 확인해주세요.
from sentence_transformers import SentenceTransformer # SentenceTransformer Version 2.2.2

# Embedding Vector 추출에 활용할 모델(distiluse-base-multilingual-cased-v1) 불러오기
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# 생성한 모든 응답(답변)으로부터 Embedding Vector 추출
pred_embeddings = model.encode(preds)
pred_embeddings.shape

(130, 512)

In [18]:
submit = pd.read_csv('./data/sample_submission.csv')
# 제출 양식 파일(sample_submission.csv)을 활용하여 Embedding Vector로 변환한 결과를 삽입
submit.iloc[:,1:] = pred_embeddings
submit.head()

Unnamed: 0,id,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,...,vec_502,vec_503,vec_504,vec_505,vec_506,vec_507,vec_508,vec_509,vec_510,vec_511
0,TEST_000,0.03943,0.052284,-0.010837,0.017736,0.108008,0.031396,0.025996,0.025984,-0.007522,...,-0.021107,-0.045743,-0.002101,-0.039602,-0.03091,0.030084,0.024172,0.00531,-0.028633,0.005533
1,TEST_001,-0.037038,0.02002,0.001134,0.009207,0.081419,-0.030418,-0.019119,-0.044742,0.000883,...,-0.042009,-0.022691,0.017259,-0.056746,0.002028,0.052509,-0.015952,-0.070776,0.014139,-0.006767
2,TEST_002,0.023194,-0.070512,-0.063158,0.004954,0.097865,-0.014374,0.038814,0.008755,0.036057,...,-0.017273,-0.030242,0.047169,-0.025758,-0.004745,0.014908,-0.010881,-0.037538,-0.000937,0.084507
3,TEST_003,0.055374,0.031727,-0.031782,0.012531,0.066642,-0.013288,-0.021099,0.058763,-0.002689,...,-0.017568,-0.017694,0.095326,-0.077263,0.045482,0.03162,-0.016304,-0.052003,-0.013276,0.066964
4,TEST_004,0.025686,0.018044,-0.040819,-0.019995,0.13407,-0.07425,-0.020119,0.014266,-0.023227,...,-0.027313,-0.065165,0.034412,-0.025109,-0.033019,-0.011047,0.015343,-0.02177,0.001402,0.059624


In [19]:
# 리더보드 제출을 위한 csv파일 생성
submit.to_csv('./baseline_submit.csv', index=False)