In [1]:
import pandas as pd
import numpy as np
import torch
import transformers
import bitsandbytes as bnb
import os
import wandb

from transformers import PreTrainedTokenizerFast, AdamW, AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from tqdm import tqdm

bin C:\Users\yjg10\anaconda3\lib\site-packages\bitsandbytes\libbitsandbytes_cuda118.dll


In [2]:
data = pd.read_csv('./data/train.csv')

tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b-it", eos_token='</s>')

formatted_data = []
for _, row in tqdm(data.iterrows()):
    for q_col in ['질문_1', '질문_2']:
        for a_col in ['답변_1', '답변_2', '답변_3', '답변_4', '답변_5']:
            input_text = row[q_col] + tokenizer.eos_token + row[a_col]
            input_ids = tokenizer.encode(input_text, return_tensors='pt')
            formatted_data.append(input_ids)
print('Done.')

644it [00:01, 517.78it/s]


Done.


In [3]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True
)

In [4]:
model = AutoModelForCausalLM.from_pretrained("google/gemma-7b-it", device_map="cuda", quantization_config=bnb_config)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


In [5]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [6]:
config = LoraConfig(
    r=8, 
    lora_alpha=32, 
    #target_modules=["query_key_value"], 
    target_modules=[
    "q_proj",
    "up_proj",
    "o_proj",
    "k_proj",
    "down_proj",
    "gate_proj",
    "v_proj"],
    lora_dropout=0.03, 
    bias="none", 
    task_type="CAUSAL_LM"
)

In [7]:
model = get_peft_model(model, config)

In [8]:
CFG = {
    'LR' : 1e-5, # Learning Rate
    'EPOCHS' : 5, # 학습 Epoch
    'eps' : 5e-06
}

In [9]:
optimizer = AdamW(model.parameters(), lr=CFG['LR'], eps = CFG['eps'])
model.train()



PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GemmaForCausalLM(
      (model): GemmaModel(
        (embed_tokens): Embedding(256000, 3072, padding_idx=0)
        (layers): ModuleList(
          (0-27): 28 x GemmaDecoderLayer(
            (self_attn): GemmaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.03, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_l

In [10]:
# 모델 학습
for epoch in range(CFG['EPOCHS']):
    total_loss = 0
    progress_bar = tqdm(enumerate(formatted_data), total=len(formatted_data))
    for batch_idx, batch in progress_bar:
        batch = batch.to('cuda')
        outputs = model(batch, labels=batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

        progress_bar.set_description(f"Epoch {epoch+1} - Avg Loss: {total_loss / (batch_idx+1):.4f}")

    print(f"Epoch {epoch+1}/{CFG['EPOCHS']}, Average Loss: {total_loss / len(formatted_data)}")

# 모델 저장
model.save_pretrained("./hansoldeco-gemma_1")
tokenizer.save_pretrained("./hansoldeco-gemma_1")

  0%|                                                                                         | 0/6440 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  attn_output = torch.nn.functional.scaled_dot_product_attention(
Epoch 1 - Avg Loss: 1.4628: 100%|████████████████████████████████████████████████| 6440/6440 [1:31:58<00:00,  1.17it/s]


Epoch 1/5, Average Loss: 1.4627984388413267


Epoch 2 - Avg Loss: 0.8872: 100%|████████████████████████████████████████████████| 6440/6440 [1:32:00<00:00,  1.17it/s]


Epoch 2/5, Average Loss: 0.8872355479423119


Epoch 3 - Avg Loss: 0.5496: 100%|████████████████████████████████████████████████| 6440/6440 [1:31:28<00:00,  1.17it/s]


Epoch 3/5, Average Loss: 0.5495741888383253


Epoch 4 - Avg Loss: 0.3323: 100%|████████████████████████████████████████████████| 6440/6440 [1:31:11<00:00,  1.18it/s]


Epoch 4/5, Average Loss: 0.3322982309571242


Epoch 5 - Avg Loss: 0.2128: 100%|████████████████████████████████████████████████| 6440/6440 [1:31:27<00:00,  1.17it/s]


Epoch 5/5, Average Loss: 0.21278373805100317


('./hansoldeco-gemma_1\\tokenizer_config.json',
 './hansoldeco-gemma_1\\special_tokens_map.json',
 './hansoldeco-gemma_1\\tokenizer.model',
 './hansoldeco-gemma_1\\added_tokens.json',
 './hansoldeco-gemma_1\\tokenizer.json')

In [2]:
model_dir = "./hansoldeco-gemma_1"
model = AutoModelForCausalLM.from_pretrained(model_dir)
model.to('cuda')
tokenizer = AutoTokenizer.from_pretrained(model_dir)

test = pd.read_csv('./data/test.csv')
preds = []

# '질문' 컬럼의 각 질문에 대해 답변 생성
for test_question in tqdm(test['질문']):
    input_ids = tokenizer.encode(test_question + tokenizer.eos_token, return_tensors='pt')

    # 답변 생성
    output_sequences = model.generate(
        input_ids=input_ids.to('cuda'),
        max_length=300,
        temperature=0.9,
        top_k=1,
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True,
        num_return_sequences=1
    )

    # 생성된 텍스트(답변) 저장
    for generated_sequence in output_sequences:
        full_text = tokenizer.decode(generated_sequence, skip_special_tokens=False)
        answer_start = full_text.find(tokenizer.eos_token) + len(tokenizer.eos_token)
        answer_only = full_text[answer_start:].strip()
        answer_only = answer_only.replace('\n', ' ')
        preds.append(answer_only)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|█████████████████████████████████████████████████████████████████████████████| 130/130 [9:13:40<00:00, 255.54s/it]


In [3]:
# Test 데이터셋의 모든 질의에 대한 답변으로부터 512 차원의 Embedding Vector 추출
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

pred_embeddings = model.encode(preds)
pred_embeddings.shape

(130, 512)

In [4]:
submit = pd.read_csv('./data/sample_submission.csv')
submit.iloc[:,1:] = pred_embeddings
submit.head()

 -1.4100749e-02 -2.0120133e-02 -1.4857436e-02  8.6378111e-03
 -5.6058690e-03  1.2588419e-02 -1.1796307e-02 -5.5036109e-02
 -2.3220483e-02  1.2828622e-03 -2.5946749e-02 -9.7222133e-03
 -3.6573783e-03 -1.6094036e-02 -3.5180431e-02 -1.1942399e-02
 -1.9172367e-02 -3.7330389e-03  8.7110080e-02 -5.1608472e-03
 -2.9324396e-03 -1.5446494e-02  5.9152371e-03 -2.2650961e-02
 -4.7250804e-03 -6.7779817e-02 -2.6356727e-02 -3.2336984e-02
  3.6796372e-02 -2.3775477e-02  1.9863317e-02 -4.1249864e-02
 -3.0175209e-02 -7.7523175e-03 -3.5332765e-03 -2.8997162e-02
 -2.4992911e-02 -1.7004956e-02 -2.2148654e-02 -5.9040911e-02
  1.9170431e-02 -4.3495875e-02  1.8815327e-02  3.8294862e-03
  3.8930681e-05  5.4402649e-03  2.6349509e-03 -2.1422457e-02
  1.9274894e-02 -6.6782221e-02 -6.7119822e-02  1.1798335e-02
 -2.2180285e-02 -3.9011531e-02 -7.9804473e-03 -2.0972403e-02
 -2.1600561e-02 -5.1894423e-02  3.6616925e-02  1.2272762e-02
  1.1539986e-02 -2.2637097e-02  8.9248046e-03 -2.7362615e-02
 -2.8436219e-03 -4.23358

Unnamed: 0,id,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,...,vec_502,vec_503,vec_504,vec_505,vec_506,vec_507,vec_508,vec_509,vec_510,vec_511
0,TEST_000,-0.030701,0.037838,0.026273,-0.052029,0.066746,-0.029953,0.019923,0.019805,-0.006178,...,-0.01903,-0.05511,-0.028314,-0.047289,0.019225,0.038215,0.006872,-0.004355,0.04359,0.039623
1,TEST_001,0.010415,-0.003913,0.002629,-0.042928,0.125302,-0.008383,-0.017827,-0.000606,0.000235,...,0.006184,-0.014578,0.028897,-0.031307,-0.010302,-0.030923,0.00821,-0.040377,0.034278,0.056625
2,TEST_002,-0.029305,0.014168,0.012606,-0.028724,0.042983,-0.021368,-0.007101,-0.058296,-0.017497,...,-0.025482,-0.029112,0.022751,-0.093013,-0.064567,0.048615,0.04407,-0.011956,0.039015,0.033038
3,TEST_003,-0.029943,-7.9e-05,-0.038297,-0.000717,0.062758,-0.070725,-0.034338,-0.05742,-0.021166,...,-0.010251,-0.028414,0.020845,-0.041541,0.01527,0.031159,0.018876,-0.042428,-0.009054,0.043878
4,TEST_004,-0.014101,0.004296,-0.00945,-0.018725,0.104379,-0.008382,0.052548,0.033437,-0.034418,...,-0.015233,-0.006626,0.004315,-0.026775,-0.007465,-0.034787,0.045336,-0.024773,-0.01508,0.031129


In [5]:
submit.to_csv('./baseline_submit_lora_2.csv', index=False)