In [1]:
import pandas as pd
import numpy as np
import torch
import transformers
import bitsandbytes as bnb
import os
#import wandb
from datetime import datetime

from transformers import PreTrainedTokenizerFast, AutoTokenizer, AdamW, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from tqdm import tqdm

#os.environ["TOKENIZERS_PARALLELISM"] = "true" 
#torch.backends.cuda.matmul.allow_tf32=True
#torch.set_float32_matmul_precision('medium')
#torch.backends.cudnn.benchmark = True

# Data Preprocessing

In [2]:
# 데이터 로드 
data = pd.read_csv('./data/train_data_final.csv')

# 토크나이저 로드
tokenizer = PreTrainedTokenizerFast.from_pretrained('LDCC/LDCC-SOLAR-10.7B',
                                                    eos_token='</s>')


max_length = 128

# 데이터 포맷팅 및 토큰화
formatted_data = []
for _, row in tqdm(data.iterrows()):
    input_text = row['질문'] + tokenizer.eos_token + row['답변']
    input_enc = tokenizer.encode(input_text, return_tensors='pt',
                                 padding='max_length',
                                 truncation=True,
                                 max_length=max_length)
    formatted_data.append(input_enc)

print('Done.')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LlamaTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
6440it [00:03, 1929.38it/s]

Done.





In [3]:
formatted_data = torch.cat(formatted_data, dim=0)

In [4]:
# 모델 로드

model_id = "LDCC/LDCC-SOLAR-10.7B"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map='auto', # 'balanced' 로 수정
                                             # torch_dtype=torch.float32,
                                             
                                             )


model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


config = LoraConfig(
    r=8, 
    lora_alpha=32, 
    #target_modules=["query_key_value"], 
    target_modules=[
    "q_proj",
    "up_proj",
    "o_proj",
    "k_proj",
    "down_proj",
    "gate_proj",
    "v_proj"],
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

trainable params: 31457280 || all params: 5659561984 || trainable%: 0.555825346359525


In [6]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=formatted_data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=64, # 128도 가능할 듯?
        gradient_accumulation_steps=1,
        # max_steps=50, # default=1
        num_train_epochs=1, # default : 1
        learning_rate=1e-4,
        fp16=True,
        logging_steps=10,
        output_dir="outputs",
        optim="paged_adamw_8bit" #  defaults to "adamw_torch"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False 


In [7]:
print('Start train!')
print(f'Start time : {datetime.now()}')
trainer.train()

Start train!
Start time : 2024-03-05 11:51:07.463586




Step,Training Loss
10,1.4696
20,1.163
30,1.0382
40,0.9451
50,0.8891
60,0.828
70,0.775
80,0.7485
90,0.7051
100,0.682


TrainOutput(global_step=101, training_loss=0.9222028385294546, metrics={'train_runtime': 2779.1019, 'train_samples_per_second': 2.317, 'train_steps_per_second': 0.036, 'total_flos': 5.290870903406592e+16, 'train_loss': 0.9222028385294546, 'epoch': 1.0})

In [15]:
# 모델 저장 
model.save_pretrained('./outputs')

# Inference code
https://dacon.io/competitions/official/236216/codeshare/9690?page=1&dtype=recent

https://www.datacamp.com/tutorial/solar-10-7b-fine-tuned-model-tutorial

In [27]:
import time
import datetime
from huggingface_hub import hf_hub_download
import pandas as pd
import numpy as np
from peft import PeftModel, PeftConfig

In [2]:
# peft_model_id = "LDCC/LDCC-SOLAR-10.7B"
# config = PeftConfig.from_pretrained(peft_model_id)  # 
tokenizer = PreTrainedTokenizerFast.from_pretrained('LDCC/LDCC-SOLAR-10.7B',
                                                    eos_token='</s>',
                                                   device_map='auto')

saved_model = AutoModelForCausalLM.from_pretrained('./outputs',
                                                  device_map='balanced')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LlamaTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [4]:
#model.eval()
saved_model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(48000, 4096, padding_idx=2)
    (layers): ModuleList(
      (0-47): 48 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (k_proj): lora.Linear(
            (base_layer): Linear(in_features=4096, out_features=1024, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)


In [6]:
model = saved_model

In [7]:
test = pd.read_csv('./data/test.csv')

preds = []
cfg_max_token = 128

for test_q in tqdm(test['질문']):
    # 입력 텍스트를 토큰화 하고 모델 입력 형태로 변환
    input_ = tokenizer(test_q + tokenizer.eos_token, return_tensors='pt')

    # 답변 생성
    outputs = model.generate(input_.input_ids.to(model.device), max_new_tokens=cfg_max_token)

    # 생성된 텍스트(답변) 저장
    full_text = tokenizer.batch_decode(outputs.detach().cpu(), skip_special_tokens=False)
    # 질문과 답변의 사이를 나타내는 eos_token (</s>)를 찾아, 이후부터 출력
    full_text = full_text[0]
    answer_start = full_text.find(tokenizer.eos_token) + len(tokenizer.eos_token)
    answer_only = full_text[answer_start:].strip()
    answer_only = answer_only.replace('\n', ' ')
    preds.append(answer_only)

  0%|          | 0/130 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  1%|          | 1/130 [00:33<1:13:00, 33.96s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  2%|▏         | 2/130 [01:03<1:06:41, 31.26s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  5%|▍         | 6/130 [03:01<1:01:36, 29.81s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  5%|▌         | 7/130 [03:31<1:00:58, 29.74s/it]A decoder-only architecture is being used, but right-padding was detected

# Submission

In [8]:
# Test 데이터셋의 모든 질의에 대한 답변으로부터 512 차원의 Embedding Vector 추출
# 평가를 위한 Embedding Vector 추출에 활용하는 모델은 'distiluse-base-multilingual-cased-v1' 이므로 반드시 확인해주세요.
from sentence_transformers import SentenceTransformer # SentenceTransformer Version 2.2.2

# Embedding Vector 추출에 활용할 모델(distiluse-base-multilingual-cased-v1) 불러오기
modelEmb = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# 생성한 모든 응답(답변)으로부터 Embedding Vector 추출
pred_embeddings = modelEmb.encode(preds)
pred_embeddings.shape

(130, 512)

In [None]:
submit = pd.read_csv('./data/sample_submission.csv')
# 제출 양식 파일(sample_submission.csv)을 활용하여 Embedding Vector로 변환한 결과를 삽입
submit.iloc[:,1:] = pred_embeddings
#submit.head()

 -0.05787576  0.02424286 -0.01426161  0.02708409 -0.03406559 -0.02915388
  0.01783147 -0.01336498  0.02643015  0.00913416  0.04120821  0.00343056
  0.0553206  -0.03493883  0.01026988  0.03103421 -0.00014379  0.04794798
 -0.00668999 -0.02794424  0.0574375  -0.00750846 -0.01270988  0.00829487
  0.01746696 -0.01356706 -0.00439308  0.02510952  0.00179581 -0.03258586
  0.00964298 -0.03848421 -0.02407507 -0.00775238  0.01716223 -0.02149599
  0.01907344  0.01062734 -0.00102517  0.00170575  0.00806548 -0.0262778
 -0.03089069 -0.00879641  0.00282553  0.05847174 -0.02919797 -0.03616628
  0.04105035  0.01449057  0.03698153  0.0436534  -0.01013702  0.06011777
 -0.00544043  0.03347301 -0.00146516 -0.00039528  0.01456409 -0.02860089
  0.00377667  0.0438307  -0.00335442  0.03798956  0.00199026 -0.01114311
  0.04903546  0.00649571  0.01185958 -0.04719926  0.06271724  0.00713183
 -0.00235171  0.04034111 -0.00595038 -0.07398773  0.01504692 -0.0138079
 -0.05483552 -0.03416279  0.05759177 -0.01898777  0.0

In [None]:
# 리더보드 제출을 위한 csv파일 생성
cfg_full_name = 'QLoRA + 4bit quantization + LDCC-SOLAR-10.7B'
submit.to_csv(f'./submissions/{cfg_full_name}_saved-model_test-submit2.csv', index=False)