In [1]:
import pandas as pd
import numpy as np
import torch
import transformers
import bitsandbytes as bnb
import os
#import wandb
from datetime import datetime
from pytz import timezone

from transformers import PreTrainedTokenizerFast, AutoTokenizer, AdamW, AutoModelForCausalLM, BitsAndBytesConfig, EarlyStoppingCallback, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel, PeftConfig
from tqdm import tqdm
import sklearn
from sklearn.model_selection import train_test_split

#os.environ["TOKENIZERS_PARALLELISM"] = "true" 
#torch.backends.cuda.matmul.allow_tf32=True
#torch.set_float32_matmul_precision('medium')
#torch.backends.cudnn.benchmark = True

In [2]:
# config
cfg = {
    'runtime' : datetime.now(timezone('Asia/Seoul')).strftime("%y%m%d%H%M"), # cfg를 생성한 시각 -> model dir의 마지막 부분에 들어감
    'model_name' : 'LDCC-SOLAR-10.7B',
    'trainer_name' : 'QLoRA + 4bit quantization + LDCC-SOLAR-10.7B',
    'random_seed' : 2024, # seed

    # train_config 
    'train_strategy' : 'epoch', # 'epoch' or 'steps' 
    'train_size' : 0.9, # train size
    'test_size' : 0.1, # evaluation size
    'shuffle' : True, # shuffle when train_test_split
    'train_batch_size' : 32, # train batch_size / max_length : 512 기준 128, 64 -> CUDA OOM  -> 32 !!
    'eval_batch_size' : 8, # evaluation batch_size / train_batch_size 낮춰주면서 같이 낮춰줌
    'lr' : 1e-4, # learning rate
    'optimizer' : "paged_adamw_8bit", # optimizer

    # steps config
    'max_steps' : 50,  # use only when train_strategy is 'steps'
    'logging_steps' : 5, # use only when train_strategy is 'steps'
    'save_steps' : 5, # default:500 / use only when train_strategy is 'steps' 

    # epoch config
    'num_train_epochs' : 5, # use only when train_stratey is 'epoch'
    
    
    # evaluation(validation) config
    'eval_strategy' : 'epoch', # how often do eval -> unify with 'train_strategy'
    'do_eval' : True, # evaluate or not 

    # save config - logs and models
    'save_strategy' : 'epoch', # how often save a model -> unify with 'train_strategy'
    'log_dir' : './logs', # log directory
    'log_strategy' : 'epoch', # how often write log -> unify with 'train_strategy'
    'model_dir' : './models/', # model directory to save trained model 
    'save_total_limit' : 3, # number of model saved (include best model)
    'load_best_model_at_end' : True, # load a best model at the end of training and also save
    'metric_for_best_model' : 'loss', # metric for select the best model : loss -> eval_loss
    'greater_is_better' : False, # greater metric(loss) is better?

    # device config
    'device_map' : 'balanced', # gpu mapping

    # token config
    'max_length' : 512, # max length of encoding by tokenizer / 20240307 기존 128에서 512로 변경 (학습 데이터가 잘리는 것을 목격)
    'max_token' : 512, # max number of tokens generated by model(inference) : 
    'padding_side' : 'left', # location of padding 
    'padding' : 'max_length', # use tokenizer.encode(padding='max_length', args)
    'return_tensor' : 'pt', # tokenizer return tensor 'pt' : pytorch, 'tf' : tensorflow
}

# Trainer 
training_argument = TrainingArguments(
    do_eval=cfg['do_eval'],
    per_device_train_batch_size=cfg['train_batch_size'], #  128
    per_device_eval_batch_size=cfg['eval_batch_size'], # 16
    evaluation_strategy = cfg['train_strategy'], 
    # gradient_accumulation_steps=1, # 큰 배치 사이즈를 사용하지 못하는 경우 이전 스텝의 gradient를 축적하여 사용하는 것
    # max_steps=cfg['max_steps'], # default=1
    num_train_epochs=cfg['num_train_epochs'], # 10 , when select 'stpes' at train_strategy set as comment this line
    learning_rate=cfg['lr'], # 1e-4
    fp16=True,
    # logging_steps=cfg['logging_steps'], # how often write log
    # save_steps=cfg['save_steps'], # how often save 
    logging_dir = cfg['log_dir'],# + cfg['trainer_name'] + '-' + cfg['runtime'], # 학습 및 평가 로그 저장 디렉토리 -> 제대로 로그 저장이 안됨, 그냥 로그가 저장이 안되는 것 같다.
    logging_strategy=cfg['train_strategy'],
    output_dir = cfg['model_dir'] + cfg['trainer_name']+ '-' + cfg['runtime'],
    optim= cfg['optimizer'], # paged_adamw_8bit
    save_total_limit=cfg['save_total_limit'],
    load_best_model_at_end = cfg['load_best_model_at_end'], 
    save_strategy = cfg['train_strategy'],
    metric_for_best_model=cfg['metric_for_best_model'],
    greater_is_better = cfg['greater_is_better'],
)

# Data Preprocessing

In [3]:
# 데이터 로드 
data = pd.read_csv('./data/train_data_final.csv')

# 토크나이저 로드
# tokenizer = PreTrainedTokenizerFast.from_pretrained('LDCC/LDCC-SOLAR-10.7B',
#                                                     eos_token='</s>',
#                                                     #pad_token= -100,
#                                                     device_map=cfg['device_map'],
#                                                     # padding='right', # left is better 
#                                                    )


tokenizer = AutoTokenizer.from_pretrained('LDCC/LDCC-SOLAR-10.7B',
                                          eos_token='</s>',
                                          device_map=cfg['device_map'],
                                          padding_side=cfg['padding_side'],
                                         )
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# 데이터 포맷팅 및 토큰화
formatted_data = []
for _, row in tqdm(data.iterrows()):
    input_text = row['질문'] + tokenizer.eos_token + row['답변']
    input_enc = tokenizer.encode(input_text, return_tensors=cfg['return_tensor'],
                                 padding=cfg['padding'],
                                 truncation=cfg['shuffle'],
                                 max_length=cfg['max_length']) # every tokenized data has 128 length
    formatted_data.append(input_enc)

print('Done.')

6440it [00:02, 2662.87it/s]

Done.





In [5]:
# train_test_split
train_data, valid_data = train_test_split(formatted_data, 
                                          train_size=cfg['train_size'],
                                          test_size=cfg['test_size'],
                                          shuffle=cfg['shuffle'], 
                                          random_state=cfg['random_seed']
                                         )

In [6]:
#formatted_data = torch.cat(formatted_data, dim=0)
formatted_train_data = torch.cat(train_data, dim=0)
formatted_valid_data = torch.cat(valid_data, dim=0)

# Model Load

In [7]:
# 모델 로드

model_id = "LDCC/LDCC-SOLAR-10.7B"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# model = AutoModelForCausalLM.from_pretrained(model_id,
#                                              quantization_config=bnb_config,
#                                              device_map=cfg['device_map'] 
#                                              # torch_dtype=torch.float32,
#                                              )

# load best model before
peft_model_id = './models/QLoRA + 4bit quantization + LDCC-SOLAR-10.7B-2403062219/checkpoint-414-best'
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path,
                                            device_map=cfg['device_map'],
                                            )
model = PeftModel.from_pretrained(model, peft_model_id)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)


model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


config = LoraConfig(
    r=8, 
    lora_alpha=32, 
    #target_modules=["query_key_value"], 
    target_modules=[
    "q_proj",
    "up_proj",
    "o_proj",
    "k_proj",
    "down_proj",
    "gate_proj",
    "v_proj"],
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


trainable params: 31457280 || all params: 10894053376 || trainable%: 0.28875643357229686


In [8]:
# Trainer 정의
trainer = transformers.Trainer(
    model=model,
    train_dataset=formatted_train_data,
    eval_dataset=formatted_valid_data,
    args=training_argument,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False 

In [None]:
print('Start train!')
print(f'Start time : {datetime.now(timezone("Asia/Seoul"))}')
print(f'train epochs : {training_argument.num_train_epochs}') # training_argument.train_steps 로 변경 필요
# print(f'logging steps : {training_argument.logging_steps}')
print(f'logging dir : {training_argument.logging_dir}')
print(f'output_dir : {training_argument.output_dir}')
trainer.train()
print('End train!')
print(f'End time : {datetime.now(timezone("Asia/Seoul"))}')

Start train!
Start time : 2024-03-07 19:33:36.198487+09:00
train epochs : 5
logging dir : ./logs
output_dir : ./models/QLoRA + 4bit quantization + LDCC-SOLAR-10.7B-2403071930


Epoch,Training Loss,Validation Loss
1,0.833,0.588605


# Inference

In [None]:
model

In [None]:
# tokenizer = PreTrainedTokenizerFast.from_pretrained('LDCC/LDCC-SOLAR-10.7B',
#                                                     eos_token='</s>',
#                                                    device_map='balanced')

In [None]:
model.eval()

In [None]:
test = pd.read_csv('./data/test.csv')

preds = []

for test_q in tqdm(test['질문']):
    # 입력 텍스트를 토큰화 하고 모델 입력 형태로 변환
    input_ = tokenizer(test_q + tokenizer.eos_token, return_tensors='pt')

    # 답변 생성
    outputs = model.generate(input_.input_ids.to(model.device), 
                             max_new_tokens=cfg['max_token'], 
                            )

    # 생성된 텍스트(답변) 저장
    full_text = tokenizer.batch_decode(outputs.detach().cpu(), skip_special_tokens=False)
    # 질문과 답변의 사이를 나타내는 eos_token (</s>)를 찾아, 이후부터 출력
    full_text = full_text[0]
    answer_start = full_text.find(tokenizer.eos_token) + len(tokenizer.eos_token)
    answer_only = full_text[answer_start:].strip()
    answer_only = answer_only.replace('\n', ' ')
    preds.append(answer_only)

# Submission

In [None]:
# Test 데이터셋의 모든 질의에 대한 답변으로부터 512 차원의 Embedding Vector 추출
# 평가를 위한 Embedding Vector 추출에 활용하는 모델은 'distiluse-base-multilingual-cased-v1' 이므로 반드시 확인해주세요.
from sentence_transformers import SentenceTransformer # SentenceTransformer Version 2.2.2

# Embedding Vector 추출에 활용할 모델(distiluse-base-multilingual-cased-v1) 불러오기
modelEmb = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# 생성한 모든 응답(답변)으로부터 Embedding Vector 추출
pred_embeddings = modelEmb.encode(preds)
pred_embeddings.shape

In [None]:
submit = pd.read_csv('./data/sample_submission.csv')
# 제출 양식 파일(sample_submission.csv)을 활용하여 Embedding Vector로 변환한 결과를 삽입
submit.iloc[:,1:] = pred_embeddings

In [None]:
# 리더보드 제출을 위한 csv파일 생성

submit.to_csv(f'./submissions/' + cfg['trainer_name']+ '-' + cfg['runtime']+'512tokens'+'.csv', 
              index=False)