In [3]:
import pandas as pd
import numpy as np
import torch
import transformers
import bitsandbytes as bnb
import os
import wandb

from transformers import PreTrainedTokenizerFast, AdamW, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from tqdm import tqdm

#os.environ["TOKENIZERS_PARALLELISM"] = "true" 
#torch.backends.cuda.matmul.allow_tf32=True
#torch.set_float32_matmul_precision('medium')
#torch.backends.cudnn.benchmark = True

In [19]:
# 데이터 로드
data = pd.read_csv('./data/train.csv')
tokenizer = PreTrainedTokenizerFast.from_pretrained('LDCC/LDCC-SOLAR-10.7B',  eos_token='</s>')

max_length = 300

formatted_data = []
for _, row in tqdm(data.iterrows()):
  for q_col in ['질문_1', '질문_2']:
    for a_col in ['답변_1', '답변_2', '답변_3', '답변_4', '답변_5']:
      input_text = row[q_col] + tokenizer.eos_token + row[a_col]
      input_ids = tokenizer.encode(input_text, return_tensors='pt', padding='max_length', truncation=True, max_length=max_length)
      formatted_data.append(input_ids)
print('Done.')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LlamaTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
644it [00:02, 278.62it/s]

Done.





In [20]:
formatted_data = torch.cat(formatted_data, dim=0)

In [22]:
# 모델 로드

model_id = "LDCC/LDCC-SOLAR-10.7B"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map={"":0},
                                             #torch_dtype=torch.float32,
                                             
                                             )


model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


config = LoraConfig(
    r=8, 
    lora_alpha=32, 
    #target_modules=["query_key_value"], 
    target_modules=[
    "q_proj",
    "up_proj",
    "o_proj",
    "k_proj",
    "down_proj",
    "gate_proj",
    "v_proj"],
    lora_dropout=0.03, 
    bias="none", 
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainer = transformers.Trainer(
    model=model,
    train_dataset=formatted_data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=1,
        max_steps=50,
        learning_rate=1e-4,
        fp16=True,
        logging_steps=10,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False 
trainer.train()

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

trainable params: 31457280 || all params: 5659561984 || trainable%: 0.555825346359525


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myijingue[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
10,1.737
20,1.3585
30,1.3678
40,1.3128


KeyboardInterrupt: 

In [6]:
model.save_pretrained("./QLoRA_1")
tokenizer.save_pretrained("./QLoRA_1")

('./QLoRA_1\\tokenizer_config.json',
 './QLoRA_1\\special_tokens_map.json',
 './QLoRA_1\\tokenizer.json')

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [35]:
test = pd.read_csv('./data/test.csv')
preds = []

# '질문' 컬럼의 각 질문에 대해 답변 생성
for test_question in tqdm(test['질문']):
    input_ids = tokenizer.encode(test_question + tokenizer.eos_token, return_tensors='pt')

    # 답변 생성
    output_sequences = model.generate(
        input_ids=input_ids.to(device),
        # max_length=300,
        temperature=0.9,
        top_k=1,
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True,
        num_return_sequences=1
    )

    # 생성된 텍스트(답변) 저장
    for generated_sequence in output_sequences:
        full_text = tokenizer.decode(generated_sequence, skip_special_tokens=False)
        answer_start = full_text.find(tokenizer.eos_token) + len(tokenizer.eos_token)
        answer_only = full_text[answer_start:].strip()
        answer_only = answer_only.replace('\n', ' ')
        preds.append(answer_only)

  0%|                                                                                          | 0/130 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  1%|▋                                                                                 | 1/130 [00:00<01:13,  1.76it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  2%|█▎                                                                                | 2/130 [00:00<00:48,  2.63it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  2%|█▉                                                                                | 3/130 [00:01<00:40,  3.12it/s]A dec

In [36]:
# Test 데이터셋의 모든 질의에 대한 답변으로부터 512 차원의 Embedding Vector 추출
from sentence_transformers import SentenceTransformer

model_tf = SentenceTransformer('distiluse-base-multilingual-cased-v1')

pred_embeddings = model_tf.encode(preds)
pred_embeddings.shape

(130, 512)

In [38]:
submit = pd.read_csv('./data/sample_submission.csv')
submit.iloc[:,1:] = pred_embeddings
submit.head()

  0.01434107  0.01434107  0.01434107  0.01434107  0.01434107  0.04668183
  0.01013472  0.01434107  0.01434107  0.07021035  0.01434107  0.04312139
 -0.03134401 -0.01313984  0.01434107  0.01434107  0.01434107  0.01434107
  0.02514496 -0.00658566  0.01434107  0.01434107  0.01434107 -0.01849415
  0.01434107  0.04312139  0.00777325  0.01434107  0.02743107  0.03587855
  0.01434107  0.01434107  0.01434107  0.01434107  0.01434107 -0.02878269
  0.01779351  0.04544792  0.01434107  0.02568698  0.01434107  0.01434107
  0.01434107  0.01434107  0.01434107 -0.05315246  0.01434107 -0.03134401
  0.01434107  0.01434107  0.01434107  0.03430324  0.01434107  0.01434105
  0.01434107  0.10294423  0.01779351  0.01434107  0.01434105  0.01434107
  0.01434107  0.00922401  0.01434107  0.01434107  0.01434107  0.01434107
  0.02743107  0.01434107  0.02558119  0.01434107  0.01434107  0.01122673
  0.02189182  0.01434107  0.01434106  0.01434106  0.01434106  0.01434106
  0.01434106  0.02743107 -0.0026663   0.01434106  0

Unnamed: 0,id,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,...,vec_502,vec_503,vec_504,vec_505,vec_506,vec_507,vec_508,vec_509,vec_510,vec_511
0,TEST_000,0.014341,-0.027606,-0.032805,-0.02998,-0.103726,0.019318,0.010519,-0.065659,-0.009554,...,0.022845,-0.002845,0.020284,0.052013,-0.079881,-0.087341,-0.026903,-0.020639,0.027783,-0.031374
1,TEST_001,0.014341,-0.027606,-0.032805,-0.02998,-0.103726,0.019318,0.010519,-0.065659,-0.009554,...,0.022845,-0.002845,0.020284,0.052013,-0.079881,-0.087341,-0.026903,-0.020639,0.027783,-0.031374
2,TEST_002,0.035498,-0.011436,-0.056334,-0.034085,-0.049133,-0.008934,-0.028484,-0.079785,0.060099,...,0.015475,-0.01477,0.01136,0.015088,-0.045027,-0.054215,-0.060377,-0.016621,-0.030519,0.01126
3,TEST_003,0.010135,0.016149,-0.02686,-0.030011,-0.098167,0.031916,-0.093679,-0.066795,-0.030247,...,0.024087,0.035328,-0.003529,0.012564,-0.081047,-0.053023,-0.020717,-0.05233,0.019067,-0.02248
4,TEST_004,0.014341,-0.027606,-0.032805,-0.02998,-0.103726,0.019318,0.010519,-0.065659,-0.009554,...,0.022845,-0.002845,0.020284,0.052013,-0.079881,-0.087341,-0.026903,-0.020639,0.027783,-0.031374


In [39]:
submit.to_csv('./baseline_submit_lora.csv', index=False)