## Import

In [1]:
# %pip install -U bitsandbytes
# %pip install -U transformers
# %pip install -U peft
# %pip install -U accelerate
# %pip install -U trl 

In [1]:
import os,torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging
from transformers import TrainingArguments, set_seed
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
set_seed(42)
import wandb
import random
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# CUDA 사용 가능 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
secret_wandb = "7df92b5f77c812550e3f38029dc2c0a7bb2b7caa"
wandb.login(key = secret_wandb)
run = wandb.init(
    project='Fine tuning Edentns-DataVortexS with train2', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Currently logged in as: [33mcoldbrew[0m ([33mx_team[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /shared/home/qhdrl527/.netrc


## Data Preprocessing

In [4]:
# # 데이터 로드

model_name = "Edentns/DataVortexS-10.7B-dpo-v1.1"

In [5]:
# from datasets import Dataset
# import pandas as pd
# data = pd.read_csv('./train_combined_after1000k.csv')
# questions = data['질문'].tolist()
# answers = data['답변'].tolist()

# formatted_data = [{'text': f"<s> Question: {q} Answer: {a} </s>"} for q, a in zip(questions, answers)]

# dataset = Dataset.from_dict({"text": [item['text'] for item in formatted_data]})

# print(dataset)


In [6]:
# formatted_data[:3]
# dataset.save_to_disk("train_combined_after1000k")

In [8]:
from datasets import load_from_disk

# 저장된 데이터셋 불러오기
dataset = load_from_disk("train_combined_1000k")

# 데이터셋 확인
print(dataset)

Dataset({
    features: ['text'],
    num_rows: 10000000
})


## Model Fine-tuning

In [9]:
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

bnb_config = BitsAndBytesConfig(  
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= False,
)

model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,  
        device_map="auto",
        trust_remote_code=True,
        
    )

model.config.use_cache = False # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
#Adding the adapters in the layers
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
)
model = get_peft_model(model, peft_config)

## HyperParam

In [11]:
training_arguments = TrainingArguments(
    output_dir="Edentns-DataVortexS-train2",
    num_train_epochs=1,
    per_device_train_batch_size=64,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    save_steps=500,
    logging_steps=1,
    learning_rate=3e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    #warmup_steps=100,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    run_name = "Edentns-DataVortexS-train_combined",
    report_to="wandb"
)

In [12]:
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length= None,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)

Map:   0%|          | 0/10000000 [00:00<?, ? examples/s]

In [13]:
trainer.train(resume_from_checkpoint = False)

OutOfMemoryError: CUDA out of memory. Tried to allocate 10.48 GiB. GPU 3 has a total capacity of 47.54 GiB of which 4.87 GiB is free. Including non-PyTorch memory, this process has 42.67 GiB memory in use. Of the allocated memory 39.23 GiB is allocated by PyTorch, and 2.32 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
new_model = "Edentns-DataVortexS-train_combined"

In [None]:

trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)

## Model Inference

In [None]:
import os,torch
# 환경 변수 설정
os.environ["CUDA_VISIBLE_DEVICES"]= "2,3"

# 디바이스 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 출력
print('Device:', device)  # 출력결과: cuda
print('Count of using GPUs:', torch.cuda.device_count())   #출력결과: 1 (GPU #2 한개 사용하므로)
print('Current cuda device:', torch.cuda.current_device())  # 출력결과: 0 (수정 필요)

In [None]:
bnb_config = BitsAndBytesConfig(  
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= False,
)
tokenizer = AutoTokenizer.from_pretrained(new_model, padding_side="left")
model = AutoModelForCausalLM.from_pretrained(
        new_model,
        load_in_4bit=True,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,   #16비트 플로트
        device_map="auto",
        trust_remote_code=True,
    )
model.config.use_cache = True
model.eval()

In [None]:
#모델 인퍼런스 다른 방식 테스트

prompt = "방청 페인트의 종류에는 어떤 것들이 있는지 알고 계신가요? 또한, 원목사이딩을 사용하는 것에 어떤 단점이 있을까요?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=300)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

In [None]:
result

In [None]:
result[0]['generated_text'].split('[INST]')[1].split('[/INST]')[1]

In [None]:
#파이프라인 이용해 생성 및 파싱
test = pd.read_csv('./test.csv')

batch_size=8

pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, 
                temperature=0.1,
                # top_k=1,
                # top_p=0.9,
                # repetition_penalty=1.2,
                # do_sample=True,
                # num_return_sequences=1,
                max_length=300,
                batch_size=batch_size,
               )

for i in tqdm(range(0, len(test), batch_size)):
    # 현재 배치의 질문들
    batch_questions = test['질문'][i:i+batch_size].tolist()
    # 프롬프트 형식 적용
    batch_prompts = [f"<s>[INST] {q} [/INST]" for q in batch_questions]
    # 배치 인퍼런스 실행
    batch_results = pipe(batch_prompts)

    # 각 결과에 대한 후처리 및 저장
    for batch_item in batch_results:
        for result in batch_item:  # 배치 결과 내의 각 결과 아이템을 순회
            parsed_ans = result['generated_text'].split('[INST]')[1].split('[/INST]')[1]
            preds.append(parsed_ans)

# 결과 출력
for i, (question, ans) in enumerate(zip(test['질문'], preds)):
    print(f"[ {i}번 ] 질문: {question}")
    print(f"[ 답변 ] {ans}\n")

print("총 생성된 답변 개수 : ", len(preds))

In [18]:
wandb.finish()


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,▃███████▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
train/loss,█▆▅▅▄▄▃▄▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,10.0
train/global_step,8050.0
train/learning_rate,0.0
train/loss,0.098
train/total_flos,9.71820052660224e+17
train/train_loss,0.17649
train/train_runtime,69729.42
train/train_samples_per_second,0.924
train/train_steps_per_second,0.115


RepositoryNotFoundError: 404 Client Error. (Request ID: Root=1-65d07318-4a1536c5213f3dd561418848;6021ecb9-dda5-44d6-b5a3-82666eb634c5)

Repository Not Found for url: https://huggingface.co/api/models/Edentns-DataVortexS-for-RAG-10ep-32batch.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated.

In [21]:
## repo
HUGGINGFACE_AUTH_TOKEN = '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' # https://huggingface.co/settings/token
MODEL_SAVE_HUB_PATH = 'Coldbrew9/Edentns-DataVortexS-for-RAG-10ep-32batch'
## Push to huggingface-hub
trainer.model.push_to_hub(
			MODEL_SAVE_HUB_PATH, 
			use_temp_dir=True, 
			use_auth_token=HUGGINGFACE_AUTH_TOKEN
)
trainer.tokenizer.push_to_hub(
			MODEL_SAVE_HUB_PATH, 
			use_temp_dir=True, 
			use_auth_token=HUGGINGFACE_AUTH_TOKEN
)



adapter_model.safetensors:   0%|          | 0.00/69.3M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Coldbrew9/Edentns-DataVortexS-for-RAG-10ep-32batch/commit/0e6b7049fb318852118d996fd0d7e0158160a8ac', commit_message='Upload tokenizer', commit_description='', oid='0e6b7049fb318852118d996fd0d7e0158160a8ac', pr_url=None, pr_revision=None, pr_num=None)

## Submission

In [13]:
len(preds)

130

In [14]:
# Test 데이터셋의 모든 질의에 대한 답변으로부터 512 차원의 Embedding Vector 추출
# 평가를 위한 Embedding Vector 추출에 활용하는 모델은 'distiluse-base-multilingual-cased-v1' 이므로 반드시 확인해주세요.
from sentence_transformers import SentenceTransformer # SentenceTransformer Version 2.2.2

# Embedding Vector 추출에 활용할 모델(distiluse-base-multilingual-cased-v1) 불러오기
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# 생성한 모든 응답(답변)으로부터 Embedding Vector 추출
pred_embeddings = model.encode(preds)
pred_embeddings.shape

(130, 512)

In [15]:
submit = pd.read_csv('./sample_submission.csv')
# 제출 양식 파일(sample_submission.csv)을 활용하여 Embedding Vector로 변환한 결과를 삽입
submit.iloc[:,1:] = pred_embeddings
submit.head()

 -0.02813819 -0.00368087 -0.05695296 -0.01448731 -0.01499182 -0.02307036
 -0.01844569 -0.00130808  0.01417161  0.03094976  0.03849264 -0.06207465
  0.00393758 -0.01347351  0.03576537  0.00798695 -0.03056601  0.04143144
  0.00076889  0.01498904  0.04329271 -0.02675137  0.0346049  -0.00623232
  0.0353062  -0.01651229  0.0176557   0.04982275  0.01812216 -0.02415204
  0.01433525 -0.01744543  0.05964296  0.02376416 -0.01108292 -0.02718491
  0.04089959  0.02355115 -0.01252123  0.03506222 -0.01754483 -0.03482757
  0.02220426  0.02945272  0.00573382  0.02623901 -0.00509454  0.00016863
  0.05277279 -0.00069225  0.02733205  0.02318367  0.00217513  0.01637005
 -0.05604045  0.00494205  0.01714125 -0.01803621 -0.00031419  0.0081264
  0.03794401  0.03703235  0.01947733 -0.03370031 -0.0124019   0.00906325
  0.05539934 -0.00781548  0.05612678  0.02559411  0.05125496 -0.00486789
 -0.01251025  0.01531538 -0.04748904 -0.06422206  0.00557458  0.02202417
 -0.06422208 -0.01883578  0.03729887 -0.02348129  0.

Unnamed: 0,id,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,...,vec_502,vec_503,vec_504,vec_505,vec_506,vec_507,vec_508,vec_509,vec_510,vec_511
0,TEST_000,0.037593,0.034268,0.033677,-0.003111,0.073718,0.009008,0.007748,0.032731,0.03528,...,-0.010802,-0.052762,-0.018415,-0.034756,0.005905,0.011873,0.048023,0.01965,-0.023089,0.011365
1,TEST_001,-0.030944,-0.018822,0.016158,-0.006363,0.099574,-0.001955,0.003211,-0.005945,-0.016818,...,-0.03851,-0.002504,-0.014256,-0.024417,-0.008081,0.060187,-0.001913,-0.019636,0.027921,0.013305
2,TEST_002,0.014098,-0.024947,-0.019317,-0.014282,0.141321,-0.029457,0.005782,-0.009128,0.043452,...,-0.046874,-0.005726,0.037619,-0.012767,-0.022908,0.023184,-0.018577,-0.010719,-0.045098,0.060008
3,TEST_003,-0.005496,0.031917,0.003253,0.026282,0.08413,-0.075227,-0.055292,0.020802,0.003915,...,-0.03001,-0.018717,0.052951,-0.043827,0.011371,0.033698,-0.025865,-0.05017,-0.027836,0.056391
4,TEST_004,-0.00348,0.020897,-0.019335,-0.019355,0.115744,0.01563,0.06805,0.032846,-0.01547,...,-0.012612,-0.005543,0.020916,0.014543,-0.03229,-0.000766,0.003989,0.039902,0.001197,0.050854


In [None]:
# 리더보드 제출을 위한 csv파일 생성
submit.to_csv(f'./{new_model}.csv', index=False)

In [17]:
print("done")

done


In [None]:
# model = model.merge_and_unload()

# model.push_to_hub(new_model, use_temp_dir=False)
# tokenizer.push_to_hub(new_model, use_temp_dir=False)