<a href="https://colab.research.google.com/github/HueyVault/study_NLPs/blob/main/codes/DeepLearnings/11_04_generativeLLM_HF_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Set up the data and the fine-tuned model (commands below are left commented out)
# !ls -al
# !unzip ./chapter_6_withvLLMme_finetuned_model.zip
# !unzip ./chapter_6_withvLLMme_preprocess.zip

In [2]:
import os


def _running_in_colab():
    """Return True when the active IPython shell is a Google Colab shell.

    ``get_ipython`` only exists inside an IPython/Jupyter session, so the
    lookup is guarded: when this code runs as a plain Python script the
    name is missing and the function reports False instead of raising
    ``NameError``.
    """
    try:
        shell = get_ipython()
    except NameError:
        return False
    return 'google.colab' in str(shell)


# Configure API credentials and dataset paths for the current runtime.
if os.getenv('KAGGLE_KERNEL_RUN_TYPE') is not None:
    # Kaggle: read the tokens from Kaggle's secret store.
    print("kaggle")
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    os.environ['HF_TOKEN'] = user_secrets.get_secret("HF_TOKEN")
    os.environ['OPENAI_API_KEY'] = user_secrets.get_secret("OPENAI_API_KEY")

    # Dataset paths inside the Kaggle input directory.
    train_file_path = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
    test_file_path = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'

elif _running_in_colab():
    # Colab: read the tokens from the notebook's user secrets.
    print("colab")
    from google.colab import userdata
    os.environ['HF_TOKEN'] = userdata.get("HF_TOKEN")
    os.environ['OPENAI_API_KEY'] = userdata.get("OPENAI_API_KEY")

    # Define the paths here too so every branch leaves the same names bound.
    # 'data/train.csv' matches the path the data-loading cell uses.
    # NOTE(review): the test path simply mirrors the train path - confirm.
    train_file_path = 'data/train.csv'
    test_file_path = 'data/test.csv'

else:
    # Local / Docker environment: no secrets are configured here.
    print("local")
    train_file_path = "../../datasets/train.csv"
    test_file_path = "../../datasets/test.csv"


colab


## 자연어 평가
- EM (Exact Match) : 텍스트 매칭(정규식, 의미, 통계 매칭)
- EX (Execution Accuracy) : 개발 환경에서 실행
- GPT-4 활용


In [3]:
!pip install transformers==4.40.1 bitsandbytes==0.43.1 accelerate==0.29.3 datasets==2.19.0 tiktoken==0.6.0 -qqq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m67.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.0/172.0 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import os

import pandas as pd

# Relative location of the training CSV. This assignment overrides any
# train_file_path bound by an earlier cell.
train_file_path = 'data/train.csv'

# Fail fast with the resolved location: a relative path depends on the
# kernel's working directory, which is easy to get wrong across environments.
if not os.path.isfile(train_file_path):
    raise FileNotFoundError(
        f"Training CSV not found at {os.path.abspath(train_file_path)!r} "
        f"(working directory: {os.getcwd()!r}). "
        "Place the dataset at 'data/train.csv' before running this cell."
    )

df_text2sql = pd.read_csv(train_file_path)
df_text2sql.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data/train.csv'

In [None]:
# Show the 'text' value of the row whose index label is 1.
df_text2sql.loc[1, 'text']

## fine tuning 된 모델 이용한 응답 만들기

In [None]:

import torch
from transformers import BitsAndBytesConfig, pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
def make_inference_pipeline(model_id, quantization_config=None):
    """Build a Hugging Face text-generation pipeline for ``model_id``.

    Args:
        model_id: Identifier handed to ``from_pretrained`` for both the
            tokenizer and the causal language model.
        quantization_config: Optional quantization settings (for example a
            ``BitsAndBytesConfig``) forwarded to the model loader. When it
            is ``None`` (the default) the argument is not passed at all, so
            the model is loaded exactly as it is without quantization.

    Returns:
        The object returned by ``pipeline("text-generation", ...)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # The original author's note says quantization was removed because of a
    # version-compatibility problem, so it is opt-in rather than always on.
    model_kwargs = {'device_map': 'auto'}
    if quantization_config is not None:
        model_kwargs['quantization_config'] = quantization_config
    model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)

    # Wrap model and tokenizer in a ready-to-call generation pipeline.
    return pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
# Model id of the fine-tuned text-to-SQL checkpoint used for inference.
finetuning_model_id = 'otter35/yi-ko-6b-text2sql'
hf_pipe = make_inference_pipeline(model_id=finetuning_model_id)

In [None]:
def make_prompt(ddl, question, query=''):
    """Assemble the text-to-SQL prompt for one example.

    The prompt is a fixed instruction line followed by three labelled
    sections (DDL, Question, SQL), each on its own line.

    Args:
        ddl: Table definition text placed under ``### DDL:``.
        question: Natural-language question placed under ``### Question:``.
        query: SQL placed under ``### SQL:``; empty by default, so the
            prompt ends right after the ``### SQL:`` line break.

    Returns:
        The complete prompt as a single string.
    """
    sections = (
        "당신은 SQL을 생성하는 SQL 봇입니다. DDL의 테이블을 활용한 Question을 해결할 수 있는 SQL 쿼리를 생성하세요.",
        "### DDL:",
        f"{ddl}",
        "### Question:",
        f"{question}",
        "### SQL:",
        f"{query}",
    )
    return "\n".join(sections)

In [None]:
# Build the prompt for every row from its 'context' and 'question' columns.
from tqdm import tqdm

# One column assignment instead of writing cell by cell inside iterrows():
# each row gets its own prompt even if index labels repeat, and the 'prompt'
# column is created even when the frame is empty.
# No query is passed to make_prompt, so every prompt stops after "### SQL:".
df_text2sql['prompt'] = [
    make_prompt(ddl, question)
    for ddl, question in tqdm(
        zip(df_text2sql['context'], df_text2sql['question']),
        total=len(df_text2sql),
        desc='Generating prompt',
    )
]
df_text2sql.head()

### 파인튜닝된 모델에 prompt로 질문과 답 얻기

In [None]:
# Take the prompt of the row whose index label is 1 as the sample input.
example = df_text2sql.loc[1, 'prompt']

In [None]:
# Run the pipeline on the sample prompt and display what it returns.
# NOTE(review): per the Transformers docs, do_sample=False disables sampling,
# return_full_text=False omits the prompt from the returned text, and
# max_length bounds prompt plus generated tokens - confirm for the pinned version.
results = hf_pipe(
    example,
    do_sample=False,
    return_full_text=False,
    max_length=512,
    truncation=True,
)
results