In [None]:
import pandas as pd
from datasets import load_dataset, Dataset
import random
from litellm import completion, batch_completion
import os
import litellm

# Set the OpenAI API key (uncomment the line below and fill in your own key)
# os.environ["OPENAI_API_KEY"] = "sk-xxx..."

In [None]:
def get_random_section(long_string, length=1000):
    """Return a randomly positioned slice of ``long_string``.

    Args:
        long_string: Sequence (e.g. a string) to sample from.
        length: Size of the slice to return. Defaults to 1000.

    Returns:
        ``long_string`` itself when it holds at most ``length`` items;
        otherwise a contiguous slice of ``length`` items whose start offset
        is drawn uniformly with ``random.randint``.
    """
    # How many items the input has beyond one full window.
    slack = len(long_string) - length
    if slack <= 0:
        return long_string

    # randint is inclusive on both ends, so the window may end exactly at
    # the last item of the input.
    offset = random.randint(0, slack)
    window_end = offset + length
    return long_string[offset:window_end]

In [None]:
# Load the dataset and keep its 'train' split
ds = load_dataset('alvanlii/finance-textbooks')['train']

# Sample the dataset: for every entry in the 'book_text' column, draw two
# random sections of length up to 2048 via get_random_section
texts = [
    get_random_section(book_text, 2048)
    for book_text in ds['book_text']
    for _ in range(2)
]

In [None]:
# Format the chat prompts: one [system, user] message pair per sampled text
question_system_prompt = "Your job is creating multi-hop reasoning questions in fluent Korean. You will be given a part of a text. Make a question based on it. The question should require multiple steps of reasoning related to the text. Return the question only without any other text."
qrys = [
    [
        {"content": question_system_prompt, "role": "system"},
        {"content": passage, "role": "user"},
    ]
    for passage in texts
]

# 1. Generate questions from the raw text data
responses = batch_completion(model="gpt-4o-mini-2024-07-18", messages=qrys)

# Keep the text of the first choice of every response
resps = [reply.choices[0].message.content for reply in responses]

# Accumulate token usage so the cost can be reported later
total_prompt_tokens_for_q = sum(reply.usage.prompt_tokens for reply in responses)
total_completion_tokens_for_q = sum(reply.usage.completion_tokens for reply in responses)

# Pair each sampled text with the question generated from it
df = pd.DataFrame({'sampled_text': texts, 'question': resps})

In [None]:
# Format the prompts for answer generation: one [system, user] message pair
# per generated question
answer_system_prompt = "You are a skilled financial expert in Korea. Make a response for the question. DO NOT introduce yourself."
qrys = [
    [
        {"content": answer_system_prompt, "role": "system"},
        {"content": question, "role": "user"},
    ]
    for question in resps
]

# 2. Generate answers to the generated questions
responses = batch_completion(model="gpt-4o-mini-2024-07-18", messages=qrys)

# From here on `resps` holds the answers; the questions remain in df['question']
resps = [reply.choices[0].message.content for reply in responses]
df['response'] = resps

# Accumulate token usage so the cost can be reported later
total_prompt_tokens_for_a = sum(reply.usage.prompt_tokens for reply in responses)
total_completion_tokens_for_a = sum(reply.usage.completion_tokens for reply in responses)

In [None]:
# Combine the usage of the question-generation and answer-generation passes
prompt_tokens_total = total_prompt_tokens_for_q + total_prompt_tokens_for_a
completion_tokens_total = total_completion_tokens_for_q + total_completion_tokens_for_a

# Costs are computed at 0.150 (prompt) and 0.600 (completion) per 1,000,000
# tokens and rounded to 6 decimal places
print('total prompt tokens:', prompt_tokens_total)
print('prompt token costs: $', round(prompt_tokens_total / 1000000 * 0.150, 6))
print('total completion tokens:', completion_tokens_total)
print('completion token costs: $', round(completion_tokens_total / 1000000 * 0.600, 6))

In [None]:
# Save the results as a CSV file
df.to_csv(path_or_buf="output_path/result.csv")

# Save the results as an Excel file
df.to_excel(excel_writer="output_path/result.xlsx")

# Upload to the HuggingFace Hub - put your personal HuggingFace token in `token`.
result_df = Dataset.from_pandas(df)
result_df.push_to_hub(repo_id="hf/dataset", token="HF_TOKEN")

# Last expression of the cell: shows the first rows of the DataFrame
df.head()