In [1]:
from dotenv import load_dotenv
from huggingface_hub import login
import os

# Pull variables defined in a local .env file into the process environment.
load_dotenv()

# Read the Hugging Face access token from the environment and authenticate with the Hub.
hftoken = os.environ.get("HUGGINGFACE_TOKEN")
login(token=hftoken)

In [2]:
import pandas as pd
from datasets import load_dataset, Dataset
import random
from litellm import completion, batch_completion
import os
import litellm

# Set the OpenAI API key here if it is not already provided through the environment (e.g. the .env file loaded above).
# os.environ["OPENAI_API_KEY"] = "sk-xxx..."

In [3]:
def get_random_section(long_string, length=1000, rng=None):
    """Return a random contiguous slice of ``long_string`` of at most ``length`` characters.

    Args:
        long_string: Text to sample from.
        length: Maximum number of characters to return. Must be non-negative.
        rng: Optional object exposing ``randint(a, b)`` (for example a seeded
            ``random.Random``). When omitted, the module-level ``random``
            generator is used, exactly as before. Pass a seeded instance to
            make the sampling reproducible.

    Returns:
        ``long_string`` unchanged when it is no longer than ``length``;
        otherwise a substring of exactly ``length`` characters starting at a
        randomly chosen offset.

    Raises:
        ValueError: If ``length`` is negative. A negative length would
            otherwise yield a meaningless slice instead of an error.
    """
    if length < 0:
        raise ValueError(f"length must be non-negative, got {length}")

    if len(long_string) <= length:
        return long_string

    source = random if rng is None else rng
    # randint is inclusive on both ends, so the last valid start offset is
    # len(long_string) - length.
    start_index = source.randint(0, len(long_string) - length)

    return long_string[start_index:start_index + length]

In [4]:
# Load the dataset.
ds = load_dataset('alvanlii/finance-textbooks')['train']

# Sampling parameters, named so they can be tuned in one place.
SEED = 42
SAMPLES_PER_BOOK = 2
SECTION_LENGTH = 2048

# Seed the global RNG so that Restart & Run All draws the same sections every time.
random.seed(SEED)

# Sample the dataset: draw SAMPLES_PER_BOOK random sections of up to
# SECTION_LENGTH characters from each book via get_random_section.
texts = [
    get_random_section(bt, SECTION_LENGTH)
    for bt in ds['book_text']
    for _ in range(SAMPLES_PER_BOOK)
]

README.md:   0%|          | 0.00/318 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/7.00M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/60 [00:00<?, ? examples/s]

In [5]:
# Preview only the first few samples; displaying the whole list floods the notebook output.
texts[:3]

['asized the need for financial services organizations to develop and implement policies and procedures to ensure that AI systems are fair and unbiased.\n10. **Customer Protection**: In 2019, the Consumer Financial Protection Bureau issued guidance on the use of AI in finance. The guidance emphasized the importance of developing AI systems that are transparent, explainable, and fair. It also emphasized the need for financial services organizations to develop and implement policies and procedures to ensure that AI systems are fair and unbiased.\n1. **Fairness**: AI systems must be designed to recognize and correct biases to ensure fair outcomes.\n2. **Transparency**: AI systems must be designed to provide clear explanations for their decisions to ensure accountability.\n3. **Accountability**: AI systems must be designed to identify responsibility when AI systems make wrong decisions.\n4. **Data Quality**: AI systems must be trained on high-quality data that is complete, consistent, and 

In [6]:
# Chat prompt formatting: one [system, user] message pair per sampled passage.
question_system_prompt = "Your job is creating multi-hop reasoning questions in fluent Korean. You will be given a part of a text. Make a question based on it. The question should require multiple steps of reasoning related to the text. Return the question only without any other text."
qrys = [
    [
        {"content": question_system_prompt, "role": "system"},
        {"content": passage, "role": "user"},
    ]
    for passage in texts
]

# 1. Generate questions from the raw text data.
responses = batch_completion(model="gpt-4o-mini-2024-07-18", messages=qrys)

# Extract the generated question text from each response.
resps = [reply.choices[0].message.content for reply in responses]

# Token usage of this pass, kept for the cost report.
total_prompt_tokens_for_q = sum(reply.usage.prompt_tokens for reply in responses)
total_completion_tokens_for_q = sum(reply.usage.completion_tokens for reply in responses)

# Pair every sampled passage with the question generated from it.
df = pd.DataFrame({'sampled_text': texts, 'question': resps})

  Expected `CompletionTokensDetails` but got `dict` with value `{'audio_tokens': None, 'reasoning_tokens': 0}` - serialized value may not be as expected
  Expected `PromptTokensDetails` but got `dict` with value `{'audio_tokens': None, 'cached_tokens': 0}` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


In [7]:
# Format the prompts used for answer generation.
# The questions are read from df['question'] rather than from `resps`: this
# cell rebinds `resps` to the answers further down, so iterating `resps` here
# would make a second run of the cell feed the answers back in as questions.
qrys = []
for question in df['question']:
    messages = [
    {"content":"You are a skilled financial expert in Korea. Make a response for the question. DO NOT introduce yourself.","role":"system"},
    { "content": question,"role": "user"}]
    qrys.append(messages)

# 2. Generate an answer for each generated question.
responses = batch_completion(
    model="gpt-4o-mini-2024-07-18",
    messages = qrys
)
resps = [i.choices[0].message.content for i in responses]
df['response'] = resps

# Token usage of this pass, kept for the cost report.
total_prompt_tokens_for_a = sum(r.usage.prompt_tokens for r in responses)
total_completion_tokens_for_a = sum(r.usage.completion_tokens for r in responses)

In [8]:
# Combine token usage from the question pass and the answer pass.
prompt_token_total = total_prompt_tokens_for_q + total_prompt_tokens_for_a
completion_token_total = total_completion_tokens_for_q + total_completion_tokens_for_a

# Prices in USD per one million tokens used for this estimate.
prompt_usd_per_million = 0.150
completion_usd_per_million = 0.600

print('total prompt tokens:', prompt_token_total)
print('prompt token costs: $', round(prompt_token_total / 1000000 * prompt_usd_per_million, 6))
print('total completion tokens:', completion_token_total)
print('completion token costs: $', round(completion_token_total / 1000000 * completion_usd_per_million, 6))

total prompt tokens: 63588
prompt token costs: $ 0.009538
total completion tokens: 63716
completion token costs: $ 0.03823


In [9]:
# Upload to the Hugging Face Hub; `token` takes your personal Hugging Face access token.
result_df = Dataset.from_pandas(df)
result_df.push_to_hub(
    repo_id="llmflow/raw_sample_dataset",
    token=hftoken,
)

# Show the first rows of the generated question/answer table.
df.head(5)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Unnamed: 0,sampled_text,question,response
0,asized the need for financial services organiz...,금융 서비스 조직이 AI 시스템의 공정성과 투명성을 보장하기 위해 마련해야 할 정책...,AI 시스템의 공정성과 투명성을 보장하기 위한 정책 및 절차는 여러 가지 요소를 포...
1,"ions, including:\n• **Overfitting**: AI algori...","AI 알고리즘이 자산 배분에 사용될 때, 과적합이나 불투명성 등의 문제를 해결하기 ...",AI 알고리즘이 자산 배분에 사용될 때 과적합이나 불투명성 문제를 해결하기 위한 몇...
2,bal Imbalances**: Large trade deficits in the ...,"2008년 글로벌 금융 위기의 주요 원인 중 하나로, 외국 중앙은행들이 미국의 대규...",2008년 글로벌 금융 위기의 주요 원인 중 하나는 외국 중앙은행들이 미국의 대규모...
3,ribute to financial instability if they engage...,다른 금융 중개 기관들이 금융 불안정성을 초래할 수 있는 여러 방식이 있다. 이러한...,금융 중개 기관들이 자산 가치를 효과적으로 관리하지 못하거나 리스크 관리에 실패하게...
4,y 3: The Consultant's Conundrum\nMichael is a ...,마이클이 그의 연구 보고서의 진실성을 유지하는 것과 은행의 투자 매니저와의 관계를 ...,마이클이 직면하고 있는 상황에서 고려해야 할 주요 윤리적 원칙은 다음과 같습니다.\...
