In [None]:
from dotenv import load_dotenv
from huggingface_hub import login
import os

# Pull variables from a local .env file into the process environment, then
# authenticate against the HuggingFace Hub with the token found there.
# `hftoken` stays in the notebook namespace because the upload cell reuses it.
load_dotenv()
hftoken = os.environ.get("HUGGINGFACE_TOKEN")
login(token=hftoken)

In [None]:
import pandas as pd
from datasets import load_dataset, Dataset
import random
from litellm import completion, batch_completion
import os
import litellm

# OpenAI API key 선언
# os.environ["OPENAI_API_KEY"] = "sk-xxx..."

In [None]:
import pandas as pd
import time
from tqdm import tqdm
from datasets import load_dataset

# Load the source dataset and mirror it into a DataFrame so that batches can be
# sliced positionally with .iloc.
ds = load_dataset('amphora/rewrite-se-quant')['train']
df_ds = pd.DataFrame(ds)

# Model and prompts shared by every batch.
MODEL_NAME = "gpt-4o-mini-2024-07-18"
QUESTION_SYSTEM_PROMPT = "Your job is creating quantitative finance questions in fluent Korean. You will be given a English QF question collected from the web. Restructure it to a test-like question, in formal Korean language. Return the question only."
ANSWER_SYSTEM_PROMPT = "You are a skilled financial expert in Korea. Make a response for the question. DO NOT introduce yourself."

# USD per one million tokens, used only for the cost estimate printed per batch.
PROMPT_COST_PER_M = 0.150
COMPLETION_COST_PER_M = 0.600

batch_size = 200
total_length = len(ds['query'])


def _build_messages(system_prompt, user_texts):
    """Return one [system, user] chat message list per entry of `user_texts`."""
    return [
        [
            {"content": system_prompt, "role": "system"},
            {"content": text, "role": "user"},
        ]
        for text in user_texts
    ]


def _extract_contents(responses):
    """Convert `batch_completion` results into texts plus token usage.

    Args:
        responses: the list returned by `batch_completion`; an entry is either
            a response object exposing `choices`/`usage` or an exception.

    Returns:
        (texts, prompt_tokens, completion_tokens). `texts` always has exactly
        one entry per response -- failures become an "Error: ..." placeholder --
        so it stays aligned with the inputs that produced it.
    """
    texts = []
    prompt_tokens = 0
    completion_tokens = 0
    for resp in responses:
        if isinstance(resp, Exception):
            # Keep a placeholder instead of dropping the item; dropping it
            # would shift every later answer onto the wrong question.
            print(f"오류 발생: {str(resp)}")
            texts.append("Error: Rate limit exceeded or other issue")
        elif hasattr(resp, 'choices') and len(resp.choices) > 0:
            texts.append(resp.choices[0].message.content)
            usage = getattr(resp, 'usage', None)
            prompt_tokens += getattr(usage, 'prompt_tokens', 0) or 0
            completion_tokens += getattr(usage, 'completion_tokens', 0) or 0
        else:
            texts.append("Error: Unexpected response format")
    return texts, prompt_tokens, completion_tokens


# Running token totals, split by question (q) and answer (a) generation.
total_prompt_tokens_for_q = 0
total_prompt_tokens_for_a = 0
total_completion_tokens_for_q = 0
total_completion_tokens_for_a = 0

# Per-batch frames; concatenated once at the end instead of on every iteration.
batch_frames = []

try:
    for i in tqdm(range(0, total_length, batch_size), desc="Processing Batches", unit="batch"):
        batch_end = min(i + batch_size, total_length)  # clamp to the dataset size
        query_list = df_ds['query'].iloc[i:batch_end].tolist()

        # Step 1: rewrite each source query into a Korean exam-style question.
        responses = batch_completion(
            model=MODEL_NAME,
            messages=_build_messages(QUESTION_SYSTEM_PROMPT, query_list),
        )
        question_resps, prompt_used, completion_used = _extract_contents(responses)
        total_prompt_tokens_for_q += prompt_used
        total_completion_tokens_for_q += completion_used

        # Pause between the question and answer requests.
        print(i, '10초 대기')
        time.sleep(10)
        print('대기 종료')

        # Step 2: generate an answer for every question of this batch.
        responses = batch_completion(
            model=MODEL_NAME,
            messages=_build_messages(ANSWER_SYSTEM_PROMPT, question_resps),
        )
        response_resps, prompt_used, completion_used = _extract_contents(responses)
        total_prompt_tokens_for_a += prompt_used
        total_completion_tokens_for_a += completion_used

        print(i, '5초 대기')
        time.sleep(5)
        print('대기 종료')

        # All three lists have one entry per query, so the columns line up.
        batch_frames.append(pd.DataFrame({
            'query': query_list,
            'question': question_resps,
            'response': response_resps
        }))

        print('total prompt tokens:', total_prompt_tokens_for_q + total_prompt_tokens_for_a)
        print('prompt token costs:', round((total_prompt_tokens_for_q + total_prompt_tokens_for_a) / 1000000 * PROMPT_COST_PER_M, 6))
        print('total completion tokens:', total_completion_tokens_for_q + total_completion_tokens_for_a)
        print('completion token costs:', round((total_completion_tokens_for_q + total_completion_tokens_for_a) / 1000000 * COMPLETION_COST_PER_M, 6))
        print('total completion tokens_q:', total_completion_tokens_for_q)
        print('total completion tokens_a:', total_completion_tokens_for_a)

        # Sanity check against THIS batch's size (not the whole dataset).
        expected_length = len(query_list)
        question_length = len(question_resps)
        response_length = len(response_resps)

        print(f"Original length: {expected_length}")
        print(f"Generated question length: {question_length}")
        print(f"Generated response length: {response_length}")

        if question_length < expected_length:
            print(f"Missing questions: {expected_length - question_length}")

        if response_length < expected_length:
            print(f"Missing responses: {expected_length - response_length}")

        # Wait one minute before the next batch; pointless after the last one.
        if batch_end < total_length:
            print(i, '1분 대기')
            time.sleep(60)
            print('대기 종료')
finally:
    # Build `df` even if the loop is interrupted, so finished batches are kept.
    if batch_frames:
        df = pd.concat(batch_frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=['query', 'question', 'response'])

# Preview the final result
df.head()

In [None]:
# Report how many rows of the generated frame are exact duplicates of an
# earlier row.
duplicate_row_count = df.duplicated().sum()
print(f"중복된 행의 개수: {duplicate_row_count}")

In [None]:
# Pad both result lists in place with placeholder strings until each is as
# long as the dataset's `query` column; lists that are already long enough are
# left untouched.
expected_total = len(ds["query"])

missing_questions = expected_total - len(question_resps)
if missing_questions > 0:
    question_resps += ["Error: Missing question"] * missing_questions

missing_responses = expected_total - len(response_resps)
if missing_responses > 0:
    response_resps += ["Error: Missing response"] * missing_responses


In [None]:
# Optional: save the result as an Excel file
# df.to_excel("output_path/result.xlsx")

# Upload to the HuggingFace Hub -- `token` must be your personal HuggingFace
# access token.
hub_repo_id = "LDC-ai/dataset"
result_df = Dataset.from_pandas(df)
result_df.push_to_hub(hub_repo_id, token=hftoken)