In [4]:
import os
import dotenv
from openai import OpenAI
import tiktoken
import textwrap

In [5]:
# Load variables from a local .env file (if present) into the environment,
# then build the shared OpenAI client from the key stored under GPT_API_KEY.
dotenv.load_dotenv()
password = os.environ.get('GPT_API_KEY')
# NOTE: do not print or display `password`; notebook outputs are saved with the file.
client = OpenAI(api_key=password)

In [6]:
def split_text_by_token(text, max_tokens=1500, encoding=None):
    """Split text into sentence-aligned chunks of roughly ``max_tokens`` tokens.

    The text is cut at every ". " and whole sentences are packed greedily into
    chunks; a new chunk starts when adding the next sentence would push the
    running token count past ``max_tokens``.

    Args:
        text: Text to split.
        max_tokens: Token budget per chunk. Only the sentences themselves are
            counted, not the single spaces that join them, so treat the
            budget as approximate.
        encoding: Optional object with an ``encode(str)`` method returning a
            sequence of tokens. Defaults to the tiktoken encoding for
            "gpt-3.5-turbo".

    Returns:
        List of non-empty chunk strings in their original order. Empty or
        whitespace-only input gives ``[]``. A single sentence longer than
        ``max_tokens`` is kept whole as its own chunk rather than being cut
        mid-sentence.
    """
    if encoding is None:
        encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

    pieces = text.split(". ")
    # split() consumed the ". " after every piece except the last one, so only
    # those pieces get their period back; the final piece is kept as written
    # (no doubled or invented trailing period).
    sentences = [piece + "." for piece in pieces[:-1] if piece.strip()]
    sentences.extend(pieces[-1:])
    sentences = [s.strip() for s in sentences if s.strip()]

    chunks = []
    current = []
    current_tokens = 0

    for sentence in sentences:
        tokens = len(encoding.encode(sentence))
        # Flush only when something is buffered, so an oversized first
        # sentence never produces an empty chunk.
        if current and current_tokens + tokens > max_tokens:
            chunks.append(" ".join(current))
            current = []
            current_tokens = 0
        current.append(sentence)
        current_tokens += tokens

    if current:
        chunks.append(" ".join(current))

    return chunks

In [7]:
def convert_to_formal(text_chunk, model="gpt-3.5-turbo", max_tokens=1024):
    """Rewrite one chunk of colloquial Korean text in formal written style.

    Sends the chunk to the chat-completions endpoint through the module-level
    ``client`` and returns the reply with surrounding whitespace removed.

    Args:
        text_chunk: Colloquial source text to convert.
        model: Chat model name passed to the API.
        max_tokens: Upper bound on the number of tokens the model may generate.

    Returns:
        The converted text as a string.

    Raises:
        ValueError: If the reply carries no text content.
        Exception: Anything raised by the OpenAI client is propagated unchanged.

    Warns:
        RuntimeWarning: If generation stopped because the ``max_tokens`` limit
            was reached, which means the returned text is probably cut short.
    """
    import warnings

    # The prompt is runtime text sent to the model; it is kept verbatim.
    prompt = f"""
다음 구어체 텍스트를 자연스럽고 일관된 문어체로 바꿔주세요. 단어 선택, 문장 구조, 연결성을 고려해 정제된 문장으로 바꿔주세요.
'어','음' 같은 구어체 표현을 제거하고, 어색한 단어는 유추해서 변경해주세요.

구어체:
{text_chunk}

문어체:
"""

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "너는 한국어 문장을 구어체에서 문어체로 바꿔주는 도우미야."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.4,
        max_tokens=max_tokens
    )

    choice = response.choices[0]
    content = choice.message.content
    if content is None:
        # Fail with a clear message instead of an AttributeError on .strip().
        raise ValueError("model returned no text content")

    if choice.finish_reason == "length":
        # The reply was cut at the token cap; surface it rather than silently
        # writing a truncated section to the output.
        warnings.warn(
            "convert_to_formal: output hit the max_tokens limit and is likely truncated",
            RuntimeWarning,
            stacklevel=2,
        )

    return content.strip()

In [8]:
def process_travel_text(input_path="travel_test.txt",
                        output_path="travel_test_formal.txt",
                        max_tokens=1500):
    """Convert a colloquial text file to formal style and write the result.

    Reads ``input_path`` as UTF-8, splits it with ``split_text_by_token``,
    converts each chunk with ``convert_to_formal``, and writes the converted
    chunks to ``output_path`` separated by blank lines. Progress and errors
    are printed.

    Args:
        input_path: Path of the UTF-8 source text file.
        output_path: Path of the UTF-8 file to write (overwritten if present).
        max_tokens: Per-chunk token budget passed to ``split_text_by_token``.

    Returns:
        None.

    Raises:
        OSError: If the input file cannot be read or the output file cannot be
            written. Conversion errors are not raised (see below).
    """
    with open(input_path, "r", encoding="utf-8") as f:
        raw_text = f.read()

    # NOTE(review): the default chunk budget (1500) is larger than the
    # 1024-token output cap convert_to_formal uses by default, so long chunks
    # may come back truncated - confirm whether a smaller budget is intended.
    chunks = split_text_by_token(raw_text, max_tokens=max_tokens)
    total = len(chunks)

    formal_results = []
    for index, chunk in enumerate(chunks, start=1):
        print(f"[{index}/{total}] 변환 중...")
        try:
            formal_results.append(convert_to_formal(chunk))
        except Exception as e:
            # Deliberate best-effort: one failed chunk must not abort the run;
            # a placeholder marks the gap in the output.
            print(f"에러 발생 (chunk {index}): {e}")
            formal_results.append("[변환 실패한 구간]")

    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(formal_results))

In [9]:
# Entry point: run the full conversion with default paths when this code is
# executed as the main module.
if __name__ == "__main__":
    process_travel_text()

[1/2] 변환 중...
[2/2] 변환 중...
