In [1]:
import re
import json
import os

# System prompt attached to every training and validation example.
_SYSTEM_PROMPT = (
    "You are a bot designed to identify users’ news interests. "
    "Predict the index number of news items that best fits the position labeled [MASK]."
)

# "prefers most <title> among the following" -- group(1) is the preferred title.
_PREFERS_RE = re.compile(r"prefers most (.*?) among the following")
_MASKED_PHRASE = "prefers most [MASK] among the following"

# One candidate line of an item, e.g. "12: Some headline".
_ARTICLE_RE = re.compile(r"(\d+):\s+(.*)")

# An item starts at "<n>) User" and runs (lazily, across newlines) until the
# next item start or the end of the text. Multi-digit numbers are supported.
_NEWS_ITEM_RE = re.compile(
    r"(\d+\)\s*[\r\n]*User.*?)(?=\d+\)\s*[\r\n]*User|$)", re.DOTALL
)

# A question starts at "Question <n>)". The lookahead requires the closing
# parenthesis as well, so a headline that merely contains "Question 2024"
# no longer cuts the block short.
_QUESTION_ITEM_RE = re.compile(
    r"(Question\s+\d+\).*?)(?=Question\s+\d+\)|$)", re.DOTALL
)


def _split_item(block, prefix_pattern):
    """Split an item block into (header without its numbering prefix, remaining lines)."""
    header, *article_lines = block.split("\n")
    header = re.sub(prefix_pattern, "", header.strip())
    return header, article_lines


def _numbered_articles(lines):
    """Yield (index_string, text) for every line shaped like "<n>: <text>"."""
    for raw_line in lines:
        found = _ARTICLE_RE.match(raw_line.strip())
        if found:
            yield found.group(1), found.group(2).strip()


def _masked_prompt(header, article_lines):
    """Build the user message: header with the title replaced by [MASK], plus the candidates."""
    masked_header = _PREFERS_RE.sub(_MASKED_PHRASE, header)
    return "\n".join([masked_header] + article_lines).strip()


def _chat_record(user_content, assistant_text):
    """Wrap one example in the {"messages": [...]} chat structure written to the JSONL files."""
    return {
        "messages": [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": assistant_text},
        ]
    }


def _load_hidden_positions(path):
    """Parse lines such as "U15005: [18, 12]" into {"U15005": [18, 12]}."""
    positions = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Skip blank lines and anything that is not "<key>: <values>".
            if not line or ":" not in line:
                continue
            key_part, values_part = line.split(":", 1)
            values_part = values_part.strip().strip("[]")  # "[18, 12]" -> "18, 12"
            positions[key_part.strip()] = [
                int(token.strip()) for token in values_part.split(",") if token.strip()
            ]
    return positions


def _write_jsonl(path, records):
    """Write one JSON object per line, creating the parent directory when missing."""
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        for record in records:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")


def create_train_and_val_files(
    folder: str,
    user_file: str,                 # e.g. "U15005.txt"
    train_output_file: str,         # e.g. "U15005_train.jsonl"
    val_output_file: str,           # e.g. "U15005_val.jsonl"
    prompts_root: str = "../../prompts",
    output_root: str = "../../gpt_finetuning_data",
) -> None:
    """
    Convert one user prompt file into chat-format fine-tuning JSONL files.

    The "[News of Interest to the user]" section becomes the training file and
    the "[Questions]" section becomes the validation file. In both, the
    "prefers most <title> among the following" phrase of each item header is
    replaced by "prefers most [MASK] among the following".

    - Training label: the index of the candidate line whose text equals the
      masked title ("?" when no candidate matches).
    - Validation label: taken from metadata/hidden_positions.txt, where the
      n-th value listed for the user is the answer to the n-th question
      ("[Mask] is ?." when no answer or no matching candidate exists).

    Args:
        folder: Sub-folder of ``prompts_root`` holding the user file and the
            ``metadata/hidden_positions.txt`` file.
        user_file: Prompt file name; the text before the first "." is used as
            the user id when looking up answers.
        train_output_file: File name (under ``output_root``) for training data.
        val_output_file: File name (under ``output_root``) for validation data.
        prompts_root: Root directory of the prompt folders.
        output_root: Directory receiving the JSONL files; created if missing.

    Raises:
        ValueError: If either section marker is missing from the user file, or
            a hidden-positions value is not an integer.
        OSError: If an input file cannot be read or an output cannot be written.
    """
    # 1) Resolve paths.
    user_file_path = f'{prompts_root}/{folder}/{user_file}'
    hidden_positions_file = f'{prompts_root}/{folder}/metadata/hidden_positions.txt'
    train_output_path = f'{output_root}/{train_output_file}'
    val_output_path = f'{output_root}/{val_output_file}'

    # 2) Read the prompt and separate the two sections.
    with open(user_file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    news_block_match = re.search(
        r"\[News of Interest to the user\](.*?)\[Questions\]", content, flags=re.DOTALL
    )
    questions_block_match = re.search(r"\[Questions\](.*)", content, flags=re.DOTALL)
    if not news_block_match or not questions_block_match:
        raise ValueError("텍스트에서 [News of Interest to the user] 또는 [Questions] 블록을 찾을 수 없습니다.")

    news_block = news_block_match.group(1).strip()
    questions_block = questions_block_match.group(1).strip()

    # 3) Split the news section into per-item blocks.
    news_item_blocks = _NEWS_ITEM_RE.findall(news_block)

    # 4) Drop the "Based on User ..." instruction lines, then split questions.
    questions_block_cleaned = "".join(
        line
        for line in questions_block.splitlines(keepends=True)
        if "Based on User" not in line
    ).strip()
    question_item_blocks = _QUESTION_ITEM_RE.findall(questions_block_cleaned)

    # 5) Look up this user's answers; "U15005.txt" -> "U15005".
    user_id = os.path.basename(user_file_path).split(".")[0]
    question_answers = _load_hidden_positions(hidden_positions_file).get(user_id, [])

    # 6) News items -> training examples.
    train_records = []
    for item_block in news_item_blocks:
        header, article_lines = _split_item(item_block, r"^\d+\)\s*")

        title_match = _PREFERS_RE.search(header)
        title = title_match.group(1).strip() if title_match else "[missing article]"

        # The label is the index of the first candidate whose text is the title.
        label = "?"
        for index_text, article_text in _numbered_articles(article_lines):
            if article_text == title:
                label = index_text
                break

        train_records.append(
            _chat_record(
                _masked_prompt(header, article_lines),
                f"[Mask] is {label}. {title}",
            )
        )

    # 7) Question items -> validation examples.
    val_records = []
    for position, q_block in enumerate(question_item_blocks):
        header, article_lines = _split_item(q_block, r"^Question\s+\d+\)\s*")

        # Answers are positional: the n-th value belongs to the n-th question.
        answer = question_answers[position] if position < len(question_answers) else -1

        chosen_text = ""
        for index_text, article_text in _numbered_articles(article_lines):
            if int(index_text) == answer:
                chosen_text = article_text
                break

        if answer == -1 or not chosen_text:
            assistant_text = "[Mask] is ?."
        else:
            assistant_text = f"[Mask] is {answer}. {chosen_text}"

        val_records.append(
            _chat_record(_masked_prompt(header, article_lines), assistant_text)
        )

    # 8) Persist both files as JSONL.
    _write_jsonl(train_output_path, train_records)
    _write_jsonl(val_output_path, val_records)

In [2]:
# Build the train/validation JSONL pair from the U15057 prompt file.
create_train_and_val_files(
    "[fine2] 15001~15500/with_negative",  # folder
    "U15057.txt",                         # user_file
    "U15057_train.jsonl",                 # train_output_file
    "U15057_val.jsonl",                   # val_output_file
)
