## 라이브러리 & Instruction 정의

In [23]:
import os
import re
import random
import json

# System prompt sent with every fine-tuning example. It is assembled line by
# line so the leading/trailing newlines and blank separator lines are explicit.
instruction_demo_negative = (
    "\n"
    "You are a bot designed to identify users' news interests based on their [News of Interest to the user] and predict the index number of news items in [Questions] that best fit the position labeled [MASK].\n"
    "\n"
    "Each news article contains only a title and category written in Norwegian.\n"
    "\n"
    "There can be multiple lists in [News of Interest to the user], each with five news items.\n"
    "Among the five news in each list, there is one news that the user is most interested in.\n"
    "\n"
    "[Questions] can have multiple questions, each of which must be answered.\n"
)

## 함수 정의

In [26]:
def _load_hidden_positions(positions_file):
    """Parse ``hidden_positions.txt`` into ``{user_id: [position, ...]}``.

    Lines are expected to look like ``U15001: [3, 7, 12]``; blank lines and
    lines that do not match that shape are ignored. If the file does not
    exist, a notice is printed and an empty dict is returned.

    Raises:
        ValueError: if a bracketed entry is not an integer.
    """
    line_pattern = re.compile(r'(U\d+)\s*:\s*\[([^\]]+)\]')
    hidden_positions = {}
    try:
        with open(positions_file, 'r', encoding='utf-8') as f:
            for line in f:
                match = line_pattern.match(line.strip())
                if match:
                    hidden_positions[match.group(1)] = [
                        int(pos.strip()) for pos in match.group(2).split(',')
                    ]
    except FileNotFoundError:
        print(f"주의: {positions_file} 파일을 찾을 수 없습니다. hidden_positions는 빈 dict로 처리됩니다.")
    return hidden_positions


def _build_answer_lines(user_id, matches, positions, random_ranking):
    """Return one ``"Question N: ..."`` answer line per matched question.

    The i-th matched question uses ``positions[i]`` as its answer. When no
    stored position exists for a question, a random index in
    ``1..total_articles`` is used and a warning is printed, because that
    label is fabricated rather than ground truth.
    """
    answer_lines = []
    for idx, match_obj in enumerate(matches):
        q_num = int(match_obj.group(1))           # question number
        total_articles = int(match_obj.group(3))  # "following 21 articles" -> 21

        if idx < len(positions):
            correct_position = positions[idx]
        else:
            # Keep the historical random fallback, but make it visible so
            # fabricated labels do not slip into the training data unnoticed.
            print(f"주의: {user_id}의 Question {q_num} 정답이 hidden_positions에 없어 무작위 번호로 대체합니다.")
            correct_position = random.randint(1, total_articles)

        ranking = [correct_position]
        if random_ranking:
            # Pad the answer with up to four distinct distractor indices.
            available_nums = [
                n for n in range(1, total_articles + 1) if n != correct_position
            ]
            if len(available_nums) >= 4:
                ranking.extend(random.sample(available_nums, 4))
            else:
                # Too few articles to draw four distractors: use what exists.
                ranking.extend(available_nums)

        # Joining the whole ranking at once avoids a dangling ", " when there
        # are no distractors at all.
        answer_lines.append(f"Question {q_num}: " + ", ".join(map(str, ranking)))
    return answer_lines


def create_json(purpose, u_numbers, file_name, target_folder, random_ranking=True):
    """Build a chat-format JSONL fine-tuning file from per-user prompt files.

    For each number in ``u_numbers`` the file
    ``../../prompts/<target_folder>/<purpose>/U<number>.txt`` is read and used
    verbatim as the user message (the [News of Interest to the user] section
    is not modified). Headers of the form
    ``Question N) User #xxxxx prefers most [MASK] among the following M articles:``
    are located in the prompt and one answer line is produced per header.
    Answers come from ``metadata/hidden_positions.txt`` in the same folder.

    Args:
        purpose: Sub-folder of the prompt folder holding the ``U*.txt`` files.
        u_numbers: Iterable of user numbers; each maps to ``U<number>.txt``.
        file_name: Name of the JSONL file written to
            ``../../gpt_finetuning_data``.
        target_folder: Folder name under ``../../prompts``.
        random_ranking: If True, each answer is the correct index followed by
            up to four random other indices; if False, only the correct index.

    Users whose file is missing, or whose file contains no matching question
    header, are skipped with a printed message. Returns nothing; the result
    is the file written to disk.

    Raises:
        ValueError: if ``hidden_positions.txt`` contains a non-integer entry.
    """
    target_folder = f'../../prompts/{target_folder}'
    data_dir = f'{target_folder}/{purpose}'
    positions_file = os.path.join(data_dir, 'metadata', 'hidden_positions.txt')

    # System message shared by every example.
    instruction = instruction_demo_negative

    hidden_positions = _load_hidden_positions(positions_file)

    # Matches each question header inside [Questions], e.g.
    #   Question 1) User #15001 prefers most [MASK] among the following 21 articles:
    # A prompt may hold several question blocks, so DOTALL is used and each
    # block is terminated by a lookahead for the next header, the closing
    # instruction sentence, or the end of the text.
    question_pattern = re.compile(
        r'Question\s+(\d+)\)\s+User\s+#U?(\d+)\s+prefers\s+most\s+\[MASK\]\s+'
        r'among\s+the\s+following\s+(\d+)\s+articles:\s*'
        r'(.*?)'  # body of the block (the article list)
        r'(?=Question\s+\d+\)|Please provide just the answers|$)', 
        re.DOTALL
    )

    data = []  # entries that will become JSONL lines

    for u_num in u_numbers:
        user_id = f'U{u_num}'
        user_file_path = os.path.join(data_dir, f'{user_id}.txt')

        if not os.path.exists(user_file_path):
            print(f"파일 없음: {user_file_path}")
            continue

        with open(user_file_path, 'r', encoding='utf-8') as f:
            prompt = f.read()

        matches = list(question_pattern.finditer(prompt))
        if not matches:
            # No question block found in this prompt: nothing to answer.
            print(f"매칭된 Question 없음: {user_file_path}")
            continue

        # Users absent from hidden_positions.txt get an empty answer list.
        positions = hidden_positions.get(user_id, [])
        assistant_content = '\n'.join(
            _build_answer_lines(user_id, matches, positions, random_ranking)
        )

        data.append({
            "messages": [
                {"role": "system", "content": instruction},
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": assistant_content}
            ]
        })

    # Write the collected entries, one JSON object per line.
    output_dir = "../../gpt_finetuning_data"
    os.makedirs(output_dir, exist_ok=True)

    output_file = os.path.join(output_dir, file_name)
    with open(output_file, 'w', encoding='utf-8') as f:
        for data_entry in data:
            f.write(json.dumps(data_entry, ensure_ascii=False) + '\n')

    print(f"처리된 데이터가 {output_file}으로 저장되었습니다.")

## 실행

In [11]:
# train_numbers = [i for i in range(15001,15056)]
# val_numbers = [i for i in range(15056,15071)]

# create_json(purpose='with_negative', 
#             u_numbers=train_numbers, 
#             file_name="[241216] train_negative_demo.jsonl", 
#             target_folder='[241216] user_prompts_demo_fine'
#             )

# create_json(purpose='with_negative', 
#             u_numbers=val_numbers, 
#             file_name="[241216] val_negative_demo.jsonl", 
#             target_folder="[241216] user_prompts_demo_fine"
#             )

In [32]:
# Contiguous ID ranges; superseded by the hand-picked lists further down.
train_numbers = list(range(15001, 15056))
val_numbers = list(range(15056, 15071))

# train_numbers = [15004, 15017, 15025, 15033, 15037, 15046, 15054, 15056, 15120, 15126, 15128, 15137]
# val_numbers = [15142, 15144, 15156, 15168]

# Hand-picked user numbers actually used for this run.
train_numbers = [
    15235, 15141, 15290, 15270, 15458, 15331, 15298, 15321, 15485, 15014,
    15294, 15041, 15130, 15253, 15052, 15340, 15418, 15360, 15003, 15231,
    15167, 15257, 15053, 15072, 15251, 15046, 15310, 15205, 15224, 15104,
]
val_numbers = [
    15245, 15378, 15178, 15268, 15487, 15429,
    15162, 15246, 15292, 15299, 15013, 15425,
]

# Generate the training split first, then the validation split; both read
# from the same prompt folder and emit only the correct index per question.
for split_numbers, split_file in (
    (train_numbers, "fine_train_negative_30.jsonl"),
    (val_numbers, "fine_val_negative_12.jsonl"),
):
    create_json(
        purpose='with_negative',
        u_numbers=split_numbers,
        file_name=split_file,
        target_folder="[fine] 15001 ~ 15500",
        random_ranking=False,
    )

처리된 데이터가 ../../gpt_finetuning_data\fine_train_negative_30.jsonl으로 저장되었습니다.
처리된 데이터가 ../../gpt_finetuning_data\fine_val_negative_12.jsonl으로 저장되었습니다.
