## 라이브러리 & Instruction 정의

In [5]:
import os
import re
import random
import json

# System-role prompt text: describes the task of picking, among five candidate
# article titles, the index number that belongs in the [MASK] position.
# The leading and trailing newlines are part of the string value.
instruction_demo_negative = """
You are a recommendation bot responsible for selecting the news article that the target user is most likely to prefer from a list of five candidate articles. The only information available for each candidate article is its title, which is written in Norwegian.
Your goal is to predict the index number of the news article that best fits in the position labeled [MASK].
"""

## 함수 정의

In [8]:
def _read_hidden_positions(positions_file):
    """Parse a ``hidden_positions.txt`` file into ``{user_id: [position, ...]}``.

    Each useful line has the shape ``U<digits>: [p1, p2, ...]``. Lines that do
    not match that shape (including blank lines) are ignored.

    Raises:
        FileNotFoundError: if ``positions_file`` does not exist.
        ValueError: if a bracketed token is non-empty and not an integer.
    """
    line_pattern = re.compile(r'(U\d+)\s*:\s*\[([^\]]+)\]')
    hidden_positions = {}
    with open(positions_file, 'r', encoding='utf-8') as f:
        for line in f:
            match = line_pattern.match(line.strip())
            if not match:
                continue
            tokens = (token.strip() for token in match.group(2).split(','))
            # Drop empty tokens so a trailing comma ("[1, 2,]") does not crash int().
            hidden_positions[match.group(1)] = [int(token) for token in tokens if token]
    return hidden_positions


def create_json(purpose, u_numbers, file_name, target_folder):
    """
    Build a chat-format JSONL file from per-user prompt files.

    For every number ``n`` in ``u_numbers`` the file
    ``prompts/{target_folder}/{purpose}/U{n}.txt`` is read. Each question block
    found in it is paired with an answer taken from
    ``prompts/{target_folder}/{purpose}/metadata/hidden_positions.txt``, and one
    JSON object with system / user / assistant messages is written per user to
    ``gpt_finetuning_data/{file_name}``.

    Args:
        purpose: Sub-folder name under ``prompts/{target_folder}``.
        u_numbers: Iterable of user numbers; user ids are formed as ``U{n}``.
        file_name: Name of the JSONL file created under ``gpt_finetuning_data``.
        target_folder: Folder name under ``prompts``.

    Returns:
        None. The result is written to disk and a confirmation is printed.

    Raises:
        FileNotFoundError: if ``hidden_positions.txt`` is missing.

    Notes:
        * Users whose prompt file is missing, or whose prompt contains no
          recognisable question block, are skipped.
        * Answers are matched to questions by rank: the i-th smallest question
          number receives the i-th listed position.
        * The system message is read from the module-level
          ``instruction_demo_negative``.
    """
    data_dir = f'prompts/{target_folder}/{purpose}'
    positions_file = os.path.join(data_dir, 'metadata', 'hidden_positions.txt')

    instruction = instruction_demo_negative
    hidden_positions = _read_hidden_positions(positions_file)

    # Compiled once, outside the per-user loop. Matches a header of the form
    # "User #U?<digits> prefers most [MASK] ...", five numbered candidate lines,
    # and the trailing "Question <n>." line.
    question_pattern = re.compile(
        r'User #U?\d+\s+prefers\s+most\s+\[MASK\]\s+among\s+the\s+following\s+five\s+articles:\s*'
        r'(1:[^\n]+)\s*'
        r'(2:[^\n]+)\s*'
        r'(3:[^\n]+)\s*'
        r'(4:[^\n]+)\s*'
        r'(5:[^\n]+)\s*'
        r'Question\s+(\d+)\.\s*The index number of the \[MASK\] is \?',
        re.DOTALL
    )

    data = []
    random_fallbacks = {}  # user_id -> question numbers that received a random answer
    for u_num in u_numbers:
        user_id = f'U{u_num}'
        user_file_path = os.path.join(data_dir, f'{user_id}.txt')
        if not os.path.exists(user_file_path):
            continue

        with open(user_file_path, 'r', encoding='utf-8') as f:
            prompt = f.read()

        # Only the question numbers are needed for the answer text; a set
        # collapses repeated numbers the same way a dict keyed by number would.
        question_numbers = sorted(
            {int(m.group(6)) for m in question_pattern.finditer(prompt)}
        )
        if not question_numbers:
            continue

        # Ground-truth positions for this user (empty when the user is not listed).
        positions = hidden_positions.get(user_id, [])
        assistant_content_lines = []
        for idx, q_num in enumerate(question_numbers):
            if idx < len(positions):
                correct_position = positions[idx]
            else:
                # NOTE(review): no ground truth is available here, so the label
                # is random. Kept for backward compatibility, but reported below
                # because such rows are not real labels.
                correct_position = random.randint(1, 5)
                random_fallbacks.setdefault(user_id, []).append(q_num)
            assistant_content_lines.append(f"Question {q_num}: {correct_position}")

        data.append({
            "messages": [
                {"role": "system", "content": instruction},
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": '\n'.join(assistant_content_lines)}
            ]
        })

    output_file = f'gpt_finetuning_data/{file_name}'
    # Create the output folder on first use instead of failing in open().
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        for data_entry in data:
            f.write(json.dumps(data_entry, ensure_ascii=False) + '\n')

    for user_id, q_nums in random_fallbacks.items():
        print(f"[경고] {user_id}: 정답 위치가 없어 Question {q_nums}에 무작위 정답(1~5)을 사용했습니다.")
    print(f"처리된 데이터가 {output_file}으로 저장되었습니다.")

## 실행

In [9]:
# User-number ranges for the two output files (upper bounds are exclusive).
train_numbers = list(range(15001, 15056))
val_numbers = list(range(15056, 15071))

# Same source folder and purpose for both files; only the user numbers and the
# output file name differ. Train is generated first, then validation.
for split_numbers, split_file_name in (
    (train_numbers, "[241216] train_negative_demo.jsonl"),
    (val_numbers, "[241216] val_negative_demo.jsonl"),
):
    create_json(
        purpose='with_negative',
        u_numbers=split_numbers,
        file_name=split_file_name,
        target_folder='[241216] user_prompts_demo_fine',
    )

처리된 데이터가 gpt_finetuning_data/[241216] train_negative_demo.jsonl으로 저장되었습니다.
처리된 데이터가 gpt_finetuning_data/[241216] val_negative_demo.jsonl으로 저장되었습니다.
