In [1]:
import os
import glob
import random
import pandas as pd
import datetime
import tiktoken
from collections import Counter, defaultdict
from IPython.display import display, Markdown

from openai import OpenAI
from dotenv import load_dotenv

# --- Environment, API client and input tables -------------------------------
load_dotenv() # load variables from the .env file
my_api_key = os.getenv("API_KEY") # read the API key from the environment

client = OpenAI(
    api_key = my_api_key
)

# Original note: the History and Train columns of the user DF are combined and
# used as the history.
# NOTE(review): no such merge happens in this cell - presumably MIND_LLM.tsv was
# already built that way upstream; confirm.
file_name = "MIND_LLM.tsv"
# Tab-separated file read with explicit column names: User, History, Question.
user = pd.read_csv(f'../../../data/LLM/{file_name}', sep='\t', names=['User', 'History', 'Question'])

# News metadata table; the URL and entity columns are dropped right after loading.
NNR_news = pd.read_csv('../../../data/baseline/MIND_news.tsv', sep='\t', names=['News ID','Category','SubCategory','Title','Abstract','URL','Title Entities','Abstract Entities'])
NNR_news = NNR_news.drop(columns=['URL', 'Title Entities', 'Abstract Entities'])

In [2]:
# Summary prompt, first draft: system message plus the expected output layout.
# "OO" is a placeholder; create_prompts_MIND replaces it with the user id.
# NOTE(review): both names are re-assigned by later cells in this file, so
# create_prompts_MIND reads whichever of those cells was executed last.
system_instruction = """
Summarize and describe User #OO's news preferences from the information presented in the following [News of interest to the user].
- Identify User's most preferred news.
- Identify User's least preferred news as well.
- Describe the user's overall news preferences 
"""

example_output_block  = """
The output is formatted as ## Example output ##
## Example output ##

[User #OO's News Preferences Summary]
Most Preferred News Topics: 
1. ~
2. ~
3. ~
...

Less Preferred News Topics:
1. ~
2. ~
3. ~
...

Overall:
########

"""

In [5]:
# Summary prompt, structured variant: ranked themes with example headlines,
# less-preferred themes, and a short overall section.
# "OO" is a placeholder; create_prompts_MIND replaces it with the user id.
# NOTE(review): these two names are also assigned in other cells of this file;
# create_prompts_MIND reads whichever assignment was executed last.
system_instruction = """
You are a news-preference analyst.  
Your job is to read a list of “News of Interest to the user” and write a concise, insight-oriented summary of that user’s tastes.  

Guidelines  

1. Count how many times each 'subcategory' appears as the “preferred most” choice, then rank them.  
2. Cluster similar subcategories into broader themes (e.g., all football-related subcategories → “American Football”).  
3. In the Most Preferred News Topics section, provide a list of the top 3-5 high-level themes, accompanied by 1-2 representative headlines.  
4. For Less Preferred News Topics, look at themes that were rarely or never chosen when offered and mention 3-5 of them.  
5. End with an Overall Summary (2-3 sentences) describing the user’s general interests, tone, and any noticeable nuances (e.g., “enjoys inspirational human-interest pieces but ignores recipe articles”).
"""

example_output_block  = """
Output format  

[User #OO’s News Preferences Summary]  
Most Preferred News Topics:  
1. Theme Name - brief description  
   • Example Headline  
   • Example Headline  

2. …  

Less Preferred News Topics:  
1. Theme Name - brief description  
2. …  

Overall:  
- Sentence 1  
- Sentence 2  

Important style rules  

* Keep bullet points short (max ~20 words each).   
* Write in the third person (e.g., “User #1 prefers…”).  
"""

In [2]:
# Summary prompt, free-text variant: a 2-4 sentence overall summary only.
# "OO" is a placeholder; create_prompts_MIND replaces it with the user id.
# NOTE(review): these two names are also assigned in earlier cells of this file;
# create_prompts_MIND reads whichever assignment was executed last.
system_instruction = """
You are a news-preference analyst

Read the list of “News of Interest to the user.” Each entry presents five articles and highlights the one the user found most interesting.

Based on these selections, write a brief Overall Summary in 2-4 sentences that captures the user’s general interests and highlights any noticeable patterns, including what the user tends to overlook or show less interest in.
"""

example_output_block  = """
Output format  

[User #OO’s News Preferences Summary]
User #OO is ... 
"""

### recurrent template 1

In [None]:
# Recurrent-update prompt, template 1. create_prompts_MIND sends
# recurrent_instruction as the system message and appends recurrent_output_block
# to the user message for every history chunk after the first.
# NOTE(review): both names are re-assigned by later cells in this file, so the
# assignment executed last is the one in effect.
recurrent_instruction = """
You are a news-preference analyst

Task
Using the materials below, write an updated, stand-alone “News Preferences Summary” that considers the user’s latest interests
"""

recurrent_output_block  = """
Guidelines
1. Read the new “News of Interest to the user” list and, together with the previous summary, infer any changes in preferences.
2. If the new articles indicate shifting tastes, reorder, add, or remove themes accordingly.
3. Match exactly the original structure, headings, and bullet style:
    • Most Preferred News Topics - list 3-5 high-level themes, each followed by 1-3 illustrative headlines.
    • Less Preferred News Topics - list 3-5 themes that the user rarely or never chooses.
    • Overall - two sentences capturing overarching interests, tone, and nuances.
4. Keep every bullet ≤ 20 words and write in the third person (“User #1000 prefers …”).
5. Do not mention the update process, the guidelines, or this prompt in your output.

Output
Return only the updated summary in the specified format.
"""

### recurrent template 2

In [6]:
# Recurrent-update prompt, template 2: restates the structured output layout so
# the updated summary keeps the same format as the previous one.
# "OO" is a placeholder; create_prompts_MIND replaces it with the user id.
# NOTE(review): these two names are also assigned in other cells of this file;
# the assignment executed last is the one in effect.
recurrent_instruction = """
You are a news-preference analyst

Task
Given a user’s “Previous Preferences Summary” and New “News of Interest to the user”, we analyze whether the user’s news preferences and habits have changed.
You must generate an updated and concise summary that captures the user's news reading preferences by considering both the 'Previous Preferences Summary' and the user's new 'News of Interest to the user'.
"""

recurrent_output_block  = """
Guidelines 
1. Keep bullet points short (max ~20 words each).   
2. Write in the third person (e.g., “User #1 prefers…”).

Output
The newly generated user preference summary must match the format of the previous preference summary.

[User #OO’s News Preferences Summary]  
Most Preferred News Topics:  
1. Theme Name - brief description  
   • Example Headline  
   • Example Headline  

2. …  

Less Preferred News Topics:  
1. Theme Name - brief description  
2. …  

Overall:  
- Sentence 1  
- Sentence 2  

"""

In [3]:
# Recurrent-update prompt, free-text variant: the updated summary is a short
# stand-alone paragraph rather than a structured list.
# "OO" is a placeholder; create_prompts_MIND replaces it with the user id.
# NOTE(review): these two names are also assigned in earlier cells of this file;
# the assignment executed last is the one in effect.
recurrent_instruction = """
You are a news-preference analyst

You are given two inputs:
The user’s “Previous Preferences Summary”, and A new list of “News of Interest to the user,” where each entry highlights the article the user found most interesting among five options.

Your task is to generate an updated News Preferences Summary that fully reflects the user’s current news reading preferences, taking into account both the previous summary and the newly observed selections.

Be sure to consider not only the types of content the user tends to prefer, but also those they are more likely to overlook or show less interest in.

The updated summary should follow the same format and tone as the original summary.
It should serve as a stand-alone summary of the user’s interests—not a comparison or commentary on the update process.
"""

recurrent_output_block  = """
Output Format:

[User #OO’s News Preferences Summary]  
User #OO is ...

"""

# 데이터 확인

In [9]:
# Render both input tables in the notebook output, each under a markdown heading.
display(Markdown("### 1. user<hr/>"))
# display(user[user['User']==1]['Question'][0])
display(user)
display(Markdown("### 2. NNR_news<hr/>"))
display(NNR_news)

### 1. user<hr/>

Unnamed: 0,User,History,Question
0,1,N58584 N12900 N6233 N51706 N18777 N593 N16793 ...,N31978 N24207 N58271 N14184 N36789
1,1,N58584 N12900 N6233 N51706 N18777 N593 N16793 ...,N57809 N50675 N61404 N32544 N47602
2,1,N58584 N12900 N6233 N51706 N18777 N593 N16793 ...,N28213 N29801 N60992 N19592 N62128
3,2,N56586 N3046 N4209 N22293 N40545 N44495 N38659...,N22061 N4247 N61648 N27560 N22407
4,2,N56586 N3046 N4209 N22293 N40545 N44495 N38659...,N3128 N58295 N17965 N34099 N55186
...,...,...,...
187613,37639,N22185 N56586 N3500 N61826 N10629 N9824 N2745 ...,N8353 N59088 N33885 N50490 N17882
187614,37639,N22185 N56586 N3500 N61826 N10629 N9824 N2745 ...,N63058 N39151 N44183 N26224 N23615
187615,37639,N22185 N56586 N3500 N61826 N10629 N9824 N2745 ...,N61828 N41717 N8353 N48783 N46098
187616,37639,N22185 N56586 N3500 N61826 N10629 N9824 N2745 ...,N17228 N4156 N51220 N12330 N54869


### 2. NNR_news<hr/>

Unnamed: 0,News ID,Category,SubCategory,Title,Abstract
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the..."
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re..."
...,...,...,...,...,...
51277,N16909,weather,weathertopstories,"Adapting, Learning And Soul Searching: Reflect...",Woolsey Fire Anniversary: A community is forev...
51278,N47585,lifestyle,lifestylefamily,Family says 13-year-old Broadway star died fro...,
51279,N7482,sports,more_sports,St. Dominic soccer player tries to kick cancer...,"Sometimes, what happens on the sidelines can b..."
51280,N34418,sports,soccer_epl,How the Sounders won MLS Cup,"Mark, Jeremiah and Casey were so excited they ..."


# 코드 실행

In [4]:
# =============================================================
# 1) 헬퍼 함수들
# =============================================================

def parse_list(origin_str):
    """Split a whitespace-separated ID string into a list of IDs.

    Used for the ``History`` and ``Question`` cells (the latter contains the
    answer ID as well).

    Example: ``'N123 N456'`` -> ``['N123', 'N456']``. Commas are NOT treated
    as separators.

    Args:
        origin_str: Raw cell value. Any non-string value (``None``, a float
            NaN, ...) is treated as "no IDs".

    Returns:
        list[str]: The IDs in their original order; ``[]`` for non-string or
        blank input.
    """
    if not isinstance(origin_str, str):
        return []
    # split() without a separator already trims each token and drops empties.
    return origin_str.split()

def sample_negatives(
    positive_news_id,
    user_id,
    user_history_set,
    global_news_set,
    user_ns_count_map,
    news_map,
    category_ns="category",  # "category" or "subcategory"
    num_negatives=4
):
    """Sample negative articles to show next to one positive (clicked) article.

    A news ID is eligible when all of the following hold:
      * it is neither the positive article nor in the user's click history,
      * ``news_map`` has a category (or subcategory) value for it,
      * the user clicked that category (or subcategory) fewer than 2 times.

    Args:
        positive_news_id: ID of the clicked article; also used as the RNG seed,
            so the same positive always yields the same negatives.
        user_id: Key into ``user_history_set`` and ``user_ns_count_map``.
        user_history_set: Mapping user id -> set of clicked news IDs.
        global_news_set: Pool of news IDs to draw negatives from.
        user_ns_count_map: Mapping user id -> {category value: click count}.
        news_map: Mapping news ID -> dict with "Category" / "SubCategory".
        category_ns: "subcategory" filters on SubCategory; any other value
            filters on Category.
        num_negatives: Maximum number of negatives to return.

    Returns:
        list: Up to ``num_negatives`` news IDs; fewer when the candidate pool
        is smaller.
    """
    # Users missing from either mapping simply have no history / no counts.
    clicked = user_history_set.get(user_id, ())
    ns_counts = user_ns_count_map.get(user_id, {})
    ns_key = "SubCategory" if category_ns == "subcategory" else "Category"

    candidate_list = []
    # Sorted iteration fixes the candidate order, so the seeded sample below
    # does not depend on set iteration order.
    for news_id in sorted(global_news_set):
        # Exclude the positive explicitly: it is not guaranteed to be part of
        # the user's history set.
        if news_id == positive_news_id or news_id in clicked:
            continue

        ns_val = news_map.get(news_id, {}).get(ns_key)
        if ns_val is None:
            continue

        # Skip categories the user already clicked twice or more, so the
        # negatives come from topics the user rarely reads.
        if ns_counts.get(ns_val, 0) >= 2:
            continue

        candidate_list.append(news_id)

    # A private generator seeded with the positive ID draws exactly what
    # random.seed(...) + random.sample(...) would, without touching (or having
    # to restore) the global RNG state.
    rng = random.Random(positive_news_id)
    return rng.sample(candidate_list, min(len(candidate_list), num_negatives))

def _format_news_suffix(cat, subcat, category_use, subcategory_use):
    """Return the bracketed category tag for one article ('' if both flags are off)."""
    if category_use and subcategory_use:
        return f"[category : {cat} | subcategory : {subcat}]"
    if category_use:
        return f"[category : {cat}]"
    if subcategory_use:
        return f"[subcategory : {subcat}]"
    return ""


def generate_demonstrations(
    user_id,
    history_ids,
    user_history_set,
    global_news_set,
    news_map,
    user_ns_count_map,
    purpose="with_negative",
    category_use=True,
    subcategory_use=True,
    category_ns="category"
):
    """Render a user's click history as natural-language prompt lines.

    Args:
        user_id: User id printed in the text and forwarded to sample_negatives.
        history_ids: Clicked news IDs to render, in order.
        user_history_set, global_news_set, user_ns_count_map, category_ns:
            Forwarded unchanged to sample_negatives ("with_negative" mode only).
        news_map: Mapping news ID -> dict with "Title", "Category",
            "SubCategory"; missing entries fall back to placeholder text.
        purpose: "only_positive" lists the clicked articles only; any other
            value renders each click together with sampled negatives.
        category_use: Include the category in the article tags.
        subcategory_use: Include the subcategory in the article tags.

    Returns:
        list[str]: Prompt lines, meant to be joined with a newline.
    """
    if purpose == "only_positive":
        # Plain numbered list of the clicked articles.
        lines = ["[Click History]",
                 f"The news articles that User #{user_id} clicked before are as follows:\n"]
        for idx, hid in enumerate(history_ids, start=1):
            info = news_map.get(hid, {})
            title = info.get("Title", f"(No Title:{hid})")
            suffix = _format_news_suffix(
                info.get("Category", "unknown"),
                info.get("SubCategory", "unknown"),
                category_use,
                subcategory_use,
            )
            lines.append(f"{idx}. {title} {suffix}" if suffix else f"{idx}. {title}")
        lines.append("")
        return lines

    # ---------------- "with_negative" mode ----------------
    lines = ["[News of Interest to the user]\n"]

    for idx, hid in enumerate(history_ids, start=1):
        # Metadata of the positive (clicked) article.
        pos_info = news_map.get(hid, {})
        pos_title = pos_info.get("Title", f"(No Title:{hid})")
        pos_suffix = _format_news_suffix(
            pos_info.get("Category", "unknown"),
            pos_info.get("SubCategory", "unknown"),
            category_use,
            subcategory_use,
        )

        # Negatives shown next to the positive (default: up to 4).
        neg_ids = sample_negatives(
            positive_news_id=hid,
            user_id=user_id,
            user_history_set=user_history_set,
            global_news_set=global_news_set,
            user_ns_count_map=user_ns_count_map,
            news_map=news_map,
            category_ns=category_ns
        )

        combined = neg_ids + [hid]
        random.shuffle(combined)  # randomize where the positive appears

        # Lead sentence. The count comes from the actual list because
        # sample_negatives may return fewer negatives than requested, and the
        # tag is only added when non-empty so no double space is left behind.
        described = f"{pos_title} {pos_suffix}" if pos_suffix else f"{pos_title}"
        lines.append(f"{idx}) User #{user_id} prefers most {described} "
                     f"among the following {len(combined)} articles:")

        # Numbered list of the positive and its negatives.
        for i_c, cid in enumerate(combined, start=1):
            cinfo = news_map.get(cid, {})
            ctitle = cinfo.get("Title", f"(No Title:{cid})")

            # NOTE(review): unlike the lead sentence, a candidate shows only
            # its subcategory when both flags are on. Kept as-is; confirm this
            # asymmetry is intended.
            if subcategory_use:
                suffix = f"[subcategory : {cinfo.get('SubCategory', 'unknown')}]"
            elif category_use:
                suffix = f"[category : {cinfo.get('Category', 'unknown')}]"
            else:
                suffix = ""
            lines.append(f"{i_c}: {ctitle} {suffix}" if suffix else f"{i_c}: {ctitle}")
        lines.append("")

    return lines

def generate_question_block(
    user_id,
    question_str,
    news_map,
    category_use=True,
    subcategory_use=True,
    question_idx=1,
    method="top1",
):
    """Turn one Question cell (answer included) into a [MASK]-style question.

    The first ID in ``question_str`` is taken as the correct answer. All IDs
    are shuffled and listed as numbered candidates under a header line.

    Returns:
        tuple: ``(lines, pos_index)`` where ``lines`` are the prompt lines and
        ``pos_index`` is the 1-based position of the answer after shuffling;
        ``([], None)`` when the cell contains no IDs.
    """
    candidate_ids = parse_list(question_str)
    if not candidate_ids:
        return [], None

    # The answer is assumed to be the first ID of the cell.
    answer_id = candidate_ids[0]
    shuffled = list(candidate_ids)
    random.shuffle(shuffled)  # hide the answer at a random position

    # 1-based slot the answer landed in (recorded as hidden position).
    pos_index = shuffled.index(answer_id) + 1

    # Header wording depends on the requested method.
    if method == "ranking":
        first_line = (f"Question {question_idx}) [Ranking] User #{user_id}가 "
                      f"가장 좋아하는 기사는 [MASK]입니다.")
    elif method == "origin":
        first_line = (f"Question {question_idx}) [Origin] User #{user_id}가 "
                      f"가장 좋아하는 뉴스를 고르시오.")
    else:  # default "top1"
        first_line = (f"Question {question_idx}) User #{user_id} prefers most [MASK] "
                      f"among the following {len(shuffled)} articles:")

    lines = [first_line]

    # One numbered line per candidate article.
    for number, news_id in enumerate(shuffled, start=1):
        meta = news_map.get(news_id, {})
        headline = meta.get("Title", f"(No Title:{news_id})")
        category = meta.get("Category", "unknown")
        subcategory = meta.get("SubCategory", "unknown")

        if category_use:
            if subcategory_use:
                tag = f"[category : {category} | subcategory : {subcategory}]"
            else:
                tag = f"[category : {category}]"
        elif subcategory_use:
            tag = f"[subcategory : {subcategory}]"
        else:
            tag = ""

        if tag:
            lines.append(f"{number}: {headline} {tag}")
        else:
            lines.append(f"{number}: {headline}")

    lines.append("")
    return lines, pos_index

def split_into_chunks(seq, n, min_num=3):
    """Split ``seq`` into up to ``n`` contiguous, near-equal chunks.

    Every chunk holds at least ``min_num`` items whenever ``seq`` is long
    enough; when it is not, fewer chunks are produced (down to a single chunk
    containing the whole sequence). Leading chunks get the extra items when the
    length is not evenly divisible.

    Args:
        seq: Sliceable sequence (list, tuple, str, ...).
        n: Requested number of chunks; values below 1 are treated as 1.
        min_num: Minimum chunk length; values below 1 impose no minimum.

    Returns:
        list: Slices of ``seq`` in original order; ``[]`` for an empty ``seq``.
    """
    if not seq:
        return []

    total = len(seq)
    # Guard the division: a non-positive min_num means "no minimum length".
    max_possible_chunks = total // min_num if min_num > 0 else total
    # Clamp to at least one chunk so the divmod below can never divide by zero.
    n = max(1, min(n, max_possible_chunks))

    base, rem = divmod(total, n)

    out, idx = [], 0
    for i in range(n):
        size = base + 1 if i < rem else base  # first `rem` chunks take one extra
        out.append(seq[idx:idx + size])
        idx += size

    return out
# -------------------------------------------------------------

def call_openai(system_prompt: str, user_prompt: str,
                client: "OpenAI", model: str) -> str:
    """Send one system + user message pair and return the reply text.

    Args:
        system_prompt: Content of the "system" message.
        user_prompt: Content of the "user" message.
        client: Object exposing ``chat.completions.create`` (an OpenAI client).
        model: Model name passed through to the API.

    Returns:
        The first choice's message content with surrounding whitespace removed.

    Raises:
        RuntimeError: If the response has no choices or the first choice
            carries no text content.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    resp = client.chat.completions.create(
        model=model,
        messages=messages,
    )

    if not resp.choices:
        raise RuntimeError(f"Model {model!r} returned no choices.")

    # The content field can be None; report that explicitly instead of failing
    # on None.strip() with an opaque AttributeError.
    content = resp.choices[0].message.content
    if content is None:
        raise RuntimeError(f"Model {model!r} returned a message without text content.")
    return content.strip()

# =============================================================
# 2) 프롬프트 + 요약 생성 메인 함수
# =============================================================

def create_prompts_MIND(
    user_df,
    news_df,
    output_folder="prompt_MIND",
    gpt_model="gpt-4o-mini",
    max_history=None,
    max_question=None,
    start_user=None,
    max_user=None,
    category_use=True,
    subcategory_use=True,
    category_ns="category",   # basis for negative sampling
    purpose="with_negative",  # demonstration block mode
    method="top1",            # question block header style
    do_summary=True,          # whether to summarize the history with the LLM
    openai_client=None,       # OpenAI python client instance
    recurrent=True,           # whether to summarize chunk by chunk
    num_chunks=5              # how many chunks to split the history into
):
    """Write one prompt file per user plus two metadata files.

    For every user in ``user_df`` the prompt consists of a header (either an
    LLM-written preference summary or the raw demonstration text) followed by
    the user's questions. Files go to ``../../prompts/<output_folder>/<purpose>``;
    existing ``*.txt`` files there and in its ``metadata`` sub-folder are
    removed first.

    Reads the module-level templates ``system_instruction``,
    ``example_output_block``, ``recurrent_instruction`` and
    ``recurrent_output_block``.

    Args:
        user_df: DataFrame with "User", "History" and "Question" columns.
        news_df: DataFrame whose first columns are news ID, category,
            subcategory, title and (optionally) abstract, in that order.
        output_folder: Folder name below ``../../prompts``.
        gpt_model: Model used for the summary calls and for picking the
            tiktoken encoding.
        max_history: Keep only the most recent N history items (None = all).
        max_question: Keep only the first N question rows (None = all).
        start_user: Keep users with ``User >= start_user`` (None = no bound).
        max_user: Keep users with ``User <= max_user`` (None = no bound).
        category_use: Show categories in the prompt text.
        subcategory_use: Show subcategories in the prompt text.
        category_ns: "category" or "subcategory"; negative-sampling basis.
        purpose: Demonstration mode, "with_negative" or "only_positive".
        method: Question header style, "top1", "ranking" or "origin".
        do_summary: When False the raw demonstrations are used as the header.
        openai_client: Client used for the summary calls; required when
            ``do_summary`` is True.
        recurrent: When True, histories longer than 5 items are summarized
            chunk by chunk, each call refining the previous summary.
        num_chunks: Requested number of chunks in recurrent mode.

    Raises:
        ValueError: If ``do_summary`` is True but ``openai_client`` is None.
    """
    # Fail fast - before any existing prompt file is deleted below.
    if do_summary and openai_client is None:
        raise ValueError("openai_client is required when do_summary=True")

    # 0) tiktoken encoder (used only to count prompt tokens)
    encoding = tiktoken.encoding_for_model(gpt_model)
    output_folder = f'../../prompts/{output_folder}/{purpose}'

    # 1) Restrict to the requested user range
    if start_user is not None:
        user_df = user_df[user_df["User"] >= start_user]
    if max_user is not None:
        user_df = user_df[user_df["User"] <= max_user]

    # 2) News ID -> metadata dict
    news_map = {
        row[0]: {"Category": row[1], "SubCategory": row[2],
                 "Title": row[3], "Abstract": row[4] if len(row) > 4 else ""}
        for row in news_df.itertuples(index=False)
    }

    # 3) Per-user click history: de-duplicated, first-seen order preserved
    history_lists = defaultdict(list)   # ordered IDs
    history_sets = defaultdict(set)     # same IDs, for O(1) duplicate checks
    for row in user_df.itertuples(index=False):
        uid = row.User
        # Touch both maps up front so users whose History is empty still get
        # an (empty) entry instead of a KeyError in the user loop below.
        ordered, seen = history_lists[uid], history_sets[uid]
        for h in parse_list(row.History):
            if h not in seen:
                ordered.append(h)
                seen.add(h)

    user_history_ordered = dict(history_lists)
    user_history_set = dict(history_sets)

    # 3-1) Per-user click counts per category (or subcategory)
    ns_key = "SubCategory" if category_ns == "subcategory" else "Category"
    user_ns_count_map = defaultdict(Counter)
    for uid, hist_ids in user_history_set.items():
        cnt = Counter()
        for nid in hist_ids:
            ns_val = news_map.get(nid, {}).get(ns_key)
            if ns_val:
                cnt[ns_val] += 1
        user_ns_count_map[uid] = cnt

    # Pool of negative candidates: every article clicked by any selected user
    global_news_set = set().union(*user_history_set.values())

    # 4) Create the output folder and clear prompt files from earlier runs.
    # glob.escape is required because folder names may contain glob
    # metacharacters such as "[...]", which would otherwise match nothing.
    os.makedirs(output_folder, exist_ok=True)
    for old_file in glob.glob(os.path.join(glob.escape(output_folder), "*.txt")):
        os.remove(old_file)

    def demo_text(uid, history_ids):
        """Render ``history_ids`` of user ``uid`` as one demonstration string."""
        return "\n".join(generate_demonstrations(
            user_id=uid,
            history_ids=history_ids,
            user_history_set=user_history_set,
            global_news_set=global_news_set,
            news_map=news_map,
            user_ns_count_map=user_ns_count_map,
            purpose=purpose,
            category_use=category_use,
            subcategory_use=subcategory_use,
            category_ns=category_ns))

    def first_summary(uid, history_ids):
        """Request a summary of ``history_ids`` using the base instruction."""
        content_txt = demo_text(uid, history_ids) + "\n" + example_output_block.replace("OO", str(uid))
        return call_openai(
            system_instruction.replace("OO", str(uid)),
            content_txt, openai_client, gpt_model
        )

    # 5) Metadata accumulators
    token_stats_list = []       # (uid, prompt tokens, question count, history length, question count)
    hidden_positions_data = []  # (uid, [1-based answer position per question])

    # ---------------- per-user loop ----------------
    for user_id, group in user_df.groupby("User"):

        # A) History to use for this user
        hist_list = user_history_ordered[user_id]
        if max_history and len(hist_list) > max_history:
            hist_list = hist_list[-max_history:]  # keep the most recent items only

        # B) Header: raw demonstrations, a one-shot summary, or a recurrent summary
        if not do_summary:
            header_block = demo_text(user_id, hist_list)

        elif len(hist_list) <= 5 or not recurrent:
            header_block = first_summary(user_id, hist_list)

        else:
            chunks = split_into_chunks(hist_list, num_chunks)

            # (1) first chunk -> base instruction
            summary_prev = first_summary(user_id, chunks[0])

            # (2) later chunks -> refine the previous summary
            for ck in chunks[1:]:
                new_demo = demo_text(user_id, ck).replace("[News of Interest to the user]\n", "")
                content_txt = (f"<User #{user_id}'s Previous Preferences Summary>\n\n{summary_prev}\n\n"
                               f"<New “News of Interest to the user”>\n{new_demo}")

                # Progress/debug output, kept from the original notebook.
                print(content_txt)

                content_txt = content_txt + "\n" + recurrent_output_block.replace("OO", str(user_id))
                summary_prev = call_openai(
                    recurrent_instruction.replace("OO", str(user_id)),
                    content_txt, openai_client, gpt_model
                )
            header_block = summary_prev

        # C) Question block
        row_records = group.to_dict("records")
        if max_question is not None and len(row_records) > max_question:
            row_records = row_records[:max_question]

        q_lines = ["[Questions]",
                   (f"Based on User #{user_id}'s preferences, predict the index number "
                    f"of the news article that best fits the position labeled [MASK] for each question.\n")]

        user_q_positions = []
        for idx_q, rr in enumerate(row_records, start=1):
            block_lines, pos_idx = generate_question_block(
                user_id=user_id,
                question_str=rr["Question"],
                news_map=news_map,
                category_use=category_use,
                subcategory_use=subcategory_use,
                question_idx=idx_q,
                method=method,
            )
            q_lines.extend(block_lines)
            if pos_idx is not None:
                user_q_positions.append(pos_idx)

        q_lines.append(f"Please provide just the answers to each of User #{user_id}'s question without any explanations.")

        # D) Header + questions
        final_prompt = "\n\n".join([header_block, "\n".join(q_lines)])

        # E) Statistics
        token_count = len(encoding.encode(final_prompt))
        question_count = len(row_records)
        hist_num = len(hist_list)

        # F) Write the prompt file
        out_path = os.path.join(output_folder, f"U{user_id}.txt")
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(final_prompt)

        # G) Record metadata. The third field is reported as "Output Tokens"
        # below but holds the question count - presumably an estimate of one
        # answer token per question; TODO confirm.
        token_stats_list.append((user_id, token_count, question_count, hist_num, question_count))
        hidden_positions_data.append((user_id, user_q_positions))

    # ---------------- metadata files ----------------
    meta_folder_path = os.path.join(output_folder, "metadata")
    os.makedirs(meta_folder_path, exist_ok=True)
    for old_file in glob.glob(os.path.join(glob.escape(meta_folder_path), "*.txt")):
        os.remove(old_file)

    # (1) output_metadata.txt : per-user token counts and statistics
    meta_file_path = os.path.join(meta_folder_path, "output_metadata.txt")
    sorted_list = sorted(token_stats_list, key=lambda x: x[0])
    total_tokens = total_q = 0

    with open(meta_file_path, "w", encoding="utf-8") as mf:
        for uid, tcount, qc, hnum, qnum in sorted_list:
            mf.write((f"User ID: U{uid:<5}  Input Tokens: {tcount:<6}  Output Tokens: {qc:<4}  "
                      f"History 수: {hnum:<3}  Question 수: {qnum}\n"))
            total_tokens += tcount
            total_q += qc
        mf.write("\n")
        mf.write(f"Total Input Tokens : {total_tokens}\n")
        mf.write(f"Total Output Tokens : {total_q}\n")

    # (2) hidden_positions.txt : position of the correct answer per question
    hidden_path = os.path.join(meta_folder_path, "hidden_positions.txt")
    with open(hidden_path, "w", encoding="utf-8") as hf:
        for uid, pos_list in hidden_positions_data:
            hf.write(f"U{uid:<5}: {pos_list}\n")

    print(f"[INFO] 프롬프트가 생성되었습니다: {output_folder}")
    print(f"[INFO] 메타데이터가 저장되었습니다: {meta_folder_path}")

## 실행
[create_prompts_MIND 함수 인자]
- purpose: 어떤 목적으로 prompt를 생성할 것인지 [only_positive / with_negative]
- gpt_model: 사용할 GPT 모델 (요약 생성 및 token 계산 용도) [gpt-4o-mini / gpt-3.5-turbo]
- output_folder: 결과를 저장할 폴더 이름
- max_history: History에서 최근 n개만 사용
- max_question: 최대 질문 수 (Question에서 앞의 n개 행만 사용)
- start_user / max_user: prompt를 생성할 user 범위 (User 값이 start_user 이상, max_user 이하인 사용자만 처리; User 값이 숫자라고 가정)
- method: question 헤더 스타일 [top1 / ranking / origin]

In [None]:
# Generate prompts for users 1..1000 with a single (non-recurrent) LLM summary.
# NOTE(review): the folder name ends in "not summary" while do_summary=True -
# confirm which of the two is intended.
create_prompts_MIND(
    user_df=user,
    news_df=NNR_news,
    output_folder="MIND/[250424]cate,sub+cate-not summary",
    gpt_model="gpt-4o-mini",
    max_history=None,
    max_question=None,
    start_user=1,
    max_user=1000,
    category_use=True,
    subcategory_use=True,
    category_ns="category",
    purpose="with_negative",
    method="top1",
    do_summary=True,                  # run the LLM summary
    openai_client=client,            # OpenAI client instance to use
    recurrent=False,                # whether to summarize chunk by chunk
    num_chunks=5
)


[INFO] 프롬프트가 생성되었습니다: ../../prompts/MIND/[250424]cate,sub+cate-not summary/with_negative
[INFO] 메타데이터가 저장되었습니다: ../../prompts/MIND/[250424]cate,sub+cate-not summary/with_negative\metadata


## ---