In [1]:
import os
import glob
import random
import pandas as pd
import datetime
import tiktoken
from collections import Counter, defaultdict
from IPython.display import display, Markdown

# In the user table, the History and Train columns are merged and used as the history.
file_name = "MIND_LLM.tsv"
user = pd.read_csv(
    f'../../../data/LLM/{file_name}',
    sep='\t',
    names=['User', 'History', 'Question'],
)

# News metadata; the URL and entity columns are dropped right after loading.
NNR_news = pd.read_csv(
    '../../../data/baseline/MIND_news.tsv',
    sep='\t',
    names=[
        'News ID',
        'Category',
        'SubCategory',
        'Title',
        'Abstract',
        'URL',
        'Title Entities',
        'Abstract Entities',
    ],
).drop(columns=['URL', 'Title Entities', 'Abstract Entities'])

In [15]:
# Prompt templates for the summarization request. The "OO" placeholder is
# replaced with the actual user ID before the text is sent to the model.
# Each template is spelled out line by line so that significant whitespace
# (e.g. the trailing space after "Most Preferred News Topics:") stays visible.
system_instruction = "\n".join([
    "",
    "Summarize and describe User #OO's news preferences from the information presented in the following [News of interest to the user].",
    "- Consider User's most preferred news.",
    "- Consider User's least preferred news as well.",
    "",
])

example_output_block = "\n".join([
    "",
    "## Example output ##",
    "User #OO's News Preferences Summary:",
    "",
    "Most Preferred News Topics: ",
    "1. ~",
    "2. ~",
    "3. ~",
    "...",
    "",
    "Less Preferred News Topics:",
    "1. ~",
    "2. ~",
    "3. ~",
    "...",
    "",
    "Overall:",
    "",
])

# 데이터 확인

In [2]:
# Show each input table under its own Markdown heading.
for _heading, _frame in (
    ("### 1. user<hr/>", user),
    ("### 2. NNR_news<hr/>", NNR_news),
):
    display(Markdown(_heading))
    display(_frame)

### 1. user<hr/>

Unnamed: 0,User,History,Question
0,1,N58584 N12900 N6233 N51706 N18777 N593 N16793 ...,N31978 N24207 N58271 N14184 N36789
1,1,N58584 N12900 N6233 N51706 N18777 N593 N16793 ...,N57809 N50675 N61404 N32544 N47602
2,1,N58584 N12900 N6233 N51706 N18777 N593 N16793 ...,N28213 N29801 N60992 N19592 N62128
3,2,N56586 N3046 N4209 N22293 N40545 N44495 N38659...,N22061 N4247 N61648 N27560 N22407
4,2,N56586 N3046 N4209 N22293 N40545 N44495 N38659...,N3128 N58295 N17965 N34099 N55186
...,...,...,...
187613,37639,N22185 N56586 N3500 N61826 N10629 N9824 N2745 ...,N8353 N59088 N33885 N50490 N17882
187614,37639,N22185 N56586 N3500 N61826 N10629 N9824 N2745 ...,N63058 N39151 N44183 N26224 N23615
187615,37639,N22185 N56586 N3500 N61826 N10629 N9824 N2745 ...,N61828 N41717 N8353 N48783 N46098
187616,37639,N22185 N56586 N3500 N61826 N10629 N9824 N2745 ...,N17228 N4156 N51220 N12330 N54869


### 2. NNR_news<hr/>

Unnamed: 0,News ID,Category,SubCategory,Title,Abstract
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the..."
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re..."
...,...,...,...,...,...
51277,N16909,weather,weathertopstories,"Adapting, Learning And Soul Searching: Reflect...",Woolsey Fire Anniversary: A community is forev...
51278,N47585,lifestyle,lifestylefamily,Family says 13-year-old Broadway star died fro...,
51279,N7482,sports,more_sports,St. Dominic soccer player tries to kick cancer...,"Sometimes, what happens on the sidelines can b..."
51280,N34418,sports,soccer_epl,How the Sounders won MLS Cup,"Mark, Jeremiah and Casey were so excited they ..."


# 코드 실행

In [13]:


############################################
# 1) 헬퍼 함수들
############################################

def parse_history_list(history_str):
    """Split a whitespace-separated string of news IDs into a list.

    Any value that is not a ``str`` (e.g. ``None`` or a float NaN) yields an
    empty list.
    """
    # str.split() without a separator already discards leading/trailing and
    # repeated whitespace, so the tokens need no further stripping.
    return history_str.split() if isinstance(history_str, str) else []

def parse_question_list(question_str):
    """Return the news IDs contained in a whitespace-separated question string.

    Non-string input (e.g. ``None`` or a float NaN) produces an empty list.
    """
    if isinstance(question_str, str):
        # Splitting on runs of whitespace never yields empty or padded tokens.
        return question_str.split()
    return []

def sample_negatives(
    positive_news_id,
    user_id,
    user_history_set,
    global_news_set,
    user_ns_count_map,
    news_map,
    category_ns="category",  # "category" or "subcategory"
    num_negatives=4
):
    """Randomly pick negative news IDs for one positive article.

    A news ID from ``global_news_set`` is a candidate when all of these hold:
      - it is neither ``positive_news_id`` nor in the user's history;
      - ``news_map`` has a (sub)category value for it;
      - the user has seen that (sub)category fewer than 2 times according to
        ``user_ns_count_map``.

    Args:
        positive_news_id: ID of the article the negatives are sampled for.
        user_id: Key into ``user_history_set`` and ``user_ns_count_map``.
        user_history_set: Mapping user ID -> iterable of news IDs in the
            user's history.
        global_news_set: Pool of news IDs to draw negatives from.
        user_ns_count_map: Mapping user ID -> mapping of (sub)category ->
            number of history items with that value. A user missing from this
            mapping is treated as having no counts.
        news_map: Mapping news ID -> dict with "Category" / "SubCategory".
        category_ns: "subcategory" filters on "SubCategory"; any other value
            filters on "Category".
        num_negatives: Maximum number of negatives to return.

    Returns:
        A list of at most ``num_negatives`` news IDs; all candidates (in
        random order) when fewer are available.

    Raises:
        KeyError: If ``user_id`` is not in ``user_history_set``.
    """
    excluded_ids = set(user_history_set[user_id]) | {positive_news_id}
    ns_key = "SubCategory" if category_ns == "subcategory" else "Category"
    # .get() avoids inserting an entry when a defaultdict is passed in and
    # tolerates a plain dict that has no counts for this user.
    seen_counts = user_ns_count_map.get(user_id, {})

    # Iterate in sorted order: set iteration order for strings changes between
    # interpreter runs (hash randomization), which would make the sample
    # irreproducible even with a fixed random seed.
    candidates = []
    for news_id in sorted(global_news_set):
        if news_id in excluded_ids:
            continue

        ns_val = news_map.get(news_id, {}).get(ns_key, None)
        if ns_val is None:
            continue

        # Skip (sub)categories the user has already seen two or more times.
        if seen_counts.get(ns_val, 0) >= 2:
            continue

        candidates.append(news_id)

    return random.sample(candidates, min(num_negatives, len(candidates)))

def _format_news_suffix(info, category_use, subcategory_use, show_both=False):
    """Build the bracketed (sub)category tag that follows a news title.

    With ``show_both`` set and both flags enabled, category and subcategory
    are shown together; otherwise the subcategory takes precedence over the
    category. Returns "" when both flags are disabled.
    """
    cat = info.get("Category", "unknown")
    subcat = info.get("SubCategory", "unknown")
    if show_both and category_use and subcategory_use:
        return f"[category : {cat} | subcategory : {subcat}]"
    if subcategory_use:
        return f"[subcategory : {subcat}]"
    if category_use:
        return f"[category : {cat}]"
    return ""


def generate_demonstrations(
    user_id,
    history_ids,
    user_history_set,
    global_news_set,
    news_map,
    user_ns_count_map,
    purpose="with_negative",
    category_use=True,
    subcategory_use=True,
    category_ns="category"
):
    """Render a user's click history as the demonstration lines of a prompt.

    - purpose == "only_positive": list the history items one per line.
    - any other purpose ("with_negative"): for every history item, sample up
      to 4 negatives via ``sample_negatives``, shuffle them together with the
      positive item and print the group under a header naming the positive.

    Args:
        user_id: User identifier used in the text and for negative sampling.
        history_ids: Ordered news IDs of the user's history.
        user_history_set, global_news_set, user_ns_count_map, category_ns:
            Passed through to ``sample_negatives``.
        news_map: Mapping news ID -> dict with "Title", "Category",
            "SubCategory"; missing entries fall back to placeholder text.
        category_use / subcategory_use: Control the tag after each title.

    Returns:
        A list of text lines (without trailing newlines except where a line
        deliberately ends in one).
    """
    if purpose == "only_positive":
        lines = ["[Click History]"]
        lines.append(f"User #{user_id}가 과거에 클릭한 뉴스 목록은 다음과 같습니다:\n")

        for idx, hid in enumerate(history_ids, start=1):
            info = news_map.get(hid, {})
            title = info.get("Title", f"(No Title:{hid})")
            suffix = _format_news_suffix(
                info, category_use, subcategory_use, show_both=True
            )
            lines.append(f"{idx}. {title} {suffix}" if suffix else f"{idx}. {title}")

        lines.append("")  # blank separator line
        return lines

    # with_negative
    lines = ["[News of Interest to the user]\n"]

    for idx, hid in enumerate(history_ids, start=1):
        pos_info = news_map.get(hid, {})
        pos_title = pos_info.get("Title", f"(No Title:{hid})")

        neg_ids = sample_negatives(
            positive_news_id=hid,
            user_id=user_id,
            user_history_set=user_history_set,
            global_news_set=global_news_set,
            user_ns_count_map=user_ns_count_map,
            news_map=news_map,
            category_ns=category_ns,
            num_negatives=4
        )

        combined = neg_ids + [hid]
        random.shuffle(combined)

        pos_suffix = _format_news_suffix(pos_info, category_use, subcategory_use)
        pos_label = f"{pos_title} {pos_suffix}" if pos_suffix else f"{pos_title}"
        # Report the real group size: fewer than 4 negatives may be available,
        # in which case a hard-coded "5" would contradict the list below.
        lines.append(
            f"{idx}) User #{user_id} prefers most <{pos_label}> "
            f"among the following {len(combined)} articles:"
        )

        # The positive item and its negatives, in shuffled order.
        for i_c, cid in enumerate(combined, start=1):
            cinfo = news_map.get(cid, {})
            ctitle = cinfo.get("Title", f"(No Title:{cid})")
            suffix = _format_news_suffix(cinfo, category_use, subcategory_use)
            lines.append(f"{i_c}: {ctitle} {suffix}" if suffix else f"{i_c}: {ctitle}")
        lines.append("")  # blank separator line

    return lines

def generate_question_block(
    user_id,
    question_str,
    news_map,
    category_use=True,
    subcategory_use=True,
    question_idx=1,
    method="top1"
):
    """Render one question: a header followed by the shuffled candidates.

    The first ID in ``question_str`` is treated as the positive article. The
    candidates are shuffled and the 1-based position of the positive article
    in the shuffled list is returned alongside the text lines.

    Returns:
        ``(lines, pos_index)``; ``([], None)`` when ``question_str`` holds no
        news IDs.
    """
    candidate_ids = parse_question_list(question_str)
    if not candidate_ids:
        return [], None

    positive_id = candidate_ids[0]
    shuffled = list(candidate_ids)
    random.shuffle(shuffled)

    # 1-based position of the first occurrence of the positive article.
    pos_index = shuffled.index(positive_id) + 1

    if method == "ranking":
        header = f"Question {question_idx}) [Ranking] User #{user_id}가 가장 좋아하는 기사는 [MASK]입니다."
    elif method == "origin":
        header = f"Question {question_idx}) [Origin] User #{user_id}가 가장 좋아하는 뉴스를 고르시오."
    else:
        header = f"Question {question_idx}) User #{user_id} prefers most [MASK] among the following {len(shuffled)} articles:"

    lines = [header]
    for position, news_id in enumerate(shuffled, start=1):
        entry = news_map.get(news_id, {})
        text = f"{position}: {entry.get('Title', f'(No Title:{news_id})')}"

        # The subcategory tag wins whenever it is enabled; the category tag is
        # used only when the subcategory is switched off.
        if subcategory_use:
            text += f" [subcategory : {entry.get('SubCategory', 'unknown')}]"
        elif category_use:
            text += f" [category : {entry.get('Category', 'unknown')}]"

        lines.append(text)

    lines.append("")  # blank separator line
    return lines, pos_index


############################################
# 2) Summarization을 포함한 메인 함수
############################################

def create_prompts_MIND(
    user_df,
    news_df,
    output_folder="prompt_MIND",
    gpt_model="gpt-4",
    max_history=None,
    max_question=None,
    start_user=None,
    max_user=None,
    category_use=True,
    subcategory_use=True,
    category_ns="category",   # "category" or "subcategory"
    purpose="with_negative",  # "only_positive" or "with_negative"
    method="top1",            # "top1", "ranking", "origin", ...
    do_summary=True,          # whether to summarize the demonstrations
    openai_client=None,       # OpenAI-style client exposing chat.completions.create
):
    """Build one prompt file per user and write metadata about the run.

    For every user in ``user_df`` (optionally restricted by ID range):
      (1) build the demonstration block from the user's history;
      (2) optionally ask the chat model to summarize it;
      (3) append the [Questions] block;
      (4) save the final prompt and collect metadata.

    Reads the module-level ``system_instruction`` and ``example_output_block``
    templates, replacing "OO" with the user ID.

    Args:
        user_df: DataFrame with "User", "History" and "Question" columns.
        news_df: DataFrame read by position: news ID, category, subcategory,
            title and (optionally) abstract.
        output_folder: Folder name under ``../../prompts/``; prompts are
            written to ``../../prompts/{output_folder}/{purpose}``. Existing
            ``*.txt`` files there (and in its ``metadata`` subfolder) are
            deleted first.
        gpt_model: Model name used for token counting and for the summary call.
        max_history: Keep only the last ``max_history`` history items.
        max_question: Keep only the first ``max_question`` question rows.
        start_user / max_user: Inclusive bounds on the "User" column.
        category_use / subcategory_use / category_ns / purpose / method:
            Forwarded to the demonstration and question builders.
        do_summary: When True and ``openai_client`` is given, the
            demonstrations are replaced by the model's summary. Otherwise the
            raw demonstration text is used as-is.
        openai_client: Client object used for the summary request.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # 0) Tokenizer, used only to count the tokens of each final prompt.
    encoding = tiktoken.encoding_for_model(gpt_model)
    output_folder = f'../../prompts/{output_folder}/{purpose}'

    # 1) Restrict to the requested (inclusive) range of user IDs.
    if start_user is not None:
        user_df = user_df[user_df["User"] >= start_user]
    if max_user is not None:
        user_df = user_df[user_df["User"] <= max_user]

    # 2) news ID -> metadata, read by column position.
    news_map = {}
    for row in news_df.itertuples(index=False):
        news_map[row[0]] = {
            "Category": row[1],
            "SubCategory": row[2],
            "Title": row[3],
            "Abstract": row[4] if len(row) > 4 else ""
        }

    # 3) Per-user history: de-duplicated, first-seen order preserved.
    #    Every user gets an entry, even when the History value is empty or
    #    missing, so later lookups never raise KeyError.
    user_history_ordered = {}
    user_history_set = {}
    for row in user_df.itertuples(index=False):
        uid = row.User
        ordered = user_history_ordered.setdefault(uid, [])
        seen = user_history_set.setdefault(uid, set())
        for news_id in parse_history_list(row.History):
            if news_id not in seen:
                ordered.append(news_id)
                seen.add(news_id)

    # 3-1) Per-user count of history items per category / subcategory.
    ns_key = "SubCategory" if category_ns == "subcategory" else "Category"
    user_ns_count_map = defaultdict(Counter)
    for uid, hist_ids in user_history_set.items():
        counts = Counter()
        for nid in hist_ids:
            ns_val = news_map.get(nid, {}).get(ns_key, None)
            if ns_val:
                counts[ns_val] += 1
        user_ns_count_map[uid] = counts

    # Pool for negative sampling: the union of all selected users' histories.
    global_news_set = set()
    for hist_ids in user_history_set.values():
        global_news_set |= hist_ids

    # 4) Prepare the output folder (stale prompt files are removed).
    if os.path.exists(output_folder):
        for stale in glob.glob(os.path.join(output_folder, "*.txt")):
            os.remove(stale)
    else:
        os.makedirs(output_folder, exist_ok=True)

    # 5) Metadata collected while iterating over users.
    token_stats_list = []
    hidden_positions_data = []

    for user_id, group in user_df.groupby("User"):
        # A) De-duplicated, ordered history ([] when the user has none).
        hist_list = user_history_ordered.get(user_id, [])

        # B) Keep only the most recent max_history items.
        if max_history is not None and len(hist_list) > max_history:
            hist_list = hist_list[-max_history:]

        # 1) Demonstration block.
        demo_lines = generate_demonstrations(
            user_id=user_id,
            history_ids=hist_list,
            user_history_set=user_history_set,
            global_news_set=global_news_set,
            news_map=news_map,
            user_ns_count_map=user_ns_count_map,
            purpose=purpose,
            category_use=category_use,
            subcategory_use=subcategory_use,
            category_ns=category_ns
        )
        demonstration_text = "\n".join(demo_lines)

        instruction = system_instruction.replace("OO", str(user_id))
        example_block = example_output_block.replace("OO", str(user_id))

        # The example-output block is guidance for the summarizer only, so it
        # is kept separate from the text that may end up in the final prompt.
        summary_request_text = demonstration_text + "\n" + example_block

        print(f"user : {user_id}")
        print(f"instruction : {instruction}")
        print(f"demonstration_text : {summary_request_text}\n")

        # 2) Optional summary through the chat API.
        if do_summary and openai_client is not None:
            messages = [
                {"role": "system", "content": instruction},
                {"role": "user", "content": summary_request_text}
            ]
            response = openai_client.chat.completions.create(
                model=gpt_model,
                messages=messages
            )
            # The message content may be None; treat that as an empty summary.
            summary_text = (response.choices[0].message.content or "").strip()
        else:
            # No summary requested/possible: use the raw demonstrations,
            # without the summarizer-only example block.
            summary_text = demonstration_text

        # 3) Questions block.
        row_records = group.to_dict("records")
        if max_question is not None and len(row_records) > max_question:
            row_records = row_records[:max_question]

        q_lines = []
        q_lines.append("[Questions]")
        q_lines.append(
            f"Based on User #{user_id}'s News Preferences Summary, predict the index number of the news article "
            f"that best fits the position labeled [MASK] for each question.\n"
        )

        user_q_positions = []
        for idx_q, rr in enumerate(row_records, start=1):
            block_lines, pos_idx = generate_question_block(
                user_id=user_id,
                question_str=rr["Question"],
                news_map=news_map,
                category_use=category_use,
                subcategory_use=subcategory_use,
                question_idx=idx_q,
                method=method
            )
            q_lines.extend(block_lines)
            # NOTE(review): rows without candidates add no position, so the
            # list can be shorter than the number of question rows.
            if pos_idx is not None:
                user_q_positions.append(pos_idx)

        q_lines.append(
            f"Please provide just the answers to each of User #{user_id}'s question without any explanations."
        )

        # 4) Summary (or raw demonstrations) followed by the questions.
        final_prompt = "\n\n".join([summary_text, "\n".join(q_lines)])

        # 5) Token and size statistics.
        token_count = len(encoding.encode(final_prompt))
        question_count = len(row_records)
        hist_num = len(hist_list)

        # 6) Save the prompt.
        out_path = os.path.join(output_folder, f"U{user_id}.txt")
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(final_prompt)

        # 7) Metadata. The question count is recorded twice: once in the
        #    "Output Tokens" slot and once as the number of questions.
        token_stats_list.append((user_id, token_count, question_count, hist_num, question_count))
        hidden_positions_data.append((user_id, user_q_positions))

    # -------------------------
    # Write metadata
    # -------------------------
    meta_folder_path = os.path.join(output_folder, "metadata")
    if os.path.exists(meta_folder_path):
        for file_path in glob.glob(os.path.join(meta_folder_path, "*.txt")):
            os.remove(file_path)
    else:
        os.makedirs(meta_folder_path, exist_ok=True)

    # output_metadata.txt: one line per user plus totals.
    meta_file_path = os.path.join(meta_folder_path, "output_metadata.txt")
    sorted_list = sorted(token_stats_list, key=lambda x: x[0])
    total_tokens = 0
    total_q = 0

    with open(meta_file_path, "w", encoding="utf-8") as mf:
        for (uid, tcount, qc, hnum, qnum) in sorted_list:
            line = (
                f"User ID: U{uid:<5}  "
                f"Input Tokens: {tcount:<6}  "
                f"Output Tokens: {qc:<4}  "
                f"History 수: {hnum:<3}  "
                f"Question 수: {qnum}"
            )
            total_tokens += tcount
            total_q += qc
            mf.write(line + "\n")
        mf.write("\n")
        mf.write(f"Total Input Tokens : {total_tokens}\n")
        mf.write(f"Total Output Tokens : {total_q}\n")

    # hidden_positions.txt: 1-based position of the positive article per question.
    hidden_path = os.path.join(meta_folder_path, "hidden_positions.txt")
    with open(hidden_path, "w", encoding="utf-8") as hf:
        for uid, pos_list in hidden_positions_data:
            hf.write(f"U{uid:<5}: {pos_list}\n")

    print(f"[INFO] 프롬프트가 생성되었습니다: {output_folder}")
    print(f"[INFO] 메타데이터가 저장되었습니다: {meta_folder_path}")


## 실행
[create_prompts_MIND 함수 인자]
- purpose : 어떤 목적으로 prompt를 생성할 것인지  [only_positive / with_negative]   
- gpt_model : 사용할 gpt 모델 (token 계산 및 요약 호출에 사용)  [gpt-4o-mini / gpt-3.5-turbo 등]
- start_user : 처리할 User ID의 시작 값 (이 값 이상인 사용자만 처리)
- max_user : 처리할 User ID의 최대 값 (이 값 이하인 사용자만 처리, User 값이 숫자라고 가정)
- output_folder : 결과를 저장할 폴더 이름
- max_history : History에서 최근 n개만 사용
- max_question : Question에서 앞의 n개 행만 사용 (최대 질문 수)
- method : top1 / ranking / origin 
- do_summary / openai_client : 요약 실행 여부와 요약에 사용할 OpenAI 클라이언트

In [14]:
from openai import OpenAI
from dotenv import load_dotenv
import os

load_dotenv()  # load variables from the .env file
my_api_key = os.getenv("API_KEY")  # API key is read from the environment
client = OpenAI(api_key=my_api_key)

# Options for this run: users with ID 1 through 2, subcategory-based negative
# sampling, and summarization through the OpenAI client.
run_options = dict(
    user_df=user,
    news_df=NNR_news,
    output_folder="요약임시",
    gpt_model="gpt-4",
    max_history=None,
    max_question=None,
    start_user=1,
    max_user=2,
    category_use=True,
    subcategory_use=True,
    category_ns="subcategory",
    purpose="with_negative",
    method="top1",
    do_summary=True,        # run the summarization step
    openai_client=client,   # use the openai library client
)
create_prompts_MIND(**run_options)


user : 1
instruction : 
Summarize and describe User #1's news preferences from the information presented in the following [News of interest to the user].
- Consider User's most preferred news.
- Consider User's least preferred news as well.

demonstration_text : [News of Interest to the user]

1) User #1 prefers most <2020 Chevrolet Corvette C8 vs. C8.R: Here's How the Race Car is Different [subcategory : autossports]> among the following 5 articles:
1: Why Healthy People Faint and How to Treat Them [subcategory : medical]
2: Man sets pet lion on electrician who tried to collect pay, police say [subcategory : newsworld]
3: Body of missing Alabama girl found; 2 being charged [subcategory : newscrime]
4: Woman found dead in Ohio home was likely killed by her Great Danes, police say [subcategory : newsus]
5: 2020 Chevrolet Corvette C8 vs. C8.R: Here's How the Race Car is Different [subcategory : autossports]

2) User #1 prefers most <'Joker' Continues to Smash Box Office Expectations in 2

## ---