In [3]:
import os
import glob
import random
import pandas as pd
import datetime
import tiktoken
from collections import Counter, defaultdict
from IPython.display import display, Markdown

# The user table's History column already merges the original History and Train
# columns, so it is used directly as each user's click history.
file_name = "MIND_LLM.tsv"
user = pd.read_csv(
    f'../../../data/LLM/{file_name}',
    sep='\t',
    names=['User', 'History', 'Question'],
)

# News metadata; the URL and entity columns are dropped right after loading.
NNR_news = pd.read_csv(
    '../../../data/baseline/MIND_news.tsv',
    sep='\t',
    names=['News ID','Category','SubCategory','Title','Abstract','URL','Title Entities','Abstract Entities'],
).drop(columns=['URL', 'Title Entities', 'Abstract Entities'])

# 데이터 확인

In [4]:
# Preview both input tables, each under its own Markdown heading.
display(
    Markdown("### 1. user<hr/>"),
    user,
    Markdown("### 2. NNR_news<hr/>"),
    NNR_news,
)

### 1. user<hr/>

Unnamed: 0,User,History,Question
0,1,N58584 N12900 N6233 N51706 N18777 N593 N16793 ...,N31978 N24207 N58271 N14184 N36789
1,1,N58584 N12900 N6233 N51706 N18777 N593 N16793 ...,N57809 N50675 N61404 N32544 N47602
2,1,N58584 N12900 N6233 N51706 N18777 N593 N16793 ...,N28213 N29801 N60992 N19592 N62128
3,2,N56586 N3046 N4209 N22293 N40545 N44495 N38659...,N22061 N4247 N61648 N27560 N22407
4,2,N56586 N3046 N4209 N22293 N40545 N44495 N38659...,N3128 N58295 N17965 N34099 N55186
...,...,...,...
187613,37639,N22185 N56586 N3500 N61826 N10629 N9824 N2745 ...,N8353 N59088 N33885 N50490 N17882
187614,37639,N22185 N56586 N3500 N61826 N10629 N9824 N2745 ...,N63058 N39151 N44183 N26224 N23615
187615,37639,N22185 N56586 N3500 N61826 N10629 N9824 N2745 ...,N61828 N41717 N8353 N48783 N46098
187616,37639,N22185 N56586 N3500 N61826 N10629 N9824 N2745 ...,N17228 N4156 N51220 N12330 N54869


### 2. NNR_news<hr/>

Unnamed: 0,News ID,Category,SubCategory,Title,Abstract
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the..."
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re..."
...,...,...,...,...,...
51277,N16909,weather,weathertopstories,"Adapting, Learning And Soul Searching: Reflect...",Woolsey Fire Anniversary: A community is forev...
51278,N47585,lifestyle,lifestylefamily,Family says 13-year-old Broadway star died fro...,
51279,N7482,sports,more_sports,St. Dominic soccer player tries to kick cancer...,"Sometimes, what happens on the sidelines can b..."
51280,N34418,sports,soccer_epl,How the Sounders won MLS Cup,"Mark, Jeremiah and Casey were so excited they ..."


# 코드 실행

In [21]:
import os
import glob
import random
import pandas as pd
import tiktoken
from collections import defaultdict, Counter

############################################
# 1) Helper Functions
############################################

def parse_history_list(history_str):
    """
    Split a MIND ``History`` field into its individual news IDs.

    The field is a whitespace-separated string of news IDs,
    e.g. "N58584 N12900 N6233". Any non-string input yields an empty list.
    """
    # str.split() without a separator already trims tokens and drops empty ones.
    return history_str.split() if isinstance(history_str, str) else []

def parse_question_list(question_str):
    """
    Split a MIND ``Question`` field into its candidate news IDs.

    The field is a whitespace-separated string, e.g.
    "N31978 N24207 N58271 N14184 N36789". By the dataset's convention the
    first ID is the actual positive and the remaining ones are negatives;
    this function only tokenizes and does not check the count.
    Any non-string input yields an empty list.
    """
    # str.split() without a separator already trims tokens and drops empty ones.
    return question_str.split() if isinstance(question_str, str) else []


############################################
# 2) Negative Sampling for History
############################################

def sample_negatives(
    positive_news_id,
    user_id,
    user_history_set,
    global_news_set,
    user_ns_count_map,
    news_map,
    category_ns="category",  # "category" or "subcategory"
    num_negatives=4
):
    """
    Draw random negative news IDs for one history article ('with_negative' mode).

    A news ID from ``global_news_set`` is eligible when all of these hold:
      1) it is neither in the user's history nor the positive article itself;
      2) ``news_map`` has a Category (or SubCategory when
         ``category_ns == "subcategory"``) value for it;
      3) the user has read that (sub)category fewer than 2 times according to
         ``user_ns_count_map``.

    Returns up to ``num_negatives`` eligible IDs picked with ``random.sample``;
    when fewer are eligible, all of them are returned in random order.
    """
    already_seen = {positive_news_id}.union(user_history_set[user_id])
    ns_field = "SubCategory" if category_ns == "subcategory" else "Category"

    def _is_eligible(news_id):
        if news_id in already_seen:
            return False
        ns_val = news_map.get(news_id, {}).get(ns_field, None)
        if ns_val is None:
            return False
        # Looked up lazily so the count map is only touched for real candidates.
        return user_ns_count_map[user_id].get(ns_val, 0) < 2

    pool = [news_id for news_id in global_news_set if _is_eligible(news_id)]
    return random.sample(pool, min(len(pool), num_negatives))


############################################
# 3) History (demonstration) generator
############################################

def generate_demonstrations(
    user_id,
    history_ids,
    user_history_set,
    global_news_set,
    news_map,
    user_ns_count_map,
    purpose="with_negative",
    category_use=True,
    subcategory_use=True,
    category_ns="category"
):
    """
    Build the history ("demonstration") part of a user's prompt.

    Modes (selected by ``purpose``):
    - "only_positive": the history articles are simply listed, no negatives.
    - anything else (normally "with_negative"): for every history article,
      negatives are drawn with ``sample_negatives`` (4 requested), shuffled
      together with the positive, and listed under a header that names the
      positive article.

    ``category_use`` / ``subcategory_use`` control the tag appended to each title:
      both True      -> "[category : X | subcategory : Y]"
      category only  -> "[category : X]"
      subcategory only -> "[subcategory : Y]"
      both False     -> no tag

    ``category_ns`` is forwarded to ``sample_negatives`` and only affects which
    negatives are eligible, not how articles are printed.

    Returns:
        list[str]: the prompt lines (not yet joined).
    """

    def _ns_tag(category, subcategory):
        # Bracketed category/subcategory tag, or "" when both flags are off.
        if category_use and subcategory_use:
            return f"[category : {category} | subcategory : {subcategory}]"
        if category_use:
            return f"[category : {category}]"
        if subcategory_use:
            return f"[subcategory : {subcategory}]"
        return ""

    def _describe(news_id):
        # "<title> <tag>" for one article; articles missing from news_map get
        # a "(No Title:<id>)" placeholder and "unknown" (sub)category.
        info = news_map.get(news_id, {})
        title = info.get("Title", f"(No Title:{news_id})")
        tag = _ns_tag(info.get("Category", "unknown"), info.get("SubCategory", "unknown"))
        return f"{title} {tag}" if tag else f"{title}"

    if purpose == "only_positive":
        lines = ["[Click History]"]
        lines.append(f"The news articles that User #{user_id} clicked before are as follows:\n")
        for idx, hid in enumerate(history_ids, start=1):
            lines.append(f"{idx}. {_describe(hid)}")
        lines.append("")  # blank separator line
        return lines

    # with_negative
    lines = ["[News of Interest to the user]\n"]
    for idx, hid in enumerate(history_ids, start=1):
        neg_ids = sample_negatives(
            positive_news_id=hid,
            user_id=user_id,
            user_history_set=user_history_set,
            global_news_set=global_news_set,
            user_ns_count_map=user_ns_count_map,
            news_map=news_map,
            category_ns=category_ns,
            num_negatives=4
        )

        combined = neg_ids + [hid]
        random.shuffle(combined)

        # The sampler may return fewer than 4 negatives, so report the number
        # of articles actually listed instead of a hard-coded 5.
        lines.append(
            f"{idx}) User #{user_id} prefers most {_describe(hid)} "
            f"among the following {len(combined)} articles:"
        )
        for i_c, cid in enumerate(combined, start=1):
            lines.append(f"{i_c}: {_describe(cid)}")
        lines.append("")  # blank separator line

    return lines

############################################
# 4) Question generator
############################################

def generate_question_block(
    user_id,
    question_str,
    news_map,
    category_use=True,
    subcategory_use=True,
    question_idx=1,
    method="top1"
):
    """
    Render one question (candidate list) of a user's prompt.

    ``question_str`` holds the candidate news IDs separated by whitespace; the
    first ID is treated as the positive. The candidates are shuffled and the
    1-based position of the positive after shuffling is reported.

    ``method`` selects the header wording ("ranking", "origin", anything else
    falls back to the "top1" wording). ``category_use`` / ``subcategory_use``
    select the tag printed after every title.

    Returns:
        (lines, pos_index): the prompt lines and the 1-based answer position,
        or ``([], None)`` when ``question_str`` contains no IDs.
    """
    candidates = parse_question_list(question_str)
    if not candidates:
        return [], None

    # Remember the positive before shuffling, then locate it afterwards.
    pos_id = candidates[0]
    random.shuffle(candidates)
    pos_index = candidates.index(pos_id) + 1

    if method == "origin":
        header = f"Question {question_idx}) [Origin] Which news does User #{user_id} like best?"
    elif method == "ranking":
        header = f"Question {question_idx}) [Ranking] User #{user_id} prefers most [MASK] among these articles."
    else:
        # top1 (default)
        header = f"Question {question_idx}) User #{user_id} prefers most [MASK] among the following {len(candidates)} articles:"

    lines = [header]
    for position, cid in enumerate(candidates, start=1):
        info = news_map.get(cid, {})
        title = info.get("Title", f"(No Title:{cid})")
        ccat = info.get("Category", "unknown")
        csub = info.get("SubCategory", "unknown")

        # Pick the category/subcategory tag according to the two display flags.
        if not (category_use or subcategory_use):
            tag = ""
        elif not subcategory_use:
            tag = f"[category : {ccat}]"
        elif not category_use:
            tag = f"[subcategory : {csub}]"
        else:
            tag = f"[category : {ccat} | subcategory : {csub}]"

        lines.append(f"{position}: {title} {tag}" if tag else f"{position}: {title}")

    lines.append("")  # blank separator line
    return lines, pos_index

############################################
# 5) Main function
############################################

def create_prompts_MIND(
    user_df,
    news_df,
    output_folder="prompt_MIND",
    model_name="gpt-4",
    max_history=None,
    max_question=None,
    start_user=None,
    max_user=None,
    category_use=True,
    subcategory_use=True,
    category_ns="category",   # "category" or "subcategory"
    purpose="with_negative",  # "only_positive" or "with_negative"
    method="top1",            # "top1", "ranking", "origin", etc.
    seed=None                 # optional RNG seed for repeatable shuffling/sampling
):
    """
    Write one prompt file per user plus token statistics and the answer key.

    (1) History part
        - only_positive: the history is listed as-is.
        - with_negative: negatives are sampled for every history article
          (the positive's position there is NOT written to hidden_positions).
    (2) Questions part
        - each ``Question`` row already holds 1 positive + negatives;
          they are shuffled and the positive's 1-based position is recorded.
        - ``method`` only changes the wording of the question header.
    (3) Negative sampling (``category_ns``)
        - "category": categories the user read 2+ times are excluded.
        - "subcategory": subcategories the user read 2+ times are excluded.
    (4) ``category_use`` / ``subcategory_use`` decide which tag is printed.
    (5) ``metadata/hidden_positions.txt`` holds only the question answers.
    (6) History IDs are de-duplicated while keeping first-seen order, so
        ``max_history`` keeps the LAST n entries of the ordered history.

    Parameters:
        user_df: frame with ``User``, ``History`` and ``Question`` columns.
        news_df: frame read positionally as
            (news id, category, subcategory, title[, abstract]).
        output_folder: destination folder; existing ``*.txt`` files in it
            (and in its ``metadata`` sub-folder) are removed first.
        model_name: model whose tiktoken encoding is used to count tokens.
        max_history: keep only the last n history items per user (None = all).
        max_question: keep only the first n question rows per user (None = all).
        start_user / max_user: inclusive bounds on the ``User`` value.
        seed: when given, ``random.seed(seed)`` is called once up front.

    Side effects:
        Creates/cleans ``output_folder`` and ``output_folder/metadata``,
        writes ``U<user>.txt`` prompts, ``output_metadata.txt`` and
        ``hidden_positions.txt``, and prints two [INFO] lines.
    """
    encoding = tiktoken.encoding_for_model(model_name)

    if seed is not None:
        random.seed(seed)

    # Inclusive user-id range filter.
    if start_user is not None:
        user_df = user_df[user_df["User"] >= start_user]
    if max_user is not None:
        user_df = user_df[user_df["User"] <= max_user]

    # News lookup table; columns are read by position, not by name.
    news_map = {}
    for row in news_df.itertuples(index=False):
        news_map[row[0]] = {
            "Category": row[1],
            "SubCategory": row[2],
            "Title": row[3],
            "Abstract": row[4] if len(row) > 4 else ""
        }

    # 1) Per-user history.
    # (a) Collect the history IDs of every row of a user.
    collected_history = defaultdict(list)
    for row in user_df.itertuples(index=False):
        collected_history[row.User].extend(parse_history_list(row.History))

    # (b) De-duplicate. dict.fromkeys keeps first-seen order, which a plain
    #     set() would destroy (making "last n" meaningless and the prompt
    #     order vary between interpreter sessions).
    user_history_ordered = {
        uid: list(dict.fromkeys(ids)) for uid, ids in collected_history.items()
    }
    user_history_set = {uid: set(ids) for uid, ids in user_history_ordered.items()}

    # (c) How often each user read each category / subcategory.
    ns_field = "SubCategory" if category_ns == "subcategory" else "Category"
    user_ns_count_map = defaultdict(Counter)  # user -> Counter({ns_val: count})
    for uid, hist_ids in user_history_ordered.items():
        ns_counter = Counter()
        for nid in hist_ids:
            ns_val = news_map.get(nid, {}).get(ns_field, None)
            if ns_val:
                ns_counter[ns_val] += 1
        user_ns_count_map[uid] = ns_counter

    # Candidate pool for negatives: every news id seen in any selected user's
    # history. Sorted so that sampling is repeatable for a fixed seed.
    global_news_pool = sorted(set().union(*user_history_set.values()))

    # Output folder: create it, then drop stale prompts from earlier runs.
    # glob.escape is required because folder names such as "[MIND]_..." would
    # otherwise be interpreted as glob character classes and match nothing.
    os.makedirs(output_folder, exist_ok=True)
    for stale_path in glob.glob(os.path.join(glob.escape(output_folder), "*.txt")):
        os.remove(stale_path)

    token_stats_list = []
    hidden_positions_data = []  # question answer positions only

    for user_id, group in user_df.groupby("User"):
        hist_list = user_history_ordered[user_id]

        # Keep only the most recent max_history items (explicit start index so
        # that max_history=0 yields an empty history instead of the full one).
        if max_history is not None and len(hist_list) > max_history:
            hist_list = hist_list[len(hist_list) - max(max_history, 0):]

        # 1) History part
        demo_lines = generate_demonstrations(
            user_id=user_id,
            history_ids=hist_list,
            user_history_set=user_history_set,
            global_news_set=global_news_pool,
            news_map=news_map,
            user_ns_count_map=user_ns_count_map,
            purpose=purpose,
            category_use=category_use,
            subcategory_use=subcategory_use,
            category_ns=category_ns
        )

        # 2) Questions part
        row_records = group.to_dict("records")
        if max_question is not None and len(row_records) > max_question:
            row_records = row_records[:max_question]

        q_lines = [
            "[Questions]",
            f"Based on User #{user_id}'s preferences, predict the index number of the news article "
            f"that best fits the position labeled [MASK] for each question.\n",
        ]

        user_q_positions = []
        for idx_q, record in enumerate(row_records, start=1):
            block_lines, pos_idx = generate_question_block(
                user_id=user_id,
                question_str=record["Question"],
                news_map=news_map,
                category_use=category_use,
                subcategory_use=subcategory_use,
                question_idx=idx_q,
                method=method
            )
            q_lines.extend(block_lines)
            if pos_idx is not None:
                user_q_positions.append(pos_idx)

        q_lines.append(
            f"Please provide just the answers to each of User #{user_id}'s question without any explanations."
        )

        final_prompt = "\n".join(list(demo_lines) + q_lines)

        token_count = len(encoding.encode(final_prompt))
        question_count = len(row_records)

        out_path = os.path.join(output_folder, f"U{user_id}.txt")
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(final_prompt)

        # The question count is stored twice: once as the "Output Tokens"
        # figure and once as the number of questions.
        token_stats_list.append(
            (user_id, token_count, question_count, len(hist_list), question_count)
        )
        hidden_positions_data.append((user_id, user_q_positions))

    # -------------------------
    # Metadata
    # -------------------------
    meta_folder_path = os.path.join(output_folder, "metadata")
    os.makedirs(meta_folder_path, exist_ok=True)
    for stale_path in glob.glob(os.path.join(glob.escape(meta_folder_path), "*.txt")):
        os.remove(stale_path)

    # output_metadata.txt
    meta_file_path = os.path.join(meta_folder_path, "output_metadata.txt")
    total_tokens = 0
    total_q = 0

    with open(meta_file_path, "w", encoding="utf-8") as mf:
        for (uid, tcount, qc, hnum, qnum) in sorted(token_stats_list, key=lambda x: x[0]):
            line = (
                f"User ID: U{uid:<5}  "
                f"Input Tokens: {tcount:<6}  "
                f"Output Tokens: {qc:<4}  "
                f"History 수: {hnum:<3}  "
                f"Question 수: {qnum}"
            )
            total_tokens += tcount
            total_q += qc
            mf.write(line + "\n")
        mf.write("\n")
        mf.write(f"Total Input Tokens : {total_tokens}\n")
        mf.write(f"Total Output Tokens : {total_q}\n")

    # hidden_positions.txt
    hidden_path = os.path.join(meta_folder_path, "hidden_positions.txt")
    with open(hidden_path, "w", encoding="utf-8") as hf:
        for uid, pos_list in hidden_positions_data:
            hf.write(f"U{uid:<5}: {pos_list}\n")

    print(f"[INFO] Prompts created in: {output_folder}")
    print(f"[INFO] Metadata saved at: {meta_folder_path}")


## 실행
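[create_prompts_MIND 함수 인자]
- purpose = 어떤 목적으로 prompt를 생성할 것인지  [only_positive / with_negative]   
- model_name = 사용할 gpt (token 계산 용도)  [gpt-4o-mini / gpt-3.5-turbo]
- start_user / max_user = 어떤 범위의 user prompt를 생성할 것인지 (start_user <= User <= max_user 인 사용자만 처리, User 값이 숫자라고 가정)
- max_question = 사용자당 최대 질문 수 (Question에서 앞의 n개 행만 사용)
- output_folder = 결과를 저장할 폴더 이름
- max_history = History에서 최근 n개만 사용
- category_use / subcategory_use = prompt에 category / subcategory를 표시할지 여부
- category_ns = negative sampling 시 제외 기준  [category / subcategory]
- method : top1 / ranking / origin 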

In [22]:
# Seed applied before each run so that question shuffling and negative sampling
# start from the same RNG state every time this cell is executed.
# NOTE(review): full repeatability also requires a stable iteration order inside
# create_prompts_MIND and identical input data.
PROMPT_SEED = 42

# Arguments shared by both experiments.
shared_prompt_args = dict(
    user_df=user,
    news_df=NNR_news,
    model_name="gpt-4o-mini",
    max_history=None,
    max_question=None,
    start_user=1,
    max_user=1000,
    category_use=False,         # category is NOT printed
    subcategory_use=True,       # only the subcategory tag is printed
    category_ns="subcategory",  # negatives exclude subcategories read 2+ times
    method="top1",
)

# 1) Only Positive
random.seed(PROMPT_SEED)
create_prompts_MIND(
    output_folder="[MIND]_onlyPos(subcate, ns-subcate)",
    purpose="only_positive",
    **shared_prompt_args,
)

# 2) With Negative
random.seed(PROMPT_SEED)
create_prompts_MIND(
    output_folder="[MIND]_withNeg(subcate, ns-subcate)",
    purpose="with_negative",
    **shared_prompt_args,
)

[INFO] Prompts created in: [MIND]_onlyPos(subcate, ns-subcate)
[INFO] Metadata saved at: [MIND]_onlyPos(subcate, ns-subcate)\metadata
[INFO] Prompts created in: [MIND]_withNeg(subcate, ns-subcate)
[INFO] Metadata saved at: [MIND]_withNeg(subcate, ns-subcate)\metadata


## ---

In [10]:
def get_category_counts(user_df: pd.DataFrame, news_df: pd.DataFrame, user_id: int):
    """
    Count the Category and SubCategory values of the news in one user's History.

    The History string may be either
    - whitespace-separated news IDs ("N1 N2 N3"), or
    - ";"-separated entries whose first ","-separated field is the news ID
      ("N1,<extra>;N2,<extra>").
    Only the first row of the user is read; the History value is taken to be
    the same on every row of a user.

    Parameters:
    - user_df (pd.DataFrame): frame with "User" and "History" columns
    - news_df (pd.DataFrame): frame with "News ID", "Category", "SubCategory"
    - user_id (int): user to look up

    Returns:
    - category_counts (pd.Series): count per Category (descending)
    - subcategory_counts (pd.Series): count per SubCategory (descending)
    Both are None (and a message is printed) when the user has no rows.
    Both are empty when the user's History is missing or matches no news.
    """
    user_history_series = user_df.loc[user_df["User"] == user_id, "History"]

    # Unknown user: report and bail out.
    if user_history_series.empty:
        print(f"User {user_id}에 대한 데이터가 없습니다.")
        return None, None

    user_history = user_history_series.values[0]

    if isinstance(user_history, str):
        # ";" marks the "id,extra;id,extra" layout; otherwise the IDs are
        # simply separated by whitespace.
        entries = user_history.split(";") if ";" in user_history else user_history.split()
        news_ids = [entry.split(",")[0].strip() for entry in entries]
        news_ids = [nid for nid in news_ids if nid]
    else:
        # Missing history (e.g. NaN): nothing to count.
        news_ids = []

    filtered_news = news_df[news_df["News ID"].isin(news_ids)]

    # value_counts() already sorts in descending order of frequency.
    category_counts = filtered_news["Category"].value_counts()
    subcategory_counts = filtered_news["SubCategory"].value_counts()

    return category_counts, subcategory_counts


In [11]:
# Example: category / subcategory distribution of user 1000's history.
category_counts, subcategory_counts = get_category_counts(
    user_df=user,
    news_df=NNR_news,
    user_id=1000,
)

# Show both distributions.
print("Category 개수:\n", category_counts)
print("\nSubCategory 개수:\n", subcategory_counts)

Category 개수:
 Category
nyheter     32
100sport    15
pluss        5
meninger     1
Name: count, dtype: int64

SubCategory 개수:
 SubCategory
sortrondelag     14
trondheim        10
vintersport       6
fotball           6
nordtrondelag     5
okonomi           3
magasin           2
sprek             1
ballsport         1
andreidretter     1
utenriks          1
kultur            1
kronikker         1
nyheter           1
Name: count, dtype: int64
