In [None]:
import os
import glob
import random
import pandas as pd
import datetime
import tiktoken
from collections import Counter, defaultdict
from IPython.display import display, Markdown

# Load the per-user table. Author's note: the History column is the History and
# Train columns merged together (presumably done upstream; not visible here).
file_name = "MIND_LLM.tsv"
user_df = pd.read_csv(
    f'../../../data/LLM/{file_name}',
    sep='\t',
    names=['User', 'History', 'Question'],
)

# Load the news table and keep only the ID, category, subcategory, title and
# abstract columns.
news_df = pd.read_csv(
    '../../../data/baseline/MIND_news.tsv',
    sep='\t',
    names=['News ID', 'Category', 'SubCategory', 'Title', 'Abstract',
           'URL', 'Title Entities', 'Abstract Entities'],
).drop(columns=['URL', 'Title Entities', 'Abstract Entities'])

In [None]:
# Show each loaded frame under its own Markdown heading.
# display(user[user['User']==1]['Question'][0])
for heading, frame in (
    ("### 1. user<hr/>", user_df),
    ("### 2. NNR_news<hr/>", news_df),
):
    display(Markdown(heading))
    display(frame)

In [None]:
start_user = 1
max_user = 5

# 1) Restrict user_df to the inclusive user-ID range [start_user, max_user];
#    a bound set to None is not applied.
if start_user is not None:
    user_df = user_df.loc[user_df["User"].ge(start_user)]
if max_user is not None:
    user_df = user_df.loc[user_df["User"].le(max_user)]

In [None]:
# Notebook inspection: echo the current user_df.
user_df

In [None]:
# 2) News ID -> metadata dict (news_map)
news_map = {
    news_id: {
        "Category": category,
        "SubCategory": subcategory,
        "Title": title,
        # Abstract is the fifth field; fall back to "" when the row has none.
        "Abstract": extra[0] if extra else "",
    }
    for news_id, category, subcategory, title, *extra
    in news_df.itertuples(index=False)
}

In [None]:
# 3) Per-user history (insertion order kept & duplicates removed)
user_history_ordered_temp = defaultdict(list)
user_history_temp_set = defaultdict(set)  # used only for duplicate checks

In [None]:
def parse_history_list(history_str):
    """Split a whitespace-separated History string into a list of tokens.

    Any value that is not a ``str`` yields an empty list.
    """
    if isinstance(history_str, str):
        # str.split() with no separator already trims whitespace and never
        # produces empty tokens.
        return history_str.split()
    return []


def parse_question_list(question_str):
    """Split a whitespace-separated Question string (answer included) into tokens.

    Any value that is not a ``str`` yields an empty list.
    """
    if isinstance(question_str, str):
        # str.split() with no separator already trims whitespace and never
        # produces empty tokens.
        return question_str.split()
    return []

In [None]:
# Store each user's news history (click log) without duplicates, keeping the
# order of first appearance.
for row in user_df.itertuples(index=False):
    uid = row.User
    hist_list = parse_history_list(row.History)
    if not hist_list:
        # Nothing to record; no entry is created for this user.
        continue
    seen_ids = user_history_temp_set[uid]
    ordered_ids = user_history_ordered_temp[uid]
    for h in hist_list:
        if h in seen_ids:
            continue
        ordered_ids.append(h)
        seen_ids.add(h)

In [None]:
# Notebook inspection: echo the ordered per-user history lists.
user_history_ordered_temp

In [None]:
# Notebook inspection: echo the per-user sets used for duplicate checks.
user_history_temp_set

In [None]:
# Convert to plain dicts: the ordered lists as they are, plus a set view per
# user for membership tests.
user_history_ordered = dict(user_history_ordered_temp)
user_history_set = {
    user: set(news_ids) for user, news_ids in user_history_ordered.items()
}

In [None]:
# Notebook inspection: echo the plain-dict version of the ordered histories.
user_history_ordered

In [None]:
# Notebook inspection: check the container type of user_history_set.
type(user_history_set)

In [None]:
category_ns = "subcategory"

# 3-1) Per-user click counts by category / subcategory
# Which metadata field is counted depends on category_ns.
ns_field = "SubCategory" if category_ns == "subcategory" else "Category"
user_ns_count_map = defaultdict(Counter)
for uid, hist_ids in user_history_set.items():
    labels = (news_map.get(nid, {}).get(ns_field, None) for nid in hist_ids)
    # Unknown news IDs and missing/empty labels are not counted.
    user_ns_count_map[uid] = Counter(label for label in labels if label)

In [None]:
# Notebook inspection: echo the per-user category/subcategory click counts.
user_ns_count_map

In [None]:
# Notebook inspection: look up the metadata stored for one news ID ({} if absent).
news_map.get('N19639', {})

In [None]:
# Global news pool (negative-sampling candidates): every news ID that appears
# in any user's history set.
global_news_set = {
    news_id
    for clicked_ids in user_history_set.values()
    for news_id in clicked_ids
}

In [None]:
# Notebook inspection: echo the global negative-candidate pool.
global_news_set

In [None]:
def sample_negatives(
    positive_news_id,
    user_id,
    user_history_set,
    global_news_set,
    user_ns_count_map,
    news_map,
    category_ns="category",  # "category" or "subcategory"
    num_negatives=4
):
    """Sample negative (non-clicked) news IDs for one user.

    A news ID from ``global_news_set`` is a candidate when all of these hold:

    * it is neither in the user's history nor equal to ``positive_news_id``;
    * ``news_map`` has a (sub)category value for it;
    * the user has clicked that (sub)category fewer than 2 times according to
      ``user_ns_count_map`` (keeps negatives away from the user's frequent
      topics).

    Args:
        positive_news_id: The clicked article the negatives are paired with.
        user_id: User key; also used as the RNG seed.
        user_history_set: Mapping ``user_id -> collection of clicked news IDs``.
        global_news_set: Iterable of all candidate news IDs.
        user_ns_count_map: Mapping ``user_id -> {(sub)category: click count}``.
        news_map: Mapping ``news_id -> {"Category": ..., "SubCategory": ...}``.
        category_ns: ``"subcategory"`` filters on ``SubCategory``; any other
            value filters on ``Category``.
        num_negatives: Maximum number of IDs to return.

    Returns:
        A list of at most ``num_negatives`` news IDs; shorter when there are
        not enough candidates.
    """
    # 1) Exclude everything the user already clicked, plus the positive itself.
    #    Users missing from the mapping are treated as having no history.
    excluded = set(user_history_set.get(user_id, ())) | {positive_news_id}
    ns_counts = user_ns_count_map.get(user_id, {})
    ns_field = "SubCategory" if category_ns == "subcategory" else "Category"

    candidates = []
    for news_id in global_news_set:
        if news_id in excluded:
            continue

        ns_val = news_map.get(news_id, {}).get(ns_field)
        if ns_val is None:
            continue

        # 2) Skip (sub)categories the user has already clicked 2+ times.
        if ns_counts.get(ns_val, 0) >= 2:
            continue

        candidates.append(news_id)

    # Set iteration order for strings differs between interpreter sessions
    # (hash randomization), so fix the order before the seeded draw; otherwise
    # the same seed would pick different articles on every run.
    candidates.sort(key=str)

    # A private generator keeps the draw reproducible without touching (or
    # having to restore) the global ``random`` state.
    # NOTE(review): the seed depends only on user_id, so every positive of the
    # same user receives the same negatives -- confirm this is intended.
    rng = random.Random(user_id)

    # Return as many as are available when the pool is too small.
    return rng.sample(candidates, min(len(candidates), num_negatives))


def _format_ns_suffix(category, subcategory, category_use, subcategory_use):
    """Return the bracketed category/subcategory tag, or "" when both flags are off."""
    parts = []
    if category_use:
        parts.append(f"category : {category}")
    if subcategory_use:
        parts.append(f"subcategory : {subcategory}")
    return f"[{' | '.join(parts)}]" if parts else ""


def _news_title_and_suffix(news_map, news_id, category_use, subcategory_use):
    """Return ``(title, suffix)`` for a news ID, with placeholders for missing fields."""
    info = news_map.get(news_id, {})
    title = info.get("Title", f"(No Title:{news_id})")
    suffix = _format_ns_suffix(
        info.get("Category", "unknown"),
        info.get("SubCategory", "unknown"),
        category_use,
        subcategory_use,
    )
    return title, suffix


def generate_demonstrations(
    user_id,
    history_ids,
    user_history_set,
    global_news_set,
    news_map,
    user_ns_count_map,
    purpose="with_negative",
    category_use=True,
    subcategory_use=True,
    category_ns="category"
):
    """Render a user's click history as natural-language prompt lines.

    Args:
        user_id: User key, printed in the text and forwarded to
            ``sample_negatives``.
        history_ids: Ordered news IDs the user clicked.
        user_history_set: Forwarded to ``sample_negatives``.
        global_news_set: Forwarded to ``sample_negatives``.
        news_map: Mapping ``news_id -> {"Title", "Category", "SubCategory"}``;
            missing entries are rendered with placeholder text.
        user_ns_count_map: Forwarded to ``sample_negatives``.
        purpose: ``"only_positive"`` lists the clicked titles only. Any other
            value produces the ``"with_negative"`` layout, where every clicked
            article is shown among sampled negatives.
        category_use: Include the category in each article's tag.
        subcategory_use: Include the subcategory in each article's tag.
        category_ns: Forwarded to ``sample_negatives``.

    Returns:
        A list of text lines; callers join them with newlines.
    """
    if purpose == "only_positive":
        # Plain numbered list of the clicked articles.
        lines = ["[Click History]",
                 f"User #{user_id}가 과거에 클릭한 뉴스 목록은 다음과 같습니다:\n"]
        for idx, hid in enumerate(history_ids, start=1):
            title, suffix = _news_title_and_suffix(
                news_map, hid, category_use, subcategory_use
            )
            lines.append(f"{idx}. {title} {suffix}" if suffix else f"{idx}. {title}")
        lines.append("")
        return lines

    # ---------------- "with_negative" mode ----------------
    lines = ["[News of Interest to the user]\n"]

    for idx, hid in enumerate(history_ids, start=1):
        pos_title, pos_suffix = _news_title_and_suffix(
            news_map, hid, category_use, subcategory_use
        )

        # Draw up to 4 negatives for this positive.
        neg_ids = sample_negatives(
            positive_news_id=hid,
            user_id=user_id,
            user_history_set=user_history_set,
            global_news_set=global_news_set,
            user_ns_count_map=user_ns_count_map,
            news_map=news_map,
            category_ns=category_ns,
            num_negatives=4,
        )

        combined = list(neg_ids) + [hid]
        # Hide the positive's position. This uses the global RNG, so the order
        # changes between runs unless the caller seeds ``random``.
        random.shuffle(combined)

        # Build the lead sentence without a stray double space when no tag is
        # shown, and report the real list length (fewer than 5 when the
        # negative pool is small).
        pos_text = f"{pos_title} {pos_suffix}" if pos_suffix else pos_title
        lines.append(
            f"{idx}) User #{user_id} prefers most {pos_text} "
            f"among the following {len(combined)} articles:"
        )

        # The positive mixed with its negatives; each entry gets the same tag
        # format as the lead sentence.
        for i_c, cid in enumerate(combined, start=1):
            ctitle, suffix = _news_title_and_suffix(
                news_map, cid, category_use, subcategory_use
            )
            lines.append(f"{i_c}: {ctitle} {suffix}" if suffix else f"{i_c}: {ctitle}")
        lines.append("")

    return lines

In [None]:
# Settings consumed by the demonstration loops in the following cells.
max_history = None  # None keeps the whole history; an int keeps only the last N items
purpose = 'with_negative'  # generate_demonstrations also handles 'only_positive'
category_use =False  # whether article tags include the category
subcategory_use=True  # whether article tags include the subcategory
grouped = user_df.groupby("User")  # iterated below as (user_id, rows) pairs

In [71]:

for user_id, group in grouped:
    # A) Prepare the history list. Users whose History parsed to nothing have
    #    no entry in user_history_ordered, so fall back to an empty list
    #    instead of raising KeyError.
    hist_list = user_history_ordered.get(user_id, [])
    if max_history is not None and len(hist_list) > max_history:
        # Keep only the last max_history items (the most recent ones, per the
        # author's note). An explicit start index makes max_history=0 yield an
        # empty list; hist_list[-0:] would return the whole list.
        hist_list = hist_list[len(hist_list) - max_history:]

    # 1) Build the demonstration block
    demo_lines = generate_demonstrations(
        user_id=user_id,
        history_ids=hist_list,
        user_history_set=user_history_set,      # per-user history sets (deduplicated)
        global_news_set=global_news_set,        # news pool (everything clicked at least once by the retained users)
        news_map=news_map,                      # news metadata (dict)
        user_ns_count_map=user_ns_count_map,    # category (or subcategory) counts of each user's history
        purpose=purpose,
        category_use=category_use,
        subcategory_use=subcategory_use,
        category_ns=category_ns,
    )
    demonstration_text = "\n".join(demo_lines)

    print(f'uid : {user_id}')
    print(demonstration_text)

uid : 1
[News of Interest to the user]

1) User #1 prefers most 2020 Chevrolet Corvette C8 vs. C8.R: Here's How the Race Car is Different [subcategory : autossports] among the following 5 articles:
1: Robert Evans, 'Chinatown' Producer and Paramount Chief, Dies at 89 [subcategory : movienews]
2: Tom Draper, Black Music Industry Pioneer, Dies at 79 [subcategory : musicnews]
3: The Water on These Airlines Is so Bad You Shouldn't Even Wash Your Hands With It, Study Finds [subcategory : travelnews]
4: This is the best exercise to pump up aging muscles [subcategory : fitness]
5: 2020 Chevrolet Corvette C8 vs. C8.R: Here's How the Race Car is Different [subcategory : autossports]

2) User #1 prefers most 'Joker' Continues to Smash Box Office Expectations in 2nd Weekend [subcategory : movienews] among the following 5 articles:
1: The Water on These Airlines Is so Bad You Shouldn't Even Wash Your Hands With It, Study Finds [subcategory : travelnews]
2: Tom Draper, Black Music Industry Pioneer,

In [None]:
for user_id, group in grouped:
    # A) Prepare the history list. Users whose History parsed to nothing have
    #    no entry in user_history_ordered, so fall back to an empty list
    #    instead of raising KeyError.
    hist_list = user_history_ordered.get(user_id, [])
    if max_history is not None and len(hist_list) > max_history:
        # Keep only the last max_history items. An explicit start index makes
        # max_history=0 yield an empty list; hist_list[-0:] would return the
        # whole list.
        hist_list = hist_list[len(hist_list) - max_history:]
    print(hist_list)

In [None]:
# Notebook inspection: check the container type of user_history_set.
type(user_history_set)

In [7]:
def split_into_chunks(seq, n):
    """Split *seq* into at most *n* contiguous, near-equal chunks.

    Chunk sizes differ by at most one, and the leftover elements go to the
    leading chunks. When *n* exceeds ``len(seq)``, one chunk per element is
    produced.

    Args:
        seq: A sliceable sequence (list, tuple, str, ...).
        n: Requested number of chunks; must be at least 1.

    Returns:
        A list of slices of *seq*; ``[]`` when *seq* is empty.

    Raises:
        ValueError: If *n* is smaller than 1 for a non-empty *seq*.
    """
    total = len(seq)
    if total == 0:
        # Nothing to split; also avoids dividing by a chunk count of zero.
        return []
    if n < 1:
        raise ValueError("n must be a positive integer")

    n = min(n, total)
    base, rem = divmod(total, n)

    out, idx = [], 0
    for i in range(n):
        # The first `rem` chunks absorb one extra element each.
        size = base + 1 if i < rem else base
        out.append(seq[idx:idx + size])
        idx += size
    return out

# Example: split 11 IDs into 5 chunks.
seq = ['N58584', 'N12900', 'N6233', 'N51706', 'N18777', 'N593', 'N16793', 'N846', 'N18285', 'N46846', 'N53436']

split_into_chunks(seq, 5)

[['N58584', 'N12900', 'N6233'],
 ['N51706', 'N18777'],
 ['N593', 'N16793'],
 ['N846', 'N18285'],
 ['N46846', 'N53436']]

In [16]:
def split_into_chunks(seq, n, min_num=1):
    """Split *seq* into up to *n* contiguous chunks of at least *min_num* items.

    The chunk count is reduced below *n* when *seq* is too short to give every
    chunk *min_num* items. If *seq* is shorter than *min_num*, it is returned
    as a single chunk. Sizes differ by at most one, with the leftover elements
    going to the leading chunks.

    Args:
        seq: A sliceable sequence (list, tuple, str, ...).
        n: Requested number of chunks; must be at least 1.
        min_num: Minimum chunk length; values below 1 are treated as 1.

    Returns:
        A list of slices of *seq*; ``[]`` when *seq* is empty.

    Raises:
        ValueError: If *n* is smaller than 1 for a non-empty *seq*.
    """
    if not seq:
        return []
    if n < 1:
        raise ValueError("n must be a positive integer")

    # A minimum below 1 imposes no constraint (and 0 would divide by zero).
    min_num = max(min_num, 1)

    # Never use more chunks than can each hold min_num items, but always
    # produce at least one chunk.
    n = max(1, min(n, len(seq) // min_num))

    base, rem = divmod(len(seq), n)

    out, idx = [], 0
    for i in range(n):
        # The first `rem` chunks absorb one extra element each.
        size = base + 1 if i < rem else base
        out.append(seq[idx:idx + size])
        idx += size

    return out

# Example: 14 IDs, 5 chunks requested, at least 3 items per chunk.
seq = ['N1', 'N2', 'N3', 'N4', 'N5', 'N6', 'N7', 'N8', 'N9', 'N10', 'N11', 'N12', 'N13', 'N14']
print(split_into_chunks(seq, 5, min_num=3))

[['N1', 'N2', 'N3', 'N4'], ['N5', 'N6', 'N7', 'N8'], ['N9', 'N10', 'N11'], ['N12', 'N13', 'N14']]
