In [1]:
import pandas as pd
import numpy as np
import os
import random
from datetime import timedelta
from IPython.display import display, Markdown

# Original Data 로드

In [2]:
# Per-user click logs. Column names are supplied explicitly via `names`.
filtered_users = pd.read_csv(
    'experiment_data/baseline/user.tsv',
    sep='\t',
    names=['User', 'History', 'Train', 'Test'],
)

# Raw news table. Column names are supplied explicitly via `names`.
news_total_raw = pd.read_csv(
    'experiment_data/baseline/total_news(raw).tsv',
    sep='\t',
    names=[
        'News ID', 'Category', 'Sub-Category', 'Title',
        'Body', 'ID', 'Publish', 'Click time history',
    ],
)

In [3]:
# Parse the 'Publish' column into datetimes (overwrites the column on the raw frame).
news_total_raw['Publish'] = pd.to_datetime(news_total_raw['Publish'])

# Bounds of the publication window to keep; both ends are inclusive.
start_date, end_date = '2017-02-05 08:00:00', '2017-02-12 08:00:00'

# Keep only the rows published inside [start_date, end_date].
filtered_news = news_total_raw.loc[
    news_total_raw['Publish'].between(start_date, end_date)
]

filtered_news

Unnamed: 0,News ID,Category,Sub-Category,Title,Body,ID,Publish,Click time history
20594,N20920,100sport,fotball,– Det er ikke lenger rakettforskning hva som m...,Saken oppdateres. Liverpool har bare vunnet 1 ...,19a34660c90eb82b374968b10266031803566c60,2017-02-05 08:37:58,2017-02-12 22:53:29
20598,N20924,nyheter,innenriks,Mann fløyet til sykehus etter scooterulykke i ...,Saken oppdateres. Ulykken skjedde ved Gvepsebo...,9d3f899d5004938b2d111067af45daf6aad23bc9,2017-02-05 08:35:23,"2017-02-05 18:09:35,2017-02-05 18:13:02,2017-0..."
20607,N20933,nyheter,utenriks,Domstol avviser Trumps krav om å gjeninnføre i...,Saken oppdateres. Da anken ble levert til en d...,3981470ff3a9c109727dfba0a00227b02e2a79f9,2017-02-05 09:16:49,"2017-02-05 18:37:45,2017-02-05 18:40:55,2017-0..."
20622,N20948,pluss,magasin,Nidaorsdomen har fått nytt alter,Saken oppdateres. Kanskje først og fremst på g...,bbdffe5b3d84fa37ddc52ceb6b2ddb6a8b91a1dd,2017-02-05 10:54:37,"2017-02-13 17:18:36,2017-02-15 06:39:14"
20636,N20962,100sport,vintersport,Varden Meråker tok NM-gull på kvinnestafetten:...,Saken oppdateres. Varden Meråker med Anniken J...,1115dd9322148c1796518ae12ef90d7845c7b7af,2017-02-05 12:09:10,"2017-02-05 21:28:34,2017-02-05 21:35:48,2017-0..."
...,...,...,...,...,...,...,...,...
23252,N23620,nyheter,trondheim,Snapchat: God farsdag!,Saken oppdateres. Søndag morgen får brukere av...,35b769b697573ebc285a94b07d1ba465c3b562a5,2017-02-12 07:33:06,"2017-02-15 08:20:14,2017-02-15 09:51:00,2017-0..."
23298,N23668,100sport,fotball,"– Spalvis er ikke skadefri, og det vil ta noe ...",Saken oppdateres. Det har gått åtte dager side...,e2a641c2e35153843ac85320a09a18dca8e1fa37,2017-02-10 10:24:34,"2017-02-15 19:14:30,2017-02-16 14:50:47,2017-0..."
23428,N23798,100sport,fotball,Mike Jensen takker FCK-fansen for pipekonserten,Saken oppdateres. Lørdag sikret Rosenborg seg ...,6daf49aa6cd75b749d51995933d4f12151470257,2017-02-12 07:04:42,"2017-02-16 08:05:16,2017-02-16 08:56:50,2017-0..."
23481,N23854,nyheter,sortrondelag,Røykutvikling i bolig på Fannrem,Saken oppdateres. Politiet i Sør-Trøndelag var...,22ce4fd1c9a6cecc754c20838d78ae0933c77976,2017-02-11 08:22:40,"2017-02-16 19:34:21,2017-02-18 06:10:14"


# behaviors 생성 함수 (아래 두 번째 셀의 정의를 사용)

In [6]:
def parse_news_datetime_string(s: str):
    """
    Parse a ';'-separated list of "newsID,timestamp" entries, e.g.
    'N2,2017-01-15 08:00:03;N2445,2017-01-16 00:17:07;...'.

    Returns a tuple (news_ids, joined):
      - news_ids: list of the news IDs, in input order
      - joined:   the kept entries re-joined as "newsID,timestamp" with ';'

    Entries that do not split into exactly two comma-separated fields are
    skipped. A non-string input yields ([], "").
    """
    if not isinstance(s, str):
        return [], ""

    # Split every token on ',' up front; only two-field tokens are well-formed.
    fields_per_token = (token.strip().split(',') for token in s.split(';'))
    pairs = [
        (fields[0].strip(), fields[1].strip())
        for fields in fields_per_token
        if len(fields) == 2
    ]

    news_ids = [news_id for news_id, _ in pairs]
    joined = ";".join(f"{news_id},{clicked_at}" for news_id, clicked_at in pairs)
    return news_ids, joined


def transform_test_df(test_df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert a test frame with columns (user, clicked_news, candidate_news,
    clicked) into the layout written to the baseline test file.

    - user:           prefixed with 'U' (e.g. 1 -> 'U1')
    - candidate_news: each candidate ID is paired with its click label,
                      giving "N123-1 N777-0 ..."
    - clicked:        dropped from the result

    The input frame is not modified; a new frame is returned.
    """
    # Pair every candidate ID with the label at the same position.
    labelled_candidates = [
        " ".join(f"{news_id}-{label}" for news_id, label in zip(ids, labels))
        for ids, labels in zip(
            test_df['candidate_news'].str.split(),
            test_df['clicked'].str.split(),
        )
    ]

    return (
        test_df
        .assign(
            user='U' + test_df['user'].astype(str),
            candidate_news=labelled_candidates,
        )
        .drop(columns=['clicked'])
    )


def get_train_test_negative_samples(
    user_list: pd.DataFrame,
    news_list: pd.DataFrame,
    max_users: int = 1000,
    train_negative_count: int = 4,
    test_negative_count: int = 20,
    train_max_candidate: int = None,
    test_max_candidate: int = None
):
    """
    Build negative-sampled train and test tables from a user frame and a news
    frame, write them as TSV files, and return them.

    NOTE(review): a later cell in this notebook defines another function with
    the same name; whichever cell is executed last is the one that gets called.

    For every clicked ("positive") news ID in a user's Train / Test column,
    negatives are drawn from news published in the 24 hours up to and
    including the positive's publish time; if that is not enough, the news
    published closest before the window are added.

    Parameters
    ----------
    user_list : DataFrame with columns 'User', 'History', 'Train', 'Test'.
        'User' looks like 'U1'. The other three are expected to hold strings
        such as "N2,2017-01-15 08:00:03;N2445,2017-01-16 00:17:07;...".
    news_list : DataFrame with at least 'News ID' and 'Publish'.
    max_users : iteration stops at the first user whose numeric id exceeds this.
    train_negative_count, test_negative_count : negatives per positive.
    train_max_candidate, test_max_candidate : if set, only the first N
        Train / Test positives of each user are used (None = no limit).

    Files written
    -------------
    experiment_data/baseline/train/user{max_users}_ns{train_negative_count}_cd{train_max_candidate}.tsv
    experiment_data/baseline/test/user{max_users}_ns{test_negative_count}_cd{test_max_candidate}.tsv
    experiment_data/LLM/user{max_users}_ns{test_negative_count}_cd{test_max_candidate}.tsv

    Returns
    -------
    (train_df, test_df_transformed)
      - train_df columns: user, clicked_news, candidate_news, clicked
      - test_df_transformed: the test frame after transform_test_df()
        (user as 'U{n}', candidates as "N123-1 N777-0 ...", no 'clicked').
    """
    # -----------------------------
    # 1) Preprocess news_list: Publish -> datetime
    # -----------------------------
    news_list = news_list.copy()
    news_list['Publish'] = pd.to_datetime(news_list['Publish'], errors='coerce')
    # Drop rows whose Publish could not be parsed (NaT)
    news_list = news_list.dropna(subset=['Publish'])

    # Mapping: News ID -> row
    news_map = {}
    for idx, row in news_list.iterrows():
        nid = row['News ID']
        # If a News ID occurs more than once, keep only the first row
        if nid not in news_map:
            news_map[nid] = row

    # Accumulators for the output rows
    train_results = []
    test_results = []

    # For the LLM file: user -> History+Train rebuilt as "newsID,time;newsID,time"
    user_full_history_str_with_time = {}

    # -----------------------------
    # 2) Iterate over users
    # -----------------------------
    for i, user_row in user_list.iterrows():
        user_str = user_row['User']  # e.g. 'U1'
        user_int = int(user_str[1:]) # -> 1 (e.g. "U1" -> 1)
        # Stops at the first row whose id exceeds max_users; later rows are
        # never visited. NOTE(review): presumably user_list is sorted by id — confirm.
        if user_int > max_users:
            break

        # Parse History
        history_ids, history_ids_with_time_str = parse_news_datetime_string(
            str(user_row['History']) if pd.notnull(user_row['History']) else ""
        )
        # Parse Train
        train_ids, train_ids_with_time_str = parse_news_datetime_string(
            str(user_row['Train']) if pd.notnull(user_row['Train']) else ""
        )
        # Parse Test
        test_ids, _ = parse_news_datetime_string(
            str(user_row['Test']) if pd.notnull(user_row['Test']) else ""
        )

        # Positives to use for train/test (the first train_max_candidate / test_max_candidate IDs)
        if (train_max_candidate is not None) and (len(train_ids) > train_max_candidate):
            train_list = train_ids[:train_max_candidate]
        else:
            train_list = train_ids

        if (test_max_candidate is not None) and (len(test_ids) > test_max_candidate):
            test_list = test_ids[:test_max_candidate]
        else:
            test_list = test_ids

        # -----------------------------
        # "History" for the LLM file = original History + Train,
        # joined with a semicolon
        # -----------------------------
        # Example:
        #  history_ids_with_time_str = "N2,2017-01-15 08:00:03;N2445,2017-01-16 00:17:07"
        #  train_ids_with_time_str   = "N21013,2017-02-06 02:12:29;N21021,2017-02-06 09:00:55"
        # -> "N2,2017-01-15 08:00:03;N2445,2017-01-16 00:17:07;N21013,2017-02-06 02:12:29;N21021,2017-02-06 09:00:55"
        if history_ids_with_time_str and train_ids_with_time_str:
            full_history_str = history_ids_with_time_str + ";" + train_ids_with_time_str
        elif history_ids_with_time_str:  # Train is empty
            full_history_str = history_ids_with_time_str
        else:  # History is empty (Train may or may not be empty)
            full_history_str = train_ids_with_time_str

        user_full_history_str_with_time[user_int] = full_history_str

        # --------------------------
        # [A] Build the train rows
        # --------------------------
        # clicked_news = History only (IDs separated by spaces)
        clicked_news_str_for_train = " ".join(history_ids)

        # IDs treated as already seen when drawing negatives = all of History + Train
        used_ids_for_train = set(history_ids + train_ids)

        for pos_id in train_list:  
            if pos_id not in news_map:
                continue

            pos_publish = news_map[pos_id]['Publish']
            time_window_start = pos_publish - timedelta(hours=24)

            # Candidates published within the 24 hours up to pos_publish
            candidate_mask = (
                (news_list['Publish'] >= time_window_start) &
                (news_list['Publish'] <= pos_publish) &
                (~news_list['News ID'].isin(used_ids_for_train))
            )
            candidate_df = news_list.loc[candidate_mask].drop_duplicates(subset='News ID')

            needed = train_negative_count
            selected_list = []

            if len(candidate_df) >= needed:
                # Fixed seed: the same candidate frame always yields the same sample.
                selected_list.append(candidate_df.sample(n=needed, random_state=42))
            else:
                selected_list.append(candidate_df)
                needed -= len(candidate_df)

                # Fallback: go further into the past, nearest to the window first
                extra_mask = (
                    (news_list['Publish'] < time_window_start) &
                    (~news_list['News ID'].isin(used_ids_for_train))
                )
                extra_df = news_list.loc[extra_mask].drop_duplicates(subset='News ID')
                extra_df = extra_df.assign(TimeDiff=(time_window_start - extra_df['Publish']).abs())
                extra_df = extra_df.sort_values(by='TimeDiff').head(needed)
                selected_list.append(extra_df)

            selected_negatives = pd.concat(selected_list).drop_duplicates(subset='News ID')
            if len(selected_negatives) > train_negative_count:
                selected_negatives = selected_negatives.sample(n=train_negative_count, random_state=42)

            # The positive always comes first, followed by the negatives
            neg_ids = selected_negatives['News ID'].tolist()
            candidate_list = [pos_id] + neg_ids
            candidate_news_str = " ".join(candidate_list)

            clicked_arr = ["1"] + ["0"] * len(neg_ids)
            clicked_str = " ".join(clicked_arr)

            train_results.append({
                'user': user_int,
                'clicked_news': clicked_news_str_for_train,
                'candidate_news': candidate_news_str,
                'clicked': clicked_str
            })

        # --------------------------
        # [B] Build the test rows
        # --------------------------
        # clicked_news = History + (the Train positives actually used)
        clicked_news_for_test_list = history_ids + train_list
        clicked_news_str_for_test  = " ".join(clicked_news_for_test_list)

        # IDs treated as already seen when drawing negatives = all of History + Train
        # NOTE(review): Test IDs are not in this set and the window below
        # includes pos_publish itself, so pos_id and the user's other Test
        # clicks are not filtered out of the negative pool — confirm intended.
        used_ids_for_test = set(history_ids + train_ids)

        for pos_id in test_list:
            if pos_id not in news_map:
                continue

            pos_publish = news_map[pos_id]['Publish']
            time_window_start = pos_publish - timedelta(hours=24)

            candidate_mask = (
                (news_list['Publish'] >= time_window_start) &
                (news_list['Publish'] <= pos_publish) &
                (~news_list['News ID'].isin(used_ids_for_test))
            )
            candidate_df = news_list.loc[candidate_mask].drop_duplicates(subset='News ID')

            needed = test_negative_count
            selected_list = []

            if len(candidate_df) >= needed:
                selected_list.append(candidate_df.sample(n=needed, random_state=42))
            else:
                selected_list.append(candidate_df)
                needed -= len(candidate_df)

                # Fallback: older news, nearest to the window first
                extra_mask = (
                    (news_list['Publish'] < time_window_start) &
                    (~news_list['News ID'].isin(used_ids_for_test))
                )
                extra_df = news_list.loc[extra_mask].drop_duplicates(subset='News ID')
                extra_df = extra_df.assign(TimeDiff=(time_window_start - extra_df['Publish']).abs())
                extra_df = extra_df.sort_values(by='TimeDiff').head(needed)
                selected_list.append(extra_df)

            selected_negatives = pd.concat(selected_list).drop_duplicates(subset='News ID')
            if len(selected_negatives) > test_negative_count:
                selected_negatives = selected_negatives.sample(n=test_negative_count, random_state=42)

            neg_ids = selected_negatives['News ID'].tolist()
            candidate_list = [pos_id] + neg_ids
            candidate_news_str = " ".join(candidate_list)

            clicked_arr = ["1"] + ["0"]*len(neg_ids)
            clicked_str = " ".join(clicked_arr)

            test_results.append({
                'user': user_int,
                'clicked_news': clicked_news_str_for_test,
                'candidate_news': candidate_news_str,
                'clicked': clicked_str
            })

    # -----------------------------
    # 3) Build the DataFrames
    # -----------------------------
    train_df = pd.DataFrame(train_results, columns=['user','clicked_news','candidate_news','clicked'])
    test_df  = pd.DataFrame(test_results,  columns=['user','clicked_news','candidate_news','clicked'])
    
    # Layout used by the baseline (transformed test frame)
    test_df_transformed = transform_test_df(test_df)

    # -----------------------------
    # 4) Save as TSV
    # -----------------------------
    train_path = 'experiment_data/baseline/train'
    test_path  = 'experiment_data/baseline/test'
    llm_path   = 'experiment_data/LLM'
    os.makedirs(train_path, exist_ok=True)
    os.makedirs(test_path,  exist_ok=True)
    os.makedirs(llm_path,   exist_ok=True)

    train_filename = f'{train_path}/user{max_users}_ns{train_negative_count}_cd{train_max_candidate}.tsv'
    test_filename  = f'{test_path}/user{max_users}_ns{test_negative_count}_cd{test_max_candidate}.tsv'
    llm_filename   = f'{llm_path}/user{max_users}_ns{test_negative_count}_cd{test_max_candidate}.tsv'

    # 4-1) Save train & test (baseline)
    train_df.to_csv(train_filename, sep='\t', index=False)
    # NOTE(review): index=False is not passed here, so the row index is written
    # as the first column of the test file — confirm the reader expects it.
    test_df_transformed.to_csv(test_filename, sep='\t', header=False)

    # -----------------------------
    # 4-2) File for the LLM
    #  - drop the 'clicked' column
    #  - replace History with (original History + original Train) including timestamps
    # -----------------------------
    llm_df = test_df.copy()  # untransformed layout

    # Rename columns: user -> User, clicked_news -> History, candidate_news -> Question
    llm_df.rename(columns={
        'user': 'User',
        'clicked_news': 'History',
        'candidate_news': 'Question'
    }, inplace=True)

    # Drop the 'clicked' column
    llm_df.drop(columns=['clicked'], inplace=True)

    # Replace History with the (History + Train) string that keeps timestamps,
    # looked up by the integer user id
    llm_df['History'] = llm_df['User'].map(user_full_history_str_with_time)

    # Final save (no header and no index)
    llm_df.to_csv(llm_filename, sep='\t', index=False, header=False)

    print(f"[Saved] {train_filename}")
    print(f"[Saved] {test_filename}")
    print(f"[Saved] {llm_filename}")

    return train_df, test_df_transformed


In [6]:
import pandas as pd
import numpy as np
import os
from datetime import timedelta
from collections import defaultdict

# ----------------------------------------------------------------
# 1) 유틸 함수: parse, transform
# ----------------------------------------------------------------

def parse_news_datetime_string(s: str):
    """
    Split "N2,2017-01-15 08:00:03;N2445,2017-01-16 00:17:07;..." into its
    entries.

    Returns (news_ids, joined), where news_ids is the list of news IDs in
    input order and joined is the kept "newsID,time" entries re-joined with
    ';'. Entries without exactly one comma are ignored; a non-string input
    gives ([], "").
    """
    if not isinstance(s, str):
        return [], ""

    kept_ids = []
    kept_entries = []
    for chunk in s.split(';'):
        chunk = chunk.strip()
        # Exactly one comma <=> the entry splits into exactly two fields.
        if chunk.count(',') != 1:
            continue
        raw_id, _, raw_time = chunk.partition(',')
        news_id = raw_id.strip()
        kept_ids.append(news_id)
        kept_entries.append(f"{news_id},{raw_time.strip()}")

    return kept_ids, ";".join(kept_entries)


def transform_test_df(test_df: pd.DataFrame) -> pd.DataFrame:
    """
    Reshape a generated test frame (user, clicked_news, candidate_news,
    clicked) into the baseline/test layout.

    - user:           integer -> "U{integer}"
    - candidate_news: merged with the click labels into
                      "N123-1 N777-0 ..."
    - clicked:        removed from the returned frame

    Works on a copy; the caller's frame is left untouched.
    """
    out = test_df.copy()

    # Take the labels out of the frame; this also removes the 'clicked' column.
    label_tokens = out.pop('clicked').str.split()
    id_tokens = out['candidate_news'].str.split()

    out['user'] = 'U' + out['user'].astype(str)
    out['candidate_news'] = [
        " ".join(map("-".join, zip(ids, labels)))
        for ids, labels in zip(id_tokens, label_tokens)
    ]
    return out

# ----------------------------------------------------------------
# 2) Train negative sample 함수
#    (2017-02-05 08:00:00 ~ 2017-02-12 08:00:00 고정 범위 + usage_limit 로직)
# ----------------------------------------------------------------

def sample_train_negatives_fixed_period_with_usage_limit(
    pos_id: str,                    # positive news ID
    news_map: dict,                 # News ID -> row
    news_list: pd.DataFrame,        # full news DataFrame
    used_ids_for_train: set,        # news the user already read (History+Train)
    train_negative_count: int,      # number of negatives wanted
    user_neg_usage_count: dict,     # per-user count of how often each ID served as a negative
    train_period_start: pd.Timestamp,
    train_period_end: pd.Timestamp,
    max_usage_try_step: int = 10,   # highest usage limit that will be tried
    user_random_state: int = None
):
    """
    Draw negatives for one train positive from a fixed publication period.

    Candidates are the distinct News IDs in `news_list` whose 'Publish' lies
    in [train_period_start, train_period_end) and that are not in
    `used_ids_for_train`.

    IDs that have not yet been used as a negative for this user are preferred
    (usage limit 1). If they do not suffice, all of them are taken and the
    limit is raised step by step (2, 3, ... up to `max_usage_try_step`) to
    allow re-use. An ID is never picked twice within one call, so the result
    holds `train_negative_count` distinct IDs whenever enough candidates are
    admissible within the usage limits tried.

    Parameters
    ----------
    pos_id : positive news ID; only checked for membership in `news_map`.
    news_map : mapping keyed by News ID.
    news_list : DataFrame with 'News ID' and datetime 'Publish' columns.
    used_ids_for_train : IDs excluded from the candidate pool.
    train_negative_count : number of negatives requested.
    user_neg_usage_count : usage counter; updated in place (+1 per returned
        ID). A plain dict or a defaultdict(int) both work.
    train_period_start, train_period_end : candidate period (end exclusive).
    max_usage_try_step : last usage limit tried before giving up.
    user_random_state : seed passed to pandas' sampler.

    Returns
    -------
    list of negative News IDs (possibly shorter than requested; empty when
    `pos_id` is not in `news_map`).
    """
    if pos_id not in news_map:
        return []

    # Candidate pool: published inside the train period and unseen by the user.
    in_period = (
        (news_list['Publish'] >= train_period_start) &
        (news_list['Publish'] < train_period_end)
    )
    unseen = ~news_list['News ID'].isin(used_ids_for_train)
    candidate_ids = (
        news_list.loc[in_period & unseen, 'News ID'].drop_duplicates().tolist()
    )

    needed = train_negative_count
    neg_ids = []
    already_chosen = set()
    usage_limit = 1

    while needed > 0 and usage_limit <= max_usage_try_step:
        # Skip IDs already picked in this call: otherwise a later round could
        # pick them again and the result would end up short after de-duplication.
        valid_candidates = [
            cid for cid in candidate_ids
            if cid not in already_chosen
            and user_neg_usage_count.get(cid, 0) < usage_limit
        ]

        if len(valid_candidates) >= needed:
            # Enough admissible IDs at this limit: sample exactly what is missing.
            chosen = pd.Series(valid_candidates).sample(
                n=needed, random_state=user_random_state
            ).tolist()
            needed = 0
        else:
            # Not enough: take them all and allow one more re-use next round.
            chosen = valid_candidates
            needed -= len(valid_candidates)
            usage_limit += 1

        neg_ids.extend(chosen)
        already_chosen.update(chosen)

        # Every candidate has been taken; raising the limit cannot add more.
        if len(already_chosen) == len(candidate_ids):
            break

    # Record that each returned ID has served as a negative once more.
    for nid in neg_ids:
        user_neg_usage_count[nid] = user_neg_usage_count.get(nid, 0) + 1

    return neg_ids


# ----------------------------------------------------------------
# 3) Test negative (각 positive의 Publish 시각 기준 24h 내 + fallback)
# ----------------------------------------------------------------

def sample_test_negatives_publish_based(
    pos_id: str,
    news_map: dict,
    news_list: pd.DataFrame,
    used_ids_for_test: set,
    test_negative_count: int,
    test_ns_publish: int = 24,
    user_random_state: int = None
):
    """
    Draw negatives for one test positive based on its publish time.

    With pos_publish being the 'Publish' value of `pos_id` in `news_map`,
    negatives are sampled from news published in
    [pos_publish - test_ns_publish hours, pos_publish]. If the window holds
    fewer than `test_negative_count` eligible news, all of them are taken and
    the remainder is filled with the news published closest before the window.

    IDs in `used_ids_for_test` are never returned, and neither is `pos_id`
    itself, even if the caller did not put it into the exclusion set.

    Parameters
    ----------
    pos_id : positive news ID.
    news_map : mapping News ID -> row providing a 'Publish' value.
    news_list : DataFrame with 'News ID' and datetime 'Publish' columns.
    used_ids_for_test : IDs excluded from the candidate pool.
    test_negative_count : number of negatives requested.
    test_ns_publish : window length in hours before pos_publish (default 24).
    user_random_state : seed passed to pandas' sampler.

    Returns
    -------
    list of negative News IDs (window picks first, then fallback picks);
    empty when `pos_id` is unknown or its publish time is missing.
    """
    if pos_id not in news_map:
        return []

    pos_publish = news_map[pos_id]['Publish']
    if pd.isnull(pos_publish):
        return []

    time_window_start = pos_publish - timedelta(hours=test_ns_publish)

    # The window includes pos_publish, so the positive would otherwise be
    # eligible as its own negative. Exclude it explicitly.
    eligible = ~news_list['News ID'].isin(set(used_ids_for_test) | {pos_id})

    in_window = (
        (news_list['Publish'] >= time_window_start) &
        (news_list['Publish'] <= pos_publish)
    )
    window_df = news_list.loc[in_window & eligible].drop_duplicates(subset='News ID')

    needed = test_negative_count
    if len(window_df) >= needed:
        picked = [window_df.sample(n=needed, random_state=user_random_state)]
    else:
        # Fallback: fill up with older news, nearest to the window start first.
        older = news_list.loc[
            (news_list['Publish'] < time_window_start) & eligible
        ].drop_duplicates(subset='News ID')
        older = (
            older
            .assign(TimeDiff=(time_window_start - older['Publish']).abs())
            .sort_values(by='TimeDiff')
            .head(needed - len(window_df))
        )
        picked = [window_df, older]

    # A News ID listed twice with different publish times could appear in both
    # parts; keep its first occurrence only.
    selected_negatives = pd.concat(picked).drop_duplicates(subset='News ID')
    return selected_negatives['News ID'].tolist()


# ----------------------------------------------------------------
# 4) 메인 함수
# ----------------------------------------------------------------

def get_train_test_negative_samples(
    user_list: pd.DataFrame,
    news_list: pd.DataFrame,
    max_users: int = 1000,
    train_negative_count: int = 4,
    test_negative_count: int = 20,
    train_max_candidate: int = None,
    test_max_candidate: int = None,
    test_ns_publish: int = 24,
    random_state: int = 42
):
    """
    Build negative-sampled train and test tables, write them as TSV files and
    return them.

    1) Train negatives are drawn by
       sample_train_negatives_fixed_period_with_usage_limit() from news
       published between "2017-02-05 08:00:00" and "2017-02-12 08:00:00".
       - (History + Train) IDs are excluded.
       - A usage counter, reset for every user, is passed so that the helper
         can limit re-use of the same negative within one user.
    2) Test negatives are drawn by sample_test_negatives_publish_based() from
       the `test_ns_publish` hours before each positive's publish time.
       - (History + Train + Test) IDs are excluded.
    3) Each user gets its own seed: user_random_state = random_state + numeric
       user id (e.g. user 1 -> 43 with the default 42). With
       random_state=None no seed is passed.

    Parameters
    ----------
    user_list : DataFrame with columns 'User' (e.g. "U1"), 'History',
        'Train', 'Test'; the last three are parsed with
        parse_news_datetime_string().
    news_list : DataFrame with at least 'News ID' and 'Publish'.
    max_users : iteration stops at the first user whose numeric id exceeds this.
    train_negative_count, test_negative_count : negatives per positive.
    train_max_candidate, test_max_candidate : if set, only the first N
        Train / Test positives of each user are used (None = no limit).
    test_ns_publish : test window length in hours.
    random_state : base seed, see 3).

    Files written
    -------------
    experiment_data/baseline/train/user{max_users}_ns{train_negative_count}_cd{train_max_candidate}.tsv
    experiment_data/baseline/test/user{max_users}_ns{test_negative_count}_cd{test_max_candidate}.tsv
    experiment_data/LLM/user{max_users}_ns{test_negative_count}_cd{test_max_candidate}.tsv

    Returns
    -------
    (train_df, test_df_transformed): train_df has columns user, clicked_news,
    candidate_news, clicked; test_df_transformed is the test frame after
    transform_test_df().
    """

    # Train period (hard-coded)
    train_period_start = pd.to_datetime("2017-02-05 08:00:00")
    train_period_end   = pd.to_datetime("2017-02-12 08:00:00")

    # 1) Preprocess news_list: parse Publish and drop rows that failed to parse
    news_list = news_list.copy()
    news_list['Publish'] = pd.to_datetime(news_list['Publish'], errors='coerce')
    news_list.dropna(subset=['Publish'], inplace=True)

    # News ID -> row; for duplicate News IDs only the first row is kept
    news_map = {}
    for idx, row in news_list.iterrows():
        nid = row['News ID']
        if nid not in news_map:
            news_map[nid] = row

    # Accumulators for the output rows
    train_results = []
    test_results  = []

    # For the LLM file (user -> History+Train with timestamps)
    user_full_history_str_with_time = {}

    # 2) Iterate over users
    for i, user_row in user_list.iterrows():
        user_str = user_row['User']  # e.g. "U1"
        user_int = int(user_str[1:]) # -> 1

        # Stops at the first row whose id exceeds max_users; later rows are
        # never visited. NOTE(review): presumably user_list is sorted by id — confirm.
        if user_int > max_users:
            break

        # A different seed for every user
        user_random_state = None
        if random_state is not None:
            user_random_state = random_state + user_int

        # Parse History / Train / Test
        history_ids, history_ids_with_time_str = parse_news_datetime_string(
            str(user_row['History']) if pd.notnull(user_row['History']) else ""
        )
        train_ids, train_ids_with_time_str = parse_news_datetime_string(
            str(user_row['Train']) if pd.notnull(user_row['Train']) else ""
        )
        test_ids, test_ids_with_time_str = parse_news_datetime_string(
            str(user_row['Test']) if pd.notnull(user_row['Test']) else ""
        )

        # Optionally keep only the first N positives
        if train_max_candidate is not None and len(train_ids) > train_max_candidate:
            train_list = train_ids[:train_max_candidate]
        else:
            train_list = train_ids

        if test_max_candidate is not None and len(test_ids) > test_max_candidate:
            test_list = test_ids[:test_max_candidate]
        else:
            test_list = test_ids

        # History for the LLM file = (History + Train), joined with ';'
        if history_ids_with_time_str and train_ids_with_time_str:
            full_history_str = history_ids_with_time_str + ";" + train_ids_with_time_str
        elif history_ids_with_time_str:
            full_history_str = history_ids_with_time_str
        else:
            full_history_str = train_ids_with_time_str

        user_full_history_str_with_time[user_int] = full_history_str

        # -----------------------------
        # [A] Train negative sampling
        # -----------------------------
        # Excluded IDs use the full Train list, even when train_list is truncated
        used_ids_for_train = set(history_ids + train_ids)
        clicked_news_str_for_train = " ".join(history_ids)

        # Per-user count of how often each news ID has been used as a negative
        user_neg_usage_count = defaultdict(int)

        for pos_id in train_list:
            # pos_id must be present in news_map (otherwise skip it)
            if pos_id not in news_map:
                continue

            # Train sampling (fixed period + usage limit)
            neg_ids = sample_train_negatives_fixed_period_with_usage_limit(
                pos_id=pos_id,
                news_map=news_map,
                news_list=news_list,
                used_ids_for_train=used_ids_for_train,
                train_negative_count=train_negative_count,
                user_neg_usage_count=user_neg_usage_count,
                train_period_start=train_period_start,
                train_period_end=train_period_end,
                user_random_state=user_random_state
            )

            # The positive always comes first, followed by the negatives.
            # A row is appended even when no negatives were returned.
            candidate_list = [pos_id] + neg_ids
            clicked_arr = ["1"] + ["0"] * len(neg_ids)

            train_results.append({
                'user': user_int,
                'clicked_news': clicked_news_str_for_train,
                'candidate_news': " ".join(candidate_list),
                'clicked': " ".join(clicked_arr)
            })

        # -----------------------------
        # [B] Test negative sampling
        # -----------------------------
        # Exclude (History + Train + Test positives)
        used_ids_for_test = set(history_ids + train_ids + test_ids)
        # clicked_news for test rows = History + the Train positives actually used
        clicked_news_str_for_test = " ".join(history_ids + train_list)

        for pos_id in test_list:
            if pos_id not in news_map:
                continue

            neg_ids = sample_test_negatives_publish_based(
                pos_id=pos_id,
                news_map=news_map,
                news_list=news_list,
                used_ids_for_test=used_ids_for_test,
                test_negative_count=test_negative_count,
                test_ns_publish=test_ns_publish,
                user_random_state=user_random_state
            )

            candidate_list = [pos_id] + neg_ids
            clicked_arr = ["1"] + ["0"] * len(neg_ids)

            test_results.append({
                'user': user_int,
                'clicked_news': clicked_news_str_for_test,
                'candidate_news': " ".join(candidate_list),
                'clicked': " ".join(clicked_arr)
            })

    # (3) Build the DataFrames
    train_df = pd.DataFrame(train_results, columns=['user','clicked_news','candidate_news','clicked'])
    test_df  = pd.DataFrame(test_results,  columns=['user','clicked_news','candidate_news','clicked'])

    # Convert the test frame to the baseline layout
    test_df_transformed = transform_test_df(test_df)

    # (4) Save as TSV
    train_path = 'experiment_data/baseline/train'
    test_path  = 'experiment_data/baseline/test'
    llm_path   = 'experiment_data/LLM'
    os.makedirs(train_path, exist_ok=True)
    os.makedirs(test_path,  exist_ok=True)
    os.makedirs(llm_path,   exist_ok=True)

    train_filename = f'{train_path}/user{max_users}_ns{train_negative_count}_cd{train_max_candidate}.tsv'
    test_filename  = f'{test_path}/user{max_users}_ns{test_negative_count}_cd{test_max_candidate}.tsv'
    llm_filename   = f'{llm_path}/user{max_users}_ns{test_negative_count}_cd{test_max_candidate}.tsv'

    # Write train (with header) and test (no header, no index)
    train_df.to_csv(train_filename, sep='\t', index=False)
    test_df_transformed.to_csv(test_filename, sep='\t', header=False, index=False)

    # LLM file: untransformed test rows, 'clicked' dropped, and History replaced
    # by the (History + Train) string with timestamps, looked up by integer user id
    llm_df = test_df.copy()
    llm_df.rename(columns={'user': 'User', 'clicked_news': 'History','candidate_news': 'Question'}, inplace=True)
    llm_df.drop(columns=['clicked'], inplace=True)
    llm_df['History'] = llm_df['User'].map(user_full_history_str_with_time)
    llm_df.to_csv(llm_filename, sep='\t', index=False, header=False)

    print(f"[Saved] {train_filename}")
    print(f"[Saved] {test_filename}")
    print(f"[Saved] {llm_filename}")

    return train_df, test_df_transformed


## 실행
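[get_train_test_negative_samples 함수]
- user_list = user behaviors DF  
- news_list = news DF 
- max_users = 몇 명의 user에 대한 Data를 생성할 것인지
- train_negative_count / test_negative_count = Train / Test 각각에서 positive 1개당 뽑을 negative sample 수
- train_max_candidate =  각 user 마다 Train feature에서 최대 몇개의 candidate_news를 사용할 것인지 (None : 제한 없음)
- test_max_candidate = 각 user 마다 Test feature에서 최대 몇개의 candidate_news를 사용할 것인지 (None : 제한 없음)
- test_ns_publish = Test negative를 뽑을 때 positive 뉴스의 Publish 시각 이전 몇 시간까지를 후보 구간으로 볼 것인지 (기본값 24)
- random_state = 난수 시드 (user마다 random_state + user 번호를 시드로 사용, None이면 시드를 고정하지 않음)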

In [7]:
# 실행 예시
# Example run: 4 negatives per positive for both train and test, at most 20
# Test positives per user, and a 1-hour publish window for test negatives.
train_df, test_df = get_train_test_negative_samples(
    user_list=filtered_users,
    news_list=news_total_raw,
    max_users=16000,
    train_negative_count=4,
    test_negative_count=4,
    train_max_candidate=None,
    test_max_candidate=20,
    test_ns_publish=1,
)

# Show each result frame under its own markdown heading.
for section_title, section_frame in (
    ("### [ Train-based DataFrame ]<hr/>", train_df),
    ("### [ Test-based DataFrame ]<hr/>", test_df),
):
    display(Markdown(section_title))
    display(section_frame)

[Saved] experiment_data/baseline/train/user16000_ns4_cdNone.tsv
[Saved] experiment_data/baseline/test/user16000_ns4_cd20.tsv
[Saved] experiment_data/LLM/user16000_ns4_cd20.tsv


### [ Train-based DataFrame ]<hr/>

Unnamed: 0,user,clicked_news,candidate_news,clicked
0,1,N2 N2445 N6043 N6053 N5917 N5555 N5733 N5701 N...,N21013 N21297 N22485 N23141 N21607,1 0 0 0 0
1,1,N2 N2445 N6043 N6053 N5917 N5555 N5733 N5701 N...,N21021 N21592 N22250 N21841 N21337,1 0 0 0 0
2,1,N2 N2445 N6043 N6053 N5917 N5555 N5733 N5701 N...,N21756 N21609 N22257 N21867 N21357,1 0 0 0 0
3,1,N2 N2445 N6043 N6053 N5917 N5555 N5733 N5701 N...,N21817 N21923 N22625 N21472 N23913,1 0 0 0 0
4,1,N2 N2445 N6043 N6053 N5917 N5555 N5733 N5701 N...,N21725 N22580 N22941 N21098 N22921,1 0 0 0 0
...,...,...,...,...
197249,16000,N11 N5805 N9159 N8664 N9614 N10460 N10311 N166...,N22104 N21329 N21662 N21289 N22858,1 0 0 0 0
197250,16000,N11 N5805 N9159 N8664 N9614 N10460 N10311 N166...,N22790 N21357 N21764 N21297 N22059,1 0 0 0 0
197251,16000,N11 N5805 N9159 N8664 N9614 N10460 N10311 N166...,N22829 N21368 N22778 N22054 N22663,1 0 0 0 0
197252,16000,N11 N5805 N9159 N8664 N9614 N10460 N10311 N166...,N22786 N22541 N22941 N21337 N21529,1 0 0 0 0


### [ Test-based DataFrame ]<hr/>

Unnamed: 0,user,clicked_news,candidate_news
0,U1,N2 N2445 N6043 N6053 N5917 N5555 N5733 N5701 N...,N22972-1 N22963-0 N22975-0 N22962-0 N22964-0
1,U1,N2 N2445 N6043 N6053 N5917 N5555 N5733 N5701 N...,N23283-1 N23263-0 N23262-0 N23269-0 N23260-0
2,U1,N2 N2445 N6043 N6053 N5917 N5555 N5733 N5701 N...,N24116-1 N24110-0 N24115-0 N24195-0 N24293-0
3,U1,N2 N2445 N6043 N6053 N5917 N5555 N5733 N5701 N...,N24109-1 N24103-0 N24108-0 N24260-0 N24098-0
4,U1,N2 N2445 N6043 N6053 N5917 N5555 N5733 N5701 N...,N23981-1 N24074-0 N23974-0 N23968-0 N23975-0
...,...,...,...
145897,U16000,N11 N5805 N9159 N8664 N9614 N10460 N10311 N166...,N23056-1 N23041-0 N23046-0 N23051-0 N23048-0
145898,U16000,N11 N5805 N9159 N8664 N9614 N10460 N10311 N166...,N23173-1 N23392-0 N23156-0 N23154-0 N23151-0
145899,U16000,N11 N5805 N9159 N8664 N9614 N10460 N10311 N166...,N23034-1 N23035-0 N23036-0 N23037-0 N23049-0
145900,U16000,N11 N5805 N9159 N8664 N9614 N10460 N10311 N166...,N23204-1 N23205-0 N23624-0 N23203-0 N23389-0


In [25]:
# Show the generated train and test frames back to back.
display(train_df, test_df)

Unnamed: 0,user,clicked_news,candidate_news,clicked
0,1,N2 N2445 N6043 N6053 N5917 N5555 N5733 N5701 N...,N21013 N21145 N20925 N20933 N22090,1 0 0 0 0
1,1,N2 N2445 N6043 N6053 N5917 N5555 N5733 N5701 N...,N21021 N20948 N21143 N20933 N21451,1 0 0 0 0
2,1,N2 N2445 N6043 N6053 N5917 N5555 N5733 N5701 N...,N21756 N21504 N21297 N21505 N21362,1 0 0 0 0
3,1,N2 N2445 N6043 N6053 N5917 N5555 N5733 N5701 N...,N21817 N21362 N21623 N21391 N21308,1 0 0 0 0
4,1,N2 N2445 N6043 N6053 N5917 N5555 N5733 N5701 N...,N21725 N21304 N21505 N21329 N21289,1 0 0 0 0
...,...,...,...,...
12983,1000,N10 N2381 N1353 N7167 N7103 N7376 N10572 N1272...,N21857 N21651 N21357 N21662 N21481,1 0 0 0 0
12984,1000,N10 N2381 N1353 N7167 N7103 N7376 N10572 N1272...,N21892 N21662 N21374 N21613 N21676,1 0 0 0 0
12985,1000,N10 N2381 N1353 N7167 N7103 N7376 N10572 N1272...,N21756 N21306 N21592 N21337 N21297,1 0 0 0 0
12986,1000,N10 N2381 N1353 N7167 N7103 N7376 N10572 N1272...,N22382 N22385 N22225 N22145 N22183,1 0 0 0 0


Unnamed: 0,user,clicked_news,candidate_news
0,U1,N2 N2445 N6043 N6053 N5917 N5555 N5733 N5701 N...,N22972-1 N23125-0 N22921-0 N22933-0 N23124-0 N...
1,U1,N2 N2445 N6043 N6053 N5917 N5555 N5733 N5701 N...,N23283-1 N23035-0 N23392-0 N23085-0 N23019-0 N...
2,U1,N2 N2445 N6043 N6053 N5917 N5555 N5733 N5701 N...,N24116-1 N23996-0 N23929-0 N24088-0 N23938-0 N...
3,U1,N2 N2445 N6043 N6053 N5917 N5555 N5733 N5701 N...,N24109-1 N23978-0 N23909-0 N24075-0 N23922-0 N...
4,U1,N2 N2445 N6043 N6053 N5917 N5555 N5733 N5701 N...,N23981-1 N23923-0 N23847-0 N23779-0 N23809-0 N...
...,...,...,...
9302,U1000,N10 N2381 N1353 N7167 N7103 N7376 N10572 N1272...,N23865-1 N23686-0 N23708-0 N23816-0 N23728-0 N...
9303,U1000,N10 N2381 N1353 N7167 N7103 N7376 N10572 N1272...,N24231-1 N24195-0 N24231-0 N24049-0 N24160-0 N...
9304,U1000,N10 N2381 N1353 N7167 N7103 N7376 N10572 N1272...,N24232-1 N24125-0 N24046-0 N23986-0 N24001-0 N...
9305,U1000,N10 N2381 N1353 N7167 N7103 N7376 N10572 N1272...,N24127-1 N24042-0 N23948-0 N24098-0 N23962-0 N...


In [36]:
# Inspect the full raw record of a single article, selected by its News ID.
news_total_raw.loc[news_total_raw['News ID'].eq('N24295')]

Unnamed: 0,News ID,Category,Sub-Category,Title,Body,ID,Publish,Click time history
23915,N24295,nyheter,sortrondelag,Fv. 705 stenges etter brann i lastebil,Saken oppdateres. Lørdag formiddag tok det fyr...,ae36cef9aac9b272cf06bf7849ac84ef676a6e6f,2017-02-18 12:21:32,"2017-02-18 21:56:31,2017-02-18 21:56:39,2017-0..."


In [21]:
# Number of rows each user contributes to test_df.
user_counts = test_df['user'].value_counts()

# The user with the most rows, that row count, and the mean rows per user.
most_frequent_user, most_frequent_count, average_count = (
    user_counts.idxmax(),
    user_counts.max(),
    user_counts.mean(),
)

print(
    f"가장 많이 등장한 user: {most_frequent_user} - {most_frequent_count}개",
    f"평균 값: {average_count:.2f}",
    sep="\n",
)

가장 많이 등장한 user: U655 - 20개
평균 값: 9.31
