In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m86.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
# Mount Google Drive at /content/drive (Colab only); the data paths configured
# below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import os
import math
import pickle
import scipy.sparse as sp
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# =========================================================
# 1. 환경 설정 및 시드 고정
# =========================================================
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    every CUDA device), and puts cuDNN into deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed applied to all generators.

    Note:
        ``PYTHONHASHSEED`` is only exported to the environment here; it does not
        change string hashing of the interpreter that is already running.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all seeds every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Fix all RNG seeds before anything stochastic runs.
seed_everything(seed=42)

# Prefer the GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device: {}".format(device))

# Path settings (processed-data folder saved in Step 0).
base_path = "/content/drive/MyDrive/unstructured"
data_path = base_path + "/k5_filtered"
movie_final_path = base_path + "/movie_data_final_clean.csv"

Using device: cuda


**1. Lambda2 = 0 인 경우**

In [None]:
# =========================================================
# 2. 데이터 로드 (Pre-split Data 사용)
# =========================================================
if not os.path.exists(data_path) or not os.path.exists(movie_final_path):
    print("Error: 데이터 경로를 확인해주세요.")
else:
    # 1. Read the pre-split train / validation / test interaction tables
    train_df = pd.read_csv(f"{data_path}/train.csv")
    val_df   = pd.read_csv(f"{data_path}/val.csv")
    test_df  = pd.read_csv(f"{data_path}/test.csv")

    # 2. Load the mapping dictionaries (user2idx / item2idx)
    # NOTE(review): pickle.load can execute arbitrary code; only load pickles you produced yourself.
    with open(f"{data_path}/user2idx.pkl", 'rb') as f: user2idx = pickle.load(f)
    with open(f"{data_path}/item2idx.pkl", 'rb') as f: item2idx = pickle.load(f)

    # The number of users / items is taken from the size of each mapping
    n_users = len(user2idx)
    n_items = len(item2idx)

    print(f"데이터 로드 완료. User: {n_users}, Item: {n_items}")
    print(f"전체 Train 데이터 수: {len(train_df)}")

    # =========================================================
    # 3. Split the training interactions (Positive vs Hard Negative)
    # =========================================================
    # 1) Positive data (rating >= 4): used for graph edges and as training positives
    train_pos_df = train_df[train_df['rating'] >= 4.0].copy()

    # 2) Hard-negative data (rating <= 2): used as explicit negatives during training
    train_neg_df = train_df[train_df['rating'] <= 2.0].copy()

    print(f"Positive Interactions (Graph용): {len(train_pos_df)}")
    print(f"Hard Negative Interactions: {len(train_neg_df)}")

    # Build per-user lookup dictionaries
    # Positive: set of items each user liked
    user_pos_items = train_pos_df.groupby("user")["item"].apply(set).to_dict()

    # Hard negative: list of items each user disliked (used when sampling)
    user_hard_neg_items = train_neg_df.groupby("user")["item"].apply(list).to_dict()

    # =========================================================
    # 4. Doc2Vec 학습 및 매트릭스 생성 (Feature Extraction)
    # =========================================================
    print("\n=== Doc2Vec Training ===")
    d2v_dim = 64  # must equal the LightGCN embedding size (`dim`) used for training
    movie_df = pd.read_csv(movie_final_path)
    movie_df['overview_clean'] = movie_df['overview_clean'].fillna('unknown')

    # One TaggedDocument per movie, tagged with the raw movieId (as a string);
    # the tag is translated to a dense item index through item2idx below.
    # Zipping the two columns avoids building a Series per row as iterrows() does.
    documents = [TaggedDocument(str(overview).split(), [str(movie_id)])
                 for movie_id, overview in zip(movie_df['movieId'], movie_df['overview_clean'])]

    # workers=1: gensim only reproduces the same vectors for a fixed seed when
    # training is single-threaded; with several workers the thread scheduling
    # makes results differ from run to run.
    d2v_model = Doc2Vec(documents, vector_size=d2v_dim, window=5, min_count=1, workers=1, epochs=20, seed=42)

    # Build the embedding matrix, with rows ordered by item index
    doc2vec_weights = np.zeros((n_items, d2v_dim))
    cnt = 0
    for movie_id, idx in item2idx.items():
        tag = str(movie_id)
        if tag in d2v_model.dv:
            doc2vec_weights[idx] = d2v_model.dv[tag]
            cnt += 1
        else:
            # No document vector for this movie: fall back to small random noise
            doc2vec_weights[idx] = np.random.normal(0, 0.01, d2v_dim)

    doc2vec_weights = torch.FloatTensor(doc2vec_weights).to(device)
    print(f"Doc2Vec Matrix Created. Mapped: {cnt}/{n_items}")


    # Save Doc2Vec embeddings to disk for reuse
    doc2vec_save_path = f"{base_path}/doc2vec_embeddings_64d.pkl"
    with open(doc2vec_save_path, 'wb') as f:
        pickle.dump({
            'embeddings': doc2vec_weights.cpu().numpy(),
            'item2idx': item2idx,
            'n_items': n_items,
            'vector_size': d2v_dim,
            'created_at': str(pd.Timestamp.now())
        }, f)
    print(f"✅ Doc2Vec embeddings saved to: {doc2vec_save_path}")
    # =========================================================
    # 5. LightGCN용 인접행렬 생성 (Positive 데이터만 사용!)
    # =========================================================
    def get_adj_mat(n_users, n_items, pos_df, target_device=None):
        """Build the symmetrically normalised user-item adjacency matrix for LightGCN.

        Only positive interactions (rating >= 4) should be passed in: every row of
        ``pos_df`` becomes an undirected user-item edge.

        Args:
            n_users: Number of user nodes (rows ``0 .. n_users - 1`` of the result).
            n_items: Number of item nodes (rows ``n_users .. n_users + n_items - 1``).
            pos_df: DataFrame with integer ``user`` and ``item`` index columns.
            target_device: Device the returned tensor is moved to. Defaults to the
                notebook-level ``device``.

        Returns:
            Sparse COO float tensor of shape ``(n_users + n_items, n_users + n_items)``
            holding ``D^-1/2 A D^-1/2``. Rows/columns of isolated nodes are all zero.
        """
        user_np = pos_df['user'].values
        item_np = pos_df['item'].values

        R = sp.coo_matrix((np.ones(len(user_np)), (user_np, item_np)), shape=(n_users, n_items))

        # Bipartite block matrix [[0, R], [R^T, 0]]
        top_part = sp.hstack([sp.csr_matrix((n_users, n_users)), R])
        bot_part = sp.hstack([R.T, sp.csr_matrix((n_items, n_items))])
        A = sp.vstack([top_part, bot_part])

        # Compute degree^-1/2 only for connected nodes. Raising a zero degree to
        # -0.5 yields inf and a "divide by zero" RuntimeWarning; isolated nodes
        # simply keep a scaling factor of 0.
        degree = np.asarray(A.sum(axis=1), dtype=np.float64).ravel()
        d_inv_sqrt = np.zeros_like(degree)
        connected = degree > 0
        d_inv_sqrt[connected] = np.power(degree[connected], -0.5)
        d_mat_inv_sqrt = sp.diags(d_inv_sqrt)

        norm_adj = d_mat_inv_sqrt.dot(A).dot(d_mat_inv_sqrt).tocoo()
        indices = torch.from_numpy(np.vstack((norm_adj.row, norm_adj.col)).astype(np.int64))
        values = torch.from_numpy(norm_adj.data.astype(np.float32))

        if target_device is None:
            target_device = device
        return torch.sparse_coo_tensor(indices, values, torch.Size(norm_adj.shape)).to(target_device)

    # Build the normalised adjacency matrix from the positive (rating >= 4) interactions only.
    print("그래프 생성 중 (Positive Edge Only)...")
    Adj_Matrix = get_adj_mat(n_users, n_items, train_pos_df)
    print("그래프 생성 완료!")

    # =========================================================
    # 6. 모델 정의 (LightGCN with Doc2Vec Init)
    # =========================================================
    class LightGCN_Doc2Vec(nn.Module):
        """LightGCN whose item embeddings start from pre-trained Doc2Vec vectors.

        User embeddings are drawn from a normal distribution (std 0.1). Item
        embeddings are initialised from ``doc2vec_weights`` and remain trainable;
        an untouched copy is kept in the ``fixed_doc2vec`` buffer as the target
        of the content loss.
        """

        def __init__(self, n_users, n_items, dim, layers, A_hat, doc2vec_weights):
            """
            Args:
                n_users: Number of users.
                n_items: Number of items.
                dim: Embedding dimension.
                layers: Number of propagation layers.
                A_hat: Sparse normalised adjacency of shape
                    ``(n_users + n_items, n_users + n_items)``.
                doc2vec_weights: Float tensor of shape ``(n_items, dim)``.

            Raises:
                ValueError: If ``doc2vec_weights`` is not ``(n_items, dim)``.
            """
            super().__init__()
            if tuple(doc2vec_weights.shape) != (n_items, dim):
                raise ValueError(
                    f"doc2vec_weights must have shape ({n_items}, {dim}), "
                    f"got {tuple(doc2vec_weights.shape)}"
                )
            self.n_users = n_users
            self.n_items = n_items
            self.dim = dim
            self.layers = layers
            self.A_hat = A_hat

            # Users: random initialisation
            self.user_emb = nn.Embedding(n_users, dim)
            nn.init.normal_(self.user_emb.weight, std=0.1)

            # Items: initialised from Doc2Vec and trainable (freeze=False).
            # from_pretrained wraps the tensor it is given without copying, so the
            # optimizer would otherwise update the caller's tensor in place; clone first.
            self.item_emb = nn.Embedding.from_pretrained(doc2vec_weights.detach().clone(), freeze=False)

            # Frozen copy used by the content loss (a buffer, never trained)
            self.register_buffer('fixed_doc2vec', doc2vec_weights.clone().detach())

        def get_all_embeddings(self):
            """Propagate embeddings through the graph and average over layers.

            Returns:
                Tuple ``(user_embeddings, item_embeddings)``: the mean of the
                layer-0 embeddings and the outputs of each propagation layer.
            """
            users = self.user_emb.weight
            items = self.item_emb.weight
            all_emb = torch.cat([users, items], dim=0)

            embs = [all_emb]
            for _ in range(self.layers):
                all_emb = torch.sparse.mm(self.A_hat, all_emb)
                embs.append(all_emb)

            out = torch.stack(embs, dim=0).mean(dim=0)
            return out[:self.n_users], out[self.n_users:]

    # =========================================================
    # 7. Loss & Sampling (Hard Negative + Content Loss)
    # =========================================================
    def sample_batch_with_hard_neg(batch_size, user_pos_items, user_hard_neg_items, n_items, hard_prob=0.5):
        """Draw one training batch of (user, positive item, negative item) triples.

        Users are sampled with replacement from the keys of ``user_pos_items``.
        For each user the positive is picked uniformly from that user's positives.
        The negative is, with probability ``hard_prob`` (and only if the user has
        any), one of the user's hard negatives; otherwise a uniformly random item
        index in ``[0, n_items)`` that is not among the user's positives.

        Returns:
            Three LongTensors (users, positives, negatives) on the global ``device``.
        """
        candidate_users = list(user_pos_items.keys())
        users = np.random.choice(candidate_users, size=batch_size)

        pos, neg = [], []
        for u in users:
            liked = user_pos_items[u]
            pos.append(np.random.choice(list(liked)))

            # The coin is only flipped when the user actually has hard negatives.
            use_hard = False
            if u in user_hard_neg_items and len(user_hard_neg_items[u]) > 0:
                use_hard = random.random() < hard_prob

            if use_hard:
                neg.append(np.random.choice(user_hard_neg_items[u]))
                continue

            # Rejection-sample a random item the user has not liked.
            candidate = np.random.randint(0, n_items)
            while candidate in liked:
                candidate = np.random.randint(0, n_items)
            neg.append(candidate)

        user_t = torch.LongTensor(users).to(device)
        pos_t = torch.LongTensor(pos).to(device)
        neg_t = torch.LongTensor(neg).to(device)
        return (user_t, pos_t, neg_t)

    def bpr_loss_with_content(u_f, i_pos_f, i_neg_f, u_0, i_pos_0, i_neg_0, fixed_vec, lambda1, lambda2):
        """Combined objective: BPR ranking loss + L2 regularisation + content loss.

        Args:
            u_f, i_pos_f, i_neg_f: Propagated embeddings, used for the BPR term.
            u_0, i_pos_0, i_neg_0: Layer-0 embeddings, used for regularisation.
            fixed_vec: Frozen Doc2Vec vectors of the positive items (content target).
            lambda1: Weight of the L2 regularisation term.
            lambda2: Weight of the content (MSE) term.

        Returns:
            Scalar tensor ``bpr + lambda1 * reg + lambda2 * content``.
        """
        # Ranking term: softplus of the negated (positive - negative) score gap
        score_gap = (u_f * i_pos_f).sum(dim=1) - (u_f * i_neg_f).sum(dim=1)
        ranking_term = F.softplus(-score_gap).mean()

        # L2 penalty on the layer-0 embeddings, averaged over the batch
        batch = u_0.size(0)
        squared_norms = u_0.norm(2).pow(2) + i_pos_0.norm(2).pow(2) + i_neg_0.norm(2).pow(2)
        l2_term = 0.5 * squared_norms / batch

        # Content term: layer-0 positive item embeddings vs. frozen Doc2Vec vectors
        content_term = F.mse_loss(i_pos_0, fixed_vec)

        return ranking_term + lambda1 * l2_term + lambda2 * content_term

    # =========================================================
    # 8. 평가 함수 (Precision, Recall, NDCG, HitRate)
    # =========================================================
    def ndcg_at_k(rank, k):
        """Discounted gain of one relevant item at 0-based position ``rank``.

        Returns 0.0 when ``rank`` is None or falls outside the top-``k``.
        """
        inside_top_k = rank is not None and rank < k
        return 1.0 / math.log2(rank + 2) if inside_top_k else 0.0

    def evaluate(model, df_eval, k=10, seen_items=None):
        """Compute top-k ranking metrics, averaged over the users in ``df_eval``.

        For each user every item is scored by the dot product of the propagated
        user and item embeddings; items listed for that user in ``seen_items``
        are masked out before the top-k is taken.

        Args:
            model: Object exposing ``eval()`` and ``get_all_embeddings()``.
            df_eval: DataFrame with ``user`` and ``item`` index columns; all of a
                user's rows are treated as that user's target items.
            k: Cut-off of the ranking.
            seen_items: Mapping ``user -> collection of item indices`` to exclude
                from the ranking. Defaults to the notebook-level ``user_pos_items``
                (positive training items only).

        Returns:
            Dict with ``HitRate``, ``Precision``, ``Recall`` and ``NDCG``; all 0.0
            when ``df_eval`` contains no users.
        """
        if seen_items is None:
            seen_items = user_pos_items

        model.eval()
        hits, ndcg, precision, recall, total_users = 0, 0.0, 0.0, 0.0, 0

        with torch.no_grad():
            # Propagate inside no_grad so no autograd graph is built for evaluation.
            users_final, items_final = model.get_all_embeddings()
            # topk raises if asked for more entries than there are items.
            top_n = min(k, items_final.size(0))

            for u_idx, group in df_eval.groupby('user'):
                total_users += 1
                target_items = set(group['item'].values)

                scores = torch.matmul(users_final[u_idx], items_final.t())

                # Mask items already seen in training (positives only by default)
                seen = seen_items.get(u_idx)
                if seen:
                    scores[list(seen)] = -1e9

                _, topk = torch.topk(scores, top_n)
                topk = topk.cpu().tolist()

                num_correct = 0
                dcg, idcg = 0.0, 0.0

                for i, item_id in enumerate(topk):
                    if item_id in target_items:
                        num_correct += 1
                        dcg += 1.0 / np.log2(i + 2)

                # Ideal DCG: all targets (up to k) ranked first
                num_targets = len(target_items)
                for i in range(min(num_targets, k)):
                    idcg += 1.0 / np.log2(i + 2)

                if num_correct > 0: hits += 1
                precision += num_correct / k
                recall += num_correct / num_targets
                if idcg > 0: ndcg += dcg / idcg

        if total_users == 0:
            return {'HitRate': 0.0, 'Precision': 0.0, 'Recall': 0.0, 'NDCG': 0.0}

        return {'HitRate': hits/total_users, 'Precision': precision/total_users, 'Recall': recall/total_users, 'NDCG': ndcg/total_users}

    # =========================================================
    # 9. 학습 실행 (Best Model Selection 적용)
    # =========================================================
    dim = 64
    layers = 3
    batch_size = 1024
    epochs = 50
    lr = 1e-3

    # Loss hyper-parameters
    lambda1 = 1e-5  # L2 regularisation weight (taken from the baseline experiment)
    lambda2 = 0     # Content-loss weight (0 = Doc2Vec init only; e.g. 1e-3 to regularise towards Doc2Vec)

    model = LightGCN_Doc2Vec(n_users, n_items, dim, layers, Adj_Matrix, doc2vec_weights).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Best-model bookkeeping. Starting from -inf guarantees that the first epoch
    # always writes a checkpoint, so the reload after training can never pick up
    # a stale file from an earlier experiment (or fail because none exists).
    best_recall = float('-inf')
    best_epoch = 0
    best_model_path = "best_doc2vec_lightgcn.pt"

    # Batches per epoch follow the number of positive interactions; this does
    # not change between epochs, so it is computed once.
    num_batches = len(train_pos_df) // batch_size + 1

    print(f"\n=== Training Start (Lambda2={lambda2}, Hard Negative Applied) ===")

    for epoch in range(1, epochs+1):
        model.train()
        total_loss = 0

        for _ in range(num_batches):
            # 1. Sample a batch (includes hard negatives)
            users, pos, neg = sample_batch_with_hard_neg(batch_size, user_pos_items, user_hard_neg_items, n_items, hard_prob=0.5)

            # 2. Propagated embeddings (for the BPR term)
            u_final, i_final = model.get_all_embeddings()
            u_f = u_final[users]
            i_pos_f = i_final[pos]
            i_neg_f = i_final[neg]

            # 3. Layer-0 embeddings (for regularisation and content loss)
            u_0 = model.user_emb.weight[users]
            i_pos_0 = model.item_emb.weight[pos]
            i_neg_0 = model.item_emb.weight[neg]

            # 4. Frozen Doc2Vec vectors of the positive items
            fixed_vec = model.fixed_doc2vec[pos]

            # 5. Loss
            loss = bpr_loss_with_content(u_f, i_pos_f, i_neg_f,
                                         u_0, i_pos_0, i_neg_0,
                                         fixed_vec, lambda1, lambda2)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Validate after every epoch
        val_metrics = evaluate(model, val_df, k=10)
        current_recall = val_metrics['Recall']

        print(f"[Epoch {epoch:02d}] Loss: {total_loss/num_batches:.4f} | Val Recall: {current_recall:.4f} | NDCG: {val_metrics['NDCG']:.4f}")

        if current_recall > best_recall:
            best_recall = current_recall
            best_epoch = epoch
            torch.save(model.state_dict(), best_model_path)
            print(f"   >>> Best Model Updated!")

    print(f"\n=== 학습 종료. Best Epoch: {best_epoch}, Best Val Recall: {best_recall:.4f} ===")

    # Final test with the best checkpoint; map_location keeps the reload working
    # when the checkpoint was written on a different device.
    model.load_state_dict(torch.load(best_model_path, map_location=device))
    test_metrics = evaluate(model, test_df, k=10)
    print("\n=== Final Test Result (Best Model) ===")
    print(f"Hit: {test_metrics['HitRate']:.4f}, Prec: {test_metrics['Precision']:.4f}, Recall: {test_metrics['Recall']:.4f}, NDCG: {test_metrics['NDCG']:.4f}")

데이터 로드 완료. User: 671, Item: 3485
전체 Train 데이터 수: 88595
Positive Interactions (Graph용): 46243
Hard Negative Interactions: 11327

=== Doc2Vec Training ===
Doc2Vec Matrix Created. Mapped: 3485/3485
그래프 생성 중 (Positive Edge Only)...
그래프 생성 완료!


  d_inv_sqrt = np.power(rowsum, -0.5).flatten()



=== Training Start (Lambda2=0, Hard Negative Applied) ===
[Epoch 01] Loss: 0.6464 | Val Recall: 0.1036 | NDCG: 0.0558
   >>> Best Model Updated!
[Epoch 02] Loss: 0.5943 | Val Recall: 0.1111 | NDCG: 0.0599
   >>> Best Model Updated!
[Epoch 03] Loss: 0.5369 | Val Recall: 0.1186 | NDCG: 0.0639
   >>> Best Model Updated!
[Epoch 04] Loss: 0.4911 | Val Recall: 0.1201 | NDCG: 0.0647
   >>> Best Model Updated!
[Epoch 05] Loss: 0.4665 | Val Recall: 0.1216 | NDCG: 0.0656
   >>> Best Model Updated!
[Epoch 06] Loss: 0.4447 | Val Recall: 0.1291 | NDCG: 0.0674
   >>> Best Model Updated!
[Epoch 07] Loss: 0.4257 | Val Recall: 0.1336 | NDCG: 0.0687
   >>> Best Model Updated!
[Epoch 08] Loss: 0.4162 | Val Recall: 0.1321 | NDCG: 0.0680
[Epoch 09] Loss: 0.3974 | Val Recall: 0.1366 | NDCG: 0.0699
   >>> Best Model Updated!
[Epoch 10] Loss: 0.3865 | Val Recall: 0.1381 | NDCG: 0.0702
   >>> Best Model Updated!
[Epoch 11] Loss: 0.3791 | Val Recall: 0.1366 | NDCG: 0.0702
[Epoch 12] Loss: 0.3625 | Val Recall: 

**2. Lambda2 = 1e-2 -> (0.01)인 경우**

In [None]:
# =========================================================
# 2. 데이터 로드 (Pre-split Data 사용)
# =========================================================
if not os.path.exists(data_path) or not os.path.exists(movie_final_path):
    print("Error: 데이터 경로를 확인해주세요.")
else:
    # 1. Read the pre-split train / validation / test interaction tables
    train_df = pd.read_csv(f"{data_path}/train.csv")
    val_df   = pd.read_csv(f"{data_path}/val.csv")
    test_df  = pd.read_csv(f"{data_path}/test.csv")

    # 2. Load the mapping dictionaries (user2idx / item2idx)
    # NOTE(review): pickle.load can execute arbitrary code; only load pickles you produced yourself.
    with open(f"{data_path}/user2idx.pkl", 'rb') as f: user2idx = pickle.load(f)
    with open(f"{data_path}/item2idx.pkl", 'rb') as f: item2idx = pickle.load(f)

    # The number of users / items is taken from the size of each mapping
    n_users = len(user2idx)
    n_items = len(item2idx)

    print(f"데이터 로드 완료. User: {n_users}, Item: {n_items}")
    print(f"전체 Train 데이터 수: {len(train_df)}")

    # =========================================================
    # 3. Split the training interactions (Positive vs Hard Negative)
    # =========================================================
    # 1) Positive data (rating >= 4): used for graph edges and as training positives
    train_pos_df = train_df[train_df['rating'] >= 4.0].copy()

    # 2) Hard-negative data (rating <= 2): used as explicit negatives during training
    train_neg_df = train_df[train_df['rating'] <= 2.0].copy()

    print(f"Positive Interactions (Graph용): {len(train_pos_df)}")
    print(f"Hard Negative Interactions: {len(train_neg_df)}")

    # Build per-user lookup dictionaries
    # Positive: set of items each user liked
    user_pos_items = train_pos_df.groupby("user")["item"].apply(set).to_dict()

    # Hard negative: list of items each user disliked (used when sampling)
    user_hard_neg_items = train_neg_df.groupby("user")["item"].apply(list).to_dict()

    # =========================================================
    # 4. Doc2Vec 학습 및 매트릭스 생성 (Feature Extraction)
    # =========================================================
    print("\n=== Doc2Vec Training ===")
    d2v_dim = 64  # must equal the LightGCN embedding size (`dim`) used for training
    movie_df = pd.read_csv(movie_final_path)
    movie_df['overview_clean'] = movie_df['overview_clean'].fillna('unknown')

    # One TaggedDocument per movie, tagged with the raw movieId (as a string);
    # the tag is translated to a dense item index through item2idx below.
    # Zipping the two columns avoids building a Series per row as iterrows() does.
    documents = [TaggedDocument(str(overview).split(), [str(movie_id)])
                 for movie_id, overview in zip(movie_df['movieId'], movie_df['overview_clean'])]

    # workers=1: gensim only reproduces the same vectors for a fixed seed when
    # training is single-threaded; with several workers the thread scheduling
    # makes results differ from run to run.
    d2v_model = Doc2Vec(documents, vector_size=d2v_dim, window=5, min_count=1, workers=1, epochs=20, seed=42)

    # Build the embedding matrix, with rows ordered by item index
    doc2vec_weights = np.zeros((n_items, d2v_dim))
    cnt = 0
    for movie_id, idx in item2idx.items():
        tag = str(movie_id)
        if tag in d2v_model.dv:
            doc2vec_weights[idx] = d2v_model.dv[tag]
            cnt += 1
        else:
            # No document vector for this movie: fall back to small random noise
            doc2vec_weights[idx] = np.random.normal(0, 0.01, d2v_dim)

    doc2vec_weights = torch.FloatTensor(doc2vec_weights).to(device)
    print(f"Doc2Vec Matrix Created. Mapped: {cnt}/{n_items}")


    # Save Doc2Vec embeddings to disk for reuse
    doc2vec_save_path = f"{base_path}/doc2vec_embeddings_64d.pkl"
    with open(doc2vec_save_path, 'wb') as f:
        pickle.dump({
            'embeddings': doc2vec_weights.cpu().numpy(),
            'item2idx': item2idx,
            'n_items': n_items,
            'vector_size': d2v_dim,
            'created_at': str(pd.Timestamp.now())
        }, f)
    print(f"✅ Doc2Vec embeddings saved to: {doc2vec_save_path}")
    # =========================================================
    # 5. LightGCN용 인접행렬 생성 (Positive 데이터만 사용!)
    # =========================================================
    def get_adj_mat(n_users, n_items, pos_df, target_device=None):
        """Build the symmetrically normalised user-item adjacency matrix for LightGCN.

        Only positive interactions (rating >= 4) should be passed in: every row of
        ``pos_df`` becomes an undirected user-item edge.

        Args:
            n_users: Number of user nodes (rows ``0 .. n_users - 1`` of the result).
            n_items: Number of item nodes (rows ``n_users .. n_users + n_items - 1``).
            pos_df: DataFrame with integer ``user`` and ``item`` index columns.
            target_device: Device the returned tensor is moved to. Defaults to the
                notebook-level ``device``.

        Returns:
            Sparse COO float tensor of shape ``(n_users + n_items, n_users + n_items)``
            holding ``D^-1/2 A D^-1/2``. Rows/columns of isolated nodes are all zero.
        """
        user_np = pos_df['user'].values
        item_np = pos_df['item'].values

        R = sp.coo_matrix((np.ones(len(user_np)), (user_np, item_np)), shape=(n_users, n_items))

        # Bipartite block matrix [[0, R], [R^T, 0]]
        top_part = sp.hstack([sp.csr_matrix((n_users, n_users)), R])
        bot_part = sp.hstack([R.T, sp.csr_matrix((n_items, n_items))])
        A = sp.vstack([top_part, bot_part])

        # Compute degree^-1/2 only for connected nodes. Raising a zero degree to
        # -0.5 yields inf and a "divide by zero" RuntimeWarning; isolated nodes
        # simply keep a scaling factor of 0.
        degree = np.asarray(A.sum(axis=1), dtype=np.float64).ravel()
        d_inv_sqrt = np.zeros_like(degree)
        connected = degree > 0
        d_inv_sqrt[connected] = np.power(degree[connected], -0.5)
        d_mat_inv_sqrt = sp.diags(d_inv_sqrt)

        norm_adj = d_mat_inv_sqrt.dot(A).dot(d_mat_inv_sqrt).tocoo()
        indices = torch.from_numpy(np.vstack((norm_adj.row, norm_adj.col)).astype(np.int64))
        values = torch.from_numpy(norm_adj.data.astype(np.float32))

        if target_device is None:
            target_device = device
        return torch.sparse_coo_tensor(indices, values, torch.Size(norm_adj.shape)).to(target_device)

    # Build the normalised adjacency matrix from the positive (rating >= 4) interactions only.
    print("그래프 생성 중 (Positive Edge Only)...")
    Adj_Matrix = get_adj_mat(n_users, n_items, train_pos_df)
    print("그래프 생성 완료!")

    # =========================================================
    # 6. 모델 정의 (LightGCN with Doc2Vec Init)
    # =========================================================
    class LightGCN_Doc2Vec(nn.Module):
        """LightGCN whose item embeddings start from pre-trained Doc2Vec vectors.

        User embeddings are drawn from a normal distribution (std 0.1). Item
        embeddings are initialised from ``doc2vec_weights`` and remain trainable;
        an untouched copy is kept in the ``fixed_doc2vec`` buffer as the target
        of the content loss.
        """

        def __init__(self, n_users, n_items, dim, layers, A_hat, doc2vec_weights):
            """
            Args:
                n_users: Number of users.
                n_items: Number of items.
                dim: Embedding dimension.
                layers: Number of propagation layers.
                A_hat: Sparse normalised adjacency of shape
                    ``(n_users + n_items, n_users + n_items)``.
                doc2vec_weights: Float tensor of shape ``(n_items, dim)``.

            Raises:
                ValueError: If ``doc2vec_weights`` is not ``(n_items, dim)``.
            """
            super().__init__()
            if tuple(doc2vec_weights.shape) != (n_items, dim):
                raise ValueError(
                    f"doc2vec_weights must have shape ({n_items}, {dim}), "
                    f"got {tuple(doc2vec_weights.shape)}"
                )
            self.n_users = n_users
            self.n_items = n_items
            self.dim = dim
            self.layers = layers
            self.A_hat = A_hat

            # Users: random initialisation
            self.user_emb = nn.Embedding(n_users, dim)
            nn.init.normal_(self.user_emb.weight, std=0.1)

            # Items: initialised from Doc2Vec and trainable (freeze=False).
            # from_pretrained wraps the tensor it is given without copying, so the
            # optimizer would otherwise update the caller's tensor in place; clone first.
            self.item_emb = nn.Embedding.from_pretrained(doc2vec_weights.detach().clone(), freeze=False)

            # Frozen copy used by the content loss (a buffer, never trained)
            self.register_buffer('fixed_doc2vec', doc2vec_weights.clone().detach())

        def get_all_embeddings(self):
            """Propagate embeddings through the graph and average over layers.

            Returns:
                Tuple ``(user_embeddings, item_embeddings)``: the mean of the
                layer-0 embeddings and the outputs of each propagation layer.
            """
            users = self.user_emb.weight
            items = self.item_emb.weight
            all_emb = torch.cat([users, items], dim=0)

            embs = [all_emb]
            for _ in range(self.layers):
                all_emb = torch.sparse.mm(self.A_hat, all_emb)
                embs.append(all_emb)

            out = torch.stack(embs, dim=0).mean(dim=0)
            return out[:self.n_users], out[self.n_users:]

    # =========================================================
    # 7. Loss & Sampling (Hard Negative + Content Loss)
    # =========================================================
    def sample_batch_with_hard_neg(batch_size, user_pos_items, user_hard_neg_items, n_items, hard_prob=0.5):
        """Draw one training batch of (user, positive item, negative item) triples.

        Users are sampled with replacement from the keys of ``user_pos_items``.
        For each user the positive is picked uniformly from that user's positives.
        The negative is, with probability ``hard_prob`` (and only if the user has
        any), one of the user's hard negatives; otherwise a uniformly random item
        index in ``[0, n_items)`` that is not among the user's positives.

        Returns:
            Three LongTensors (users, positives, negatives) on the global ``device``.
        """
        candidate_users = list(user_pos_items.keys())
        users = np.random.choice(candidate_users, size=batch_size)

        pos, neg = [], []
        for u in users:
            liked = user_pos_items[u]
            pos.append(np.random.choice(list(liked)))

            # The coin is only flipped when the user actually has hard negatives.
            use_hard = False
            if u in user_hard_neg_items and len(user_hard_neg_items[u]) > 0:
                use_hard = random.random() < hard_prob

            if use_hard:
                neg.append(np.random.choice(user_hard_neg_items[u]))
                continue

            # Rejection-sample a random item the user has not liked.
            candidate = np.random.randint(0, n_items)
            while candidate in liked:
                candidate = np.random.randint(0, n_items)
            neg.append(candidate)

        user_t = torch.LongTensor(users).to(device)
        pos_t = torch.LongTensor(pos).to(device)
        neg_t = torch.LongTensor(neg).to(device)
        return (user_t, pos_t, neg_t)

    def bpr_loss_with_content(u_f, i_pos_f, i_neg_f, u_0, i_pos_0, i_neg_0, fixed_vec, lambda1, lambda2):
        """Combined objective: BPR ranking loss + L2 regularisation + content loss.

        Args:
            u_f, i_pos_f, i_neg_f: Propagated embeddings, used for the BPR term.
            u_0, i_pos_0, i_neg_0: Layer-0 embeddings, used for regularisation.
            fixed_vec: Frozen Doc2Vec vectors of the positive items (content target).
            lambda1: Weight of the L2 regularisation term.
            lambda2: Weight of the content (MSE) term.

        Returns:
            Scalar tensor ``bpr + lambda1 * reg + lambda2 * content``.
        """
        # Ranking term: softplus of the negated (positive - negative) score gap
        score_gap = (u_f * i_pos_f).sum(dim=1) - (u_f * i_neg_f).sum(dim=1)
        ranking_term = F.softplus(-score_gap).mean()

        # L2 penalty on the layer-0 embeddings, averaged over the batch
        batch = u_0.size(0)
        squared_norms = u_0.norm(2).pow(2) + i_pos_0.norm(2).pow(2) + i_neg_0.norm(2).pow(2)
        l2_term = 0.5 * squared_norms / batch

        # Content term: layer-0 positive item embeddings vs. frozen Doc2Vec vectors
        content_term = F.mse_loss(i_pos_0, fixed_vec)

        return ranking_term + lambda1 * l2_term + lambda2 * content_term

    # =========================================================
    # 8. 평가 함수 (Precision, Recall, NDCG, HitRate)
    # =========================================================
    def ndcg_at_k(rank, k):
        """Discounted gain of one relevant item at 0-based position ``rank``.

        Returns 0.0 when ``rank`` is None or falls outside the top-``k``.
        """
        inside_top_k = rank is not None and rank < k
        return 1.0 / math.log2(rank + 2) if inside_top_k else 0.0

    def evaluate(model, df_eval, k=10, seen_items=None):
        """Compute top-k ranking metrics, averaged over the users in ``df_eval``.

        For each user every item is scored by the dot product of the propagated
        user and item embeddings; items listed for that user in ``seen_items``
        are masked out before the top-k is taken.

        Args:
            model: Object exposing ``eval()`` and ``get_all_embeddings()``.
            df_eval: DataFrame with ``user`` and ``item`` index columns; all of a
                user's rows are treated as that user's target items.
            k: Cut-off of the ranking.
            seen_items: Mapping ``user -> collection of item indices`` to exclude
                from the ranking. Defaults to the notebook-level ``user_pos_items``
                (positive training items only).

        Returns:
            Dict with ``HitRate``, ``Precision``, ``Recall`` and ``NDCG``; all 0.0
            when ``df_eval`` contains no users.
        """
        if seen_items is None:
            seen_items = user_pos_items

        model.eval()
        hits, ndcg, precision, recall, total_users = 0, 0.0, 0.0, 0.0, 0

        with torch.no_grad():
            # Propagate inside no_grad so no autograd graph is built for evaluation.
            users_final, items_final = model.get_all_embeddings()
            # topk raises if asked for more entries than there are items.
            top_n = min(k, items_final.size(0))

            for u_idx, group in df_eval.groupby('user'):
                total_users += 1
                target_items = set(group['item'].values)

                scores = torch.matmul(users_final[u_idx], items_final.t())

                # Mask items already seen in training (positives only by default)
                seen = seen_items.get(u_idx)
                if seen:
                    scores[list(seen)] = -1e9

                _, topk = torch.topk(scores, top_n)
                topk = topk.cpu().tolist()

                num_correct = 0
                dcg, idcg = 0.0, 0.0

                for i, item_id in enumerate(topk):
                    if item_id in target_items:
                        num_correct += 1
                        dcg += 1.0 / np.log2(i + 2)

                # Ideal DCG: all targets (up to k) ranked first
                num_targets = len(target_items)
                for i in range(min(num_targets, k)):
                    idcg += 1.0 / np.log2(i + 2)

                if num_correct > 0: hits += 1
                precision += num_correct / k
                recall += num_correct / num_targets
                if idcg > 0: ndcg += dcg / idcg

        if total_users == 0:
            return {'HitRate': 0.0, 'Precision': 0.0, 'Recall': 0.0, 'NDCG': 0.0}

        return {'HitRate': hits/total_users, 'Precision': precision/total_users, 'Recall': recall/total_users, 'NDCG': ndcg/total_users}

    # =========================================================
    # 9. 학습 실행 (Best Model Selection 적용)
    # =========================================================
    dim = 64
    layers = 3
    batch_size = 1024
    epochs = 25
    lr = 1e-3

    # Loss hyper-parameters
    lambda1 = 1e-5  # L2 regularisation weight (taken from the baseline experiment)
    lambda2 = 1e-2  # Content-loss weight (0 = Doc2Vec init only; e.g. 1e-3 to regularise towards Doc2Vec)

    model = LightGCN_Doc2Vec(n_users, n_items, dim, layers, Adj_Matrix, doc2vec_weights).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Best-model bookkeeping. Starting from -inf guarantees that the first epoch
    # always writes a checkpoint, so the reload after training can never pick up
    # a stale file from an earlier experiment (or fail because none exists).
    best_recall = float('-inf')
    best_epoch = 0
    best_model_path = "best_doc2vec_lightgcn.pt"

    # Batches per epoch follow the number of positive interactions; this does
    # not change between epochs, so it is computed once.
    num_batches = len(train_pos_df) // batch_size + 1

    print(f"\n=== Training Start (Lambda2={lambda2}, Hard Negative Applied) ===")

    for epoch in range(1, epochs+1):
        model.train()
        total_loss = 0

        for _ in range(num_batches):
            # 1. Sample a batch (includes hard negatives)
            users, pos, neg = sample_batch_with_hard_neg(batch_size, user_pos_items, user_hard_neg_items, n_items, hard_prob=0.5)

            # 2. Propagated embeddings (for the BPR term)
            u_final, i_final = model.get_all_embeddings()
            u_f = u_final[users]
            i_pos_f = i_final[pos]
            i_neg_f = i_final[neg]

            # 3. Layer-0 embeddings (for regularisation and content loss)
            u_0 = model.user_emb.weight[users]
            i_pos_0 = model.item_emb.weight[pos]
            i_neg_0 = model.item_emb.weight[neg]

            # 4. Frozen Doc2Vec vectors of the positive items
            fixed_vec = model.fixed_doc2vec[pos]

            # 5. Loss
            loss = bpr_loss_with_content(u_f, i_pos_f, i_neg_f,
                                         u_0, i_pos_0, i_neg_0,
                                         fixed_vec, lambda1, lambda2)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Validate after every epoch
        val_metrics = evaluate(model, val_df, k=10)
        current_recall = val_metrics['Recall']

        print(f"[Epoch {epoch:02d}] Loss: {total_loss/num_batches:.4f} | Val Recall: {current_recall:.4f} | NDCG: {val_metrics['NDCG']:.4f}")

        if current_recall > best_recall:
            best_recall = current_recall
            best_epoch = epoch
            torch.save(model.state_dict(), best_model_path)
            print(f"   >>> Best Model Updated!")

    print(f"\n=== 학습 종료. Best Epoch: {best_epoch}, Best Val Recall: {best_recall:.4f} ===")

    # Final test with the best checkpoint; map_location keeps the reload working
    # when the checkpoint was written on a different device.
    model.load_state_dict(torch.load(best_model_path, map_location=device))
    test_metrics = evaluate(model, test_df, k=10)
    print("\n=== Final Test Result (Best Model) ===")
    print(f"Hit: {test_metrics['HitRate']:.4f}, Prec: {test_metrics['Precision']:.4f}, Recall: {test_metrics['Recall']:.4f}, NDCG: {test_metrics['NDCG']:.4f}")

데이터 로드 완료. User: 671, Item: 3485
전체 Train 데이터 수: 88595
Positive Interactions (Graph용): 46243
Hard Negative Interactions: 11327

=== Doc2Vec Training ===
Doc2Vec Matrix Created. Mapped: 3485/3485
그래프 생성 중 (Positive Edge Only)...
그래프 생성 완료!

=== Training Start (Lambda2=0.01, Hard Negative Applied) ===


  d_inv_sqrt = np.power(rowsum, -0.5).flatten()


[Epoch 01] Loss: 0.6469 | Val Recall: 0.0991 | NDCG: 0.0544
   >>> Best Model Updated!
[Epoch 02] Loss: 0.5967 | Val Recall: 0.1156 | NDCG: 0.0600
   >>> Best Model Updated!
[Epoch 03] Loss: 0.5375 | Val Recall: 0.1186 | NDCG: 0.0629
   >>> Best Model Updated!
[Epoch 04] Loss: 0.4926 | Val Recall: 0.1231 | NDCG: 0.0648
   >>> Best Model Updated!
[Epoch 05] Loss: 0.4666 | Val Recall: 0.1276 | NDCG: 0.0674
   >>> Best Model Updated!
[Epoch 06] Loss: 0.4460 | Val Recall: 0.1306 | NDCG: 0.0682
   >>> Best Model Updated!
[Epoch 07] Loss: 0.4295 | Val Recall: 0.1336 | NDCG: 0.0688
   >>> Best Model Updated!
[Epoch 08] Loss: 0.4180 | Val Recall: 0.1351 | NDCG: 0.0691
   >>> Best Model Updated!
[Epoch 09] Loss: 0.4000 | Val Recall: 0.1366 | NDCG: 0.0697
   >>> Best Model Updated!
[Epoch 10] Loss: 0.3931 | Val Recall: 0.1411 | NDCG: 0.0726
   >>> Best Model Updated!
[Epoch 11] Loss: 0.3746 | Val Recall: 0.1411 | NDCG: 0.0722
[Epoch 12] Loss: 0.3641 | Val Recall: 0.1336 | NDCG: 0.0711
[Epoch 13]

**3. Lambda2 = 1e-3 -> (0.001)인 경우**

In [None]:
# =========================================================
# 2. 데이터 로드 (Pre-split Data 사용)
# =========================================================
if not os.path.exists(data_path) or not os.path.exists(movie_final_path):
    print("Error: 데이터 경로를 확인해주세요.")
else:
    # 1. Read the pre-split train / validation / test interaction tables
    train_df = pd.read_csv(f"{data_path}/train.csv")
    val_df   = pd.read_csv(f"{data_path}/val.csv")
    test_df  = pd.read_csv(f"{data_path}/test.csv")

    # 2. Load the mapping dictionaries (user2idx / item2idx)
    # NOTE(review): pickle.load can execute arbitrary code; only load pickles you produced yourself.
    with open(f"{data_path}/user2idx.pkl", 'rb') as f: user2idx = pickle.load(f)
    with open(f"{data_path}/item2idx.pkl", 'rb') as f: item2idx = pickle.load(f)

    # The number of users / items is taken from the size of each mapping
    n_users = len(user2idx)
    n_items = len(item2idx)

    print(f"데이터 로드 완료. User: {n_users}, Item: {n_items}")
    print(f"전체 Train 데이터 수: {len(train_df)}")

    # =========================================================
    # 3. Split the training interactions (Positive vs Hard Negative)
    # =========================================================
    # 1) Positive data (rating >= 4): used for graph edges and as training positives
    train_pos_df = train_df[train_df['rating'] >= 4.0].copy()

    # 2) Hard-negative data (rating <= 2): used as explicit negatives during training
    train_neg_df = train_df[train_df['rating'] <= 2.0].copy()

    print(f"Positive Interactions (Graph용): {len(train_pos_df)}")
    print(f"Hard Negative Interactions: {len(train_neg_df)}")

    # Build per-user lookup dictionaries
    # Positive: set of items each user liked
    user_pos_items = train_pos_df.groupby("user")["item"].apply(set).to_dict()

    # Hard negative: list of items each user disliked (used when sampling)
    user_hard_neg_items = train_neg_df.groupby("user")["item"].apply(list).to_dict()

    # =========================================================
    # 4. Doc2Vec 학습 및 매트릭스 생성 (Feature Extraction)
    # =========================================================
    print("\n=== Doc2Vec Training ===")
    d2v_dim = 64  # must equal the LightGCN embedding size (`dim`) used for training
    movie_df = pd.read_csv(movie_final_path)
    movie_df['overview_clean'] = movie_df['overview_clean'].fillna('unknown')

    # One TaggedDocument per movie, tagged with the raw movieId (as a string);
    # the tag is translated to a dense item index through item2idx below.
    # Zipping the two columns avoids building a Series per row as iterrows() does.
    documents = [TaggedDocument(str(overview).split(), [str(movie_id)])
                 for movie_id, overview in zip(movie_df['movieId'], movie_df['overview_clean'])]

    # workers=1: gensim only reproduces the same vectors for a fixed seed when
    # training is single-threaded; with several workers the thread scheduling
    # makes results differ from run to run.
    d2v_model = Doc2Vec(documents, vector_size=d2v_dim, window=5, min_count=1, workers=1, epochs=20, seed=42)

    # Build the embedding matrix, with rows ordered by item index
    doc2vec_weights = np.zeros((n_items, d2v_dim))
    cnt = 0
    for movie_id, idx in item2idx.items():
        tag = str(movie_id)
        if tag in d2v_model.dv:
            doc2vec_weights[idx] = d2v_model.dv[tag]
            cnt += 1
        else:
            # No document vector for this movie: fall back to small random noise
            doc2vec_weights[idx] = np.random.normal(0, 0.01, d2v_dim)

    doc2vec_weights = torch.FloatTensor(doc2vec_weights).to(device)
    print(f"Doc2Vec Matrix Created. Mapped: {cnt}/{n_items}")


    # Save Doc2Vec embeddings to disk for reuse
    doc2vec_save_path = f"{base_path}/doc2vec_embeddings_64d.pkl"
    with open(doc2vec_save_path, 'wb') as f:
        pickle.dump({
            'embeddings': doc2vec_weights.cpu().numpy(),
            'item2idx': item2idx,
            'n_items': n_items,
            'vector_size': d2v_dim,
            'created_at': str(pd.Timestamp.now())
        }, f)
    print(f"✅ Doc2Vec embeddings saved to: {doc2vec_save_path}")
    # =========================================================
    # 5. LightGCN용 인접행렬 생성 (Positive 데이터만 사용!)
    # =========================================================
    def get_adj_mat(n_users, n_items, pos_df):
        """Build the symmetrically normalised user-item adjacency matrix for LightGCN.

        Nodes are ordered users first (0 .. n_users-1), then items
        (n_users .. n_users+n_items-1). The result is D^-1/2 * A * D^-1/2 where
        A = [[0, R], [R^T, 0]] and R is the interaction matrix built from `pos_df`.

        Args:
            n_users: number of user nodes.
            n_items: number of item nodes.
            pos_df: DataFrame with integer 'user' and 'item' index columns; the
                caller is expected to pass only positive (rating >= 4) rows.

        Returns:
            A torch sparse COO float tensor of shape
            (n_users + n_items, n_users + n_items), moved to the module-level `device`.
        """
        user_np = pos_df['user'].values
        item_np = pos_df['item'].values

        R = sp.coo_matrix((np.ones(len(user_np)), (user_np, item_np)), shape=(n_users, n_items))

        top_part = sp.hstack([sp.csr_matrix((n_users, n_users)), R])
        bot_part = sp.hstack([R.T, sp.csr_matrix((n_items, n_items))])
        A = sp.vstack([top_part, bot_part])

        # Invert only non-zero degrees: isolated nodes keep a 0 scale factor
        # instead of triggering a divide-by-zero RuntimeWarning and an inf patch-up.
        degree = np.asarray(A.sum(axis=1), dtype=np.float64).ravel()
        d_inv_sqrt = np.zeros_like(degree)
        connected = degree > 0
        d_inv_sqrt[connected] = np.power(degree[connected], -0.5)
        d_mat_inv_sqrt = sp.diags(d_inv_sqrt)

        norm_adj = d_mat_inv_sqrt.dot(A).dot(d_mat_inv_sqrt).tocoo()
        indices = torch.from_numpy(np.vstack((norm_adj.row, norm_adj.col)).astype(np.int64))
        values = torch.from_numpy(norm_adj.data.astype(np.float32))

        return torch.sparse_coo_tensor(indices, values, torch.Size(norm_adj.shape)).to(device)

    print("그래프 생성 중 (Positive Edge Only)...")
    Adj_Matrix = get_adj_mat(n_users, n_items, train_pos_df)
    print("그래프 생성 완료!")

    # =========================================================
    # 6. 모델 정의 (LightGCN with Doc2Vec Init)
    # =========================================================
    class LightGCN_Doc2Vec(nn.Module):
        """LightGCN whose item embeddings start from pretrained Doc2Vec vectors.

        Args:
            n_users: number of users.
            n_items: number of items.
            dim: embedding dimension; must equal the width of `doc2vec_weights`.
            layers: number of propagation steps.
            A_hat: sparse normalised adjacency of shape
                (n_users + n_items, n_users + n_items), users first.
            doc2vec_weights: float tensor of shape (n_items, dim).

        Raises:
            ValueError: if `doc2vec_weights` is not of shape (n_items, dim).
        """

        def __init__(self, n_users, n_items, dim, layers, A_hat, doc2vec_weights):
            super().__init__()
            if tuple(doc2vec_weights.shape) != (n_items, dim):
                raise ValueError(
                    f"doc2vec_weights has shape {tuple(doc2vec_weights.shape)}, "
                    f"expected ({n_items}, {dim})"
                )
            self.n_users = n_users
            self.n_items = n_items
            self.dim = dim
            self.layers = layers
            self.A_hat = A_hat

            # User embeddings: random normal initialisation.
            self.user_emb = nn.Embedding(n_users, dim)
            nn.init.normal_(self.user_emb.weight, std=0.1)

            # Trainable item embeddings initialised from Doc2Vec. from_pretrained
            # wraps the tensor it is given without copying, so pass a private
            # clone; otherwise optimiser steps would overwrite the caller's tensor.
            self.item_emb = nn.Embedding.from_pretrained(doc2vec_weights.detach().clone(), freeze=False)

            # Frozen copy of the Doc2Vec vectors, used as the content-loss target.
            self.register_buffer('fixed_doc2vec', doc2vec_weights.detach().clone())

        def get_all_embeddings(self):
            """Propagate embeddings through the graph and average all layers.

            Returns:
                (user_embeddings, item_embeddings): the mean over the initial
                embeddings and each of the `layers` propagated versions.
            """
            users = self.user_emb.weight
            items = self.item_emb.weight
            all_emb = torch.cat([users, items], dim=0)

            embs = [all_emb]
            for _ in range(self.layers):
                all_emb = torch.sparse.mm(self.A_hat, all_emb)
                embs.append(all_emb)

            out = torch.stack(embs, dim=0).mean(dim=0)
            return out[:self.n_users], out[self.n_users:]

    # =========================================================
    # 7. Loss & Sampling (Hard Negative + Content Loss)
    # =========================================================
    def sample_batch_with_hard_neg(batch_size, user_pos_items, user_hard_neg_items, n_items, hard_prob=0.5):
        """Draw one BPR batch of (user, positive item, negative item) triples.

        Users are drawn with replacement from the keys of `user_pos_items`. For
        each user one positive is drawn from the user's positive set. The
        negative is, with probability `hard_prob`, one of the user's hard
        negatives (when the user has any); otherwise it is a uniformly random
        item index in [0, n_items) that is not among the user's positives.

        Note: the random-negative search loops until it finds a non-positive
        item, so it does not terminate for a user whose positives cover every item.

        Returns:
            Three LongTensors (users, pos, neg) on the module-level `device`.
        """
        candidate_users = list(user_pos_items.keys())
        users = np.random.choice(candidate_users, size=batch_size)

        pos, neg = [], []
        for uid in users:
            liked = user_pos_items[uid]
            pos.append(np.random.choice(list(liked)))

            # The coin is flipped only when the user actually has hard negatives.
            has_hard = (uid in user_hard_neg_items) and (len(user_hard_neg_items[uid]) > 0)
            if has_hard and random.random() < hard_prob:
                neg.append(np.random.choice(user_hard_neg_items[uid]))
                continue

            # Rejection sampling of a random item outside the positive set.
            candidate = np.random.randint(0, n_items)
            while candidate in liked:
                candidate = np.random.randint(0, n_items)
            neg.append(candidate)

        users_t = torch.LongTensor(users).to(device)
        pos_t = torch.LongTensor(pos).to(device)
        neg_t = torch.LongTensor(neg).to(device)
        return (users_t, pos_t, neg_t)

    def bpr_loss_with_content(u_f, i_pos_f, i_neg_f, u_0, i_pos_0, i_neg_0, fixed_vec, lambda1, lambda2):
        """Combine BPR ranking loss, L2 regularisation and a Doc2Vec content loss.

        Args:
            u_f, i_pos_f, i_neg_f: propagated embeddings, used for the BPR term.
            u_0, i_pos_0, i_neg_0: initial (layer-0) embeddings, used for L2.
            fixed_vec: frozen Doc2Vec vectors of the positive items.
            lambda1: weight of the L2 term.
            lambda2: weight of the content term.

        Returns:
            Scalar tensor: bpr + lambda1 * reg + lambda2 * content.
        """
        # Ranking term: softplus of the negated (positive - negative) score margin.
        margin = (u_f * i_pos_f).sum(dim=1) - (u_f * i_neg_f).sum(dim=1)
        ranking_term = F.softplus(-margin).mean()

        # L2 term: half the summed squared norms, divided by the batch size.
        user_sq = u_0.norm(2).pow(2)
        pos_sq = i_pos_0.norm(2).pow(2)
        neg_sq = i_neg_0.norm(2).pow(2)
        l2_term = 0.5 * (user_sq + pos_sq + neg_sq) / u_0.size(0)

        # Content term: MSE between positive item embeddings and their Doc2Vec vectors.
        content_term = F.mse_loss(i_pos_0, fixed_vec)

        return ranking_term + lambda1 * l2_term + lambda2 * content_term

    # =========================================================
    # 8. 평가 함수 (Precision, Recall, NDCG, HitRate)
    # =========================================================
    def ndcg_at_k(rank, k):
        """Return the gain 1 / log2(rank + 2) for a 0-based `rank`, or 0.0 when
        `rank` is None or not below the cutoff `k`."""
        outside_cutoff = rank is None or rank >= k
        return 0.0 if outside_cutoff else 1.0 / math.log2(rank + 2)

    def evaluate(model, df_eval, k=10):
        """Compute top-k ranking metrics averaged over the users in `df_eval`.

        For every user the scores of all items are computed, items in the
        module-level `user_pos_items` (training positives) are masked out, and
        the top-k remaining items are compared with the user's items in `df_eval`.

        Args:
            model: object exposing `eval()` and `get_all_embeddings()` returning
                (user_embeddings, item_embeddings).
            df_eval: DataFrame with 'user' and 'item' index columns.
            k: cutoff. If the catalogue has fewer than `k` items, all items are
                ranked, but Precision is still divided by `k`.

        Returns:
            dict with 'HitRate', 'Precision', 'Recall' and 'NDCG'; all 0.0 when
            `df_eval` contains no users.
        """
        model.eval()
        hits, total_users = 0, 0
        ndcg, precision, recall = 0.0, 0.0, 0.0

        # Position discounts 1 / log2(rank + 2), shared by DCG and IDCG.
        discounts = [1.0 / np.log2(rank + 2) for rank in range(k)]

        with torch.no_grad():
            # Propagate inside no_grad so evaluation does not build an autograd graph.
            users_final, items_final = model.get_all_embeddings()
            # topk raises when asked for more entries than exist.
            top_n = min(k, items_final.size(0))

            for u_idx, group in df_eval.groupby('user'):
                total_users += 1
                target_items = set(group['item'].values)

                scores = torch.matmul(users_final[u_idx], items_final.t())

                # Mask items the user already liked in training (positives only).
                seen = user_pos_items.get(u_idx)
                if seen:
                    scores[list(seen)] = -1e9

                topk = torch.topk(scores, top_n).indices.cpu().tolist()

                hit_ranks = [rank for rank, item_id in enumerate(topk) if item_id in target_items]
                num_correct = len(hit_ranks)
                num_targets = len(target_items)

                dcg = sum(discounts[rank] for rank in hit_ranks)
                idcg = sum(discounts[:min(num_targets, k)])

                if num_correct > 0:
                    hits += 1
                precision += num_correct / k
                recall += num_correct / num_targets
                if idcg > 0:
                    ndcg += dcg / idcg

        if total_users == 0:
            # Nothing to average over; avoid ZeroDivisionError.
            return {'HitRate': 0.0, 'Precision': 0.0, 'Recall': 0.0, 'NDCG': 0.0}

        return {'HitRate': hits/total_users, 'Precision': precision/total_users, 'Recall': recall/total_users, 'NDCG': ndcg/total_users}

    # =========================================================
    # 9. 학습 실행 (Best Model Selection 적용)
    # =========================================================
    dim = 64
    layers = 3
    batch_size = 1024
    epochs = 25
    lr = 1e-3

    # Hyperparameters
    lambda1 = 1e-5  # L2 regularisation weight (taken from the baseline experiment)
    lambda2 = 1e-3  # Content-loss weight (0 = Doc2Vec used for initialisation only)

    model = LightGCN_Doc2Vec(n_users, n_items, dim, layers, Adj_Matrix, doc2vec_weights).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Best-model bookkeeping (selected by validation Recall@10)
    best_recall = 0.0
    best_epoch = 0
    best_model_path = "best_doc2vec_lightgcn.pt"

    print(f"\n=== Training Start (Lambda2={lambda2}, Hard Negative Applied) ===")

    # Batches per epoch are derived from the number of positive interactions;
    # the value does not change between epochs, so compute it once.
    num_batches = len(train_pos_df) // batch_size + 1

    for epoch in range(1, epochs+1):
        model.train()
        total_loss = 0

        for _ in range(num_batches):
            # 1. Sample (user, positive, negative) triples, including hard negatives
            users, pos, neg = sample_batch_with_hard_neg(batch_size, user_pos_items, user_hard_neg_items, n_items, hard_prob=0.5)

            # 2. Propagated embeddings (BPR term)
            u_final, i_final = model.get_all_embeddings()
            u_f = u_final[users]
            i_pos_f = i_final[pos]
            i_neg_f = i_final[neg]

            # 3. Layer-0 embeddings (regularisation and content terms)
            u_0 = model.user_emb.weight[users]
            i_pos_0 = model.item_emb.weight[pos]
            i_neg_0 = model.item_emb.weight[neg]

            # 4. Frozen Doc2Vec vectors of the positive items
            fixed_vec = model.fixed_doc2vec[pos]

            # 5. Loss
            loss = bpr_loss_with_content(u_f, i_pos_f, i_neg_f,
                                         u_0, i_pos_0, i_neg_0,
                                         fixed_vec, lambda1, lambda2)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Validate after every epoch
        val_metrics = evaluate(model, val_df, k=10)
        current_recall = val_metrics['Recall']

        print(f"[Epoch {epoch:02d}] Loss: {total_loss/num_batches:.4f} | Val Recall: {current_recall:.4f} | NDCG: {val_metrics['NDCG']:.4f}")

        if current_recall > best_recall:
            best_recall = current_recall
            best_epoch = epoch
            torch.save(model.state_dict(), best_model_path)
            print(f"   >>> Best Model Updated!")

    print(f"\n=== 학습 종료. Best Epoch: {best_epoch}, Best Val Recall: {best_recall:.4f} ===")

    # Final test with the best checkpoint. The checkpoint file name is shared
    # between experiments, so only load it when THIS run actually wrote it;
    # otherwise a stale file from an earlier run could be evaluated silently.
    if best_epoch > 0:
        model.load_state_dict(torch.load(best_model_path, map_location=device))
    else:
        print("Warning: validation recall never improved; evaluating last-epoch weights.")
    test_metrics = evaluate(model, test_df, k=10)
    print("\n=== Final Test Result (Best Model) ===")
    print(f"Hit: {test_metrics['HitRate']:.4f}, Prec: {test_metrics['Precision']:.4f}, Recall: {test_metrics['Recall']:.4f}, NDCG: {test_metrics['NDCG']:.4f}")

데이터 로드 완료. User: 671, Item: 3485
전체 Train 데이터 수: 88595
Positive Interactions (Graph용): 46243
Hard Negative Interactions: 11327

=== Doc2Vec Training ===
Doc2Vec Matrix Created. Mapped: 3485/3485
그래프 생성 중 (Positive Edge Only)...
그래프 생성 완료!

=== Training Start (Lambda2=0.001, Hard Negative Applied) ===


  d_inv_sqrt = np.power(rowsum, -0.5).flatten()


[Epoch 01] Loss: 0.6468 | Val Recall: 0.1066 | NDCG: 0.0567
   >>> Best Model Updated!
[Epoch 02] Loss: 0.5923 | Val Recall: 0.1096 | NDCG: 0.0588
   >>> Best Model Updated!
[Epoch 03] Loss: 0.5354 | Val Recall: 0.1186 | NDCG: 0.0636
   >>> Best Model Updated!
[Epoch 04] Loss: 0.4913 | Val Recall: 0.1216 | NDCG: 0.0646
   >>> Best Model Updated!
[Epoch 05] Loss: 0.4658 | Val Recall: 0.1246 | NDCG: 0.0654
   >>> Best Model Updated!
[Epoch 06] Loss: 0.4458 | Val Recall: 0.1276 | NDCG: 0.0665
   >>> Best Model Updated!
[Epoch 07] Loss: 0.4306 | Val Recall: 0.1336 | NDCG: 0.0681
   >>> Best Model Updated!
[Epoch 08] Loss: 0.4191 | Val Recall: 0.1366 | NDCG: 0.0689
   >>> Best Model Updated!
[Epoch 09] Loss: 0.4034 | Val Recall: 0.1351 | NDCG: 0.0683
[Epoch 10] Loss: 0.3969 | Val Recall: 0.1366 | NDCG: 0.0689
[Epoch 11] Loss: 0.3819 | Val Recall: 0.1396 | NDCG: 0.0699
   >>> Best Model Updated!
[Epoch 12] Loss: 0.3703 | Val Recall: 0.1351 | NDCG: 0.0696
[Epoch 13] Loss: 0.3629 | Val Recall:

**4. lambda2 = 1e-4**

In [None]:
# =========================================================
# 2. 데이터 로드 (Pre-split Data 사용)
# =========================================================
if not os.path.exists(data_path) or not os.path.exists(movie_final_path):
    print("Error: 데이터 경로를 확인해주세요.")
else:
    # 1. 데이터 읽기
    train_df = pd.read_csv(f"{data_path}/train.csv")
    val_df   = pd.read_csv(f"{data_path}/val.csv")
    test_df  = pd.read_csv(f"{data_path}/test.csv")

    # 2. 매핑 정보 로드
    with open(f"{data_path}/user2idx.pkl", 'rb') as f: user2idx = pickle.load(f)
    with open(f"{data_path}/item2idx.pkl", 'rb') as f: item2idx = pickle.load(f)

    n_users = len(user2idx)
    n_items = len(item2idx)

    print(f"데이터 로드 완료. User: {n_users}, Item: {n_items}")
    print(f"전체 Train 데이터 수: {len(train_df)}")

    # =========================================================
    # 3. Split interactions (positive vs. hard negative)
    # =========================================================
    # Ratings >= 4 are positives: they define graph edges and BPR positives.
    train_pos_df = train_df[train_df['rating'] >= 4.0].copy()

    # Ratings <= 2 are explicit dislikes, used as hard negatives when sampling.
    train_neg_df = train_df[train_df['rating'] <= 2.0].copy()

    print(f"Positive Interactions (Graph용): {len(train_pos_df)}")
    print(f"Hard Negative Interactions: {len(train_neg_df)}")

    # user -> set of liked items (membership tests during sampling/evaluation)
    user_pos_items = train_pos_df.groupby("user")["item"].apply(set).to_dict()

    # user -> list of disliked items (drawn from during hard-negative sampling)
    user_hard_neg_items = train_neg_df.groupby("user")["item"].apply(list).to_dict()

    # =========================================================
    # 4. Doc2Vec training and item feature matrix
    # =========================================================
    print("\n=== Doc2Vec Training ===")
    movie_df = pd.read_csv(movie_final_path)
    movie_df['overview_clean'] = movie_df['overview_clean'].fillna('unknown')

    # Doc2Vec vector size; kept equal to the LightGCN embedding dimension (64).
    d2v_dim = 64

    # One TaggedDocument per movie, tagged with the original movieId as a string
    # so it can be looked up through item2idx below.
    documents = [TaggedDocument(str(overview).split(), [str(raw_movie_id)])
                 for raw_movie_id, overview in zip(movie_df['movieId'], movie_df['overview_clean'])]

    # workers=1: gensim only guarantees a reproducible run for a fixed `seed`
    # when training is single-threaded (multi-worker runs suffer thread-ordering jitter).
    d2v_model = Doc2Vec(documents, vector_size=d2v_dim, window=5, min_count=1, workers=1, epochs=20, seed=42)

    # Row `idx` of the matrix holds the vector of the movie mapped to that item index.
    doc2vec_weights = np.zeros((n_items, d2v_dim), dtype=np.float32)
    cnt = 0
    for movie_id, idx in item2idx.items():
        tag = str(movie_id)
        if tag in d2v_model.dv:
            doc2vec_weights[idx] = d2v_model.dv[tag]
            cnt += 1
        else:
            # No document for this movie: fall back to small random noise.
            doc2vec_weights[idx] = np.random.normal(0, 0.01, d2v_dim)

    doc2vec_weights = torch.from_numpy(doc2vec_weights).to(device)
    print(f"Doc2Vec Matrix Created. Mapped: {cnt}/{n_items}")


    # Save Doc2Vec embeddings to disk for reuse
    doc2vec_save_path = f"{base_path}/doc2vec_embeddings_64d.pkl"
    with open(doc2vec_save_path, 'wb') as f:
        pickle.dump({
            'embeddings': doc2vec_weights.cpu().numpy(),
            'item2idx': item2idx,
            'n_items': n_items,
            'vector_size': d2v_dim,
            'created_at': str(pd.Timestamp.now())
        }, f)
    print(f"✅ Doc2Vec embeddings saved to: {doc2vec_save_path}")
    # =========================================================
    # 5. LightGCN용 인접행렬 생성 (Positive 데이터만 사용!)
    # =========================================================
    def get_adj_mat(n_users, n_items, pos_df):
        """Build the symmetrically normalised user-item adjacency matrix for LightGCN.

        Nodes are ordered users first (0 .. n_users-1), then items
        (n_users .. n_users+n_items-1). The result is D^-1/2 * A * D^-1/2 where
        A = [[0, R], [R^T, 0]] and R is the interaction matrix built from `pos_df`.

        Args:
            n_users: number of user nodes.
            n_items: number of item nodes.
            pos_df: DataFrame with integer 'user' and 'item' index columns; the
                caller is expected to pass only positive (rating >= 4) rows.

        Returns:
            A torch sparse COO float tensor of shape
            (n_users + n_items, n_users + n_items), moved to the module-level `device`.
        """
        user_np = pos_df['user'].values
        item_np = pos_df['item'].values

        R = sp.coo_matrix((np.ones(len(user_np)), (user_np, item_np)), shape=(n_users, n_items))

        top_part = sp.hstack([sp.csr_matrix((n_users, n_users)), R])
        bot_part = sp.hstack([R.T, sp.csr_matrix((n_items, n_items))])
        A = sp.vstack([top_part, bot_part])

        # Invert only non-zero degrees: isolated nodes keep a 0 scale factor
        # instead of triggering a divide-by-zero RuntimeWarning and an inf patch-up.
        degree = np.asarray(A.sum(axis=1), dtype=np.float64).ravel()
        d_inv_sqrt = np.zeros_like(degree)
        connected = degree > 0
        d_inv_sqrt[connected] = np.power(degree[connected], -0.5)
        d_mat_inv_sqrt = sp.diags(d_inv_sqrt)

        norm_adj = d_mat_inv_sqrt.dot(A).dot(d_mat_inv_sqrt).tocoo()
        indices = torch.from_numpy(np.vstack((norm_adj.row, norm_adj.col)).astype(np.int64))
        values = torch.from_numpy(norm_adj.data.astype(np.float32))

        return torch.sparse_coo_tensor(indices, values, torch.Size(norm_adj.shape)).to(device)

    print("그래프 생성 중 (Positive Edge Only)...")
    Adj_Matrix = get_adj_mat(n_users, n_items, train_pos_df)
    print("그래프 생성 완료!")

    # =========================================================
    # 6. 모델 정의 (LightGCN with Doc2Vec Init)
    # =========================================================
    class LightGCN_Doc2Vec(nn.Module):
        """LightGCN whose item embeddings start from pretrained Doc2Vec vectors.

        Args:
            n_users: number of users.
            n_items: number of items.
            dim: embedding dimension; must equal the width of `doc2vec_weights`.
            layers: number of propagation steps.
            A_hat: sparse normalised adjacency of shape
                (n_users + n_items, n_users + n_items), users first.
            doc2vec_weights: float tensor of shape (n_items, dim).

        Raises:
            ValueError: if `doc2vec_weights` is not of shape (n_items, dim).
        """

        def __init__(self, n_users, n_items, dim, layers, A_hat, doc2vec_weights):
            super().__init__()
            if tuple(doc2vec_weights.shape) != (n_items, dim):
                raise ValueError(
                    f"doc2vec_weights has shape {tuple(doc2vec_weights.shape)}, "
                    f"expected ({n_items}, {dim})"
                )
            self.n_users = n_users
            self.n_items = n_items
            self.dim = dim
            self.layers = layers
            self.A_hat = A_hat

            # User embeddings: random normal initialisation.
            self.user_emb = nn.Embedding(n_users, dim)
            nn.init.normal_(self.user_emb.weight, std=0.1)

            # Trainable item embeddings initialised from Doc2Vec. from_pretrained
            # wraps the tensor it is given without copying, so pass a private
            # clone; otherwise optimiser steps would overwrite the caller's tensor.
            self.item_emb = nn.Embedding.from_pretrained(doc2vec_weights.detach().clone(), freeze=False)

            # Frozen copy of the Doc2Vec vectors, used as the content-loss target.
            self.register_buffer('fixed_doc2vec', doc2vec_weights.detach().clone())

        def get_all_embeddings(self):
            """Propagate embeddings through the graph and average all layers.

            Returns:
                (user_embeddings, item_embeddings): the mean over the initial
                embeddings and each of the `layers` propagated versions.
            """
            users = self.user_emb.weight
            items = self.item_emb.weight
            all_emb = torch.cat([users, items], dim=0)

            embs = [all_emb]
            for _ in range(self.layers):
                all_emb = torch.sparse.mm(self.A_hat, all_emb)
                embs.append(all_emb)

            out = torch.stack(embs, dim=0).mean(dim=0)
            return out[:self.n_users], out[self.n_users:]

    # =========================================================
    # 7. Loss & Sampling (Hard Negative + Content Loss)
    # =========================================================
    def sample_batch_with_hard_neg(batch_size, user_pos_items, user_hard_neg_items, n_items, hard_prob=0.5):
        """Draw one BPR batch of (user, positive item, negative item) triples.

        Users are drawn with replacement from the keys of `user_pos_items`. For
        each user one positive is drawn from the user's positive set. The
        negative is, with probability `hard_prob`, one of the user's hard
        negatives (when the user has any); otherwise it is a uniformly random
        item index in [0, n_items) that is not among the user's positives.

        Note: the random-negative search loops until it finds a non-positive
        item, so it does not terminate for a user whose positives cover every item.

        Returns:
            Three LongTensors (users, pos, neg) on the module-level `device`.
        """
        candidate_users = list(user_pos_items.keys())
        users = np.random.choice(candidate_users, size=batch_size)

        pos, neg = [], []
        for uid in users:
            liked = user_pos_items[uid]
            pos.append(np.random.choice(list(liked)))

            # The coin is flipped only when the user actually has hard negatives.
            has_hard = (uid in user_hard_neg_items) and (len(user_hard_neg_items[uid]) > 0)
            if has_hard and random.random() < hard_prob:
                neg.append(np.random.choice(user_hard_neg_items[uid]))
                continue

            # Rejection sampling of a random item outside the positive set.
            candidate = np.random.randint(0, n_items)
            while candidate in liked:
                candidate = np.random.randint(0, n_items)
            neg.append(candidate)

        users_t = torch.LongTensor(users).to(device)
        pos_t = torch.LongTensor(pos).to(device)
        neg_t = torch.LongTensor(neg).to(device)
        return (users_t, pos_t, neg_t)

    def bpr_loss_with_content(u_f, i_pos_f, i_neg_f, u_0, i_pos_0, i_neg_0, fixed_vec, lambda1, lambda2):
        """Combine BPR ranking loss, L2 regularisation and a Doc2Vec content loss.

        Args:
            u_f, i_pos_f, i_neg_f: propagated embeddings, used for the BPR term.
            u_0, i_pos_0, i_neg_0: initial (layer-0) embeddings, used for L2.
            fixed_vec: frozen Doc2Vec vectors of the positive items.
            lambda1: weight of the L2 term.
            lambda2: weight of the content term.

        Returns:
            Scalar tensor: bpr + lambda1 * reg + lambda2 * content.
        """
        # Ranking term: softplus of the negated (positive - negative) score margin.
        margin = (u_f * i_pos_f).sum(dim=1) - (u_f * i_neg_f).sum(dim=1)
        ranking_term = F.softplus(-margin).mean()

        # L2 term: half the summed squared norms, divided by the batch size.
        user_sq = u_0.norm(2).pow(2)
        pos_sq = i_pos_0.norm(2).pow(2)
        neg_sq = i_neg_0.norm(2).pow(2)
        l2_term = 0.5 * (user_sq + pos_sq + neg_sq) / u_0.size(0)

        # Content term: MSE between positive item embeddings and their Doc2Vec vectors.
        content_term = F.mse_loss(i_pos_0, fixed_vec)

        return ranking_term + lambda1 * l2_term + lambda2 * content_term

    # =========================================================
    # 8. 평가 함수 (Precision, Recall, NDCG, HitRate)
    # =========================================================
    def ndcg_at_k(rank, k):
        """Return the gain 1 / log2(rank + 2) for a 0-based `rank`, or 0.0 when
        `rank` is None or not below the cutoff `k`."""
        outside_cutoff = rank is None or rank >= k
        return 0.0 if outside_cutoff else 1.0 / math.log2(rank + 2)

    def evaluate(model, df_eval, k=10):
        """Compute top-k ranking metrics averaged over the users in `df_eval`.

        For every user the scores of all items are computed, items in the
        module-level `user_pos_items` (training positives) are masked out, and
        the top-k remaining items are compared with the user's items in `df_eval`.

        Args:
            model: object exposing `eval()` and `get_all_embeddings()` returning
                (user_embeddings, item_embeddings).
            df_eval: DataFrame with 'user' and 'item' index columns.
            k: cutoff. If the catalogue has fewer than `k` items, all items are
                ranked, but Precision is still divided by `k`.

        Returns:
            dict with 'HitRate', 'Precision', 'Recall' and 'NDCG'; all 0.0 when
            `df_eval` contains no users.
        """
        model.eval()
        hits, total_users = 0, 0
        ndcg, precision, recall = 0.0, 0.0, 0.0

        # Position discounts 1 / log2(rank + 2), shared by DCG and IDCG.
        discounts = [1.0 / np.log2(rank + 2) for rank in range(k)]

        with torch.no_grad():
            # Propagate inside no_grad so evaluation does not build an autograd graph.
            users_final, items_final = model.get_all_embeddings()
            # topk raises when asked for more entries than exist.
            top_n = min(k, items_final.size(0))

            for u_idx, group in df_eval.groupby('user'):
                total_users += 1
                target_items = set(group['item'].values)

                scores = torch.matmul(users_final[u_idx], items_final.t())

                # Mask items the user already liked in training (positives only).
                seen = user_pos_items.get(u_idx)
                if seen:
                    scores[list(seen)] = -1e9

                topk = torch.topk(scores, top_n).indices.cpu().tolist()

                hit_ranks = [rank for rank, item_id in enumerate(topk) if item_id in target_items]
                num_correct = len(hit_ranks)
                num_targets = len(target_items)

                dcg = sum(discounts[rank] for rank in hit_ranks)
                idcg = sum(discounts[:min(num_targets, k)])

                if num_correct > 0:
                    hits += 1
                precision += num_correct / k
                recall += num_correct / num_targets
                if idcg > 0:
                    ndcg += dcg / idcg

        if total_users == 0:
            # Nothing to average over; avoid ZeroDivisionError.
            return {'HitRate': 0.0, 'Precision': 0.0, 'Recall': 0.0, 'NDCG': 0.0}

        return {'HitRate': hits/total_users, 'Precision': precision/total_users, 'Recall': recall/total_users, 'NDCG': ndcg/total_users}

    # =========================================================
    # 9. 학습 실행 (Best Model Selection 적용)
    # =========================================================
    dim = 64
    layers = 3
    batch_size = 1024
    epochs = 25
    lr = 1e-3

    # Hyperparameters
    lambda1 = 1e-5  # L2 regularisation weight (taken from the baseline experiment)
    lambda2 = 1e-4  # Content-loss weight (0 = Doc2Vec used for initialisation only)

    model = LightGCN_Doc2Vec(n_users, n_items, dim, layers, Adj_Matrix, doc2vec_weights).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Best-model bookkeeping (selected by validation Recall@10)
    best_recall = 0.0
    best_epoch = 0
    best_model_path = "best_doc2vec_lightgcn.pt"

    print(f"\n=== Training Start (Lambda2={lambda2}, Hard Negative Applied) ===")

    # Batches per epoch are derived from the number of positive interactions;
    # the value does not change between epochs, so compute it once.
    num_batches = len(train_pos_df) // batch_size + 1

    for epoch in range(1, epochs+1):
        model.train()
        total_loss = 0

        for _ in range(num_batches):
            # 1. Sample (user, positive, negative) triples, including hard negatives
            users, pos, neg = sample_batch_with_hard_neg(batch_size, user_pos_items, user_hard_neg_items, n_items, hard_prob=0.5)

            # 2. Propagated embeddings (BPR term)
            u_final, i_final = model.get_all_embeddings()
            u_f = u_final[users]
            i_pos_f = i_final[pos]
            i_neg_f = i_final[neg]

            # 3. Layer-0 embeddings (regularisation and content terms)
            u_0 = model.user_emb.weight[users]
            i_pos_0 = model.item_emb.weight[pos]
            i_neg_0 = model.item_emb.weight[neg]

            # 4. Frozen Doc2Vec vectors of the positive items
            fixed_vec = model.fixed_doc2vec[pos]

            # 5. Loss
            loss = bpr_loss_with_content(u_f, i_pos_f, i_neg_f,
                                         u_0, i_pos_0, i_neg_0,
                                         fixed_vec, lambda1, lambda2)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Validate after every epoch
        val_metrics = evaluate(model, val_df, k=10)
        current_recall = val_metrics['Recall']

        print(f"[Epoch {epoch:02d}] Loss: {total_loss/num_batches:.4f} | Val Recall: {current_recall:.4f} | NDCG: {val_metrics['NDCG']:.4f}")

        if current_recall > best_recall:
            best_recall = current_recall
            best_epoch = epoch
            torch.save(model.state_dict(), best_model_path)
            print(f"   >>> Best Model Updated!")

    print(f"\n=== 학습 종료. Best Epoch: {best_epoch}, Best Val Recall: {best_recall:.4f} ===")

    # Final test with the best checkpoint. The checkpoint file name is shared
    # between experiments, so only load it when THIS run actually wrote it;
    # otherwise a stale file from an earlier run could be evaluated silently.
    if best_epoch > 0:
        model.load_state_dict(torch.load(best_model_path, map_location=device))
    else:
        print("Warning: validation recall never improved; evaluating last-epoch weights.")
    test_metrics = evaluate(model, test_df, k=10)
    print("\n=== Final Test Result (Best Model) ===")
    print(f"Hit: {test_metrics['HitRate']:.4f}, Prec: {test_metrics['Precision']:.4f}, Recall: {test_metrics['Recall']:.4f}, NDCG: {test_metrics['NDCG']:.4f}")

데이터 로드 완료. User: 671, Item: 3485
전체 Train 데이터 수: 88595
Positive Interactions (Graph용): 46243
Hard Negative Interactions: 11327

=== Doc2Vec Training ===
Doc2Vec Matrix Created. Mapped: 3485/3485
그래프 생성 중 (Positive Edge Only)...
그래프 생성 완료!

=== Training Start (Lambda2=0.0001, Hard Negative Applied) ===


  d_inv_sqrt = np.power(rowsum, -0.5).flatten()


[Epoch 01] Loss: 0.6457 | Val Recall: 0.1021 | NDCG: 0.0568
   >>> Best Model Updated!
[Epoch 02] Loss: 0.5934 | Val Recall: 0.1081 | NDCG: 0.0582
   >>> Best Model Updated!
[Epoch 03] Loss: 0.5355 | Val Recall: 0.1216 | NDCG: 0.0648
   >>> Best Model Updated!
[Epoch 04] Loss: 0.4895 | Val Recall: 0.1231 | NDCG: 0.0653
   >>> Best Model Updated!
[Epoch 05] Loss: 0.4632 | Val Recall: 0.1276 | NDCG: 0.0668
   >>> Best Model Updated!
[Epoch 06] Loss: 0.4416 | Val Recall: 0.1291 | NDCG: 0.0674
   >>> Best Model Updated!
[Epoch 07] Loss: 0.4287 | Val Recall: 0.1276 | NDCG: 0.0667
[Epoch 08] Loss: 0.4174 | Val Recall: 0.1336 | NDCG: 0.0687
   >>> Best Model Updated!
[Epoch 09] Loss: 0.4066 | Val Recall: 0.1336 | NDCG: 0.0678
[Epoch 10] Loss: 0.3958 | Val Recall: 0.1336 | NDCG: 0.0678
[Epoch 11] Loss: 0.3807 | Val Recall: 0.1336 | NDCG: 0.0688
[Epoch 12] Loss: 0.3693 | Val Recall: 0.1366 | NDCG: 0.0701
   >>> Best Model Updated!
[Epoch 13] Loss: 0.3562 | Val Recall: 0.1336 | NDCG: 0.0694
[Epo

**5. lambda2 = 1e-1**

In [None]:
# =========================================================
# 2. 데이터 로드 (Pre-split Data 사용)
# =========================================================
if not os.path.exists(data_path) or not os.path.exists(movie_final_path):
    print("Error: 데이터 경로를 확인해주세요.")
else:
    # 1. 데이터 읽기
    train_df = pd.read_csv(f"{data_path}/train.csv")
    val_df   = pd.read_csv(f"{data_path}/val.csv")
    test_df  = pd.read_csv(f"{data_path}/test.csv")

    # 2. 매핑 정보 로드
    with open(f"{data_path}/user2idx.pkl", 'rb') as f: user2idx = pickle.load(f)
    with open(f"{data_path}/item2idx.pkl", 'rb') as f: item2idx = pickle.load(f)

    n_users = len(user2idx)
    n_items = len(item2idx)

    print(f"데이터 로드 완료. User: {n_users}, Item: {n_items}")
    print(f"전체 Train 데이터 수: {len(train_df)}")

    # =========================================================
    # 3. Split interactions (positive vs. hard negative)
    # =========================================================
    # Ratings >= 4 are positives: they define graph edges and BPR positives.
    train_pos_df = train_df[train_df['rating'] >= 4.0].copy()

    # Ratings <= 2 are explicit dislikes, used as hard negatives when sampling.
    train_neg_df = train_df[train_df['rating'] <= 2.0].copy()

    print(f"Positive Interactions (Graph용): {len(train_pos_df)}")
    print(f"Hard Negative Interactions: {len(train_neg_df)}")

    # user -> set of liked items (membership tests during sampling/evaluation)
    user_pos_items = train_pos_df.groupby("user")["item"].apply(set).to_dict()

    # user -> list of disliked items (drawn from during hard-negative sampling)
    user_hard_neg_items = train_neg_df.groupby("user")["item"].apply(list).to_dict()

    # =========================================================
    # 4. Doc2Vec training and item feature matrix
    # =========================================================
    print("\n=== Doc2Vec Training ===")
    movie_df = pd.read_csv(movie_final_path)
    movie_df['overview_clean'] = movie_df['overview_clean'].fillna('unknown')

    # Doc2Vec vector size; kept equal to the LightGCN embedding dimension (64).
    d2v_dim = 64

    # One TaggedDocument per movie, tagged with the original movieId as a string
    # so it can be looked up through item2idx below.
    documents = [TaggedDocument(str(overview).split(), [str(raw_movie_id)])
                 for raw_movie_id, overview in zip(movie_df['movieId'], movie_df['overview_clean'])]

    # workers=1: gensim only guarantees a reproducible run for a fixed `seed`
    # when training is single-threaded (multi-worker runs suffer thread-ordering jitter).
    d2v_model = Doc2Vec(documents, vector_size=d2v_dim, window=5, min_count=1, workers=1, epochs=20, seed=42)

    # Row `idx` of the matrix holds the vector of the movie mapped to that item index.
    doc2vec_weights = np.zeros((n_items, d2v_dim), dtype=np.float32)
    cnt = 0
    for movie_id, idx in item2idx.items():
        tag = str(movie_id)
        if tag in d2v_model.dv:
            doc2vec_weights[idx] = d2v_model.dv[tag]
            cnt += 1
        else:
            # No document for this movie: fall back to small random noise.
            doc2vec_weights[idx] = np.random.normal(0, 0.01, d2v_dim)

    doc2vec_weights = torch.from_numpy(doc2vec_weights).to(device)
    print(f"Doc2Vec Matrix Created. Mapped: {cnt}/{n_items}")


    # Save Doc2Vec embeddings to disk for reuse
    doc2vec_save_path = f"{base_path}/doc2vec_embeddings_64d.pkl"
    with open(doc2vec_save_path, 'wb') as f:
        pickle.dump({
            'embeddings': doc2vec_weights.cpu().numpy(),
            'item2idx': item2idx,
            'n_items': n_items,
            'vector_size': d2v_dim,
            'created_at': str(pd.Timestamp.now())
        }, f)
    print(f"✅ Doc2Vec embeddings saved to: {doc2vec_save_path}")
    # =========================================================
    # 5. LightGCN용 인접행렬 생성 (Positive 데이터만 사용!)
    # =========================================================
    def get_adj_mat(n_users, n_items, pos_df):
        """Build the symmetrically normalised user-item adjacency matrix for LightGCN.

        Nodes are ordered users first (0 .. n_users-1), then items
        (n_users .. n_users+n_items-1). The result is D^-1/2 * A * D^-1/2 where
        A = [[0, R], [R^T, 0]] and R is the interaction matrix built from `pos_df`.

        Args:
            n_users: number of user nodes.
            n_items: number of item nodes.
            pos_df: DataFrame with integer 'user' and 'item' index columns; the
                caller is expected to pass only positive (rating >= 4) rows.

        Returns:
            A torch sparse COO float tensor of shape
            (n_users + n_items, n_users + n_items), moved to the module-level `device`.
        """
        user_np = pos_df['user'].values
        item_np = pos_df['item'].values

        R = sp.coo_matrix((np.ones(len(user_np)), (user_np, item_np)), shape=(n_users, n_items))

        top_part = sp.hstack([sp.csr_matrix((n_users, n_users)), R])
        bot_part = sp.hstack([R.T, sp.csr_matrix((n_items, n_items))])
        A = sp.vstack([top_part, bot_part])

        # Invert only non-zero degrees: isolated nodes keep a 0 scale factor
        # instead of triggering a divide-by-zero RuntimeWarning and an inf patch-up.
        degree = np.asarray(A.sum(axis=1), dtype=np.float64).ravel()
        d_inv_sqrt = np.zeros_like(degree)
        connected = degree > 0
        d_inv_sqrt[connected] = np.power(degree[connected], -0.5)
        d_mat_inv_sqrt = sp.diags(d_inv_sqrt)

        norm_adj = d_mat_inv_sqrt.dot(A).dot(d_mat_inv_sqrt).tocoo()
        indices = torch.from_numpy(np.vstack((norm_adj.row, norm_adj.col)).astype(np.int64))
        values = torch.from_numpy(norm_adj.data.astype(np.float32))

        return torch.sparse_coo_tensor(indices, values, torch.Size(norm_adj.shape)).to(device)

    print("그래프 생성 중 (Positive Edge Only)...")
    Adj_Matrix = get_adj_mat(n_users, n_items, train_pos_df)
    print("그래프 생성 완료!")

    # =========================================================
    # 6. 모델 정의 (LightGCN with Doc2Vec Init)
    # =========================================================
    class LightGCN_Doc2Vec(nn.Module):
        """LightGCN whose item embeddings start from pretrained Doc2Vec vectors.

        Args:
            n_users: number of users.
            n_items: number of items.
            dim: embedding dimension; must equal the width of `doc2vec_weights`.
            layers: number of propagation steps.
            A_hat: sparse normalised adjacency of shape
                (n_users + n_items, n_users + n_items), users first.
            doc2vec_weights: float tensor of shape (n_items, dim).

        Raises:
            ValueError: if `doc2vec_weights` is not of shape (n_items, dim).
        """

        def __init__(self, n_users, n_items, dim, layers, A_hat, doc2vec_weights):
            super().__init__()
            if tuple(doc2vec_weights.shape) != (n_items, dim):
                raise ValueError(
                    f"doc2vec_weights has shape {tuple(doc2vec_weights.shape)}, "
                    f"expected ({n_items}, {dim})"
                )
            self.n_users = n_users
            self.n_items = n_items
            self.dim = dim
            self.layers = layers
            self.A_hat = A_hat

            # User embeddings: random normal initialisation.
            self.user_emb = nn.Embedding(n_users, dim)
            nn.init.normal_(self.user_emb.weight, std=0.1)

            # Trainable item embeddings initialised from Doc2Vec. from_pretrained
            # wraps the tensor it is given without copying, so pass a private
            # clone; otherwise optimiser steps would overwrite the caller's tensor.
            self.item_emb = nn.Embedding.from_pretrained(doc2vec_weights.detach().clone(), freeze=False)

            # Frozen copy of the Doc2Vec vectors, used as the content-loss target.
            self.register_buffer('fixed_doc2vec', doc2vec_weights.detach().clone())

        def get_all_embeddings(self):
            """Propagate embeddings through the graph and average all layers.

            Returns:
                (user_embeddings, item_embeddings): the mean over the initial
                embeddings and each of the `layers` propagated versions.
            """
            users = self.user_emb.weight
            items = self.item_emb.weight
            all_emb = torch.cat([users, items], dim=0)

            embs = [all_emb]
            for _ in range(self.layers):
                all_emb = torch.sparse.mm(self.A_hat, all_emb)
                embs.append(all_emb)

            out = torch.stack(embs, dim=0).mean(dim=0)
            return out[:self.n_users], out[self.n_users:]

    # =========================================================
    # 7. Loss & Sampling (Hard Negative + Content Loss)
    # =========================================================
    def sample_batch_with_hard_neg(batch_size, user_pos_items, user_hard_neg_items, n_items, hard_prob=0.5):
        """Draw one BPR batch of (user, positive item, negative item) triples.

        Users are drawn with replacement from the keys of `user_pos_items`. For
        each user one positive is drawn from the user's positive set. The
        negative is, with probability `hard_prob`, one of the user's hard
        negatives (when the user has any); otherwise it is a uniformly random
        item index in [0, n_items) that is not among the user's positives.

        Note: the random-negative search loops until it finds a non-positive
        item, so it does not terminate for a user whose positives cover every item.

        Returns:
            Three LongTensors (users, pos, neg) on the module-level `device`.
        """
        candidate_users = list(user_pos_items.keys())
        users = np.random.choice(candidate_users, size=batch_size)

        pos, neg = [], []
        for uid in users:
            liked = user_pos_items[uid]
            pos.append(np.random.choice(list(liked)))

            # The coin is flipped only when the user actually has hard negatives.
            has_hard = (uid in user_hard_neg_items) and (len(user_hard_neg_items[uid]) > 0)
            if has_hard and random.random() < hard_prob:
                neg.append(np.random.choice(user_hard_neg_items[uid]))
                continue

            # Rejection sampling of a random item outside the positive set.
            candidate = np.random.randint(0, n_items)
            while candidate in liked:
                candidate = np.random.randint(0, n_items)
            neg.append(candidate)

        users_t = torch.LongTensor(users).to(device)
        pos_t = torch.LongTensor(pos).to(device)
        neg_t = torch.LongTensor(neg).to(device)
        return (users_t, pos_t, neg_t)

    def bpr_loss_with_content(u_f, i_pos_f, i_neg_f, u_0, i_pos_0, i_neg_0, fixed_vec, lambda1, lambda2):
        """Combine BPR ranking loss, L2 regularisation and a content loss.

        Args:
            u_f, i_pos_f, i_neg_f: Propagated embeddings used for the BPR term.
            u_0, i_pos_0, i_neg_0: Initial (layer-0) embeddings used for the
                regularisation term; ``i_pos_0`` is also used for the content term.
            fixed_vec: Fixed Doc2Vec vectors compared against ``i_pos_0``.
            lambda1: Weight of the L2 regularisation term.
            lambda2: Weight of the content (MSE) term.

        Returns:
            Scalar tensor ``bpr + lambda1 * reg + lambda2 * content``.
        """
        # Ranking term: softplus of (negative score - positive score), batch mean.
        score_gap = (u_f * i_pos_f).sum(dim=1) - (u_f * i_neg_f).sum(dim=1)
        ranking_term = F.softplus(-score_gap).mean()

        # L2 term on the layer-0 embeddings, divided by the batch size.
        batch = u_0.size(0)
        squared_norms = sum(t.norm(2).pow(2) for t in (u_0, i_pos_0, i_neg_0))
        l2_term = 0.5 * squared_norms / batch

        # Content term: keep positive-item embeddings close to their Doc2Vec vectors.
        content_term = F.mse_loss(i_pos_0, fixed_vec)

        return ranking_term + lambda1 * l2_term + lambda2 * content_term

    # =========================================================
    # 8. 평가 함수 (Precision, Recall, NDCG, HitRate)
    # =========================================================
    def ndcg_at_k(rank, k):
        """Return the discounted gain ``1 / log2(rank + 2)`` for a 0-based rank.

        Yields 0.0 when ``rank`` is ``None`` or lies at or beyond the cutoff ``k``.
        """
        inside_cutoff = rank is not None and not rank >= k
        if not inside_cutoff:
            return 0.0
        discount = math.log2(rank + 2)
        return 1.0 / discount

    def evaluate(model, df_eval, k=10, train_pos_items=None):
        """Compute top-k ranking metrics averaged over the users in ``df_eval``.

        For every user in ``df_eval`` all items are scored by the dot product of
        the propagated user and item embeddings, the user's training positives
        are masked out, and the ``k`` best remaining items are compared with the
        user's items in ``df_eval``.

        Args:
            model: Object exposing ``eval()`` and ``get_all_embeddings()``, the
                latter returning ``(user_embeddings, item_embeddings)``.
            df_eval: DataFrame with ``'user'`` and ``'item'`` columns.
            k: Cutoff for the ranked list.
            train_pos_items: Mapping user -> collection of item ids to exclude
                from ranking. Defaults to the notebook-level ``user_pos_items``.

        Returns:
            Dict with keys ``'HitRate'``, ``'Precision'``, ``'Recall'`` and
            ``'NDCG'``; all values are 0.0 when ``df_eval`` contains no users.
        """
        if train_pos_items is None:
            # Backward-compatible default: the notebook's global train positives.
            train_pos_items = user_pos_items

        model.eval()
        hits, ndcg, precision, recall, total_users = 0, 0, 0, 0, 0

        with torch.no_grad():
            # Compute embeddings inside no_grad so that evaluation does not
            # build (and keep alive) an autograd graph.
            users_final, items_final = model.get_all_embeddings()
            # topk raises if asked for more entries than there are items.
            top_n = min(k, items_final.size(0))

            for u_idx, group in df_eval.groupby('user'):
                total_users += 1
                target_items = set(group['item'].values)

                scores = torch.matmul(users_final[u_idx], items_final.t())

                # Mask items seen in training (positives only).
                if u_idx in train_pos_items:
                    scores[list(train_pos_items[u_idx])] = -1e9

                _, topk = torch.topk(scores, top_n)
                topk = topk.cpu().tolist()

                num_correct = 0
                dcg, idcg = 0.0, 0.0

                for i, item_id in enumerate(topk):
                    if item_id in target_items:
                        num_correct += 1
                        dcg += 1.0 / np.log2(i + 2)

                # Ideal DCG: all targets (up to k) ranked at the top.
                num_targets = len(target_items)
                for i in range(min(num_targets, k)):
                    idcg += 1.0 / np.log2(i + 2)

                if num_correct > 0: hits += 1
                precision += num_correct / k
                recall += num_correct / num_targets
                if idcg > 0: ndcg += dcg / idcg

        if total_users == 0:
            # Nothing to average over; avoid a ZeroDivisionError.
            return {'HitRate': 0.0, 'Precision': 0.0, 'Recall': 0.0, 'NDCG': 0.0}

        return {'HitRate': hits/total_users, 'Precision': precision/total_users, 'Recall': recall/total_users, 'NDCG': ndcg/total_users}

    # =========================================================
    # 9. 학습 실행 (Best Model Selection 적용)
    # =========================================================
    dim = 64
    layers = 3
    batch_size = 1024
    epochs = 25
    lr = 1e-3

    # Hyperparameters
    lambda1 = 1e-5  # L2 regularisation weight (value taken from the base experiment)
    lambda2 = 1e-1  # Content-loss weight (0 = Doc2Vec initialisation only; e.g. 1e-3 to regularise)

    model = LightGCN_Doc2Vec(n_users, n_items, dim, layers, Adj_Matrix, doc2vec_weights).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Best-model bookkeeping. Start from -inf so the first epoch always writes a
    # checkpoint from THIS run; starting at 0.0 with a strict '>' could leave
    # best_model_path missing (or pointing at a stale file from an earlier
    # experiment) if validation recall never rises above 0.
    best_recall = float("-inf")
    best_epoch = 0
    best_model_path = "best_doc2vec_lightgcn.pt"

    print(f"\n=== Training Start (Lambda2={lambda2}, Hard Negative Applied) ===")

    # Number of batches per epoch is based on the positive interactions;
    # it does not change between epochs, so compute it once.
    num_batches = len(train_pos_df) // batch_size + 1

    for epoch in range(1, epochs+1):
        model.train()
        total_loss = 0

        for _ in range(num_batches):
            # 1. Sampling (includes hard negatives)
            users, pos, neg = sample_batch_with_hard_neg(batch_size, user_pos_items, user_hard_neg_items, n_items, hard_prob=0.5)

            # 2. Propagated embeddings (for the BPR term)
            u_final, i_final = model.get_all_embeddings()
            u_f = u_final[users]
            i_pos_f = i_final[pos]
            i_neg_f = i_final[neg]

            # 3. Initial embeddings (for regularisation and content terms)
            u_0 = model.user_emb.weight[users]
            i_pos_0 = model.item_emb.weight[pos]
            i_neg_0 = model.item_emb.weight[neg]

            # 4. Fixed Doc2Vec vectors of the positive items
            fixed_vec = model.fixed_doc2vec[pos]

            # 5. Loss
            loss = bpr_loss_with_content(u_f, i_pos_f, i_neg_f,
                                         u_0, i_pos_0, i_neg_0,
                                         fixed_vec, lambda1, lambda2)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Validate after every epoch
        val_metrics = evaluate(model, val_df, k=10)
        current_recall = val_metrics['Recall']

        print(f"[Epoch {epoch:02d}] Loss: {total_loss/num_batches:.4f} | Val Recall: {current_recall:.4f} | NDCG: {val_metrics['NDCG']:.4f}")

        if current_recall > best_recall:
            best_recall = current_recall
            best_epoch = epoch
            torch.save(model.state_dict(), best_model_path)
            print(f"   >>> Best Model Updated!")

    print(f"\n=== 학습 종료. Best Epoch: {best_epoch}, Best Val Recall: {best_recall:.4f} ===")

    # Final test with the best checkpoint; map_location keeps the load valid
    # even if the checkpoint was written on a different device.
    model.load_state_dict(torch.load(best_model_path, map_location=device))
    test_metrics = evaluate(model, test_df, k=10)
    print("\n=== Final Test Result (Best Model) ===")
    print(f"Hit: {test_metrics['HitRate']:.4f}, Prec: {test_metrics['Precision']:.4f}, Recall: {test_metrics['Recall']:.4f}, NDCG: {test_metrics['NDCG']:.4f}")

데이터 로드 완료. User: 671, Item: 3485
전체 Train 데이터 수: 88595
Positive Interactions (Graph용): 46243
Hard Negative Interactions: 11327

=== Doc2Vec Training ===
Doc2Vec Matrix Created. Mapped: 3485/3485
그래프 생성 중 (Positive Edge Only)...
그래프 생성 완료!

=== Training Start (Lambda2=0.1, Hard Negative Applied) ===


  d_inv_sqrt = np.power(rowsum, -0.5).flatten()


[Epoch 01] Loss: 0.6462 | Val Recall: 0.1006 | NDCG: 0.0556
   >>> Best Model Updated!
[Epoch 02] Loss: 0.5943 | Val Recall: 0.1096 | NDCG: 0.0585
   >>> Best Model Updated!
[Epoch 03] Loss: 0.5386 | Val Recall: 0.1201 | NDCG: 0.0637
   >>> Best Model Updated!
[Epoch 04] Loss: 0.4923 | Val Recall: 0.1201 | NDCG: 0.0644
[Epoch 05] Loss: 0.4627 | Val Recall: 0.1246 | NDCG: 0.0659
   >>> Best Model Updated!
[Epoch 06] Loss: 0.4437 | Val Recall: 0.1291 | NDCG: 0.0672
   >>> Best Model Updated!
[Epoch 07] Loss: 0.4305 | Val Recall: 0.1336 | NDCG: 0.0691
   >>> Best Model Updated!
[Epoch 08] Loss: 0.4152 | Val Recall: 0.1351 | NDCG: 0.0691
   >>> Best Model Updated!
[Epoch 09] Loss: 0.4039 | Val Recall: 0.1396 | NDCG: 0.0714
   >>> Best Model Updated!
[Epoch 10] Loss: 0.3941 | Val Recall: 0.1366 | NDCG: 0.0700
[Epoch 11] Loss: 0.3794 | Val Recall: 0.1411 | NDCG: 0.0725
   >>> Best Model Updated!
[Epoch 12] Loss: 0.3714 | Val Recall: 0.1351 | NDCG: 0.0708
[Epoch 13] Loss: 0.3561 | Val Recall: