In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m94.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
# Colab-only step: mount Google Drive at /content/drive. The data paths
# configured later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import os
import math
import pickle
import scipy.sparse as sp
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# =========================================================
# 1. 환경 설정 및 시드 고정
# =========================================================
def seed_everything(seed=42):
    """Seed every random number source this notebook relies on.

    Covers Python's ``random`` module, the ``PYTHONHASHSEED`` environment
    variable, NumPy's global generator, and PyTorch (default generator plus
    the CUDA seed), and switches cuDNN to deterministic, non-benchmark mode.

    Args:
        seed: Integer seed applied to all of the sources above.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # All of these take the seed as their only argument and are independent
    # of one another, so they can be applied in a single pass.
    for apply_seed in (random.seed, np.random.seed,
                       torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    # Ask cuDNN for deterministic kernels and disable its auto-tuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Fix all RNG seeds once, before any data or model code runs.
seed_everything(42)
# Notebook-wide compute device; later cells read this global directly.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# [Path settings] Adjust these to match your own environment.
base_path = "/content/drive/MyDrive/unstructured"
data_path = f"{base_path}/k5_filtered" # Folder holding the pre-split data
movie_meta_path = f"{base_path}/movie_data_final_clean.csv" # Movie metadata that includes plot summaries
topic_edge_path = f"{base_path}/topic_vectors_7.csv" # Topic data

Using device: cuda


In [None]:
# =========================================================
# 2. Load data (pre-split train / val / test)
# =========================================================
# Defines the notebook globals train_df, val_df, test_df, user2idx, item2idx,
# n_users, n_items, topic_df and n_topics used by the cells below.
print("\n📂 데이터 로드 중...")
try:
    train_df = pd.read_csv(f"{data_path}/train.csv")
    val_df   = pd.read_csv(f"{data_path}/val.csv")
    test_df  = pd.read_csv(f"{data_path}/test.csv")

    # NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
    with open(f"{data_path}/user2idx.pkl", 'rb') as f: user2idx = pickle.load(f)
    with open(f"{data_path}/item2idx.pkl", 'rb') as f: item2idx = pickle.load(f)

    # Entity counts are taken from the size of the ID -> index mappings.
    n_users = len(user2idx)
    n_items = len(item2idx)
    print(f"✅ 기본 데이터 로드 완료. User: {n_users}, Item: {n_items}")

    # Load the topic data
    topic_df = pd.read_csv(topic_edge_path)
    # NOTE(review): the graph builder uses topic_id directly as a node offset, so this
    # presumably assumes topic ids are exactly 0..n_topics-1 — confirm against the CSV.
    n_topics = topic_df['topic_id'].nunique()
    print(f"✅ 토픽 데이터 로드 완료. Topic 수: {n_topics}, 연결 수: {len(topic_df)}")

except FileNotFoundError as e:
    # Report which file is missing, then re-raise so the notebook stops here.
    print(f"❌ Error: 파일을 찾을 수 없습니다. {e}")
    raise


📂 데이터 로드 중...
✅ 기본 데이터 로드 완료. User: 671, Item: 3485
✅ 토픽 데이터 로드 완료. Topic 수: 7, 연결 수: 21029


In [None]:
# =========================================================
# [Added] Remap and filter topic-data IDs
# =========================================================
# NOTE: this cell overwrites topic_df (its 'movie_id' column ends up holding
# LightGCN item indices), so it is not safe to re-run without first re-running
# the data-load cell that reads topic_df from disk.
print("🔄 토픽 데이터 ID 재매핑 중...")

# 1. Load the original movie list that LDA was run on (row order matters!)
# Per the original author's note, LDA assigned IDs 0, 1, 2... following the
# row order of the movie_meta_path file.
lda_movie_source = pd.read_csv(movie_meta_path)

# 2. Build the {LDA_Index : Original_Movie_ID} mapping
# (the movie_id in the LDA output is taken to be the row index of the file above)
lda_idx_to_real_id = lda_movie_source['movieId'].to_dict()

# 3. Convert topic_df's 'movie_id' (LDA index) into 'real_movie_id'
topic_df['real_movie_id'] = topic_df['movie_id'].map(lda_idx_to_real_id)

# 4. Map {Original_Movie_ID : LightGCN_Index} (using item2idx)
# item2idx is the mapping loaded from pickle in the data-load cell
topic_df['lightgcn_item_idx'] = topic_df['real_movie_id'].map(item2idx)

# 5. Drop movies that have no entry in item2idx (NaN after mapping), e.g. those removed by K-core filtering
print(f"매핑 전 토픽 연결 수: {len(topic_df)}")
topic_df = topic_df.dropna(subset=['lightgcn_item_idx']) # remove unmapped movies
topic_df['lightgcn_item_idx'] = topic_df['lightgcn_item_idx'].astype(int) # cast to integer

print(f"매핑 후 토픽 연결 수: {len(topic_df)}")

# 6. Swap column names (for compatibility with the graph-building function)
# The old 'movie_id' is discarded and the mapped index is exposed as 'movie_id'
topic_df = topic_df[['lightgcn_item_idx', 'topic_id', 'related']].rename(columns={'lightgcn_item_idx': 'movie_id'})

print("✅ 토픽 데이터 전처리 완료. (LightGCN ID와 동기화됨)")
print(topic_df.head())

🔄 토픽 데이터 ID 재매핑 중...
매핑 전 토픽 연결 수: 21029
매핑 후 토픽 연결 수: 8122
✅ 토픽 데이터 전처리 완료. (LightGCN ID와 동기화됨)
   movie_id  topic_id   related
0       431         1  0.960950
1      1014         1  0.603947
2      1014         6  0.146624
3      1014         0  0.115764
4      1014         5  0.113900


In [None]:
# =========================================================
# Load Pre-trained Doc2Vec Embeddings
# =========================================================
# Defines the global doc2vec_weights: a float tensor on `device`, or None when
# the pickle file is missing (the model then falls back to random item init).
print("\n📥 Loading Doc2Vec embeddings...")
doc2vec_path = f"{base_path}/doc2vec_embeddings_64d.pkl"

try:
    # NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
    with open(doc2vec_path, 'rb') as f:
        doc2vec_data = pickle.load(f)

    # Validate consistency: the pickle must describe the same item set and the
    # same item -> index mapping as the current split, otherwise embedding rows
    # would be attached to the wrong items.
    assert doc2vec_data['n_items'] == n_items, \
        f"Item count mismatch! Doc2Vec: {doc2vec_data['n_items']}, Current: {n_items}"
    assert doc2vec_data['item2idx'] == item2idx, \
        "item2idx mapping mismatch! Doc2Vec and LDA must use same data split."

    # NOTE(review): the embedding width is not checked here; it presumably has to
    # equal the model's `dim` (64, per the file name) — confirm.
    doc2vec_weights = torch.FloatTensor(doc2vec_data['embeddings']).to(device)
    print(f"✅ Doc2Vec embeddings loaded successfully")
    print(f"   Shape: {doc2vec_weights.shape}, Created: {doc2vec_data['created_at']}")

except FileNotFoundError:
    # Missing file is tolerated: signal the fallback with None.
    print(f"⚠️  Doc2Vec embeddings not found. Falling back to random initialization")
    doc2vec_weights = None
except AssertionError as e:
    # A mismatched pickle is not tolerated: report and stop the notebook.
    print(f"❌ Validation Error: {e}")
    raise


In [None]:
# =========================================================
# 3. Split interactions & build lookup dictionaries
# =========================================================
# 1) Positive data (rating >= 4): used for the graph and as ground-truth positives
train_pos_df = train_df[train_df['rating'] >= 4.0].copy()

# 2) Hard-negative data (rating <= 2): used as explicit negatives during training
train_neg_df = train_df[train_df['rating'] <= 2.0].copy()

# Training dictionaries: user -> set of positive items, user -> list of hard-negative items.
# Users with no rows in the respective frame have no key in the dictionary.
user_pos_items = train_pos_df.groupby("user")["item"].apply(set).to_dict()
user_hard_neg_items = train_neg_df.groupby("user")["item"].apply(list).to_dict()

print(f"Positive Interactions: {len(train_pos_df)}")
print(f"Hard Negative Interactions: {len(train_neg_df)}")

Positive Interactions: 46243
Hard Negative Interactions: 11327


In [None]:
# =========================================================
# 4. Feature 2: 그래프 구축 (User-Item-Topic)
# =========================================================

def get_hetero_adj_mat(n_users, n_items, n_topics, user_item_df, item_topic_df):
    """Build the symmetrically normalised adjacency of the user-item-topic graph.

    All nodes share one index space: users come first, items are offset by
    ``n_users`` and topics by ``n_users + n_items``. Each row of
    ``user_item_df`` and ``item_topic_df`` contributes one undirected edge of
    weight 1 (repeated rows accumulate), and the result is
    ``D^{-1/2} A D^{-1/2}``.

    Args:
        n_users: Number of user nodes.
        n_items: Number of item nodes.
        n_topics: Number of topic nodes.
        user_item_df: DataFrame with integer index columns ``'user'`` and ``'item'``.
        item_topic_df: DataFrame with integer index columns ``'movie_id'``
            (item index) and ``'topic_id'``.

    Returns:
        A ``torch`` sparse COO float32 tensor of shape
        ``(num_nodes, num_nodes)``, moved to the notebook-level ``device``.
    """
    num_nodes = n_users + n_items + n_topics

    u_idx = user_item_df['user'].values
    i_idx = user_item_df['item'].values + n_users

    it_item_idx = item_topic_df['movie_id'].values + n_users
    it_topic_idx = item_topic_df['topic_id'].values + n_users + n_items

    # Every edge is inserted in both directions so the matrix is symmetric.
    all_src = np.concatenate([u_idx, i_idx, it_item_idx, it_topic_idx])
    all_dst = np.concatenate([i_idx, u_idx, it_topic_idx, it_item_idx])

    data = np.ones(len(all_src))
    adj = sp.coo_matrix((data, (all_src, all_dst)), shape=(num_nodes, num_nodes))

    # Inverse square-root degree. Isolated nodes (degree 0) keep a factor of 0;
    # computing the power only where the degree is positive avoids the
    # divide-by-zero RuntimeWarning that 0 ** -0.5 would raise.
    degree = np.asarray(adj.sum(axis=1)).flatten()
    d_inv_sqrt = np.zeros_like(degree)
    connected = degree > 0
    d_inv_sqrt[connected] = np.power(degree[connected], -0.5)
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)

    norm_adj = d_mat_inv_sqrt.dot(adj).dot(d_mat_inv_sqrt).tocoo()

    # Explicit dtypes: SciPy may hand back int32 indices and float64 values.
    indices = torch.from_numpy(np.vstack((norm_adj.row, norm_adj.col)).astype(np.int64))
    values = torch.from_numpy(norm_adj.data.astype(np.float32))

    return torch.sparse_coo_tensor(indices, values, torch.Size(norm_adj.shape)).to(device)

print("\n🕸️ 이종 그래프(Heterogeneous Graph) 생성 중...")
# topic_df is the topic table loaded (and ID-remapped) in the earlier cells;
# only positive training interactions (train_pos_df) become user-item edges.
Adj_Matrix = get_hetero_adj_mat(n_users, n_items, n_topics, train_pos_df, topic_df)
print(f"✅ 그래프 생성 완료! (Nodes: {n_users+n_items+n_topics})")


🕸️ 이종 그래프(Heterogeneous Graph) 생성 중...
✅ 그래프 생성 완료! (Nodes: 4163)


  d_inv_sqrt = np.power(rowsum, -0.5).flatten()


In [None]:
# =========================================================
# 5. 모델 정의: LDA-Only LightGCN (수정됨)
# =========================================================
class LDALightGCN(nn.Module):
    """LightGCN over a user-item-topic graph with optional Doc2Vec item init.

    Users, items and topics each get a learnable layer-0 embedding. The three
    tables are concatenated (users, then items, then topics), propagated
    ``layers`` times through the sparse adjacency ``A_hat``, and the layer
    outputs (including layer 0) are averaged.

    Args:
        n_users: Number of user nodes.
        n_items: Number of item nodes.
        n_topics: Number of topic nodes.
        dim: Embedding width shared by all node types.
        layers: Number of propagation steps.
        A_hat: Sparse adjacency over all ``n_users + n_items + n_topics``
            nodes. It is stored as a plain attribute (not a buffer), so it is
            not moved by ``.to(...)`` and is not part of ``state_dict``.
        doc2vec_weights: Optional ``(n_items, dim)`` float tensor. When given,
            item embeddings start from these values (and stay trainable) and a
            frozen copy is kept in the ``fixed_doc2vec`` buffer. When ``None``,
            item embeddings are drawn from N(0, 0.1^2) and ``fixed_doc2vec``
            is ``None``.

    Raises:
        ValueError: If ``doc2vec_weights`` does not have shape ``(n_items, dim)``.
    """

    def __init__(self, n_users, n_items, n_topics, dim, layers, A_hat, doc2vec_weights=None):
        super().__init__()
        self.n_users = n_users
        self.n_items = n_items
        self.n_topics = n_topics
        self.dim = dim
        self.layers = layers
        self.A_hat = A_hat

        # 1. User Embedding (Random)
        self.user_emb = nn.Embedding(n_users, dim)
        nn.init.normal_(self.user_emb.weight, std=0.1)

        # 2. Item Embedding (Doc2Vec Initialization)
        if doc2vec_weights is not None:
            if tuple(doc2vec_weights.shape) != (n_items, dim):
                raise ValueError(
                    f"doc2vec_weights must have shape ({n_items}, {dim}), "
                    f"got {tuple(doc2vec_weights.shape)}"
                )
            # Clone before wrapping: from_pretrained uses the given tensor as the
            # parameter itself, so without a copy training would modify the
            # caller's tensor in place.
            self.item_emb = nn.Embedding.from_pretrained(
                doc2vec_weights.clone().detach(), freeze=False
            )
            self.register_buffer('fixed_doc2vec', doc2vec_weights.clone().detach())
            print("   ✅ Item embeddings initialized with Doc2Vec (trainable)")
        else:
            self.item_emb = nn.Embedding(n_items, dim)
            nn.init.normal_(self.item_emb.weight, std=0.1)
            self.register_buffer('fixed_doc2vec', None)
            print("   ⚠️  Item embeddings using random initialization")
        # Do NOT re-initialise item_emb here: a random init after this branch
        # would throw away the Doc2Vec values loaded above.

        # 3. Topic Embedding (Random)
        self.topic_emb = nn.Embedding(n_topics, dim)
        nn.init.normal_(self.topic_emb.weight, std=0.1)

    def get_all_embeddings(self):
        """Propagate embeddings through the graph and return user/item outputs.

        Returns:
            Tuple ``(users_final, items_final)`` of shapes ``(n_users, dim)``
            and ``(n_items, dim)``: the mean over layer 0 and every
            propagation layer. Topic rows take part in propagation but are
            not returned.
        """
        users = self.user_emb.weight
        items = self.item_emb.weight
        topics = self.topic_emb.weight

        all_emb = torch.cat([users, items, topics], dim=0)
        embs = [all_emb]

        for _ in range(self.layers):
            all_emb = torch.sparse.mm(self.A_hat, all_emb)
            embs.append(all_emb)

        out = torch.stack(embs, dim=0).mean(dim=0)

        users_final = out[:self.n_users]
        items_final = out[self.n_users : self.n_users + self.n_items]
        # Topic outputs influence training through propagation only; they are
        # not used when scoring recommendations.

        return users_final, items_final

In [None]:
# =========================================================
# 6. 유틸리티 함수 (샘플링 & 평가)
# =========================================================
def sample_batch_hard_neg(batch_size, user_pos, user_neg, n_items, hard_prob=0.5):
    """Draw one BPR training batch with optional hard negatives.

    Users are sampled (with replacement) from the keys of ``user_pos``. For
    each user one positive item is drawn from ``user_pos[user]``. The negative
    is, with probability ``hard_prob`` and only if the user has entries in
    ``user_neg``, one of those items; otherwise it is a uniformly random item
    index in ``[0, n_items)`` that is not among the user's positives.

    Args:
        batch_size: Number of (user, pos, neg) triples to draw.
        user_pos: Mapping user -> collection of positive item indices.
        user_neg: Mapping user -> list of hard-negative item indices.
        n_items: Total number of items.
        hard_prob: Probability of using a hard negative when one is available.

    Returns:
        Three ``LongTensor``s (users, positives, negatives) on the
        notebook-level ``device``.
    """
    users = np.random.choice(list(user_pos.keys()), size=batch_size)
    pos_items, neg_items = [], []

    for uid in users:
        liked = user_pos[uid]
        pos_items.append(np.random.choice(list(liked)))

        # The short-circuit order is deliberate: random.random() is consumed
        # only when this user actually has hard negatives to choose from.
        take_hard = (uid in user_neg) and (len(user_neg[uid]) > 0) and (random.random() < hard_prob)
        if take_hard:
            neg_items.append(np.random.choice(user_neg[uid]))
            continue

        # Random negative: redraw until the item is not one of the user's positives.
        candidate = np.random.randint(0, n_items)
        while candidate in liked:
            candidate = np.random.randint(0, n_items)
        neg_items.append(candidate)

    batch = [torch.LongTensor(seq).to(device) for seq in (users, pos_items, neg_items)]
    return batch[0], batch[1], batch[2]

def evaluate(model, df_eval, k=10):
    """Compute top-k ranking metrics for every user present in ``df_eval``.

    For each user, all items are scored by the dot product of the final user
    and item embeddings, items in the notebook-level ``user_pos_items`` (the
    user's training positives) are masked out, and the top ``k`` items are
    compared against the user's items in ``df_eval``.

    Args:
        model: Object exposing ``eval()`` and ``get_all_embeddings()``, the
            latter returning ``(users_final, items_final)``.
        df_eval: DataFrame with ``'user'`` and ``'item'`` index columns.
        k: Cut-off for the ranking metrics. If there are fewer than ``k``
            items, all items are ranked; precision is still divided by ``k``.

    Returns:
        Dict with user-averaged ``'Hit'``, ``'Prec'``, ``'Recall'`` and
        ``'NDCG'``. All four are ``0.0`` when ``df_eval`` has no users.
    """
    model.eval()

    hits, ndcg, prec, recall, total = 0, 0, 0, 0, 0

    with torch.no_grad():
        # Propagate inside no_grad so evaluation does not build an autograd graph.
        users_final, items_final = model.get_all_embeddings()

        # torch.topk raises if asked for more entries than exist.
        top_n = min(k, items_final.size(0))

        # Rank discounts and their running sums (ideal DCG for 0..k hits).
        discounts = [1.0 / np.log2(rank + 2) for rank in range(k)]
        ideal_dcg = [0.0]
        for discount in discounts:
            ideal_dcg.append(ideal_dcg[-1] + discount)

        for u_idx, group in df_eval.groupby('user'):
            total += 1
            targets = set(group['item'].values)

            # Score every item for this user.
            scores = torch.matmul(users_final[u_idx], items_final.t())

            # Mask training positives so already-seen items are not recommended.
            if u_idx in user_pos_items:
                scores[list(user_pos_items[u_idx])] = -1e9

            _, topk = torch.topk(scores, top_n)
            topk = topk.cpu().tolist()

            num_correct = 0
            dcg = 0.0
            for rank, item in enumerate(topk):
                if item in targets:
                    num_correct += 1
                    dcg += discounts[rank]

            idcg = ideal_dcg[min(len(targets), k)]

            if num_correct > 0: hits += 1
            prec += num_correct / k
            recall += num_correct / len(targets)
            if idcg > 0: ndcg += dcg / idcg

    if total == 0:
        # Nothing to evaluate: report zeros instead of dividing by zero.
        return {'Hit': 0.0, 'Prec': 0.0, 'Recall': 0.0, 'NDCG': 0.0}

    return {'Hit': hits/total, 'Prec': prec/total, 'Recall': recall/total, 'NDCG': ndcg/total}

In [None]:
# =========================================================
# 7. Loss Function
# =========================================================
def calc_lda_loss(u_f, i_pos_f, i_neg_f, u_0, i_pos_0, i_neg_0, t_0, fixed_vec, lam1, lam2):
    """Total training loss: BPR + L2 regularisation + content consistency.

    Args:
        u_f, i_pos_f, i_neg_f: Propagated embeddings of the batch users,
            positive items and negative items (used for the ranking term).
        u_0, i_pos_0, i_neg_0: Layer-0 embeddings of the same rows
            (used for regularisation).
        t_0: Layer-0 topic embedding table; regularised in full because the
            topic nodes are trainable parameters too.
        fixed_vec: Frozen reference vectors for the positive items, or
            ``None`` to disable the content term.
        lam1: Weight of the L2 regularisation term.
        lam2: Weight of the content-consistency term.

    Returns:
        Scalar tensor ``bpr + lam1 * reg + lam2 * content``.
    """
    def squared_l2(tensor):
        # Squared Euclidean norm over all elements of the tensor.
        return tensor.norm(2).pow(2)

    # 1. BPR: softplus of the negated positive-minus-negative score margin.
    margin = (u_f * i_pos_f).sum(dim=1) - (u_f * i_neg_f).sum(dim=1)
    ranking_term = F.softplus(-margin).mean()

    # 2. L2 regularisation, normalised by the batch size.
    reg_term = 0.5 * (squared_l2(u_0) + squared_l2(i_pos_0) +
                      squared_l2(i_neg_0) + squared_l2(t_0)) / u_0.size(0)

    # 3. Content consistency: keep positive-item embeddings near the fixed vectors.
    if fixed_vec is None:
        content_term = torch.tensor(0.0, device=u_0.device)
    else:
        content_term = squared_l2(i_pos_0 - fixed_vec) / i_pos_0.size(0)

    return ranking_term + lam1 * reg_term + lam2 * content_term


In [None]:
# =========================================================
# 8. Run training
# =========================================================
# Hyperparameters. These are notebook globals; `dim` must match the width of
# the Doc2Vec vectors when doc2vec_weights is provided.
dim = 64
layers = 3
batch_size = 1024
epochs = 50
lr = 1e-3
lambda1 = 1e-5  # Best value found in the base experiment (L2 regularisation weight)
lambda2 = 1e-3  # Content consistency loss weight

model = LDALightGCN(n_users, n_items, n_topics, dim, layers, Adj_Matrix,
                    doc2vec_weights=doc2vec_weights).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Model selection state: the checkpoint with the best validation Recall@10 is kept on disk.
best_recall = 0.0
best_epoch = 0
save_path = "best_lda_lightgcn.pt"

print(f"\n🚀 LDA-Only 학습 시작 (L1={lambda1})...")

for epoch in range(1, epochs+1):
    model.train()
    total_loss = 0
    # One "epoch" is roughly one pass over the positive interactions in sampled batches.
    num_batches = len(train_pos_df) // batch_size + 1

    for _ in range(num_batches):
        users, pos, neg = sample_batch_hard_neg(batch_size, user_pos_items, user_hard_neg_items, n_items)

        # Forward
        u_final, i_final = model.get_all_embeddings() # topic nodes are handled inside the propagation

        # Propagated embeddings for the ranking (BPR) term.
        u_f = u_final[users]
        i_pos_f = i_final[pos]
        i_neg_f = i_final[neg]

        # Layer-0 embeddings for regularisation (the whole topic table is regularised).
        u_0 = model.user_emb.weight[users]
        i_pos_0 = model.item_emb.weight[pos]
        i_neg_0 = model.item_emb.weight[neg]
        t_0 = model.topic_emb.weight

        # Loss: BPR + L2 regularisation + content consistency (when Doc2Vec vectors exist)

        # Extract fixed Doc2Vec vectors for content loss
        if model.fixed_doc2vec is not None:
            fixed_vec = model.fixed_doc2vec[pos]
        else:
            fixed_vec = None

        loss = calc_lda_loss(u_f, i_pos_f, i_neg_f, u_0, i_pos_0, i_neg_0, t_0,
                            fixed_vec, lambda1, lambda2)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Evaluation
    val_res = evaluate(model, val_df, k=10)
    print(f"[Epoch {epoch:02d}] Loss: {total_loss/num_batches:.4f} | Val Recall: {val_res['Recall']:.4f} | NDCG: {val_res['NDCG']:.4f}")

    # Checkpoint only on a strict improvement of validation recall.
    if val_res['Recall'] > best_recall:
        best_recall = val_res['Recall']
        best_epoch = epoch
        torch.save(model.state_dict(), save_path)
        print("   >>> Best Model Updated!")

print(f"\n✨ 학습 종료. Best Epoch: {best_epoch}, Recall: {best_recall:.4f}")

# Final Test
# NOTE(review): if validation recall never exceeds 0.0, no checkpoint is written
# and this load fails because save_path does not exist.
model.load_state_dict(torch.load(save_path))
test_res = evaluate(model, test_df, k=10)
print("\n=== Final Test Result ===")
print(f"Hit: {test_res['Hit']:.4f}, Prec: {test_res['Prec']:.4f}, Recall: {test_res['Recall']:.4f}, NDCG: {test_res['NDCG']:.4f}")


🚀 LDA-Only 학습 시작 (L1=1e-05)...
[Epoch 01] Loss: 0.6895 | Val Recall: 0.0826 | NDCG: 0.0440
   >>> Best Model Updated!
[Epoch 02] Loss: 0.6753 | Val Recall: 0.1336 | NDCG: 0.0713
   >>> Best Model Updated!
[Epoch 03] Loss: 0.6236 | Val Recall: 0.1366 | NDCG: 0.0719
   >>> Best Model Updated!
[Epoch 04] Loss: 0.5420 | Val Recall: 0.1336 | NDCG: 0.0702
[Epoch 05] Loss: 0.4829 | Val Recall: 0.1246 | NDCG: 0.0662
[Epoch 06] Loss: 0.4483 | Val Recall: 0.1246 | NDCG: 0.0658
[Epoch 07] Loss: 0.4249 | Val Recall: 0.1291 | NDCG: 0.0669
[Epoch 08] Loss: 0.4143 | Val Recall: 0.1351 | NDCG: 0.0698
[Epoch 09] Loss: 0.3958 | Val Recall: 0.1336 | NDCG: 0.0694
[Epoch 10] Loss: 0.3863 | Val Recall: 0.1351 | NDCG: 0.0705
[Epoch 11] Loss: 0.3785 | Val Recall: 0.1381 | NDCG: 0.0716
   >>> Best Model Updated!
[Epoch 12] Loss: 0.3624 | Val Recall: 0.1366 | NDCG: 0.0730
[Epoch 13] Loss: 0.3548 | Val Recall: 0.1321 | NDCG: 0.0721
[Epoch 14] Loss: 0.3411 | Val Recall: 0.1411 | NDCG: 0.0754
   >>> Best Model Up