In [1]:
import os, urllib.request
import numpy as np, pandas as pd
from typing import Dict, Tuple
from scipy import sparse

from implicit.nearest_neighbours import bm25_weight
from implicit.nearest_neighbours import CosineRecommender
from implicit.als import AlternatingLeastSquares

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --------------------------------------------------
# A. MovieLens 100k Îã§Ïö¥Î°úÎìú/Î°úÎìú
#   - ÌååÏùºÏù¥ ÏóÜÏúºÎ©¥ ÏûêÎèô Îã§Ïö¥Î°úÎìú (~/ml-100k/)
#   - u.data: user_id, item_id, rating, timestamp (tab-separated)
# --------------------------------------------------
def load_movielens_100k(data_dir: str = os.path.expanduser("~/ml-100k")) -> pd.DataFrame:
    os.makedirs(data_dir, exist_ok=True)
    ratings_path = os.path.join(data_dir, "u.data")

    if not os.path.exists(ratings_path):
        print("üì• Downloading MovieLens 100k ...")
        url = "https://files.grouplens.org/datasets/movielens/ml-100k/u.data"
        # ÏßÅÏ†ë ÌååÏùºÎßå Î∞õÎäî Í∞ÑÎã® Î≤ÑÏ†Ñ (ÏïïÏ∂ï Ìï¥Ï†ú Í≥ºÏ†ï ÏóÜÏù¥)
        urllib.request.urlretrieve(url, ratings_path)

    df = pd.read_csv(
        ratings_path, 
        sep="\t", 
        names=["user_id", "item_id", "rating", "timestamp"], 
        engine="python"
    )
    # Î¨∏ÏûêÏó¥Î°ú ÌÜµÏùº(Îß§Ìïë ÏïàÏ†ï)
    df["user_id"] = df["user_id"].astype(str)
    df["item_id"] = df["item_id"].astype(str)
    return df


In [3]:
#   - Í∞Å Ïú†Ï†ÄÏùò ÏµúÏã† 1Í∞úÎ•º testÎ°ú, ÎÇòÎ®∏ÏßÄÎäî trainÏúºÎ°ú ÏÇ¨Ïö©
# --------------------------------------------------
def train_test_split_loov(ratings: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    ratings = ratings.copy()
    ratings["ts_rank"] = ratings.groupby("user_id")["timestamp"].rank(method="first", ascending=False)
    test = ratings[ratings["ts_rank"] == 1].drop(columns=["ts_rank"])
    train = ratings[ratings["ts_rank"] > 1].drop(columns=["ts_rank"])
    return train, test

In [4]:
# C. ID Îß§Ìïë + CSR ÏÉùÏÑ± (user x item)
#   - implicitÏùÄ ÌïôÏäµ Ïãú item-userÎ•º Í∏∞ÎåÄÌïòÎØÄÎ°ú Ï†ÑÏπòÌï¥ÏÑú Ï†ÑÎã¨
#   - MovieLensÎäî explicit(ÌèâÏ†ê)Ïù¥ÎØÄÎ°ú, ratingÏùÑ confidenceÎ°ú ÏÇ¨Ïö©
# --------------------------------------------------
def build_id_maps(train_df: pd.DataFrame):
    users = train_df["user_id"].unique()
    items = train_df["item_id"].unique()
    u2i = {u:i for i,u in enumerate(users)}
    it2i = {it:i for i,it in enumerate(items)}
    return u2i, it2i, np.array(users), np.array(items)

def build_ui_csr(df: pd.DataFrame, user2idx: dict, item2idx: dict, use_rating=True) -> sparse.csr_matrix:
    d = df[["user_id","item_id","rating"] if "rating" in df.columns else ["user_id","item_id"]].copy()
    d["u"] = d["user_id"].map(user2idx)
    d["i"] = d["item_id"].map(item2idx)
    d = d.dropna(subset=["u","i"])
    rows = d["u"].astype(int).to_numpy()
    cols = d["i"].astype(int).to_numpy()
    vals = (d["rating"].astype(np.float32).to_numpy() if use_rating and "rating" in d.columns
            else np.ones(len(d), dtype=np.float32))
    UI = sparse.csr_matrix((vals, (rows, cols)), shape=(len(user2idx), len(item2idx)), dtype=np.float32)
    return UI

In [5]:
# D. Î™®Îç∏ ÌïôÏäµ (Cosine KNN / ALS)
# --------------------------------------------------
def fit_cosine_knn(item_user: sparse.csr_matrix, K:int=200) -> CosineRecommender:
    model = CosineRecommender(K=K)
    model.fit(item_user)   # expect item-user
    return model

def fit_als(user_item: sparse.csr_matrix, factors=64, reg=1e-2, iters=20) -> AlternatingLeastSquares:
    als = AlternatingLeastSquares(factors=factors, regularization=reg, iterations=iters)
    als.fit(user_item)     # expect item-user
    return als


In [6]:
# E. Ï∂îÏ≤ú/Ïú†ÏÇ¨ÏïÑÏù¥ÌÖú/Í∞ÑÎã® ÌèâÍ∞Ä
# --------------------------------------------------
from scipy import sparse

def recommend_topn(model, UI_bm25, user_id, N=10):
    user_row = UI_bm25[user_id]  # 1 x n_items
    ids, scores = model.recommend(
        user_id,
        user_items=user_row,
        N=N,
        filter_already_liked_items=True
    )
    return list(zip(ids, scores))

def knn_recommend_topn(model, UI_weighted: sparse.csr_matrix, user_internal_id: int, N: int = 10):
    user_row = UI_weighted[user_internal_id]
    ids, scores = model.recommend(
        user_internal_id,
        user_items=user_row,                  
        N=N,
        filter_already_liked_items=True
    )
    return list(zip(ids, scores))

def similar_items_topn(model, item_internal_id: int, N:int=10):
    ids, scores = model.similar_items(item_internal_id, N=N)
    return list(zip(ids, scores))

In [7]:
# Ïã§Ìñâ part
ratings = load_movielens_100k()
train_df, test_df = train_test_split_loov(ratings)
print(f"train: {train_df.shape}, test: {test_df.shape}")

train: (99057, 4), test: (943, 4)


In [8]:
train_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [9]:
print(train_df['user_id'].nunique())
print(train_df['item_id'].nunique())

943
1680


In [10]:
# Îß§Ìïë/ÌñâÎ†¨
user2idx, item2idx, idx2user, idx2item = build_id_maps(train_df)
UI = build_ui_csr(train_df, user2idx, item2idx, use_rating=True)

# Í∞ÄÏ§ë(Îëò Ï§ë ÌïòÎÇò ÏÑ†ÌÉù; Î≥¥ÌÜµ BM25 Í∂åÏû•)
UI_bm25  = bm25_weight(UI).tocsr()
# UI_tfidf = tfidf_weight(UI).tocsr()

# implicitÏùÄ item-user Í∏∞ÎåÄ ‚Üí Ï†ÑÏπò
IU_bm25 = UI_bm25.T.tocsr()

# Î™®Îç∏ ÌïôÏäµ
knn = fit_cosine_knn(IU_bm25, K=200)
als = fit_als(UI_bm25, factors=64, reg=1e-2, iters=20)

# Îç∞Î™®: Ïú†Ï†Ä/ÏïÑÏù¥ÌÖú ÏûÑÏùò ÏÑ†ÌÉù
u0 = 0
i0 = 0
print("KNN recommend(u0):", recommend_topn(knn, UI_bm25, u0, N=10)[:5])
print("KNN similar_items(i0):", similar_items_topn(knn, i0, N=10)[:5])
print("ALS recommend(u0):", recommend_topn(als, UI_bm25, u0, N=10)[:5])

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 943/943 [00:00<00:00, 15199.38it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:00<00:00, 31.27it/s]

KNN recommend(u0): [(np.int32(584), np.float64(64.21088724546001)), (np.int32(50), np.float64(63.18584760933893)), (np.int32(420), np.float64(62.72465281039928)), (np.int32(534), np.float64(60.02447432974939)), (np.int32(65), np.float64(59.07429995913074))]
KNN similar_items(i0): [(np.int32(0), np.float64(1.0)), (np.int32(320), np.float64(0.28796264316884335)), (np.int32(569), np.float64(0.254302711272459)), (np.int32(264), np.float64(0.2489866512472389)), (np.int32(27), np.float64(0.2372621644461647))]
ALS recommend(u0): [(np.int32(70), np.float32(0.8081374)), (np.int32(867), np.float32(0.8030158)), (np.int32(33), np.float32(0.79736036)), (np.int32(360), np.float32(0.7612708)), (np.int32(242), np.float32(0.7294104))]





In [11]:
from typing import Dict, Tuple
import numpy as np
import pandas as pd
from scipy import sparse

def eval_loov_recall_at_k(
    model,
    UI_bm25: sparse.csr_matrix,
    test_df: pd.DataFrame,
    user2idx: Dict[str, int],
    item2idx: Dict[str, int],
    K: int = 10,
    filter_seen: bool = True,
) -> Tuple[float, int, int]:
    # 0) Í≥µÍ∞Ñ Ï†ïÌï©ÏÑ±(ÏïÑÏù¥ÌÖú Ï∂ï)
    if hasattr(model, "item_factors"):
        n_items_model = model.item_factors.shape[0]
        if UI_bm25.shape[1] != n_items_model:
            raise ValueError(f"[SpaceMismatch] model items={n_items_model} vs UI cols={UI_bm25.shape[1]}")

    hits, total = 0, 0

    # 1) Ïú†Ï†Ä Î£®ÌîÑ: Í∞Å Ïú†Ï†ÄÏóê ÎåÄÌï¥ Î∞òÎìúÏãú 'Í∑∏ Ïú†Ï†ÄÏùò 1Ìñâ'Îßå Ï†ÑÎã¨
    for u, g in test_df.groupby("user_id"):
        if u not in user2idx:
            continue  # cold user
        uidx = user2idx[u]

        # ÌèâÍ∞Ä ÎåÄÏÉÅ ÏïÑÏù¥ÌÖú(ÌïôÏäµ Îß§ÌïëÏóê ÏûàÎäî Í≤ÉÎßå)
        target_items = [item2idx[it] for it in g["item_id"].values if it in item2idx]
        if not target_items:
            continue  # cold itemÎßå ÏûàÏúºÎ©¥ Ïä§ÌÇµ

        # 2) user_row Ìïú Ìñâ Ï∂îÏ∂ú + out-of-range Î∞©Ïñ¥
        user_row = UI_bm25[uidx]
        if hasattr(model, "item_factors"):
            n_items_model = model.item_factors.shape[0]
            if user_row.indices.size and user_row.indices.max() >= n_items_model:
                # ÎπÑÏ†ïÏÉÅ Ïù∏Îç±Ïä§Í∞Ä ÏûàÏúºÎ©¥ ÏûòÎùºÎÇ¥Í∏∞(Ï†ïÏÉÅÏù¥ÎùºÎ©¥ 0Í±¥)
                mask = user_row.indices < n_items_model
                user_row = sparse.csr_matrix(
                    (user_row.data[mask], user_row.indices[mask], user_row.indptr.copy()),
                    shape=(1, n_items_model)
                )

        # 3) recommend Ìò∏Ï∂ú(Î¨∏Ï†ú Î∞úÏÉùÏãú Ïñ¥Îäê Ïú†Ï†ÄÏóêÏÑú ÌÑ∞Ï°åÎäîÏßÄ ÏïåÍ∏∞ ÏúÑÌï¥ try/except)
        try:
            rec_ids, _ = model.recommend(
                uidx,
                user_items=user_row,
                N=K,
                filter_already_liked_items=filter_seen
            )
        except Exception as e:
            raise RuntimeError(f"[CrashAtUser] user='{u}' (uidx={uidx}) ÎèôÏïà recommend() Ïã§Ìå®: {e}")

        # 4) Hit ÌåêÏ†ï
        if set(rec_ids).intersection(target_items):
            hits += 1
        total += 1

    hit_rate = hits / total if total > 0 else 0.0
    return hit_rate, hits, total
