In [1]:
import kagglehub

# Download the latest version of the beer reviews dataset through kagglehub.
# The call's return value is kept in `path`; the recorded output below shows it
# is the directory that holds the dataset files.
path = kagglehub.dataset_download("ehallmar/beers-breweries-and-beer-reviews")

# Echo the resolved location so later cells can be pointed at it.
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'beers-breweries-and-beer-reviews' dataset.
Path to dataset files: /kaggle/input/beers-breweries-and-beer-reviews


In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import KFold as SurpriseKFold

ModuleNotFoundError: No module named 'surprise'

--전처리 내용--

CB Model 정의

In [16]:
from __future__ import annotations
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union
import numpy as np
import pandas as pd

try:
    from scipy import sparse as sp
except ImportError:
    sp = None

def _is_sparse(X) -> bool:
    return sp is not None and sp.issparse(X)

def _l2_normalize_rows(X):
    """
    Scale every row of ``X`` to unit L2 norm; all-zero rows stay zero.

    Parameters
    ----------
    X : SciPy sparse matrix/array or array-like, 2-D
        Sparse input is converted to CSR; anything else is converted to a
        dense float32 ndarray.

    Returns
    -------
    A new row-normalized object (sparse in, sparse out; dense in, float32
    ndarray out). The input is never modified in place.
    """
    if _is_sparse(X):
        # No defensive copy is needed: nothing below writes into X.
        X = X.tocsr()
        # .sum(axis=1) yields np.matrix for sparse matrices but a plain ndarray
        # for sparse arrays; np.asarray(...).ravel() handles both, whereas the
        # np.matrix-only `.A1` attribute fails on the latter.
        row_norms = np.sqrt(np.asarray(X.multiply(X).sum(axis=1)).ravel())
        nz = row_norms > 0
        inv = np.zeros_like(row_norms)
        inv[nz] = 1.0 / row_norms[nz]
        # Left-multiplying by diag(1/norm) rescales each row; zero rows get 0.
        return sp.diags(inv) @ X
    else:
        X = np.asarray(X, dtype=np.float32)
        row_norms = np.linalg.norm(X, axis=1, keepdims=True)
        # Dividing a zero row by 1 leaves it zero and avoids 0/0 -> NaN.
        row_norms[row_norms == 0] = 1.0
        return X / row_norms

def _now_ts_from(interactions: pd.DataFrame, ts_col: str) -> pd.Timestamp:
    """Infer a 'now' timestamp from data (max ts)."""
    ts = interactions[ts_col]
    if np.issubdtype(ts.dtype, np.datetime64):
        return ts.max()
    try:
        return pd.to_datetime(ts, unit="s").max()
    except Exception:
        return pd.to_datetime(ts).max()

def _to_datetime(series: pd.Series) -> pd.Series:
    if np.issubdtype(series.dtype, np.datetime64):
        return series
    try:
        return pd.to_datetime(series, unit="s", errors="coerce")
    except Exception:
        return pd.to_datetime(series, errors="coerce")

# ---------- 모델 ----------

@dataclass
class CBConfig:
    """
    Tunable settings for the content-based model.

    Attributes
    ----------
    rating_threshold : ratings at or above this value count as a "like" (1).
    min_liked_items  : minimum number of liked items needed to build a user profile.
    use_recency      : whether to apply recency weighting.
    half_life_days   : half-life of the recency decay, in days.

    Weighting scheme:
        base weight  = max(score - rating_threshold + 0.5, 0.0)
        final weight = base * recency_decay
    """
    rating_threshold: float = 4.0
    min_liked_items: int = 1
    use_recency: bool = True
    half_life_days: float = 60.0

class CBModel:
    """
    Content-based recommender (proposal spec: style one-hot + standardized abv
    + user profile + cosine similarity).

    - fit_items: store the preprocessed item feature matrix, L2-normalized per row
    - build_user_profiles: compute one vector per user from (user, item, score, ts) rows
    - score_user_items: similarity scores between one user and candidate items
    - recommend: top-N recommendation
    - predict_pairs: cb_similarity_score for (user, item) pairs (stacking input)
    """
    def __init__(self, config: Optional[CBConfig] = None):
        self.cfg = config or CBConfig()
        self.item_features = None
        self.beer_id_to_idx: Dict[Union[int, str], int] = {}
        self.idx_to_beer_id: np.ndarray = np.array([], dtype=object)
        self.user_profiles: Dict[Union[int, str], np.ndarray] = {}
        self.seen_items_by_user: Dict[Union[int, str], set] = {}

    def _require_items(self) -> None:
        """Raise RuntimeError unless fit_items has been called."""
        if self.item_features is None:
            raise RuntimeError("fit_items 먼저 호출하세요.")

    # ----- items -----

    def fit_items(self, X_items, beer_ids: Sequence[Union[int, str]]):
        """
        Store the item feature matrix (row-wise L2-normalized) and the id mapping.

        Parameters
        ----------
        X_items : csr_matrix or ndarray, shape (n_items, d)
            Item feature matrix (style one-hot, scaled abv, ...).
        beer_ids : array-like, length n_items
            beer_id of each row of ``X_items``.

        Raises
        ------
        ValueError
            If ``beer_ids`` is not 1-D or its length differs from the number of
            rows of ``X_items`` (the id -> row mapping would be wrong).
        """
        if _is_sparse(X_items):
            X = X_items.tocsr()
        else:
            X = np.asarray(X_items, dtype=np.float32)
        ids = np.asarray(beer_ids)
        if ids.ndim != 1 or ids.shape[0] != X.shape[0]:
            raise ValueError(
                f"beer_ids must be 1-D with one id per item row: "
                f"got shape {ids.shape} for {X.shape[0]} rows"
            )
        # Unit-length rows let cosine similarity be computed as a plain dot product.
        self.item_features = _l2_normalize_rows(X)
        self.idx_to_beer_id = ids
        self.beer_id_to_idx = {bid: i for i, bid in enumerate(ids)}
        return self

    # ----- user profiles -----

    def build_user_profiles(
        self,
        interactions: pd.DataFrame,
        user_col: str = "user_id",
        item_col: str = "beer_id",
        rating_col: str = "score",
        ts_col: Optional[str] = "ts",
    ):
        """
        Build one L2-normalized profile per user as the weighted sum of the
        feature vectors of the items that user liked.

        - weight = max(score - threshold + 0.5, 0) * recency_decay
        - recency_decay = 0.5 ** (delta_days / half_life)

        Recency weighting is applied only when it is enabled in the config and
        ``ts_col`` names a column that exists in ``interactions``; otherwise
        every liked row gets a decay of 1. Rows whose weight is not finite
        (e.g. an unparseable timestamp) and rows whose item is unknown to
        ``fit_items`` are left out of the profile.

        Also records, per user, the set of all items appearing in
        ``interactions`` (used by ``recommend`` to exclude seen items).

        Raises
        ------
        RuntimeError
            If ``fit_items`` has not been called.
        """
        self._require_items()

        has_ts = bool(ts_col) and ts_col in interactions.columns
        cols = [user_col, item_col, rating_col] + ([ts_col] if has_ts else [])
        df = interactions[cols]

        # Seen log, regardless of rating (used to exclude items when recommending).
        self.seen_items_by_user = {
            u: set(items.to_numpy()) for u, items in df.groupby(user_col)[item_col]
        }
        self.user_profiles = {}

        # Keep only the "liked" rows.
        th = self.cfg.rating_threshold
        liked = df[df[rating_col] >= th].copy()
        if liked.empty:
            return self

        # Recency weights.
        if self.cfg.use_recency and has_ts:
            liked[ts_col] = _to_datetime(liked[ts_col])
            now_ts = _now_ts_from(liked, ts_col)
            delta_days = (now_ts - liked[ts_col]).dt.days.clip(lower=0).astype(float)
            half_life = max(self.cfg.half_life_days, 1e-6)  # guard against division by zero
            recency_decay = np.power(0.5, delta_days.to_numpy() / half_life)
        else:
            recency_decay = np.ones(len(liked), dtype=np.float32)

        base = (liked[rating_col] - th + 0.5).clip(lower=0).astype(float).to_numpy()
        weights = (base * recency_decay).astype(np.float32)

        # Map items to feature rows; unknown items map to NaN and are dropped,
        # as are rows with a NaN weight (missing timestamp).
        item_idx = liked[item_col].map(self.beer_id_to_idx).to_numpy()
        valid = ~pd.isna(item_idx) & np.isfinite(weights)
        if not valid.any():
            return self
        users = liked[user_col].to_numpy()[valid]
        weights = weights[valid]
        item_idx = item_idx[valid].astype(int)

        # Positions of each user's rows within the filtered arrays.
        rows_by_user = pd.DataFrame({"u": users}).groupby("u", sort=False).indices

        sparse_items = _is_sparse(self.item_features)
        for u, rows in rows_by_user.items():
            wts = weights[rows]
            if len(rows) < self.cfg.min_liked_items or wts.sum() <= 0:
                continue
            feats = self.item_features[item_idx[rows]]
            if sparse_items:
                # sparse (d x n) @ dense (n,) -> dense (d,): the weighted row sum.
                prof = np.asarray(feats.T @ wts).ravel()
            else:
                prof = (feats * wts[:, None]).sum(axis=0)
            norm = np.linalg.norm(prof)
            if norm == 0:
                continue
            self.user_profiles[u] = (prof / norm).astype(np.float32)

        return self

    # ----- scoring / recommendation -----

    def score_user_items(
        self,
        user_id: Union[int, str],
        candidate_beer_ids: Optional[Sequence[Union[int, str]]] = None,
        default_score: float = 0.0,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Cosine similarity between one user's profile and candidate items.

        With ``candidate_beer_ids=None`` every fitted item is scored.

        Returns
        -------
        scores : np.ndarray, shape (n_candidates,)
            Cosine similarity scores.
        cand_ids : np.ndarray
            The beer_ids the scores refer to. For a user with a profile,
            candidates unknown to ``fit_items`` are dropped. For a user without
            a profile (cold start) the candidates are returned unfiltered and
            every score equals ``default_score``.

        Raises
        ------
        RuntimeError
            If ``fit_items`` has not been called.
        """
        self._require_items()
        user_vec = self.user_profiles.get(user_id, None)
        if user_vec is None:
            # Cold-start user: constant score (may be replaced by a
            # popularity-based fallback in post-processing).
            if candidate_beer_ids is None:
                cand_ids = self.idx_to_beer_id
            else:
                cand_ids = np.asarray(candidate_beer_ids)
            scores = np.full(len(cand_ids), default_score, dtype=np.float32)
            return scores, np.asarray(cand_ids)

        if candidate_beer_ids is None:
            cand_idx = np.arange(self.item_features.shape[0], dtype=int)
            cand_ids = self.idx_to_beer_id
        else:
            # Map ids to row indices and drop unknown ids.
            cand_ids = np.asarray(candidate_beer_ids)
            cand_idx = np.array(
                [self.beer_id_to_idx.get(b, -1) for b in cand_ids], dtype=int
            )
            mask = cand_idx >= 0
            cand_idx = cand_idx[mask]
            cand_ids = cand_ids[mask]

        # Rows and profile are unit length, so the dot product is the cosine.
        if _is_sparse(self.item_features):
            scores = self.item_features[cand_idx].dot(user_vec).astype(np.float32)
            scores = np.asarray(scores).ravel()
        else:
            scores = (self.item_features[cand_idx] @ user_vec).astype(np.float32)

        return scores, cand_ids

    def recommend(
        self,
        user_id: Union[int, str],
        k: int = 10,
        candidate_beer_ids: Optional[Sequence[Union[int, str]]] = None,
        exclude_seen: bool = True,
    ) -> List[Tuple[Union[int, str], float]]:
        """
        Return the top-``k`` items as ``[(beer_id, score), ...]``, best first.

        With ``exclude_seen`` the items recorded for the user by
        ``build_user_profiles`` are removed before ranking. A non-positive
        ``k`` or an empty candidate set yields an empty list.
        """
        if k <= 0:
            return []
        scores, cand_ids = self.score_user_items(user_id, candidate_beer_ids)
        if exclude_seen:
            seen = self.seen_items_by_user.get(user_id, set())
            mask = np.array([bid not in seen for bid in cand_ids], dtype=bool)
            cand_ids = cand_ids[mask]
            scores = scores[mask]
        if len(scores) == 0:
            return []
        # argpartition brings the k best to the front in O(n); only those are sorted.
        topk = np.argpartition(-scores, kth=min(k, len(scores) - 1))[:k]
        order = topk[np.argsort(-scores[topk])]
        return [(cand_ids[i].item() if hasattr(cand_ids[i], "item") else cand_ids[i],
                 float(scores[i]))
                for i in order]

    def predict_pairs(
        self,
        pairs: pd.DataFrame,
        user_col: str = "user_id",
        item_col: str = "beer_id",
    ) -> np.ndarray:
        """
        Compute cb_similarity_score for each (user, item) row of ``pairs``
        (a Level-0 stacking feature).

        The result is aligned with the row *positions* of ``pairs``, whatever
        its index labels are. Rows whose user has no profile or whose item is
        unknown to ``fit_items`` get 0.0.

        Raises
        ------
        RuntimeError
            If ``fit_items`` has not been called.
        """
        self._require_items()
        n_rows = len(pairs)
        out = np.zeros(n_rows, dtype=np.float32)
        if n_rows == 0:
            return out

        users = pairs[user_col].to_numpy()
        items = pairs[item_col].to_numpy()
        known_item = np.fromiter(
            (b in self.beer_id_to_idx for b in items), dtype=bool, count=n_rows
        )
        # Row positions per user, so each user is scored in one vectorized call.
        rows_by_user = pd.DataFrame({"u": users}).groupby("u", sort=False).indices
        for u, rows in rows_by_user.items():
            if u not in self.user_profiles:
                continue  # cold-start user keeps the default 0.0
            rows = rows[known_item[rows]]
            if rows.size == 0:
                continue
            # Only known ids are passed, so scores come back one per row, in order.
            scores, _ = self.score_user_items(u, items[rows])
            out[rows] = scores
        return out

# ---------- 스태킹 OOF 유틸 ----------

from sklearn.model_selection import KFold

def make_cb_oof_scores(
    interactions: pd.DataFrame,
    X_items,
    beer_ids: Sequence[Union[int, str]],
    n_splits: int = 5,
    random_state: int = 42,
    user_col: str = "user_id",
    item_col: str = "beer_id",
    rating_col: str = "score",
    ts_col: Optional[str] = "ts",
    cfg: Optional[CBConfig] = None,
) -> pd.DataFrame:
    """
    Produce out-of-fold (OOF) cb_similarity_score values with an
    interaction-level KFold.

    - The item features are fitted once; they do not depend on the fold.
    - For each fold the user profiles are rebuilt from the training-fold
      interactions only, then the validation-fold (user, item) pairs are scored.
    - If ``ts_col`` is None or not a column of ``interactions``, the profiles
      are built without a timestamp column.

    Returns
    -------
    DataFrame with ``user_col``, ``item_col`` and ``cb_similarity_score``,
    one row per interaction, in the row order of ``interactions`` with a fresh
    0..n-1 index.
    """
    pairs_cols = [user_col, item_col]
    has_ts = bool(ts_col) and ts_col in interactions.columns
    cols = pairs_cols + [rating_col] + ([ts_col] if has_ts else [])
    # Positional 0..n-1 index so the KFold positions line up with `oof`.
    df = interactions[cols].reset_index(drop=True)
    oof = np.zeros(len(df), dtype=np.float32)

    # Item-side fitting is identical for every fold, so do it once.
    cb = CBModel(cfg)
    cb.fit_items(X_items, beer_ids)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for fold, (tr_idx, va_idx) in enumerate(kf.split(df), start=1):
        # build_user_profiles replaces the profiles and seen-sets of the previous fold.
        cb.build_user_profiles(df.iloc[tr_idx], user_col=user_col, item_col=item_col,
                               rating_col=rating_col, ts_col=ts_col if has_ts else None)
        # Hand over a frame with a clean positional index; the scores come back
        # in row order and are written to the validation positions.
        va_pairs = df.iloc[va_idx][pairs_cols].reset_index(drop=True)
        oof[va_idx] = cb.predict_pairs(va_pairs, user_col=user_col, item_col=item_col)
        print(f"[CB OOF] fold {fold}/{n_splits} done")

    out = df[pairs_cols].copy()
    out["cb_similarity_score"] = oof
    return out


단순 검증용 테스트 코드

In [17]:
# === POSITION-ONLY, ULTRA-LOW-RAM CB (no merge, no big copies) ===
# Sanity check: build one user's content profile and print the top-K unseen beers.
# Rows of `meta` are read by the same integer positions as rows of `cf_`.
# NOTE(review): this assumes X_meta_base holds one feature row per row of
# df_for_cf, in the same order — confirm against the preprocessing cells.
import numpy as np, pandas as pd, heapq, gc

cf = df_for_cf
meta = X_meta_base

cf_ = cf.rename(columns={"user":"user_id","item":"beer_id","rating":"score"})[["user_id","beer_id","score"]]

# Feature columns (pure CB): abv plus the style_* columns
style_cols = [c for c in meta.columns if str(c).startswith("style_")]
feat_cols = (["abv"] if "abv" in meta.columns else []) + style_cols
assert len(feat_cols) >= 2, "X_meta_base에서 abv + style_*를 찾지 못함"

# 1) Pick the target user: the one with the most liked (score >= 4.0) rows
likes = cf_[cf_["score"] >= 4.0]
assert not likes.empty, "평점≥4.0 데이터가 없음"
target_user = likes["user_id"].value_counts().idxmax()
print("[TARGET USER]", target_user)

# User vector: weighted sum of the features of the user's liked rows.
# Use true row *positions* (not index labels) because they are fed to .iloc;
# labels only coincide with positions when the frame has a default RangeIndex.
like_pos = np.flatnonzero(((cf_["score"] >= 4.0) & (cf_["user_id"] == target_user)).to_numpy())
# Keep the first liked row of each beer so a beer counts once.
first_pos = pd.Series(like_pos).groupby(cf_.iloc[like_pos]["beer_id"].to_numpy()).first().to_numpy()

X_like = meta.iloc[first_pos][feat_cols].apply(pd.to_numeric, errors="coerce").fillna(0).to_numpy(dtype=np.float32)
# Row-wise L2 normalization (zero rows are left as zero)
nrm = np.linalg.norm(X_like, axis=1, keepdims=True); nrm[nrm==0]=1.0
X_like /= nrm

# Like weight: score - 3.5, floored at 0
w = (cf_.iloc[first_pos]["score"].to_numpy(dtype=np.float32) - 3.5).clip(min=0)
U = (X_like * w[:, None]).sum(axis=0)
u_norm = np.linalg.norm(U)
if u_norm == 0:
    raise RuntimeError("유저 벡터가 0입니다. 좋아요가 너무 적거나 전부 결측")
U = (U / u_norm).astype(np.float32)
print("[USER VEC] ready; liked_rows:", len(first_pos))

K = 20
CHUNK = 20_000
top = []                   # min-heap of (score, beer_id); top[0] is the weakest kept entry
processed = set()          # beer_ids already scored (each beer is scored once)
seen = set(cf_.loc[cf_["user_id"] == target_user, "beer_id"].to_numpy())

# 2) Stream over all rows in chunks, scoring each not-yet-seen beer once
n = len(cf_)
for s in range(0, n, CHUNK):
    e = min(n, s+CHUNK)
    ids = cf_.iloc[s:e]["beer_id"].to_numpy()
    X = meta.iloc[s:e][feat_cols].apply(pd.to_numeric, errors="coerce").fillna(0).to_numpy(dtype=np.float32)
    # Row-wise L2 normalization
    nrm = np.linalg.norm(X, axis=1, keepdims=True); nrm[nrm==0]=1.0
    X /= nrm
    # Cosine similarity (rows and U are unit length)
    scores = X @ U

    for bid, sc in zip(ids, scores):
        if bid in processed or bid in seen:
            continue
        processed.add(bid)
        sc = float(sc)
        if len(top) < K:
            heapq.heappush(top, (sc, bid))
        else:
            if sc > top[0][0]:
                heapq.heapreplace(top, (sc, bid))

    del ids, X, scores
    if (s // CHUNK) % 50 == 0: gc.collect()

# Results, best first
top_sorted = sorted(top, key=lambda x: -x[0])
print(f"\n[TOP-{K} for user {target_user}]")
for sc, bid in top_sorted:
    print(bid, round(sc, 4))



[TARGET USER] kjkinsey
[USER VEC] ready; liked_rows: 6291

[TOP-20 for user kjkinsey]
366894 0.9995
57015 0.9995
187253 0.9995
311921 0.9995
70633 0.9995
368014 0.9994
368012 0.9994
365307 0.9994
96666 0.9994
104620 0.9994
89801 0.9994
83277 0.9994
340204 0.9993
73368 0.9993
133448 0.9993
356998 0.9993
196590 0.9993
176438 0.9993
83276 0.9992
57856 0.9992


CB-preprocessing

In [18]:
# --- per-beer feature matrix (stable memory, no groupby merges) ---
# Averages the row-normalized feature rows of every beer_id, re-normalizes the
# averages, and saves the matrix together with the matching beer_id array.
import numpy as np, pandas as pd, gc

cf   = df_for_cf.rename(columns={"user":"user_id","item":"beer_id","rating":"score"})[["user_id","beer_id","score"]]
meta = X_meta_base
# NOTE(review): meta rows are read with the same positional slices as cf rows
# below, so the two frames are assumed to be row-aligned — confirm.

# Feature column definition
style_cols = [c for c in meta.columns if str(c).startswith("style_")]
feat_cols  = (["abv"] if "abv" in meta.columns else []) + style_cols
d = len(feat_cols)
assert d >= 2, "X_meta_base에서 abv + style_*를 찾지 못했습니다."

# Unique beer_ids and their row-index mapping
beer_ids = cf["beer_id"].unique()
n_beers  = beer_ids.shape[0]
id2idx   = {bid: i for i, bid in enumerate(beer_ids)}

# Accumulation buffers (fixed size)
sum_mat = np.zeros((n_beers, d), dtype=np.float32)
cnt     = np.zeros(n_beers, dtype=np.int32)

CHUNK = 50_000
n = len(cf)
for s in range(0, n, CHUNK):
    e = min(n, s+CHUNK)

    ids = cf.iloc[s:e]["beer_id"].to_numpy()
    # Target row of each id; ids not found in id2idx get -1 and are dropped.
    idx = np.fromiter((id2idx.get(b, -1) for b in ids), dtype=np.int64, count=ids.size)
    keep = idx >= 0
    if not np.any(keep):
        continue
    idx = idx[keep]

    # Coerce every feature column to float32; non-numeric values become 0.
    X = meta.iloc[s:e][feat_cols].to_numpy(copy=False)
    X = pd.DataFrame(X, columns=feat_cols)
    for c in feat_cols:
        X[c] = pd.to_numeric(X[c], errors="coerce").fillna(0).astype(np.float32)
    X = X.to_numpy(dtype=np.float32, copy=False)
    X = X[keep]

    # Row-wise L2 normalization (zero rows are left as zero)
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    X /= norms

    # np.add.at accumulates correctly when the same beer index repeats in a chunk.
    np.add.at(sum_mat, idx, X)
    np.add.at(cnt, idx, 1)

    del ids, idx, keep, X, norms
    if (s // CHUNK) % 10 == 0:
        gc.collect()

# Mean per beer, then row-wise L2 normalization
cnt = np.maximum(cnt, 1).astype(np.float32)  # avoid dividing by zero for beers with no accumulated rows
beer_avg = (sum_mat / cnt[:, None]).astype(np.float32)
norms = np.linalg.norm(beer_avg, axis=1, keepdims=True)
norms[norms == 0] = 1.0
beer_mat = (beer_avg / norms).astype(np.float32)

np.save("/content/beer_mat.npy", beer_mat)         # (n_beers, d), row-normalized
np.save("/content/beer_ids.npy", beer_ids)         # beer_id array matching the rows of beer_mat
print("SAVED:", beer_mat.shape, "→ /content/beer_mat.npy & /content/beer_ids.npy")



SAVED: (358873, 114) → /content/beer_mat.npy & /content/beer_ids.npy


In [None]:
# Hand the generated artifacts to google.colab's files.download.
from google.colab import files
files.download("/content/beer_mat.npy")
files.download("/content/beer_ids.npy")
# NOTE(review): no code visible in this notebook writes cb_topk.parquet —
# confirm the file exists before running this line.
files.download("/content/cb_topk.parquet")