# Item-based CF
This notebook builds an Item‑based Collaborative Filtering (CF) model from implicit user feedback.
It reuses the same windowing rules, valid actions, and action weights as previous User Profile Feature Table.

Data source

`user_behavior_log_info.csv` with columns: user_id, item_id, year, time_stamp (MMDD), timestamp (seconds), action_type

## Setup & Parameters

In [2]:
!fusermount -u /content/drive
!rm -rf /content/drive
from google.colab import drive
drive.mount('/content/drive')

fusermount: failed to unmount /content/drive: No such file or directory
Mounted at /content/drive


In [27]:
# ---- Imports ----
import os
import json
import numpy as np
import pandas as pd
from typing import Iterable, Optional, List, Tuple
from datetime import timedelta
from collections import defaultdict, Counter
from scipy.sparse import csr_matrix, coo_matrix
from sklearn.metrics.pairwise import cosine_similarity

# ---------- Paths  ----------
INPUT_BEHAVIOR_CSV = '/content/drive/MyDrive/Projects/user_behavior_log_info.csv'
OUTDIR = '/content/drive/MyDrive/Projects/cf_item_based'

# ---------- Parameters ----------
CHUNKSIZE = 200_000        # chunk size for large CSVs
DAYS_WINDOW = 30           # rolling window size, inclusive of max_date
TIMESTAMP_MAX = 86399.999  # clip day-seconds to avoid spillover to next day

# RFM weights (monetary approximation)
RFM_WEIGHTS = {
    'click': 1.0,
    'read':  2.0,
    'like':  3.0,
    'fav':   4.0,
}

VALID_ACTIONS = {'click', 'read', 'like', 'fav'}  # valid user actions

In [20]:
# ---------- Helpers ----------
def ensure_outdir(path: str):
    """Create output directory if it does not exist."""
    os.makedirs(path, exist_ok=True)

def build_datetime(year_series, mmdd_series, sec_series):
    """
    Convert (year, MMDD, seconds-of-day) into a full pandas datetime64.
    - year_series: e.g., 2024
    - mmdd_series: MMDD string or int, e.g., '0511' means May 11
    - sec_series : seconds since midnight (0 ~ 86400)
    Uses global TIMESTAMP_MAX to avoid day spillover.
    """
    mmdd_str = mmdd_series.astype(str).str.zfill(4)
    month = pd.to_numeric(mmdd_str.str[:2], errors='coerce')
    day   = pd.to_numeric(mmdd_str.str[2:], errors='coerce')
    year  = pd.to_numeric(year_series, errors='coerce')

    date_real = pd.to_datetime(
        year.astype('Int64').astype(str) + '-' +
        month.astype('Int64').astype(str).str.zfill(2) + '-' +
        day.astype('Int64').astype(str).str.zfill(2),
        errors='coerce'
    )

    sec = pd.to_numeric(sec_series, errors='coerce').fillna(0.0)
    if TIMESTAMP_MAX is not None:
        sec = sec.clip(lower=0, upper=TIMESTAMP_MAX)

    return date_real + pd.to_timedelta(sec, unit='s')

def find_window_bounds(input_behavior_csv: str, chunksize: int, days_window: int):
    """
    Scan behavior CSV in chunks to find the max calendar date in data,
    then compute [cutoff, upper) window for the last N days.
      cutoff: inclusive start of window (00:00)
      upper : exclusive end (next day's 00:00 of max_date)
    """
    max_date = None
    for chunk in pd.read_csv(input_behavior_csv, chunksize=chunksize,
                             usecols=['year','time_stamp'], dtype=str):
        dt = build_datetime(chunk['year'], chunk['time_stamp'],
                            pd.Series(0, index=chunk.index))
        local_max = dt.max()
        if pd.notna(local_max) and (max_date is None or local_max > max_date):
            max_date = local_max

    if max_date is None:
        raise ValueError("No valid dates in behavior; check 'year' and 'time_stamp'.")

    cutoff = max_date.normalize() - timedelta(days=days_window - 1)
    upper  = max_date.normalize() + timedelta(days=1)  # next day 00:00
    return max_date, cutoff, upper

def iter_clean_behavior(input_behavior_csv: str,
                        chunksize: int,
                        cutoff_date: pd.Timestamp,
                        upper: pd.Timestamp,
                        extra_usecols=None,
                        add_hour: bool=False,
                        valid_actions=VALID_ACTIONS):
    """
    Stream behavior CSV in chunks and yield pre-cleaned frames:
      - build full datetime 'dt'
      - filter to window [cutoff_date, upper)
      - clean 'user_id' and 'action_type'
      - optionally add 'hour'
      - keep only necessary columns (plus extra_usecols)
    """
    base_cols = ['user_id','year','time_stamp','timestamp','action_type']
    usecols   = base_cols + (extra_usecols or [])
    usecols   = list(dict.fromkeys(usecols))  # de-dup

    for chunk in pd.read_csv(input_behavior_csv, chunksize=chunksize,
                             usecols=usecols, dtype=str):
        chunk['dt'] = build_datetime(chunk['year'], chunk['time_stamp'], chunk['timestamp'])

        # Window filter: inclusive start, exclusive end
        mask = (chunk['dt'] >= cutoff_date) & (chunk['dt'] < upper)
        chunk = chunk.loc[mask].copy()
        if chunk.empty:
            continue

        # Clean user_id and action_type
        chunk['user_id'] = pd.to_numeric(chunk['user_id'], errors='coerce').astype('Int64')
        chunk = chunk.dropna(subset=['user_id'])
        chunk['user_id'] = chunk['user_id'].astype(int)

        chunk['action_type'] = chunk['action_type'].astype(str).str.lower().str.strip()
        if valid_actions is not None:
            chunk = chunk[chunk['action_type'].isin(valid_actions)]
            if chunk.empty:
                continue

        if add_hour:
            chunk['hour'] = chunk['dt'].dt.hour.astype(int)

        keep = ['user_id','action_type','dt'] + (['hour'] if add_hour else []) + (extra_usecols or [])
        keep = list(dict.fromkeys(keep))  # de-dup
        yield chunk[keep]

# keep your chosen order:
max_date, cutoff_date, upper = find_window_bounds(INPUT_BEHAVIOR_CSV, CHUNKSIZE, DAYS_WINDOW)
print(f"[INFO] max_date={max_date.date()} | cutoff_date={cutoff_date.date()} | upper={upper.date()} | window={DAYS_WINDOW}d")

[INFO] max_date=2024-11-12 | cutoff_date=2024-10-14 | upper=2024-11-13 | window=30d


## User-item Matrix

In [21]:
# 1) Action weights
ACTION_WEIGHTS = {
    'click': RFM_WEIGHTS.get('click', 1.0),
    'read' : RFM_WEIGHTS.get('read',  1.0),
    'like' : RFM_WEIGHTS.get('like',  1.0),
    'fav'  : RFM_WEIGHTS.get('fav',   1.0),
}

# 2) Aggregate implicit scores within [cutoff_date, upper)
ui_score = defaultdict(float)   # (user_id, item_id) -> score
rows_processed = 0
pairs_chunk_agg = 0

for chunk in iter_clean_behavior(
    INPUT_BEHAVIOR_CSV,
    CHUNKSIZE,
    cutoff_date,
    upper,
    extra_usecols=['item_id'],
    add_hour=False,
    valid_actions=VALID_ACTIONS
):
    # item_id numeric & non-null
    chunk['item_id'] = pd.to_numeric(chunk['item_id'], errors='coerce').astype('Int64')
    chunk = chunk.dropna(subset=['item_id']).copy()
    chunk['item_id'] = chunk['item_id'].astype(int)

    # map action -> weight and keep defined ones only
    w = chunk['action_type'].map(ACTION_WEIGHTS)
    chunk = chunk.loc[w.notna(), ['user_id', 'item_id']].copy()
    chunk['w'] = w.loc[chunk.index].to_numpy()

    # group within chunk to reduce Python loop overhead
    agg = chunk.groupby(['user_id', 'item_id'], observed=True)['w'].sum().reset_index()

    # accumulate into the global dictionary
    for r in agg.itertuples(index=False):
        ui_score[(int(r.user_id), int(r.item_id))] += float(r.w)

    rows_processed += len(chunk)
    pairs_chunk_agg += len(agg)

print(f"[INFO] rows processed: {rows_processed:,} | aggregated pairs (chunked): {pairs_chunk_agg:,}")
print(f"[OK]  unique (user,item) pairs: {len(ui_score):,}")

# 3) Reindex users/items to 0..n-1
users = sorted({u for (u, _) in ui_score.keys()})
items = sorted({i for (_, i) in ui_score.keys()})
u2idx = {u: idx for idx, u in enumerate(users)}
it2idx = {it: idx for idx, it in enumerate(items)}
idx2it = {v: k for k, v in it2idx.items()}

# 4) Build CSR matrix (users × items)
rows, cols, vals = [], [], []
for (u, it), s in ui_score.items():
    rows.append(u2idx[u])
    cols.append(it2idx[it])
    vals.append(float(s))

UI = csr_matrix((vals, (rows, cols)), shape=(len(users), len(items)))
print(f"[OK] UI shape: {UI.shape} | nnz={UI.nnz:,}")

# 5) Quick sanity prints
if len(users) == 0 or len(items) == 0:
    print("[WARN] UI is empty — check time window, VALID_ACTIONS, and item_id presence.")
else:
    density = UI.nnz / (UI.shape[0] * UI.shape[1] + 1e-9)
    print(f"[INFO] density: {density:.8f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['item_id'] = pd.to_numeric(chunk['item_id'], errors='coerce').astype('Int64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['item_id'] = pd.to_numeric(chunk['item_id'], errors='coerce').astype('Int64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['item_id'] = pd.to_numeric(chun

[INFO] rows processed: 5,318,353 | aggregated pairs (chunked): 4,285,781
[OK]  unique (user,item) pairs: 4,285,733
[OK] UI shape: (413038, 440466) | nnz=4,285,733
[INFO] density: 0.00002356


## Item-Item Similarity

In [22]:
# Cosine similarity with block-wise Top-K pruning
TOPK_SIM_PER_ITEM = 100    # keep top-K neighbors per item (tune)
BLOCK_SIZE = 4000          # items per block when computing similarity (tune)
MIN_SIM = 0.0              # optional similarity threshold

def compute_item_item_topk(UI: csr_matrix,
                           topk: int = TOPK_SIM_PER_ITEM,
                           block_size: int = BLOCK_SIZE,
                           min_sim: float = MIN_SIM) -> csr_matrix:
    """
    Compute item–item cosine similarity in blocks and keep Top-K neighbors per item.
    - UI: users × items (CSR)
    Returns:
    - S: items × items (CSR), pruned to Top-K per row
    """
    # items × users (each row is an item vector in user space)
    X = UI.T.tocsr()
    n_items = X.shape[0]

    data, rows, cols = [], [], []
    for start in range(0, n_items, block_size):
        end = min(start + block_size, n_items)

        # cosine on sparse → returns sparse if dense_output=False (needs sklearn >=1.4)
        sim_block = cosine_similarity(X[start:end], X, dense_output=False)
        # remove self-similarity in this sub-block’s diagonal
        sim_block.setdiag(0.0)

        # row-wise Top-K for the current block
        sim_block = sim_block.tocsr()
        for i in range(sim_block.shape[0]):
            row = sim_block[i]
            if row.nnz == 0:
                continue

            # pick Top-K indices by similarity
            vals = row.data
            idxs = row.indices
            if topk is not None and row.nnz > topk:
                top_idx = np.argpartition(vals, -topk)[-topk:]
                vals = vals[top_idx]
                idxs = idxs[top_idx]

            # optional min similarity cutoff
            if min_sim > 0.0:
                mask = vals >= min_sim
                if not np.any(mask):
                    continue
                vals = vals[mask]
                idxs = idxs[mask]

            # write back with row offset (row item index = start + i)
            rows.extend([start + i] * len(idxs))
            cols.extend(idxs.tolist())
            data.extend(vals.tolist())

        if (start // block_size) % 10 == 0:
            print(f"[SIM] processed items {start}..{end-1} / {n_items}")

    S = csr_matrix((data, (rows, cols)), shape=(n_items, n_items))
    S.eliminate_zeros()
    return S

# ---- run similarity ----
if UI.shape[1] == 0:
    raise RuntimeError("UI has no items; check previous phase.")
S = compute_item_item_topk(UI, topk=TOPK_SIM_PER_ITEM, block_size=BLOCK_SIZE, min_sim=MIN_SIM)
print(f"[OK] similarity S shape: {S.shape} | nnz={S.nnz:,} | avg neighbors/item ≈ {S.nnz // max(1,S.shape[0])}")

[SIM] processed items 0..3999 / 440466
[SIM] processed items 40000..43999 / 440466
[SIM] processed items 80000..83999 / 440466
[SIM] processed items 120000..123999 / 440466
[SIM] processed items 160000..163999 / 440466
[SIM] processed items 200000..203999 / 440466
[SIM] processed items 240000..243999 / 440466
[SIM] processed items 280000..283999 / 440466
[SIM] processed items 320000..323999 / 440466
[SIM] processed items 360000..363999 / 440466
[SIM] processed items 400000..403999 / 440466
[SIM] processed items 440000..440465 / 440466
[OK] similarity S shape: (440466, 440466) | nnz=26,568,156 | avg neighbors/item ≈ 60


## Batch Recommend (sparse top-N per user)

In [25]:
def _sparse_topn_row(
    row: csr_matrix,
    seen_idx: Optional[np.ndarray],
    n_recs: int
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Extract top-N (indices, scores) from a sparse row without densifying.
    - row: 1 x n_items CSR row (scores for one user)
    - seen_idx: array of column indices the user has already seen (to exclude)
    - n_recs: number of items to keep
    Returns:
      (top_col_indices, top_scores) sorted by descending score
    """
    if row.nnz == 0:
        return np.array([], dtype=int), np.array([], dtype=float)

    vals = row.data
    cols = row.indices

    if seen_idx is not None and len(seen_idx) > 0:
        # exclude seen indices
        mask = np.isin(cols, seen_idx, invert=True)
        if not np.any(mask):
            return np.array([], dtype=int), np.array([], dtype=float)
        vals = vals[mask]
        cols = cols[mask]

    if len(vals) == 0:
        return np.array([], dtype=int), np.array([], dtype=float)

    if len(vals) > n_recs:
        top_idx = np.argpartition(vals, -n_recs)[-n_recs:]
        vals, cols = vals[top_idx], cols[top_idx]

    # sort by score desc
    order = np.argsort(-vals)
    return cols[order], vals[order]


def recommend_for_users_batch(
    UI: csr_matrix,
    S: csr_matrix,
    u2idx: dict,
    idx2it: dict,
    user_ids: Optional[Iterable[int]] = None,
    n_recs: int = 20,
    batch_size: int = 1024,
    filter_seen: bool = True,
    min_score: Optional[float] = None,
    as_dataframe: bool = True
):
    """
    Batch item-based CF recommendation: for a set of users, compute Top-N items each.
    Works fully sparse (no densify), so it's memory-safe for large matrices.

    Parameters
    ----------
    UI : csr_matrix
        Users × Items implicit matrix.
    S : csr_matrix
        Items × Items similarity matrix (Top-K pruned).
    u2idx : dict
        Mapping user_id -> row index in UI.
    idx2it : dict
        Mapping column index -> item_id.
    user_ids : iterable[int] or None
        Users to score. If None, score all users that exist in u2idx.
    n_recs : int
        Number of items to recommend per user.
    batch_size : int
        Number of users per multiplication batch.
    filter_seen : bool
        Whether to exclude items already interacted by user.
    min_score : float or None
        Optional filter to drop items with score < min_score.
    as_dataframe : bool
        If True, return a DataFrame with columns [user_id, item_id, score].
        If False, return dict[user_id] -> list[(item_id, score)].

    Returns
    -------
    pd.DataFrame or dict
    """
    if user_ids is None:
        # all users that appear in UI (i.e., have a row index)
        user_ids = list(u2idx.keys())

    results_rows = []  # collect rows for DataFrame
    results_dict = {}  # or dict output

    # materialize id->row index only for requested users
    pairs = [(uid, u2idx[uid]) for uid in user_ids if uid in u2idx]
    if not pairs:
        return (pd.DataFrame(columns=['user_id', 'item_id', 'score'])
                if as_dataframe else {})

    # process in batches
    for start in range(0, len(pairs), batch_size):
        sub = pairs[start:start + batch_size]
        idxs = [u_idx for (_, u_idx) in sub]
        UI_batch = UI[idxs]                # (B × n_items) CSR
        scores_batch = UI_batch.dot(S)     # (B × n_items) CSR

        for row_pos, (uid, u_idx) in enumerate(sub):
            row_scores = scores_batch.getrow(row_pos)  # 1 × n_items
            seen = UI.getrow(u_idx).indices if filter_seen else None

            cols, vals = _sparse_topn_row(row_scores, seen, n_recs)
            if min_score is not None and len(vals) > 0:
                keep = vals >= min_score
                cols, vals = cols[keep], vals[keep]

            if as_dataframe:
                for c, v in zip(cols, vals):
                    results_rows.append((uid, idx2it[c], float(v)))
            else:
                results_dict[uid] = [(idx2it[c], float(v)) for c, v in zip(cols, vals)]

        if (start // batch_size) % 10 == 0:
            print(f"[BATCH] processed users {start}..{start+len(sub)-1} / {len(pairs)}")

    if as_dataframe:
        df = pd.DataFrame(results_rows, columns=['user_id', 'item_id', 'score'])
        # optionally ensure descending per-user order
        df = (df.sort_values(['user_id', 'score'], ascending=[True, False])
                .reset_index(drop=True))
        return df
    else:
        return results_dict

In [26]:
# 1) Generate Top-20 recommendations for all users with history
#    (Note: the full result can be very large).
#    Start with a subset (e.g., the first 50 users) for testing.
all_users_with_history = [u for u in u2idx.keys() if UI[u2idx[u]].getnnz() > 0]
subset_users = all_users_with_history[:5000]  # Test with the first 5k users

recs_df = recommend_for_users_batch(
    UI, S, u2idx, idx2it,
    user_ids=subset_users,
    n_recs=20,
    batch_size=1024,
    filter_seen=True,
    min_score=None,
    as_dataframe=True
)

print(recs_df.head())
print("[OK] batch recommendations:", recs_df.shape)

# 2) Save results
ensure_outdir(OUTDIR)
recs_df.to_csv(f"{OUTDIR}/item_based_recs_top20_subset.csv", index=False)
print("Saved:", f"{OUTDIR}/item_based_recs_top20_subset.csv")

[BATCH] processed users 0..1023 / 5000
   user_id  item_id     score
0        1    95268  0.673722
1        1   207111  0.673722
2        1    39162  0.673722
3        1   929503  0.663435
4        1   180450  0.595201
[OK] batch recommendations: (99722, 3)
Saved: /content/drive/MyDrive/Projects/cf_item_based/item_based_recs_top20_subset.csv
