# Part 2 — Item-based Collaborative Filtering

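This notebook builds mean-centered user–item rating profiles, computes an item-item similarity matrix (cosine similarity on mean-centered ratings, taken over the users who rated both items), produces top-N recommendations for the test users, and evaluates rating predictions on the test set with RMSE. Changes in this edit: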

- Added a clear header and short description.
- Tidied imports and moved constants close to the top.
- Kept original logic intact; only reorganized cells for clarity.

Usage:
- Run cells top-to-bottom. Input data files are expected in the parent folder ("../rating10user91_trainset.csv", "../rating10user91_testset.csv").


In [1]:
# Core imports
from pathlib import Path
from typing import Dict, List, Tuple

import pandas as pd
import numpy as np

# Upper-case aliases for the numpy and pandas modules.
# NOTE(review): neither alias is referenced in the visible cells of this
# notebook — confirm nothing else uses them before removing.
NP = np
PD = pd


In [2]:
# Training ratings CSV, read by the "Load" cell (path is relative to this notebook).
INPUT_PATH = "../rating10user91_trainset.csv"
# Destination of the mean-centered user x item profile written by the "Save CSV" cell.
OUTPUT_PATH = "P2Part2_1Profile_Group4.csv"

In [3]:
# 1. Load
# Read the training ratings into `df`; the following cells rename and clean its columns.
df = pd.read_csv(INPUT_PATH)

In [4]:
# Auto-detect the user / item / rating columns (fallback to the first three
# columns when any of them cannot be identified).
original_columns = list(df.columns)
cols_lower = [str(c).lower() for c in original_columns]


def find_col(candidates, exclude=()):
    """Return the first original column whose lower-cased name contains a candidate.

    Parameters
    ----------
    candidates : iterable of str
        Substring patterns, tried in the given order (earlier patterns win).
        Matching is case-insensitive.
    exclude : iterable, optional
        Original column names that must not be returned, so that a column
        already claimed for another role is not picked a second time.
        ``None`` entries are ignored.

    Returns
    -------
    The matching original column name, or ``None`` when nothing matches.
    """
    taken = set(exclude)
    for pat in candidates:
        needle = pat.lower()  # column names are compared lower-cased
        for i, c in enumerate(cols_lower):
            if needle in c and original_columns[i] not in taken:
                return original_columns[i]
    return None


# 'userid' / 'user_id' / 'userId' are all covered by the substring 'user'.
user_col = find_col(['user', 'uid', 'customer'])
# Resolve the rating before the item: the generic item patterns 'book' and 'id'
# would otherwise be able to claim a column such as "Book-Rating" or "user_id".
rating_col = find_col(['rating', 'rate', 'score', 'stars', 'value'], exclude=[user_col])
item_col = find_col(
    ['isbn', 'item', 'movie', 'product', 'book', 'id'],
    exclude=[user_col, rating_col],
)

if user_col is None or item_col is None or rating_col is None:
    if len(original_columns) < 3:
        raise ValueError(
            f"Need at least three columns (user, item, rating); found: {original_columns}"
        )
    # Positional fallback: assume the file is laid out as user, item, rating.
    user_col, item_col, rating_col = original_columns[:3]

# Rename to canonical names used throughout the notebook
df = df.rename(columns={user_col: 'user', item_col: 'item', rating_col: 'rating'})
# Ensure lower-case column names for downstream consistency
DF_COLS_LOWER = ['user', 'item', 'rating']

# Show head for quick sanity check
df.head()

Unnamed: 0,user,item,rating
0,6251,60392452,10
1,6251,61009059,7
2,6251,140067477,10
3,6251,375727345,6
4,6251,380789035,7


In [5]:
# 2) Clean types
# Fail on the first canonical column that is absent after the rename step.
required_cols = ('user', 'item', 'rating')
absent = next((name for name in required_cols if name not in df.columns), None)
if absent is not None:
    raise KeyError(f"Expected column '{absent}' in dataframe after rename. Found: {df.columns.tolist()}")

# Identifiers become strings, ratings become numeric (unparseable -> NaN),
# and rows without a usable rating are discarded.
df = (
    df.assign(
        user=df['user'].astype(str),
        item=df['item'].astype(str),
        rating=pd.to_numeric(df['rating'], errors='coerce'),
    )
    .dropna(subset=['rating'])
    .copy()
)

df.head()

Unnamed: 0,user,item,rating
0,6251,60392452,10
1,6251,61009059,7
2,6251,140067477,10
3,6251,375727345,6
4,6251,380789035,7


In [6]:
# 3) Per-user mean, mean-centering
# Average rating of each user, indexed by user id.
user_mean = df['rating'].groupby(df['user']).mean()
# Broadcast each user's average back onto that user's rows, then subtract it.
per_row_mean = df['user'].map(user_mean)
df['user_mean'] = per_row_mean
df['rating_centered'] = df['rating'].sub(per_row_mean)
df.head()

Unnamed: 0,user,item,rating,user_mean,rating_centered
0,6251,60392452,10,8.545455,1.454545
1,6251,61009059,7,8.545455,-1.545455
2,6251,140067477,10,8.545455,1.454545
3,6251,375727345,6,8.545455,-2.545455
4,6251,380789035,7,8.545455,-1.545455


In [7]:
# 4) Pivot: users (rows) x items (columns)
# The mean-centered column is produced by the preceding "Per-user mean" cell;
# it is not recomputed here, so fail early with a clear message if it is missing.
if 'rating_centered' not in df.columns:
    raise KeyError(
        "Column 'rating_centered' not found; run the per-user mean-centering cell first."
    )

# Cells hold the centered rating; (user, item) pairs without a rating stay NaN.
# Duplicate (user, item) rows, if any, are averaged.
pivot = df.pivot_table(index='user', columns='item', values='rating_centered', aggfunc='mean')
# canonical index name for clarity
pivot.index.name = 'user'
pivot.head()

item,014028009X,014029628X,034538475X,043935806X,044021145X,044022165X,044023722X,044651652X,059035342X,067976402X,...,671003755,671027360,671041789,679781587,743418174,786868716,804106304,805063897,842329129,971880107
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101851,,,,1.0,,,,,,,...,,,,,,,,,,
102647,,,,,,,,,,,...,,,,,,,,,,
104636,,,,,1.375,-0.625,,,1.375,,...,-0.625,,,,,,,,,
105028,,,,,,,,,,,...,,,,,,,,,,
105517,,,,-2.428571,,,,,,,...,,,,,,,,,,


In [8]:
# Profile dimensions: (number of users, number of items).
pivot.shape

(91, 112)

In [9]:
# 5) Save CSV
# Persist the mean-centered user x item profile: values are written with four
# decimals and missing (unrated) cells as empty fields.
pivot.to_csv(OUTPUT_PATH, float_format="%.4f", na_rep="")

In [10]:
# Group number used to build the output file names below.
GROUP_NO = 4
# Destination CSV for the item-item similarity matrix.
MODEL_PATH = f"P2Part2_2Model_Group{GROUP_NO}.csv"

# Lecturer-faithful settings:
MIN_OVERLAP = 2        # require at least 2 co-raters to avoid degenerate ±1 from single overlap
APPLY_SHRINKAGE = False  # when True, each similarity is scaled by c / (c + SHRINKAGE_LAMBDA), c = co-rater count
SHRINKAGE_LAMBDA = 10  # ignored if APPLY_SHRINKAGE is False

print(f"Writing similarity matrix to: {MODEL_PATH}")

Writing similarity matrix to: P2Part2_2Model_Group4.csv


In [11]:
# User x item table of the ORIGINAL (uncentered) ratings, built from the
# canonical column names; the row axis is explicitly labelled 'user'.
raw_pivot = (
    df.pivot_table(index='user', columns='item', values='rating', aggfunc='mean')
    .rename_axis(index='user')
)
raw_pivot.head()

item,014028009X,014029628X,034538475X,043935806X,044021145X,044022165X,044023722X,044651652X,059035342X,067976402X,...,671003755,671027360,671041789,679781587,743418174,786868716,804106304,805063897,842329129,971880107
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101851,,,,10.0,,,,,,,...,,,,,,,,,,
102647,,,,,,,,,,,...,,,,,,,,,,
104636,,,,,10.0,8.0,,,10.0,,...,8.0,,,,,,,,,
105028,,,,,,,,,,,...,,,,,,,,,,
105517,,,,4.0,,,,,,,...,,,,,,,,,,


In [12]:
# Item-item similarity on the mean-centered profile: for every pair of items the
# cosine is taken over the users who rated BOTH items (NaN marks "not rated").
centered_pivot = pivot  # alias only; `pivot` already holds rating_centered values

items = centered_pivot.columns.to_list()
n_items = len(items)
values = centered_pivot.values  # shape: (num_users, num_items)

sim_mat = np.zeros((n_items, n_items), dtype=np.float64)
overlap_mat = np.zeros((n_items, n_items), dtype=np.int32)  # co-rater counts, diagnostics only

# Rated / not-rated flags, computed once instead of once per item pair.
rated = ~np.isnan(values)

for i in range(n_items):
    vi = values[:, i]
    mask_i = rated[:, i]
    # The matrix is symmetric, so only the strict upper triangle is computed
    # and each value is mirrored.
    for j in range(i + 1, n_items):
        vj = values[:, j]
        mask_j = rated[:, j]
        common = mask_i & mask_j
        c = int(common.sum())
        if c == 0 or c < MIN_OVERLAP:
            continue  # too few co-raters: similarity stays 0
        vi_c = vi[common]
        vj_c = vj[common]
        # numerator & denominators
        num = np.dot(vi_c, vj_c)
        denom_i = np.sqrt(np.dot(vi_c, vi_c))
        denom_j = np.sqrt(np.dot(vj_c, vj_c))
        if denom_i == 0 or denom_j == 0:
            continue  # leave similarity 0 (undefined)
        # Floating-point rounding can push a cosine marginally outside [-1, 1]
        # (e.g. 1.0000000000000002); clamp it back into the valid range.
        raw_sim = float(np.clip(num / (denom_i * denom_j), -1.0, 1.0))
        if APPLY_SHRINKAGE:
            # Damp similarities supported by few co-raters.
            weight = c / (c + SHRINKAGE_LAMBDA)
            sim_val = weight * raw_sim
        else:
            sim_val = raw_sim
        sim_mat[i, j] = sim_val
        sim_mat[j, i] = sim_val
        overlap_mat[i, j] = c
        overlap_mat[j, i] = c

# Diagonal: an item with at least one rating is fully similar to itself.
diag_idx = np.flatnonzero(rated.any(axis=0))
sim_mat[diag_idx, diag_idx] = 1.0

sim_df = pd.DataFrame(sim_mat, index=items, columns=items)
sim_df.to_csv(MODEL_PATH, float_format="%.6f")
print(f"Similarity matrix saved with shape {sim_df.shape}")
nonzero_pairs = np.sum((sim_mat != 0) & (~np.eye(n_items, dtype=bool)))
print(f"Non-zero off-diagonal entries: {nonzero_pairs}")

Similarity matrix saved with shape (112, 112)
Non-zero off-diagonal entries: 5360


In [13]:
# Distribution of the pairwise similarities: strict upper triangle only (each
# pair once, diagonal excluded), ignoring entries that are exactly zero.
tri = sim_mat[np.triu_indices_from(sim_mat, k=1)]
valid = tri[np.flatnonzero(tri)]
print("Count valid pairwise similarities:", valid.size)
if valid.size > 0:
    print("Min / Max similarity:", valid.min(), valid.max())
    print("Fraction negative:", np.mean(valid < 0))

# Example: the five most similar neighbors of the FIRST item in the matrix,
# ranked by signed similarity (largest first), with the item itself removed.
example_item = items[0]
row_series = sim_df.loc[example_item].drop(labels=[example_item])
top5 = row_series.sort_values(ascending=False).iloc[:5]
print(f"Top 5 neighbors for {example_item} (signed sims):")
print(top5)

Count valid pairwise similarities: 2680
Min / Max similarity: -1.0 1.0000000000000002
Fraction negative: 0.5257462686567164
Top 5 neighbors for 014028009X (signed sims):
61009059     1.000000
60392452     1.000000
590353403    1.000000
440498058    1.000000
609804138    0.889254
Name: 014028009X, dtype: float64


In [14]:
# Input ratings (paths are relative to this notebook).
TRAIN_PATH = "../rating10user91_trainset.csv"
TEST_PATH = "../rating10user91_testset.csv"  # adjust if path differs
# Similarity matrix is read back from the CSV named with GROUP_NO.
SIM_PATH = f"P2Part2_2Model_Group{GROUP_NO}.csv"
# Destination of the top-N recommendation list.
RECOMMEND_PATH = f"P2Part2_3Recommendation_Group{GROUP_NO}.csv"

# Parameters / toggles (lecturer-style weighted sum — denominator is sum of sims, not abs)
USE_ABS_IN_DENOM = False          # lecturer formula uses sum(sim), not sum(|sim|)
INCLUDE_NEGATIVE_SIMS = False     # use only most similar (positive) neighbors
TOP_K_NEIGHBORS = 50              # cap neighbors per item (choose K; lecturer shows 2-NN as example)

# Predictions are clamped to this rating range; TOP_N recommendations are kept per user.
CLIP_MIN, CLIP_MAX = 1.0, 10.0
TOP_N = 10

print(f"Using similarity: {SIM_PATH}")
print(f"Recommendations will be written to: {RECOMMEND_PATH}")

Using similarity: P2Part2_2Model_Group4.csv
Recommendations will be written to: P2Part2_3Recommendation_Group4.csv


In [15]:
# Raw (lower-cased) column spellings mapped onto the canonical names used below.
_COLUMN_ALIASES = {"userid": "user", "user_id": "user", "isbn": "item", "book": "item"}


def _load_ratings(path):
    """Read a ratings CSV and return it with canonical, consistently typed columns.

    Column names are lower-cased and known aliases are renamed to
    'user' / 'item' / 'rating'. User and item ids are converted to strings,
    ratings to numbers (unparseable values become NaN), and rows without a
    usable rating are dropped.

    Raises
    ------
    KeyError
        If a canonical column is still missing after renaming.
    """
    frame = pd.read_csv(path)
    frame.columns = [c.lower() for c in frame.columns]
    frame = frame.rename(columns=_COLUMN_ALIASES)

    missing = [c for c in ("user", "item", "rating") if c not in frame.columns]
    if missing:
        raise KeyError(
            f"{path}: missing expected column(s) {missing}; found {frame.columns.tolist()}"
        )

    # Ensure consistent types
    frame["user"] = frame["user"].astype(str)
    frame["item"] = frame["item"].astype(str)
    frame["rating"] = pd.to_numeric(frame["rating"], errors="coerce")

    # Drop NaN ratings if any
    return frame.dropna(subset=["rating"])


train_df = _load_ratings(TRAIN_PATH)
# Held-out ratings: supply the target users and the ground truth for RMSE.
test_df = _load_ratings(TEST_PATH)

# Load similarity matrix (items x items)
sim_df = pd.read_csv(SIM_PATH, index_col=0)
# Ensure item ids are strings
sim_df.index = sim_df.index.astype(str)
sim_df.columns = sim_df.columns.astype(str)

print(
    "Train users:",
    train_df["user"].nunique(),
    "Train items:",
    train_df["item"].nunique(),
    "Similarity matrix shape:",
    sim_df.shape,
)

Train users: 91 Train items: 112 Similarity matrix shape: (112, 112)


In [16]:
# Lookup tables consumed by the prediction step.

# user id -> [(item, rating), ...] taken from the training ratings
user_ratings = {
    user: list(zip(rows['item'], rows['rating']))
    for user, rows in train_df.groupby('user')
}

# Fallback values: each user's average training rating and the overall average.
user_mean = train_df['rating'].groupby(train_df['user']).mean().to_dict()
global_mean = train_df['rating'].mean()

# Every item that has a row in the similarity matrix.
all_items = set(sim_df.index)

In [17]:
# Neighbor lists: item -> [(neighbor_item, similarity), ...], most similar first.
item_neighbors = {}

for item, row in sim_df.iterrows():
    # An item is never its own neighbor.
    row = row.drop(item, errors='ignore')
    # Zero entries are discarded, and negative ones too unless explicitly allowed.
    keep = row != 0
    if not INCLUDE_NEGATIVE_SIMS:
        keep &= row > 0
    row = row[keep]
    # Rank by signed similarity (largest first), then apply the optional top-K cap.
    row = row.sort_values(ascending=False)
    cap = TOP_K_NEIGHBORS or 0
    if cap > 0:
        row = row.head(cap)
    item_neighbors[item] = list(row.items())

print("Example neighbor list length distribution (first 5 items):")
for it, neighs in list(item_neighbors.items())[:5]:
    print(it, len(neighs))

Example neighbor list length distribution (first 5 items):
014028009X 37
014029628X 24
034538475X 22
043935806X 27
044021145X 39


In [18]:
def predict_for_user(user_id):
    """
    Predict ratings for the items `user_id` has not rated in the training data.

    Each candidate item j is scored with the similarity-weighted average of the
    user's own ratings: sum(sim(i, j) * r_ui) / sum(sim(i, j)), where i runs over
    the user's rated items that list j among their precomputed neighbors
    (`item_neighbors`). When USE_ABS_IN_DENOM is set, |sim| is used in the
    denominator instead. Candidates with no contributing neighbor (or a zero
    denominator) fall back to the user's mean rating. Scores are clamped to
    [CLIP_MIN, CLIP_MAX].

    Returns a list of (item, predicted_rating) sorted by predicted rating
    (highest first), ties broken by item id. A user without training ratings
    gets every item at the global mean, ordered by item id so the result is
    reproducible; a user who has rated every item gets an empty list.
    """
    rated_pairs = user_ratings.get(user_id, [])
    if not rated_pairs:
        # Cold user fallback: global mean for all items. Sorted so the order
        # (and therefore any top-N cut) does not depend on set iteration order.
        return [(it, global_mean) for it in sorted(all_items)]

    rated_items = {it for it, _ in rated_pairs}
    # Candidate = items present in similarity matrix but not already rated (train)
    candidates = all_items - rated_items
    if not candidates:
        return []

    num = {}
    denom = {}

    # For each rated item, propagate its rating to candidate items via its neighbors
    for i, r_ui in rated_pairs:
        for j, sim_val in item_neighbors.get(i, []):
            if j not in candidates:
                continue
            num[j] = num.get(j, 0.0) + sim_val * r_ui
            # abs() is not the lecturer's formula; kept for optional experiments
            weight = abs(sim_val) if USE_ABS_IN_DENOM else sim_val
            denom[j] = denom.get(j, 0.0) + weight

    # Compute predictions
    u_mean = user_mean.get(user_id, global_mean)
    preds = []
    for j in candidates:
        # `num` and `denom` are always updated together, so a non-zero
        # denominator implies a numerator exists for j.
        d = denom.get(j, 0.0)
        pred = num[j] / d if d != 0 else u_mean
        # Clamp to rating scale
        pred = min(max(pred, CLIP_MIN), CLIP_MAX)
        preds.append((j, pred))

    # Sort by predicted rating descending, break ties by item id
    preds.sort(key=lambda x: (-x[1], x[0]))
    return preds

In [19]:
# Recommend for every user that appears in the test set, in sorted order.
target_users = sorted(test_df['user'].unique())

# One (user, item, score) row per recommendation: the first TOP_N entries of
# each user's ranked prediction list.
records = [
    (u, item, score)
    for u in target_users
    for item, score in predict_for_user(u)[:TOP_N]
]

recs_df = pd.DataFrame(records, columns=['user','item','pred_rating'])
recs_df.to_csv(RECOMMEND_PATH, index=False, float_format="%.4f")
recs_df.head(15)

Unnamed: 0,user,item,pred_rating
0,101851,014028009X,10.0
1,101851,067976402X,10.0
2,101851,1400031354,10.0
3,101851,312195516,10.0
4,101851,312278586,10.0
5,101851,312983271,10.0
6,101851,316096199,10.0
7,101851,345350499,10.0
8,101851,375726403,10.0
9,101851,380018179,10.0


In [20]:
# Sanity check: every target user should appear at least once in the output.
print("Number of target users:", len(target_users))
print("Rows in recommendation file (should be users * TOP_N unless fewer candidates):", len(recs_df))
recommended_users = set(recs_df['user'])
missing_any = [u for u in target_users if u not in recommended_users]
if not missing_any:
    print("All users have at least one recommendation.")
else:
    print("Users with no recommendations (unexpected):", missing_any)

Number of target users: 40
Rows in recommendation file (should be users * TOP_N unless fewer candidates): 400
All users have at least one recommendation.


### RMSE evaluation using existing predict_for_user (no duplicated logic)
#### For each user in the test set, compute predictions once via predict_for_user() then look up predictions for that user's test items. Save per-row predictions to CSV.

In [23]:
from math import sqrt

# RMSE evaluation: output locations
PREDICTIONS_PATH = f"P2Part2_4_Predictions_Group{GROUP_NO}.csv"
RMSE_PATH = f"P2Part2_4_RMSE_Group{GROUP_NO}.csv"

records = []
fallback_count = 0       # test rows scored with the user's mean rating
missing_item_count = 0   # subset of the above: item has no row in the similarity matrix

# Predictions are computed once per user and then looked up per test row.
for user, group in test_df.groupby("user"):
    user = str(user)
    preds = predict_for_user(user)  # list of (item, predicted_rating)
    preds_dict = dict(preds)
    u_mean = user_mean.get(user, global_mean)

    for raw_item, raw_rating in zip(group["item"], group["rating"]):
        item = str(raw_item)
        actual = float(raw_rating)

        pred = preds_dict.get(item)
        if pred is None:
            # predict_for_user returned nothing for this (user, item): the item
            # is absent from the similarity matrix, or it was excluded from the
            # candidates because the user already rated it in the training set.
            fallback_count += 1
            if item not in all_items:
                missing_item_count += 1
            pred = u_mean

        # Clip to rating range
        pred = max(CLIP_MIN, min(CLIP_MAX, pred))
        records.append((user, item, actual, pred))

# Save per-row predictions
pred_df = pd.DataFrame(
    records, columns=["user", "item", "actual_rating", "predicted_rating"]
)
pred_df.to_csv(PREDICTIONS_PATH, index=False, float_format="%.4f")

# Root of the mean squared difference between actual and predicted ratings
squared_error = (pred_df["actual_rating"] - pred_df["predicted_rating"]).pow(2)
mse = squared_error.mean()
rmse = sqrt(mse)
with open(RMSE_PATH, "w") as f:
    f.write(f"RMSE,{rmse:.4f}\n")

print(f"Prediction rows: {len(pred_df)}")
print(f"RMSE on test set: {rmse:.4f}")
print(f"Fallback predictions (used user mean): {fallback_count}")
print(f"Items missing from similarity matrix: {missing_item_count}")
print(f"Predictions saved to: {PREDICTIONS_PATH}")
print(f"RMSE saved to: {RMSE_PATH}")

Prediction rows: 200
RMSE on test set: 1.7518
Fallback predictions (used user mean): 10
Items missing from similarity matrix: 0
Predictions saved to: P2Part2_4_Predictions_Group4.csv
RMSE saved to: P2Part2_4_RMSE_Group4.csv
