In [4]:
# STAT3009 — Full TabRS.SVD Midterm Pipeline with CV & Hyperparameter Tuning
# Requirements: numpy, pandas, sklearn
# Make sure you have internet or that TabRS.py is available locally.

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
import itertools
import time
import os
import sys

In [None]:
# Competition page: https://www.kaggle.com/competitions/cuhk-stat-3009-quiz-2

In [50]:
import urllib.request
from pathlib import Path
import os

user = "statmlben"
repo = "CUHK-STAT3009"
src = "src"
pyfile = "TabRS.py"
url = f"https://raw.githubusercontent.com/{user}/{repo}/main/{src}/{pyfile}"

# Download the module into the working directory unless a copy is already there.
# `pyfile` is the single source of truth for both the URL and the local filename.
local_copy = Path(pyfile)
if not local_copy.exists():
    print(f"Downloading {pyfile} from GitHub ...")
    # Fetch into a temporary name and rename only after the transfer finished, so an
    # interrupted download never leaves a truncated file that the exists() check
    # above would accept on the next run.
    partial_copy = local_copy.with_name(local_copy.name + ".part")
    urllib.request.urlretrieve(url, partial_copy)
    partial_copy.replace(local_copy)
else:
    print(f"✅ {pyfile} already exists — skipping download")

from TabRS import SVD


✅ TabRS.py already exists — skipping download


In [37]:
# ---------------------------
# 1. Data paths — adjust if needed
# ---------------------------
# Every input lives under DATA_DIR; point it elsewhere to switch datasets,
# e.g. Path('/kaggle/input/cuhk-stat-3009-quiz-2') when running on Kaggle.
DATA_DIR = Path('fake_midterm_data')

TRAIN_CSV = DATA_DIR / 'train.csv'
TEST_CSV = DATA_DIR / 'test.csv'
SAMPLE_SUB = DATA_DIR / 'sample_submission.csv'

# Optional side files. They are Path objects because the social-graph cell calls
# USER_SOCIAL.exists() before reading it.
USER_FEATS = DATA_DIR / 'user_feats.csv'
ITEM_FEATS = DATA_DIR / 'item_feats.csv'
USER_SOCIAL = DATA_DIR / 'user_social_net.csv'

# load
train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)
sample_submission = pd.read_csv(SAMPLE_SUB)

print('train shape', train.shape)
print('test shape', test.shape)



train shape (80, 4)
test shape (20, 3)


In [None]:
# ---------------------------
# 2. Remap IDs to contiguous integers (0..n-1) for user,item,group
# ---------------------------
# The vocabulary of each ID column is the sorted union of its train and test
# values, so IDs that occur only in test still receive an index.
all_users, all_items, all_groups = (
    np.union1d(train[col].unique(), test[col].unique())
    for col in ('userID', 'itemID', 'groupID')
)

user2idx = dict(zip(all_users, range(len(all_users))))
item2idx = dict(zip(all_items, range(len(all_items))))
group2idx = dict(zip(all_groups, range(len(all_groups))))

# Work on copies and attach the index columns u_idx / i_idx / g_idx to both frames.
train = train.copy()
test = test.copy()
for frame in (train, test):
    for id_col, idx_col, lookup in (
        ('userID', 'u_idx', user2idx),
        ('itemID', 'i_idx', item2idx),
        ('groupID', 'g_idx', group2idx),
    ):
        frame[idx_col] = frame[id_col].map(lookup).astype(int)

n_users, n_items, n_groups = len(user2idx), len(item2idx), len(group2idx)
print(f"n_users={n_users}, n_items={n_items}, n_groups={n_groups}")

n_users=8, n_items=10, n_groups=3


In [20]:
# ---------------------------
# 3. CV + Grid search helpers (works with TabRS.SVD)
# ---------------------------
def cv_score_svd(X_pairs, y, n_rows, n_cols, params, n_splits=3, random_state=42, verbose=False):
    """
    Cross-validate one TabRS.SVD configuration with shuffled K-fold and report its RMSE.

    X_pairs: array-like of shape (n_samples,2) containing [left_id, right_id] (already remapped)
    y: array-like with one target per row of X_pairs
    n_rows, n_cols: sizes for SVD constructor (order: first arg, second arg)
    params: dict with keys 'K','lam','iterNum'; an optional 'tol' is used when
        present, otherwise 1e-4
    n_splits, random_state: forwarded to KFold (shuffle is always on)
    verbose: when truthy, print the aggregated score for this configuration

    Returns (mean_rmse, std_rmse) across the folds as Python floats.
    """
    # KFold yields positional indices. Converting to ndarrays first keeps
    # X_pairs[idx] / y[idx] positional even if a DataFrame or Series is passed in
    # (pandas would otherwise interpret the integers as index labels).
    X_pairs = np.asarray(X_pairs)
    y = np.asarray(y)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scores = []
    for tr_idx, val_idx in kf.split(X_pairs):
        X_tr, X_val = X_pairs[tr_idx], X_pairs[val_idx]
        y_tr, y_val = y[tr_idx], y[val_idx]

        # instantiate SVD — TabRS.SVD signature: SVD(n_users, n_items, lam=.001, K=10, iterNum=10, tol=..., verbose=...)
        model = SVD(n_rows, n_cols, lam=params['lam'], K=params['K'], iterNum=params['iterNum'], tol=params.get('tol',1e-4), verbose=0)
        model.fit(X_tr, y_tr)
        preds = model.predict(X_val)
        # rmse is the notebook-level helper; it has to be defined before this
        # function is called.
        scores.append(rmse(y_val, preds))
    mean_score = float(np.mean(scores))
    std_score = float(np.std(scores))
    if verbose:
        print(f"params {params} -> CV RMSE = {mean_score:.4f} ± {std_score:.4f}")
    return mean_score, std_score

def grid_search_svd(X_pairs, y, n_rows, n_cols, param_grid, n_splits=3, random_state=42, verbose=True):
    """
    Exhaustively cross-validate every combination in param_grid.

    X_pairs, y, n_rows, n_cols, n_splits, random_state: forwarded unchanged to
        cv_score_svd for each combination
    param_grid: dict mapping a parameter name to the list of values to try
    verbose: when truthy, print one line per combination plus a final summary

    Returns (best_params, results) where results is a list of
    (params, mean_rmse, std_rmse) tuples in evaluation order and best_params is
    the first combination with the lowest mean RMSE. best_params is None when no
    combination scored below infinity (for example an empty value list, or every
    score being NaN).
    """
    names = list(param_grid.keys())
    combos = [dict(zip(names, vals)) for vals in itertools.product(*param_grid.values())]

    # inf (rather than an arbitrary large number) guarantees any finite score wins.
    best_score = float('inf')
    best_params = None
    results = []
    # perf_counter is monotonic, so the elapsed time cannot be skewed by clock changes.
    start = time.perf_counter()
    for p in combos:
        mean_rmse, std_rmse = cv_score_svd(X_pairs, y, n_rows, n_cols, p, n_splits=n_splits, random_state=random_state, verbose=False)
        results.append((p, mean_rmse, std_rmse))
        if verbose:
            print(f"params {p} -> RMSE {mean_rmse:.4f} ± {std_rmse:.4f}")
        # Strict '<' keeps the earliest combination on ties; NaN never compares lower.
        if mean_rmse < best_score:
            best_score = mean_rmse
            best_params = p
    elapsed = time.perf_counter() - start
    if verbose:
        print(f"Grid search finished in {elapsed:.1f}s. Best: {best_params} RMSE={best_score:.4f}")
    return best_params, results

In [21]:
# ---------------------------
# 4. Parameter grid (keep reasonably small; expand if you have time)
# ---------------------------
# Candidate values per hyperparameter; the grid search evaluates every combination.
param_grid = dict(
    K=[5, 10, 20],
    lam=[0.001, 0.01, 0.1],
    iterNum=[5, 10],  # avoid huge iterNums for CV runtime
)


In [24]:
# ---------------------------
# 5. CV + Grid search for each variant
#    a) user + item
# ---------------------------
def rmse(y_true, y_pred):
    """
    Root-mean-squared error between targets and predictions.

    y_true, y_pred: array-likes (lists, ndarrays, pandas Series) holding the same
        number of values in the same order.

    Returns the square root of the mean squared element-wise difference.
    """
    # Coerce to float arrays so that plain lists work and pandas Series are
    # compared position by position instead of being aligned on their index
    # labels (which would inject NaN wherever the labels differ).
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

# All three variants are scored against the same rating vector.
y = train['rating'].values

# a) user + item
pairs_ui = train[['u_idx', 'i_idx']].to_numpy()
print("\n=== Grid search: SVD (user + item) ===")
best_ui_params, ui_results = grid_search_svd(
    pairs_ui, y, n_users, n_items, param_grid, n_splits=3
)

# ---------------------------
#    b) user + group
# ---------------------------
pairs_ug = train[['u_idx', 'g_idx']].to_numpy()
print("\n=== Grid search: SVD (user + group) ===")
best_ug_params, ug_results = grid_search_svd(
    pairs_ug, y, n_users, n_groups, param_grid, n_splits=3
)

# ---------------------------
#    c) item + group
# ---------------------------
pairs_ig = train[['i_idx', 'g_idx']].to_numpy()
print("\n=== Grid search: SVD (item + group) ===")
best_ig_params, ig_results = grid_search_svd(
    pairs_ig, y, n_items, n_groups, param_grid, n_splits=3
)


=== Grid search: SVD (user + item) ===
params {'K': 5, 'lam': 0.001, 'iterNum': 5} -> RMSE 1.6848 ± 0.1542
params {'K': 5, 'lam': 0.001, 'iterNum': 10} -> RMSE 1.5130 ± 0.0408
params {'K': 5, 'lam': 0.01, 'iterNum': 5} -> RMSE 1.4477 ± 0.0821
params {'K': 5, 'lam': 0.01, 'iterNum': 10} -> RMSE 1.5647 ± 0.1622
params {'K': 5, 'lam': 0.1, 'iterNum': 5} -> RMSE 1.2384 ± 0.0322
params {'K': 5, 'lam': 0.1, 'iterNum': 10} -> RMSE 1.2378 ± 0.0310
params {'K': 10, 'lam': 0.001, 'iterNum': 5} -> RMSE 1.9236 ± 0.0924
params {'K': 10, 'lam': 0.001, 'iterNum': 10} -> RMSE 1.6268 ± 0.0518
params {'K': 10, 'lam': 0.01, 'iterNum': 5} -> RMSE 1.5913 ± 0.1067
params {'K': 10, 'lam': 0.01, 'iterNum': 10} -> RMSE 1.3780 ± 0.1066
params {'K': 10, 'lam': 0.1, 'iterNum': 5} -> RMSE 1.2399 ± 0.0299
params {'K': 10, 'lam': 0.1, 'iterNum': 10} -> RMSE 1.2379 ± 0.0307
params {'K': 20, 'lam': 0.001, 'iterNum': 5} -> RMSE 2.0495 ± 0.3042
params {'K': 20, 'lam': 0.001, 'iterNum': 10} -> RMSE 1.7965 ± 0.3089
param

In [25]:
# ---------------------------
# 6. Train final models on full train set using best params
# ---------------------------
print("\n=== Training final models on full train data ===")


def _fit_final_svd(label, n_rows, n_cols, params, train_pairs, targets, test_pairs):
    """
    Fit one SVD on the complete training set and predict the test pairs.

    label: text shown in the progress line, e.g. 'user+item'
    n_rows, n_cols: sizes passed to the SVD constructor (first arg, second arg)
    params: dict with 'K', 'lam', 'iterNum' and optionally 'tol'
    train_pairs, targets: index pairs and ratings used for fitting
    test_pairs: index pairs to predict

    Returns (fitted_model, test_predictions).
    """
    print(f"Training SVD({label}) with", params)
    # tol mirrors what cv_score_svd passes (params.get('tol', 1e-4)), so the model
    # trained here uses the same stopping rule as the one whose CV score selected
    # these parameters.
    model = SVD(n_rows, n_cols,
                lam=params['lam'], K=params['K'], iterNum=params['iterNum'],
                tol=params.get('tol', 1e-4), verbose=1)
    model.fit(train_pairs, targets)
    return model, model.predict(test_pairs)


# user+item final
test_pairs_ui = np.vstack([test['u_idx'].values, test['i_idx'].values]).T
svd_ui, pred_ui = _fit_final_svd('user+item', n_users, n_items, best_ui_params,
                                 pairs_ui, y, test_pairs_ui)

# user+group final
test_pairs_ug = np.vstack([test['u_idx'].values, test['g_idx'].values]).T
svd_ug, pred_ug = _fit_final_svd('user+group', n_users, n_groups, best_ug_params,
                                 pairs_ug, y, test_pairs_ug)

# item+group final
test_pairs_ig = np.vstack([test['i_idx'].values, test['g_idx'].values]).T
svd_ig, pred_ig = _fit_final_svd('item+group', n_items, n_groups, best_ig_params,
                                 pairs_ig, y, test_pairs_ig)


=== Training final models on full train data ===
Training SVD(user+item) with {'K': 5, 'lam': 0.1, 'iterNum': 10}
Fitting Reg-SVD: K: 5, lam: 0.10000
RegSVD-ALS: 0; obj: 1.458; rmse:1.113, diff: 20.591
RegSVD-ALS: 1; obj: 0.973; rmse:0.979, diff: 0.485
RegSVD-ALS: 2; obj: 0.953; rmse:0.975, diff: 0.020
RegSVD-ALS: 3; obj: 0.951; rmse:0.975, diff: 0.001
RegSVD-ALS: 4; obj: 0.951; rmse:0.975, diff: 0.000
RegSVD-ALS: 5; obj: 0.951; rmse:0.975, diff: 0.000
Training SVD(user+group) with {'K': 20, 'lam': 0.1, 'iterNum': 5}
Fitting Reg-SVD: K: 20, lam: 0.10000
RegSVD-ALS: 0; obj: 2.666; rmse:1.172, diff: 72.435
RegSVD-ALS: 1; obj: 1.928; rmse:1.102, diff: 0.738
RegSVD-ALS: 2; obj: 1.437; rmse:1.051, diff: 0.491
RegSVD-ALS: 3; obj: 1.174; rmse:1.019, diff: 0.263
RegSVD-ALS: 4; obj: 1.100; rmse:1.012, diff: 0.074
Training SVD(item+group) with {'K': 20, 'lam': 0.1, 'iterNum': 5}
Fitting Reg-SVD: K: 20, lam: 0.10000
RegSVD-ALS: 0; obj: 2.784; rmse:1.342, diff: 61.687
RegSVD-ALS: 1; obj: 1.497; r

In [28]:
# ---------------------------
# 7. Ensemble averaging & save
# ---------------------------
# Unweighted mean of the three SVD variants, written into a copy of the sample
# submission's layout.
pred_ensemble = (pred_ui + pred_ug + pred_ig) / 3.0
submission = sample_submission.assign(rating=pred_ensemble)
submission.to_csv('submission.csv', index=False)

In [42]:
# Preview only the first rows rather than rendering the whole frame.
train.head()

Unnamed: 0,userID,itemID,groupID,rating
0,6,6,2,3.91
1,3,7,2,2.47
2,4,2,0,3.53
3,6,0,2,3.53
4,2,3,2,3.14
...,...,...,...,...
75,1,6,2,1.78
76,3,0,2,3.89
77,3,3,2,2.12
78,6,3,2,1.10


In [46]:
# Encode userID, itemID and groupID to contiguous integer indices.
# Train and test are encoded against ONE shared vocabulary per column (the sorted
# union of both frames' values). Encoding each frame on its own would hand out
# different codes for the same ID whenever the two frames do not contain exactly
# the same set of IDs. The sorted-union ordering is the same one section 2 uses
# for user2idx / item2idx / group2idx, so the indices stay valid for the SVD
# models fitted on them.
for id_col, idx_col in (('userID', 'u_idx'), ('itemID', 'i_idx'), ('groupID', 'g_idx')):
    vocab = np.union1d(train[id_col].unique(), test[id_col].unique())
    train[idx_col] = pd.Categorical(train[id_col], categories=vocab).codes.astype(int)
    test[idx_col] = pd.Categorical(test[id_col], categories=vocab).codes.astype(int)


In [47]:
print("\n=== Enhanced Hybrid + Fallback Ensemble ===")

# 1️⃣ Base prediction: unweighted mean of the three SVD models (rows follow test.csv order)
pred_ensemble = (pred_ui + pred_ug + pred_ig) / 3.0

# 2️⃣ Statistics used by the cold-start fallback
ratings_by_item = train.groupby('i_idx')['rating']
item_mean = ratings_by_item.mean().to_dict()
global_mean = train['rating'].mean()
known_users_idx, known_items_idx = (
    set(train[col].unique()) for col in ('u_idx', 'i_idx')
)

def predict_with_fallback_idx(u_idx, i_idx, model, default_pred):
    """
    Choose the prediction for one (user index, item index) pair.

    Returns default_pred when both the user and the item were seen in training,
    otherwise the item's mean training rating when one is available, otherwise
    the global mean rating. `model` is accepted but not used here.
    """
    seen_user = u_idx in known_users_idx
    seen_item = i_idx in known_items_idx
    if seen_user and seen_item:
        return default_pred
    # Cold start: prefer the item's own average, else fall back to the global one.
    return item_mean.get(i_idx, global_mean)

preds_combined = np.array([
    predict_with_fallback_idx(u, i, svd_ui, p)
    for u, i, p in zip(test['u_idx'], test['i_idx'], pred_ensemble)
])

# 3️⃣ Build hybrid dataset (add features)
user_feats = pd.read_csv(USER_FEATS)
item_feats = pd.read_csv(ITEM_FEATS)

# --- Prepare training data ---
df = train[['userID', 'itemID', 'u_idx', 'i_idx', 'rating']].copy()
# Add the three SVD predictions and their plain average.
# NOTE(review): these are in-sample predictions (the SVDs were fitted on these
# same rows) while the test-side features are out-of-sample, so Ridge may put
# too much weight on them; out-of-fold predictions would avoid that.
df['svd_ui'] = svd_ui.predict(df[['u_idx','i_idx']].values)
df['svd_ug'] = svd_ug.predict(np.vstack([df['u_idx'], train['g_idx']]).T)
df['svd_ig'] = svd_ig.predict(np.vstack([df['i_idx'], train['g_idx']]).T)
df['svd_ensemble'] = (df['svd_ui'] + df['svd_ug'] + df['svd_ig']) / 3.0

# Merge user and item features. validate='many_to_one' raises if a feature table
# has duplicate keys, which would otherwise silently duplicate rating rows.
df = df.merge(user_feats, on='userID', how='left', validate='many_to_one')
df = df.merge(item_feats, on='itemID', how='left', validate='many_to_one')

# --- Encode + clean ---
# Numeric columns get their training mean, everything else the literal 'NA'.
# The fill values are remembered so the test frame is imputed identically.
numeric_fill = {}
for c in df.columns:
    if pd.api.types.is_numeric_dtype(df[c]):
        col_mean = df[c].mean()
        # An all-missing column has no mean; use 0 instead of leaving NaN behind.
        numeric_fill[c] = 0.0 if pd.isna(col_mean) else col_mean
        df[c] = df[c].fillna(numeric_fill[c])
    else:
        df[c] = df[c].fillna('NA')

ignore_cols = {'userID', 'itemID', 'u_idx', 'i_idx', 'rating'}
# "Not numeric" rather than "dtype == object": string columns are not guaranteed
# to carry the object dtype (e.g. pandas' dedicated string dtypes).
cat_cols = [c for c in df.columns
            if c not in ignore_cols and not pd.api.types.is_numeric_dtype(df[c])]
df = pd.get_dummies(df, columns=cat_cols, dummy_na=True)

feature_cols = [c for c in df.columns if c not in ignore_cols]
# dtype=float turns the boolean dummy columns into 0/1 instead of producing an
# object array from the mixed bool/float frame.
X = df[feature_cols].to_numpy(dtype=float)
y = df['rating'].values

# 4️⃣ Train Ridge on all features
ridge = Ridge(alpha=1.0)
ridge.fit(X, y)

# --- Prepare test data ---
test_df = test[['userID','itemID','u_idx','i_idx','groupID']].copy()
test_df['svd_ui'] = pred_ui
test_df['svd_ug'] = pred_ug
test_df['svd_ig'] = pred_ig
test_df['svd_ensemble'] = preds_combined  # includes fallback logic

test_df = test_df.merge(user_feats, on='userID', how='left', validate='many_to_one')
test_df = test_df.merge(item_feats, on='itemID', how='left', validate='many_to_one')

for c in test_df.columns:
    if pd.api.types.is_numeric_dtype(test_df[c]):
        # Impute with the TRAINING mean; columns unknown to training get 0.
        test_df[c] = test_df[c].fillna(numeric_fill.get(c, 0))
    else:
        test_df[c] = test_df[c].fillna('NA')

test_df = pd.get_dummies(test_df, columns=[c for c in cat_cols if c in test_df.columns], dummy_na=True)
# Align to the training feature layout in one step: dummy columns absent from
# test become 0, columns unknown to training are dropped.
X_test = test_df.reindex(columns=feature_cols, fill_value=0).to_numpy(dtype=float)

# 5️⃣ Final prediction (hybrid + ensemble + fallback)
pred_final = ridge.predict(X_test)
# Keep predictions inside the rating range observed in training.
pred_final = np.clip(pred_final, train['rating'].min(), train['rating'].max())

# Save final submission
submission_final = sample_submission.copy()
submission_final['rating'] = pred_final
submission_final.to_csv('submission_tabrs_final_ensemble.csv', index=False)
print("✅ Saved submission_tabrs_final_ensemble.csv")




=== Enhanced Hybrid + Fallback Ensemble ===
✅ Saved submission_tabrs_final_ensemble.csv


In [None]:
# errors='ignore' keeps this cell re-runnable: a second execution (or a feature
# table without these columns) is a no-op instead of a KeyError.
df = df.drop(columns=['age', 'income'], errors='ignore')


In [None]:
# ---------------------------
# 10. Social graph smoothing / blending (if available)
# ---------------------------
# Use USER_SOCIAL / out_dir when an earlier cell defined them; otherwise fall back
# to the default data location and to the working directory, which is where the
# other submissions of this notebook are written.
social_net_path = Path(globals().get('USER_SOCIAL', 'fake_midterm_data/user_social_net.csv'))
out_dir = Path(globals().get('out_dir', '.'))

SOCIAL_WEIGHT = 0.2  # share of the followee average in the blend
social_path = None   # stays None unless a social submission is written below

if social_net_path.exists():
    print("\n=== Social graph blending ===")
    edges = pd.read_csv(social_net_path)
    # expect columns like ['from','to'] in original IDs; try to detect column names
    if 'from' in edges.columns and 'to' in edges.columns:
        col_from, col_to = 'from', 'to'
    elif 'follower' in edges.columns and 'followee' in edges.columns:
        col_from, col_to = 'follower', 'followee'
    else:
        # try first two columns
        col_from, col_to = edges.columns[0], edges.columns[1]

    # user -> list of followed users, keyed by ORIGINAL user IDs so the lookup
    # does not depend on how u_idx happens to be encoded at this point.
    # NOTE(review): assumes the edge file stores IDs with the same dtype as
    # train['userID'] — confirm against the real file.
    followees = {}
    for follower_id, followee_id in zip(edges[col_from], edges[col_to]):
        followees.setdefault(follower_id, []).append(followee_id)

    # mean training rating per original user ID
    user_mean = train.groupby('userID')['rating'].mean().to_dict()

    def _followee_average(user_id):
        """Mean of the followees' mean ratings, or NaN when none of them has one."""
        vals = [user_mean[f] for f in followees.get(user_id, []) if f in user_mean]
        vals = [v for v in vals if not np.isnan(v)]
        return float(np.mean(vals)) if vals else np.nan

    social_preds = np.array([_followee_average(u) for u in test['userID']], dtype=float)

    # blend: where social exists, blend 80% ensemble + 20% social average
    blended = np.where(
        ~np.isnan(social_preds),
        (1.0 - SOCIAL_WEIGHT) * pred_ensemble + SOCIAL_WEIGHT * social_preds,
        pred_ensemble,
    )
    submission_social = sample_submission.copy()
    submission_social['rating'] = blended
    social_path = out_dir / 'submission_tabrs_social.csv'
    submission_social.to_csv(social_path, index=False)
    print(f"Saved social blend submission: {social_path}")
else:
    print("\nNo user_social_net found — skipping social blending.")

# ---------------------------
# 11. Summary printout
# ---------------------------
print("\n=== Done ===\nGenerated files in:", out_dir.resolve())
# Report the submissions this notebook writes, but only those actually on disk.
for label, path in (
    ('ensemble', out_dir / 'submission.csv'),
    ('hybrid + fallback', out_dir / 'submission_tabrs_final_ensemble.csv'),
):
    if path.exists():
        print(f"- {label}:", path)
if social_path is not None:
    print("- social blend:", social_path)

# Optionally save CV results as CSV for inspection
cv_out = out_dir / 'cv_results_summary.csv'
rows = [
    {'model': model_name, 'K': p['K'], 'lam': p['lam'], 'iterNum': p['iterNum'], 'rmse': mean, 'std': std}
    for model_name, cv_results in (
        ('user_item', ui_results),
        ('user_group', ug_results),
        ('item_group', ig_results),
    )
    for p, mean, std in cv_results
]
pd.DataFrame(rows).to_csv(cv_out, index=False)
print("Saved CV summary:", cv_out)
