***Setup and imports***

In [1]:
# Core Python & scientific stack
import os
import json
import math
import numpy as np
import pandas as pd

from typing import Dict, List

# Sparse + ML utilities
from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error, mean_absolute_error



***Configuration & Data Loading***

In [2]:
# Paths
BASE_DIR = "Data"

PATH_ANIME = os.path.join(BASE_DIR, "anime.csv")
PATH_TRAIN = os.path.join(BASE_DIR, "train.csv")
PATH_TEST  = os.path.join(BASE_DIR, "test.csv")

USER, ITEM, RATE = "user_id", "anime_id", "rating"

# Speed mode toggle
USE_SMALL = False
USER_CAP  = 2000
ITEM_CAP  = 3000

# Load tables
anime_raw = pd.read_csv(PATH_ANIME)
train_raw = pd.read_csv(PATH_TRAIN)
test_raw  = pd.read_csv(PATH_TEST)

# Clean invalid ratings
train_raw = train_raw[pd.to_numeric(train_raw[RATE], errors="coerce").notnull()].copy()
train_raw[RATE] = train_raw[RATE].astype(float)
train_raw = train_raw[train_raw[RATE] >= 0].copy()

# Stats
rmin, rmax = float(train_raw[RATE].min()), float(train_raw[RATE].max())
gmean = float(train_raw[RATE].mean())

print(f"Train size: {train_raw.shape}, Test size: {test_raw.shape}")
print(f"Rating range: [{rmin}, {rmax}], Global mean={gmean:.3f}")


Train size: (5703555, 3), Test size: (633686, 2)
Rating range: [1.0, 10.0], Global mean=7.809


***Build Lightweight Content Text (Metadata)***

In [3]:
text_cols = [c for c in ["genre", "type", "name", "episodes"] if c in anime_raw.columns]

anime_raw["meta_text"] = (
    anime_raw[text_cols].astype(str).agg(" ".join, axis=1)
)

# Keep only item IDs present in training data
all_item_ids = sorted(train_raw[ITEM].unique())
anime_info = pd.DataFrame({ITEM: all_item_ids}).merge(
    anime_raw[[ITEM, "meta_text"]], on=ITEM, how="left"
).fillna({"meta_text": ""})


***Integer Mappings & Sparse Matrix***

In [4]:
# ID → index mappings
u_index = {u: i for i, u in enumerate(sorted(train_raw[USER].unique()))}
i_index = {a: j for j, a in enumerate(all_item_ids)}

nU, nI = len(u_index), len(i_index)
print(f"Users={nU}, Items={nI}")

# Sparse CSR rating matrix
r = train_raw
row = r[USER].map(u_index).values
col = r[ITEM].map(i_index).values
val = r[RATE].values

Rmat = sparse.coo_matrix((val, (row, col)), shape=(nU, nI)).tocsr()


Users=69481, Items=9838


***Per-User & Per-Item Means***

In [5]:
def csr_axis_mean(mat: sparse.csr_matrix, axis=1):
    # axis=1 → per-row; axis=0 → per-column
    if axis == 0:
        return csr_axis_mean(mat.T, axis=1)

    means = np.zeros(mat.shape[0], dtype=float)
    for r in range(mat.shape[0]):
        s, e = mat.indptr[r], mat.indptr[r+1]
        means[r] = mat.data[s:e].mean() if e > s else gmean
    return means

u_mean = csr_axis_mean(Rmat, axis=1)
i_mean = csr_axis_mean(Rmat, axis=0)

# Quick lookup dict of items per user
user_r: List[Dict[int, float]] = [dict() for _ in range(nU)]

for u in range(nU):
    s, e = Rmat.indptr[u], Rmat.indptr[u+1]
    rated_items = Rmat.indices[s:e]
    rated_vals  = Rmat.data[s:e]
    user_r[u]   = {int(i): float(r) for i, r in zip(rated_items, rated_vals)}


***Similarity Neighbors (CF + Content)***

In [6]:
K_NEIGH = 30 if not USE_SMALL else 25

# ---- CF neighbor model (item-item) ----
item_knn = NearestNeighbors(
    metric="cosine", algorithm="brute",
    n_neighbors=min(K_NEIGH+1, nI)
).fit(Rmat.T)

cf_d, cf_i = item_knn.kneighbors(Rmat.T)
cf_s = 1.0 - cf_d
cf_i, cf_s = cf_i[:, 1:], cf_s[:, 1:]    # remove self-neighbor

# ---- Content-based neighbors ----
tfidf_vec = TfidfVectorizer(
    max_features=40000 if not USE_SMALL else 20000,
    min_df=3, ngram_range=(1,2)
)
content_matrix = tfidf_vec.fit_transform(anime_info["meta_text"])

cb_knn = NearestNeighbors(
    metric="cosine", algorithm="brute",
    n_neighbors=min(K_NEIGH+1, nI)
).fit(content_matrix)

cb_d, cb_i = cb_knn.kneighbors(content_matrix)
cb_s = 1.0 - cb_d
cb_i, cb_s = cb_i[:, 1:], cb_s[:, 1:]


***Prediction Functions***

In [7]:
def weighted_neighbor_mean(
    uid: int, iid: int,
    neigh_idx: np.ndarray,
    neigh_sim: np.ndarray
) -> float:
    """Predict rating using weighted average on similar items."""
    total, weight = 0.0, 0.0
    rated = user_r[uid]

    for nb, s in zip(neigh_idx[iid], neigh_sim[iid]):
        r = rated.get(int(nb))
        if r is None:
            continue
        total += s * r
        weight += abs(s)

    if weight > 0:
        return total / weight

    return 0.5 * u_mean[uid] + 0.5 * i_mean[iid]


def hybrid_predict(uid: int, iid: int, alpha: float) -> float:
    p_cf = weighted_neighbor_mean(uid, iid, cf_i, cf_s)
    p_cb = weighted_neighbor_mean(uid, iid, cb_i, cb_s)
    return alpha * p_cf + (1 - alpha) * p_cb


def predict(uid, iid, alpha):
    u = u_index.get(uid)
    i = i_index.get(iid)

    if u is None and i is None:
        return gmean
    if u is None:
        return i_mean[i]
    if i is None:
        return u_mean[u]

    p = hybrid_predict(u, i, alpha)
    return float(np.clip(p, rmin, rmax))


***Tune α on Validation Set***

In [8]:
# Reliable RMSE
def RMSE(true, pred):
    try:
        return mean_squared_error(true, pred, squared=False)
    except:
        return np.sqrt(mean_squared_error(true, pred))

# 90/10 split
train_small, valid = train_test_split(train_raw, test_size=0.10, random_state=13)

# Downsample for speed
valid = valid.sample(n=min(50000, len(valid)), random_state=13).reset_index(drop=True)

def evaluate(df, alpha):
    true = df[RATE].astype(float).values
    pred = [predict(u, a, alpha) for u, a in df[[USER, ITEM]].itertuples(index=False)]

    return RMSE(true, pred), mean_absolute_error(true, pred)

alphas = np.linspace(0, 1, 11)
results = []

best_alpha = None
best_rmse = float("inf")

for a in alphas:
    rm, ma = evaluate(valid, a)
    results.append((a, rm, ma))

    if rm < best_rmse:
        best_rmse = rm
        best_alpha = a

print(f"Optimal α = {best_alpha:.2f} | RMSE={best_rmse:.4f}")
pd.DataFrame(results, columns=["alpha", "rmse", "mae"]).sort_values("rmse")


Optimal α = 0.70 | RMSE=1.1931


Unnamed: 0,alpha,rmse,mae
7,0.7,1.193146,0.87118
8,0.8,1.194525,0.871917
6,0.6,1.19882,0.874706
9,0.9,1.202931,0.877067
5,0.5,1.211448,0.882437
10,1.0,1.218221,0.886259
4,0.4,1.230815,0.894289
3,0.3,1.25661,0.909699
2,0.2,1.288446,0.928118
1,0.1,1.32589,0.948891


***Predict for Test Set***

In [9]:
predictions = []
CHUNK = 200_000

for s in range(0, len(test_raw), CHUNK):
    block = test_raw.iloc[s:s+CHUNK]
    block_pred = [
        predict(u, i, best_alpha)
        for u, i in block[[USER, ITEM]].itertuples(index=False)
    ]
    predictions.extend(block_pred)

submission = test_raw.copy()
submission[RATE] = predictions[:len(test_raw)]

OUT_CSV = os.path.join(BASE_DIR, "predictions_custom.csv")
submission.to_csv(OUT_CSV, index=False)

print("Saved:", OUT_CSV)


Saved: Data\predictions_custom.csv


***Save Model Artifact***

In [12]:
import numpy as np

def to_py(x):
    """
    Convert numpy types → Python native types so JSON can serialize them.
    """
    if isinstance(x, (np.integer, np.int64, np.int32)):
        return int(x)
    if isinstance(x, (np.floating, np.float32, np.float64)):
        return float(x)
    if isinstance(x, np.ndarray):
        return x.tolist()
    return x

def pack_csr_safe(csr):
    return {
        "data":   to_py(csr.data),
        "indices": to_py(csr.indices),
        "indptr":  to_py(csr.indptr),
        "shape":   [int(csr.shape[0]), int(csr.shape[1])]
    }

artifact = {
    "global_mean": float(gmean),
    "rating_min":  float(rmin),
    "rating_max":  float(rmax),
    "best_alpha":  float(best_alpha),
    "user_ids":    [int(x) for x in u_index.keys()],
    "item_ids":    [int(x) for x in i_index.keys()],
    "user_mean":   [float(v) for v in u_mean.tolist()],
    "item_mean":   [float(v) for v in i_mean.tolist()],
    "R_matrix":    pack_csr_safe(Rmat),
}

ART_PATH = os.path.join(BASE_DIR, "hybrid_recommender_custom.json")

with open(ART_PATH, "w", encoding="utf-8") as f:
    json.dump(artifact, f)

print("Artifact saved →", ART_PATH)



Artifact saved → Data\hybrid_recommender_custom.json


***Streamlit***

In [18]:
!pip install streamlit


Collecting streamlit
  Downloading streamlit-1.51.0-py3-none-any.whl.metadata (9.5 kB)
Collecting altair!=5.4.0,!=5.4.1,<6,>=4.0 (from streamlit)
  Downloading altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting blinker<2,>=1.5.0 (from streamlit)
  Downloading blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting cachetools<7,>=4.0 (from streamlit)
  Downloading cachetools-6.2.2-py3-none-any.whl.metadata (5.6 kB)
Collecting pyarrow<22,>=7.0 (from streamlit)
  Downloading pyarrow-21.0.0-cp313-cp313-win_amd64.whl.metadata (3.4 kB)
Collecting tenacity<10,>=8.1.0 (from streamlit)
  Downloading tenacity-9.1.2-py3-none-any.whl.metadata (1.2 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-win_amd64.whl.metadata (44 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading gitpython-3.1.45-py3-none-any.whl.metadata (1

In [17]:
# app.py - Option A: simple user+anime prediction UI
import os
import json
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

import streamlit as st

# -----------------------
# CONFIG - adjust paths
# -----------------------
DATA_DIR = "Data"
ANIME_PATH = os.path.join(DATA_DIR, "anime.csv")
TRAIN_PATH = os.path.join(DATA_DIR, "train.csv")
ARTIFACT_PATH = os.path.join(DATA_DIR, "hybrid_recommender_custom.json")

# neighbor K (should match what you used previously)
K_NEIGH = 30

# -----------------------
# UTIL: safe json loader (handles numeric lists)
# -----------------------
def load_json_safe(path: str):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

# -----------------------
# REBUILD minimal model objects
# -----------------------
@st.cache_data(show_spinner=False)
def build_models(anime_csv: str, train_csv: str, artifact_json: str):
    anime_df = pd.read_csv(anime_csv)
    train_df = pd.read_csv(train_csv)
    art = load_json_safe(artifact_json)

    # basics from artifact (fallbacks used defensively)
    global_mean = float(art.get("global_mean", np.mean(art.get("user_mean", [7.5]))))
    rating_min = float(art.get("rating_min", 1.0))
    rating_max = float(art.get("rating_max", 10.0))
    best_alpha = float(art.get("best_alpha", 0.7))

    user_ids = [int(x) for x in art["user_ids"]]
    item_ids = [int(x) for x in art["item_ids"]]

    user_to_index = {uid: i for i, uid in enumerate(user_ids)}
    item_to_index = {iid: i for i, iid in enumerate(item_ids)}
    index_to_item = {i: iid for iid, i in item_to_index.items()}

    n_users, n_items = len(user_ids), len(item_ids)

    # rebuild R matrix from artifact if present
    if "R_matrix" in art and art["R_matrix"] is not None:
        pack = art["R_matrix"]
        data = np.array(pack["data"], dtype=float)
        indices = np.array(pack["indices"], dtype=int)
        indptr = np.array(pack["indptr"], dtype=int)
        shape = tuple(pack["shape"])
        Rmat = sparse.csr_matrix((data, indices, indptr), shape=shape)
    else:
        # fallback reconstruct from train.csv
        tri = train_df[train_df["anime_id"].isin(item_ids)].copy()
        rows = tri["user_id"].map(user_to_index).dropna().astype(int).values
        cols = tri["anime_id"].map(item_to_index).dropna().astype(int).values
        vals = tri["rating"].astype(float).values
        Rmat = sparse.coo_matrix((vals, (rows, cols)), shape=(n_users, n_items)).tocsr()

    # per-user and per-item means
    def csr_row_means(csr: sparse.csr_matrix):
        arr = np.zeros(csr.shape[0], dtype=float)
        for r in range(csr.shape[0]):
            s, e = csr.indptr[r], csr.indptr[r+1]
            arr[r] = float(csr.data[s:e].mean()) if (e > s) else float(global_mean)
        return arr

    user_mean = csr_row_means(Rmat)
    item_mean = csr_row_means(Rmat.T.tocsr())

    # quick per-user rating dict
    user_rdict: List[Dict[int, float]] = [dict() for _ in range(n_users)]
    for u in range(n_users):
        s, e = Rmat.indptr[u], Rmat.indptr[u+1]
        idxs, vals = Rmat.indices[s:e], Rmat.data[s:e]
        user_rdict[u] = {int(i): float(v) for i, v in zip(idxs, vals)}

    # prepare content TF-IDF using anime metadata and the item_ids order
    meta_cols = [c for c in ["genre", "type", "name", "episodes", "synopsis"] if c in anime_df.columns]
    if len(meta_cols) > 0:
        anime_df["meta_text"] = anime_df[meta_cols].astype(str).agg(" ".join, axis=1)
    else:
        anime_df["meta_text"] = ""

    # build anime info frame in the order of item_ids
    info = pd.DataFrame({"anime_id": item_ids})
    info = info.merge(anime_df.rename(columns={"anime_id": "anime_id"}), on="anime_id", how="left")
    info["meta_text"] = info["meta_text"].fillna("")

    tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2), min_df=2)
    tfidf_item_matrix = tfidf.fit_transform(info["meta_text"].astype(str))

    # CF kNN (item-item) on R.T
    cf_knn = NearestNeighbors(metric="cosine", algorithm="brute",
                              n_neighbors=min(K_NEIGH + 1, n_items))
    cf_knn.fit(Rmat.T)
    cf_dists, cf_inds = cf_knn.kneighbors(Rmat.T, n_neighbors=min(K_NEIGH + 1, n_items), return_distance=True)
    cf_sims = 1.0 - cf_dists
    cf_inds = cf_inds[:, 1:]; cf_sims = cf_sims[:, 1:]

    # CB kNN on TF-IDF
    cb_knn = NearestNeighbors(metric="cosine", algorithm="brute",
                              n_neighbors=min(K_NEIGH + 1, n_items))
    cb_knn.fit(tfidf_item_matrix)
    cb_dists, cb_inds = cb_knn.kneighbors(tfidf_item_matrix, n_neighbors=min(K_NEIGH + 1, n_items), return_distance=True)
    cb_sims = 1.0 - cb_dists
    cb_inds = cb_inds[:, 1:]; cb_sims = cb_sims[:, 1:]

    # name mapping for dropdowns
    name_map = {}
    for _, row in anime_df.iterrows():
        if "anime_id" in row and not pd.isna(row["anime_id"]):
            try:
                aid = int(row["anime_id"])
            except:
                continue
            name_map[aid] = row.get("name", str(aid))

    return {
        "artifact": art,
        "Rmat": Rmat,
        "user_ids": user_ids,
        "item_ids": item_ids,
        "user_to_index": user_to_index,
        "item_to_index": item_to_index,
        "index_to_item": index_to_item,
        "user_mean": user_mean,
        "item_mean": item_mean,
        "user_rdict": user_rdict,
        "cf_inds": cf_inds,
        "cf_sims": cf_sims,
        "cb_inds": cb_inds,
        "cb_sims": cb_sims,
        "name_map": name_map,
        "global_mean": global_mean,
        "rating_min": rating_min,
        "rating_max": rating_max,
        "best_alpha": best_alpha
    }

# -----------------------
# Prediction helpers
# -----------------------
def weighted_neighbor_mean(uidx: int, iidx: int, neigh_idx: np.ndarray, neigh_sim: np.ndarray, user_rdict) -> float:
    numer = 0.0
    denom = 0.0
    rated = user_rdict[uidx]
    for nb, s in zip(neigh_idx[iidx], neigh_sim[iidx]):
        r = rated.get(int(nb))
        if r is not None:
            numer += s * r
            denom += abs(s)
    if denom > 0:
        return float(numer / denom)
    return None

def hybrid_components(uidx: int, iidx: int, alpha: float, cf_inds, cf_sims, cb_inds, cb_sims, user_mean, item_mean, user_rdict, global_mean):
    p_cf = weighted_neighbor_mean(uidx, iidx, cf_inds, cf_sims, user_rdict)
    p_cb = weighted_neighbor_mean(uidx, iidx, cb_inds, cb_sims, user_rdict)

    # Backoffs if no neighbors
    if p_cf is None and p_cb is None:
        fallback = 0.5 * user_mean[uidx] + 0.5 * item_mean[iidx]
        return fallback, None, None

    if p_cf is None:
        p_cf = 0.5 * user_mean[uidx] + 0.5 * item_mean[iidx]
    if p_cb is None:
        p_cb = 0.5 * user_mean[uidx] + 0.5 * item_mean[iidx]

    hybrid = float(alpha * p_cf + (1.0 - alpha) * p_cb)
    return hybrid, float(p_cf), float(p_cb)

# -----------------------
# Streamlit UI
# -----------------------
st.set_page_config(page_title="Simple Hybrid Predictor", layout="centered")
st.title("Hybrid Recommender — Single prediction (User + Anime)")

with st.spinner("Loading models..."):
    state = build_models(ANIME_PATH, TRAIN_PATH, ARTIFACT_PATH)

user_ids = state["user_ids"]
item_ids = state["item_ids"]
user_to_index = state["user_to_index"]
item_to_index = state["item_to_index"]
index_to_item = state["index_to_item"]
user_mean = state["user_mean"]
item_mean = state["item_mean"]
user_rdict = state["user_rdict"]
cf_inds, cf_sims = state["cf_inds"], state["cf_sims"]
cb_inds, cb_sims = state["cb_inds"], state["cb_sims"]
name_map = state["name_map"]
global_mean = state["global_mean"]
rating_min = state["rating_min"]
rating_max = state["rating_max"]
best_alpha = state["best_alpha"]

st.markdown("### Pick a user and an anime to predict the rating")

# user selection: show a small sample list but allow typing
sample_users = user_ids[:200]
sel_user = st.selectbox("Pick user (from known users)", options=sample_users)
typed_user = st.text_input("Or type user id", value=str(sel_user))
try:
    chosen_user = int(typed_user)
except:
    st.error("User id must be an integer")
    st.stop()

# anime selection: build readable dropdown "id — name" for first N, but allow typing
display_items = []
for aid in item_ids[:3000]:  # cap for dropdown performance
    name = name_map.get(aid, str(aid))
    display_items.append(f"{aid} — {name}")

selected_display = st.selectbox("Pick anime (from known items)", options=display_items)
# extract id from selected_display which begins with "<id> —"
selected_aid = int(selected_display.split(" — ", 1)[0])

typed_aid = st.text_input("Or type anime id", value=str(selected_aid))
try:
    chosen_aid = int(typed_aid)
except:
    st.error("Anime id must be an integer")
    st.stop()

# alpha slider (start with best_alpha from artifact but user may override)
alpha = st.slider("Hybrid weight α (higher = more CF)", min_value=0.0, max_value=1.0, value=float(best_alpha), step=0.05)

# Predict button
if st.button("Predict rating"):
    # determine indices
    uidx = user_to_index.get(chosen_user)
    iidx = item_to_index.get(chosen_aid)

    if uidx is None and iidx is None:
        st.warning("Unknown user and unknown anime — returning global mean")
        st.write(f"Predicted rating: **{global_mean:.3f}**")
    elif uidx is None:
        st.warning("Unknown user — returning item mean")
        # item mean index might still exist
        if iidx is not None:
            st.write(f"Item mean: **{item_mean[iidx]:.3f}**")
        else:
            st.write(f"Global mean: **{global_mean:.3f}**")
    elif iidx is None:
        st.warning("Unknown anime — returning user mean")
        st.write(f"User mean: **{user_mean[uidx]:.3f}**")
    else:
        # compute hybrid and components
        hybrid, p_cf, p_cb = hybrid_components(uidx, iidx, alpha,
                                               cf_inds, cf_sims, cb_inds, cb_sims,
                                               user_mean, item_mean, user_rdict, global_mean)
        # if components missing, show fallback explanation
        st.write("### Prediction result")
        st.write(f"Hybrid prediction: **{float(np.clip(hybrid, rating_min, rating_max)):.3f}** (clipped to [{rating_min}, {rating_max}])")
        if p_cf is None and p_cb is None:
            st.info("No neighbor ratings found for this user on similar items — used backoff to user/item means.")
            st.write(f"User mean: {user_mean[uidx]:.3f}  —  Item mean: {item_mean[iidx]:.3f}")
        else:
            st.write(f"CF-only prediction: {p_cf:.3f}" if p_cf is not None else "CF-only: (no neighbor info)")
            st.write(f"CB-only prediction: {p_cb:.3f}" if p_cb is not None else "CB-only: (no neighbor info)")
            st.write(f"Alpha used: {alpha:.2f}  → hybrid = α*CF + (1-α)*CB")

st.markdown("---")
st.caption("This app uses a precomputed artifact and reconstructs item-item (CF) and TF-IDF (CB) neighbors on load.")


ModuleNotFoundError: No module named 'streamlit'