***Setup and imports***

In [1]:
# Core Python & scientific stack
import os
import json
import math
import numpy as np
import pandas as pd

from typing import Dict, List

# Sparse + ML utilities
from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error, mean_absolute_error



***Configuration & Data Loading***

In [2]:
# Paths
BASE_DIR = "Data"

PATH_ANIME = os.path.join(BASE_DIR, "anime.csv")
PATH_TRAIN = os.path.join(BASE_DIR, "train.csv")
PATH_TEST  = os.path.join(BASE_DIR, "test.csv")

USER, ITEM, RATE = "user_id", "anime_id", "rating"

# Speed mode toggle
USE_SMALL = False
USER_CAP  = 2000
ITEM_CAP  = 3000

# Load tables
anime_raw = pd.read_csv(PATH_ANIME)
train_raw = pd.read_csv(PATH_TRAIN)
test_raw  = pd.read_csv(PATH_TEST)

# Clean invalid ratings
train_raw = train_raw[pd.to_numeric(train_raw[RATE], errors="coerce").notnull()].copy()
train_raw[RATE] = train_raw[RATE].astype(float)
train_raw = train_raw[train_raw[RATE] >= 0].copy()

# Stats
rmin, rmax = float(train_raw[RATE].min()), float(train_raw[RATE].max())
gmean = float(train_raw[RATE].mean())

print(f"Train size: {train_raw.shape}, Test size: {test_raw.shape}")
print(f"Rating range: [{rmin}, {rmax}], Global mean={gmean:.3f}")


Train size: (5703555, 3), Test size: (633686, 2)
Rating range: [1.0, 10.0], Global mean=7.809


***Build Lightweight Content Text (Metadata)***

In [3]:
text_cols = [c for c in ["genre", "type", "name", "episodes"] if c in anime_raw.columns]

anime_raw["meta_text"] = (
    anime_raw[text_cols].astype(str).agg(" ".join, axis=1)
)

# Keep only item IDs present in training data
all_item_ids = sorted(train_raw[ITEM].unique())
anime_info = pd.DataFrame({ITEM: all_item_ids}).merge(
    anime_raw[[ITEM, "meta_text"]], on=ITEM, how="left"
).fillna({"meta_text": ""})


***Integer Mappings & Sparse Matrix***

In [4]:
# ID → index mappings
u_index = {u: i for i, u in enumerate(sorted(train_raw[USER].unique()))}
i_index = {a: j for j, a in enumerate(all_item_ids)}

nU, nI = len(u_index), len(i_index)
print(f"Users={nU}, Items={nI}")

# Sparse CSR rating matrix
r = train_raw
row = r[USER].map(u_index).values
col = r[ITEM].map(i_index).values
val = r[RATE].values

Rmat = sparse.coo_matrix((val, (row, col)), shape=(nU, nI)).tocsr()


Users=69481, Items=9838


***Per-User & Per-Item Means***

In [5]:
def csr_axis_mean(mat: sparse.csr_matrix, axis=1):
    # axis=1 → per-row; axis=0 → per-column
    if axis == 0:
        return csr_axis_mean(mat.T, axis=1)

    means = np.zeros(mat.shape[0], dtype=float)
    for r in range(mat.shape[0]):
        s, e = mat.indptr[r], mat.indptr[r+1]
        means[r] = mat.data[s:e].mean() if e > s else gmean
    return means

u_mean = csr_axis_mean(Rmat, axis=1)
i_mean = csr_axis_mean(Rmat, axis=0)

# Quick lookup dict of items per user
user_r: List[Dict[int, float]] = [dict() for _ in range(nU)]

for u in range(nU):
    s, e = Rmat.indptr[u], Rmat.indptr[u+1]
    rated_items = Rmat.indices[s:e]
    rated_vals  = Rmat.data[s:e]
    user_r[u]   = {int(i): float(r) for i, r in zip(rated_items, rated_vals)}


***Similarity Neighbors (CF + Content)***

In [6]:
K_NEIGH = 30 if not USE_SMALL else 25

# ---- CF neighbor model (item-item) ----
item_knn = NearestNeighbors(
    metric="cosine", algorithm="brute",
    n_neighbors=min(K_NEIGH+1, nI)
).fit(Rmat.T)

cf_d, cf_i = item_knn.kneighbors(Rmat.T)
cf_s = 1.0 - cf_d
cf_i, cf_s = cf_i[:, 1:], cf_s[:, 1:]    # remove self-neighbor

# ---- Content-based neighbors ----
tfidf_vec = TfidfVectorizer(
    max_features=40000 if not USE_SMALL else 20000,
    min_df=3, ngram_range=(1,2)
)
content_matrix = tfidf_vec.fit_transform(anime_info["meta_text"])

cb_knn = NearestNeighbors(
    metric="cosine", algorithm="brute",
    n_neighbors=min(K_NEIGH+1, nI)
).fit(content_matrix)

cb_d, cb_i = cb_knn.kneighbors(content_matrix)
cb_s = 1.0 - cb_d
cb_i, cb_s = cb_i[:, 1:], cb_s[:, 1:]


***Prediction Functions***

In [7]:
def weighted_neighbor_mean(
    uid: int, iid: int,
    neigh_idx: np.ndarray,
    neigh_sim: np.ndarray
) -> float:
    """Predict rating using weighted average on similar items."""
    total, weight = 0.0, 0.0
    rated = user_r[uid]

    for nb, s in zip(neigh_idx[iid], neigh_sim[iid]):
        r = rated.get(int(nb))
        if r is None:
            continue
        total += s * r
        weight += abs(s)

    if weight > 0:
        return total / weight

    return 0.5 * u_mean[uid] + 0.5 * i_mean[iid]


def hybrid_predict(uid: int, iid: int, alpha: float) -> float:
    p_cf = weighted_neighbor_mean(uid, iid, cf_i, cf_s)
    p_cb = weighted_neighbor_mean(uid, iid, cb_i, cb_s)
    return alpha * p_cf + (1 - alpha) * p_cb


def predict(uid, iid, alpha):
    u = u_index.get(uid)
    i = i_index.get(iid)

    if u is None and i is None:
        return gmean
    if u is None:
        return i_mean[i]
    if i is None:
        return u_mean[u]

    p = hybrid_predict(u, i, alpha)
    return float(np.clip(p, rmin, rmax))


***Tune α on Validation Set***

In [8]:
# Reliable RMSE
def RMSE(true, pred):
    try:
        return mean_squared_error(true, pred, squared=False)
    except:
        return np.sqrt(mean_squared_error(true, pred))

# 90/10 split
train_small, valid = train_test_split(train_raw, test_size=0.10, random_state=13)

# Downsample for speed
valid = valid.sample(n=min(50000, len(valid)), random_state=13).reset_index(drop=True)

def evaluate(df, alpha):
    true = df[RATE].astype(float).values
    pred = [predict(u, a, alpha) for u, a in df[[USER, ITEM]].itertuples(index=False)]

    return RMSE(true, pred), mean_absolute_error(true, pred)

alphas = np.linspace(0, 1, 11)
results = []

best_alpha = None
best_rmse = float("inf")

for a in alphas:
    rm, ma = evaluate(valid, a)
    results.append((a, rm, ma))

    if rm < best_rmse:
        best_rmse = rm
        best_alpha = a

print(f"Optimal α = {best_alpha:.2f} | RMSE={best_rmse:.4f}")
pd.DataFrame(results, columns=["alpha", "rmse", "mae"]).sort_values("rmse")


Optimal α = 0.70 | RMSE=1.1931


Unnamed: 0,alpha,rmse,mae
7,0.7,1.193146,0.87118
8,0.8,1.194525,0.871917
6,0.6,1.19882,0.874706
9,0.9,1.202931,0.877067
5,0.5,1.211448,0.882437
10,1.0,1.218221,0.886259
4,0.4,1.230815,0.894289
3,0.3,1.25661,0.909699
2,0.2,1.288446,0.928118
1,0.1,1.32589,0.948891


***Predict for Test Set***

In [9]:
predictions = []
CHUNK = 200_000

for s in range(0, len(test_raw), CHUNK):
    block = test_raw.iloc[s:s+CHUNK]
    block_pred = [
        predict(u, i, best_alpha)
        for u, i in block[[USER, ITEM]].itertuples(index=False)
    ]
    predictions.extend(block_pred)

submission = test_raw.copy()
submission[RATE] = predictions[:len(test_raw)]

OUT_CSV = os.path.join(BASE_DIR, "predictions_custom.csv")
submission.to_csv(OUT_CSV, index=False)

print("Saved:", OUT_CSV)


Saved: Data\predictions_custom.csv


***Save Model Artifact***

In [10]:
import numpy as np

def to_py(x):
    """
    Convert numpy types → Python native types so JSON can serialize them.
    """
    if isinstance(x, (np.integer, np.int64, np.int32)):
        return int(x)
    if isinstance(x, (np.floating, np.float32, np.float64)):
        return float(x)
    if isinstance(x, np.ndarray):
        return x.tolist()
    return x

def pack_csr_safe(csr):
    return {
        "data":   to_py(csr.data),
        "indices": to_py(csr.indices),
        "indptr":  to_py(csr.indptr),
        "shape":   [int(csr.shape[0]), int(csr.shape[1])]
    }

artifact = {
    "global_mean": float(gmean),
    "rating_min":  float(rmin),
    "rating_max":  float(rmax),
    "best_alpha":  float(best_alpha),
    "user_ids":    [int(x) for x in u_index.keys()],
    "item_ids":    [int(x) for x in i_index.keys()],
    "user_mean":   [float(v) for v in u_mean.tolist()],
    "item_mean":   [float(v) for v in i_mean.tolist()],
    "R_matrix":    pack_csr_safe(Rmat),
}

ART_PATH = os.path.join(BASE_DIR, "hybrid_recommender_custom.json")

with open(ART_PATH, "w", encoding="utf-8") as f:
    json.dump(artifact, f)

print("Artifact saved →", ART_PATH)



Artifact saved → Data\hybrid_recommender_custom.json


***Streamlit***

In [17]:
import numpy as np
import json

# Encoder for numpy types
def np_encoder(obj):
    if isinstance(obj, np.generic):
        return obj.item()  # Convert to Python native type
    raise TypeError(f"Object of type {type(obj)} is not JSON serializable")


In [18]:
ART_PATH = os.path.join(DATA_DIR, "hybrid_recommender_custom.json")
with open(ART_PATH, "w", encoding="utf-8") as f:
    json.dump(artifact, f, default=np_encoder)

In [19]:
import json
import numpy as np
import os

DATA_DIR = r"C:\Users\Admin\Documents\GitHub\Unsupervised_Learning_Project"
ART_PATH = os.path.join(DATA_DIR, "hybrid_recommender_custom.json")

artifact_demo = {
    "global_mean": 5.0,
    "rating_min": 1.0,
    "rating_max": 10.0,
    "best_alpha": 0.7,
    "user_ids": [1, 2, 3],
    "item_ids": [101, 102, 103],
    "user_mean": [5.0, 6.0, 7.0],
    "item_mean": [5.5, 6.5, 7.5],
    "R_csr": {
        "data": [5, 6, 7],
        "indices": [0, 1, 2],
        "indptr": [0, 1, 2, 3],
        "shape": [3, 3]
    }
}

def np_encoder(obj):
    if isinstance(obj, np.generic):
        return obj.item()
    raise TypeError

with open(ART_PATH, "w", encoding="utf-8") as f:
    json.dump(artifact_demo, f, default=np_encoder)

print("Demo artifact created at:", ART_PATH)


Demo artifact created at: C:\Users\Admin\Documents\GitHub\Unsupervised_Learning_Project\hybrid_recommender_custom.json


NameError: name 'art' is not defined