<a href="https://colab.research.google.com/github/Marcin19721205/BDCaseStudy/blob/main/BDCSZad13.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load movies.dat and ratings.dat from sample_data/ ("::"-separated, no header row).
# The timestamp column is dropped right after reading.
TOP_N_USERS = 3000  # keep only the N users with the most ratings; None disables the filter
TOP_N_ITEMS = None  # e.g. 3000 to keep the N most-rated movies; None disables the filter

READ_OPTS = dict(sep=r"::", engine="python", header=None)

movies = pd.read_csv(
    "sample_data/movies.dat",
    names=["movieId", "title", "genres"],
    **READ_OPTS,
)
ratings = pd.read_csv(
    "sample_data/ratings.dat",
    names=["userId", "movieId", "rating", "timestamp"],
    **READ_OPTS,
).drop(columns=["timestamp"])

# Size of the raw data before any filtering.
for label, count in (
    ("n_movies:", movies["movieId"].nunique()),
    ("n_ratings:", len(ratings)),
    ("n_users:", ratings["userId"].nunique()),
    ("n_items:", ratings["movieId"].nunique()),
):
    print(label, count)

# Optional down-sampling: users first, then items counted on the already-filtered ratings.
if TOP_N_USERS is not None:
    top_users = ratings["userId"].value_counts().index[:TOP_N_USERS]
    ratings = ratings.loc[ratings["userId"].isin(top_users)].copy()

if TOP_N_ITEMS is not None:
    top_items = ratings["movieId"].value_counts().index[:TOP_N_ITEMS]
    ratings = ratings.loc[ratings["movieId"].isin(top_items)].copy()

# Size of the data that the rest of the notebook works with.
for label, count in (
    ("after_filter_n_ratings:", len(ratings)),
    ("after_filter_n_users:", ratings["userId"].nunique()),
    ("after_filter_n_items:", ratings["movieId"].nunique()),
):
    print(label, count)


n_movies: 38013
n_ratings: 921398
n_users: 71707
n_items: 38013
after_filter_n_ratings: 486753
after_filter_n_users: 3000
after_filter_n_items: 32089


In [5]:
# Build the user x item rating matrix as a sparse CSR matrix, then report its
# dimensions, number of stored ratings and sparsity, plus a small random sample.
SAMPLE_SEED = 42  # fixed seed so the printed sample is the same on every run

# Converting COO -> CSR SUMS entries that share the same (row, col). A user who rated
# the same movie twice would therefore get an inflated rating, so keep a single rating
# per (userId, movieId) pair (the last occurrence in row order).
ratings_unique = ratings.drop_duplicates(subset=["userId", "movieId"], keep="last")

# factorize(sort=True) maps ids to dense 0-based codes; u_uni / i_uni map codes back to ids.
u_codes, u_uni = pd.factorize(ratings_unique["userId"], sort=True)
i_codes, i_uni = pd.factorize(ratings_unique["movieId"], sort=True)

X = coo_matrix(
    (ratings_unique["rating"].astype(float).values, (u_codes, i_codes)),
    shape=(len(u_uni), len(i_uni)),
).tocsr()

n_users, n_items = X.shape
nnz = X.nnz
n_cells = n_users * n_items
# Guard the division so an empty matrix reports NaN instead of raising ZeroDivisionError.
sparsity = 1.0 - (nnz / n_cells) if n_cells else float("nan")

print("user_item_dim:", X.shape)
print("nnz:", nnz)
print("sparsity:", float(sparsity))

# Show up to 10 randomly chosen stored ratings, mapped back to the original ids.
rows, cols = X.nonzero()
k = min(10, len(rows))
sample_rng = np.random.default_rng(SAMPLE_SEED)

if len(rows):
    idx = sample_rng.choice(len(rows), size=k, replace=False)
    sample = pd.DataFrame({
        "userId": u_uni[rows[idx]],
        "movieId": i_uni[cols[idx]],
        # np.asarray(...).ravel() flattens the fancy-indexing result to 1-D.
        "rating": np.asarray(X[rows[idx], cols[idx]]).ravel(),
    })
else:
    idx = np.array([], dtype=int)
    sample = pd.DataFrame({"userId": [], "movieId": [], "rating": []})

print("sample_ratings:\n", sample.to_string(index=False))


user_item_dim: (3000, 32089)
nnz: 486753
sparsity: 0.9949437190314439
sample_ratings:
  userId  movieId  rating
   9061  1311071     3.0
  16352  2361317     8.0
  20384   963966     8.0
  17812  1837709     5.0
  27269  3416532     7.0
  18052   114369    10.0
  66641  2277860     8.0
  33357  3521164     9.0
   1035    52311    10.0
   9105   472062     6.0


In [8]:
# Keep only users with at least MIN_RATINGS_PER_USER ratings, then randomly split each
# remaining user's ratings (about TEST_FRAC into test) into two sparse matrices,
# train_X and test_X, which both have the shape of the filtered matrix X_f.
MIN_RATINGS_PER_USER = 20
TEST_FRAC = 0.2
SEED = 42

rng = np.random.default_rng(SEED)

row_nnz = np.diff(X.indptr)  # number of stored ratings in every user row
keep_users = row_nnz >= MIN_RATINGS_PER_USER

X_f = X[keep_users]
u_uni_f = u_uni[keep_users]


def _stack_coo_parts(parts):
    """Flatten per-user (row, cols, vals) triples into COO (rows, cols, vals) arrays."""
    if not parts:
        return np.empty(0, dtype=int), np.empty(0, dtype=int), np.empty(0, dtype=float)
    rows = np.concatenate([np.full(len(c), r, dtype=int) for r, c, _ in parts])
    cols = np.concatenate([c for _, c, _ in parts])
    vals = np.concatenate([v for _, _, v in parts])
    return rows, cols, vals


train_parts, test_parts = [], []

for r in range(X_f.shape[0]):
    start, end = X_f.indptr[r], X_f.indptr[r + 1]
    user_cols = X_f.indices[start:end]
    user_vals = X_f.data[start:end]
    n_user = len(user_cols)
    if n_user == 0:
        continue
    # At least one test rating, but never the user's whole history: a user left with
    # no training ratings would have an all-zero training row.
    n_test = min(max(1, int(np.ceil(TEST_FRAC * n_user))), n_user - 1)
    test_pos = rng.choice(n_user, size=n_test, replace=False)
    test_mask = np.zeros(n_user, dtype=bool)
    test_mask[test_pos] = True

    train_parts.append((r, user_cols[~test_mask], user_vals[~test_mask]))
    test_parts.append((r, user_cols[test_mask], user_vals[test_mask]))

train_rows, train_cols, train_vals = _stack_coo_parts(train_parts)
test_rows, test_cols, test_vals = _stack_coo_parts(test_parts)

train_X = coo_matrix((train_vals, (train_rows, train_cols)), shape=X_f.shape).tocsr()
test_X = coo_matrix((test_vals, (test_rows, test_cols)), shape=X_f.shape).tocsr()

print("MIN_RATINGS_PER_USER:", MIN_RATINGS_PER_USER, "| TEST_FRAC:", TEST_FRAC)
print("before_users/items:", X.shape, "| nnz:", X.nnz)
print("after_users/items:", X_f.shape, "| nnz:", X_f.nnz)
print("train nnz:", train_X.nnz, "| test nnz:", test_X.nnz)
# The printed value is test / (train + test), i.e. the realised test fraction.
print("test fraction (test/total):", float(test_X.nnz / max(1, (train_X.nnz + test_X.nnz))))
print("sample_users_kept:", u_uni_f[:10])


MIN_RATINGS_PER_USER: 20 | TEST_FRAC: 0.2
before_users/items: (3000, 32089) | nnz: 486753
after_users/items: (3000, 32089) | nnz: 486753
train nnz: 388165 | test nnz: 98588
train/test ratio: 0.202542151768967
sample_users_kept: Index([39, 54, 56, 66, 69, 95, 111, 116, 117, 123], dtype='int64')


In [10]:
# UBCF "fit" in Python: mean-centre train_X and compute the user x user cosine
# similarity; NN (= 30) is the neighbourhood size.
METHOD, SIM, NN = "UBCF", "cosine", 30

def mean_center_csr_rows(X: csr_matrix) -> "tuple[csr_matrix, np.ndarray]":
    """Subtract each row's mean (over stored entries only) from its stored entries.

    Parameters
    ----------
    X : sparse matrix
        Converted to CSR with ``tocsr()``; the input itself is not modified.

    Returns
    -------
    Xc : csr_matrix
        Copy of ``X`` with float data where every stored entry has its row mean
        subtracted. The sparsity structure is unchanged, so an entry equal to
        its row mean stays stored as an explicit 0.0.
    row_means : np.ndarray
        1-D float array with one mean per row; rows without stored entries get 0.0.
    """
    X = X.tocsr()
    row_nnz = np.diff(X.indptr)  # stored entries per row
    row_sums = np.asarray(X.sum(axis=1), dtype=float).ravel()
    # where=... leaves the pre-filled 0.0 for empty rows instead of dividing by zero.
    row_means = np.divide(row_sums, row_nnz, out=np.zeros_like(row_sums), where=row_nnz != 0)

    Xc = X.copy()
    # Cast to float first: writing centred values into an integer data array would
    # silently truncate them. np.repeat expands each row mean to that row's entries.
    Xc.data = X.data.astype(float) - np.repeat(row_means, row_nnz)
    return Xc, row_means

# Fit step: centre every user's training ratings on that user's mean, then compute
# the dense user x user cosine similarity between the centred rows.
train_X_mc, train_user_means = mean_center_csr_rows(train_X)

user_sim = cosine_similarity(train_X_mc, dense_output=True)
# Overwrite the diagonal so every user's self-similarity is exactly 1.0.
np.fill_diagonal(user_sim, 1.0)

sim_min = float(user_sim.min())
sim_max = float(user_sim.max())

print("METHOD:", METHOD, "| SIM:", SIM, "| NN:", NN)
print("train_X.shape:", train_X.shape, "| nnz:", train_X.nnz)
print("user_sim.shape:", user_sim.shape)
print("user_sim_minmax:", sim_min, sim_max)


METHOD: UBCF | SIM: cosine | NN: 30
train_X.shape: (3000, 32089) | nnz: 388165
user_sim.shape: (3000, 3000)
user_sim_minmax: -0.28622697393252094 1.0


In [22]:
# TOP-5 prediction for one user: take the top-NN neighbours, score items with a
# similarity-weighted average, and map movieId -> title.

# Lookup Series movieId -> title; duplicated movieIds are collapsed to one row first.
movies_unique = movies.drop_duplicates(subset="movieId")
movie_titles = movies_unique.set_index("movieId")["title"]

def topN_for_user(u_idx: int, N: int = 5, NN: int = 30):
    """Return the top-N recommended item indices and their scores for one user.

    Reads the module-level ``user_sim`` (user x user similarity), ``train_X``
    (training ratings), ``train_X_mc`` (mean-centred training ratings) and
    ``train_user_means``.

    The score of an item is
    ``train_user_means[u] + sum_v(sim[u, v] * centred_rating[v, item]) / sum_v(|sim[u, v]|)``
    taken over the NN most similar other users. The denominator covers all selected
    neighbours, not only those who rated the item.

    Parameters
    ----------
    u_idx : int
        Row index of the user in ``train_X`` / ``user_sim``.
    N : int
        Maximum number of items to return.
    NN : int
        Number of nearest neighbours; capped at ``n_users - 1`` so that the user
        is never their own neighbour.

    Returns
    -------
    (top_idx, top_scores) : tuple of np.ndarray
        Column indices into ``train_X`` ordered by descending score, and the matching
        scores. Items the user has already rated are never returned, so fewer than N
        entries (possibly none) come back when there are not enough candidates or the
        neighbour similarities sum to zero.
    """
    n_users, n_items = train_X.shape
    no_result = (np.array([], dtype=int), np.array([], dtype=float))

    # Cap the neighbourhood: with NN >= n_users the slice below would pull the user
    # back in as a neighbour. k <= 0 / N <= 0 must be handled explicitly
    # because a[-0:] selects the whole array.
    k = min(NN, n_users - 1)
    if N <= 0 or k <= 0:
        return no_result

    sim_u = user_sim[u_idx].copy()
    sim_u[u_idx] = -np.inf  # push the user to the bottom of the ranking

    neigh_idx = np.argsort(sim_u)[-k:][::-1]
    neigh_sim = user_sim[u_idx, neigh_idx]

    den = float(np.sum(np.abs(neigh_sim)))
    if den <= 0:
        return no_result

    # Candidates are the items the user has no training rating for.
    rated_mask = train_X[u_idx].toarray().ravel() != 0
    cand_mask = ~rated_mask

    num = np.asarray(neigh_sim @ train_X_mc[neigh_idx].toarray()).ravel()

    scores = np.full(n_items, -np.inf, dtype=float)
    scores[cand_mask] = (num[cand_mask] / den) + float(train_user_means[u_idx])

    top_idx = np.argsort(scores)[-N:][::-1]
    # Drop -inf entries: they are already-rated items that only filled up the slice.
    top_idx = top_idx[np.isfinite(scores[top_idx])]
    return top_idx, scores[top_idx]
# Choose the user by row index in the training matrix, then build the result table.
u_idx = 0

top_idx, top_scores = topN_for_user(u_idx, N=5, NN=NN)

# Translate matrix column indices back to movieIds and look up the titles.
top_movieIds = i_uni[top_idx]
rec_user_id = int(u_uni_f[u_idx])
rec_titles = pd.Series(top_movieIds).map(movie_titles).values

out = pd.DataFrame(
    {
        "userId": [rec_user_id] * len(top_movieIds),
        "movieId": top_movieIds.astype(int),
        "title": rec_titles,
        "score": top_scores,
    }
)

print(out.to_string(index=False))


 userId  movieId               title    score
     39  1663202 The Revenant (2015) 8.476180
     39  6751668 Gisaengchung (2019) 8.404829
     39  2267998    Gone Girl (2014) 8.387053
     39   137523   Fight Club (1999) 8.383598
     39  7286456        Joker (2019) 8.374242


In [24]:
# Task 13 / step 6 summary prints: model + parameters, training-matrix dimensions and
# sparsity, how many ratings the selected user has in train/test, and the TOP-5 list
# (movieId, title).

n_users, n_items = train_X.shape
sparsity = 1.0 - train_X.nnz / (n_users * n_items)

print("MODEL:", METHOD, "| SIM:", SIM, "| NN:", NN)
print("dim_train:", train_X.shape, "| nnz:", train_X.nnz, "| sparsity:", float(sparsity))

# Per-user counts come from the stored entries of the user's row in each matrix.
uid = int(u_uni_f[u_idx])
n_train = int(train_X[u_idx, :].nnz)
n_test = int(test_X[u_idx, :].nnz)

print("userId:", uid, "| n_train_ratings:", n_train, "| n_test_ratings:", n_test)

print("TOP-5 recommendations:")
top5_view = out.loc[:, ["movieId", "title"]]
print(top5_view.to_string(index=False))


MODEL: UBCF | SIM: cosine | NN: 30
dim_train: (3000, 32089) | nnz: 388165 | sparsity: 0.9959678290587637
userId: 39 | n_train_ratings: 54 | n_test_ratings: 14
TOP-5 recommendations:
 movieId               title
 1663202 The Revenant (2015)
 6751668 Gisaengchung (2019)
 2267998    Gone Girl (2014)
  137523   Fight Club (1999)
 7286456        Joker (2019)
