<a href="https://colab.research.google.com/github/Marcin19721205/BDCaseStudy/blob/main/BDCSZad12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
import numpy as np


from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Load movies.dat and ratings.dat from sample_data/ ("::"-separated, no header row)
# and drop the timestamp column from the ratings.
read_opts = {"sep": r"::", "engine": "python", "header": None}

movies = pd.read_csv(
    "sample_data/movies.dat",
    names=["movieId", "title", "genres"],
    **read_opts,
)
ratings = pd.read_csv(
    "sample_data/ratings.dat",
    names=["userId", "movieId", "rating", "timestamp"],
    **read_opts,
).drop(columns=["timestamp"])

# Basic size summary of what was loaded.
print("n_movies:", movies["movieId"].nunique())
print("n_ratings:", len(ratings))
print("n_users:", ratings["userId"].nunique())
print("n_items:", ratings["movieId"].nunique())


n_movies: 38013
n_ratings: 921398
n_users: 71707
n_items: 38013


In [5]:
# Keep only the ratings made by the TOP_N_USERS users with the most ratings.

TOP_N_USERS = 3000

# value_counts() is sorted by count (descending), so the first N index labels
# are the N most active users.
top_users = ratings["userId"].value_counts().index[:TOP_N_USERS]
is_top_user = ratings["userId"].isin(top_users)
ratings_f = ratings.loc[is_top_user].copy()

print("TOP_N_USERS:", TOP_N_USERS)
print("after_filter_n_ratings:", len(ratings_f))
print("after_filter_n_users:", ratings_f["userId"].nunique())
print("after_filter_n_items:", ratings_f["movieId"].nunique())


TOP_N_USERS: 3000
after_filter_n_ratings: 486753
after_filter_n_users: 3000
after_filter_n_items: 32089


In [6]:
# User-item matrix (missing ratings stored as 0) plus optional per-user mean-centering.
# The "was rated" mask is taken from the pivot's NaNs *before* filling, so a genuine
# rating of 0 is not mistaken for a missing rating when computing user means and
# centered values.
# NOTE: code that tests `user_item != 0` still cannot tell a 0 rating from a missing one.
ratings_pivot = ratings_f.pivot_table(index="userId", columns="movieId", values="rating", aggfunc="mean")
rated = ratings_pivot.notna()
user_item = ratings_pivot.fillna(0)
del ratings_pivot  # dense users x movies frame; no longer needed once mask and filled copy exist

MEAN_CENTER = True
if MEAN_CENTER:
    # Mean over rated cells only; unrated cells are forced back to exactly 0 after centering.
    user_means = user_item.where(rated).mean(axis=1)
    user_item_mc = user_item.sub(user_means, axis=0).where(rated, 0)
else:
    user_item_mc = user_item

n_nonzero = int((user_item != 0).sum().sum())
print("user_item.shape:", user_item.shape)
print("nonzero:", n_nonzero)
# Differs from "nonzero" only when some stored ratings are exactly 0.
print("n_rated:", int(rated.sum().sum()))
print("density:", float(n_nonzero / (user_item.shape[0] * user_item.shape[1])))
print("MEAN_CENTER:", MEAN_CENTER)


user_item.shape: (3000, 32089)
nonzero: 486658
density: 0.0050552941298679715
MEAN_CENTER: True


In [8]:
# User x user cosine-similarity matrix computed on the mean-centered ratings.
X = csr_matrix(user_item_mc.values)  # sparse copy, rows = users, columns = movies
user_sim = cosine_similarity(X, dense_output=True)  # square array, one row/column per user

# Force self-similarity to 1, including for rows whose centered vector is all zeros.
np.fill_diagonal(user_sim, 1.0)

# Row/column position in user_sim -> userId label.
user_ids = user_item_mc.index.to_numpy()

sim_min, sim_max = float(user_sim.min()), float(user_sim.max())
print("user_sim.shape:", user_sim.shape)
print("user_sim_minmax:", sim_min, sim_max)
print("user_ids_sample:", user_ids[:10])


user_sim.shape: (3000, 3000)
user_sim_minmax: -0.2401086999651548 1.0
user_ids_sample: [ 39  54  56  66  69  95 111 116 117 123]


In [12]:
# Peek at the similarity row of the first user (NumPy truncates the repr).
user_sim[0, :]

array([ 1.        , -0.01193226,  0.07937793, ...,  0.00199511,
        0.012219  ,  0.        ])

In [14]:
# Pick a target user, take the K most similar users and score the movies
# the target user has not rated yet.

K = 30
rank = 1  # 1-based position in the "most ratings" ordering of ratings_f
user_id = int(ratings_f["userId"].value_counts().index[rank - 1])

# Similarity row of the target user; self is masked so it is never picked as a neighbour.
u_idx = int(np.where(user_ids == user_id)[0][0])
sim_u = user_sim[u_idx].copy()
sim_u[u_idx] = -np.inf

# Positions of the K largest similarities, most similar first.
neigh_idx = np.argsort(sim_u)[::-1][:K]
neigh_sim = user_sim[u_idx][neigh_idx]

R, Rmc = user_item.values, user_item_mc.values

# A stored 0 means "not rated" here, so those columns are the candidates.
cand_mask = R[u_idx] == 0
rated_mask = ~cand_mask

# Similarity-weighted sum of the neighbours' centered ratings,
# normalised by the total absolute similarity of all K neighbours.
num = neigh_sim @ Rmc[neigh_idx]
den = np.abs(neigh_sim).sum()

if den > 0:
    scores = np.where(cand_mask, num / den, -np.inf)
else:
    scores = np.full(R.shape[1], -np.inf, dtype=float)

movie_ids = user_item.columns.to_numpy()
rec = (
    pd.DataFrame({"movieId": movie_ids, "score": scores})
    .loc[lambda frame: np.isfinite(frame["score"])]
    .sort_values("score", ascending=False)
    .reset_index(drop=True)
)

print("user_id:", user_id, "| K:", K, "| n_neighbors:", len(neigh_idx), "| n_rated:", int(rated_mask.sum()))
print(rec.head(10))


user_id: 17405 | K: 30 | n_neighbors: 30 | n_rated: 2875
   movieId     score
0  2267998  1.415631
1  1454468  1.122354
2  3498820  0.862209
3   790636  0.844782
4  2024544  0.819537
5  3748528  0.801781
6  4154756  0.791788
7  1631867  0.720830
8  1170358  0.669678
9   111161  0.648162


In [15]:
# visualizar_recomendacion(user_id, N=5) for user-based CF: top-K neighbours, scores,
# and a printed TOP-N list with titles.

# movieId -> title lookup; when a movieId occurs more than once the first row is kept.
movie_titles = movies.drop_duplicates(subset="movieId").set_index("movieId").loc[:, "title"]

def visualizar_recomendacion(user_id: int, N: int = 5, K: int = 30):
    """Print the top-N user-based collaborative-filtering recommendations for one user.

    Reads the notebook-level objects ``user_item``, ``user_item_mc``, ``user_ids``,
    ``user_sim`` and ``movie_titles``.

    For every movie the user has not rated (stored as 0 in ``user_item``) the score is
    the similarity-weighted sum of the neighbours' values in ``user_item_mc`` divided by
    the sum of the absolute similarities of all selected neighbours.

    Parameters
    ----------
    user_id : int
        Label of the target user. If it is not in ``user_item.index`` a message is
        printed and nothing else happens.
    N : int, default 5
        Number of recommendations to print. Negative values are treated as 0.
    K : int, default 30
        Number of most-similar users to use as neighbours. Clamped to
        ``[0, n_users - 1]`` so the target user is never their own neighbour and
        ``K=0`` really means "no neighbours".

    Returns
    -------
    None
        The result is printed, not returned.
    """
    if user_id not in user_item.index:
        print(f"user_id={user_id} -> brak w macierzy user_item (po filtrze)")
        return

    u_idx = int(np.where(user_ids == user_id)[0][0])

    # Mask self with -inf so it sorts last in the descending order below.
    sim_u = user_sim[u_idx].copy()
    sim_u[u_idx] = -np.inf

    # `[-K:]` would return *every* user for K == 0 and would include the masked
    # self entry for K >= n_users; clamp K and slice from the front instead.
    k_eff = min(max(int(K), 0), sim_u.size - 1)
    neigh_idx = np.argsort(sim_u)[::-1][:k_eff]
    neigh_sim = user_sim[u_idx, neigh_idx]

    R = user_item.values
    Rmc = user_item_mc.values

    rated_mask = R[u_idx] != 0
    cand_mask = ~rated_mask

    num = neigh_sim @ Rmc[neigh_idx]
    den = float(np.sum(np.abs(neigh_sim)))

    # Already-rated movies keep -inf and are dropped by the isfinite filter below;
    # with no usable neighbours (den == 0) every movie is dropped.
    scores = np.full(R.shape[1], -np.inf, dtype=float)
    if den > 0:
        scores[cand_mask] = (num[cand_mask] / den)

    n_top = max(int(N), 0)  # head() with a negative count would mean "all but the last |N|"
    out = pd.DataFrame({"movieId": user_item.columns.to_numpy(), "score": scores})
    out = out[np.isfinite(out["score"])].sort_values("score", ascending=False).head(n_top).copy()
    out["title"] = out["movieId"].map(movie_titles)
    out = out[["movieId", "title", "score"]].reset_index(drop=True)

    print(f"user_id={user_id} | TOP-{N} | K={K}")
    print(out.to_string(index=False))

# Show the top-5 recommendations for the user selected earlier, using 30 neighbours.
visualizar_recomendacion(user_id=user_id, N=5, K=30)


user_id=17405 | TOP-5 | K=30
 movieId                             title    score
 2267998                  Gone Girl (2014) 1.415631
 1454468                    Gravity (2013) 1.122354
 3498820 Captain America: Civil War (2016) 0.862209
  790636         Dallas Buyers Club (2013) 0.844782
 2024544           12 Years a Slave (2013) 0.819537


In [17]:
# Sanity checks: matrix shape and density, how many ratings the selected user has,
# the 10 most similar users, and how many candidate movies remain for the user.
nz = int((user_item != 0).values.sum())
all_ = int(user_item.shape[0] * user_item.shape[1])
density = 0.0 if not all_ else nz / all_

print("user_item.shape:", user_item.shape)
print("nonzero:", nz, "| all:", all_, "| density:", float(density))

if user_id not in user_item.index:
    print(f"user_id={user_id} -> brak w macierzy user_item (po filtrze)")
else:
    u_idx = int(np.where(user_ids == user_id)[0][0])

    # A stored 0 counts as "not rated", so zeros are the remaining candidates.
    user_row = user_item.loc[user_id]
    n_user_ratings = int((user_row != 0).sum())
    n_candidates = int((user_row == 0).sum())
    print("user_id:", user_id, "| n_user_ratings:", n_user_ratings, "| ok>=5:", n_user_ratings >= 5)
    print("n_candidates_after_filter:", n_candidates)

    # Ten most similar users, most similar first; self is masked out with -inf.
    sim_u = user_sim[u_idx].copy()
    sim_u[u_idx] = -np.inf
    top10_idx = np.argsort(sim_u)[::-1][:10]
    top10_ids = user_ids[top10_idx]

    top10 = pd.DataFrame({
        "neighbor_userId": top10_ids,
        "sim": sim_u[top10_idx],
        "n_ratings_neighbor": (user_item.loc[top10_ids] != 0).sum(axis=1).values,
    })

    print("top10_similar_users:\n", top10.to_string(index=False))


user_item.shape: (3000, 32089)
nonzero: 486658 | all: 96267000 | density: 0.0050552941298679715
user_id: 17405 | n_user_ratings: 2875 | ok>=5: True
n_candidates_after_filter: 29214
top10_similar_users:
  neighbor_userId      sim  n_ratings_neighbor
            3664 0.194085                1274
           64954 0.168122                 527
           69530 0.159407                1770
           68019 0.157517                1187
           20543 0.151340                 834
           21084 0.149506                 271
            3130 0.146769                 199
           26962 0.139929                2070
           33307 0.139575                 370
           13978 0.139473                 598
