<a href="https://colab.research.google.com/github/Marcin19721205/BDCaseStudy/blob/main/BDCSZad10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
import numpy as np


In [3]:
#wczytaj
movies_path = "sample_data/movies.dat"
ratings_path = "sample_data/ratings.dat"

movies = pd.read_csv(movies_path, sep=r"::", engine="python", header=None, names=["movieId", "title", "genres"])
ratings = pd.read_csv(ratings_path, sep=r"::", engine="python", header=None, names=["userId", "movieId", "rating", "timestamp"])

# check
print("movies.shape:", movies.shape)
print("ratings.shape:", ratings.shape)
print("movies.head():\n", movies.head())
print("ratings.head():\n", ratings.head())


movies.shape: (38018, 3)
ratings.shape: (921398, 4)
movies.head():
    movieId                                              title  \
0        8      Edison Kinetoscopic Record of a Sneeze (1894)   
1       10                La sortie des usines Lumière (1895)   
2       12                      The Arrival of a Train (1896)   
3       25  The Oxford and Cambridge University Boat Race ...   
4       91                         Le manoir du diable (1896)   

              genres  
0  Documentary|Short  
1  Documentary|Short  
2  Documentary|Short  
3                NaN  
4       Short|Horror  
ratings.head():
    userId  movieId  rating   timestamp
0       1   114508       8  1381006850
1       2   499549       9  1376753198
2       2  1305591       8  1376742507
3       2  1428538       1  1371307089
4       3    75314       1  1595468524


In [5]:
# rozbijam genres po '|' i buduję binarne cechy gatunków (0/1) dla każdego filmu.
g = movies["genres"].fillna("").astype(str).str.get_dummies(sep="|")
item_features = pd.concat([movies[["movieId", "title", "genres"]], g], axis=1)

print("n_unique_genres:", g.shape[1])
print("genres_sample:", list(g.columns[:15]))
print("item_features.shape:", item_features.shape)
print(item_features.head())


n_unique_genres: 28
genres_sample: ['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Game-Show', 'History', 'Horror']
item_features.shape: (38018, 31)
   movieId                                              title  \
0        8      Edison Kinetoscopic Record of a Sneeze (1894)   
1       10                La sortie des usines Lumière (1895)   
2       12                      The Arrival of a Train (1896)   
3       25  The Oxford and Cambridge University Boat Race ...   
4       91                         Le manoir du diable (1896)   

              genres  Action  Adult  Adventure  Animation  Biography  Comedy  \
0  Documentary|Short       0      0          0          0          0       0   
1  Documentary|Short       0      0          0          0          0       0   
2  Documentary|Short       0      0          0          0          0       0   
3                NaN       0      0          0     

In [22]:
# wybieram userId, biorę jego oceny, buduję profil jako ważoną średnią cech gatunków i normalizuję.
# user_id ręcznie, albo top-1/top-2/... po liczbie ocen. #

genre_cols = g.columns.tolist()

user_counts = ratings["userId"].value_counts()
rank = 1  # 1=top1, 2=top2, 3=top3... #
user_id = int(user_counts.index[rank - 1])  # wybór po rank #

# user_id = 12345  # <- alternatywny wybór po ID

u = ratings[ratings["userId"] == user_id].copy()
u = u.merge(item_features[["movieId"] + genre_cols], on="movieId", how="inner")

w = u["rating"].astype(float).values
w = w - w.mean()

X = u[genre_cols].astype(float).values
profile = (X * w[:, None]).mean(axis=0)

profile_norm = np.linalg.norm(profile)
profile = profile / profile_norm if profile_norm > 0 else profile

print("rank:", rank)
print("user_id:", user_id)
print("n_user_ratings:", len(u))
print("profile_len:", len(profile))
print("profile_norm_after:", float(np.linalg.norm(profile)))
print("top_genres_in_profile:", list(pd.Series(profile, index=genre_cols).sort_values(ascending=False).head(10).index))


rank: 1
user_id: 17405
n_user_ratings: 2877
profile_len: 28
profile_norm_after: 0.9999999999999999
top_genres_in_profile: ['Drama', 'Biography', 'History', 'Crime', 'Documentary', 'War', 'Music', 'Western', 'Animation', 'Adventure']


In [23]:
# liczę cosine score (film vs profil), wycinam filmy już ocenione przez usera i sortuję malejąco. #
rated_ids = set(ratings.loc[ratings["userId"] == user_id, "movieId"].unique())
cand = item_features[~item_features["movieId"].isin(rated_ids)].copy()

A = cand[genre_cols].astype(float).values
A_norm = np.linalg.norm(A, axis=1)
scores = (A @ profile) / np.where(A_norm > 0, A_norm, 1.0)

cand["score"] = scores
cand = cand.sort_values("score", ascending=False).reset_index(drop=True)

print("n_rated_movies:", len(rated_ids))
print("n_candidates:", len(cand))
print("score_stats:", cand["score"].describe())
print(cand[["movieId", "title", "genres", "score"]].head(10))


n_rated_movies: 2875
n_candidates: 35141
score_stats: count    35141.000000
mean         0.100612
std          0.344858
min         -0.569489
25%         -0.202167
50%          0.121024
75%          0.409556
max          0.679764
Name: score, dtype: float64
   movieId                                title           genres     score
0    93776            Prick Up Your Ears (1987)  Biography|Drama  0.679764
1  2378507              The Glass Castle (2017)  Biography|Drama  0.679764
2  7352942                      Yumorist (2019)  Biography|Drama  0.679764
3  1660399                          Nude (2010)  Biography|Drama  0.679764
4   432402                  Factory Girl (2006)  Biography|Drama  0.679764
5   183659                       Pollock (2000)  Biography|Drama  0.679764
6    64296       Erosu purasu gyakusatsu (1969)  Biography|Drama  0.679764
7  7339792  Unbroken: Path to Redemption (2018)  Biography|Drama  0.679764
8  1529292      Das Ende ist mein Anfang (2010)  Biography|Drama  0

In [24]:
# definiuję funkcję visualizar_recomendacion(user_id, N=5), która wypisuje TOP-N rekomendacji.

def visualizar_recomendacion(user_id: int, N: int = 5):
    rated_ids = set(ratings.loc[ratings["userId"] == user_id, "movieId"].unique())

    u = ratings[ratings["userId"] == user_id].merge(item_features[["movieId"] + genre_cols], on="movieId", how="inner")
    if len(u) == 0:
        print(f"user_id={user_id} -> brak ocen po joinie z movies")
        return

    w = u["rating"].astype(float).values
    w = w - w.mean()

    X = u[genre_cols].astype(float).values
    profile = (X * w[:, None]).mean(axis=0)

    pn = np.linalg.norm(profile)
    profile = profile / pn if pn > 0 else profile

    cand = item_features[~item_features["movieId"].isin(rated_ids)].copy()
    A = cand[genre_cols].astype(float).values
    A_norm = np.linalg.norm(A, axis=1)
    scores = (A @ profile) / np.where(A_norm > 0, A_norm, 1.0)

    out = cand[["movieId", "title", "genres"]].copy()
    out["score"] = scores
    out = out.sort_values("score", ascending=False).head(N).reset_index(drop=True)

    print(f"user_id={user_id} | TOP-{N}")
    print(out[["movieId", "title", "score", "genres"]].to_string(index=False))

visualizar_recomendacion(user_id, N=5)


user_id=17405 | TOP-5
 movieId                     title    score          genres
   93776 Prick Up Your Ears (1987) 0.679764 Biography|Drama
 2378507   The Glass Castle (2017) 0.679764 Biography|Drama
 7352942           Yumorist (2019) 0.679764 Biography|Drama
 1660399               Nude (2010) 0.679764 Biography|Drama
  432402       Factory Girl (2006) 0.679764 Biography|Drama


In [25]:
# walidacje: liczności, gatunki, shape cech oraz ile user ocenił i ile zostało kandydatów. #
rated_ids = set(ratings.loc[ratings["userId"] == user_id, "movieId"].unique())
n_candidates = int((~item_features["movieId"].isin(rated_ids)).sum())

print("n_movies:", len(movies))
print("n_ratings:", len(ratings))
print("n_users:", ratings["userId"].nunique())

print("n_unique_genres:", len(genre_cols))
print("genres_sample:", genre_cols[:15])

print("features_matrix_shape (n_movies x n_genres):", (len(movies), len(genre_cols)))

print("user_id:", user_id)
print("n_user_rated_movies:", len(rated_ids))
print("n_candidates_after_filter:", n_candidates)


n_movies: 38018
n_ratings: 921398
n_users: 71707
n_unique_genres: 28
genres_sample: ['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Game-Show', 'History', 'Horror']
features_matrix_shape (n_movies x n_genres): (38018, 28)
user_id: 17405
n_user_rated_movies: 2875
n_candidates_after_filter: 35141
