In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from rs_datasets import MovieLens

In [2]:
# Load the "100k" MovieLens dataset through rs_datasets and show a preview of
# its tables (ratings / users / items, as seen in the cell output below).
# `movielens` is read by later cells (e.g. `movielens.ratings`).
movielens = MovieLens('100k')
movielens.info()

ratings


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116



users


Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067



items


Unnamed: 0,item_id,title,release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,False,False,False,True,True,True,...,False,False,False,False,False,False,False,False,False,False
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,False,True,True,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False





In [4]:
def build_user_item_matrix(
    interactions: pd.DataFrame,
    user_col="user_id",
    item_col="item_id"
):
    """
    Build a binary (implicit-feedback) sparse user-item matrix.

    Every row of ``interactions`` is treated as one positive signal; any other
    columns (rating, timestamp, ...) are ignored. A cell is 1.0 when the user
    has at least one interaction with the item, otherwise it is not stored.

    Parameters
    ----------
    interactions : pd.DataFrame
        Interaction log containing at least ``user_col`` and ``item_col``.
    user_col, item_col : str
        Names of the user-id and item-id columns.

    Returns
    -------
    mat : scipy.sparse.csr_matrix of float32, shape (n_users, n_items)
    u_index : original user ids; position ``p`` holds the id of matrix row ``p``
    i_index : original item ids; position ``p`` holds the id of matrix column ``p``
    """
    # factorize maps arbitrary ids to dense 0..n-1 codes in order of first appearance
    u_codes, u_index = pd.factorize(interactions[user_col])
    i_codes, i_index = pd.factorize(interactions[item_col])

    data = np.ones(len(interactions), dtype=np.float32)
    mat = csr_matrix(
        (data, (u_codes, i_codes)),
        shape=(len(u_index), len(i_index))
    )
    # csr_matrix sums entries that share the same (row, col), so a repeated
    # (user, item) pair would be stored as 2, 3, ...  Clamp the stored values
    # back to 1 so the matrix stays binary, as implicit feedback requires.
    mat.data[:] = 1.0
    return mat, u_index, i_index  # u_index / i_index: original ids by position

def _top_k_desc(values, k):
    """Return positions of the ``k`` largest entries of ``values``, best first."""
    n = len(values)
    k = max(0, min(int(k), n))
    if k == 0:
        return np.empty(0, dtype=np.intp)
    if k < n:
        # argpartition puts the k smallest of -values (= k largest of values)
        # into the first k slots, in arbitrary order
        top = np.argpartition(-values, kth=k - 1)[:k]
    else:
        top = np.arange(n)
    return top[np.argsort(-values[top])]


def recommend_user_user(
    interactions_train: pd.DataFrame,
    user_id,
    k_neighbors=5,
    k_recs=5,
    user_col="user_id",
    item_col="item_id"
):
    """
    User-based kNN recommendations on binary implicit feedback.

    The score of an item is the sum of cosine similarities between the target
    user and those of their ``k_neighbors`` nearest neighbours who interacted
    with the item. Items the target user already has in ``interactions_train``
    are never recommended, and neither are items that no neighbour supports
    (score 0), so fewer than ``k_recs`` items may be returned.

    Parameters
    ----------
    interactions_train : pd.DataFrame
        Training interactions with ``user_col`` and ``item_col`` columns.
    user_id
        Id of the target user, as it appears in ``user_col``.
    k_neighbors : int
        Maximum number of neighbours to use.
    k_recs : int
        Maximum number of items to recommend.
    user_col, item_col : str
        Column names.

    Returns
    -------
    (rec_item_ids, neigh_user_ids, neigh_sims) : tuple of three lists
        Recommended item ids (best first), ids of the neighbours used
        (most similar first) and their similarities. All three lists are
        empty when the user is unknown or has no usable neighbours.
    """
    mat, u_index, i_index = build_user_item_matrix(
        interactions_train,
        user_col,
        item_col
    )

    # user_id -> matrix row
    user_pos = pd.Index(u_index).get_indexer([user_id])[0]
    if user_pos == -1:
        # Unknown (cold-start) user. Return the same 3-tuple shape as the
        # normal path so callers can always unpack three values.
        return [], [], []

    # cosine similarity between the target user and every user
    sims = cosine_similarity(mat[user_pos], mat).ravel()
    sims[user_pos] = -np.inf  # exclude the user themself

    # top-K neighbours; drop the -inf self entry, which would otherwise be
    # selected when there are fewer other users than k_neighbors
    neigh_pos = _top_k_desc(sims, k_neighbors)
    neigh_pos = neigh_pos[np.isfinite(sims[neigh_pos])]
    if len(neigh_pos) == 0:
        return [], [], []
    neigh_sims = sims[neigh_pos].astype(np.float32)

    # item score = sum over neighbours v of sim(u, v) * 1[v interacted with item]
    # mat[neigh_pos] is (K x I); its transpose times the (K,) weights gives (I,)
    scores = mat[neigh_pos].T.dot(neigh_sims)
    scores = np.asarray(scores).ravel()

    # mask out everything the target user has already interacted with
    scores[mat[user_pos].indices] = -np.inf

    # top-N recommendations; keep only items with positive neighbour support so
    # that masked (seen) or unsupported items never pad the list
    rec_pos = _top_k_desc(scores, k_recs)
    rec_pos = rec_pos[scores[rec_pos] > 0]
    rec_item_ids = i_index[rec_pos].tolist()

    # also return the neighbours (handy for debugging)
    neigh_user_ids = u_index[neigh_pos].tolist()
    return rec_item_ids, neigh_user_ids, neigh_sims.tolist()

In [5]:
def split_leave_last_n(
    interactions: pd.DataFrame,
    n_last=5,
    user_col="user_id",
    time_col="timestamp"
):
    """
    Per-user temporal split: the last ``n_last`` interactions of every user
    (by ``time_col``) go to test, everything earlier goes to train.

    Both returned frames are sorted by ``(user_col, time_col)`` and keep the
    original index labels. Users with at most ``n_last`` interactions end up
    entirely in test and are therefore absent from train.

    Parameters
    ----------
    interactions : pd.DataFrame
        Interaction log containing ``user_col`` and ``time_col``.
    n_last : int
        Number of most recent interactions per user to hold out.
    user_col, time_col : str
        Column names.

    Returns
    -------
    (train, test) : tuple of pd.DataFrame
    """
    df = interactions.sort_values([user_col, time_col])
    # Select the held-out rows by *position*, not by index label: label-based
    # .loc / .drop would duplicate test rows and remove too many train rows
    # when the input index contains repeated labels (e.g. after pd.concat).
    tail_pos = df.reset_index(drop=True).groupby(user_col).tail(n_last).index
    is_test = pd.RangeIndex(len(df)).isin(tail_pos)
    test = df[is_test]
    train = df[~is_test]
    return train, test

def precision_recall_at_k(recs, gt_set, k):
    """
    Precision@k and Recall@k for a single user.

    Parameters
    ----------
    recs : sequence
        Recommended item ids, best first. Only the first ``k`` are considered.
    gt_set : collection
        Ground-truth (relevant) item ids.
    k : int
        Cut-off. For ``k <= 0`` both metrics are 0.0.

    Returns
    -------
    (precision, recall) : tuple of float
        ``precision = hits / k`` and ``recall = hits / |gt_set|`` (0.0 when the
        ground truth is empty), where ``hits`` is the number of *distinct*
        relevant items among the first ``k`` recommendations.
    """
    if k <= 0:
        return 0.0, 0.0
    relevant = set(gt_set)
    # Count distinct hits: a relevant item recommended twice must not be
    # counted twice, otherwise recall could exceed 1.0.
    hit = len(set(recs[:k]) & relevant)
    precision = hit / k
    recall = hit / max(1, len(relevant))
    return precision, recall

In [6]:
# Hold out each user's 5 most recent ratings as the test set.
# `train`, `test` and `users_to_check` are reused by the cells below.
train, test = split_leave_last_n(
    movielens.ratings,
    n_last=5,
    time_col="timestamp"
)

# Pick 2 "known" users: e.g. the most active ones (so the estimate is not just noise)
active_users = train["user_id"].value_counts().head(20).index.tolist()
users_to_check = active_users[:2]

for u in users_to_check:
    # ground truth = the items this user has in the held-out test set
    gt = set(test.loc[test["user_id"] == u, "item_id"])
    recs, neigh_users, neigh_sims = recommend_user_user(train, u, k_neighbors=5, k_recs=5)

    p5, r5 = precision_recall_at_k(recs, gt, k=5)
    print(f"user={u}  neighbors={list(zip(neigh_users, np.round(neigh_sims, 3)))}")
    print(f"recs={recs}")
    print(f"gt(last5)={list(gt)}")
    print(f"Precision@5={p5:.3f}  Recall@5={r5:.3f}\n")

user=405  neighbors=[(846, np.float64(0.63)), (94, np.float64(0.521)), (276, np.float64(0.514)), (311, np.float64(0.513)), (7, np.float64(0.511))]
recs=[496, 258, 474, 268, 484]
gt(last5)=[1265, 308, 1591, 1399, 351]
Precision@5=0.000  Recall@5=0.000

user=655  neighbors=[(537, np.float64(0.575)), (201, np.float64(0.554)), (293, np.float64(0.517)), (378, np.float64(0.51)), (450, np.float64(0.509))]
recs=[380, 482, 99, 180, 588]
gt(last5)=[131, 1645, 1650, 1268, 918]
Precision@5=0.000  Recall@5=0.000



In [7]:
def items_of_user(df, u, user_col="user_id", item_col="item_id"):
    """Return the set of distinct ``item_col`` values on the rows of ``df`` where ``user_col == u``."""
    belongs_to_user = df[user_col] == u
    user_items = df.loc[belongs_to_user, item_col]
    return set(user_items)

# Diagnostic: for each checked user, see how many of the held-out items were
# interacted with by at least one of the 5 nearest neighbours in train.
# Items outside that union get no neighbour support in the score sum.
for u in users_to_check:
    gt = set(test.loc[test["user_id"] == u, "item_id"])
    # only the neighbour ids are used here; recs / neigh_sims are not read below
    recs, neigh_users, neigh_sims = recommend_user_user(train, u, k_neighbors=5, k_recs=5)

    # union of all train items of the neighbours
    neigh_union = set()
    for v in neigh_users:
        neigh_union |= items_of_user(train, v)

    print("user", u)
    print("gt:", gt)
    print("gt ∩ (union neighbors watched):", gt & neigh_union)
    print("size:", len(gt & neigh_union))
    print()


user 405
gt: {1265, 308, 1591, 1399, 351}
gt ∩ (union neighbors watched): set()
size: 0

user 655
gt: {131, 1645, 1650, 1268, 918}
gt ∩ (union neighbors watched): {131, 1268}
size: 2



In [8]:
# Re-evaluate with a longer list: request 50 recommendations per user and
# report precision/recall at both cut-offs, 5 and 50, from the same list.
for u in users_to_check:
    gt = set(test.loc[test["user_id"] == u, "item_id"])
    recs, neigh_users, neigh_sims = recommend_user_user(train, u, k_neighbors=5, k_recs=50)

    # precision_recall_at_k truncates recs to the first k items itself
    p5, r5 = precision_recall_at_k(recs, gt, k=5)
    p50, r50 = precision_recall_at_k(recs, gt, k=50)
    print(f"user={u}  P@5={p5:.3f} R@5={r5:.3f} | P@50={p50:.3f} R@50={r50:.3f}")


user=405  P@5=0.000 R@5=0.000 | P@50=0.000 R@50=0.000
user=655  P@5=0.000 R@5=0.000 | P@50=0.000 R@50=0.000
