In [1]:
from copy import deepcopy

import numpy as np
import pandas as pd
import scipy.sparse as sp
from rs_datasets import MovieLens

In [2]:
movielens = MovieLens('100k')
movielens.info()

ratings


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116



users


Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067



items


Unnamed: 0,item_id,title,release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,False,False,False,True,True,True,...,False,False,False,False,False,False,False,False,False,False
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,False,True,True,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False





### 0-based index

In [3]:
movielens.ratings.user_id.min(), movielens.ratings.item_id.min()

(np.int32(1), np.int32(1))

In [4]:
movielens.ratings['user_id'] -= 1
movielens.ratings['item_id'] -= 1

In [5]:
movielens.ratings.user_id.min(), movielens.ratings.item_id.min()

(np.int32(0), np.int32(0))

In [6]:
movielens.ratings.user_id.max(), movielens.ratings.user_id.nunique()

(np.int32(942), 943)

In [7]:
movielens.ratings.item_id.max(), movielens.ratings.item_id.nunique()

(np.int32(1681), 1682)

### Popularity
Берем топ 100 популярных (по количеству оценок)

In [8]:
top_100 = movielens.ratings.groupby('item_id')['rating'].size().sort_values(ascending=False)[:100].index.tolist()
top_100[:10]

[49, 257, 99, 180, 293, 285, 287, 0, 299, 120]

### User-Item matrix
Sparse матрица, где строки - users, столбцы - items, значения - ratings.
$$
R \in \mathbb{R}^{|U| \times |I|}
$$

In [9]:
n_users = movielens.ratings.user_id.max() + 1
n_items = movielens.ratings.item_id.max() + 1

In [10]:
rows = movielens.ratings['user_id'].values
cols = movielens.ratings['item_id'].values
vals = movielens.ratings['rating'].values

In [11]:
R = sp.csr_matrix((vals, (rows, cols)), shape=(n_users, n_items))

In [12]:
R.shape

(943, 1682)

### Interactions for top-100 

In [13]:
R[:, top_100]

<Compressed Sparse Row sparse matrix of dtype 'int32'
	with 29931 stored elements and shape (943, 100)>

### Cosine Similarity

In [14]:
def cos_sim_pair(R, i, j):
    num = R[:, i].multiply(R[:, j]).sum()

    norm_i = np.sqrt(R[:, i].power(2).sum())
    norm_j = np.sqrt(R[:, j].power(2).sum())

    if norm_i == 0 or norm_j == 0:
        return 0.0

    return num / (norm_i * norm_j)

In [15]:
def cosine_sim_matrix(R):
    numerator = R.T @ R

    item_norms = np.sqrt(R.power(2).sum(axis=0))
    item_norms = np.asarray(item_norms).ravel()
    item_norms[item_norms == 0] = 1e-9

    denominator = np.outer(item_norms, item_norms)

    cos_sim = numerator / denominator

    cos_sim = cos_sim.tocsr()
    cos_sim.setdiag(0.0)
    cos_sim.eliminate_zeros()

    return cos_sim

In [16]:
def cosine_sim_matrix_shrinkage(R, beta=100):
    cos_sim = cosine_sim_matrix(R)   # sparse

    bin_R = R.copy()
    bin_R.data = np.ones_like(bin_R.data)
    co_count = bin_R.T @ bin_R       # sparse

    shrink = co_count.copy()
    shrink.data = shrink.data / (shrink.data + beta)

    cos_sim = cos_sim.multiply(shrink)

    cos_sim.setdiag(0.0)
    cos_sim.eliminate_zeros()
    return cos_sim

### Item-item similarity for top-100

In [17]:
sim_top_100 = cosine_sim_matrix_shrinkage(R)[top_100][:, top_100]
sim_top_100

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9900 stored elements and shape (100, 100)>

In [18]:
sim_top_100_effective = cosine_sim_matrix_shrinkage(R[:, top_100])
sim_top_100_effective

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9900 stored elements and shape (100, 100)>

In [19]:
(sim_top_100_effective != sim_top_100).sum()

np.int64(0)

#### Проверка на diag(sim) = 0

In [20]:
np.allclose(sim_top_100.diagonal(), 0.0)

True

#### Проверка на симметричность sim

In [21]:
(sim_top_100 != sim_top_100.T).sum()

np.int64(0)

### Top-10 for each

In [22]:
def top_k(sim_matrix, top_n_items_ids, k=10):
    result = {}
    
    n = sim_matrix.shape[0]
    assert len(top_n_items_ids) == n
    
    for i in range(n):
        row = sim_matrix[i]
        
        indices = row.indices
        values = row.data
        
        if len(values) == 0:
            result[top_n_items_ids[i]] = []
            continue
        
        order = np.argsort(-values)[:k]

        movie_i = top_n_items_ids[i]
        topk = [
            (top_n_items_ids[j], values[idx])
            for idx, j in zip(order, indices[order])
        ]

        result[movie_i] = topk

    return result

In [23]:
top10_similar = top_k(sim_top_100, top_100, k=10)
all_sims = [
    sim
    for sims in top10_similar.values()
    for _, sim in sims
]

len(set(np.round(all_sims, 4))), len(all_sims)
np.percentile(all_sims, [0, 25, 50, 75, 90, 99])


array([0.20662257, 0.35383008, 0.40845362, 0.46089162, 0.50677191,
       0.58132587])

In [24]:
k = 0
for elem, vals in top10_similar.items():
    print('For', movielens.items.loc[movielens.items['item_id'] == elem, 'title'].iat[0])
    ids = [val[0] for val in vals]
    res = movielens.items.loc[
        movielens.items['item_id'].isin(ids),
        'title'
    ]
    print(', '.join(res))
    print()
    
    k += 1
    if k == 3:
        break

For I.Q. (1994)
Dances with Wolves (1990), Snow White and the Seven Dwarfs (1937), Striptease (1996), Spitfire Grill, The (1996), Delicatessen (1991), Princess Bride, The (1987), Apocalypse Now (1979), This Is Spinal Tap (1984), Breaking the Waves (1996)

For Men in Black (1997)
I.Q. (1994), Apocalypse Now (1979), Breaking the Waves (1996), Secrets & Lies (1996), Marvin's Room (1996), Donnie Brasco (1997), Hoodlum (1997), Midnight in the Garden of Good and Evil (1997), Cop Land (1997), Benny & Joon (1993)

For Snow White and the Seven Dwarfs (1937)
Shanghai Triad (Yao a yao yao dao waipo qiao) (1995), I.Q. (1994), Professional, The (1994), Dances with Wolves (1990), Striptease (1996), Spitfire Grill, The (1996), Apocalypse Now (1979), Citizen Ruth (1996), Sense and Sensibility (1995)



In [25]:
movielens.items.loc[movielens.items['item_id'] == elem, 'title'].iat[0]

'Snow White and the Seven Dwarfs (1937)'

In [26]:
from collections import Counter

cnt = Counter(
    j
    for sims in top10_similar.values()
    for j, _ in sims
)

cnt.most_common(10)


[(49, 68),
 (173, 60),
 (171, 49),
 (203, 48),
 (97, 47),
 (55, 36),
 (180, 34),
 (68, 30),
 (209, 29),
 (0, 27)]