In [2]:
import tqdm
import json

import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

## User-based Collaborative Filtering

#### Основная идея: 
Рекомендовать пользователю треки, которые понравились похожим на него пользователям

$$\hat r_{ui} = h^{-1} \left( \frac{\sum_{v \in N_i(u)} w_{uv} h(r_{vi})}{\sum_{v \in N_i(u)} w_{uv}} \right)$$

$N_i(u)$ - соседи пользователя $u$, которые оценили айтем $i$,
$w_{uv}, w_{ij}$ - веса соседей, 
$h$ - функция нормализации



**Нормализация**: в качестве функции нормализации $h$ вычитаем из времени прослушивания среднее время прослушивания пользователя

**Веса**: Похожих пользователей будем искать по *cosine similarity*

**Отсутствующие данные**: заполним средним временем прослушивания по пользователю

**Соседи**: в качестве соседей будем рассматривать всех пользователей. Q: Как это упростит формулу?

In [6]:
# Listening log: one JSON record per line; only three of its columns are used below.
LISTENING_LOG_PATH = "/Users/matthewiskornev/Made/sem_2/RECSYS/lec_2/data.json"
USED_COLUMNS = ["user", "time", "track"]

data = pd.read_json(LISTENING_LOG_PATH, lines=True).loc[:, USED_COLUMNS].copy()

data.head()

Unnamed: 0,user,time,track
0,404,1.0,1084
1,404,1.0,1084
2,404,1.0,1084
3,404,1.0,1084
4,404,0.0,487


In [7]:
# Print a summary of the frame: column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 283834 entries, 0 to 283833
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   user    283834 non-null  int64  
 1   time    283834 non-null  float64
 2   track   283834 non-null  int64  
dtypes: float64(1), int64(2)
memory usage: 6.5 MB


In [9]:
def _subtract_user_mean(user_times):
    """Shift one user's listening times so that their mean becomes zero."""
    return user_times - user_times.mean()


# Normalize the listening time per user by subtracting that user's own mean.
times_by_user = data.groupby("user")["time"]
data["normalized_time"] = times_by_user.transform(_subtract_user_mean)

data.head()

Unnamed: 0,user,time,track,normalized_time
0,404,1.0,1084,0.83
1,404,1.0,1084,0.83
2,404,1.0,1084,0.83
3,404,1.0,1084,0.83
4,404,0.0,487,-0.17


In [10]:
# Build the user x track interaction matrix from the per-user centered times.
# pivot_table aggregates repeated (user, track) rows with its default aggregation
# (mean); pairs that never occur come out as NaN and are filled with 0, which on
# the centered scale corresponds to the user's own mean listening time.
interactions = pd.pivot_table(data, values="normalized_time", index="user", columns="track").fillna(0)

# Share of non-zero cells. This is the *density* of the matrix (sparsity would be
# 1 - density), so it is reported under that name.
density = (interactions != 0).values.sum() / interactions.size
print(f"Interactions matrix: shape={interactions.shape}, density={density}")

Interactions matrix: shape=(9443, 49397), sparsity=0.0005827303650766998


In [11]:
interactions.head()

track,0,1,2,3,4,5,6,7,8,9,...,49989,49990,49991,49992,49993,49994,49996,49997,49998,49999
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Cosine similarity between every pair of user rows of the interaction matrix.
similarity_matrix = cosine_similarity(interactions)
# Zero the diagonal so that a user never counts as their own neighbour.
np.fill_diagonal(similarity_matrix, 0)

positive_per_user = (similarity_matrix > 0).sum(axis=1)
print(f"Mean positive neighbours per user: {positive_per_user.mean()}")

Mean positive neighbours per user: 116.22789367785661


In [13]:
# Same statistic for neighbours whose similarity to the user is negative.
negative_per_user = (similarity_matrix < 0).sum(axis=1)
print(f"Mean negative neighbours per user: {negative_per_user.mean()}")

Mean negative neighbours per user: 62.6976596420629


In [14]:
# TODO: compute proper user-based scores
# TODO: expected size: observed users x observed tracks
# Current scores: for every (user, track) pair, the similarity-weighted sum of the
# other users' centered times for that track. The sum is not divided by the total
# weight and the user's mean is not added back.
scores_matrix = similarity_matrix @ interactions.values

scores = pd.DataFrame(
    data=scores_matrix,
    index=interactions.index,
    columns=interactions.columns,
)

# Peek at a handful of track columns.
preview_tracks = [1, 2, 3, 4, 5]
scores[preview_tracks].head()

track,1,2,3,4,5
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0


In [16]:
# Share of (user, track) cells that received a non-zero score.
nonzero_scores = (scores != 0).values.sum()
nonzero_scores / scores.size

0.12427701226210958

## Глянем на рекомендации

In [15]:
BOTIFY_DATA_DIR = "/Users/matthewiskornev/Made/sem_2/RECSYS/hw_1/recsys-itmo-spring-2023/botify/data/"

# Read the track catalogue (one JSON record per line) and index it by track id.
catalogue_path = BOTIFY_DATA_DIR + "tracks.json"
products = pd.read_json(catalogue_path, lines=True)
products = products.set_index("track")
products.head()

Unnamed: 0_level_0,artist,title
track,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Jack Johnson,The Cove
1,Billy Preston,Nothing from Nothing
2,Paco De Lucia,Entre Dos Aguas
3,Josh Rouse,Under Cold Blue Stars
4,The Dead 60s,Riot Radio (Soundtrack Version)


In [17]:
# NOTE(review): no random seed is set in the visible cells, so a re-run may pick a
# different user and the outputs below would change accordingly.
user = np.random.choice(scores.index) # sample a random user from the rows of the score matrix
k = 10  # how many top-scored tracks to show for that user

# Uncomment to inspect the raw listening rows of the sampled user:
# data[data["user"] == user]

In [19]:
# Take the score row of the sampled user, sort it in descending order and keep
# the first k tracks as a one-column frame.
top_scores = scores.loc[user].sort_values(ascending=False).head(k).to_frame("score")

# Join with the track catalogue on the track id to attach artist and title.
user_scores = pd.merge(
    left=top_scores,
    right=products,
    how="inner",
    left_index=True,
    right_index=True,
)

user_scores

Unnamed: 0_level_0,score,artist,title
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
19949,0.23531,Sugarland,Hello
16990,0.151046,Hootie And The Blowfish,Tuckers Town (LP Version)
5944,0.135424,Rainbow,Since You Been Gone
24447,0.132583,Snow Patrol,It's Beginning To Get To Me
12411,0.132583,Josh Turner,All Over Me
26241,0.132583,Snow Patrol,The Finish Line
21571,0.118533,U2,Bad
6900,0.118533,U2,I Still Haven't Found What I'm Looking For
16908,0.087521,Little River Band,Help Is On Its Way
1661,0.06857,Sade,Paradise


In [20]:
# Centered listening times of the same user, highest first, with artist and title
# attached from the track catalogue.
listened_times = interactions.loc[user].sort_values(ascending=False).to_frame("time")
user_interactions = pd.merge(
    left=listened_times,
    right=products,
    how="inner",
    left_index=True,
    right_index=True,
)

# Show only the tracks with a non-zero entry in the interaction matrix.
has_signal = user_interactions["time"] != 0
user_interactions[has_signal]

Unnamed: 0_level_0,time,artist,title
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
19949,0.715714,Sugarland,Hello
16990,0.715714,Hootie And The Blowfish,Tuckers Town (LP Version)
30262,0.645714,Days Of The New,The Down Town
42544,-0.254286,Natalie Imbruglia,Leave Me Alone
33172,-0.274286,Alice Cooper,It's Me
43224,-0.274286,Carleen Anderson,True Spirit (K-klassic Mix)
25369,-0.284286,P J Harvey,Oh My Lover
34859,-0.284286,Lostprophets,Where We Belong (Radio Edit)
616,-0.284286,Boys Noize,Oh!
33568,-0.284286,Slick Rick,Indian Girl (An Adult Story)


## Подготавливаем рекомендации для продакшена

In [24]:
def recommend(user_id, scores, k):
    """Return the labels of the ``k`` highest-scored tracks for one user.

    Args:
        user_id: Row label of the user in ``scores``.
        scores: DataFrame with one row per user and one column per track.
        k: Maximum number of track labels to return.

    Returns:
        List of column labels of ``scores``, ordered from the highest score
        to the lowest.
    """
    ranked_tracks = scores.loc[user_id].sort_values(ascending=False)
    return ranked_tracks.head(k).index.tolist()

# выдаем для каждого пользователя топ-100 треков по скору

# Кажется довольно странным, что для треков, по которым у пользователя уже есть оценка,
# мы перезаписываем эту информацию, просто заменяя оценку пользователя средней оценкой его соседей

In [23]:
users = data["user"].unique()
TOP_K = 100  # number of recommended tracks stored per user

# Write one JSON object per line: {"user": <id>, "tracks": [<track ids, best first>]}.
# The loop variable is deliberately not named `user`, so that the user sampled for
# the inspection cells is not overwritten by this export.
with open(BOTIFY_DATA_DIR + "recommendations_ub.json", "w") as rf:
    for user_id in tqdm.tqdm(users):
        recommendation = {
            "user": int(user_id),  # numpy integer -> plain int so that json can serialize it
            "tracks": recommend(user_id, scores, TOP_K)
        }
        rf.write(json.dumps(recommendation) + "\n")

100%|██████████████████████████████████████| 9443/9443 [00:17<00:00, 538.03it/s]
