In [41]:
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split as sklearn_train_test_split
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix

import pandas as pd
import numpy as np
import zipfile

In [42]:
import warnings
warnings.filterwarnings('ignore')

In [43]:
ratings = pd.read_csv('./data/ratings.csv')
movies = pd.read_csv('./data/movies.csv')

### Через функции

In [44]:
def ratings_matrix(ratings):    
    # создайте csr матрицу 
    return csr_matrix(pd.crosstab(ratings.userId, ratings.movieId, ratings.rating, aggfunc=sum).fillna(0).values)  

R = ratings_matrix(ratings)

In [45]:
def create_model(rating_matrix, metric):
    """
    - создание модели с базовыми параметрами
    """
    model = NearestNeighbors(metric=metric, n_neighbors=21, algorithm='brute')
    
    model.fit(rating_matrix)    
    return model

In [46]:
def nearest_neighbors(rating_matrix, model):
    """    
    :param rating_matrix : матрица рейтингов (nb_users, nb_items)
    :param model : модель knn  
    """    
    similarities, neighbors = model.kneighbors(rating_matrix)        
    return similarities[:, 1:], neighbors[:, 1:]

In [47]:
# метрику схожести успользуем Косинусную
model = create_model(rating_matrix=R, metric='cosine')
similarities, neighbors = nearest_neighbors(R, model)

In [74]:
users = pd.DataFrame({'users':ratings.userId.unique()})


In [137]:
def ap(row):
    return neighbors[row.name]

users['neighbors'] = users.apply(ap, axis=1)

def ap2(row):
    return similarities[row.name]

users['similarities'] = users.apply(ap2, axis=1)


In [168]:
d = users[users.users==34].neighbors.tolist()[0]
d

array([213, 118, 194, 101,  18, 231, 247, 117,  20, 471, 154, 341, 584,
       241, 462, 460, 593, 513,  29, 354], dtype=int64)

In [169]:
s = users[users.users==34].similarities.tolist()[0]
s

array([0.58376998, 0.60165327, 0.62283047, 0.64324162, 0.6524467 ,
       0.6582923 , 0.6628399 , 0.66614299, 0.66671276, 0.6711183 ,
       0.6763172 , 0.67839462, 0.68363842, 0.6863217 , 0.68712191,
       0.6892908 , 0.69446488, 0.69618003, 0.69632538, 0.69995988])

### Ручной способ

In [132]:
movie_users = {}
user_movies = {}
for index, row in ratings.iterrows():
    if row['movieId'] not in movie_users:
        movie_users[row['movieId']] = {}
        
    if row['userId'] not in user_movies:
        user_movies[row['userId']] = {}
        
    movie_users[row['movieId']][row['userId']] = row['rating']
    user_movies[row['userId']][row['movieId']] = row['rating']

In [135]:
# сделаем расчет по всем пользователям
# проверяем разницу на основе cos(A,B)
# Там где косунусное сходство = 1, там у пользователи пересекаются в одном фильме
similarities1 = {}

# все фильмы пользователя 1
for user1 in user_movies:
    if user1 not in similarities1:
        similarities1[user1] = {}
    
    # все фильмы пользователя 2
    for user2 in user_movies:
        if user2 in similarities1[user1] or user1 == user2:
            continue
        
        user1_ratings = []
        user2_ratings = []
        
        # только те фильме, которые оба пользователя смотрели
        for key in user_movies[user1]:
            watched_users = movie_users[key]
            if user1 in watched_users and user2 in watched_users:
                user1_ratings.append(watched_users[user1])
                user2_ratings.append(watched_users[user2])
                
        if len(user1_ratings) > 0:
            if user2 not in similarities1:
                similarities1[user2] = {}
            
            # делаем расчет расстояния
            sim = np.dot(user1_ratings, user2_ratings) / (np.linalg.norm(user1_ratings) * np.linalg.norm(user2_ratings))
            similarities1[user1][user2] = sim
            similarities1[user2][user1] = sim

In [173]:
d_s = sorted(similarities1[34].items(), key = lambda x:x[1])

In [174]:
d_s

[(581.0, 0.5588283785085597),
 (174.0, 0.7071067811865475),
 (609.0, 0.7632302129619272),
 (310.0, 0.7658162319438307),
 (578.0, 0.775764787380907),
 (315.0, 0.8134394345071608),
 (364.0, 0.8288497269823397),
 (256.0, 0.8320502943378437),
 (531.0, 0.8341956322956723),
 (133.0, 0.8510893423665223),
 (12.0, 0.8520563361656316),
 (599.0, 0.8522930885393416),
 (20.0, 0.8563681273218742),
 (381.0, 0.8639354603891503),
 (317.0, 0.8686190533818215),
 (269.0, 0.8691796863515645),
 (336.0, 0.8741467574762476),
 (410.0, 0.8758600359654909),
 (490.0, 0.8823529411764706),
 (477.0, 0.8846517369293828),
 (500.0, 0.8854750206725349),
 (629.0, 0.8929228114926943),
 (207.0, 0.8970381669827211),
 (348.0, 0.9023871677239282),
 (35.0, 0.9053290902707718),
 (346.0, 0.9054203007825372),
 (541.0, 0.9061882438003237),
 (167.0, 0.90990990990991),
 (330.0, 0.9102744761538445),
 (166.0, 0.9128709291752768),
 (586.0, 0.9130073672896897),
 (142.0, 0.91309524215838),
 (337.0, 0.9135274347827325),
 (6.0, 0.915951429

In [209]:
ratings[(ratings.userId==34)&(ratings.movieId.isin(films))]

Unnamed: 0,userId,movieId,rating,timestamp
6319,34,50,5.0,973748491
6328,34,296,5.0,973746450
6335,34,527,4.0,973746450
6345,34,608,4.0,973746527
6397,34,1259,4.0,973746961
6401,34,1270,3.0,973747017
6411,34,1358,5.0,973746613
6484,34,3418,3.0,973748196


In [210]:
ratings[(ratings.userId==354)&(ratings.movieId.isin(films))]

Unnamed: 0,userId,movieId,rating,timestamp
77826,539,50,4.0,956159510
77828,539,296,4.0,956858825
77831,539,527,4.0,951928689
77832,539,608,4.0,956159610
77835,539,1259,5.0,956858891
77836,539,1270,3.0,951928751
77839,539,1358,5.0,951928751
77848,539,3418,4.0,956858720


In [212]:
films = list(set(ratings[ratings.userId==354].movieId) & set(ratings[ratings.userId==34].movieId))
films

[288, 589, 527, 593, 595, 377, 442, 316, 253]

In [310]:
len(u_factors[34,:])

671

In [220]:
umean

userId
1      2.550000
2      3.486842
3      3.568627
4      4.348039
5      3.910000
         ...   
667    3.647059
668    3.750000
669    3.351351
670    3.806452
671    3.917391
Name: rating, Length: 671, dtype: float64

In [216]:
df = pd.crosstab(ratings.userId, ratings.movieId, ratings.rating, aggfunc=sum)
df = df.fillna(df.mean(axis=0))

In [214]:
umean = ratings.groupby(by='userId')['rating'].mean()

In [217]:
df

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.87247,3.401869,3.161017,2.384615,3.267857,3.884615,3.283019,3.8,3.15,3.45082,...,2.5,0.5,3.0,1.0,1.5,5.0,4.5,5.0,3.0,5.0
2,3.87247,3.401869,3.161017,2.384615,3.267857,3.884615,3.283019,3.8,3.15,4.00000,...,2.5,0.5,3.0,1.0,1.5,5.0,4.5,5.0,3.0,5.0
3,3.87247,3.401869,3.161017,2.384615,3.267857,3.884615,3.283019,3.8,3.15,3.45082,...,2.5,0.5,3.0,1.0,1.5,5.0,4.5,5.0,3.0,5.0
4,3.87247,3.401869,3.161017,2.384615,3.267857,3.884615,3.283019,3.8,3.15,4.00000,...,2.5,0.5,3.0,1.0,1.5,5.0,4.5,5.0,3.0,5.0
5,3.87247,3.401869,4.000000,2.384615,3.267857,3.884615,3.283019,3.8,3.15,3.45082,...,2.5,0.5,3.0,1.0,1.5,5.0,4.5,5.0,3.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,3.87247,3.401869,3.161017,2.384615,3.267857,4.000000,3.283019,3.8,3.15,3.45082,...,2.5,0.5,3.0,1.0,1.5,5.0,4.5,5.0,3.0,5.0
668,3.87247,3.401869,3.161017,2.384615,3.267857,3.884615,3.283019,3.8,3.15,3.45082,...,2.5,0.5,3.0,1.0,1.5,5.0,4.5,5.0,3.0,5.0
669,3.87247,3.401869,3.161017,2.384615,3.267857,3.884615,3.283019,3.8,3.15,3.45082,...,2.5,0.5,3.0,1.0,1.5,5.0,4.5,5.0,3.0,5.0
670,4.00000,3.401869,3.161017,2.384615,3.267857,3.884615,3.283019,3.8,3.15,3.45082,...,2.5,0.5,3.0,1.0,1.5,5.0,4.5,5.0,3.0,5.0


In [228]:
df = df.subtract(umean, axis=0)

In [230]:
df

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.322470,0.851869,0.611017,-0.165385,0.717857,1.334615,0.733019,1.250000,0.600000,0.900820,...,-0.050000,-2.050000,0.450000,-1.550000,-1.050000,2.450000,1.950000,2.450000,0.450000,2.450000
2,0.385628,-0.084973,-0.325825,-1.102227,-0.218985,0.397773,-0.203823,0.313158,-0.336842,0.513158,...,-0.986842,-2.986842,-0.486842,-2.486842,-1.986842,1.513158,1.013158,1.513158,-0.486842,1.513158
3,0.303842,-0.166758,-0.407611,-1.184012,-0.300770,0.315988,-0.285609,0.231373,-0.418627,-0.117808,...,-1.068627,-3.068627,-0.568627,-2.568627,-2.068627,1.431373,0.931373,1.431373,-0.568627,1.431373
4,-0.475570,-0.946170,-1.187022,-1.963424,-1.080182,-0.463424,-1.065020,-0.548039,-1.198039,-0.348039,...,-1.848039,-3.848039,-1.348039,-3.348039,-2.848039,0.651961,0.151961,0.651961,-1.348039,0.651961
5,-0.037530,-0.508131,0.090000,-1.525385,-0.642143,-0.025385,-0.626981,-0.110000,-0.760000,-0.459180,...,-1.410000,-3.410000,-0.910000,-2.910000,-2.410000,1.090000,0.590000,1.090000,-0.910000,1.090000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.225411,-0.245190,-0.486042,-1.262443,-0.379202,0.352941,-0.364040,0.152941,-0.497059,-0.196239,...,-1.147059,-3.147059,-0.647059,-2.647059,-2.147059,1.352941,0.852941,1.352941,-0.647059,1.352941
668,0.122470,-0.348131,-0.588983,-1.365385,-0.482143,0.134615,-0.466981,0.050000,-0.600000,-0.299180,...,-1.250000,-3.250000,-0.750000,-2.750000,-2.250000,1.250000,0.750000,1.250000,-0.750000,1.250000
669,0.521118,0.050518,-0.190334,-0.966736,-0.083494,0.533264,-0.068332,0.448649,-0.201351,0.099468,...,-0.851351,-2.851351,-0.351351,-2.351351,-1.851351,1.648649,1.148649,1.648649,-0.351351,1.648649
670,0.193548,-0.404582,-0.645435,-1.421836,-0.538594,0.078164,-0.523433,-0.006452,-0.656452,-0.355632,...,-1.306452,-3.306452,-0.806452,-2.806452,-2.306452,1.193548,0.693548,1.193548,-0.806452,1.193548


In [283]:
P = np.array([])
S = np.array([])
Qh = np.array([])
u_factors = np.array([])
i_factors = np.array([])

In [284]:
R = df.to_numpy()
P, s, Qh = np.linalg.svd(R, full_matrices=False)
S = np.diag(s)

In [285]:
u_factors = np.dot(P, np.sqrt(S))
i_factors = np.dot(np.sqrt(S), Qh)

In [306]:
r_hat = umean[34]+np.dot(u_factors[34,:],i_factors[:,780])

In [307]:
r_hat

4.5358288770053665

In [308]:
umean

userId
1      2.550000
2      3.486842
3      3.568627
4      4.348039
5      3.910000
         ...   
667    3.647059
668    3.750000
669    3.351351
670    3.806452
671    3.917391
Name: rating, Length: 671, dtype: float64

In [299]:
predictions = np.dot(u_factors[34,:], i_factors) + umean[34]


In [278]:
top_idx = np.flip(np.argsort(predictions))

In [279]:
preds = predictions[top_idx]

In [292]:
umean = ratings.groupby(by='userId')['rating'].mean()

In [295]:
umean[34]

3.935828877005348