# Импортируем необходимые библиотеки

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Импортируем данные

In [None]:
# Load the movie table and the ratings table from their CSV files.
movies, ratings = (
    pd.read_csv(csv_path)
    for csv_path in (r'/content/movies.csv', r'/content/ratings.csv')
)

In [None]:
# Show the first rows of each table as a quick sanity check.
for frame in (movies, ratings):
    print(frame.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


# User-based Collaborative Filtering

### В User-Based collaborative filtering мы пытаемся найти пользователей с похожими интересами, основываясь на рейтингах, которые они поставили просмотренным фильмам. Схожесть между пользователями можно вычислять с помощью косинусного сходства или, например, корреляции Пирсона.

### Итоговый топ рекомендаций можно выстроить на основе предсказанных рейтингов, учитывая только те фильмы, которые пользователь еще не оценивал, взяв наиболее высокие рейтинги.

In [None]:
ratings.shape

(100836, 4)

In [None]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


### Разделим нашу выборку на обучающую и тестовую

In [None]:
# Hold out 30% of the rating rows for evaluation; the fixed seed keeps the
# split reproducible.
X_train, X_test = train_test_split(ratings, test_size=0.30, random_state=42)

for split in (X_train, X_test):
    print(split.shape)

(70585, 4)
(30251, 4)


### Сформируем матрицу User_Item

In [None]:
# User-item matrix from the training ratings: one row per userId, one column
# per movieId, 0 where the user has no training rating for the movie.
user_data = X_train.pivot(
    index='userId',
    columns='movieId',
    values='rating',
).fillna(0)
user_data.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,190221,191005,193565,193571,193573,193579,193581,193583,193585,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
user_data.shape

(610, 8566)

### Скопируем обучающий и тестовый датасеты для прогнозирования и оценки.

#### Dummy_train - будет использоваться для предсказания оценок фильмов, которые еще не были оценены пользователем. Чтобы игнорировать фильмы, оцененные пользователем, мы отметим их нулями во время прогнозирования, а неоцененные - единичками.

#### Dummy_test - будет использоваться для оценки. Будем делать предсказания только для тех фильмов, которые пользователь уже оценил, поэтому их отметим единичками.

In [None]:
# Binary masks laid out like the user-item matrices.

# Training mask: 0 for movies the user has rated, 1 for movies not yet rated
# (cells missing after the pivot are filled with 1).
train_flags = X_train.assign(rating=np.where(X_train['rating'] > 0, 0, 1))
dummy_train = train_flags.pivot(
    index='userId', columns='movieId', values='rating'
).fillna(1)

# Test mask: 1 for movies the user has rated, 0 for movies not yet rated
# (cells missing after the pivot are filled with 0).
test_flags = X_test.assign(rating=np.where(X_test['rating'] > 0, 1, 0))
dummy_test = test_flags.pivot(
    index='userId', columns='movieId', values='rating'
).fillna(0)

In [None]:
dummy_train.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,190221,191005,193565,193571,193573,193579,193581,193583,193585,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
dummy_test.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,187595,189043,189111,189333,189547,189713,190213,190219,193567,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Вычислим косинусное сходство между пользователями

In [None]:
# Pairwise cosine similarity between the user rows; any NaN entry is
# replaced with 0.
raw_similarity = cosine_similarity(user_data)
user_similarity = np.where(np.isnan(raw_similarity), 0, raw_similarity)
print(user_similarity, user_similarity.shape, sep='\n')

[[1.         0.01799262 0.02914591 ... 0.22586534 0.12184689 0.13241346]
 [0.01799262 1.         0.         ... 0.04299081 0.03472882 0.0625667 ]
 [0.02914591 0.         1.         ... 0.00378273 0.         0.02382229]
 ...
 [0.22586534 0.04299081 0.00378273 ... 1.         0.10982639 0.21962266]
 [0.12184689 0.03472882 0.         ... 0.10982639 1.         0.05416081]
 [0.13241346 0.0625667  0.02382229 ... 0.21962266 0.05416081 1.        ]]
(610, 610)


### Мы получили матрицу попарного косинусного сходства между пользователями. Т.е. user_similarity[i][j] - косинусное сходство между пользователем i и j.

$$ \text{cosine similarity}(A, B) = \frac{A \cdot B}{\|A\| \, \|B\|} $$

Интерпретация косинусного сходства векторов:
Чем ближе к 1, тем более похожие (ближе к параллельным) вектора. 0 - нет схожести (перпендикулярны). -1 - полностью противоположны.

### Вычислим матрицу предсказанных рейтингов пользователей для фильмов, которую впоследствии будем использовать для предсказания рекомендаций.

Умножим матрицу сходств пользователей на матрицу user-item.

user_predicted_ratings[i][j] содержит предсказанный рейтинг (оценку) пользователя i для фильма j, основанный на оценках других пользователей, похожих на него

In [None]:
# Similarity-weighted sum of ratings: entry [i, j] adds up every user's rating
# of movie j, weighted by that user's similarity to user i.
user_predicted_ratings = user_similarity.dot(user_data.to_numpy())
user_predicted_ratings[:5]

array([[8.52008912e+01, 3.99290227e+01, 2.08165173e+01, ...,
        2.81820351e-02, 2.81820351e-02, 1.57425084e-01],
       [2.45531356e+01, 1.05987273e+01, 2.94172315e+00, ...,
        8.74389309e-02, 8.74389309e-02, 4.49741734e-01],
       [4.22670774e+00, 2.11463396e+00, 9.59320674e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.08328071e+01, 2.37867860e+01, 9.79300114e+00, ...,
        5.08638986e-02, 5.08638986e-02, 1.06328078e-01],
       [6.07942623e+01, 3.68759800e+01, 1.23584138e+01, ...,
        0.00000000e+00, 0.00000000e+00, 1.03332342e-01]])

predicted_rating[i, j] = SUM ( user_similarity[i, k] × user_data[k, j] ) для всех k, где k пробегает по всем пользователям.

Другими словами, предсказанная оценка пользователя i для фильма j - это сумма оценок, поставленных этому фильму всеми остальными пользователями, взвешенная на сходство этих пользователей с пользователем i.

Чем больше пользователь k похож на пользователя i (больше значение user_similarity[i, k]), тем больший вес имеет оценка пользователя k при формировании предсказания для пользователя i.

In [None]:
user_predicted_ratings.shape

(610, 8566)

Осталось только настроить рекомендации так, чтобы выдавались фильмы с наиболее высокими предсказанными рейтингами и не рекомендовались уже оцененные ранее фильмы.

Чтобы не рекомендовались фильмы, оцененные пользователем ранее, умножим поэлементно полученную матрицу рейтингов на dummy_train, в которой все оцененные пользователем фильмы помечены 0.

In [None]:
# Element-wise mask: scores of movies the user already rated become 0, so
# they cannot rank among the recommendations.
user_final_ratings = user_predicted_ratings * dummy_train
user_final_ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,190221,191005,193565,193571,193573,193579,193581,193583,193585,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,39.929023,0.0,1.220732,10.483532,0.0,14.082187,2.605186,4.111435,47.981999,...,0.083324,0.036234,0.028182,0.032208,0.032208,0.028182,0.032208,0.028182,0.028182,0.157425
2,24.553136,10.598727,2.941723,0.085473,2.77306,10.458197,2.308815,0.523106,0.588578,11.698251,...,0.054182,0.112421,0.087439,0.09993,0.09993,0.087439,0.09993,0.087439,0.087439,0.449742
3,4.226708,2.114634,0.959321,0.066644,0.308448,2.85574,0.518991,0.105484,0.190006,2.550762,...,0.008707,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,50.832807,23.786786,9.793001,0.706291,6.609481,24.594138,10.881808,1.505819,2.196956,27.970658,...,0.00591,0.065396,0.050864,0.05813,0.05813,0.050864,0.05813,0.050864,0.050864,0.106328
5,60.794262,36.87598,12.358414,2.321889,13.215004,29.323176,14.149193,3.043103,2.351687,47.983608,...,0.108995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.103332


Рекомендации будем вычислять, взяв n самых высоких рейтингов для заданного пользователя.

In [None]:
def user_based_recomendation(user_index, n):
    """Print the titles of the ``n`` highest-scored movies for one user.

    Reads two module-level tables: ``user_final_ratings`` (predicted scores,
    one row per user and one column per movieId) and ``movies`` (used to look
    up the title for each movieId).

    Args:
        user_index: Zero-based row position in ``user_final_ratings``. It is
            passed to ``.iloc``, so it is a position, not a ``userId`` label.
        n: Number of movies to recommend; must be between 0 and 30.

    Returns:
        None. The recommendations (or a refusal message when ``n`` is out of
        range) are printed to stdout.
    """
    if n > 30:
        print("sorry, we can recommend you max 30 movies")
        return
    if n < 0:
        # A negative n would make the slice drop rows from the END of the
        # ranking, printing almost the whole catalogue as "recommendations".
        print("sorry, the number of movies to recommend cannot be negative")
        return

    # Rank this user's scores from best to worst and keep the first n rows
    # by position.
    ranked_scores = user_final_ratings.iloc[user_index].sort_values(ascending=False)
    user_rec_movie_indx = ranked_scores.iloc[:n].index

    # A left merge keeps the ranking order while attaching the movie titles.
    recommended_movies = pd.DataFrame({'movieId': user_rec_movie_indx})
    recommended_movies = pd.merge(recommended_movies, movies, on='movieId', how='left')

    print("Recommended Movies:")
    for title in recommended_movies['title']:
        print(title)

In [None]:
user_based_recomendation(54, 7)

Recommended Movies:
Pulp Fiction (1994)
Forrest Gump (1994)
Shawshank Redemption, The (1994)
Matrix, The (1999)
Fight Club (1999)
Star Wars: Episode IV - A New Hope (1977)
Godfather, The (1972)


# Оценка качества модели

Как нам оценить качество наших предсказаний?

Можем попробовать предсказать рейтинги фильмов, которые пользователи уже оценили и посчитать ошибки.

Собственно для этого мы и создавали матрицу dummy_test.

In [None]:
# User-item matrix built from the held-out ratings (0 = no rating); used to
# check how accurately ratings are predicted.
test_user_features = X_test.pivot(
    index='userId', columns='movieId', values='rating'
).fillna(0)

# Pairwise cosine similarity between users, with any NaN entry set to 0.
# NOTE(review): this similarity is computed from the held-out ratings, and the
# next cell multiplies it by those same ratings, so each prediction includes
# the very rating it is later compared against. Consider predicting from the
# training matrix instead.
raw_test_similarity = cosine_similarity(test_user_features)
test_user_similarity = np.where(np.isnan(raw_test_similarity), 0, raw_test_similarity)

print(test_user_similarity[:5])

[[1.         0.         0.07126637 ... 0.0749648  0.         0.02105064]
 [0.         1.         0.         ... 0.02631254 0.         0.04691426]
 [0.07126637 0.         1.         ... 0.         0.         0.        ]
 [0.05034755 0.         0.00662165 ... 0.03664347 0.         0.02849273]
 [0.         0.         0.         ... 0.04009331 0.         0.0181443 ]]


In [None]:
# Similarity-weighted sum of the test ratings: entry [i, j] is the raw
# (unscaled) score of movie j for user i.
user_predicted_ratings_test = test_user_similarity.dot(test_user_features.to_numpy())
user_predicted_ratings_test

array([[ 8.01521825,  3.22701218,  1.71422693, ...,  0.04154912,
         0.        ,  0.        ],
       [ 1.64920152,  0.91304857,  0.02113666, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.07587801,  0.07241296,  0.1867716 , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [17.86102484, 10.1363879 ,  4.48304633, ...,  0.0274908 ,
         0.        ,  0.        ],
       [ 3.10351661,  2.6934212 ,  1.20357903, ...,  0.        ,
         0.        ,  0.        ],
       [12.36110509,  5.79632466,  1.96280959, ...,  0.        ,
         0.20526264,  0.23947308]])

Теперь занулим предсказания тех фильмов, которые пользователи не оценивали, поэлементно умножив получившуюся матрицу рейтингов на dummy_test.

In [None]:
# Keep scores only where the user has a test rating; every other cell is
# multiplied by 0.
test_user_final_rating = user_predicted_ratings_test * dummy_test
test_user_final_rating.head(5)

movieId,1,2,3,4,5,6,7,8,9,10,...,187595,189043,189111,189333,189547,189713,190213,190219,193567,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,28.077679,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
ratings['rating'].describe()

Unnamed: 0,rating
count,100836.0
mean,3.501557
std,1.042529
min,0.5
25%,3.0
50%,3.5
75%,4.0
max,5.0


Нормализуем получившиеся рейтинги, чтобы сравнить их с изначальными (изначально они в диапазоне (0.5, 5.0))

Будем использовать min max scaler
$$ Pred = \frac{X - X_{min}}{X_{max} - X_{min}} \cdot (max - min) + min $$
max = 5.0
min = 0.5

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Keep only positive scores (movies the users actually rated); every other
# cell becomes NaN.
X = test_user_final_rating.where(test_user_final_rating > 0)

# Rescale the scores into the 0.5-5 rating range.
# NOTE(review): MinMaxScaler scales each column (movie) independently, not the
# matrix as a whole - confirm that per-movie scaling is what is intended.
scaler = MinMaxScaler(feature_range=(0.5, 5))
pred = scaler.fit_transform(X)

print(pred)

[[       nan        nan        nan ...        nan        nan        nan]
 [       nan        nan        nan ...        nan        nan        nan]
 [       nan        nan        nan ...        nan        nan        nan]
 ...
 [       nan 2.28631493        nan ...        nan        nan        nan]
 [       nan        nan        nan ...        nan        nan        nan]
 [       nan        nan        nan ...        nan        nan        nan]]


In [None]:
# True held-out ratings in user-item layout; cells without a rating stay NaN.
test = X_test.pivot(
    index='userId',
    columns='movieId',
    values='rating',
)
test.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,187595,189043,189111,189333,189547,189713,190213,190219,193567,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


#### Посчитаем RMSE и MAE

In [None]:
# RMSE score

# Number of cells that hold a (non-NaN) scaled prediction.
total_non_nan = pred.size - np.count_nonzero(np.isnan(pred))

# Squared error per cell; NaN cells are skipped because DataFrame.sum()
# ignores missing values by default.
diff_sqr_matrix = (test - pred).pow(2)
sum_of_squares_err = diff_sqr_matrix.sum().sum()

rmse = np.sqrt(sum_of_squares_err / total_non_nan)
print(rmse)

1.5642365382544885


In [None]:
# Mean absolute error

# Total absolute error over the cells with a prediction (NaN cells are
# skipped by sum()), divided by the number of predictions.
abs_err_total = (test - pred).abs().sum().sum()
mae = abs_err_total / total_non_nan
print(mae)

1.2120970142833813


## В среднем наша модель делает ошибку на 1.2 при прогнозировании рейтингов, что в целом неплохо. Далее я буду развивать модель.

Я вижу тут 3 основных проблемы:

1. Холодный старт.
2. Разреженность данных: слишком мало оценок, сходство между пользователями вычисляется неточно.
3. Смещение данных.

Также на низкую метрику может влиять вид нормализации (MinMaxScaler дает смещение).

Что я могу сделать для улучшения качества модели:

1. Предсказывать отсутствующие оценки
2. Создать доп признаки (например, признак жанра (предобработать))
3. Изменить нормализацию: сначала вычисление средних оценок для пользователей и айтемов, потом нормализация относительно средних значений
4. Использовать центрированное косинусное сходство
5. Изменить метрики на ndcg и precision
