In [None]:
# Mount Google Drive into the Colab filesystem; the ratings CSV is later read from under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Library imports
import numpy as np
import pandas as pd

# Read the ratings table from the mounted drive and report its basic dimensions.
ratings_df = pd.read_csv('/content/drive/MyDrive/Датасеты/ml-latest-small/ratings.csv')

unique_user_total = ratings_df['userId'].unique().size
unique_movie_total = ratings_df['movieId'].unique().size
print('Unique users count: {}'.format(unique_user_total))
print('Unique movies count: {}'.format(unique_movie_total))
print('DataFrame shape: {}'.format(ratings_df.shape))

# Last expression of the cell: preview of the first rows.
ratings_df.head()

Unique users count: 610
Unique movies count: 9724
DataFrame shape: (100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
# Dimensions of the user x movie matrices built later: distinct users and distinct movies.
n_users = ratings_df['userId'].unique().size
n_movies = ratings_df['movieId'].unique().size
(n_users, n_movies)

(610, 9724)

In [None]:
# Re-index movie ids: replace every raw movieId with its 1-based position in
# `movie_ids` (order of first appearance), so ids become dense 1..n_movies.
movie_ids = ratings_df['movieId'].unique()

# Lookup table built once (raw movieId -> dense 1-based id). The previous version
# scanned the whole `movie_ids` array with np.where for every single rating row.
movie_id_to_scaled = {
    raw_id: position for position, raw_id in enumerate(movie_ids, start=1)
}


def scale_movie_id(movie_id):
    """Return the dense 1-based id assigned to a raw ``movie_id``.

    Args:
        movie_id: a raw movieId that occurs in ``movie_ids``.

    Returns:
        The 1-based position of ``movie_id`` inside ``movie_ids``.

    Raises:
        KeyError: if ``movie_id`` is not present in ``movie_ids``.
    """
    return movie_id_to_scaled[movie_id]


# NOTE: this overwrites the raw ids in place; `movie_ids[k - 1]` recovers the raw id
# of dense id k as long as this cell has been run exactly once on the raw table.
ratings_df['movieId'] = ratings_df['movieId'].map(movie_id_to_scaled)
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,2,4.0,964981247
2,1,3,4.0,964982224
3,1,4,5.0,964983815
4,1,5,5.0,964982931


In [None]:
# Split the ratings into training and test subsets (80% / 20%)
from sklearn.model_selection import train_test_split

# Fixed seed: without it every kernel restart produces a different split, so the
# RMSE values printed further down cannot be reproduced.
SPLIT_SEED = 42
train_data, test_data = train_test_split(
    ratings_df, test_size=0.2, random_state=SPLIT_SEED
)

print('Train shape: {}'.format(train_data.shape))
print('Test shape: {}'.format(test_data.shape))

Train shape: (80668, 4)
Test shape: (20168, 4)


In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(prediction, ground_truth):
    """Root-mean-square error over the cells that are non-zero in ``ground_truth``.

    Args:
        prediction: array of predicted ratings, same shape as ``ground_truth``.
            NaN predictions are treated as 0.
        ground_truth: array of actual ratings where 0 marks "no rating".

    Returns:
        The RMSE as a Python float, computed only on the cells selected by
        ``ground_truth.nonzero()``.
    """
    # Select the evaluated cells once and reuse the mask for both arrays.
    rated_cells = ground_truth.nonzero()
    y_pred = np.nan_to_num(prediction)[rated_cells].flatten()
    y_true = np.nan_to_num(ground_truth)[rated_cells].flatten()

    # mean_squared_error expects (y_true, y_pred) in that order.
    mse = mean_squared_error(y_true, y_pred)
    return sqrt(mse)

In [None]:
def build_rating_matrix(ratings, n_rows, n_cols):
    """Build a dense user x movie rating matrix from a ratings table.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating`` columns,
            where both ids are 1-based.
        n_rows: number of rows (users) of the resulting matrix.
        n_cols: number of columns (movies) of the resulting matrix.

    Returns:
        ``(n_rows, n_cols)`` float array holding ``rating`` at
        ``[userId - 1, movieId - 1]`` and 0 everywhere else.

    Raises:
        ValueError: if an id falls outside ``1..n_rows`` / ``1..n_cols``.
    """
    matrix = np.zeros((n_rows, n_cols))
    # Columns are addressed by name, so the result does not depend on column order.
    rows = ratings['userId'].to_numpy() - 1
    cols = ratings['movieId'].to_numpy() - 1

    if len(rows) and (
        rows.min() < 0 or rows.max() >= n_rows
        or cols.min() < 0 or cols.max() >= n_cols
    ):
        # A 0 id would otherwise wrap around to the last row/column silently.
        raise ValueError('userId/movieId must be dense 1-based ids within the matrix size')

    matrix[rows, cols] = ratings['rating'].to_numpy()
    return matrix


train_data_matrix = build_rating_matrix(train_data, n_users, n_movies)
test_data_matrix = build_rating_matrix(test_data, n_users, n_movies)

In [None]:
from  sklearn.metrics.pairwise import pairwise_distances

# Compute the pairwise cosine distance between users and between movies
# (over the rows and over the columns of the training matrix, respectively).
# NOTE: despite the "*_similarity" names these arrays hold DISTANCES returned by
# pairwise_distances(metric='cosine'): smaller values mean "more similar".
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [None]:
from scipy.spatial import distance

# Print the cosine distance for a few hand-picked vector pairs.
demo_vector_pairs = (
    ([2, 2], [1, 1]),
    ([3, 3], [2, 3]),
    ([3, 3], [1, 1.5]),
    ([3, 3], [1, 3]),
)
for first_vector, second_vector in demo_vector_pairs:
    print(distance.cosine(first_vector, second_vector))

0
0.01941932430907989
0.01941932430907989
0.10557280900008414


In [None]:
# Naive recommendations
def naive_predict(top):
    """Predict each user's ratings as the plain average of the nearest users' ratings.

    Reads the notebook globals ``n_users``, ``n_movies``, ``user_similarity``
    (pairwise cosine distances between users) and ``train_data_matrix``.

    Args:
        top: number of nearest users whose training ratings are averaged.

    Returns:
        ``(n_users, n_movies)`` array: for every user, the sum of the ``top``
        neighbours' training rows divided by ``top`` (unrated cells count as 0).
    """
    pred = np.zeros((n_users, n_movies))

    for i in range(n_users):
        # Ascending argsort puts the smallest distances first; position 0 is skipped
        # because that is where the user's own zero self-distance is expected.
        top_sim_users = user_similarity[i].argsort()[1:top + 1]

        # Only training ratings are used. The average is accumulated row by row
        # instead of first materialising an (n_users, top, n_movies) tensor.
        pred[i] = train_data_matrix[top_sim_users].sum(axis=0) / top

    return pred


def naive_predict_item(top):
    """Predict ratings of each movie as the plain average over its nearest movies.

    Reads the notebook globals ``n_users``, ``n_movies``, ``item_similarity``
    (pairwise cosine distances between movies) and ``train_data_matrix``.

    Args:
        top: number of nearest movies whose training ratings are averaged.

    Returns:
        ``(n_users, n_movies)`` array: column ``m`` is the sum of the training
        columns of the ``top`` movies nearest to ``m`` divided by ``top``.
    """
    ratings_by_movie = train_data_matrix.T
    pred = np.zeros((n_movies, n_users))

    for i in range(n_movies):
        # Smallest distances first; position 0 (the movie itself) is skipped.
        top_sim_movies = item_similarity[i].argsort()[1:top + 1]
        # Averaged directly; no (n_movies, top, n_users) intermediate tensor.
        pred[i] = ratings_by_movie[top_sim_movies].sum(axis=0) / top

    return pred.T

# Score both naive predictors (7 neighbours each) against the test rating matrix.
naive_pred = naive_predict(top=7)
naive_user_rmse = rmse(prediction=naive_pred, ground_truth=test_data_matrix)
print('User-based CF RMSE: ', naive_user_rmse)

naive_pred_item = naive_predict_item(top=7)
naive_item_rmse = rmse(prediction=naive_pred_item, ground_truth=test_data_matrix)
print('Item-based CF RMSE: ', naive_item_rmse)

User-based CF RMSE:  2.773901626187821
Item-based CF RMSE:  2.9410672376897695


In [None]:
# Recommendations as a similarity-weighted average of the nearest users' ratings
def k_fract_predict(top):
    """Predict ratings as a similarity-weighted average over the nearest users.

    Reads the notebook globals ``n_users``, ``n_movies``, ``user_similarity``
    (pairwise cosine DISTANCES between users) and ``train_data_matrix``.

    Args:
        top: number of nearest users taken into account for every user.

    Returns:
        ``(n_users, n_movies)`` array where row ``u`` equals
        ``sum_v w(u, v) * r_v / sum_v |w(u, v)|`` over the ``top`` nearest users
        ``v``, with ``w = 1 - cosine distance``. Rows whose weights sum to zero
        stay all-zero.
    """
    pred = np.zeros((n_users, n_movies))

    for i in range(n_users):
        distances = user_similarity[i]
        # Smallest distances first; position 0 (the user itself) is skipped.
        neighbours = distances.argsort()[1:top + 1]

        # Convert distance to similarity: weighting by the raw distance would give
        # the closest neighbours the smallest influence.
        weights = 1.0 - distances[neighbours]

        denominator = np.abs(weights).sum()
        if denominator == 0:
            # No neighbour shares any taste with this user: keep the zero row
            # instead of producing NaN through 0 / 0.
            continue

        pred[i] = weights.dot(train_data_matrix[neighbours]) / denominator

    return pred


def k_fract_predict_item(top):
    """Predict ratings as a similarity-weighted average over the nearest movies.

    Reads the notebook globals ``n_users``, ``n_movies``, ``item_similarity``
    (pairwise cosine DISTANCES between movies) and ``train_data_matrix``.

    Args:
        top: number of nearest movies taken into account for every movie.

    Returns:
        ``(n_users, n_movies)`` array where column ``m`` equals
        ``sum_k w(m, k) * r_k / sum_k |w(m, k)|`` over the ``top`` nearest movies
        ``k``, with ``w = 1 - cosine distance``. Columns whose weights sum to
        zero stay all-zero.
    """
    ratings_by_movie = train_data_matrix.T
    pred = np.zeros((n_movies, n_users))

    # Iterate over every movie: looping over range(n_users) here left all movies
    # beyond the first n_users without a prediction.
    for i in range(n_movies):
        distances = item_similarity[i]
        # Smallest distances first; position 0 (the movie itself) is skipped.
        neighbours = distances.argsort()[1:top + 1]

        # Distance -> similarity, so that closer movies weigh more.
        weights = 1.0 - distances[neighbours]

        denominator = np.abs(weights).sum()
        if denominator == 0:
            # Nothing similar to this movie: keep zeros rather than divide by zero.
            continue

        pred[i] = weights.dot(ratings_by_movie[neighbours]) / denominator

    return pred.T


# Score both weighted predictors (7 neighbours each) against the test rating matrix.
k_predict = k_fract_predict(top=7)
weighted_user_rmse = rmse(prediction=k_predict, ground_truth=test_data_matrix)
print('User-based CF RMSE: ', weighted_user_rmse)

k_predict_item = k_fract_predict_item(top=7)
weighted_item_rmse = rmse(prediction=k_predict_item, ground_truth=test_data_matrix)
print('Item-based CF RMSE: ', weighted_item_rmse)

User-based CF RMSE:  2.77512996175603
Item-based CF RMSE:  3.326269736593521


In [None]:
# Recommendations based on users' mean ratings and the "similarity" matrix
def k_fract_mean_predict(top):
    """Mean-centred user-based prediction over the nearest users.

    Reads the notebook globals ``n_users``, ``n_movies``, ``user_similarity``
    (pairwise cosine DISTANCES between users) and ``train_data_matrix``
    (0 marks "no rating").

    Args:
        top: number of nearest users taken into account for every user.

    Returns:
        ``(n_users, n_movies)`` array where row ``u`` equals
        ``mean_u + sum_v w(u, v) * (r_v - mean_v) / sum_v |w(u, v)|`` over the
        ``top`` nearest users ``v``, with ``w = 1 - cosine distance``. Means are
        taken over rated cells only and a neighbour contributes a zero deviation
        for movies it has not rated. A user without training ratings gets mean 0.
    """
    rated = train_data_matrix > 0
    rated_counts = rated.sum(axis=1)
    rating_sums = np.where(rated, train_data_matrix, 0.0).sum(axis=1)

    # Per-user mean over rated movies only; users with no ratings get 0 instead of
    # the NaN (and RuntimeWarning) produced by averaging an empty selection.
    user_means = np.divide(
        rating_sums,
        rated_counts,
        out=np.zeros(rating_sums.shape),
        where=rated_counts > 0,
    )

    # Each neighbour is centred on ITS OWN mean, and only where it actually rated:
    # an unrated cell must not count as a rating of 0.
    deviations = np.where(rated, train_data_matrix - user_means[:, np.newaxis], 0.0)

    pred = np.zeros((n_users, n_movies))

    for i in range(n_users):
        distances = user_similarity[i]
        # Smallest distances first; position 0 (the user itself) is skipped.
        neighbours = distances.argsort()[1:top + 1]

        # Distance -> similarity, so that closer users weigh more.
        weights = 1.0 - distances[neighbours]
        denominator = np.abs(weights).sum()

        if denominator != 0:
            adjustment = weights.dot(deviations[neighbours]) / denominator
        else:
            # No usable neighbour: fall back to the user's own mean.
            adjustment = 0.0

        pred[i] = user_means[i] + adjustment

    return pred

def k_fract_mean_predict_item(top):
    """Mean-centred item-based prediction over the nearest movies.

    Reads the notebook globals ``n_users``, ``n_movies``, ``item_similarity``
    (pairwise cosine DISTANCES between movies) and ``train_data_matrix``
    (0 marks "no rating").

    Args:
        top: number of nearest movies taken into account for every movie.

    Returns:
        ``(n_users, n_movies)`` array where column ``m`` equals
        ``mean_m + sum_k w(m, k) * (r_k - mean_k) / sum_k |w(m, k)|`` over the
        ``top`` nearest movies ``k``, with ``w = 1 - cosine distance``. Means are
        taken over rated cells only and a neighbour contributes a zero deviation
        for users who have not rated it. A movie without training ratings gets
        mean 0.
    """
    ratings_by_movie = train_data_matrix.T
    rated = ratings_by_movie > 0
    rated_counts = rated.sum(axis=1)
    rating_sums = np.where(rated, ratings_by_movie, 0.0).sum(axis=1)

    # Per-movie mean over actual ratings; movies absent from the training split get
    # 0 without going through the mean of an empty array (NaN + RuntimeWarning).
    movie_means = np.divide(
        rating_sums,
        rated_counts,
        out=np.zeros(rating_sums.shape),
        where=rated_counts > 0,
    )

    # Each neighbour movie is centred on its own mean, only where it was rated.
    deviations = np.where(rated, ratings_by_movie - movie_means[:, np.newaxis], 0.0)

    pred = np.zeros((n_movies, n_users))

    for i in range(n_movies):
        distances = item_similarity[i]
        # Smallest distances first; position 0 (the movie itself) is skipped.
        neighbours = distances.argsort()[1:top + 1]

        # Distance -> similarity, so that closer movies weigh more.
        weights = 1.0 - distances[neighbours]
        denominator = np.abs(weights).sum()

        if denominator != 0:
            adjustment = weights.dot(deviations[neighbours]) / denominator
        else:
            # No usable neighbour: fall back to the movie's own mean.
            adjustment = 0.0

        pred[i] = movie_means[i] + adjustment

    return pred.T

# Score both mean-centred predictors (7 neighbours each) against the test rating matrix.
k_predict = k_fract_mean_predict(top=7)
mean_centred_user_rmse = rmse(prediction=k_predict, ground_truth=test_data_matrix)
print('User-based CF RMSE: ', mean_centred_user_rmse)

k_predict_item = k_fract_mean_predict_item(top=7)
mean_centred_item_rmse = rmse(prediction=k_predict_item, ground_truth=test_data_matrix)
print('Item-based CF RMSE: ', mean_centred_item_rmse)

User-based CF RMSE:  1.5119416179531713


  mean_rating = np.array([x for x in train_data_matrix.T[i] if x > 0]).mean()
  ret = ret.dtype.type(ret / rcount)


Item-based CF RMSE:  1.4879727041574484


In [None]:
def get_recommendations_for_user(user_id, top_n=10, n_neighbors=7, predictions=None):
    """Return the best-scoring unseen movies for one user.

    The previous version passed ``user_id`` as the neighbour count of
    ``k_fract_mean_predict`` and returned the whole prediction matrix.

    Reads the notebook globals ``n_users``, ``train_data_matrix``, ``movie_ids``
    and (unless ``predictions`` is given) ``k_fract_mean_predict``.

    Args:
        user_id: 1-based user id (row ``user_id - 1`` of the rating matrices).
        top_n: maximum number of movies to return.
        n_neighbors: neighbour count forwarded to ``k_fract_mean_predict``.
        predictions: optional precomputed ``(n_users, n_movies)`` prediction
            matrix; pass it to avoid recomputing predictions on every call.

    Returns:
        List of at most ``top_n`` movie ids taken from ``movie_ids``, ordered from
        the highest to the lowest predicted rating. Movies the user already rated
        in ``train_data_matrix`` and non-finite predictions are left out.

    Raises:
        ValueError: if ``user_id`` is outside ``1..n_users``.
    """
    if not 1 <= user_id <= n_users:
        raise ValueError('user_id must be between 1 and {}'.format(n_users))

    if predictions is None:
        predictions = k_fract_mean_predict(n_neighbors)

    user_row = user_id - 1
    already_rated = train_data_matrix[user_row] > 0
    # Push already-rated movies to the very end of the ranking.
    scores = np.where(already_rated, -np.inf, predictions[user_row])

    # Stable sort keeps the column order for equal scores, so the result is deterministic.
    ranked_columns = np.argsort(-scores, kind='stable')
    ranked_columns = ranked_columns[np.isfinite(scores[ranked_columns])][:top_n]

    # Column k of the matrices corresponds to movie_ids[k] (see the scaling cell).
    return [int(movie_ids[column]) for column in ranked_columns]

# Example of calling the function from the store: recommendations for a single user
user_id = 42
recommended_books = get_recommendations_for_user(user_id=user_id)