# Рекомендательные системы. Лекция 1

In [188]:
import os

In [189]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

## Копирование данных из репозитория на GitHub



In [None]:
!wget https://github.com/ALKONDR/netology-recsys/archive/refs/heads/master.zip
!unzip master.zip

--2023-06-22 16:01:01--  https://github.com/ALKONDR/netology-recsys/archive/refs/heads/master.zip
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/ALKONDR/netology-recsys/zip/refs/heads/master [following]
--2023-06-22 16:01:01--  https://codeload.github.com/ALKONDR/netology-recsys/zip/refs/heads/master
Resolving codeload.github.com (codeload.github.com)... 140.82.113.10
Connecting to codeload.github.com (codeload.github.com)|140.82.113.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘master.zip.1’

master.zip.1            [  <=>               ]   1.06M  4.23MB/s    in 0.3s    

2023-06-22 16:01:02 (4.23 MB/s) - ‘master.zip.1’ saved [1111929]

Archive:  master.zip
dfe2a910caf170a1f0fd2174867169ce737c9dc7
replace netology-recsys-master/lecture-1/.ipynb_checkpoint

In [None]:
# List the top-level entries of the unpacked repository directory.
os.listdir('netology-recsys-master')

In [None]:
# Directory (relative to the working directory) that the lecture's CSV files are read from.
prefix = 'netology-recsys-master/lecture-1'

In [None]:
# Read the four CSV tables from the lecture directory; the file names on the
# right are listed in the same order as the target variables on the left.
links, movies, ratings, tags = (
    pd.read_csv(os.path.join(prefix, csv_name))
    for csv_name in ('links.csv', 'movies.csv', 'ratings.csv', 'tags.csv')
)

In [None]:
# Preview the first rows of the links table.
links.head()

In [None]:
# Preview the first rows of the movies table.
movies.head()

In [None]:
# Show the index obtained by indexing movies on movieId (movies itself is not reassigned).
movies.set_index('movieId').index

In [None]:
# Preview the first rows of the ratings table.
ratings.head()

In [None]:
# Preview the first rows of the tags table.
tags.head()

In [None]:
# Attach movie columns (e.g. the title) to every rating by matching on movieId.
ratings_by_movie = ratings.set_index('movieId')
movies_by_id = movies.set_index('movieId')
joined_ratings = ratings_by_movie.join(movies_by_id, on='movieId')

In [None]:
# Preview the first rows of the joined ratings/movies table.
joined_ratings.head()

In [None]:
# Histogram of the distribution of rating values.
joined_ratings['rating'].hist(bins=10)

In [None]:
# Histogram of the number of ratings per film.
ratings_per_title = joined_ratings.groupby('title')['rating'].count()
ratings_per_title.hist(bins=100);

In [None]:
# Rank films by their mean rating, best first, and show the top 10.
mean_rating_by_title = joined_ratings.groupby('title')[['rating']].mean()
top_films = mean_rating_by_title.sort_values(by='rating', ascending=False)
top_films.head(10)

In [None]:
# Keep only the titles whose mean rating equals the highest value, 5.0.
has_top_mean = top_films['rating'] == 5.0
films_with_highest_marks = top_films.loc[has_top_mean].index
films_with_highest_marks

In [None]:
# Peek at the first (title, group) pair the groupby yields, then stop iterating.
for title, group in tqdm_notebook(joined_ratings.groupby('title')):
    print(f'Title: {title}\n\n', f'Group: {group}', sep='\n')
    break

In [None]:
# For every film title, count the distinct users who rated it.
title_num_ratings = {
    title: len(group['userId'].unique())
    for title, group in tqdm_notebook(joined_ratings.groupby('title'))
}

In [None]:
# Top 10 films with a 5.0 mean rating, ordered by their number of ratings;
# the resulting ranking turns out not to be a very good one.
count_title_pairs = [(title_num_ratings[name], name) for name in films_with_highest_marks]
count_title_pairs.sort(key=lambda pair: pair[0], reverse=True)
count_title_pairs[:10]

The same result with less effort

In [None]:
# Per-title rating count and mean in one aggregation, then the 5.0-mean films
# ordered by how many ratings they received.
top_films = joined_ratings.groupby('title').agg({'rating': ('count', 'mean')})
has_perfect_mean = top_films[('rating', 'mean')] == 5
top_films[has_perfect_mean].sort_values(
    by=[('rating', 'count'), ('rating', 'mean')],
    ascending=[False, True],
)

# Улучшение метрики

### Попробуем теперь сортировать фильмы по следующей метрике: средняя оценка фильма, умноженная на нормированное количество рейтингов

In [None]:
# Simple summary statistics over the per-film rating counts.
num_ratings_per_title = list(title_num_ratings.values())
min_num_ratings = np.min(num_ratings_per_title)
max_num_ratings = np.max(num_ratings_per_title)
mean_num_ratings = np.mean(num_ratings_per_title)
median_num_ratings = np.median(num_ratings_per_title)

In [None]:
# Report the rating-count statistics, one per line.
ratings_report = "\n".join([
    f"Minimum number of ratings:\t{min_num_ratings}",
    f"Maximum number of ratings:\t{max_num_ratings}",
    f"Average number of ratings:\t{mean_num_ratings:.2f}",
    f"Median number of ratings:\t{median_num_ratings}",
])
print(ratings_report)

In [None]:
# Mean rating of every film, keyed by title.
title_mean_rating = {
    title: group['rating'].mean()
    for title, group in tqdm_notebook(joined_ratings.groupby('title'))
}

Что это за нормализация?

In [None]:
# Score every film: mean rating times its rating count, centred on the mean
# count and scaled by the (max - min) range of counts.
num_ratings_range = max_num_ratings - min_num_ratings
film_with_our_mark = [
    (name, title_mean_rating[name] * (num_ratings - mean_num_ratings) / num_ratings_range)
    for name, num_ratings in title_num_ratings.items()
]

In [None]:
# Print the 20 best films by our metric; the ranking already looks quite good.
LIMIT = 20
ranked_by_our_mark = sorted(film_with_our_mark, key=lambda pair: pair[1], reverse=True)
for film, rate in ranked_by_our_mark[:LIMIT]:
    print(f"{film[:50]: >60}:\t{rate:.3f}")

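The same results. However, this normalization is unusual: the numerator subtracts the mean count while the denominator is the (max − min) range, so every film with fewer ratings than average ends up with a negative score.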

## $Rating_{i (norm)} = Rating_{i (mean)} \cdot \frac{Count_{i} - Count_{mean}}{Count_{max} - Count_{min}}$

In [None]:
# The mean-centred metric from the formula above, computed with column
# arithmetic on the aggregated frame instead of a Python loop.
top_films = joined_ratings.groupby('title').agg({'rating': ('count', 'mean')})
mean_rating = top_films[('rating', 'mean')]
centered_count = top_films[('rating', 'count')] - mean_num_ratings
top_films['norm_rating'] = mean_rating * centered_count / (max_num_ratings - min_num_ratings)
top_films.sort_values(by='norm_rating', ascending=False)

## Min-Max normalization.
## $Rating_{norm} = Rating_{mean} \cdot \frac{Count - Count_{min}}{Count_{max} - Count_{min}}$

In [None]:
# Min-max variant: the rating count is shifted by the minimum count before
# being scaled by the (max - min) range, then multiplied by the mean rating.
top_films = joined_ratings.groupby('title').agg({'rating': ('count', 'mean')})
mean_rating = top_films[('rating', 'mean')]
shifted_count = top_films[('rating', 'count')] - min_num_ratings
top_films['norm_rating'] = mean_rating * shifted_count / (max_num_ratings - min_num_ratings)
top_films.sort_values(by='norm_rating', ascending=False)

### Появилась гипотеза использовать теги в ранжировании фильмов, решили считать не только количество отзывов, а ещё и количество проставленных тегов на фильм

In [None]:
# Join the ratings/movies table with the tags table on movieId; overlapping
# column names are disambiguated with the _left / _right suffixes.
tags_by_movie = tags.set_index('movieId')
joined_with_tags = joined_ratings.join(
    tags_by_movie,
    on='movieId',
    lsuffix='_left',
    rsuffix='_right',
)
joined_with_tags.head(10)

In [None]:
# Number of rows per film title in the ratings+tags join (the "actions" count).
title_num_actions = {
    title: len(group)
    for title, group in tqdm_notebook(joined_with_tags.groupby('title'))
}

In [None]:
# Summary statistics over the per-film action counts.
num_actions_per_title = list(title_num_actions.values())
min_num_actions = np.min(num_actions_per_title)
max_num_actions = np.max(num_actions_per_title)
mean_num_actions = np.mean(num_actions_per_title)
median_num_actions = np.median(num_actions_per_title)

In [None]:
# Report the action-count statistics (label typo "Minimun" corrected so the
# output matches the rating-count report printed earlier in the notebook).
print(f"Minimum number of actions:\t{min_num_actions}")
print(f"Maximum number of actions:\t{max_num_actions}")
print(f"Average number of actions:\t{mean_num_actions:.2f}")
print(f"Median number of actions:\t{median_num_actions}")

In [None]:
# Score every film with the new metric: mean rating times its action count
# (rows in the ratings+tags join), centred on the mean action count and scaled
# by the (max - min) range of action counts.
# The statistics must come from the action counts themselves; the previous
# version mixed in the rating-count statistics (mean/max/min_num_ratings),
# which describe a different distribution.
num_actions_range = max_num_actions - min_num_actions
film_with_new_mark = [
    (name, title_mean_rating[name] * (num_actions - mean_num_actions) / num_actions_range)
    for name, num_actions in title_num_actions.items()
]

In [None]:
# Print the 20 best films according to the new metric.
LIMIT = 20
ranked_by_new_mark = sorted(film_with_new_mark, key=lambda pair: pair[1], reverse=True)
for film, rate in ranked_by_new_mark[:LIMIT]:
    print(f"{film[:50]: >60}:\t{rate:.2f}")

In [None]:
# Row counts per title in the ratings+tags join, summarised and printed as
# min / max / mean / median (one value per line).
grouped = joined_with_tags.groupby('title')['title']
rows_per_title = grouped.count()
tags_min = rows_per_title.min()
tags_max = rows_per_title.max()
tags_mean = rows_per_title.mean()
tags_median = rows_per_title.median()
for stat_value in (tags_min, tags_max, tags_mean, tags_median):
    print(stat_value)

In [None]:
# Per-title count of non-null tag rows and mean rating.
top_films2 = joined_with_tags.groupby('title').agg({'tag': 'count', 'rating': 'mean'})
# Min-max normalisation has to use the min/max of the quantity being
# normalised. The previous version scaled the tag counts with
# min_num_ratings / max_num_ratings, i.e. statistics of the rating counts.
tag_count = top_films2['tag']
tag_count_min = tag_count.min()
tag_count_max = tag_count.max()
top_films2['norm_rating'] = top_films2['rating'] * (tag_count - tag_count_min) / (tag_count_max - tag_count_min)
top_films2.sort_values(by='norm_rating', ascending=False)