# <center> Майнор "Интеллектуальный анализ данных" </center>

# <center> Курс "Прикладные задачи анализа данных" </center>

# <center> Лабораторная работа №2. Рекомендательные системы </center>

* В данной лабораторной работе вам предлагается разработать алгоритм рекомендации фильмов.  
* Рекомендацией пользователю будем считать подборку из 10 фильмов.  
* Ожидаемый результат - эти фильмы пользователю понравятся и он высоко их оценит (weak assumption). 

### Задания  
  
1. Выполните разведывательный анализ данных (EDA).  
2. Разработайте алгоритм рекомендации фильмов пользователю. Используйте различные подходы: collaborative filtering (user to user, item to item), content-based, гибридный.  
3. Предложите способ оценки качества алгоритма и оцените качество различных подходов и моделей из предыдущего пункта.  
4. Опишите недостатки и ограничения предложенных вами моделей/алгоритмов и предложите возможные решения для борьбы с ними. 

### Данные 

Данные находятся в архиве `ml-latest-small.zip`.  
Они представляют собой оценки 9742 фильмов, выставленные 610 пользователями.
Также для некоторых фильмов известна информация о жанрах в виде тегов.

**Tip:** Вы можете самостоятельно добыть дополнительную информацию о фильмах, используя открытые (или закрытые) источники. Например, данные с IMDb, для взаимодействия с которой есть [специальный модуль](https://github.com/alberanid/imdbpy).  
**NB:** Использование дополнительных данных положительно скажется на оценке.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Select seaborn's 'white' style for the plots drawn later in the notebook.
sns.set_style('white')

In [84]:
# Load the four dataset tables in one pass; the unpacking order must match
# the order of the paths below.
ratings, links, movies, tags = map(
    pd.read_csv,
    (
        "data/ratings.csv",
        "data/links.csv",
        "data/movies.csv",
        "data/tags.csv",
    ),
)

In [3]:
# Preview the first three rows of the ratings table.
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [4]:
# Preview the first three rows of the links table.
links.head(3)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0


In [5]:
# Preview the first three rows of the tags table.
tags.head(3)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992


In [7]:
# Preview the first three rows of the movies table.
movies.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [8]:
# Summary statistics for movies (only movieId is summarised by default).
movies.describe()

Unnamed: 0,movieId
count,9742.0
mean,42200.353623
std,52160.494854
min,1.0
25%,3248.25
50%,7300.0
75%,76232.0
max,193609.0


In [9]:
# Summary statistics for the numeric columns of the tags table.
tags.describe()

Unnamed: 0,userId,movieId,timestamp
count,3683.0,3683.0,3683.0
mean,431.149335,27252.013576,1320032000.0
std,158.472553,43490.558803,172102500.0
min,2.0,1.0,1137179000.0
25%,424.0,1262.5,1137521000.0
50%,474.0,4454.0,1269833000.0
75%,477.0,39263.0,1498457000.0
max,610.0,193565.0,1537099000.0


In [10]:
# Summary statistics for the numeric columns of the ratings table.
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [59]:
# Five movies with the highest mean rating. No minimum number of ratings is
# required, so a movie with a single 5.0 rating can top the list.
(
    ratings.groupby("movieId")
    .mean()
    .sort_values("rating", ascending=False)
    .merge(movies, on="movieId")
    .loc[:, ["title", "rating"]]
    .head()
)

Unnamed: 0,title,rating
0,Paper Birds (Pájaros de papel) (2010),5.0
1,"Act of Killing, The (2012)",5.0
2,Jump In! (2007),5.0
3,Human (2015),5.0
4,L.A. Slasher (2015),5.0


In [85]:
# One-hot encode the '|'-separated genres string: one 0/1 column per genre.
tmp = movies["genres"].str.get_dummies(sep='|')
# Preview the first two encoded rows.
tmp.head(2)

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [86]:
# Attach the genre indicator columns to the movie rows and discard the raw
# '|'-separated genres string, which the indicators replace.
movies_dummy = pd.concat([movies, tmp], axis=1).drop(columns="genres")
movies_dummy.head()

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
# Number of ratings each movie received. Looking the count up by movieId
# avoids building a full merged frame and does not depend on the merge
# result's positional index lining up with movies' index. Movies without any
# rating get NaN (so the column is float), as with the former left merge.
movies["ratingCount"] = movies["movieId"].map(
    ratings.groupby("movieId")["rating"].count()
)

In [88]:
# Bring the genre indicator columns into movies, matching rows on movieId.
# The title column is dropped first so the merge does not duplicate it.
genre_flags = movies_dummy.drop(columns="title")
movies = movies.merge(genre_flags, how="left", on="movieId")

In [89]:
# Movies whose title contains no "(" at all, i.e. no "(year)" suffix.
# A literal (non-regex) match avoids the invalid '\(' escape sequence;
# na=True keeps rows with a missing title out of the result, as before.
movies[~movies["title"].str.contains("(", regex=False, na=True)]

Unnamed: 0,movieId,title,genres,ratingCount,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
6059,40697,Babylon 5,Sci-Fi,2.0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
9031,140956,Ready Player One,Action|Sci-Fi|Thriller,4.0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
9091,143410,Hyena Road,(no genres listed),1.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9138,147250,The Adventures of Sherlock Holmes and Doctor W...,(no genres listed),1.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9179,149334,Nocturnal Animals,Drama|Thriller,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
9259,156605,Paterson,(no genres listed),1.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9367,162414,Moonlight,Drama,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9448,167570,The OA,(no genres listed),1.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9514,171495,Cosmos,(no genres listed),2.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9515,171631,Maria Bamford: Old Baby,(no genres listed),1.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [90]:
# Movies flagged as having no genre information.
movies.loc[movies["(no genres listed)"].eq(1)]

Unnamed: 0,movieId,title,genres,ratingCount,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
8517,114335,La cravate (1957),(no genres listed),1.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8684,122888,Ben-hur (2016),(no genres listed),1.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8687,122896,Pirates of the Caribbean: Dead Men Tell No Tal...,(no genres listed),7.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8782,129250,Superfast! (2015),(no genres listed),1.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8836,132084,Let It Be Me (1995),(no genres listed),1.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8902,134861,Trevor Noah: African American (2013),(no genres listed),1.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9033,141131,Guardians (2016),(no genres listed),1.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9053,141866,Green Room (2015),(no genres listed),3.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9070,142456,The Brand New Testament (2015),(no genres listed),2.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9091,143410,Hyena Road,(no genres listed),1.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
def parse_title(row):
    """Extract the release year from a title of the form "Name (YYYY)".

    The year is read from between the last "(" and the last ")" of
    ``row["title"]``, e.g. ``"Toy Story (1995)"`` -> ``1995``.

    Args:
        row: Mapping-like object (for example a DataFrame row) whose
            ``"title"`` entry is a string. It is not modified.

    Returns:
        The year as an ``int``, or ``-1`` when the title has no parentheses
        or the text between them is not a plain number.
    """
    title = row["title"]
    year_start = title.rfind("(")
    year_end = title.rfind(")")
    if year_start == -1 or year_end == -1:
        return -1
    year = title[year_start + 1:year_end]
    # isdecimal() only accepts characters that int() can parse; isdigit()
    # also accepts e.g. superscript digits, which would make int() raise.
    return int(year) if year.isdecimal() else -1
def drop_year(row):
    """Return ``row["title"]`` without its last parenthesised suffix.

    Everything from the last "(" onwards is removed, together with the
    whitespace before it, e.g. ``"Toy Story (1995)"`` -> ``"Toy Story"``.

    Args:
        row: Mapping-like object (for example a DataFrame row) whose
            ``"title"`` entry is a string.

    Returns:
        The shortened title, or the title unchanged when it contains no "(".
    """
    title = row["title"]
    year_start = title.rfind("(")
    if year_start == -1:
        # Nothing to strip: keep the title rather than returning None, which
        # would erase titles that carry no year suffix.
        return title
    # rstrip() removes the separator before "(" without assuming that it is
    # exactly one character wide.
    return title[:year_start].rstrip()

In [92]:
# Derive the release year of every movie from its title, row by row.
movies["year"] = movies.apply(parse_title, axis=1)

In [93]:
# Strip the parenthesised suffix from every title. This has to run after the
# year column is filled, because parse_title reads the year from that suffix.
movies["title"] = movies.apply(drop_year, axis=1)

In [94]:
# Display the movies table after the title clean-up and year extraction.
movies

Unnamed: 0,movieId,title,genres,ratingCount,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,215.0,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1995
1,2,Jumanji,Adventure|Children|Fantasy,110.0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1995
2,3,Grumpier Old Men,Comedy|Romance,52.0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,7.0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1995
4,5,Father of the Bride Part II,Comedy,49.0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1995
5,6,Heat,Action|Crime|Thriller,102.0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1995
6,7,Sabrina,Comedy|Romance,54.0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1995
7,8,Tom and Huck,Adventure|Children,8.0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1995
8,9,Sudden Death,Action,16.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
9,10,GoldenEye,Action|Adventure|Thriller,132.0,0,1,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1995
