In [1]:
import pandas as pd

In [2]:
# Load the movie and rating tables. header=0 consumes each file's own header
# row so the snake_case names listed here are used instead.
movies = pd.read_csv(
    'data/movies.csv',
    header=0,
    names=['movie_id', 'title', 'genres'],
    index_col='movie_id',
)
# Indexed by user_id, which repeats once per rating, so that
# ratings.loc[user_id] selects every row belonging to one user.
ratings = pd.read_csv(
    'data/ratings.csv',
    header=0,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    index_col='user_id',
)

In [3]:
# Preview the first rows of the movies table (indexed by movie_id).
movies.head()

Unnamed: 0_level_0,title,genres
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [4]:
# Preview the first rows of the ratings table (indexed by user_id).
ratings.head()

Unnamed: 0_level_0,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,4.0,964982703
1,3,4.0,964981247
1,6,4.0,964982224
1,47,5.0,964983815
1,50,5.0,964982931


In [5]:
# Summary statistics for the numeric rating columns.
ratings.describe()

Unnamed: 0,movie_id,rating,timestamp
count,100836.0,100836.0,100836.0
mean,19435.295718,3.501557,1205946000.0
std,35530.987199,1.042529,216261000.0
min,1.0,0.5,828124600.0
25%,1199.0,3.0,1019124000.0
50%,2991.0,3.5,1186087000.0
75%,8122.0,4.0,1435994000.0
max,193609.0,5.0,1537799000.0


## Primeira tentativa de recomendação: heurística de total de votos

In [6]:
# How many ratings each movie received; value_counts() keys the result by
# movie_id and orders it from most- to least-rated.
total_ratings = ratings.movie_id.value_counts()
total_ratings.head(5)

356     329
318     317
296     307
593     279
2571    278
Name: movie_id, dtype: int64

In [7]:
# Attach the per-movie rating count as a new column. The assignment aligns on
# the movie_id index shared by `movies` and `total_ratings`.
# NOTE(review): movies with no ratings would get NaN here, which is presumably
# why the counts display as floats (215.0) -- confirm.
movies['total_ratings'] = total_ratings
movies.head()

Unnamed: 0_level_0,title,genres,total_ratings
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215.0
2,Jumanji (1995),Adventure|Children|Fantasy,110.0
3,Grumpier Old Men (1995),Comedy|Romance,52.0
4,Waiting to Exhale (1995),Comedy|Drama|Romance,7.0
5,Father of the Bride Part II (1995),Comedy,49.0


In [8]:
# The ten movies with the most ratings.
(
    movies
    .sort_values(by='total_ratings', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,title,genres,total_ratings
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
356,Forrest Gump (1994),Comedy|Drama|Romance|War,329.0
318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0
296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,307.0
593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,279.0
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,278.0
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,251.0
480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,238.0
110,Braveheart (1995),Action|Drama|War,237.0
589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,224.0
527,Schindler's List (1993),Drama|War,220.0


In [9]:
# Average rating of each movie, keyed by movie_id.
mean_ratings = ratings.groupby('movie_id').rating.mean()

# Index alignment on movie_id attaches each average to its movie row.
movies['mean_ratings'] = mean_ratings
movies.head(5)

Unnamed: 0_level_0,title,genres,total_ratings,mean_ratings
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215.0,3.92093
2,Jumanji (1995),Adventure|Children|Fantasy,110.0,3.431818
3,Grumpier Old Men (1995),Comedy|Romance,52.0,3.259615
4,Waiting to Exhale (1995),Comedy|Drama|Romance,7.0,2.357143
5,Father of the Bride Part II (1995),Comedy,49.0,3.071429


In [10]:
# Most-rated movies again, now with their average rating alongside.
(
    movies
    .sort_values(by='total_ratings', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,title,genres,total_ratings,mean_ratings
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
356,Forrest Gump (1994),Comedy|Drama|Romance|War,329.0,4.164134
318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0,4.429022
296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,307.0,4.197068
593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,279.0,4.16129
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,278.0,4.192446
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,251.0,4.231076
480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,238.0,3.75
110,Braveheart (1995),Action|Drama|War,237.0,4.031646
589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,224.0,3.970982
527,Schindler's List (1993),Drama|War,220.0,4.225


## Uma segunda heurística: nota média com filtro por total de votos

In [11]:
# Ranking by average alone: the top is taken by movies that have a single
# 5.0 rating, which motivates the minimum-vote filter in the next cell.
(
    movies
    .sort_values(by='mean_ratings', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,title,genres,total_ratings,mean_ratings
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
88448,Paper Birds (Pájaros de papel) (2010),Comedy|Drama,1.0,5.0
100556,"Act of Killing, The (2012)",Documentary,1.0,5.0
143031,Jump In! (2007),Comedy|Drama|Romance,1.0,5.0
143511,Human (2015),Documentary,1.0,5.0
143559,L.A. Slasher (2015),Comedy|Crime|Fantasy,1.0,5.0
6201,Lady Jane (1986),Drama|Romance,1.0,5.0
102217,Bill Hicks: Revelations (1993),Comedy,1.0,5.0
102084,Justice League: Doom (2012),Action|Animation|Fantasy,1.0,5.0
6192,Open Hearts (Elsker dig for evigt) (2002),Romance,1.0,5.0
145994,Formula of Love (1984),Comedy,1.0,5.0


In [12]:
# Keep only movies with enough votes for the average to be meaningful.
# Note: despite the variable name ("more than 50"), the threshold is
# inclusive -- movies with exactly 50 ratings are kept.
movies_more_than_50_total_ratings = movies[movies['total_ratings'] >= 50]
(
    movies_more_than_50_total_ratings
    .sort_values(by='mean_ratings', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,title,genres,total_ratings,mean_ratings
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0,4.429022
858,"Godfather, The (1972)",Crime|Drama,192.0,4.289062
2959,Fight Club (1999),Action|Crime|Drama|Thriller,218.0,4.272936
1276,Cool Hand Luke (1967),Drama,57.0,4.27193
750,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War,97.0,4.268041
904,Rear Window (1954),Mystery|Thriller,84.0,4.261905
1221,"Godfather: Part II, The (1974)",Crime|Drama,129.0,4.25969
48516,"Departed, The (2006)",Crime|Drama|Thriller,107.0,4.252336
1213,Goodfellas (1990),Crime|Drama,126.0,4.25
912,Casablanca (1942),Drama|Romance,100.0,4.24


In [13]:
# Ids of movies already watched (titles as shown in this cell's output).
i_watch = [
    59315,  # Iron Man (2008)
    59615,  # Indiana Jones and the Kingdom of the Crystal S...
    72998,  # Avatar (2009)
    86332,  # Thor (2011)
    8368,   # Harry Potter and the Prisoner of Azkaban (2004)
    40815,  # Harry Potter and the Goblet of Fire (2005)
    260,    # Star Wars: Episode IV - A New Hope (1977)
]
movies.loc[i_watch]

Unnamed: 0_level_0,title,genres,total_ratings,mean_ratings
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
59315,Iron Man (2008),Action|Adventure|Sci-Fi,94.0,3.824468
59615,Indiana Jones and the Kingdom of the Crystal S...,Action|Adventure|Comedy|Sci-Fi,39.0,2.833333
72998,Avatar (2009),Action|Adventure|Sci-Fi|IMAX,97.0,3.603093
86332,Thor (2011),Action|Adventure|Drama|Fantasy|IMAX,34.0,3.514706
8368,Harry Potter and the Prisoner of Azkaban (2004),Adventure|Fantasy|IMAX,93.0,3.913978
40815,Harry Potter and the Goblet of Fire (2005),Adventure|Fantasy|Thriller|IMAX,71.0,3.816901
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,251.0,4.231076


In [14]:
# Best-rated, sufficiently-voted movies whose genre string is exactly
# 'Action|Adventure|Sci-Fi', excluding what was already watched.
# errors='ignore' lets drop() skip watched ids that the filters already removed.
(
    movies_more_than_50_total_ratings
    .query("genres=='Action|Adventure|Sci-Fi'")
    .drop(i_watch, errors='ignore')
    .sort_values(by='mean_ratings', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,title,genres,total_ratings,mean_ratings
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,211.0,4.21564
1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi,196.0,4.137755
112852,Guardians of the Galaxy (2014),Action|Adventure|Sci-Fi,59.0,4.050847
34405,Serenity (2005),Action|Adventure|Sci-Fi,50.0,3.94
3793,X-Men (2000),Action|Adventure|Sci-Fi,133.0,3.699248
2640,Superman (1978),Action|Adventure|Sci-Fi,61.0,3.606557
33493,Star Wars: Episode III - Revenge of the Sith (...,Action|Adventure|Sci-Fi,78.0,3.429487
316,Stargate (1994),Action|Adventure|Sci-Fi,140.0,3.375
2105,Tron (1982),Action|Adventure|Sci-Fi,50.0,3.34
2628,Star Wars: Episode I - The Phantom Menace (1999),Action|Adventure|Sci-Fi,140.0,3.107143


## Procurar usuários "similares"

In [15]:
import numpy as np
from collections import namedtuple


# Result record for a pairwise comparison: the two user ids and their distance.
# The trailing underscores avoid the reserved word `from`.
Distance = namedtuple(
    typename='Distance',
    field_names=('from_', 'to_', 'distance'),
)


def vectors_distance(a, b):
    """Return the norm of ``a - b`` using ``np.linalg.norm`` defaults.

    For 1-D arrays this is the Euclidean distance between the two vectors.
    """
    difference = a - b
    return np.linalg.norm(difference)


def user_ratings(user_id):
    """Return the ratings given by one user, indexed by movie.

    Args:
        user_id: label to look up in the index of the global ``ratings`` frame.

    Returns:
        DataFrame with a single ``rating`` column, indexed by ``movie_id``.

    Raises:
        KeyError: if ``user_id`` is not present in ``ratings.index``.
    """
    # A list selector makes .loc return a DataFrame even when the user has a
    # single rating; a scalar label would collapse that lone row into a Series
    # and the set_index call below would fail.
    rows = ratings.loc[[user_id]]
    return rows[['movie_id', 'rating']].set_index('movie_id')


def user_distance(user_1_id, user_2_id, min_common=5):
    """Measure how differently two users rated the movies they share.

    Args:
        user_1_id: id of the first user.
        user_2_id: id of the second user.
        min_common: minimum number of movies both users must have rated for
            a distance to be computed.

    Returns:
        Distance(from_=user_1_id, to_=user_2_id, distance=...), where the
        distance is the norm of the rating differences over the shared movies
        divided by the number of shared movies, or ``float('inf')`` when fewer
        than ``min_common`` movies are shared.
    """
    # The inner join keeps only movies rated by both users.
    shared = user_ratings(user_1_id).join(
        user_ratings(user_2_id),
        how='inner',
        lsuffix='__left',
        rsuffix='__right',
    )
    n_shared = shared.shape[0]

    if n_shared < min_common:
        # Too little overlap to compare: report the pair as infinitely far apart.
        return Distance(from_=user_1_id, to_=user_2_id, distance=float('inf'))

    gap = vectors_distance(shared.rating__left.values,
                           shared.rating__right.values) / n_shared
    return Distance(from_=user_1_id, to_=user_2_id, distance=gap)

In [16]:
# Sanity check: distance between users 1 and 4.
user_distance(1, 4)

Distance(from_=1, to_=4, distance=0.2474561939035565)

## Distância entre usuários do dataset

In [17]:
def distance_from_all(user_id):
    """Compute the distance from ``user_id`` to every other user in ``ratings``.

    Returns:
        list of Distance records, one per distinct user id in ``ratings.index``
        other than ``user_id`` itself, in the order the ids first appear.
    """
    results = []
    for other_id in ratings.index.unique():
        if other_id == user_id:
            # Skip the user being compared against themselves.
            continue
        results.append(user_distance(user_id, other_id))
    return results


def n_closest(user_id, n=10):
    """Return the ``n`` Distance records nearest to ``user_id``.

    Records are ordered by ascending distance; ties keep the order in which
    ``distance_from_all`` produced them because the sort is stable.
    """
    ranked = distance_from_all(user_id)
    ranked.sort(key=lambda record: record.distance)
    return ranked[:n]


def get_suggestions(user_id, k=10):
    """Suggest movies to ``user_id`` based on their single most similar user.

    A later cell in this notebook redefines ``get_suggestions`` to pool
    several neighbours; this version uses only the nearest one.

    Args:
        user_id: id of the user to produce suggestions for.
        k: maximum number of titles to return.

    Returns:
        numpy array of titles the nearest user rated and ``user_id`` has not,
        ordered from the nearest user's highest rating downwards. Empty when
        no other user shares enough movies to be compared.
    """
    # Work from the `user_id` argument, not a notebook-level global, so the
    # function answers for whichever user the caller asked about.
    nearest = n_closest(user_id, n=1)
    if not nearest or nearest[0].distance == float('inf'):
        # An infinite distance is the "not comparable" sentinel produced by
        # user_distance; such a user is not a meaningful neighbour.
        return movies['title'].iloc[:0].values
    closest_user = nearest[0].to_

    already_rated = user_ratings(user_id)
    closest_user_ratings = user_ratings(closest_user)

    # errors='ignore': movies the target user rated but the neighbour did not.
    new_movies = closest_user_ratings.drop(already_rated.index, errors='ignore')
    new_movies_ordered_index = new_movies.sort_values(
        'rating', ascending=False
    ).head(k).index

    return movies.loc[new_movies_ordered_index].title.values

In [18]:
# Id of the user for whom suggestions are requested in the following cells.
user_searched = 1

In [19]:
# Suggestions for the chosen user, taken from their single closest user.
get_suggestions(user_searched)

array(['Spider-Man 2 (2004)', 'Dark Knight, The (2008)',
       'Batman Begins (2005)',
       'Lord of the Rings: The Fellowship of the Ring, The (2001)',
       'Spider-Man (2002)',
       'Star Wars: Episode II - Attack of the Clones (2002)',
       'Incredibles, The (2004)',
       'Harry Potter and the Chamber of Secrets (2002)',
       'Lord of the Rings: The Two Towers, The (2002)',
       'Crouching Tiger, Hidden Dragon (Wo hu cang long) (2000)'],
      dtype=object)

## Sugerindo baseado em vários usuários

In [20]:
def get_suggestions(user_id, k_samples=5, n_compare=10):
    """Suggest movies to ``user_id`` from the average ratings of similar users.

    Args:
        user_id: id of the user to produce suggestions for.
        k_samples: maximum number of titles to return.
        n_compare: how many of the closest users to pool ratings from.

    Returns:
        numpy array of titles that ``user_id`` has not rated, ordered by the
        mean rating given by the pooled neighbours (highest first). Empty when
        no other user shares enough movies to be compared.
    """
    # Work from the `user_id` argument, not a notebook-level global, so the
    # function answers for whichever user the caller asked about.
    neighbour_ids = [
        neighbour.to_
        for neighbour in n_closest(user_id, n=n_compare)
        # An infinite distance is user_distance's "not comparable" sentinel.
        if neighbour.distance != float('inf')
    ]
    if not neighbour_ids:
        # pd.concat raises on an empty list, so return "no suggestions" instead.
        return movies['title'].iloc[:0].values

    already_rated = user_ratings(user_id)
    pooled_ratings = pd.concat([user_ratings(_id) for _id in neighbour_ids])
    # Several neighbours may have rated the same movie: average per movie.
    mean_by_movie = pooled_ratings.groupby('movie_id').rating.mean()

    new_movies = mean_by_movie.drop(already_rated.index, errors='ignore')
    new_movies_ordered_index = new_movies.sort_values(ascending=False).head(k_samples).index

    return movies.loc[new_movies_ordered_index].title.values

In [21]:
# With n_compare=1 only the single closest user is pooled, so the output
# matches the single-neighbour suggestions shown earlier.
get_suggestions(user_searched, k_samples=10, n_compare=1)

array(['Spider-Man 2 (2004)', 'Dark Knight, The (2008)',
       'Batman Begins (2005)',
       'Lord of the Rings: The Fellowship of the Ring, The (2001)',
       'Spider-Man (2002)',
       'Star Wars: Episode II - Attack of the Clones (2002)',
       'Incredibles, The (2004)',
       'Harry Potter and the Chamber of Secrets (2002)',
       'Lord of the Rings: The Two Towers, The (2002)',
       'Crouching Tiger, Hidden Dragon (Wo hu cang long) (2000)'],
      dtype=object)