In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy

# Classic Recommendations

영화 점보와 평점 정보를 pandas를 이용해서 불러옵니다

In [3]:
ds_movies = pd.read_csv("../ml-latest-small/movies.csv", index_col=0)
ds_ratings = pd.read_csv("../ml-latest-small/ratings.csv")

영화 평점과 평가 수를 집계한 후 영화 제목을 데이터프레임에 추가합니다.

In [4]:
movie_ratings = ds_ratings.groupby("movieId") \
                          .agg({"rating": ["count", "mean"]})
movie_ratings["title"] = ds_movies.title
movie_ratings

Unnamed: 0_level_0,rating,rating,title
Unnamed: 0_level_1,count,mean,Unnamed: 3_level_1
movieId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,215,3.920930,Toy Story (1995)
2,110,3.431818,Jumanji (1995)
3,52,3.259615,Grumpier Old Men (1995)
4,7,2.357143,Waiting to Exhale (1995)
5,49,3.071429,Father of the Bride Part II (1995)
...,...,...,...
193581,1,4.000000,Black Butler: Book of the Atlantic (2017)
193583,1,3.500000,No Game No Life: Zero (2017)
193585,1,3.500000,Flint (2017)
193587,1,3.500000,Bungo Stray Dogs: Dead Apple (2018)


평점이 50개 이상한 영화 중 가장 평점이 좋은 영화 10개를 가져옵니다.<br/>
평점이 같다면 평가 수가 많은 영화 순으로 정렬합니다.

In [5]:
movie_ratings[movie_ratings[("rating", "count")] >= 50] \
             .sort_values([("rating", "mean"), ("rating", "count")], ascending=[False, False]) \
             .head(10)

Unnamed: 0_level_0,rating,rating,title
Unnamed: 0_level_1,count,mean,Unnamed: 3_level_1
movieId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
318,317,4.429022,"Shawshank Redemption, The (1994)"
858,192,4.289062,"Godfather, The (1972)"
2959,218,4.272936,Fight Club (1999)
1276,57,4.27193,Cool Hand Luke (1967)
750,97,4.268041,Dr. Strangelove or: How I Learned to Stop Worr...
904,84,4.261905,Rear Window (1954)
1221,129,4.25969,"Godfather: Part II, The (1974)"
48516,107,4.252336,"Departed, The (2006)"
1213,126,4.25,Goodfellas (1990)
912,100,4.24,Casablanca (1942)


클래스로 구현

1. 가장 많이 본 영화
2. 가장 평점이 높은 영화

In [34]:

class ClassicRecommender:
    def __init__(self, data):
        self.data = data
        
    def get_most_rated(self, top_k=100):
        x = self.data.sort_values(["ratings_count", "ratings_mean"], ascending=[False, False])
        return x.head(top_k)
    
    def get_top_rated(self, top_k=100, min_ratings_count=50):
        x = self.data.sort_values(["ratings_mean", "ratings_count"], ascending=[False, False])
        x = x[x.ratings_count >= min_ratings_count]
        return x.head(top_k)

In [35]:
rec = ClassicRecommender(pd.read_csv("../data/movies.csv", index_col=0))

In [36]:
rec.get_most_rated(10)
# rec.get_top_rated(10)

Unnamed: 0_level_0,title,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,...,no,noir,romance,sci,thriller,war,western,ratings_mean,ratings_count,years
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
356,Forrest Gump (1994),0,0,0,0,1,0,0,1,0,...,0,0,1,0,0,1,0,4.164134,329.0,1994
318,"Shawshank Redemption, The (1994)",0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,4.429022,317.0,1994
296,Pulp Fiction (1994),0,0,0,0,1,1,0,1,0,...,0,0,0,0,1,0,0,4.197068,307.0,1994
593,"Silence of the Lambs, The (1991)",0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,4.16129,279.0,1991
2571,"Matrix, The (1999)",1,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,4.192446,278.0,1999
260,Star Wars: Episode IV - A New Hope (1977),1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,4.231076,251.0,1977
480,Jurassic Park (1993),1,1,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,3.75,238.0,1993
110,Braveheart (1995),1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,4.031646,237.0,1995
589,Terminator 2: Judgment Day (1991),1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,3.970982,224.0,1991
527,Schindler's List (1993),0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,4.225,220.0,1993


## Content-Based

1. 유사한 Genre를 갖고 있는 영화

In [10]:
from sklearn.neighbors import NearestNeighbors

class ContentBasedRecommender:
    
    def __init__(self, data, max_neighbors):
        self.data = data
        self.genre_columns = data.columns[1:25]
        self.nn = NearestNeighbors(max_neighbors)
        self.nn.fit(self.data[self.genre_columns])
        
    def recommend_by_genre(self, movie_id, min_ratings=50):
        movie = self.data.loc[[movie_id], self.genre_columns]
        dists, ids = self.nn.kneighbors(movie)
        
        dists = pd.Series(dists[0], index=self.data.index[ids[0]], name="distance")
        dists.drop(movie_id, inplace=True)
        
        movies = self.data.iloc[ids[0], :]
        movies = pd.concat([movies, dists], axis=1)
        movies = movies.sort_values(by=["distance", "ratings_mean", "ratings_count"], 
                                    ascending=[True,False,False])
        movies = movies[movies.ratings_count >= min_ratings]
        
        return movies
    

In [11]:
rec = ContentBasedRecommender(pd.read_csv("../data/movies.csv", index_col=0), 100)

In [13]:
rec.recommend_by_genre(1)

Unnamed: 0_level_0,title,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,...,noir,romance,sci,thriller,war,western,ratings_mean,ratings_count,years,distance
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4886,"Monsters, Inc. (2001)",0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,3.871212,132.0,2001,0.0
3114,Toy Story 2 (1999),0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,3.860825,97.0,1999,0.0
78499,Toy Story 3 (2010),0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,4.109091,55.0,2010,1.0
6377,Finding Nemo (2003),0,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,3.960993,141.0,2003,1.0
4306,Shrek (2001),0,1,1,1,1,0,0,0,1,...,0,1,0,0,0,0,3.867647,170.0,2001,1.0
5218,Ice Age (2002),0,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,3.688235,85.0,2002,1.0
2355,"Bug's Life, A (1998)",0,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,3.516304,92.0,1998,1.0
3052,Dogma (1999),0,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,3.64557,79.0,1999,1.414214
1,Toy Story (1995),0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,3.92093,215.0,1995,


In [42]:
from scipy.spatial.distance import squareform, pdist
pd.DataFrame(squareform(pdist(genres)), index=genres.index, columns=genres.index)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000000,1.414214,2.236068,2.449490,2.000000,2.828427,2.236068,1.732051,2.449490,2.449490,...,2.449490,2.236068,2.236068,2.000000,2.449490,1.732051,1.414214,2.449490,2.236068,2.000000
2,1.414214,0.000000,2.236068,2.449490,2.000000,2.449490,2.236068,1.000000,2.000000,2.000000,...,2.828427,2.236068,2.236068,2.000000,2.000000,2.236068,2.000000,2.000000,2.236068,2.000000
3,2.236068,2.236068,0.000000,1.000000,1.000000,2.236068,0.000000,2.000000,1.732051,2.236068,...,2.236068,2.000000,1.414214,1.732051,1.732051,2.000000,1.732051,1.732051,2.000000,1.000000
4,2.449490,2.449490,1.000000,0.000000,1.414214,2.449490,1.000000,2.236068,2.000000,2.449490,...,2.449490,1.732051,1.000000,2.000000,2.000000,2.236068,2.000000,1.414214,2.236068,1.414214
5,2.000000,2.000000,1.000000,1.414214,0.000000,2.000000,1.000000,1.732051,1.414214,2.000000,...,2.000000,1.732051,1.000000,1.414214,1.414214,1.732051,1.414214,1.414214,1.732051,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,1.732051,2.236068,2.000000,2.236068,1.732051,2.236068,2.000000,2.449490,1.732051,2.236068,...,1.732051,2.000000,2.000000,1.732051,2.236068,0.000000,1.000000,2.236068,1.414214,1.732051
193583,1.414214,2.000000,1.732051,2.000000,1.414214,2.449490,1.732051,2.236068,2.000000,2.449490,...,2.000000,1.732051,1.732051,1.414214,2.000000,1.000000,0.000000,2.000000,1.732051,1.414214
193585,2.449490,2.000000,1.732051,1.414214,1.414214,2.000000,1.732051,1.732051,1.414214,2.000000,...,2.449490,1.000000,1.000000,1.414214,1.414214,2.236068,2.000000,0.000000,1.732051,1.414214
193587,2.236068,2.236068,2.000000,2.236068,1.732051,1.732051,2.000000,2.000000,1.000000,1.732051,...,1.732051,1.414214,2.000000,1.000000,1.732051,1.414214,1.732051,1.732051,0.000000,1.732051
