# 콘텐츠 기반 필터링

## movies 데이터 불러오기

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [52]:
movies = pd.read_csv('ml_data/tmdb_5000_movies.csv')
movies[:2]

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [53]:
movies['genres'][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

## literal_eval()
: string을 python 데이터 타입으로 변환시켜준다

In [54]:
from ast import literal_eval

In [165]:
literal_eval('{"id": 28, "name": "Action"}')

{'id': 28, 'name': 'Action'}

In [56]:
literal_eval(movies['genres'][0])[0]['name']

'Action'

In [66]:
movie_df = movies.iloc[:, [3,17,1,18,19,8,4,7]]
movie_df[:2]

Unnamed: 0,id,title,genres,vote_average,vote_count,popularity,keywords,overview
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",7.2,11800,150.437577,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",6.9,4500,139.082615,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha..."


In [67]:
movie_df['genres'] = movie_df['genres'].apply(literal_eval)
movie_df['keywords'] = movie_df['keywords'].apply(literal_eval)

In [68]:
[i['name'] for i in movie_df['genres'][0]]

['Action', 'Adventure', 'Fantasy', 'Science Fiction']

In [69]:
movie_df['genres'].apply(lambda x: [y['name'] for y in x])

0       [Action, Adventure, Fantasy, Science Fiction]
1                        [Adventure, Fantasy, Action]
2                          [Action, Adventure, Crime]
3                    [Action, Crime, Drama, Thriller]
4                [Action, Adventure, Science Fiction]
                            ...                      
4798                        [Action, Crime, Thriller]
4799                                [Comedy, Romance]
4800               [Comedy, Drama, Romance, TV Movie]
4801                                               []
4802                                    [Documentary]
Name: genres, Length: 4803, dtype: object

In [70]:
movie_df['genres'] = movie_df['genres'].apply(lambda x: [y['name'] for y in x])
movie_df['keywords'] = movie_df['keywords'].apply(lambda x: [y['name'] for y in x])

In [71]:
movie_df[:3]

Unnamed: 0,id,title,genres,vote_average,vote_count,popularity,keywords,overview
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]",7.2,11800,150.437577,"[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]",6.9,4500,139.082615,"[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,"[Action, Adventure, Crime]",6.3,4466,107.376788,"[spy, based on novel, secret agent, sequel, mi...",A cryptic message from Bond’s past sends him o...


In [73]:
#CountVectorizer 적용 위해 공백문자로 word 단위가 구분되는 문자열로 변환
movie_df['genres_literal'] = movie_df['genres'].apply(lambda x: ' '.join(x))
movie_df[:2]

Unnamed: 0,id,title,genres,vote_average,vote_count,popularity,keywords,overview,genres_literal
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]",7.2,11800,150.437577,"[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...",Action Adventure Fantasy Science Fiction
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]",6.9,4500,139.082615,"[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...",Adventure Fantasy Action


## CountVectorizer()

In [80]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(min_df=0, ngram_range=(1,2))
genre_mat = count_vect.fit_transform(movie_df['genres_literal'])
genre_mat.shape

(4803, 276)

## cosine_similarity

In [81]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(genre_mat[0], genre_mat)

array([[1.        , 0.59628479, 0.4472136 , ..., 0.        , 0.        ,
        0.        ]])

In [89]:
genre_sim = cosine_similarity(genre_mat, genre_mat)
genre_sim

array([[1.        , 0.59628479, 0.4472136 , ..., 0.        , 0.        ,
        0.        ],
       [0.59628479, 1.        , 0.4       , ..., 0.        , 0.        ,
        0.        ],
       [0.4472136 , 0.4       , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

## np.argsort()

In [90]:
import numpy as np

In [93]:
#similarity 작은순으로 index 번호
np.argsort(genre_sim)

array([[2401, 3037, 3038, ...,  813, 3494,    0],
       [2401, 3067, 3069, ...,  129,    1,  262],
       [2401, 2999, 3000, ..., 1542, 1740,    2],
       ...,
       [   0, 2230, 2229, ..., 1895, 3809, 4800],
       [   0, 3205, 3204, ..., 1596, 1594, 4802],
       [   0, 3141, 3140, ..., 4521, 4710, 4802]], dtype=int64)

In [94]:
np.argsort(genre_sim)[:, ::-1]

array([[   0, 3494,  813, ..., 3038, 3037, 2401],
       [ 262,    1,  129, ..., 3069, 3067, 2401],
       [   2, 1740, 1542, ..., 3000, 2999, 2401],
       ...,
       [4800, 3809, 1895, ..., 2229, 2230,    0],
       [4802, 1594, 1596, ..., 3204, 3205,    0],
       [4802, 4710, 4521, ..., 3140, 3141,    0]], dtype=int64)

In [123]:
genre_sim_sorted_ind = np.argsort(genre_sim)[:, ::-1]
genre_sim_sorted_ind

array([[   0, 3494,  813, ..., 3038, 3037, 2401],
       [ 262,    1,  129, ..., 3069, 3067, 2401],
       [   2, 1740, 1542, ..., 3000, 2999, 2401],
       ...,
       [4800, 3809, 1895, ..., 2229, 2230,    0],
       [4802, 1594, 1596, ..., 3204, 3205,    0],
       [4802, 4710, 4521, ..., 3140, 3141,    0]], dtype=int64)

In [104]:
movie_df.iloc[genre_sim_sorted_ind[0], :]

Unnamed: 0,id,title,genres,vote_average,vote_count,popularity,keywords,overview,genres_literal
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]",7.2,11800,150.437577,"[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...",Action Adventure Fantasy Science Fiction
3494,27549,Beastmaster 2: Through the Portal of Time,"[Action, Adventure, Fantasy, Science Fiction]",4.6,17,1.478505,"[based on novel, time travel, sequel, psychotr...","Mark Singer returns as Dar, the warrior who ca...",Action Adventure Fantasy Science Fiction
813,1924,Superman,"[Action, Adventure, Fantasy, Science Fiction]",6.9,1022,48.507081,"[saving the world, journalist, dc comics, crim...",Mild-mannered Clark Kent works as a reporter a...,Action Adventure Fantasy Science Fiction
870,8536,Superman II,"[Action, Adventure, Fantasy, Science Fiction]",6.5,629,30.515175,"[saving the world, dc comics, sequel, superher...",Three escaped criminals from the planet Krypto...,Action Adventure Fantasy Science Fiction
46,127585,X-Men: Days of Future Past,"[Action, Adventure, Fantasy, Science Fiction]",7.5,6032,118.078691,"[1970s, mutant, time travel, marvel comic, bas...",The ultimate X-Men ensemble fights a war for t...,Action Adventure Fantasy Science Fiction
...,...,...,...,...,...,...,...,...,...
3041,27322,Love Jones,"[Comedy, Drama, Romance]",8.1,12,1.000178,"[sex, ex-boyfriend, independent film, african ...",Darius Lovehall is a young black poet in Chica...,Comedy Drama Romance
3039,75900,My Week with Marilyn,[Drama],6.6,406,21.006078,"[based on novel, biography, historical figure,...",Sir Laurence Olivier is making a movie in Lond...,Drama
3038,17710,Hey Arnold! The Movie,"[Animation, Family]",5.6,62,5.856363,[],When a powerful developer named Mr. Scheck wan...,Animation Family
3037,19905,"The Goods: Live Hard, Sell Hard",[Comedy],5.4,58,3.352702,[duringcreditsstinger],Who is Don Ready? Salesman? Lover? Song Stylis...,Comedy


## 유사한 영화 추천 함수

In [143]:
def find_sim_movie(df, sorted_ind, title_name, top_n):
    
    # movies_df에서 'title' 컬럼이 입력된 title_name 값인 DataFrame추출
    title_movie = df[df['title'] == title_name]
    
    # title_named을 가진 DataFrame의 index 객체를 ndarray로 반환하고 
    # sorted_ind 인자로 입력된 genre_sim_sorted_ind 객체에서 유사도 순으로 top_n 개의 index 추출
    title_index = title_movie.index.values
    similar_indexes = sorted_ind[title_index, : top_n]
    
    #dataframe에서 index로 사용하기 위해서 1차원 array로 변경
    similar_indexes = similar_indexes.reshape(-1)
    
    return df.iloc[similar_indexes] 

In [145]:
find_sim_movie(movie_df, genre_sim_sorted_ind, 'The Godfather', 10)[['title', 'vote_average']]

Unnamed: 0,title,vote_average
2731,The Godfather: Part II,8.3
1243,Mean Streets,7.2
3636,Light Sleeper,5.7
1946,The Bad Lieutenant: Port of Call - New Orleans,6.0
2640,Things to Do in Denver When You're Dead,6.7
4065,Mi America,0.0
1847,GoodFellas,8.2
4217,Kids,6.8
883,Catch Me If You Can,7.7
3866,City of God,8.1


In [153]:
movie_df[:2]

Unnamed: 0,id,title,genres,vote_average,vote_count,popularity,keywords,overview,genres_literal,weighted_vote
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]",7.2,11800,150.437577,"[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...",Action Adventure Fantasy Science Fiction,4928.15304
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]",6.9,4500,139.082615,"[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...",Adventure Fantasy Action,4817.103094


## 가중평점
(v/v+m)*R + (m/v+m)*C
- v:개별 영화에 투표한 횟수
- m: 최소 투표 횟수
- R: 개별 영화에 대한 평균 평점
- C: 전체 영화에 대한 평균 평점

In [155]:
C = movie_df['vote_average'].mean()
m = movie_df['vote_count'].quantile(0.6)
print(C, m)

6.092171559442011 370.1999999999998


In [156]:
def weighted_vote_average(record):
    v = record['vote_count']
    R = record['vote_average']
    
    return ((v/v+m)*R + (m/v+m)*C)

In [157]:
movie_df['vote_count'] = movie_df['vote_count'].replace(0,1)

In [160]:
movie_df['weighted_vote'] = movie_df.apply(weighted_vote_average, axis = 1)

In [164]:
movie_df.sort_values('weighted_vote', ascending=False)[:5]

Unnamed: 0,id,title,genres,vote_average,vote_count,popularity,keywords,overview,genres_literal,weighted_vote
4662,40963,Little Big Top,[Comedy],10.0,1,0.0921,[independent film],An aging out of work clown returns to his smal...,Comedy,8222.643823
4045,78373,"Dancer, Texas Pop. 81","[Comedy, Drama, Family]",10.0,1,0.376662,"[small town, texas]","Four guys, best friends, have grown up togethe...",Comedy Drama Family,8222.643823
3519,89861,Stiff Upper Lips,[Comedy],10.0,1,0.356495,"[italy, victorian england, young woman]",Stiff Upper Lips is a broad parody of British ...,Comedy,8222.643823
4405,357441,Karachi se Lahore,"[Family, Comedy]",8.0,1,0.060003,[],A road trip from Karachi to Lahore where 5 fri...,Family Comedy,7480.243823
3875,108346,Dreaming of Joseph Lees,"[Romance, Drama]",8.0,1,0.11615,"[lust, love crime]",Set in rural England in the 1950s Eva (Samanth...,Romance Drama,7480.243823


# 협업 필터링

## 아이템 기반 최근접 이웃 협업 필터링

In [1]:
import pandas as pd
import numpy as np

In [2]:
movies = pd.read_csv('ml_data/movies.csv')
ratings = pd.read_csv('ml_data/ratings.csv')
print(movies.columns, ratings.columns)

Index(['movieId', 'title', 'genres'], dtype='object') Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')


In [5]:
ratings.pivot_table(index = 'userId', columns = 'movieId', values = 'rating')[:3]

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,


In [6]:
ratings_movies = pd.merge(ratings, movies, on = 'movieId')
ratings_movies[:2]

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [16]:
ratings_item  = ratings_movies.pivot_table(index = 'title', columns = 'userId', values = 'rating')
ratings_item[:2]

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,4.0
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,


In [17]:
ratings_item = ratings_item.fillna(0)
ratings_item[:2]

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
from sklearn.metrics.pairwise import cosine_similarity

item_sim = cosine_similarity(ratings_item, ratings_item)
item_sim_df = pd.DataFrame(data = item_sim, index = ratings_item.index, columns = ratings_item.index)
item_sim_df[:3]

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141653,0.0,...,0.0,0.342055,0.543305,0.707107,0.0,0.0,0.139431,0.327327,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,1.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.707107,1.0,0.0,0.0,0.0,0.176777,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
item_sim_df['Godfather, The (1972)'].sort_values(ascending=False)

title
Godfather, The (1972)                                       1.000000
Godfather: Part II, The (1974)                              0.821773
Goodfellas (1990)                                           0.664841
One Flew Over the Cuckoo's Nest (1975)                      0.620536
Star Wars: Episode IV - A New Hope (1977)                   0.595317
                                                              ...   
Going Clear: Scientology and the Prison of Belief (2015)    0.000000
T-Rex: Back to the Cretaceous (1998)                        0.000000
Golmaal (2006)                                              0.000000
Good Copy Bad Copy (2007)                                   0.000000
À nous la liberté (Freedom for Us) (1931)                   0.000000
Name: Godfather, The (1972), Length: 9719, dtype: float64

## 잠재 요인 협업 필터링