In [60]:
import warnings
warnings.filterwarnings("ignore")
import pandas_datareader.data as web
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib
matplotlib.rcParams['axes.unicode_minus'] = False

%matplotlib inline

from fbprophet import Prophet
from datetime import datetime
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import ast

In [2]:
# Work around Korean glyphs not rendering in matplotlib figures
import platform

from matplotlib import font_manager, rc
# plt.rcParams['axes.unicode_minus'] = False

if platform.system() == 'Windows':
    # Resolve the font family name from the font file and make it the default
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
elif platform.system() == 'Darwin':
    rc('font', family='AppleGothic')
else:
    # No font is configured for any other platform; only a notice is printed
    print('Unknown system... sorry~~~~')

In [21]:
# Load the movie metadata table from the CSV file under ./data
data = pd.read_csv('./data/movies_metadata.csv')

In [22]:
# Preview the first rows of the raw table
data.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [23]:
# (rows, columns) of the raw table
data.shape

(45466, 24)

In [67]:
# Preprocessing: select the columns of interest from the raw table
# NOTE(review): the vote_count filtering cell re-slices `data` rather than this frame,
# so this column selection does not appear to carry through — confirm intent
fix_data = data[['id','genres','vote_average','vote_count','popularity','title','tagline','overview']]

In [68]:
# Compensate for rating imbalance caused by differing vote counts (IMDB weighted-rating scheme)
# r: a movie's own rating, v: number of votes cast for that movie,
# m: minimum number of votes required to qualify, c: mean rating
m = fix_data['vote_count'].quantile(0.99)
# Keep only movies with at least m votes. The explicit .copy() makes fix_data an
# independent frame, so the column assignments in later cells (score, genres) write
# to it directly instead of to a slice of `data` (the SettingWithCopy pattern).
fix_data = data.loc[data['vote_count'] >= m].copy()

In [69]:
fix_data.shape # goal: end up within the top 500 titles

(455, 24)

In [70]:
# Mean vote_average over the rows currently in fix_data; weighted_rating uses it as `c`
# NOTE(review): if fix_data has already been cut down by the vote_count threshold, this is
# the mean of that subset rather than of all movies — confirm this is intended
c = fix_data['vote_average'].mean()

In [40]:
# Show the mean rating (c) and the vote-count threshold (m)
print(c)
print(m)

6.9958241758241755
2183.8199999999924


In [71]:
def weighted_rating(x, m=m, c=c):
    """Blend a movie's own rating with the mean rating, weighted by its vote count.

    x is a row-like object providing 'vote_count' and 'vote_average'.
    m is the vote-count threshold and c the mean rating; both default to the
    values the module-level names m and c hold when this function is defined.
    Returns (v / (v + m)) * r + (m / (v + m)) * c.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    own_part = votes / (votes + m) * rating
    prior_part = m / (m + votes) * c
    return own_part + prior_part

In [72]:
# Apply weighted_rating to every row and store the result in a new 'score' column
fix_data['score'] = fix_data.apply(weighted_rating, axis = 1)

In [45]:
# Preview the first rows, now including the 'score' column
fix_data.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,score
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,7.497627
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,6.945523
31,False,,29500000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",,63,tt0114746,en,Twelve Monkeys,"In the year 2035, convict James Cole reluctant...",...,168840000.0,129.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,The future is history.,Twelve Monkeys,False,7.4,2470.0,7.210339
46,False,,33000000,"[{'id': 80, 'name': 'Crime'}, {'id': 9648, 'na...",http://www.sevenmovie.com/,807,tt0114369,en,Se7en,Two homicide detectives are on a desperate hun...,...,327311859.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Seven deadly sins. Seven ways to die.,Se7en,False,8.1,5915.0,7.802263
49,False,,6000000,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",http://www.mgm.com/#/our-titles/2083/The-Usual...,629,tt0114814,en,The Usual Suspects,"Held in an L.A. interrogation room, Verbal Kin...",...,23341568.0,106.0,"[{'iso_639_1': 'es', 'name': 'Español'}, {'iso...",Released,Five Criminals. One Line Up. No Coincidence.,The Usual Suspects,False,8.1,3334.0,7.662994


In [47]:
# (rows, columns) of the filtered table
fix_data.shape

(455, 25)

In [65]:
# Peek at the genres column
fix_data[['genres']].head()

Unnamed: 0,genres
0,Animation Comedy Family
1,Adventure Fantasy Family
31,Science Fiction Thriller Mystery
46,Crime Mystery Thriller
49,Drama Crime Thriller


In [73]:
# Parse each genres string as a Python literal
# NOTE(review): this cell does not look re-runnable once genres has been flattened to plain
# names — ast.literal_eval would reject a value such as 'Animation Comedy Family'
fix_data['genres'] = fix_data['genres'].apply(ast.literal_eval)

In [74]:
# Peek at the genres column after parsing
fix_data[['genres']].head()

Unnamed: 0,genres
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
31,"[{'id': 878, 'name': 'Science Fiction'}, {'id'..."
46,"[{'id': 80, 'name': 'Crime'}, {'id': 9648, 'na..."
49,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name..."


In [75]:
# Collapse each row's list of genre dicts into one space-separated string of genre names
fix_data['genres'] = fix_data['genres'].apply(
    lambda genre_entries: " ".join(entry['name'] for entry in genre_entries)
)

In [76]:
# Preview the table with genres flattened to plain strings
fix_data.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,score
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,Animation Comedy Family,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,7.497627
1,False,,65000000,Adventure Fantasy Family,,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,6.945523
31,False,,29500000,Science Fiction Thriller Mystery,,63,tt0114746,en,Twelve Monkeys,"In the year 2035, convict James Cole reluctant...",...,168840000.0,129.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,The future is history.,Twelve Monkeys,False,7.4,2470.0,7.210339
46,False,,33000000,Crime Mystery Thriller,http://www.sevenmovie.com/,807,tt0114369,en,Se7en,Two homicide detectives are on a desperate hun...,...,327311859.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Seven deadly sins. Seven ways to die.,Se7en,False,8.1,5915.0,7.802263
49,False,,6000000,Drama Crime Thriller,http://www.mgm.com/#/our-titles/2083/The-Usual...,629,tt0114814,en,The Usual Suspects,"Held in an L.A. interrogation room, Verbal Kin...",...,23341568.0,106.0,"[{'iso_639_1': 'es', 'name': 'Español'}, {'iso...",Released,Five Criminals. One Line Up. No Coincidence.,The Usual Suspects,False,8.1,3334.0,7.662994


In [95]:
# Content-based filtering
fix_data.genres.head()

0              Animation Comedy Family
1             Adventure Fantasy Family
31    Science Fiction Thriller Mystery
46              Crime Mystery Thriller
49                Drama Crime Thriller
Name: genres, dtype: object

In [79]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer(ngram_range=(1,3)) # vectorizer over 1- to 3-word n-grams

In [80]:
# Fit the vectorizer on the genre strings and build the n-gram count matrix
c_vector_genres = count_vector.fit_transform(fix_data['genres'])

In [81]:
# (rows, n-gram features) of the count matrix
c_vector_genres.shape

(455, 354)

In [83]:
from sklearn.metrics.pairwise import cosine_similarity
# Precompute, for every row, the positions of all rows ordered from most to least similar.
# argsort orders ascending and [:, ::-1] reverses each row, so this array holds
# row positions (indices), not the cosine similarity values themselves.
genre_c_sim = cosine_similarity(c_vector_genres, c_vector_genres).argsort()[:,::-1]

In [84]:
# Shape of the similarity-ranking array
genre_c_sim.shape

(455, 455)

In [88]:
def get_recommend_movie_list(df, movie_title, top=30, sim_matrix=None):
    """Recommend movies whose genres are most similar to ``movie_title``.

    Parameters
    ----------
    df : pandas.DataFrame
        Movie table with at least ``title`` and ``score`` columns. Its rows must be
        in the same order as the rows of ``sim_matrix``; the index labels are
        irrelevant because every lookup here is positional.
    movie_title : str
        Exact title to look up in ``df['title']``.
    top : int, default 30
        Number of most-similar candidates taken per matching row before ranking.
    sim_matrix : 2-D integer array, optional
        Row ``i`` lists row positions ordered from most to least similar to row
        ``i``. Defaults to the module-level ``genre_c_sim``.

    Returns
    -------
    pandas.DataFrame
        At most 10 rows of ``df`` (the target itself excluded), sorted by ``score``
        in descending order. Empty when no row has the requested title.
    """
    if sim_matrix is None:
        sim_matrix = genre_c_sim
    sim_matrix = np.asarray(sim_matrix)

    # Row POSITIONS of the target movie. Index labels must not be used here: the
    # similarity matrix and df.iloc below are positional, and df may carry
    # arbitrary labels after filtering.
    target_pos = np.flatnonzero((df['title'] == movie_title).to_numpy())

    # Positions of the `top` most similar rows for every matching target row
    candidates = sim_matrix[target_pos, :top].reshape(-1)

    # Exclude the target itself (also correct when several rows share the title)
    candidates = candidates[~np.isin(candidates, target_pos)]

    # Drop repeats that arise when several target rows share neighbours, keeping order
    candidates = list(dict.fromkeys(candidates.tolist()))

    # Rank the candidates by weighted score and return the best 10
    result = df.iloc[candidates].sort_values('score', ascending=False)[:10]
    return result

In [96]:
get_recommend_movie_list(fix_data, movie_title='Toy Story') # recommend movies with genres similar to Toy Story

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,score
13724,False,,175000000,Animation Comedy Family Adventure,http://disney.go.com/disneypictures/up/,14160,tt1049413,en,Up,Carl Fredricksen spent his entire life dreamin...,...,735099100.0,96.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Up,False,7.8,7048.0,7.609769
6232,False,"{'id': 137697, 'name': 'Finding Nemo Collectio...",94000000,Animation Family,http://movies.disney.com/finding-nemo,12,tt0266543,en,Finding Nemo,"Nemo, an adventurous young clownfish, is unexp...",...,940335500.0,100.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,There are 3.7 trillion fish in the ocean. They...,Finding Nemo,False,7.6,6292.0,7.444332
15348,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",200000000,Animation Family Comedy,http://disney.go.com/toystory/,10193,tt0435761,en,Toy Story 3,"Woody, Buzz, and the rest of Andy's toys haven...",...,1066970000.0,103.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,No toy gets left behind.,Toy Story 3,False,7.6,4710.0,7.40861
4756,False,"{'id': 137696, 'name': 'Monsters, Inc. Collect...",115000000,Animation Comedy Family,http://movies.disney.com/monsters-inc,585,tt0198781,en,"Monsters, Inc.","James Sullivan and Mike Wazowski are monsters,...",...,562816300.0,92.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,We Scare Because We Care.,"Monsters, Inc.",False,7.5,6150.0,7.367884
23557,False,"{'id': 89137, 'name': 'How to Train Your Drago...",145000000,Fantasy Action Adventure Animation Comedy Family,http://www.howtotrainyourdragon.com/,82702,tt1646971,en,How to Train Your Dragon 2,The thrilling second chapter of the epic How T...,...,609123000.0,102.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The training is over.,How to Train Your Dragon 2,False,7.6,3163.0,7.353234
11567,False,,150000000,Animation Comedy Family Fantasy,http://disney.go.com/disneypictures/ratatouille/,2062,tt0382932,en,Ratatouille,A rat named Remy dreams of becoming a great Fr...,...,623722800.0,111.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,He's dying to become a chef.,Ratatouille,False,7.5,4510.0,7.335516
22718,False,"{'id': 325470, 'name': 'The Lego Movie Collect...",60000000,Adventure Animation Comedy Family Fantasy,http://www.thelegomovie.com,137106,tt1490017,en,The Lego Movie,"An ordinary Lego mini-figure, mistakenly thoug...",...,469160700.0,100.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The story of a nobody who saved everybody.,The Lego Movie,False,7.5,3127.0,7.292682
16130,False,"{'id': 463959, 'name': 'Tangled Collection', '...",260000000,Animation Family,http://disney.go.com/disneypictures/tangled/,38757,tt0398286,en,Tangled,When the kingdom's most wanted-and most charmi...,...,591794900.0,100.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,They're taking adventure to new lengths.,Tangled,False,7.4,3419.0,7.242464
4178,False,"{'id': 2150, 'name': 'Shrek Collection', 'post...",60000000,Adventure Animation Comedy Family Fantasy,http://www.shrek.com/,808,tt0126029,en,Shrek,It ain't easy bein' green -- especially if you...,...,484409200.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The greatest fairy tale never told.,Shrek,False,7.3,4183.0,7.195668
2997,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",90000000,Animation Comedy Family,http://toystory.disney.com/toy-story-2,863,tt0120363,en,Toy Story 2,"Andy heads off to Cowboy Camp, leaving his toy...",...,497366900.0,92.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The toys are back!,Toy Story 2,False,7.3,3914.0,7.191065
