# 코사인 유사도 - 영화 추천 시스템(줄거리, 감독, 주연배우 포함)

In [1]:
import numpy as np
import pandas as pd
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides pandas' chained-assignment
# warnings from the cells below — confirm that is intended.
warnings.filterwarnings('ignore')

In [2]:
# Load the per-movie credits (cast/crew) and the movie metadata tables,
# then preview the first two credit rows.
info = pd.read_csv('data/credits.csv')
movie = pd.read_csv('data/movies_metadata.csv', low_memory=False)
info.head(2)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844


In [3]:
# Compare the (rows, columns) of the two raw tables.
movie.shape, info.shape

((45466, 24), (45476, 3))

In [4]:
# Keep only the identifier, title and plot-summary columns.
df = movie[['id','title','overview']]
# Inspect the record whose id is a date string instead of a number; the
# displayed row has no title, so it looks like a malformed CSV line.
df[df.id=='1997-08-20']

Unnamed: 0,id,title,overview
19730,1997-08-20,,Released


In [5]:
# Drop every row with a missing id, title or overview, then convert the ids
# to integers so they can be joined against the credits table.
# The result is rebound to an explicit copy rather than mutated in place:
# `df` was produced by selecting columns from `movie`, and in-place edits on
# such a selection are the chained-assignment pattern pandas warns about.
df = df.dropna().copy()
df['id'] = df['id'].astype(int)

In [6]:
# Give the credits key the same integer dtype, attach cast/crew to each
# movie by id, and use the movie id as the row index.
info['id'] = info['id'].astype(int)
df = df.merge(info, on='id').set_index('id')
df.head(3)

Unnamed: 0_level_0,title,overview,cast,crew
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."


In [7]:
# Work with the first 20,000 movies only (presumably to keep the pairwise
# similarity matrix built later at a manageable size — confirm).
# The explicit copy detaches the subset from the merged frame, so the column
# assignments in the following cells write to an independent DataFrame.
df = df.head(20000).copy()

## 데이터 전처리

- 주연배우

In [8]:
from ast import literal_eval

# Each cast cell is the text form of a Python list of dicts; parse it into
# real objects so the entries can be iterated.
df['cast'] = df['cast'].apply(literal_eval)
df.head(3)

Unnamed: 0_level_0,title,overview,cast,crew
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."


In [9]:
import re

def get_3cast(x):
    """Return the first three distinct cast names as one space-separated string.

    Spaces inside each name are removed so that a full name becomes a single
    token (e.g. ``'Tom Hanks'`` -> ``'TomHanks'``).

    Args:
        x: Iterable of cast entries, each a mapping with a ``'name'`` key,
            in billing order.

    Returns:
        Up to three collapsed names joined by single spaces; an empty string
        when ``x`` is empty.
    """
    names = []
    for item in x:
        name = item['name']
        if name in names:
            continue  # the same person can be credited more than once
        names.append(name)
        if len(names) == 3:
            break  # only the top three are used, so stop scanning early
    return ' '.join(name.replace(' ', '') for name in names)

In [10]:
# Reduce every parsed cast list to its top-three-names string.
df['cast3'] = df['cast'].apply(get_3cast)

- 감독

In [11]:
# Parse the crew cells from their text form into Python objects, as was
# done for the cast column.
df['crew'] = df['crew'].apply(literal_eval)

In [12]:
def get_director(x):
    """Return the name of the first crew member whose job is 'Director'.

    Spaces are stripped from the name so it forms a single token. An empty
    string is returned when no entry has the 'Director' job.
    """
    directors = (
        member['name'].replace(' ','')
        for member in x
        if member['job'] == 'Director'
    )
    return next(directors, '')

In [13]:
# Extract the director token from every parsed crew list.
df['director'] = df['crew'].apply(get_director)

In [14]:
# Move the movie id back into a regular column so the rows are numbered
# 0..n-1 by position.
df = df.reset_index()
df.head(3)

Unnamed: 0,id,title,overview,cast,crew,cast3,director
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",TomHanks TimAllen DonRickles,JohnLasseter
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",RobinWilliams JonathanHyde KirstenDunst,JoeJohnston
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",WalterMatthau JackLemmon Ann-Margret,HowardDeutch


In [15]:
# Build one text document per movie: plot summary, then the director token,
# then the top-cast tokens, separated by single spaces.
df['total'] = df['overview'] + ' ' + df['director'] + ' ' + df['cast3']
df.head(3)

Unnamed: 0,id,title,overview,cast,crew,cast3,director,total
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",TomHanks TimAllen DonRickles,JohnLasseter,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",RobinWilliams JonathanHyde KirstenDunst,JoeJohnston,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",WalterMatthau JackLemmon Ann-Margret,HowardDeutch,A family wedding reignites the ancient feud be...


## 텍스트 변환

In [16]:
# Overview + director + lead actors
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn each combined document into a TF-IDF vector, ignoring English stop words.
tvect = TfidfVectorizer(stop_words='english')
tfidf_matrix = tvect.fit_transform(df.total)
# (number of movies, vocabulary size)
tfidf_matrix.shape

(20000, 76313)

## 영화의 타이틀과 인덱스를 가진 테이블 생성

In [17]:
# Lookup table from movie title to its row number in df.
# Duplicates must be removed on the *titles* (the Series index):
# Series.drop_duplicates() compares the values, which here are row numbers
# and already unique, so it would leave repeated titles in place and a
# lookup of such a title would return several rows instead of one.
# The first occurrence of each title is kept.
indices = pd.Series(df.index, index=df.title)
indices = indices[~indices.index.duplicated(keep='first')]
indices.head()

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64

## 코사인 유사도를 통해 유사한 영화를 찾는 함수 생성

In [18]:
from sklearn.metrics.pairwise import linear_kernel
# Pairwise dot products between every pair of TF-IDF rows.
# NOTE(review): this equals cosine similarity only if the rows are
# L2-normalised (TfidfVectorizer's documented default norm) — confirm.
# NOTE(review): the result is movies x movies; for 20,000 rows a dense
# float64 array would need roughly 3.2 GB — verify memory headroom.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [19]:
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the titles of the ten movies most similar to ``title``.

    Args:
        title: Exact movie title to look up in ``indices``.
        cosine_sim: Square similarity matrix whose row/column ``i``
            corresponds to row ``i`` of ``df``.

    Returns:
        A pandas Series of up to ten titles, most similar first, indexed by
        their row number in ``df``.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title shared by several movies makes the lookup return a Series of
    # row numbers; use the first one so a single similarity row is selected.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie by its row number instead of assuming it is the
    # top-ranked entry (another row can tie with it at the maximum score).
    movie_indices = [row for row, _ in ranked if row != idx][:10]
    return df.title.iloc[movie_indices]

In [20]:
# Example query: recommendations for a title with many franchise siblings.
get_recommendations('The Dark Knight Rises')

12491                            The Dark Knight
149                               Batman Forever
1323                              Batman Returns
15509                 Batman: Under the Red Hood
10128                              Batman Begins
583                                       Batman
9237          Batman Beyond: Return of the Joker
18002                           Batman: Year One
3088                Batman: Mask of the Phantasm
19733    Batman: The Dark Knight Returns, Part 1
Name: title, dtype: object

In [21]:
# Example query: recommendations for 'Toy Story'.
get_recommendations('Toy Story')

15347               Toy Story 3
2990                Toy Story 2
10307    The 40 Year Old Virgin
8338                  The Champ
1067      Rebel Without a Cause
1924                  Condorman
11412    For Your Consideration
11618              Factory Girl
17171                 Group Sex
483                      Malice
Name: title, dtype: object