In [89]:
import pandas as pd
import numpy as np
import ast
import warnings; warnings.filterwarnings('ignore')

from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

movies = pd.read_csv("/content/tmdb_5000_movies.csv")
credits = pd.read_csv("/content/tmdb_5000_credits.csv")

In [90]:
#movie 데이터셋과 credit 데이터셋의 병합
movie_credits = pd.merge(movies, credits, left_on='id', right_on='movie_id', how='left') # 같은 id 기준으로 데이터 읽기
movie_credits = pd.merge(movies, credits, left_on='original_title', right_on='title', how='left')
movie_credits = movie_credits.drop(columns=['homepage', 'status', 'production_companies', 'production_countries',
                                            'cast', 'crew', 'original_language', 'tagline', 'revenue',
                                            'budget', 'spoken_languages','runtime', 'release_date'])

#병합된 데이터셋에서 필요한 항목만 추출
movie_credits_df = movie_credits[['id', 'genres', 'original_title', 'vote_average', 'vote_count', 'popularity', 'keywords']]
movie_credits.info()
# movie_credits.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4808 entries, 0 to 4807
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   genres          4808 non-null   object 
 1   id              4808 non-null   int64  
 2   keywords        4808 non-null   object 
 3   original_title  4808 non-null   object 
 4   overview        4805 non-null   object 
 5   popularity      4808 non-null   float64
 6   title_x         4808 non-null   object 
 7   vote_average    4808 non-null   float64
 8   vote_count      4808 non-null   int64  
 9   movie_id        4547 non-null   float64
 10  title_y         4547 non-null   object 
dtypes: float64(3), int64(2), object(6)
memory usage: 413.3+ KB


In [91]:
# genre, keyword등의 칼럼 형태 파악

pd.set_option('max_colwidth', 100)
movie_credits_df[['genres', 'keywords']][:1]

Unnamed: 0,genres,keywords
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""sp..."


In [92]:
import json
# genres column에 genre 정보 이외에 id 정보도 포함되어 있어 genre 정보만 따로 추출하여 새로운 column으로 생성
def add_genre_name(j):
  genres = []
  ar = json.loads(j) # json string을 python dictionary로 변환
  for a in ar:
    genres.append(a.get("name"))
  return " ".join(sorted(genres))

# movies중 genres에 함수 적용
movies['genres_name'] = movies.apply(lambda x: add_genre_name(x.genres), axis=1)
movies[['genres_name']].head()

Unnamed: 0,genres_name
0,Action Adventure Fantasy Science Fiction
1,Action Adventure Fantasy
2,Action Adventure Crime
3,Action Crime Drama Thriller
4,Action Adventure Science Fiction


In [93]:
movie_credits_df['genres'] = movie_credits_df['genres'].apply(literal_eval)
movie_credits_df['keywords'] = movie_credits_df['keywords'].apply(literal_eval)

In [94]:
# apply lambda -> genre, keyword column에서 name 값 추출

movie_credits_df['genres'] = movie_credits_df['genres'].apply(lambda x : [ y['name'] for y in x])
movie_credits_df['keywords'] = movie_credits_df['keywords'].apply(lambda x : [y['name'] for y in x])
movie_credits_df[['genres', 'keywords']][:5]

Unnamed: 0,genres,keywords
0,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colony, society, space travel, futuristic, romance, spa..."
1,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india trading company, love of one's life, traitor, ship..."
2,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi6, british secret service, united kingdom]"
3,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret identity, burglar, hostage drama, time bomb, gotham..."
4,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel, princess, alien, steampunk, martian, escape, edg..."


In [95]:
# TF-IDF 벡터라이저를 사용하여 장르를 벡터화 
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [96]:
# CountVectorizer로 변환 -> 개별 단어 feature로 구성
movie_credits_df['genres_literal'] = movie_credits_df['genres'].apply(lambda x: (' ').join(x))

count_vect = CountVectorizer(min_df=1, ngram_range=(1, 2))
genre_mat = count_vect.fit_transform(movie_credits_df['genres_literal'])
print(genre_mat.shape)

(4808, 276)


In [97]:
# consine_similarity: 코사인 유사도 계산

genre_sim = cosine_similarity(genre_mat, genre_mat)
print(genre_sim.shape)
print(genre_sim[:2])

(4808, 4808)
[[1.         0.59628479 0.4472136  ... 0.         0.         0.        ]
 [0.59628479 1.         0.4        ... 0.         0.         0.        ]]


In [98]:
# genre_sim -> movies_df의 genre_mat의 데이터 별 유사도 정보
# genre_sim_sorted_ind -> 각 레코드의 장르 cosine 유사도가 가장 높은 순의 index 값
# argsort() 함수 -> 유사도가 높은 순으로 정렬

genre_sim_sorted_ind = genre_sim.argsort()[:, ::-1]
print(genre_sim_sorted_ind[:1])

[[   0   14  870 ... 3041 3040 2403]]


In [99]:
# find_sim_movie(): 장르 유사도에 따라 영화 추천
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
  # title column이 입력된 title_name의 값 출력
  title_movie = df[df['original_title'] == title_name]
  # title named를 가진 index를 ndarray로 변환 후
  # genre_sim_sorted_ind에서 유사도 순으로 n개의 index 추출
  title_index = title_movie.index.values
  similar_indexes = sorted_ind[title_index, :(top_n)]

  # 추출된 top_n_index 출력 
  # 2차원 데이터 형태임으로 movies에서 새롭게 사용하기 위해 1차원 array로 변환
  print(similar_indexes)
  similar_indexes = similar_indexes.reshape(-1)

  return df.iloc[similar_indexes]

In [100]:
# find_sim_movie()를 통해 영화 10개 추천

similar_movies = find_sim_movie(movie_credits_df, genre_sim_sorted_ind, 'Avatar', 10)
similar_movies[['original_title', 'vote_average', 'genres']]

[[   0   14  870  813   46 3496 1297 1654  419  420]]


Unnamed: 0,original_title,vote_average,genres
0,Avatar,7.2,"[Action, Adventure, Fantasy, Science Fiction]"
14,Man of Steel,6.5,"[Action, Adventure, Fantasy, Science Fiction]"
870,Superman II,6.5,"[Action, Adventure, Fantasy, Science Fiction]"
813,Superman,6.9,"[Action, Adventure, Fantasy, Science Fiction]"
46,X-Men: Days of Future Past,7.5,"[Action, Adventure, Fantasy, Science Fiction]"
3496,Beastmaster 2: Through the Portal of Time,4.6,"[Action, Adventure, Fantasy, Science Fiction]"
1297,Superman III,5.3,"[Comedy, Action, Adventure, Fantasy, Science Fiction]"
1654,Dragonball Evolution,2.9,"[Action, Adventure, Fantasy, Science Fiction, Thriller]"
419,Jumper,5.9,"[Adventure, Fantasy, Science Fiction]"
420,Hellboy II: The Golden Army,6.5,"[Adventure, Fantasy, Science Fiction]"


In [101]:
# sort_values() -> vote_average 내림차순 10개 추출

movie_credits_df[['original_title', 'vote_average', 'vote_count', 'genres']].sort_values('vote_average', ascending=False)[:10]

Unnamed: 0,original_title,vote_average,vote_count,genres
4251,Me You and Five Bucks,10.0,2,"[Romance, Comedy, Drama]"
4049,"Dancer, Texas Pop. 81",10.0,1,"[Comedy, Drama, Family]"
3521,Stiff Upper Lips,10.0,1,[Comedy]
4667,Little Big Top,10.0,1,[Comedy]
3996,Sardaarji,9.5,2,[]
2388,One Man's Hero,9.3,2,"[Western, Action, Drama, History]"
2972,There Goes My Baby,8.5,2,"[Drama, Comedy]"
1883,The Shawshank Redemption,8.5,8205,"[Drama, Crime]"
3339,The Godfather,8.4,5893,"[Drama, Crime]"
2798,The Prisoner of Zenda,8.4,11,"[Adventure, Drama, Romance]"


In [102]:
# m: 전체 중 상위 70%에 해당하는 횟수

C = movie_credits_df['vote_average'].mean()
m = movie_credits_df['vote_count'].quantile(0.7)
print('C: ', round(C, 3), 'm:', round(m, 3))

C:  6.092 m: 581.0


In [103]:
# 새로운 평점 정보: vote_weighted & 함수 명: weighted_vote_average()
# vote_count, vote_average, m, C 값을 바탕으로 record별 평점 반환
# V: vote_count, R: vote_average

def weighted_vote_average(record):
  V = record['vote_count']
  R = record['vote_average']

  return ((V/(V+m)) * R) + ((m/(m+V)) * C)

movie_credits_df['weighted_vote'] = movie_credits_df.apply(weighted_vote_average, axis=1)

In [105]:
# 새롭게 부여된 weighted_vote 평점 순으로 10개 추출
# weighted_vote를 기반으로 'Avatar'와 유사한 영화들을 높은 순서로 추천
def find_sim_movie_by_weighted_vote(df, sorted_ind, title_name, top_n=10):
    title_movie = df[df['original_title'] == title_name]

    # 해당 영화의 인덱스 추출
    title_index = title_movie.index.values
    similar_indexes = sorted_ind[title_index, :(top_n * 2)]
    similar_indexes = similar_indexes.reshape(-1)

    # 본인 제외하고 top_n개 유사 영화 추천
    similar_movies = df.iloc[similar_indexes].drop(title_index)

    # weighted_vote 기준으로 상위 top_n개 반환
    return similar_movies.sort_values('weighted_vote', ascending=False).head(top_n)

# 'Avatar'와 유사한 영화 추천
similar_movies_by_weighted_vote = find_sim_movie_by_weighted_vote(movie_credits_df, genre_sim_sorted_ind, 'Avatar', 10)
similar_movies_by_weighted_vote[['original_title', 'vote_average', 'weighted_vote', 'genres']]

# movie_credits_df[['original_title', 'vote_average', 'weighted_vote', 'vote_count', 'genres']].sort_values('weighted_vote', ascending = False)[:10]

Unnamed: 0,original_title,vote_average,weighted_vote,genres
46,X-Men: Days of Future Past,7.5,7.376331,"[Action, Adventure, Fantasy, Science Fiction]"
158,Star Trek,7.4,7.251006,"[Science Fiction, Action, Adventure]"
813,Superman,6.9,6.607285,"[Action, Adventure, Fantasy, Science Fiction]"
14,Man of Steel,6.5,6.465876,"[Action, Adventure, Fantasy, Science Fiction]"
420,Hellboy II: The Golden Army,6.5,6.387655,"[Adventure, Fantasy, Science Fiction]"
870,Superman II,6.5,6.304279,"[Action, Adventure, Fantasy, Science Fiction]"
232,The Wolverine,6.3,6.27397,"[Action, Science Fiction, Adventure, Fantasy]"
3210,Star Wars: Clone Wars (Volume 1),8.0,6.177101,"[Action, Adventure, Animation, Fantasy, Science Fiction]"
1192,Small Soldiers,6.2,6.142745,"[Comedy, Adventure, Fantasy, Science Fiction, Action]"
1934,Sheena,5.0,6.052533,"[Action, Adventure, Comedy, Fantasy, Science Fiction]"
