In [15]:
# =========================================================================== [ setting ] =============================================================
import pandas as pd
import numpy as np 
import warnings 
warnings.filterwarnings("ignore")

# Load the raw TMDB table, then keep only the columns this notebook uses.
movies = pd.read_csv("./tmdb_5000_movies.csv")
# .copy() makes movies_df an independent frame: later cells add and overwrite
# columns on it, and assigning into a bare column slice of `movies` triggers
# SettingWithCopyWarning (hidden here by the blanket warnings filter).
movies_df = movies[["id", "title", "genres", "vote_average", "vote_count", "popularity", "keywords", "overview"]].copy()

# Widen the column display so the JSON-like genres/keywords strings are readable.
pd.set_option("display.max_colwidth", 100)
movies_df[["genres", "keywords"]][:1]

Unnamed: 0,genres,keywords
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""sp..."


In [16]:
# =========================================================================== [ preprocessing ] =============================================================
from ast import literal_eval

def _extract_names(raw):
  """Return the list of "name" values stored in one genres/keywords cell.

  Accepts the raw CSV string (a Python-literal list of dicts), an already
  parsed list of dicts, or an already extracted list of names, so re-running
  this cell does not fail on data it has already converted.
  """
  # string -> Python objects; values that are not strings are used as they are
  items = literal_eval(raw) if isinstance(raw, str) else raw
  # keep only the name of each entry; plain names pass through unchanged
  return [item["name"] if isinstance(item, dict) else item for item in items]

for column in ("genres", "keywords"):
  movies_df[column] = movies_df[column].apply(_extract_names)

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Join each movie's genre names into one space-separated string for the vectorizer.
movies_df["genres_literal"] = movies_df["genres"].apply(lambda names: " ".join(names))
# Count unigrams and bigrams of the genre text; the resulting matrix is compared
# with cosine similarity in the next cell.
# min_df=1 keeps every term that occurs in at least one document. An integer
# min_df of 0 selects the same vocabulary but is rejected by scikit-learn's
# parameter validation in current releases.
count_vect = CountVectorizer(min_df=1, ngram_range=(1, 2))
genre_mat = count_vect.fit_transform(movies_df["genres_literal"])
print(genre_mat.shape)

(4803, 276)


In [19]:
# Row-by-row genre similarity: with a single argument, cosine_similarity
# compares the matrix against itself.
genre_sim = cosine_similarity(genre_mat)
print(genre_sim[:1])

[[1.         0.59628479 0.4472136  ... 0.         0.         0.        ]]


In [21]:
# For every row, column indexes ordered from most to least similar:
# argsort sorts ascending, so each row is flipped left-to-right.
genre_sim_sorted_ind = np.fliplr(np.argsort(genre_sim, axis=1))
print(genre_sim_sorted_ind[:1])

[[   0 3494  813 ... 3038 3037 2401]]


In [23]:
# =========================================================================== [ recommend model(1) ] =============================================================
def find_sim_movie(df, sorted_ind, title_name, top_n = 10) :
  """Return the rows of `df` most similar to the movie called `title_name`.

  Args:
    df: DataFrame with a "title" column; row i corresponds to row i of `sorted_ind`.
    sorted_ind: 2-D array whose row i lists row positions ordered from most
      to least similar to row i.
    title_name: exact title to look up.
    top_n: number of similar rows to take per matching title.

  Returns:
    The `df` rows at the top_n most similar positions. The queried movie is
    not filtered out, and an unknown title yields an empty DataFrame.
  """
  # Use row positions rather than index labels: both sorted_ind and df.iloc
  # are positional, so labels are only correct for a default RangeIndex.
  title_positions = (df["title"] == title_name).to_numpy().nonzero()[0]
  similar_indexes = sorted_ind[title_positions, :(top_n)]
  print(similar_indexes)

  # Flatten the (matches, top_n) selection into one list of positions.
  similar_indexes = similar_indexes.reshape(-1)
  return df.iloc[similar_indexes]

# Genre-only recommendations for one example title.
similar_movies = find_sim_movie(
  movies_df,
  genre_sim_sorted_ind,
  title_name="The Godfather",
  top_n=10,
)
similar_movies[["title", "vote_average"]]

[[2731 1243 3636 1946 2640 4065 1847 4217  883 3866]]


Unnamed: 0,title,vote_average
2731,The Godfather: Part II,8.3
1243,Mean Streets,7.2
3636,Light Sleeper,5.7
1946,The Bad Lieutenant: Port of Call - New Orleans,6.0
2640,Things to Do in Denver When You're Dead,6.7
4065,Mi America,0.0
1847,GoodFellas,8.2
4217,Kids,6.8
883,Catch Me If You Can,7.7
3866,City of God,8.1


In [24]:
# =========================================================================== [ recommend model(2) ] =============================================================
## + todo [ correct rating distortion with a weighted average ] ====================================
# m: the 60th-percentile vote count; C: the mean rating over all movies.
m = movies_df["vote_count"].quantile(q=0.6)
C = movies_df["vote_average"].mean()
print("C : ", round(C, 3), "m : ", round(m, 3))

C :  6.092 m :  370.2


In [31]:
# Globals read by weighted_vote_average:
#   C - mean rating over all movies
#   m - vote count at the chosen percentile
percentile = 0.6
C = movies["vote_average"].mean()
m = movies["vote_count"].quantile(q=percentile)

def weighted_vote_average(record) :
  """Blend one movie's rating with the global mean, weighted by its vote count.

  Reads the module-level `m` and `C`. `record` must provide "vote_count"
  and "vote_average". The result is
  (v / (v + m)) * R + (m / (m + v)) * C.
  """
  vote_count = record["vote_count"]
  rating = record["vote_average"]
  own_weight = vote_count / (vote_count + m)
  prior_weight = m / (m + vote_count)
  return own_weight * rating + prior_weight * C

# Score every row of the raw table; the resulting Series is aligned to
# movies_df by index on assignment.
movies_df["weighted_vote"] = movies.apply(weighted_vote_average, axis="columns")

In [32]:
# Ten best movies by weighted rating.
ranking_columns = ["title", "vote_average", "weighted_vote", "vote_count"]
movies_df[ranking_columns].sort_values(by="weighted_vote", ascending=False).head(10)

Unnamed: 0,title,vote_average,weighted_vote,vote_count
1881,The Shawshank Redemption,8.5,8.396052,8205
3337,The Godfather,8.4,8.263591,5893
662,Fight Club,8.3,8.216455,9413
3232,Pulp Fiction,8.3,8.207102,8428
65,The Dark Knight,8.2,8.13693,12002
1818,Schindler's List,8.3,8.126069,4329
3865,Whiplash,8.3,8.123248,4254
809,Forrest Gump,8.2,8.105954,7927
2294,Spirited Away,8.3,8.105867,3840
2731,The Godfather: Part II,8.3,8.079586,3338


In [33]:
def find_sim_movie(df, sorted_ind, title_name, top_n = 10) :
  """Recommend movies similar to `title_name`, ranked by weighted rating.

  Args:
    df: DataFrame with "title" and "weighted_vote" columns; row i
      corresponds to row i of `sorted_ind`.
    sorted_ind: 2-D array whose row i lists row positions ordered from most
      to least similar to row i.
    title_name: exact title to look up.
    top_n: number of recommendations to return.

  Returns:
    Up to top_n rows of `df`, excluding the queried movie itself, sorted by
    "weighted_vote" in descending order. An unknown title yields an empty
    DataFrame.
  """
  # Use row positions rather than index labels: both sorted_ind and df.iloc
  # are positional, so labels are only correct for a default RangeIndex.
  title_positions = (df["title"] == title_name).to_numpy().nonzero()[0]
  queried = set(title_positions.tolist())

  # Take twice as many candidates as requested so that enough remain after
  # the queried movie is dropped and the rest are re-ranked by rating.
  candidates = sorted_ind[title_positions, :(top_n * 2)].reshape(-1).tolist()
  # dict.fromkeys drops repeats while keeping similarity order. Repeats and
  # several queried positions only arise when the title matches more than one
  # row, a case the previous elementwise `!=` comparison could not handle.
  candidates = [pos for pos in dict.fromkeys(candidates) if pos not in queried]
  return df.iloc[candidates].sort_values("weighted_vote", ascending = False)[:top_n]

# Pass movies_df, not the raw `movies` frame: "weighted_vote" is only ever
# added to movies_df, so sorting `movies` by it raises KeyError on a fresh kernel.
similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, "The Godfather", 10)
similar_movies[["title", "vote_average", "weighted_vote"]]

Unnamed: 0,title,vote_average,weighted_vote
2731,The Godfather: Part II,8.3,8.079586
1847,GoodFellas,8.2,7.976937
3866,City of God,8.1,7.759693
1663,Once Upon a Time in America,8.2,7.657811
883,Catch Me If You Can,7.7,7.557097
281,American Gangster,7.4,7.141396
4041,This Is England,7.4,6.739664
1149,American Hustle,6.8,6.717525
1243,Mean Streets,7.2,6.626569
2839,Rounders,6.9,6.530427
