# CB 기반 추천

In [1]:
import pandas as pd
print(f'pandas v{pd.__version__}')

import sklearn
print(f'scikit-learn v{sklearn.__version__}')

# Only for specifying versions
import sys; print(f'Python v{sys.version}')

pandas v1.5.3
scikit-learn v1.2.0
Python v3.9.16 (main, May 17 2023, 17:49:16) [MSC v.1916 64 bit (AMD64)]


## 2.5 내용 기반 필터링 추천
- 내용 기반 필터링 (Content-based filtering: CB)
- CB 추천을 위해서는 텍스트 정보가 필요하다. 우리가 앞에서 사용했던 MovieLens 데이터에는 텍스트 정보가 없기 때문에 영화의 줄거리가 있는 다른 데이터(movies_metadata.csv)를 사용한다. 이 예에서는 TF-iDF 유사도를 사용한다.

In [2]:
# Data 읽기
movies = pd.read_csv('../Data/movies_metadata.csv', encoding='latin-1', low_memory=False)
movies = movies[['id', 'title', 'overview']]
movies.head(10)

Unnamed: 0,id,title,overview
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...
5,949,Heat,"Obsessive master thief, Neil McCauley leads a ..."
6,45325,Tom and Huck,"A mischievous young boy, Tom Sawyer, witnesses..."
7,9091,Sudden Death,International action superstar Jean Claude Van...
8,710,GoldenEye,James Bond must unmask the mysterious head of ...
9,9087,The American President,"Widowed U.S. president Andrew Shepherd, one of..."


In [3]:
len(movies)

45442

In [4]:
# 데이터 전처리
movies.drop_duplicates(inplace=True)
movies.dropna(inplace=True)
movies['overview'].fillna('', inplace=True)
len(movies)

44300

In [5]:
# 불용어를 english로 지정하고 tf-idf 계산
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['overview'])
tfidf_matrix.shape

(44300, 74686)

In [6]:
# Cosine 유사도 계산
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim = pd.DataFrame(cosine_sim, index=movies.index, columns=movies.index)
cosine_sim

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45432,45433,45434,45435,45436,45437,45438,45439,45440,45441
0,1.000000,0.014981,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.005955,0.000000
1,0.014981,1.000000,0.046968,0.000000,0.000000,0.050222,0.000000,0.102622,0.000000,0.007219,...,0.0,0.0,0.0,0.011276,0.0,0.000000,0.066866,0.0,0.022018,0.009356
2,0.000000,0.046968,1.000000,0.000000,0.025070,0.000000,0.006414,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.014077,0.000000
3,0.000000,0.000000,0.000000,1.000000,0.000000,0.007214,0.008982,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.021457,0.0,0.026478,0.000000,0.0,0.009531,0.016436
4,0.000000,0.000000,0.025070,0.000000,1.000000,0.000000,0.000000,0.032820,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.007014,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45437,0.000000,0.000000,0.000000,0.026478,0.000000,0.025460,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,1.000000,0.000000,0.0,0.000000,0.000000
45438,0.000000,0.066866,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.050056,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,1.000000,0.0,0.000000,0.000000
45439,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.038175,0.000000,0.000000,...,0.0,0.0,0.0,0.031201,0.0,0.000000,0.000000,1.0,0.000000,0.000000
45440,0.005955,0.022018,0.014077,0.009531,0.007014,0.000000,0.005263,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,1.000000,0.000000


In [7]:
# index-title을 뒤집는다
indices = pd.Series(movies.index, index=movies['title'])
indices.head()

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64

In [8]:
# 영화제목을 받아서 추천 영화를 돌려주는 함수
def content_recommender(title, n_of_recomm):
    # title에서 영화 index 받아오기
    idx = indices[title]
    # 주어진 영화와 다른 영화의 similarity를 가져온다
    sim_scores = cosine_sim[idx]
    # similarity 기준으로 정렬하고 n_of_recomm만큼 가져오기 (자기자신은 빼기)
    sim_scores = sim_scores.sort_values(ascending=False)[1:n_of_recomm+1]
    # 영화 title 반환
    return movies.loc[sim_scores.index]['title']

In [9]:
# 추천받기
content_recommender('The Lion King', 5)

34664    How the Lion Cub and the Turtle Sang a Song
9339                               The Lion King 1Â½
9101                  The Lion King 2: Simba's Pride
42806                                           Prey
25637                                 Fearless Fagan
Name: title, dtype: object

In [10]:
content_recommender('The Dark Knight Rises', 10)

12468                                      The Dark Knight
149                                         Batman Forever
1321                                        Batman Returns
15497                           Batman: Under the Red Hood
584                                                 Batman
21179    Batman Unmasked: The Psychology of the Dark Kn...
9216                    Batman Beyond: Return of the Joker
18021                                     Batman: Year One
19778              Batman: The Dark Knight Returns, Part 1
3085                          Batman: Mask of the Phantasm
Name: title, dtype: object