## 영화 장르 및 줄거리 기반 추천 모델 구현

### 0) 데이터셋 설명
#### tmdb_movies
- id : 각 영화에 대한 고유 ID
- title : 영화 제목
- runtime : 상영 시간
- genres : 영화 장르
- overview : 영화에 대한 간략한 설명
- popularity : TMDB에서 제공하는 인기도
- vote_avearage : TMDB에서 받은 평점 평균
- vote_count : TMDB에서 받은 투표수

#### tmdb_credits
- movie_id : 각 영화에 대한 고유 ID
- cast : 모든 출연진
- crew : 모든 제작진

In [1]:
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies = pd.read_parquet('../dataset/tmdb_movies.parquet')
credits = pd.read_parquet('../dataset/tmdb_credits.parquet')

print(f"movies: {movies.shape}")
display(movies.head())

print(f"credits: {credits.shape}")
display(credits.head())

movies: (11813, 9)


Unnamed: 0,id,title,runtime,genres,overview,popularity,vote_average,vote_count,score
0,65216,Bloody Cartoons,0,[],Bloody Cartoons is a documentary about how and...,0.0071,7.0,2,4.81
1,69733,The Man-Eater,80,"[{'id': 35, 'name': 'Comedy'}]",This is the story of a Sicilian woman that tri...,4.2009,5.0,39,4.81
2,202777,Peace in the Fields,91,"[{'id': 18, 'name': 'Drama'}]",This is a film which was made in Belgium in th...,0.2568,6.1,4,4.81
3,115134,The Scent,117,"[{'id': 35, 'name': 'Comedy'}, {'id': 80, 'nam...","A detective Seon-woo, who moonlights as a priv...",3.2696,5.8,23,4.83
4,130394,Here Come the Girls,78,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",Bob Hope stars as an inept member of the choru...,0.9136,4.4,7,4.8


credits: (11813, 3)


Unnamed: 0,id,cast,crew
0,65216,[],"[{'adult': False, 'gender': 0, 'id': 240716, '..."
1,69733,"[{'adult': False, 'gender': 1, 'id': 557004, '...","[{'adult': False, 'gender': 2, 'id': 126973, '..."
2,202777,"[{'adult': False, 'gender': 2, 'id': 44509, 'k...","[{'adult': False, 'gender': 0, 'id': 1185016, ..."
3,115134,"[{'adult': False, 'gender': 2, 'id': 93999, 'k...","[{'adult': False, 'gender': 0, 'id': 1292890, ..."
4,130394,"[{'adult': False, 'gender': 2, 'id': 82388, 'k...","[{'adult': False, 'gender': 2, 'id': 16042, 'k..."


### 1) 데이터 전처리

In [3]:
# overview -> 줄거리 컬럼
movies['overview'].iloc[2]

'This is a film which was made in Belgium in the early \'60s and was never released. However, it somehow got included in the American Oscar category for "Best Foreign Film," and was finally released in its home country in 1971. It explores the issues of prejudice and superstition in the Belgian countryside through the troubles of a middle-aged farmer whose mother has been accused of being a witch. In French, this picture is based on a true story which took place in the late 1920s and early \'30s.'

In [4]:
## null값 확인
movies['overview'].isnull().sum()

574

In [5]:
## 줄거리 결측치 처리 - dropna
movies.dropna(subset=['overview'], inplace=True)

movies['overview'].isnull().sum()

0

In [6]:
movies.head()

Unnamed: 0,id,title,runtime,genres,overview,popularity,vote_average,vote_count,score
0,65216,Bloody Cartoons,0,[],Bloody Cartoons is a documentary about how and...,0.0071,7.0,2,4.81
1,69733,The Man-Eater,80,"[{'id': 35, 'name': 'Comedy'}]",This is the story of a Sicilian woman that tri...,4.2009,5.0,39,4.81
2,202777,Peace in the Fields,91,"[{'id': 18, 'name': 'Drama'}]",This is a film which was made in Belgium in th...,0.2568,6.1,4,4.81
3,115134,The Scent,117,"[{'id': 35, 'name': 'Comedy'}, {'id': 80, 'nam...","A detective Seon-woo, who moonlights as a priv...",3.2696,5.8,23,4.83
4,130394,Here Come the Girls,78,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",Bob Hope stars as an inept member of the choru...,0.9136,4.4,7,4.8


In [7]:
# 'genres' 컬럼의 문자열을 list 형식으로 변환

movies['genres'] = movies['genres'].apply(eval)

In [8]:
# 장르 추출하여 'genre' 컬럼에 저장

def get_genre_names(val):
        return ' '.join([i.get('name','').lower() for i in val])

movies['genres'] = movies['genres'].apply(get_genre_names)

movies.head()

Unnamed: 0,id,title,runtime,genres,overview,popularity,vote_average,vote_count,score
0,65216,Bloody Cartoons,0,,Bloody Cartoons is a documentary about how and...,0.0071,7.0,2,4.81
1,69733,The Man-Eater,80,comedy,This is the story of a Sicilian woman that tri...,4.2009,5.0,39,4.81
2,202777,Peace in the Fields,91,drama,This is a film which was made in Belgium in th...,0.2568,6.1,4,4.81
3,115134,The Scent,117,comedy crime thriller,"A detective Seon-woo, who moonlights as a priv...",3.2696,5.8,23,4.83
4,130394,Here Come the Girls,78,comedy music,Bob Hope stars as an inept member of the choru...,0.9136,4.4,7,4.8


In [9]:
# overview와 genres를 합쳐서 content 컬럼 생성

movies['overview'] = movies['overview'].str.lower()
movies['content'] = movies['overview'] + ' ' + movies['genres']

movies.head()

Unnamed: 0,id,title,runtime,genres,overview,popularity,vote_average,vote_count,score,content
0,65216,Bloody Cartoons,0,,bloody cartoons is a documentary about how and...,0.0071,7.0,2,4.81,bloody cartoons is a documentary about how and...
1,69733,The Man-Eater,80,comedy,this is the story of a sicilian woman that tri...,4.2009,5.0,39,4.81,this is the story of a sicilian woman that tri...
2,202777,Peace in the Fields,91,drama,this is a film which was made in belgium in th...,0.2568,6.1,4,4.81,this is a film which was made in belgium in th...
3,115134,The Scent,117,comedy crime thriller,"a detective seon-woo, who moonlights as a priv...",3.2696,5.8,23,4.83,"a detective seon-woo, who moonlights as a priv..."
4,130394,Here Come the Girls,78,comedy music,bob hope stars as an inept member of the choru...,0.9136,4.4,7,4.8,bob hope stars as an inept member of the choru...


### 2) 데이터 벡터화

In [10]:
## TF-IDF 벡터화

tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words='english'
)
tfidf_matrix = tfidf.fit_transform(movies['content'])

tfidf_matrix.shape

(11239, 36469)

### 3) 유사도 계산

In [12]:
# 코사인 유사도 계산

contents_cos_sim = cosine_similarity(tfidf_matrix)

contents_cos_sim.shape

(11239, 11239)

In [13]:
contents_cos_sim[:5, :5]

array([[1.        , 0.        , 0.01272785, 0.0180803 , 0.        ],
       [0.        , 1.        , 0.01069214, 0.0317936 , 0.00641917],
       [0.01272785, 0.01069214, 1.        , 0.        , 0.        ],
       [0.0180803 , 0.0317936 , 0.        , 1.        , 0.0571824 ],
       [0.        , 0.00641917, 0.        , 0.0571824 , 1.        ]])

In [20]:
# 특정 영화의 인덱스 추출

movie_name = 'Inception'

idx = movies[movies['title'] == movie_name].index[0]
idx

2083

In [21]:
# 유사도 점수 정렬

sim_scores = list(enumerate(contents_cos_sim[idx]))

sim_scores = sorted(sim_scores, key=lambda x:x[1], reverse=True)

sim_scores

[(2083, 1.0000000000000002),
 (7825, 0.2791455780288197),
 (9475, 0.14958135826628638),
 (7245, 0.123466729541866),
 (8487, 0.12154475270455184),
 (4011, 0.12011085680610352),
 (5583, 0.11787429652906047),
 (10602, 0.11715961164239438),
 (8397, 0.11585353937980021),
 (8379, 0.1155260714521966),
 (9998, 0.11397615846636805),
 (2758, 0.11368027329262569),
 (7465, 0.11346031620620646),
 (8571, 0.11159387258429213),
 (9350, 0.11098006446291908),
 (10559, 0.1061484298214679),
 (7120, 0.10595001706248562),
 (7081, 0.1042708914764037),
 (6283, 0.10418944231593705),
 (4142, 0.10358854530661746),
 (10020, 0.10294485513610702),
 (5911, 0.1026152420990822),
 (10796, 0.09905963014436406),
 (8169, 0.09880640823170178),
 (10144, 0.09773815074647468),
 (8491, 0.09542224813911007),
 (8131, 0.09513520045230997),
 (6824, 0.09501324366990722),
 (8855, 0.09399996800630188),
 (9461, 0.09390357707608268),
 (9719, 0.09386727759891819),
 (7193, 0.09301178427200955),
 (10099, 0.0928288049912659),
 (2542, 0.092

In [None]:
# 추천 영화 정보 출력 (자기 자신 제외)

movie_idx = [i[0] for i in sim_scores if i[0] != idx][:10]

movies.iloc[movie_idx]

### 4) 추천 함수 생성

In [23]:
## 컨텐츠 기반 영화 추천 함수 생성

def contents_recommendation(
    dataframe: pd.DataFrame, 
    movie_name: str,
    sim_matrix,
    top_n: int = 10,
    ) -> pd.Series :

    # movie_name이 대소문자 구분 없이 필터링되도록 처리
    idx = dataframe[dataframe['title'].str.lower() == movie_name.lower()].index[0]

    sim_scores = list(enumerate(sim_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x:x[1], reverse=True)

    movie_idx = [i[0] for i in sim_scores if i[0] != idx][:top_n]

    return dataframe['title'].iloc[movie_idx]

contents_recommendation(
    dataframe=movies,
    movie_name=movie_name,
    sim_matrix=contents_cos_sim,
    top_n=10
)

1285                          Mortal Kombat: Annihilation
2935                                             Hercules
705                                          Dragonslayer
1947                                        The Wolverine
4282                                            Eye Music
6037    Victorious Battle for the Conquest of the Magh...
4062                                 Fighting Tommy Riley
1857        In the Name of the King: A Dungeon Siege Tale
1839                                           Foodfight!
3458                                            Immortals
Name: title, dtype: object

### 5) 유사도 행렬 저장

In [24]:
joblib.dump(contents_cos_sim, "../models/contents_cos_sim.pkl")

['../models/contents_cos_sim.pkl']