## 영화 인물 관련 정보(감독, 등장인물 등) 기반 추천 모델 구현

### 0) 데이터셋 설명
#### tmdb_movies
- id : 각 영화에 대한 고유 ID
- title : 영화 제목
- runtime : 상영 시간
- genres : 영화 장르
- overview : 영화에 대한 간략한 설명
- popularity : TMDB에서 제공하는 인기도
- vote_avearage : TMDB에서 받은 평점 평균
- vote_count : TMDB에서 받은 투표수

#### tmdb_credits
- movie_id : 각 영화에 대한 고유 ID
- cast : 모든 출연진
- crew : 모든 제작진

In [1]:
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies = pd.read_parquet('../dataset/tmdb_movies.parquet')
credits = pd.read_parquet('../dataset/tmdb_credits.parquet')

print(f"movies: {movies.shape}")
display(movies.head())

print(f"credits: {credits.shape}")
display(credits.head())

movies: (11813, 9)


Unnamed: 0,id,title,runtime,genres,overview,popularity,vote_average,vote_count,score
0,65216,Bloody Cartoons,0,[],Bloody Cartoons is a documentary about how and...,0.0071,7.0,2,4.81
1,69733,The Man-Eater,80,"[{'id': 35, 'name': 'Comedy'}]",This is the story of a Sicilian woman that tri...,4.2009,5.0,39,4.81
2,202777,Peace in the Fields,91,"[{'id': 18, 'name': 'Drama'}]",This is a film which was made in Belgium in th...,0.2568,6.1,4,4.81
3,115134,The Scent,117,"[{'id': 35, 'name': 'Comedy'}, {'id': 80, 'nam...","A detective Seon-woo, who moonlights as a priv...",3.2696,5.8,23,4.83
4,130394,Here Come the Girls,78,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",Bob Hope stars as an inept member of the choru...,0.9136,4.4,7,4.8


credits: (11813, 3)


Unnamed: 0,id,cast,crew
0,65216,[],"[{'adult': False, 'gender': 0, 'id': 240716, '..."
1,69733,"[{'adult': False, 'gender': 1, 'id': 557004, '...","[{'adult': False, 'gender': 2, 'id': 126973, '..."
2,202777,"[{'adult': False, 'gender': 2, 'id': 44509, 'k...","[{'adult': False, 'gender': 0, 'id': 1185016, ..."
3,115134,"[{'adult': False, 'gender': 2, 'id': 93999, 'k...","[{'adult': False, 'gender': 0, 'id': 1292890, ..."
4,130394,"[{'adult': False, 'gender': 2, 'id': 82388, 'k...","[{'adult': False, 'gender': 2, 'id': 16042, 'k..."


### 1) 데이터 전처리

In [3]:
df = movies.merge(credits, on='id')

In [4]:
# 필요 컬럼 추출

info_col = ['genres', 'cast', 'crew']

df = df[info_col]

df.head()

Unnamed: 0,genres,cast,crew
0,[],[],"[{'adult': False, 'gender': 0, 'id': 240716, '..."
1,"[{'id': 35, 'name': 'Comedy'}]","[{'adult': False, 'gender': 1, 'id': 557004, '...","[{'adult': False, 'gender': 2, 'id': 126973, '..."
2,"[{'id': 18, 'name': 'Drama'}]","[{'adult': False, 'gender': 2, 'id': 44509, 'k...","[{'adult': False, 'gender': 0, 'id': 1185016, ..."
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 80, 'nam...","[{'adult': False, 'gender': 2, 'id': 93999, 'k...","[{'adult': False, 'gender': 0, 'id': 1292890, ..."
4,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...","[{'adult': False, 'gender': 2, 'id': 82388, 'k...","[{'adult': False, 'gender': 2, 'id': 16042, 'k..."


In [5]:
# 각 컬럼의 문자열을 list 형식으로 변환

for col in info_col:
    df[col] = df[col].apply(eval)
    
df.dtypes

genres    object
cast      object
crew      object
dtype: object

### 2) genre 컬럼 전처리
- lower
- 공백 기준 join
- 특수문자 제거

In [6]:
# 장르 추출하여 'genre_names' 컬럼에 저장

def get_genre_names(val):
        return ' '.join([i.get('name','').lower() for i in val])

df['genres'] = df['genres'].apply(get_genre_names)

In [7]:
df.head()

Unnamed: 0,genres,cast,crew
0,,[],"[{'adult': False, 'gender': 0, 'id': 240716, '..."
1,comedy,"[{'adult': False, 'gender': 1, 'id': 557004, '...","[{'adult': False, 'gender': 2, 'id': 126973, '..."
2,drama,"[{'adult': False, 'gender': 2, 'id': 44509, 'k...","[{'adult': False, 'gender': 0, 'id': 1185016, ..."
3,comedy crime thriller,"[{'adult': False, 'gender': 2, 'id': 93999, 'k...","[{'adult': False, 'gender': 0, 'id': 1292890, ..."
4,comedy music,"[{'adult': False, 'gender': 2, 'id': 82388, 'k...","[{'adult': False, 'gender': 2, 'id': 16042, 'k..."


### 3) cast 컬럼 전처리

In [10]:
print(df['cast'].loc[1])

[{'adult': False, 'gender': 1, 'id': 557004, 'known_for_department': 'Acting', 'name': 'Loredana Cannata', 'original_name': 'Loredana Cannata', 'popularity': 1.0935, 'profile_path': '/vOGrntUNUw3WDAIH3x62NbCcHic.jpg', 'cast_id': 4, 'character': 'Giulia', 'credit_id': '52fe47d8c3a368484e0dcc0d', 'order': 0}, {'adult': False, 'gender': 2, 'id': 232666, 'known_for_department': 'Production', 'name': 'Arturo Paglia', 'original_name': 'Arturo Paglia', 'popularity': 0.1611, 'profile_path': '/ikyxWD98uLPpYXxlGXpTqbznUwp.jpg', 'cast_id': 5, 'character': 'Valerio', 'credit_id': '52fe47d8c3a368484e0dcc11', 'order': 1}, {'adult': False, 'gender': 2, 'id': 105342, 'known_for_department': 'Acting', 'name': 'Pascal Persiano', 'original_name': 'Pascal Persiano', 'popularity': 2.5802, 'profile_path': '/kIaJ65txI2f89bW4NrQX8LxZCRK.jpg', 'cast_id': 7, 'character': "l'amante della piscina", 'credit_id': '5522acb9c3a3685733001977', 'order': 2}, {'adult': False, 'gender': 1, 'id': 557005, 'known_for_departm

In [11]:
import re

# cast 컬럼의 값에서 name 키만 추출하여 빈칸 기준으로 join하는 함수
def extract_cast(val):
    # val이 리스트가 아닐 경우(결측치 등) 예외 처리
    if not isinstance(val, list):
        return_val =  ""
    
    # name 키의 값을 추출하여 빈킨 기준으로 join
    names = [i.get('name', '').lower().strip().replace(" ", "") for i in val]
    
    # 알파벳과 숫자를 제외한 모든 문자를 제거하는 정규표현식 적용(특수문자, 공백 등을 모두 제거)
    names = [re.sub(r'[^a-zA-Z0-9]', '',i) for i in names]
    
    # 등장 인물 6명까지만 포함
    if len(names) > 6:
        names = names[:6]
    
    return_val = ' '.join(names)
    
    return return_val
    
df['cast'] = df['cast'].apply(extract_cast)

In [12]:
df.head(5)

Unnamed: 0,genres,cast,crew
0,,,"[{'adult': False, 'gender': 0, 'id': 240716, '..."
1,comedy,loredanacannata arturopaglia pascalpersiano lo...,"[{'adult': False, 'gender': 2, 'id': 126973, '..."
2,drama,christianbarbier georgespoujouly clairewauthio...,"[{'adult': False, 'gender': 0, 'id': 1185016, ..."
3,comedy crime thriller,parkheesoon parksiyeon jusanguk kimjungtae lee...,"[{'adult': False, 'gender': 0, 'id': 1292890, ..."
4,comedy music,bobhope tonymartin arlenedahl rosemaryclooney ...,"[{'adult': False, 'gender': 2, 'id': 16042, 'k..."


### 4) crew 컬럼 전처리

In [13]:
print(df['crew'].iloc[10])

[{'adult': False, 'gender': 2, 'id': 89602, 'known_for_department': 'Acting', 'name': "Roscoe 'Fatty' Arbuckle", 'original_name': "Roscoe 'Fatty' Arbuckle", 'popularity': 0.131, 'profile_path': '/p0pR1wMRfRfLWZfdHywQcqph8vp.jpg', 'credit_id': '52fe4cc19251416c75124987', 'department': 'Directing', 'job': 'Director'}, {'adult': False, 'gender': 2, 'id': 89602, 'known_for_department': 'Acting', 'name': "Roscoe 'Fatty' Arbuckle", 'original_name': "Roscoe 'Fatty' Arbuckle", 'popularity': 0.131, 'profile_path': '/p0pR1wMRfRfLWZfdHywQcqph8vp.jpg', 'credit_id': '56f0680dc3a3687177000998', 'department': 'Writing', 'job': 'Writer'}]


In [None]:
# crew 컬럼에서 director 키의 값만 추출
def extract_crew(val):
    # val이 리스트가 아닐 경우(결측치 등) 예외 처리
    if not isinstance(val, list):
        return ""
    for i in val:
        if i['job'] == 'Director':
            return i['name'].lower().strip().replace(" ", "")
        
df['crew'] = df['crew'].apply(extract_crew)

In [15]:
df.head()

Unnamed: 0,genres,cast,crew
0,,,karstenkjær
1,comedy,loredanacannata arturopaglia pascalpersiano lo...,aureliogrimaldi
2,drama,christianbarbier georgespoujouly clairewauthio...,jacquesboigelot
3,comedy crime thriller,parkheesoon parksiyeon jusanguk kimjungtae lee...,kimhyoung-jun
4,comedy music,bobhope tonymartin arlenedahl rosemaryclooney ...,claudebinyon


### 5) feature 컬럼 생성 (genre + cast + crew)

In [16]:
# genres, cast, crew 컬럼을 빈칸 기준으로 합쳐서 feature 컬럼 생성
df['feature'] = df['genres'].astype(str) + ' ' + df['cast'].astype(str) + ' ' + df['crew'].astype(str)

df.head()

Unnamed: 0,genres,cast,crew,feature
0,,,karstenkjær,karstenkjær
1,comedy,loredanacannata arturopaglia pascalpersiano lo...,aureliogrimaldi,comedy loredanacannata arturopaglia pascalpers...
2,drama,christianbarbier georgespoujouly clairewauthio...,jacquesboigelot,drama christianbarbier georgespoujouly clairew...
3,comedy crime thriller,parkheesoon parksiyeon jusanguk kimjungtae lee...,kimhyoung-jun,comedy crime thriller parkheesoon parksiyeon j...
4,comedy music,bobhope tonymartin arlenedahl rosemaryclooney ...,claudebinyon,comedy music bobhope tonymartin arlenedahl ros...


In [18]:
df['feature'].loc[10]

"comedy alstjohn bartineburkett waltercreed johnsinclair roscoe'fatty'arbuckle"

### 6) 데이터 벡터화

In [19]:
## TF-IDF 벡터화

tfidf = TfidfVectorizer(
    lowercase=True,
)
tfidf_matrix = tfidf.fit_transform(df['feature'])

tfidf_matrix.shape

(11813, 40120)

### 7) 유사도 계산

In [20]:
info_cos_sim = cosine_similarity(tfidf_matrix)

info_cos_sim.shape

(11813, 11813)

### 8) 추천 함수 생성

In [21]:
def info_recommendation(
    dataframe: pd.DataFrame, 
    movie_name: str,
    sim_matrix,
    top_n: int,
    ) -> pd.Series :

    # movie_name이 대소문자 구분 없이 필터링되도록 처리
    idx = dataframe[dataframe['title'].str.lower() == movie_name.lower()].index[0]

    sim_scores = list(enumerate(sim_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x:x[1], reverse=True)

    movie_idx = [i[0] for i in sim_scores if i[0] != idx][:top_n]

    return dataframe['title'].iloc[movie_idx]

movie_name = 'titanic'

info_recommendation(
    dataframe=movies,
    movie_name=movie_name,
    sim_matrix=info_cos_sim,
    top_n=10
)

2698                                      Trial Marriages
7045                         Simpallag Innondh Love Story
5438                                     Anything for You
6518                                 Genevieve of Brabant
2005    Monty Python's Flying Circus—John Cleese's Per...
2628                            Dave & Tim - Storytellers
3676                                 Friggin' Mafia Movie
3716     Couscous aux lardons : un mariage, deux cultures
4632                                    I Fetch the Bread
4886                    Juste pour rire 2014 - Les anglos
Name: title, dtype: object

### 9) 유사도 행렬 저장

In [22]:
joblib.dump(info_cos_sim, "../models/info_cos_sim.pkl")

['../models/info_cos_sim.pkl']