In [8]:
import pandas as pd
import numpy as np
import json

# JSON으로 받아온 장르 데이터 정돈: 문자열 중 '를 "로 대체한 뒤 name 값만 추출하여 반환하는 함수
def parse_genres(genres_str):
    genres = json.loads(genres_str.replace('\'', '"'))

    genres_list =[]
    for g in genres:
        genres_list.append(g['name'])

    return genres_list


meta = pd.read_csv('./the-movies-dataset/movies_metadata.csv', dtype='unicode')
meta = meta[['id', 'original_title', 'original_language', 'genres']]
meta = meta.rename(columns={'id': 'movieId'})
meta = meta[meta['original_language'] == 'en']

ratings = pd.read_csv('./the-movies-dataset/ratings_small.csv')
ratings = ratings[['userId', 'movieId', 'rating']]

# print(ratings.describe())
# describe()시 대략적인 평균, 최대 최소값 등의 정보 테이블 제공됨.

meta.movieId = pd.to_numeric(meta.movieId, errors='coerce')
ratings.movieId = pd.to_numeric(ratings.movieId, errors='coerce')

meta['genres'] = meta['genres'].apply(parse_genres)


In [9]:
meta.head()

Unnamed: 0,movieId,original_title,original_language,genres
0,862,Toy Story,en,"[Animation, Comedy, Family]"
1,8844,Jumanji,en,"[Adventure, Fantasy, Family]"
2,15602,Grumpier Old Men,en,"[Romance, Comedy]"
3,31357,Waiting to Exhale,en,"[Comedy, Drama, Romance]"
4,11862,Father of the Bride Part II,en,[Comedy]


In [10]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [11]:
data = pd.merge(ratings, meta, on='movieId', how='inner') # movieId를 기준으로 + inner방식으로 테이블 merge
data.head()

Unnamed: 0,userId,movieId,rating,original_title,original_language,genres
0,1,1371,2.5,Rocky III,en,[Drama]
1,4,1371,4.0,Rocky III,en,[Drama]
2,7,1371,3.0,Rocky III,en,[Drama]
3,19,1371,4.0,Rocky III,en,[Drama]
4,21,1371,3.0,Rocky III,en,[Drama]


In [12]:
# 피벗 테이블을 만든다. 행: 유저 아이디, 열: 영화 제목
matrix = data.pivot_table(index='userId', columns='original_title', values='rating')
matrix.head(10)

original_title,!Women Art Revolution,'Gator Bait,'Twas the Night Before Christmas,10 Items or Less,10 Things I Hate About You,"10,000 BC",11'09''01 - September 11,12 + 1,12 Angry Men,1408,...,Young and Innocent,Zaat,Zabriskie Point,Zapped Again!,Zardoz,Zodiac,eXistenZ,xXx,¡Three Amigos!,Мой сводный брат Франкенштейн
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,3.5,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,3.5,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,4.5,,,,,,,,,
9,,,,,,,,,,,...,4.0,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [13]:
# 장르 가중치 설정: 추천받으려는 영화와 같은 장르를 가진 영화라면 일치하는 장르당 0.05씩 상관계수에 더한다.
GENRE_WEIGHT = 0.05

# 피어슨 상관계수 구하는 함수
def pearsonR(s1, s2):
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    return np.sum(s1_c*s2_c) / np.sqrt(np.sum(s1_c**2) * np.sum(s2_c**2))

# 추천할 영화를 찾는 함수, n = 추천할 영화의 수
def recommend(input_movie, matrix, n, similar_genre=True):
    input_genres = meta[meta['original_title'] == input_movie]['genres'].iloc(0)[0]
    
    result = []
    
    for title in matrix.columns:
        if title == input_movie: # 입력한 영화와 같은 영화일 경우 패스(추천 목록에 넣지 않는다.)
            continue
        
        cor = pearsonR(matrix[input_movie], matrix[title])
        
        # 비슷한 장르 영화에 가중치를 줄 것인지 여부가 True이고, 입력한 영화의 장르 값이 데이터셋에 입력되어 있다면
        if similar_genre and len(input_genres) > 0:
            temp_genres = meta[meta['original_title'] == title]['genres'].iloc(0)[0]
            
            same_count = np.sum(np.isin(input_genres, temp_genres)) # 몇 개의 장르가 겹치는지 센다.
            cor += (GENRE_WEIGHT * same_count)
            
        if np.isnan(cor):
            continue
        else:
            result.append((title, '{:.2f}'.format(cor), temp_genres))
            # format함수: 문자열의 대괄호 자리에 format 뒤의 괄호안에 들어있는 값을 하나씩 넣는다
    
    # 결과값을 내림차순으로 정렬. 
    result.sort(key=lambda r: r[1], reverse=True)
    
    return result[:n]

In [14]:
recommend_result = recommend('The Dark Knight', matrix, 10, similar_genre=True)

pd.DataFrame(recommend_result, columns = ['Title', 'Correlation', 'Genre'])


  


Unnamed: 0,Title,Correlation,Genre
0,Prom Night,0.82,"[Horror, Mystery, Thriller]"
1,Wild Wild West,0.82,"[Action, Adventure, Comedy, Science Fiction, W..."
2,Blue Thunder,0.53,"[Science Fiction, Action, Thriller, Crime, Drama]"
3,Topaz,0.53,"[Action, Drama, Mystery, Thriller]"
4,Yamakasi - Les samouraïs des temps modernes,0.53,"[Action, Crime, Drama]"
5,Ludwig,0.52,"[Drama, History]"
6,Midnight in the Garden of Good and Evil,0.52,"[Crime, Drama, Mystery, Thriller]"
7,Sneakers,0.52,"[Comedy, Crime, Drama]"
8,Godzilla,0.51,"[Science Fiction, Action, Thriller]"
9,Bushwhacked,0.5,"[Adventure, Action, Comedy, Crime, Family]"
